fix merge conflict

Sun Junyi 2013-06-24 13:48:16 +08:00
parent c01680c6a8
commit b9b1f1a418
4 changed files with 84 additions and 38 deletions

@@ -82,6 +82,7 @@ def initialize(*args):
     FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.items()]) #normalize
     min_freq = min(FREQ.values())
     print("dumping model to file cache " + cache_file, file=sys.stderr)
+    try:
     tmp_suffix = "."+str(random.random())
     with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
         marshal.dump((trie,FREQ,total,min_freq),temp_cache_file)
@@ -91,7 +92,10 @@ def initialize(*args):
     else:
         replace_file = os.rename
     replace_file(cache_file+tmp_suffix,cache_file)
+    except:
+        import traceback
+        print("dump cache file failed.",file=sys.stderr)
+        print(traceback.format_exc(),file=sys.stderr)
     initialized = True
     print("loading model cost ", time.time() - t1, "seconds.",file=sys.stderr)
@@ -263,9 +267,16 @@ def load_userdict(f):
         if line_no==1:
             word = word.replace('\ufeff',"") #remove bom flag if it exists
         if len(tup)==3:
-            user_word_tag_tab[word]=tup[2].strip()
+            add_word(word, freq, tup[2])
+        else:
+            add_word(word, freq)
+
+def add_word(word, freq, tag=None):
+    global FREQ, trie, total, user_word_tag_tab
     freq = float(freq)
     FREQ[word] = log(freq / total)
+    if tag is not None:
+        user_word_tag_tab[word] = tag.strip()
     p = trie
     for c in word:
         if not c in p:
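load_userdict now delegates to a new module-level add_word(word, freq, tag=None), which updates FREQ, the trie, and, when a tag is given, user_word_tag_tab. That also makes single-word insertion usable on its own. A hedged usage sketch (the words, frequencies, and tag are illustrative):

import jieba

jieba.initialize()                     # ensure FREQ/trie/total are built first
jieba.add_word("云计算", 300)           # word plus a raw frequency
jieba.add_word("创新办", 50, tag="i")   # optional POS tag for posseg

print("/".join(jieba.cut("云计算方兴未艾")))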
@@ -333,3 +344,29 @@ def get_abs_path_dict():
     _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
     abs_path = os.path.join(_curpath,DICTIONARY)
     return abs_path
+
+def tokenize(unicode_sentence,mode="default"):
+    #mode ("default" or "search")
+    if not isinstance(unicode_sentence, str):
+        raise Exception("jieba: the input parameter should be a string.")
+    start = 0
+    if mode=='default':
+        for w in cut(unicode_sentence):
+            width = len(w)
+            yield (w,start,start+width)
+            start+=width
+    else:
+        for w in cut(unicode_sentence):
+            width = len(w)
+            if len(w)>2:
+                for i in range(len(w)-1):
+                    gram2 = w[i:i+2]
+                    if gram2 in FREQ:
+                        yield (gram2,start+i,start+i+2)
+            if len(w)>3:
+                for i in range(len(w)-2):
+                    gram3 = w[i:i+3]
+                    if gram3 in FREQ:
+                        yield (gram3,start+i,start+i+3)
+            yield (w,start,start+width)
+            start+=width
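The new tokenize() turns the segmentation into (word, start, end) triples; in "search" mode it additionally yields dictionary 2-grams and 3-grams found inside longer words, which suits building search indexes. A brief usage sketch (the sentence is illustrative and the exact output depends on the shipped dictionary):

import jieba

for word, start, end in jieba.tokenize("永和服装饰品有限公司"):
    print("word %s\t\t start: %d \t\t end: %d" % (word, start, end))

# search mode also emits in-dictionary sub-words of long tokens
for word, start, end in jieba.tokenize("永和服装饰品有限公司", mode="search"):
    print(word, start, end)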

@@ -150,5 +150,15 @@ class JiebaTestCase(unittest.TestCase):
         print(" , ".join([w.word + " / " + w.flag for w in result]),file=sys.stderr)
         print("testPosseg",file=sys.stderr)

+    def testTokenize(self):
+        for content in test_contents:
+            result = jieba.tokenize(content)
+            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
+            result = list(result)
+            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
+            for tk in result:
+                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
+            print("testTokenize",file=sys.stderr)
+
 if __name__ == "__main__":
     unittest.main()

@@ -7,10 +7,9 @@ g_mode="default"
 def cuttest(test_sent):
     global g_mode
-    test_sent = test_sent.decode('utf-8')
     result = jieba.tokenize(test_sent,mode=g_mode)
     for tk in result:
-        print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))

 if __name__ == "__main__":

@@ -15,40 +15,40 @@ ix = create_in("tmp", schema) # for create new index
 writer = ix.writer()

 writer.add_document(
-    title=u"document1",
-    path=u"/a",
-    content=u"This is the first document weve added!"
+    title="document1",
+    path="/a",
+    content="This is the first document weve added!"
 )

 writer.add_document(
-    title=u"document2",
-    path=u"/b",
-    content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
+    title="document2",
+    path="/b",
+    content="The second one 你 中文测试中文 is even more interesting! 吃水果"
 )

 writer.add_document(
-    title=u"document3",
-    path=u"/c",
-    content=u"买水果然后来世博园。"
+    title="document3",
+    path="/c",
+    content="买水果然后来世博园。"
 )

 writer.add_document(
-    title=u"document4",
-    path=u"/c",
-    content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
+    title="document4",
+    path="/c",
+    content="工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
 )

 writer.add_document(
-    title=u"document4",
-    path=u"/c",
-    content=u"咱俩交换一下吧。"
+    title="document4",
+    path="/c",
+    content="咱俩交换一下吧。"
 )

 writer.commit()
 searcher = ix.searcher()
 parser = QueryParser("content", schema=ix.schema)

-for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交换"):
+for keyword in ("水果世博园","你","first","中文","交换机","交换"):
     print "result of ",keyword
     q = parser.parse(keyword)
     results = searcher.search(q)
@@ -56,5 +56,5 @@ for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交
     print hit.highlights("content")
     print "="*10

-for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream."):
+for t in analyzer("我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream."):
     print t.text
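The last two files only track the Python 3 port: str literals are already Unicode, so the explicit decode('utf-8') and the u"" prefixes go away. For context, a minimal sketch of the setup these Whoosh tests rely on, wiring jieba's analyzer into a schema (assuming the whoosh package and jieba.analyse.ChineseAnalyzer; names are illustrative):

import os
from jieba.analyse import ChineseAnalyzer
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

analyzer = ChineseAnalyzer()              # jieba-backed tokenizer for Whoosh
schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))
os.makedirs("tmp", exist_ok=True)         # create_in needs an existing directory
ix = create_in("tmp", schema)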