diff --git a/jieba/__init__.py b/jieba/__init__.py
index 9d06158..bac3075 100644
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -82,16 +82,20 @@ def initialize(*args):
             FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.items()]) #normalize
             min_freq = min(FREQ.values())
             print("dumping model to file cache " + cache_file, file=sys.stderr)
-            tmp_suffix = "."+str(random.random())
-            with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
-                marshal.dump((trie,FREQ,total,min_freq),temp_cache_file)
-            if os.name=='nt':
-                import shutil
-                replace_file = shutil.move
-            else:
-                replace_file = os.rename
-            replace_file(cache_file+tmp_suffix,cache_file)
-
+            try:
+                tmp_suffix = "."+str(random.random())
+                with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
+                    marshal.dump((trie,FREQ,total,min_freq),temp_cache_file)
+                if os.name=='nt':
+                    import shutil
+                    replace_file = shutil.move
+                else:
+                    replace_file = os.rename
+                replace_file(cache_file+tmp_suffix,cache_file)
+            except:
+                import traceback
+                print("dump cache file failed.",file=sys.stderr)
+                print(traceback.format_exc(),file=sys.stderr)
         initialized = True
 
         print("loading model cost ", time.time() - t1, "seconds.",file=sys.stderr)
@@ -263,15 +267,22 @@ def load_userdict(f):
         if line_no==1:
             word = word.replace('\ufeff',"") #remove bom flag if it exists
         if len(tup)==3:
-            user_word_tag_tab[word]=tup[2].strip()
-        freq = float(freq)
-        FREQ[word] = log(freq / total)
-        p = trie
-        for c in word:
-            if not c in p:
-                p[c] ={}
-            p = p[c]
-        p['']='' #ending flag
+            add_word(word, freq, tup[2])
+        else:
+            add_word(word, freq)
+
+def add_word(word, freq, tag=None):
+    global FREQ, trie, total, user_word_tag_tab
+    freq = float(freq)
+    FREQ[word] = log(freq / total)
+    if tag is not None:
+        user_word_tag_tab[word] = tag.strip()
+    p = trie
+    for c in word:
+        if not c in p:
+            p[c] = {}
+        p = p[c]
+    p[''] = '' # ending flag
 
 __ref_cut = cut
 __ref_cut_for_search = cut_for_search
@@ -333,3 +344,29 @@ def get_abs_path_dict():
     _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
     abs_path = os.path.join(_curpath,DICTIONARY)
     return abs_path
+
+def tokenize(unicode_sentence,mode="default"):
+    #mode ("default" or "search")
+    if not isinstance(unicode_sentence, str):
+        raise Exception("jieba: the input parameter should string.")
+    start = 0
+    if mode=='default':
+        for w in cut(unicode_sentence):
+            width = len(w)
+            yield (w,start,start+width)
+            start+=width
+    else:
+        for w in cut(unicode_sentence):
+            width = len(w)
+            if len(w)>2:
+                for i in range(len(w)-1):
+                    gram2 = w[i:i+2]
+                    if gram2 in FREQ:
+                        yield (gram2,start+i,start+i+2)
+            if len(w)>3:
+                for i in range(len(w)-2):
+                    gram3 = w[i:i+3]
+                    if gram3 in FREQ:
+                        yield (gram3,start+i,start+i+3)
+            yield (w,start,start+width)
+            start+=width
diff --git a/test/jieba_test.py b/test/jieba_test.py
index eea76f3..9404704 100644
--- a/test/jieba_test.py
+++ b/test/jieba_test.py
@@ -150,5 +150,15 @@ class JiebaTestCase(unittest.TestCase):
         print(" , ".join([w.word + " / " + w.flag for w in result]),file=sys.stderr)
         print("testPosseg",file=sys.stderr)
 
+    def testTokenize(self):
+        for content in test_contents:
+            result = jieba.tokenize(content)
+            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
+            result = list(result)
+            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
+            for tk in result:
+                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
+        print("testTokenize",file=sys.stderr)
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/test_tokenize.py b/test/test_tokenize.py
index 994e1ca..ed7e172 100644
--- a/test/test_tokenize.py
+++ b/test/test_tokenize.py
@@ -7,10 +7,9 @@ g_mode="default"
 
 def cuttest(test_sent):
     global g_mode
-    test_sent = test_sent.decode('utf-8')
     result = jieba.tokenize(test_sent,mode=g_mode)
     for tk in result:
-        print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
 
 
 if __name__ == "__main__":
diff --git a/test/test_whoosh.py b/test/test_whoosh.py
index bb2fda6..5617b5d 100644
--- a/test/test_whoosh.py
+++ b/test/test_whoosh.py
@@ -15,40 +15,40 @@ ix = create_in("tmp", schema) # for create new index
 writer = ix.writer()
 
 writer.add_document(
-    title=u"document1",
-    path=u"/a",
-    content=u"This is the first document we’ve added!"
+    title="document1",
+    path="/a",
+    content="This is the first document we’ve added!"
 )
 
 writer.add_document(
-    title=u"document2",
-    path=u"/b",
-    content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
+    title="document2",
+    path="/b",
+    content="The second one 你 中文测试中文 is even more interesting! 吃水果"
 )
 
 writer.add_document(
-    title=u"document3",
-    path=u"/c",
-    content=u"买水果然后来世博园。"
+    title="document3",
+    path="/c",
+    content="买水果然后来世博园。"
 )
 
 writer.add_document(
-    title=u"document4",
-    path=u"/c",
-    content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
+    title="document4",
+    path="/c",
+    content="工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
 )
 
 writer.add_document(
-    title=u"document4",
-    path=u"/c",
-    content=u"咱俩交换一下吧。"
+    title="document4",
+    path="/c",
+    content="咱俩交换一下吧。"
 )
 
 writer.commit()
 searcher = ix.searcher()
 parser = QueryParser("content", schema=ix.schema)
 
-for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交换"):
+for keyword in ("水果世博园","你","first","中文","交换机","交换"):
     print "result of ",keyword
     q = parser.parse(keyword)
     results = searcher.search(q)
@@ -56,5 +56,5 @@ for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交
         print hit.highlights("content")
     print "="*10
 
-for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream."):
+for t in analyzer("我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream."):
     print t.text
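
For reference, a minimal usage sketch of the tokenize() and add_word() APIs introduced by this patch (not part of the patch itself). It assumes the patched Python 3 port of jieba is importable; the sample sentence, frequency value, and tag below are illustrative only.

# -*- coding: utf-8 -*-
# Usage sketch for the APIs added above (illustrative, not part of the patch).
import jieba

# Build the trie/FREQ tables up front so add_word() below has `total` available.
jieba.initialize()

sentence = "我爱北京天安门"  # sample input; any str works

# Default mode: one (word, start_offset, end_offset) tuple per segmented word.
for word, start, end in jieba.tokenize(sentence):
    print("word %s\t\t start: %d \t\t end:%d" % (word, start, end))

# Search mode: additionally yields 2-gram/3-gram sub-words found in FREQ,
# each with its own offsets, before the enclosing word itself.
for word, start, end in jieba.tokenize(sentence, mode="search"):
    print("word %s\t\t start: %d \t\t end:%d" % (word, start, end))

# add_word() registers a user word at runtime; freq is required in this
# version, and the optional tag is stored in user_word_tag_tab.
jieba.add_word("天安门广场", 3, tag="ns")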