diff --git a/jieba/__init__.py b/jieba/__init__.py index 4561e39..0e333bd 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -18,8 +18,7 @@ from . import finalseg DICTIONARY = "dict.txt" DICT_LOCK = threading.RLock() -pfdict = None # to be initialized -FREQ = {} +FREQ = {} # to be initialized total = 0 user_word_tag_tab = {} initialized = False @@ -41,7 +40,6 @@ def setLogLevel(log_level): def gen_pfdict(f_name): lfreq = {} - pfdict = set() ltotal = 0 with open(f_name, 'rb') as f: lineno = 0 @@ -53,15 +51,17 @@ def gen_pfdict(f_name): lfreq[word] = freq ltotal += freq for ch in xrange(len(word)): - pfdict.add(word[:ch + 1]) + wfrag = word[:ch + 1] + if wfrag not in lfreq: + lfreq[wfrag] = 0 except ValueError as e: logger.debug('%s at line %s %s' % (f_name, lineno, line)) raise e - return pfdict, lfreq, ltotal + return lfreq, ltotal def initialize(dictionary=None): - global pfdict, FREQ, total, initialized, DICTIONARY, DICT_LOCK + global FREQ, total, initialized, DICTIONARY, DICT_LOCK if not dictionary: dictionary = DICTIONARY with DICT_LOCK: @@ -83,19 +83,18 @@ def initialize(dictionary=None): logger.debug("Loading model from cache %s" % cache_file) try: with open(cache_file, 'rb') as cf: - pfdict, FREQ, total = marshal.load(cf) - # prevent conflict with old version - load_from_cache_fail = not isinstance(pfdict, set) + FREQ, total = marshal.load(cf) + load_from_cache_fail = False except Exception: load_from_cache_fail = True if load_from_cache_fail: - pfdict, FREQ, total = gen_pfdict(abs_path) + FREQ, total = gen_pfdict(abs_path) logger.debug("Dumping model to file cache %s" % cache_file) try: fd, fpath = tempfile.mkstemp() with os.fdopen(fd, 'wb') as temp_cache_file: - marshal.dump((pfdict, FREQ, total), temp_cache_file) + marshal.dump((FREQ, total), temp_cache_file) if os.name == 'nt': from shutil import move as replace_file else: @@ -140,23 +139,24 @@ def __cut_all(sentence): def calc(sentence, DAG, route): N = len(sentence) - route[N] = (0.0, '') + 
route[N] = (0, 0) + logtotal = log(total) for idx in xrange(N - 1, -1, -1): - route[idx] = max((log(FREQ.get(sentence[idx:x + 1], 1)) - - log(total) + route[x + 1][0], x) for x in DAG[idx]) + route[idx] = max((log(FREQ.get(sentence[idx:x + 1]) or 1) - + logtotal + route[x + 1][0], x) for x in DAG[idx]) @require_initialized def get_DAG(sentence): - global pfdict, FREQ + global FREQ DAG = {} N = len(sentence) for k in xrange(N): tmplist = [] i = k frag = sentence[k] - while i < N and frag in pfdict: - if frag in FREQ: + while i < N and frag in FREQ: + if FREQ[frag]: tmplist.append(i) i += 1 frag = sentence[k:i + 1] @@ -165,7 +165,7 @@ def get_DAG(sentence): DAG[k] = tmplist return DAG -re_eng = re.compile(r'[a-zA-Z0-9]', re.U) +re_eng = re.compile('[a-zA-Z0-9]', re.U) def __cut_DAG_NO_HMM(sentence): @@ -210,7 +210,7 @@ def __cut_DAG(sentence): yield buf buf = '' else: - if buf not in FREQ: + if not FREQ.get(buf): recognized = finalseg.cut(buf) for t in recognized: yield t @@ -224,7 +224,7 @@ def __cut_DAG(sentence): if buf: if len(buf) == 1: yield buf - elif buf not in FREQ: + elif not FREQ.get(buf): recognized = finalseg.cut(buf) for t in recognized: yield t @@ -288,12 +288,12 @@ def cut_for_search(sentence, HMM=True): if len(w) > 2: for i in xrange(len(w) - 1): gram2 = w[i:i + 2] - if gram2 in FREQ: + if FREQ.get(gram2): yield gram2 if len(w) > 3: for i in xrange(len(w) - 2): gram3 = w[i:i + 3] - if gram3 in FREQ: + if FREQ.get(gram3): yield gram3 yield w @@ -324,14 +324,16 @@ def load_userdict(f): @require_initialized def add_word(word, freq, tag=None): - global FREQ, total, user_word_tag_tab + global FREQ, total, user_word_tag_tab freq = int(freq) FREQ[word] = freq total += freq if tag is not None: user_word_tag_tab[word] = tag for ch in xrange(len(word)): - pfdict.add(word[:ch + 1]) + wfrag = word[:ch + 1] + if wfrag not in FREQ: + FREQ[wfrag] = 0 __ref_cut = cut __ref_cut_for_search = cut_for_search @@ -430,12 +432,12 @@ def tokenize(unicode_sentence, 
mode="default", HMM=True): if len(w) > 2: for i in xrange(len(w) - 1): gram2 = w[i:i + 2] - if gram2 in FREQ: + if FREQ.get(gram2): yield (gram2, start + i, start + i + 2) if len(w) > 3: for i in xrange(len(w) - 2): gram3 = w[i:i + 3] - if gram3 in FREQ: + if FREQ.get(gram3): yield (gram3, start + i, start + i + 3) yield (w, start, start + width) start += width diff --git a/test/test_cut_for_search.py b/test/test_cut_for_search.py index ea86c16..db8e9e5 100644 --- a/test/test_cut_for_search.py +++ b/test/test_cut_for_search.py @@ -95,4 +95,4 @@ if __name__ == "__main__": cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') cuttest('AT&T是一件不错的公司,给你发offer了吗?') cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') - cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。') \ No newline at end of file + cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。') diff --git a/test/test_cutall.py b/test/test_cutall.py index ab01b38..d42da32 100644 --- a/test/test_cutall.py +++ b/test/test_cutall.py @@ -1,4 +1,5 @@ #encoding=utf-8 +from __future__ import print_function import sys sys.path.append("../") import jieba @@ -94,4 +95,4 @@ if __name__ == "__main__": cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') cuttest('AT&T是一件不错的公司,给你发offer了吗?') cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') - cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。') \ No newline at end of file + cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。') diff --git a/test/test_file.py b/test/test_file.py index 4d0650f..f33d81e 100644 --- a/test/test_file.py +++ b/test/test_file.py @@ -1,4 +1,4 @@ -import sys,time +import time import sys sys.path.append("../") import jieba diff --git a/test/test_pos_no_hmm.py b/test/test_pos_no_hmm.py index 491cae1..7d9c1d5 100644 --- a/test/test_pos_no_hmm.py +++ b/test/test_pos_no_hmm.py @@ -1,4 +1,5 @@ #encoding=utf-8 +from __future__ import print_function import sys sys.path.append("../") import jieba.posseg as pseg @@ -95,4 +96,4 @@ if __name__ == "__main__": cuttest('AT&T是一件不错的公司,给你发offer了吗?') cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。') - 
cuttest('枪杆子中出政权') \ No newline at end of file + cuttest('枪杆子中出政权') diff --git a/test/test_userdict.py b/test/test_userdict.py index 977a991..ccb9758 100644 --- a/test/test_userdict.py +++ b/test/test_userdict.py @@ -1,4 +1,5 @@ #encoding=utf-8 +from __future__ import print_function import sys sys.path.append("../") import jieba