diff --git a/jieba/__init__.py b/jieba/__init__.py
index 27c9dc2..c41bdea 100644
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -20,10 +20,12 @@ DICTIONARY = "dict.txt"
 DICT_LOCK = threading.RLock()
 pfdict = None # to be initialized
 FREQ = {}
-min_freq = 0.0
-total = 0.0
+total = 0
 user_word_tag_tab = {}
 initialized = False
+pool = None
+
+_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 
 log_console = logging.StreamHandler(sys.stderr)
 logger = logging.getLogger(__name__)
@@ -37,14 +39,14 @@ def setLogLevel(log_level):
 def gen_pfdict(f_name):
     lfreq = {}
     pfdict = set()
-    ltotal = 0.0
+    ltotal = 0
     with open(f_name, 'rb') as f:
         lineno = 0
         for line in f.read().rstrip().decode('utf-8').split('\n'):
             lineno += 1
             try:
-                word,freq = line.split(' ')[:2]
-                freq = float(freq)
+                word, freq = line.split(' ')[:2]
+                freq = int(freq)
                 lfreq[word] = freq
                 ltotal += freq
                 for ch in xrange(len(word)):
@@ -61,10 +63,6 @@ def initialize(dictionary=None):
     with DICT_LOCK:
         if initialized:
             return
-        if pfdict:
-            del pfdict
-            pfdict = None
-        _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
         abs_path = os.path.join(_curpath, dictionary)
         logger.debug("Building prefix dict from %s ..." % abs_path)
@@ -75,31 +73,29 @@ def initialize(dictionary=None):
         cache_file = os.path.join(tempfile.gettempdir(), "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())
         load_from_cache_fail = True
-        if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
+        if os.path.isfile(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
             logger.debug("Loading model from cache %s" % cache_file)
             try:
                 with open(cache_file, 'rb') as cf:
-                    pfdict,FREQ,total,min_freq = marshal.load(cf)
+                    pfdict, FREQ, total = marshal.load(cf)
                 # prevent conflict with old version
                 load_from_cache_fail = not isinstance(pfdict, set)
-            except:
+            except Exception:
                 load_from_cache_fail = True
         if load_from_cache_fail:
-            pfdict,FREQ,total = gen_pfdict(abs_path)
-            FREQ = dict((k,log(float(v)/total)) for k,v in FREQ.iteritems()) #normalize
-            min_freq = min(FREQ.itervalues())
+            pfdict, FREQ, total = gen_pfdict(abs_path)
             logger.debug("Dumping model to file cache %s" % cache_file)
             try:
                 fd, fpath = tempfile.mkstemp()
                 with os.fdopen(fd, 'wb') as temp_cache_file:
-                    marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
+                    marshal.dump((pfdict, FREQ, total), temp_cache_file)
                 if os.name == 'nt':
                     from shutil import move as replace_file
                 else:
                     replace_file = os.rename
                 replace_file(fpath, cache_file)
-            except:
+            except Exception:
                 logger.exception("Dump cache file failed.")
 
         initialized = True
@@ -140,7 +136,7 @@ def calc(sentence, DAG, route):
     N = len(sentence)
     route[N] = (0.0, '')
     for idx in xrange(N-1, -1, -1):
-        route[idx] = max((FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx])
+        route[idx] = max((log(FREQ.get(sentence[idx:x+1], 1)) - log(total) + route[x+1][0], x) for x in DAG[idx])
 
 @require_initialized
 def get_DAG(sentence):
@@ -203,7 +199,7 @@ def __cut_DAG(sentence):
                     yield buf
                     buf = u''
                 else:
-                    if (buf not in FREQ):
+                    if buf not in FREQ:
                         recognized = finalseg.cut(buf)
                         for t in recognized:
                             yield t
@@ -217,7 +213,7 @@ def __cut_DAG(sentence):
     if buf:
         if len(buf) == 1:
             yield buf
-        elif (buf not in FREQ):
+        elif buf not in FREQ:
             recognized = finalseg.cut(buf)
             for t in recognized:
                 yield t
@@ -298,26 +294,24 @@ def load_userdict(f):
     '''
     if isinstance(f, (str, unicode)):
         f = open(f, 'rb')
-    content = f.read().decode('utf-8')
+    content = f.read().decode('utf-8').lstrip(u'\ufeff')
     line_no = 0
     for line in content.split("\n"):
         line_no += 1
         if not line.rstrip():
             continue
-        tup = line.split(" ")
-        word, freq = tup[0], tup[1]
-        if freq.isdigit() is False:
-            continue
-        if line_no == 1:
-            word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
-        add_word(*tup)
+        tup = line.strip().split(" ")
+        if tup[1].isdigit():
+            add_word(*tup)
 
 @require_initialized
 def add_word(word, freq, tag=None):
     global FREQ, pfdict, total, user_word_tag_tab
-    FREQ[word] = log(float(freq) / total)
+    freq = int(freq)
+    FREQ[word] = freq
+    total += freq
     if tag is not None:
-        user_word_tag_tab[word] = tag.strip()
+        user_word_tag_tab[word] = tag
     for ch in xrange(len(word)):
         pfdict.add(word[:ch+1])
 
@@ -369,8 +363,8 @@ def enable_parallel(processnum=None):
     cut_for_search = pcut_for_search
 
 def disable_parallel():
-    global pool,cut,cut_for_search
-    if 'pool' in globals():
+    global pool, cut, cut_for_search
+    if pool:
         pool.close()
         pool = None
     cut = __ref_cut
@@ -386,9 +380,7 @@ def set_dictionary(dictionary_path):
         initialized = False
 
 def get_abs_path_dict():
-    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-    abs_path = os.path.join(_curpath,DICTIONARY)
-    return abs_path
+    return os.path.join(_curpath, DICTIONARY)
 
 def tokenize(unicode_sentence, mode="default", HMM=True):
     """Tokenize a sentence and yields tuples of (word, start, end)
diff --git a/jieba/finalseg/__init__.py b/jieba/finalseg/__init__.py
index 3f24e51..646b750 100644
--- a/jieba/finalseg/__init__.py
+++ b/jieba/finalseg/__init__.py
@@ -41,8 +41,9 @@ def load_model():
 if sys.platform.startswith("java"):
     start_P, trans_P, emit_P = load_model()
 else:
-    import prob_start,prob_trans,prob_emit
-    start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P
+    from prob_start import P as start_P
+    from prob_trans import P as trans_P
+    from prob_emit import P as emit_P
 
 def viterbi(obs, states, start_p, trans_p, emit_p):
     V = [{}] #tabular
@@ -50,7 +51,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
     for y in states: #init
         V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
         path[y] = [y]
-    for t in xrange(1,len(obs)):
+    for t in xrange(1, len(obs)):
         V.append({})
         newpath = {}
         for y in states:
@@ -68,7 +69,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
 def __cut(sentence):
     global emit_P
     prob, pos_list = viterbi(sentence, ('B','M','E','S'), start_P, trans_P, emit_P)
-    begin, next = 0,0
+    begin, nexti = 0, 0
     #print pos_list, sentence
     for i,char in enumerate(sentence):
         pos = pos_list[i]
@@ -76,12 +77,12 @@ def __cut(sentence):
             begin = i
         elif pos == 'E':
             yield sentence[begin:i+1]
-            next = i+1
+            nexti = i+1
         elif pos == 'S':
             yield char
-            next = i+1
-    if next < len(sentence):
-        yield sentence[next:]
+            nexti = i+1
+    if nexti < len(sentence):
+        yield sentence[nexti:]
 
 def cut(sentence):
     if not isinstance(sentence, unicode):
diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py
index 484874d..e104374 100644
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -55,8 +55,11 @@ def load_model(f_name, isJython=True):
 if sys.platform.startswith("java"):
     char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
 else:
-    import char_state_tab, prob_start, prob_trans, prob_emit
-    char_state_tab_P, start_P, trans_P, emit_P = char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P
+    from char_state_tab import P as char_state_tab_P
+    from prob_start import P as start_P
+    from prob_trans import P as trans_P
+    from prob_emit import P as emit_P
+
     word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
 
 def makesure_userdict_loaded(fn):
@@ -165,16 +168,14 @@ def __cut_DAG(sentence):
             if buf:
                 if len(buf) == 1:
                     yield pair(buf, word_tag_tab.get(buf, 'x'))
-                    buf = u''
+                elif buf not in jieba.FREQ:
+                    recognized = __cut_detail(buf)
+                    for t in recognized:
+                        yield t
                 else:
-                    if (buf not in jieba.FREQ):
-                        recognized = __cut_detail(buf)
-                        for t in recognized:
-                            yield t
-                    else:
-                        for elem in buf:
-                            yield pair(elem, word_tag_tab.get(elem, 'x'))
-                    buf = u''
+                    for elem in buf:
+                        yield pair(elem, word_tag_tab.get(elem, 'x'))
+                buf = u''
             yield pair(l_word, word_tag_tab.get(l_word, 'x'))
         x = y
 
@@ -229,7 +230,7 @@ def __lcut_internal_no_hmm(sentence):
 
 @makesure_userdict_loaded
 def cut(sentence, HMM=True):
-    if (not hasattr(jieba, 'pool')) or (jieba.pool is None):
+    if jieba.pool is None:
         for w in __cut_internal(sentence, HMM=HMM):
             yield w
     else:
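
Note (reviewer sketch, not part of the patch): with this change FREQ keeps raw integer counts and calc() turns them into log probabilities at lookup time, with unseen spans falling back to a count of 1. Below is a minimal, self-contained illustration of that scoring over a toy DAG; the FREQ/total/DAG values are made up for the example, while in jieba they come from dict.txt and get_DAG().

    from math import log

    # toy data standing in for jieba's dictionary-derived globals
    FREQ = {u"a": 3, u"ab": 5, u"b": 2, u"bc": 4, u"c": 1}
    total = sum(FREQ.values())

    def calc(sentence, DAG, route):
        # right-to-left dynamic programming, mirroring the patched jieba.calc()
        N = len(sentence)
        route[N] = (0.0, '')
        logtotal = log(total)
        for idx in range(N - 1, -1, -1):
            # words missing from FREQ get a pseudo-count of 1
            route[idx] = max((log(FREQ.get(sentence[idx:x + 1], 1)) - logtotal + route[x + 1][0], x)
                             for x in DAG[idx])

    sentence = u"abc"
    DAG = {0: [0, 1], 1: [1, 2], 2: [2]}   # candidate words: "a"/"ab", "b"/"bc", "c"
    route = {}
    calc(sentence, DAG, route)
    print(route[0])   # (-2.93..., 0): best first word ends at index 0 ("a"), then "bc"

Compared with the removed min_freq scheme, an out-of-vocabulary span now scores log(1) - log(total) instead of the smallest dictionary log-probability.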
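
Note (reviewer sketch, not part of the patch): add_word() now stores the raw count, grows the running total, and registers every prefix of the word so get_DAG() can reach it, and load_userdict() strips a possible UTF-8 BOM once from the whole file rather than patching the first word. A toy version of that bookkeeping, with local stand-ins for jieba's module-level FREQ/pfdict/total and the POS-tag table omitted:

    FREQ = {}
    pfdict = set()
    total = 0

    def add_word(word, freq, tag=None):
        # mirror of the patched bookkeeping; tag handling omitted in this sketch
        global total
        freq = int(freq)
        FREQ[word] = freq
        total += freq
        for ch in range(len(word)):
            pfdict.add(word[:ch + 1])

    # each user-dict line is "word freq [tag]"; the BOM is stripped from the content once
    content = u"\ufeff\u4e91\u8ba1\u7b97 5\n\u521b\u65b0\u529e 3 i"
    for line in content.lstrip(u"\ufeff").split("\n"):
        parts = line.strip().split(" ")
        if parts[1].isdigit():
            add_word(*parts)

    print(total)            # 8
    print(sorted(pfdict))   # every prefix of both words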