Merge pull request #195 from gumblex/master

Unify the keyword extraction interface; improve cache file naming (统一获取关键词接口,优化缓存命名)
This commit is contained in:
Sun Junyi 2014-11-01 12:54:57 +08:00
commit cf2aa88122
6 changed files with 74 additions and 64 deletions

View File

@@ -153,17 +153,16 @@ jieba.analyse.textrank(raw_text)
 来自`__main__`的示例结果:
 ```
-吉林 100.0
-欧亚 86.4592606421
-置业 55.3262889963
-实现 52.0353476663
-收入 37.9475518129
-增资 35.5042189944
-子公司 34.9286032861
-全资 30.8154823412
-城市 30.6031961172
-商业 30.4779050167
+吉林 1.0
+欧亚 0.864834432786
+置业 0.553465925497
+实现 0.520660869531
+收入 0.379699688954
+增资 0.355086023683
+子公司 0.349758490263
+全资 0.308537396283
+城市 0.306103738053
+商业 0.304837414946
 ```
 4) : 词性标注
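The README example reflects the rescaled TextRank weights: the `* 100` factor was dropped from the rank normalization (see the textrank diff below), so the top keyword now scores 1.0 instead of 100.0. A minimal sketch of regenerating such a list, assuming `jieba.analyse.textrank` is exposed as the README line quoted in the hunk header, under Python 2 as used by this codebase:

```python
# -*- coding: utf-8 -*-
# Hedged sketch: raw_text is a shortened stand-in document; the exact
# weights depend on the dictionaries shipped with this jieba version.
import jieba.analyse

raw_text = u"此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元。"
for word, weight in jieba.analyse.textrank(raw_text, topK=10, withWeight=True):
    print word, weight   # weights now fall in (0, 1], with the top-ranked word at 1.0
```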

View File

@@ -14,6 +14,7 @@ import random
 import threading
 from functools import wraps
 import logging
+from hashlib import md5

 DICTIONARY = "dict.txt"
 DICT_LOCK = threading.RLock()
@@ -53,12 +54,10 @@ def gen_pfdict(f_name):
                 raise ValueError, e
     return pfdict, lfreq, ltotal

-def initialize(*args):
-    global pfdict, FREQ, total, min_freq, initialized
-    if not args:
+def initialize(dictionary=None):
+    global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK
+    if not dictionary:
         dictionary = DICTIONARY
-    else:
-        dictionary = args[0]
     with DICT_LOCK:
         if initialized:
             return
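With the `*args` form gone, `initialize` takes an optional dictionary path directly. A minimal usage sketch; the alternative dictionary name is hypothetical, and as the next hunk shows, the path is resolved relative to the jieba package directory:

```python
# -*- coding: utf-8 -*-
import jieba

jieba.initialize()                     # eagerly build the prefix dict from the default dict.txt
# jieba.initialize("dict.big.txt")     # hypothetical: a dictionary file inside the package directory
print "/".join(jieba.cut(u"我来到北京清华大学"))
```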
@@ -67,13 +66,13 @@ def initialize(*args):
             pfdict = None
         _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-        abs_path = os.path.join(_curpath,dictionary)
+        abs_path = os.path.join(_curpath, dictionary)
         logger.debug("Building prefix dict from %s ..." % abs_path)
         t1 = time.time()
         if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
             cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
         else: #custom dictionary
-            cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))
+            cache_file = os.path.join(tempfile.gettempdir(), "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())
         load_from_cache_fail = True
         if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
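For custom dictionaries the cache file name is now derived from an MD5 digest of the dictionary path instead of Python's built-in `hash()`, which gives a stable, filesystem-safe name across interpreter builds. A small sketch of the new naming scheme (the dictionary path below is made up):

```python
import os
import tempfile
from hashlib import md5

abs_path = "/home/user/mydict.txt"   # hypothetical custom dictionary path
cache_file = os.path.join(tempfile.gettempdir(),
                          "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())
print cache_file   # e.g. /tmp/jieba.u<32-char-hex-digest>.cache
```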
@@ -87,18 +86,18 @@ def initialize(*args):
         if load_from_cache_fail:
             pfdict,FREQ,total = gen_pfdict(abs_path)
-            FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize
+            FREQ = dict((k,log(float(v)/total)) for k,v in FREQ.iteritems()) #normalize
             min_freq = min(FREQ.itervalues())
             logger.debug("Dumping model to file cache %s" % cache_file)
             try:
-                tmp_suffix = "."+str(random.random())
-                with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
+                fd, fpath = tempfile.mkstemp()
+                with os.fdopen(fd, 'wb') as temp_cache_file:
                     marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
                 if os.name == 'nt':
                     from shutil import move as replace_file
                 else:
                     replace_file = os.rename
-                replace_file(cache_file + tmp_suffix, cache_file)
+                replace_file(fpath, cache_file)
             except:
                 logger.exception("Dump cache file failed.")
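The cache dump also switches from a hand-rolled random suffix to `tempfile.mkstemp()`, and the temp file is only moved over the final name once it is fully written, so a concurrent reader never sees a half-written cache. A standalone sketch of the same pattern (payload and target name are made up):

```python
import marshal
import os
import tempfile
from shutil import move

data = ({"abc": 1}, {"abc": -8.0}, 100.0, -12.0)   # stand-in for (pfdict, FREQ, total, min_freq)
cache_file = os.path.join(tempfile.gettempdir(), "jieba.demo.cache")

fd, fpath = tempfile.mkstemp()                     # unique temp file, created safely
with os.fdopen(fd, 'wb') as temp_cache_file:
    marshal.dump(data, temp_cache_file)
# os.rename cannot overwrite an existing file on Windows, hence shutil.move there
replace_file = move if os.name == 'nt' else os.rename
replace_file(fpath, cache_file)
```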
@@ -136,12 +135,11 @@ def __cut_all(sentence):
             old_j = j

-def calc(sentence,DAG,idx,route):
+def calc(sentence, DAG, idx, route):
     N = len(sentence)
     route[N] = (0.0, '')
     for idx in xrange(N-1, -1, -1):
-        candidates = [(FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx]]
-        route[idx] = max(candidates)
+        route[idx] = max((FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx])

 @require_initialized
 def get_DAG(sentence):
@@ -166,7 +164,7 @@ def __cut_DAG_NO_HMM(sentence):
     re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
     DAG = get_DAG(sentence)
     route = {}
-    calc(sentence, DAG, 0, route=route)
+    calc(sentence, DAG, 0, route)
     x = 0
     N = len(sentence)
     buf = u''
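The `calc` rewrite folds the candidate list into a single `max()` over a generator, but the dynamic programme is unchanged: for each position it keeps the best (log probability, word end) pair over all words the DAG allows at that position. A toy sketch with made-up frequencies:

```python
from math import log

sentence = u"abcd"
min_freq = log(1e-6)                                    # made-up floor for unknown words
FREQ = {u"ab": log(0.3), u"c": log(0.2), u"cd": log(0.4), u"d": log(0.1)}
DAG = {0: [0, 1], 1: [1], 2: [2, 3], 3: [3]}            # DAG[i]: possible end positions of a word starting at i

route = {len(sentence): (0.0, '')}
for idx in xrange(len(sentence) - 1, -1, -1):
    route[idx] = max((FREQ.get(sentence[idx:x + 1], min_freq) + route[x + 1][0], x)
                     for x in DAG[idx])
print route[0]   # best total log probability and the end index of the first word
```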

View File

@@ -1,6 +1,7 @@
 #encoding=utf-8
 import jieba
 import os
+from operator import itemgetter
 try:
     from analyzer import ChineseAnalyzer
 except ImportError:
@@ -26,13 +27,11 @@ class IDFLoader:
         if self.path != new_idf_path:
             content = open(new_idf_path, 'rb').read().decode('utf-8')
             idf_freq = {}
-            lines = content.split('\n')
-            if lines and not lines[-1]:
-                lines.pop(-1)
+            lines = content.rstrip('\n').split('\n')
             for line in lines:
                 word, freq = line.split(' ')
                 idf_freq[word] = float(freq)
-            median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
+            median_idf = sorted(idf_freq.values())[len(idf_freq)//2]
             self.idf_freq = idf_freq
             self.median_idf = median_idf
             self.path = new_idf_path
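The trailing-empty-line handling collapses into a single `rstrip('\n')`, and `//` makes the integer division for the median index explicit (under Python 2 the old `/` already truncated, so behavior is unchanged). A tiny sketch with made-up idf.txt content:

```python
# -*- coding: utf-8 -*-
content = u"吉林 5.2\n欧亚 7.1\n置业 6.0\n"     # made-up idf.txt content; note the trailing newline

idf_freq = {}
for line in content.rstrip('\n').split('\n'):   # rstrip drops the would-be empty last element
    word, freq = line.split(' ')
    idf_freq[word] = float(freq)
median_idf = sorted(idf_freq.values())[len(idf_freq) // 2]
print median_idf   # 6.0: the middle of the sorted IDF values, used for words missing from the table
```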
@@ -60,27 +59,32 @@ def set_stop_words(stop_words_path):
             STOP_WORDS.add(line)

 def extract_tags(sentence, topK=20, withWeight=False):
-    global STOP_WORDS
+    """
+    Extract keywords from sentence using TF-IDF algorithm.
+    Parameter:
+        - topK: return how many top keywords. `None` for all possible words.
+        - withWeight: if True, return a list of (word, weight);
+                      if False, return a list of words.
+    """
+    global STOP_WORDS, idf_loader

     idf_freq, median_idf = idf_loader.get_idf()

     words = jieba.cut(sentence)
     freq = {}
     for w in words:
-        if len(w.strip()) < 2:
-            continue
-        if w.lower() in STOP_WORDS:
+        if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
             continue
         freq[w] = freq.get(w, 0.0) + 1.0
     total = sum(freq.values())
-    freq = [(k,v/total) for k,v in freq.iteritems()]
-
-    tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
-    st_list = sorted(tf_idf_list, reverse=True)
+    for k in freq:
+        freq[k] *= idf_freq.get(k, median_idf) / total

     if withWeight:
-        tags = st_list[:topK]
+        tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
     else:
-        top_tuples = st_list[:topK]
-        tags = [a[1] for a in top_tuples]
-    return tags
+        tags = sorted(freq, key=freq.__getitem__, reverse=True)
+    if topK:
+        return tags[:topK]
+    else:
+        return tags
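The reworked body computes TF-IDF in place on the `freq` dict and only slices to `topK` at the very end, which is what lets `topK=None` return every candidate. A toy sketch of that weighting with made-up counts and IDF values:

```python
# -*- coding: utf-8 -*-
from operator import itemgetter

freq = {u"吉林": 4.0, u"欧亚": 3.0, u"置业": 2.0}   # made-up term counts from jieba.cut
idf_freq = {u"吉林": 5.2, u"欧亚": 7.1}             # made-up IDF table
median_idf = 6.0                                    # fallback IDF for words missing from the table
total = sum(freq.values())

for k in freq:
    freq[k] *= idf_freq.get(k, median_idf) / total  # tf * idf, normalized by document length

print sorted(freq.items(), key=itemgetter(1), reverse=True)   # (word, weight), best first
print sorted(freq, key=freq.__getitem__, reverse=True)        # words only, same order
```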

View File

@@ -1,9 +1,10 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-import jieba.posseg as pseg
-import collections
 import sys
+import collections
+from operator import itemgetter
+import jieba.posseg as pseg

 class UndirectWeightedGraph:
     d = 0.85
@@ -41,17 +42,25 @@ class UndirectWeightedGraph:
                 max_rank = w

         for n, w in ws.items():
-            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0) * 100
+            # to unify the weights, don't *100.
+            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

         return ws

-def textrank(raw, topk=10):
+def textrank(sentence, topK=10, withWeight=False):
+    """
+    Extract keywords from sentence using TextRank algorithm.
+    Parameter:
+        - topK: return how many top keywords. `None` for all possible words.
+        - withWeight: if True, return a list of (word, weight);
+                      if False, return a list of words.
+    """
     pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
     g = UndirectWeightedGraph()
     cm = collections.defaultdict(int)
     span = 5
-    words = [x for x in pseg.cut(raw)]
+    words = list(pseg.cut(sentence))
     for i in xrange(len(words)):
         if words[i].flag in pos_filt:
             for j in xrange(i + 1, i + span):
@@ -65,10 +74,16 @@ def textrank(raw, topk=10):
                 g.addEdge(terms[0], terms[1], w)
     nodes_rank = g.rank()
-    nrs = sorted(nodes_rank.items(), key=lambda x: x[1], reverse=True)
-    return nrs[:topk]
+    if withWeight:
+        tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
+    else:
+        tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
+    if topK:
+        return tags[:topK]
+    else:
+        return tags

 if __name__ == '__main__':
     s = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"
-    for x, w in textrank(s):
+    for x, w in textrank(s, withWeight=True):
         print x, w
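After this change `textrank` mirrors `extract_tags`: the same `topK`/`withWeight` parameters and the same return shapes, which is the unified keyword interface the commit title refers to. A hedged usage sketch, assuming this file lives at jieba/analyse/textrank.py as in the jieba source tree and that the sample text stands in for a real document:

```python
# -*- coding: utf-8 -*-
import jieba.analyse
from jieba.analyse.textrank import textrank

s = u"此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元。"   # shortened sample text
print jieba.analyse.extract_tags(s, topK=5)            # TF-IDF keywords, words only
print textrank(s, topK=5, withWeight=True)             # TextRank keywords as (word, weight)
print len(textrank(s, topK=None))                      # topK=None returns every candidate
```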

View File

@@ -19,25 +19,22 @@ PrevStatus = {
 }

 def load_model():
-    _curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

     start_p = {}
     abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, mode='r') as f:
+    with open(abs_path, 'rb') as f:
         start_p = marshal.load(f)
-    f.closed

     trans_p = {}
     abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
         trans_p = marshal.load(f)
-    f.closed

     emit_p = {}
     abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
         emit_p = marshal.load(f)
-    f.closed

     return start_p, trans_p, emit_p
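Opening the model files in `'rb'` matters because `marshal` data is binary (text mode can corrupt it on Windows), and the bare `f.closed` lines were no-ops since the `with` block already closes the file. A minimal round-trip sketch with a made-up probability table:

```python
import marshal
import os
import tempfile

start_p = {'B': -0.26, 'E': -3.14, 'M': -3.14, 'S': -1.05}   # made-up start probabilities
fd, path = tempfile.mkstemp()
with os.fdopen(fd, 'wb') as f:      # binary mode for writing...
    marshal.dump(start_p, f)
with open(path, 'rb') as f:         # ...and 'rb' for reading, as in the updated load_model
    print marshal.load(f)
os.remove(path)
```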

View File

@@ -25,27 +25,24 @@ def load_model(f_name, isJython=True):
                 continue
             word, _, tag = line.split(' ')
             result[word.decode('utf-8')] = tag
-    f.closed
     if not isJython:
         return result

     start_p = {}
     abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, mode='r') as f:
+    with open(abs_path, 'rb') as f:
         start_p = marshal.load(f)
-    f.closed

     trans_p = {}
     abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
         trans_p = marshal.load(f)
-    f.closed

     emit_p = {}
     abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
         emit_p = marshal.load(f)
-    f.closed

     state = {}
     abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)