Merge pull request #195 from gumblex/master

Unify the keyword extraction interface; improve cache file naming (统一获取关键词接口,优化缓存命名)
This commit is contained in:
Sun Junyi 2014-11-01 12:54:57 +08:00
commit cf2aa88122
6 changed files with 74 additions and 64 deletions

View File

@@ -153,17 +153,16 @@ jieba.analyse.textrank(raw_text)
 来自`__main__`的示例结果:
 ```
-吉林 100.0
-欧亚 86.4592606421
-置业 55.3262889963
-实现 52.0353476663
-收入 37.9475518129
-增资 35.5042189944
-子公司 34.9286032861
-全资 30.8154823412
-城市 30.6031961172
-商业 30.4779050167
+吉林 1.0
+欧亚 0.864834432786
+置业 0.553465925497
+实现 0.520660869531
+收入 0.379699688954
+增资 0.355086023683
+子公司 0.349758490263
+全资 0.308537396283
+城市 0.306103738053
+商业 0.304837414946
 ```
 4) : 词性标注
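The README example reflects the rescaled TextRank weights: the `* 100` factor was dropped from the rank normalization (see the textrank diff below), so the top keyword now scores 1.0 instead of 100.0. A minimal sketch of regenerating such a list, assuming `jieba.analyse.textrank` is exposed as the README line quoted in the hunk header, under Python 2 as used by this codebase:

```python
# -*- coding: utf-8 -*-
# Hedged sketch: raw_text is a shortened stand-in document; the exact
# weights depend on the dictionaries shipped with this jieba version.
import jieba.analyse

raw_text = u"此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元。"
for word, weight in jieba.analyse.textrank(raw_text, topK=10, withWeight=True):
    print word, weight   # weights now fall in (0, 1], with the top-ranked word at 1.0
```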

View File

@@ -14,6 +14,7 @@ import random
 import threading
 from functools import wraps
 import logging
+from hashlib import md5

 DICTIONARY = "dict.txt"
 DICT_LOCK = threading.RLock()
@@ -53,12 +54,10 @@ def gen_pfdict(f_name):
                 raise ValueError, e
     return pfdict, lfreq, ltotal

-def initialize(*args):
-    global pfdict, FREQ, total, min_freq, initialized
-    if not args:
+def initialize(dictionary=None):
+    global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK
+    if not dictionary:
         dictionary = DICTIONARY
-    else:
-        dictionary = args[0]
     with DICT_LOCK:
         if initialized:
             return
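With the `*args` form gone, `initialize` takes an optional dictionary path directly. A minimal usage sketch; the alternative dictionary name is hypothetical, and as the next hunk shows, the path is resolved relative to the jieba package directory:

```python
# -*- coding: utf-8 -*-
import jieba

jieba.initialize()                     # eagerly build the prefix dict from the default dict.txt
# jieba.initialize("dict.big.txt")     # hypothetical: a dictionary file inside the package directory
print "/".join(jieba.cut(u"我来到北京清华大学"))
```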
@@ -67,13 +66,13 @@ def initialize(*args):
             pfdict = None
         _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-        abs_path = os.path.join(_curpath,dictionary)
+        abs_path = os.path.join(_curpath, dictionary)
         logger.debug("Building prefix dict from %s ..." % abs_path)
         t1 = time.time()
         if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
             cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
         else: #custom dictionary
-            cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))
+            cache_file = os.path.join(tempfile.gettempdir(), "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())
         load_from_cache_fail = True
         if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
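For custom dictionaries the cache file name is now derived from an MD5 digest of the dictionary path instead of Python's built-in `hash()`, which gives a stable, filesystem-safe name across interpreter builds. A small sketch of the new naming scheme (the dictionary path below is made up):

```python
import os
import tempfile
from hashlib import md5

abs_path = "/home/user/mydict.txt"   # hypothetical custom dictionary path
cache_file = os.path.join(tempfile.gettempdir(),
                          "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())
print cache_file   # e.g. /tmp/jieba.u<32-char-hex-digest>.cache
```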
@@ -87,18 +86,18 @@ def initialize(*args):
         if load_from_cache_fail:
             pfdict,FREQ,total = gen_pfdict(abs_path)
-            FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize
+            FREQ = dict((k,log(float(v)/total)) for k,v in FREQ.iteritems()) #normalize
             min_freq = min(FREQ.itervalues())
             logger.debug("Dumping model to file cache %s" % cache_file)
             try:
-                tmp_suffix = "."+str(random.random())
-                with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
+                fd, fpath = tempfile.mkstemp()
+                with os.fdopen(fd, 'wb') as temp_cache_file:
                     marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
                 if os.name == 'nt':
                     from shutil import move as replace_file
                 else:
                     replace_file = os.rename
-                replace_file(cache_file + tmp_suffix, cache_file)
+                replace_file(fpath, cache_file)
             except:
                 logger.exception("Dump cache file failed.")
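The cache dump also switches from a hand-rolled random suffix to `tempfile.mkstemp()`, and the temp file is only moved over the final name once it is fully written, so a concurrent reader never sees a half-written cache. A standalone sketch of the same pattern (payload and target name are made up):

```python
import marshal
import os
import tempfile
from shutil import move

data = ({"abc": 1}, {"abc": -8.0}, 100.0, -12.0)   # stand-in for (pfdict, FREQ, total, min_freq)
cache_file = os.path.join(tempfile.gettempdir(), "jieba.demo.cache")

fd, fpath = tempfile.mkstemp()                     # unique temp file, created safely
with os.fdopen(fd, 'wb') as temp_cache_file:
    marshal.dump(data, temp_cache_file)
# os.rename cannot overwrite an existing file on Windows, hence shutil.move there
replace_file = move if os.name == 'nt' else os.rename
replace_file(fpath, cache_file)
```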
@@ -136,12 +135,11 @@ def __cut_all(sentence):
             old_j = j

-def calc(sentence,DAG,idx,route):
+def calc(sentence, DAG, idx, route):
     N = len(sentence)
     route[N] = (0.0, '')
     for idx in xrange(N-1, -1, -1):
-        candidates = [(FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx]]
-        route[idx] = max(candidates)
+        route[idx] = max((FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx])

 @require_initialized
 def get_DAG(sentence):
@@ -166,7 +164,7 @@ def __cut_DAG_NO_HMM(sentence):
     re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
     DAG = get_DAG(sentence)
     route = {}
-    calc(sentence, DAG, 0, route=route)
+    calc(sentence, DAG, 0, route)
     x = 0
     N = len(sentence)
     buf = u''
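The `calc` rewrite folds the candidate list into a single `max()` over a generator, but the dynamic programme is unchanged: for each position it keeps the best (log probability, word end) pair over all words the DAG allows at that position. A toy sketch with made-up frequencies:

```python
from math import log

sentence = u"abcd"
min_freq = log(1e-6)                                    # made-up floor for unknown words
FREQ = {u"ab": log(0.3), u"c": log(0.2), u"cd": log(0.4), u"d": log(0.1)}
DAG = {0: [0, 1], 1: [1], 2: [2, 3], 3: [3]}            # DAG[i]: possible end positions of a word starting at i

route = {len(sentence): (0.0, '')}
for idx in xrange(len(sentence) - 1, -1, -1):
    route[idx] = max((FREQ.get(sentence[idx:x + 1], min_freq) + route[x + 1][0], x)
                     for x in DAG[idx])
print route[0]   # best total log probability and the end index of the first word
```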

View File

@@ -1,6 +1,7 @@
 #encoding=utf-8
 import jieba
 import os
+from operator import itemgetter
 try:
     from analyzer import ChineseAnalyzer
 except ImportError:
@@ -26,13 +27,11 @@ class IDFLoader:
         if self.path != new_idf_path:
             content = open(new_idf_path, 'rb').read().decode('utf-8')
             idf_freq = {}
-            lines = content.split('\n')
-            if lines and not lines[-1]:
-                lines.pop(-1)
+            lines = content.rstrip('\n').split('\n')
             for line in lines:
                 word, freq = line.split(' ')
                 idf_freq[word] = float(freq)
-            median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
+            median_idf = sorted(idf_freq.values())[len(idf_freq)//2]
             self.idf_freq = idf_freq
             self.median_idf = median_idf
             self.path = new_idf_path
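The trailing-empty-line handling collapses into a single `rstrip('\n')`, and `//` makes the integer division for the median index explicit (under Python 2 the old `/` already truncated, so behavior is unchanged). A tiny sketch with made-up idf.txt content:

```python
# -*- coding: utf-8 -*-
content = u"吉林 5.2\n欧亚 7.1\n置业 6.0\n"     # made-up idf.txt content; note the trailing newline

idf_freq = {}
for line in content.rstrip('\n').split('\n'):   # rstrip drops the would-be empty last element
    word, freq = line.split(' ')
    idf_freq[word] = float(freq)
median_idf = sorted(idf_freq.values())[len(idf_freq) // 2]
print median_idf   # 6.0: the middle of the sorted IDF values, used for words missing from the table
```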
@@ -60,27 +59,32 @@ def set_stop_words(stop_words_path):
             STOP_WORDS.add(line)

 def extract_tags(sentence, topK=20, withWeight=False):
-    global STOP_WORDS
+    """
+    Extract keywords from sentence using TF-IDF algorithm.
+    Parameter:
+        - topK: return how many top keywords. `None` for all possible words.
+        - withWeight: if True, return a list of (word, weight);
+                      if False, return a list of words.
+    """
+    global STOP_WORDS, idf_loader

     idf_freq, median_idf = idf_loader.get_idf()

     words = jieba.cut(sentence)
     freq = {}
     for w in words:
-        if len(w.strip()) < 2:
-            continue
-        if w.lower() in STOP_WORDS:
+        if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
             continue
         freq[w] = freq.get(w, 0.0) + 1.0
     total = sum(freq.values())
-    freq = [(k,v/total) for k,v in freq.iteritems()]
-
-    tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
-    st_list = sorted(tf_idf_list, reverse=True)
+    for k in freq:
+        freq[k] *= idf_freq.get(k, median_idf) / total

     if withWeight:
-        tags = st_list[:topK]
+        tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
     else:
-        top_tuples = st_list[:topK]
-        tags = [a[1] for a in top_tuples]
-    return tags
+        tags = sorted(freq, key=freq.__getitem__, reverse=True)
+    if topK:
+        return tags[:topK]
+    else:
+        return tags
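The reworked body computes TF-IDF in place on the `freq` dict and only slices to `topK` at the very end, which is what lets `topK=None` return every candidate. A toy sketch of that weighting with made-up counts and IDF values:

```python
# -*- coding: utf-8 -*-
from operator import itemgetter

freq = {u"吉林": 4.0, u"欧亚": 3.0, u"置业": 2.0}   # made-up term counts from jieba.cut
idf_freq = {u"吉林": 5.2, u"欧亚": 7.1}             # made-up IDF table
median_idf = 6.0                                    # fallback IDF for words missing from the table
total = sum(freq.values())

for k in freq:
    freq[k] *= idf_freq.get(k, median_idf) / total  # tf * idf, normalized by document length

print sorted(freq.items(), key=itemgetter(1), reverse=True)   # (word, weight), best first
print sorted(freq, key=freq.__getitem__, reverse=True)        # words only, same order
```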

View File

@@ -1,9 +1,10 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-import jieba.posseg as pseg
-import collections
 import sys
+import collections
+from operator import itemgetter
+import jieba.posseg as pseg

 class UndirectWeightedGraph:
     d = 0.85
@@ -41,17 +42,25 @@ class UndirectWeightedGraph:
                 max_rank = w

         for n, w in ws.items():
-            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0) * 100
+            # to unify the weights, don't *100.
+            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

         return ws

-def textrank(raw, topk=10):
+def textrank(sentence, topK=10, withWeight=False):
+    """
+    Extract keywords from sentence using TextRank algorithm.
+    Parameter:
+        - topK: return how many top keywords. `None` for all possible words.
+        - withWeight: if True, return a list of (word, weight);
+                      if False, return a list of words.
+    """
     pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
     g = UndirectWeightedGraph()
     cm = collections.defaultdict(int)
     span = 5
-    words = [x for x in pseg.cut(raw)]
+    words = list(pseg.cut(sentence))
     for i in xrange(len(words)):
         if words[i].flag in pos_filt:
             for j in xrange(i + 1, i + span):
@@ -65,10 +74,16 @@ def textrank(raw, topk=10):
                 g.addEdge(terms[0], terms[1], w)
     nodes_rank = g.rank()
-    nrs = sorted(nodes_rank.items(), key=lambda x: x[1], reverse=True)
-    return nrs[:topk]
+    if withWeight:
+        tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
+    else:
+        tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
+    if topK:
+        return tags[:topK]
+    else:
+        return tags

 if __name__ == '__main__':
     s = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"
-    for x, w in textrank(s):
+    for x, w in textrank(s, withWeight=True):
         print x, w
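After this change `textrank` mirrors `extract_tags`: the same `topK`/`withWeight` parameters and the same return shapes, which is the unified keyword interface the commit title refers to. A hedged usage sketch, assuming this file lives at jieba/analyse/textrank.py as in the jieba source tree and that the sample text stands in for a real document:

```python
# -*- coding: utf-8 -*-
import jieba.analyse
from jieba.analyse.textrank import textrank

s = u"此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元。"   # shortened sample text
print jieba.analyse.extract_tags(s, topK=5)            # TF-IDF keywords, words only
print textrank(s, topK=5, withWeight=True)             # TextRank keywords as (word, weight)
print len(textrank(s, topK=None))                      # topK=None returns every candidate
```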

View File

@@ -19,25 +19,22 @@ PrevStatus = {
 }

 def load_model():
-    _curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

     start_p = {}
     abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, mode='r') as f:
+    with open(abs_path, 'rb') as f:
         start_p = marshal.load(f)
-    f.closed

     trans_p = {}
     abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
         trans_p = marshal.load(f)
-    f.closed

     emit_p = {}
     abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
         emit_p = marshal.load(f)
-    f.closed

     return start_p, trans_p, emit_p
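Opening the model files in `'rb'` matters because `marshal` data is binary (text mode can corrupt it on Windows), and the bare `f.closed` lines were no-ops since the `with` block already closes the file. A minimal round-trip sketch with a made-up probability table:

```python
import marshal
import os
import tempfile

start_p = {'B': -0.26, 'E': -3.14, 'M': -3.14, 'S': -1.05}   # made-up start probabilities
fd, path = tempfile.mkstemp()
with os.fdopen(fd, 'wb') as f:      # binary mode for writing...
    marshal.dump(start_p, f)
with open(path, 'rb') as f:         # ...and 'rb' for reading, as in the updated load_model
    print marshal.load(f)
os.remove(path)
```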

View File

@@ -25,27 +25,24 @@ def load_model(f_name, isJython=True):
                 continue
             word, _, tag = line.split(' ')
             result[word.decode('utf-8')] = tag
-    f.closed
     if not isJython:
         return result

     start_p = {}
     abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, mode='r') as f:
+    with open(abs_path, 'rb') as f:
         start_p = marshal.load(f)
-    f.closed

     trans_p = {}
     abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
         trans_p = marshal.load(f)
-    f.closed

     emit_p = {}
     abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
         emit_p = marshal.load(f)
-    f.closed

     state = {}
     abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)