Merge pull request #195 from gumblex/master

Unify the keyword-extraction interface; improve cache file naming
Sun Junyi 2014-11-01 12:54:57 +08:00
commit cf2aa88122
6 changed files with 74 additions and 64 deletions
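After this change both keyword extractors take the same `topK` and `withWeight` parameters. A usage sketch of the unified interface, assuming both functions are exposed from `jieba.analyse` as the README section below suggests; the sample text is excerpted from the test sentence at the bottom of this commit:

```python
# -*- coding: utf-8 -*-
# Usage sketch of the unified keyword-extraction interface (Python 2).
import jieba.analyse

text = u"此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。"

# TF-IDF keywords: a list of words, or (word, weight) pairs when withWeight=True.
for word, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
    print(u"%s %s" % (word, weight))

# TextRank keywords now accept the same topK / withWeight parameters.
for word, weight in jieba.analyse.textrank(text, topK=5, withWeight=True):
    print(u"%s %s" % (word, weight))
```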

View File

@@ -153,17 +153,16 @@ jieba.analyse.textrank(raw_text)
Example results from `__main__`:
```
吉林 100.0
欧亚 86.4592606421
置业 55.3262889963
实现 52.0353476663
收入 37.9475518129
增资 35.5042189944
子公司 34.9286032861
全资 30.8154823412
城市 30.6031961172
商业 30.4779050167
吉林 1.0
欧亚 0.864834432786
置业 0.553465925497
实现 0.520660869531
收入 0.379699688954
增资 0.355086023683
子公司 0.349758490263
全资 0.308537396283
城市 0.306103738053
商业 0.304837414946
```
4) Part-of-speech tagging

View File

@ -14,6 +14,7 @@ import random
import threading
from functools import wraps
import logging
from hashlib import md5
DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
@@ -53,12 +54,10 @@ def gen_pfdict(f_name):
raise ValueError, e
return pfdict, lfreq, ltotal
def initialize(*args):
global pfdict, FREQ, total, min_freq, initialized
if not args:
def initialize(dictionary=None):
global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK
if not dictionary:
dictionary = DICTIONARY
else:
dictionary = args[0]
with DICT_LOCK:
if initialized:
return
@@ -67,13 +66,13 @@ def initialize(*args):
pfdict = None
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
abs_path = os.path.join(_curpath,dictionary)
abs_path = os.path.join(_curpath, dictionary)
logger.debug("Building prefix dict from %s ..." % abs_path)
t1 = time.time()
if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
else: #custom dictionary
cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))
cache_file = os.path.join(tempfile.gettempdir(), "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())
load_from_cache_fail = True
if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
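The cache file for a custom dictionary is now named after an md5 digest of the dictionary's absolute path instead of the built-in `hash()`, so the name is stable across interpreter runs and platforms and is never negative. A standalone sketch of the naming scheme; the dictionary path below is made up:

```python
# Sketch of the new cache-file naming for a custom dictionary (hypothetical path).
import os
import tempfile
from hashlib import md5

abs_path = "/home/user/userdict.txt"  # made-up example
cache_file = os.path.join(
    tempfile.gettempdir(),
    "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())
print(cache_file)  # e.g. /tmp/jieba.u<32 hex digits>.cache
```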
@@ -87,18 +86,18 @@ def initialize(*args):
if load_from_cache_fail:
pfdict,FREQ,total = gen_pfdict(abs_path)
FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize
FREQ = dict((k,log(float(v)/total)) for k,v in FREQ.iteritems()) #normalize
min_freq = min(FREQ.itervalues())
logger.debug("Dumping model to file cache %s" % cache_file)
try:
tmp_suffix = "."+str(random.random())
with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
fd, fpath = tempfile.mkstemp()
with os.fdopen(fd, 'wb') as temp_cache_file:
marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
if os.name == 'nt':
from shutil import move as replace_file
else:
replace_file = os.rename
replace_file(cache_file + tmp_suffix, cache_file)
replace_file(fpath, cache_file)
except:
logger.exception("Dump cache file failed.")
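The dump now goes through `tempfile.mkstemp()` and the finished file is then moved over the cache path, with `shutil.move` on Windows and `os.rename` elsewhere, so an interrupted dump can no longer leave a half-written cache behind. A self-contained sketch of the same pattern; the helper name is illustrative:

```python
# Sketch of the write-then-replace pattern used for the cache dump.
import marshal
import os
import tempfile
from shutil import move

def dump_atomically(obj, cache_file):   # hypothetical helper, not part of jieba
    fd, fpath = tempfile.mkstemp()      # temp file in the same tempdir as the cache
    with os.fdopen(fd, 'wb') as tmp:
        marshal.dump(obj, tmp)
    # os.rename is atomic on POSIX but cannot overwrite on Windows; shutil.move can.
    replace_file = move if os.name == 'nt' else os.rename
    replace_file(fpath, cache_file)

dump_atomically({'demo': 1}, os.path.join(tempfile.gettempdir(), 'jieba.demo.cache'))
```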
@@ -136,12 +135,11 @@ def __cut_all(sentence):
old_j = j
def calc(sentence,DAG,idx,route):
def calc(sentence, DAG, idx, route):
N = len(sentence)
route[N] = (0.0, '')
for idx in xrange(N-1, -1, -1):
candidates = [(FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx]]
route[idx] = max(candidates)
route[idx] = max((FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx])
@require_initialized
def get_DAG(sentence):
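`calc` fills `route` from right to left: for each start position it keeps the best `(accumulated log-probability, end index)` pair over all words the DAG allows there, falling back to `min_freq` for unseen spans. A toy, self-contained sketch of the recurrence with made-up log frequencies and a made-up DAG:

```python
# Toy sketch of the route DP over a word DAG (all numbers are made up).
sentence = u"ABCD"
DAG = {0: [0, 1], 1: [1], 2: [2, 3], 3: [3]}   # DAG[i]: end indices of words starting at i
FREQ = {u"AB": -3.0, u"CD": -4.0, u"A": -5.0, u"B": -6.0, u"C": -7.0, u"D": -7.5}
min_freq = -20.0

N = len(sentence)
route = {N: (0.0, '')}
for idx in range(N - 1, -1, -1):
    route[idx] = max((FREQ.get(sentence[idx:x + 1], min_freq) + route[x + 1][0], x)
                     for x in DAG[idx])

# Following route[0][1] + 1, route[2][1] + 1, ... recovers the best path,
# which is "AB" / "CD" here.
print(route[0])  # (-7.0, 1)
```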
@@ -166,7 +164,7 @@ def __cut_DAG_NO_HMM(sentence):
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
DAG = get_DAG(sentence)
route = {}
calc(sentence, DAG, 0, route=route)
calc(sentence, DAG, 0, route)
x = 0
N = len(sentence)
buf = u''

View File

@@ -1,6 +1,7 @@
#encoding=utf-8
import jieba
import os
from operator import itemgetter
try:
from analyzer import ChineseAnalyzer
except ImportError:
@@ -26,13 +27,11 @@ class IDFLoader:
if self.path != new_idf_path:
content = open(new_idf_path, 'rb').read().decode('utf-8')
idf_freq = {}
lines = content.split('\n')
if lines and not lines[-1]:
lines.pop(-1)
lines = content.rstrip('\n').split('\n')
for line in lines:
word, freq = line.split(' ')
idf_freq[word] = float(freq)
median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
median_idf = sorted(idf_freq.values())[len(idf_freq)//2]
self.idf_freq = idf_freq
self.median_idf = median_idf
self.path = new_idf_path
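`rstrip('\n')` before the split drops the empty element a trailing newline would otherwise produce, so every remaining line is a `word freq` pair; the median IDF is the middle value of the sorted frequencies, with `//` keeping the index an integer. A tiny sketch with made-up IDF content:

```python
# Tiny sketch of the IDF parsing and median computation (made-up file content).
content = u"foo 11.74\nbar 7.32\nbaz 9.10\n"

idf_freq = {}
for line in content.rstrip('\n').split('\n'):   # no empty trailing element
    word, freq = line.split(' ')
    idf_freq[word] = float(freq)

median_idf = sorted(idf_freq.values())[len(idf_freq) // 2]
print(median_idf)  # 9.1, the middle of [7.32, 9.1, 11.74]
```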
@@ -60,27 +59,32 @@ def set_stop_words(stop_words_path):
STOP_WORDS.add(line)
def extract_tags(sentence, topK=20, withWeight=False):
global STOP_WORDS
"""
Extract keywords from sentence using TF-IDF algorithm.
Parameter:
- topK: return how many top keywords. `None` for all possible words.
- withWeight: if True, return a list of (word, weight);
if False, return a list of words.
"""
global STOP_WORDS, idf_loader
idf_freq, median_idf = idf_loader.get_idf()
words = jieba.cut(sentence)
freq = {}
for w in words:
if len(w.strip()) < 2:
continue
if w.lower() in STOP_WORDS:
if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
continue
freq[w] = freq.get(w, 0.0) + 1.0
total = sum(freq.values())
freq = [(k,v/total) for k,v in freq.iteritems()]
tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
st_list = sorted(tf_idf_list, reverse=True)
for k in freq:
freq[k] *= idf_freq.get(k, median_idf) / total
if withWeight:
tags = st_list[:topK]
tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
else:
top_tuples = st_list[:topK]
tags = [a[1] for a in top_tuples]
return tags
tags = sorted(freq, key=freq.__getitem__, reverse=True)
if topK:
return tags[:topK]
else:
return tags
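`extract_tags` now computes the TF-IDF weight in place, multiplying each raw count by the word's IDF (or the median IDF for unknown words) and dividing by the total count, and only then sorts either the `(word, weight)` pairs or the bare words. A small worked sketch of that weighting step with made-up counts and IDF values:

```python
# -*- coding: utf-8 -*-
# Worked sketch of the in-place TF-IDF weighting (made-up counts and IDF values).
freq = {u"吉林": 3.0, u"置业": 2.0, u"收入": 1.0}    # raw term counts
idf_freq = {u"吉林": 8.0, u"置业": 9.5}              # "收入" falls back to the median
median_idf = 7.0

total = sum(freq.values())                           # 6.0
for k in freq:
    freq[k] *= idf_freq.get(k, median_idf) / total

# 吉林: 3*8/6 = 4.0, 置业: 2*9.5/6 ≈ 3.17, 收入: 1*7/6 ≈ 1.17
tags = sorted(freq, key=freq.__getitem__, reverse=True)
print(u" / ".join(tags))
```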

View File

@@ -1,9 +1,10 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import jieba.posseg as pseg
import collections
import sys
import collections
from operator import itemgetter
import jieba.posseg as pseg
class UndirectWeightedGraph:
d = 0.85
@@ -41,17 +42,25 @@ class UndirectWeightedGraph:
max_rank = w
for n, w in ws.items():
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0) * 100
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
return ws
def textrank(raw, topk=10):
def textrank(sentence, topK=10, withWeight=False):
"""
Extract keywords from sentence using TextRank algorithm.
Parameter:
- topK: return how many top keywords. `None` for all possible words.
- withWeight: if True, return a list of (word, weight);
if False, return a list of words.
"""
pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
g = UndirectWeightedGraph()
cm = collections.defaultdict(int)
span = 5
words = [x for x in pseg.cut(raw)]
words = list(pseg.cut(sentence))
for i in xrange(len(words)):
if words[i].flag in pos_filt:
for j in xrange(i + 1, i + span):
@@ -65,10 +74,16 @@ def textrank(raw, topk=10):
g.addEdge(terms[0], terms[1], w)
nodes_rank = g.rank()
nrs = sorted(nodes_rank.items(), key=lambda x: x[1], reverse=True)
return nrs[:topk]
if withWeight:
tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
return tags[:topK]
else:
return tags
if __name__ == '__main__':
s = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"
for x, w in textrank(s):
for x, w in textrank(s, withWeight=True):
print x, w
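The `rank()` change above min-max scales the scores into roughly [0, 1] instead of multiplying by 100, which is why the README sample output in this commit drops from 100.0 / 86.45… to 1.0 / 0.864…; the top word maps to exactly 1.0, and the `min_rank / 10.0` offset keeps the lowest word slightly above zero. A quick check with made-up raw scores:

```python
# Quick check of the min-max scaling used in rank() (made-up raw scores).
ws = {"a": 2.5, "b": 1.0, "c": 0.4}
min_rank, max_rank = min(ws.values()), max(ws.values())

for n, w in ws.items():
    ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

print(ws)  # a -> 1.0, b -> ~0.390, c -> ~0.146
```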

View File

@@ -19,25 +19,22 @@ PrevStatus = {
}
def load_model():
_curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
start_p = {}
abs_path = os.path.join(_curpath, PROB_START_P)
with open(abs_path, mode='r') as f:
with open(abs_path, 'rb') as f:
start_p = marshal.load(f)
f.closed
trans_p = {}
abs_path = os.path.join(_curpath, PROB_TRANS_P)
with open(abs_path, 'r') as f:
with open(abs_path, 'rb') as f:
trans_p = marshal.load(f)
f.closed
emit_p = {}
abs_path = os.path.join(_curpath, PROB_EMIT_P)
with open(abs_path, 'r') as f:
with open(abs_path, 'rb') as f:
emit_p = marshal.load(f)
f.closed
return start_p, trans_p, emit_p
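The `marshal` model files are binary, so they are now opened with `'rb'` (text mode could corrupt the bytes on Windows), and the dangling `f.closed` lines are gone: they only read an attribute, and the `with` block already closes the file. A minimal round-trip sketch with made-up probabilities:

```python
# Minimal marshal round-trip in binary mode (made-up start probabilities).
import marshal
import os
import tempfile

start_p = {'B': -0.26, 'E': -3.14, 'M': -3.14, 'S': -1.46}

fd, path = tempfile.mkstemp()
with os.fdopen(fd, 'wb') as f:
    marshal.dump(start_p, f)

with open(path, 'rb') as f:        # binary mode, as in load_model()
    loaded = marshal.load(f)

print(loaded == start_p)  # True
os.remove(path)
```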

View File

@@ -25,27 +25,24 @@ def load_model(f_name, isJython=True):
continue
word, _, tag = line.split(' ')
result[word.decode('utf-8')] = tag
f.closed
if not isJython:
return result
start_p = {}
abs_path = os.path.join(_curpath, PROB_START_P)
with open(abs_path, mode='r') as f:
with open(abs_path, 'rb') as f:
start_p = marshal.load(f)
f.closed
trans_p = {}
abs_path = os.path.join(_curpath, PROB_TRANS_P)
with open(abs_path, 'r') as f:
with open(abs_path, 'rb') as f:
trans_p = marshal.load(f)
f.closed
emit_p = {}
abs_path = os.path.join(_curpath, PROB_EMIT_P)
with open(abs_path, 'r') as f:
with open(abs_path, 'rb') as f:
emit_p = marshal.load(f)
f.closed
state = {}
abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)