Mirror of https://github.com/fxsjy/jieba.git (synced 2025-07-10 00:01:33 +08:00)

commit cf2aa88122

README.md (21 changed lines)
@@ -153,17 +153,16 @@ jieba.analyse.textrank(raw_text)
 来自`__main__`的示例结果:

 ```
-吉林 100.0
-欧亚 86.4592606421
-置业 55.3262889963
-实现 52.0353476663
-收入 37.9475518129
-增资 35.5042189944
-子公司 34.9286032861
-全资 30.8154823412
-城市 30.6031961172
-商业 30.4779050167
-
+吉林 1.0
+欧亚 0.864834432786
+置业 0.553465925497
+实现 0.520660869531
+收入 0.379699688954
+增资 0.355086023683
+子公司 0.349758490263
+全资 0.308537396283
+城市 0.306103738053
+商业 0.304837414946
 ```

 4) : 词性标注
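A minimal sketch of how the updated README output can be reproduced, assuming a checkout at this commit where TextRank is exposed as `jieba.analyse.textrank`, as the README section above documents (Python 2; the exact scores depend on the bundled dictionary and the input text):

```python
# -*- coding: utf-8 -*-
# Sketch only: reproduce the README example above (Python 2).
import jieba.analyse

s = u"此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。"

# withWeight=True yields (word, weight) pairs; after this commit the weights
# are normalized to roughly [0, 1] instead of being multiplied by 100.
for word, weight in jieba.analyse.textrank(s, topK=10, withWeight=True):
    print word, weight
```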
@@ -14,6 +14,7 @@ import random
 import threading
 from functools import wraps
 import logging
+from hashlib import md5

 DICTIONARY = "dict.txt"
 DICT_LOCK = threading.RLock()
@@ -53,12 +54,10 @@ def gen_pfdict(f_name):
                raise ValueError, e
    return pfdict, lfreq, ltotal

-def initialize(*args):
-    global pfdict, FREQ, total, min_freq, initialized
-    if not args:
+def initialize(dictionary=None):
+    global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK
+    if not dictionary:
         dictionary = DICTIONARY
-    else:
-        dictionary = args[0]
     with DICT_LOCK:
         if initialized:
             return
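A small usage sketch of the new keyword-argument form that replaces the old `*args` signature; the custom dictionary path below is a hypothetical placeholder, and per the code above a relative name is resolved against the package directory:

```python
import jieba

# Default dictionary (the bundled dict.txt):
jieba.initialize()

# Custom dictionary via the new keyword argument.
# "big_dict.txt" is an illustrative file name, not one shipped with jieba.
jieba.initialize(dictionary="big_dict.txt")
```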
@@ -67,13 +66,13 @@ def initialize(*args):
        pfdict = None
        _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

-        abs_path = os.path.join(_curpath,dictionary)
+        abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ..." % abs_path)
        t1 = time.time()
        if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
        else: #custom dictionary
-            cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))
+            cache_file = os.path.join(tempfile.gettempdir(), "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())

        load_from_cache_fail = True
        if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
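The custom-dictionary cache name switches from the built-in `hash()` (whose value can differ between interpreters and runs) to a stable md5 digest of the dictionary path. A standalone sketch of the new naming scheme; the dictionary path is just an example:

```python
import os
import tempfile
from hashlib import md5

def cache_path_for(abs_dict_path):
    # Same scheme as the new code: md5 of the absolute dictionary path,
    # so the cache file name is stable across processes and Python builds.
    digest = md5(abs_dict_path.encode('utf-8', 'replace')).hexdigest()
    return os.path.join(tempfile.gettempdir(), "jieba.u%s.cache" % digest)

print(cache_path_for("/path/to/user_dict.txt"))
# e.g. /tmp/jieba.u3f2c... .cache
```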
@@ -87,18 +86,18 @@ def initialize(*args):

        if load_from_cache_fail:
            pfdict,FREQ,total = gen_pfdict(abs_path)
-            FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize
+            FREQ = dict((k,log(float(v)/total)) for k,v in FREQ.iteritems()) #normalize
            min_freq = min(FREQ.itervalues())
            logger.debug("Dumping model to file cache %s" % cache_file)
            try:
-                tmp_suffix = "."+str(random.random())
-                with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
+                fd, fpath = tempfile.mkstemp()
+                with os.fdopen(fd, 'wb') as temp_cache_file:
                    marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
                if os.name == 'nt':
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
-                replace_file(cache_file + tmp_suffix, cache_file)
+                replace_file(fpath, cache_file)
            except:
                logger.exception("Dump cache file failed.")

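The dump now writes to a `tempfile.mkstemp()` handle and then moves the file into place, instead of appending a random suffix to the target name; writing elsewhere first means a half-written cache is never visible under the final name. A reduced sketch of the same pattern, with a stand-in payload rather than jieba's real model tuple:

```python
import marshal
import os
import tempfile
from shutil import move

def atomic_dump(obj, cache_file):
    # Write to a uniquely named temporary file first...
    fd, fpath = tempfile.mkstemp()
    with os.fdopen(fd, 'wb') as tmp:
        marshal.dump(obj, tmp)
    # ...then move it over the destination. os.rename is atomic on POSIX;
    # shutil.move is the fallback where rename cannot replace an existing
    # file (e.g. Windows), mirroring the diff above.
    replace_file = move if os.name == 'nt' else os.rename
    replace_file(fpath, cache_file)

atomic_dump({"demo": 1}, os.path.join(tempfile.gettempdir(), "demo.cache"))
```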
@@ -136,12 +135,11 @@ def __cut_all(sentence):
                    old_j = j


-def calc(sentence,DAG,idx,route):
+def calc(sentence, DAG, idx, route):
    N = len(sentence)
    route[N] = (0.0, '')
    for idx in xrange(N-1, -1, -1):
-        candidates = [(FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx]]
-        route[idx] = max(candidates)
+        route[idx] = max((FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx])

 @require_initialized
 def get_DAG(sentence):
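The change to `calc()` only inlines the candidates list into the `max()` call; the underlying dynamic program is unchanged: `route[i]` holds the best (log-probability, word-end index) for the suffix starting at character `i`, filled from right to left. A toy, self-contained sketch of that recurrence; the sentence, DAG and log-frequencies are invented for illustration:

```python
# Toy illustration of the route DP used by calc() (not jieba's real data).
sentence = u"ABCD"
# DAG[i] lists the end positions of dictionary words starting at character i.
DAG = {0: [0, 1], 1: [1], 2: [2, 3], 3: [3]}
# Hypothetical log-frequencies; unseen words fall back to min_freq.
FREQ = {u"AB": -3.0, u"CD": -4.0, u"A": -6.0, u"B": -7.0, u"C": -6.5, u"D": -6.5}
min_freq = -10.0

N = len(sentence)
route = {N: (0.0, '')}
for idx in range(N - 1, -1, -1):
    route[idx] = max((FREQ.get(sentence[idx:x + 1], min_freq) + route[x + 1][0], x)
                     for x in DAG[idx])

print(route[0])  # (-7.0, 1): the best first word ends at index 1, i.e. "AB"
```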
@@ -166,7 +164,7 @@ def __cut_DAG_NO_HMM(sentence):
    re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
    DAG = get_DAG(sentence)
    route = {}
-    calc(sentence, DAG, 0, route=route)
+    calc(sentence, DAG, 0, route)
    x = 0
    N = len(sentence)
    buf = u''
@@ -1,6 +1,7 @@
 #encoding=utf-8
 import jieba
 import os
+from operator import itemgetter
 try:
     from analyzer import ChineseAnalyzer
 except ImportError:
@@ -26,13 +27,11 @@ class IDFLoader:
        if self.path != new_idf_path:
            content = open(new_idf_path, 'rb').read().decode('utf-8')
            idf_freq = {}
-            lines = content.split('\n')
-            if lines and not lines[-1]:
-                lines.pop(-1)
+            lines = content.rstrip('\n').split('\n')
            for line in lines:
                word, freq = line.split(' ')
                idf_freq[word] = float(freq)
-            median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
+            median_idf = sorted(idf_freq.values())[len(idf_freq)//2]
            self.idf_freq = idf_freq
            self.median_idf = median_idf
            self.path = new_idf_path
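Two small fixes here: the trailing newline is stripped before splitting, which replaces popping a trailing empty line, and the median index uses floor division so it stays an integer on Python 3 or under `from __future__ import division`. A standalone sketch of the same parsing logic on fabricated "word idf" lines:

```python
# Illustrative only: three fabricated word/idf pairs standing in for idf.txt.
content = u"吉林 5.0\n欧亚 4.0\n置业 3.0\n"

idf_freq = {}
# rstrip('\n') drops the trailing newline so split('\n') yields no empty line.
for line in content.rstrip('\n').split('\n'):
    word, freq = line.split(' ')
    idf_freq[word] = float(freq)

# // keeps the index an integer on both Python 2 and Python 3.
median_idf = sorted(idf_freq.values())[len(idf_freq) // 2]
print(median_idf)  # 4.0 for the three values above
```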
@@ -60,27 +59,32 @@ def set_stop_words(stop_words_path):
        STOP_WORDS.add(line)

 def extract_tags(sentence, topK=20, withWeight=False):
-    global STOP_WORDS
+    """
+    Extract keywords from sentence using TF-IDF algorithm.
+    Parameter:
+        - topK: return how many top keywords. `None` for all possible words.
+        - withWeight: if True, return a list of (word, weight);
+                      if False, return a list of words.
+    """
+    global STOP_WORDS, idf_loader

    idf_freq, median_idf = idf_loader.get_idf()

    words = jieba.cut(sentence)
    freq = {}
    for w in words:
-        if len(w.strip()) < 2:
-            continue
-        if w.lower() in STOP_WORDS:
+        if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
-    freq = [(k,v/total) for k,v in freq.iteritems()]
-
-    tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
-    st_list = sorted(tf_idf_list, reverse=True)
+    for k in freq:
+        freq[k] *= idf_freq.get(k, median_idf) / total

    if withWeight:
-        tags = st_list[:topK]
+        tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
    else:
-        top_tuples = st_list[:topK]
-        tags = [a[1] for a in top_tuples]
-    return tags
+        tags = sorted(freq, key=freq.__getitem__, reverse=True)
+    if topK:
+        return tags[:topK]
+    else:
+        return tags
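With this refactor the TF-IDF weight is kept in the `freq` dict itself (term frequency divided by `total`, multiplied by the word's IDF), and the new docstring spells out the public behaviour. A usage sketch matching that docstring, assuming the module is used as `jieba.analyse` with its default IDF file (Python 2; the sentence is arbitrary sample text):

```python
# -*- coding: utf-8 -*-
import jieba.analyse

sentence = u"此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元"

# Default: a plain list of the top-20 keywords.
print jieba.analyse.extract_tags(sentence)

# topK=5, withWeight=True: a list of (word, tf-idf weight) pairs.
for word, weight in jieba.analyse.extract_tags(sentence, topK=5, withWeight=True):
    print word, weight

# topK=None returns every candidate word, per the docstring.
all_tags = jieba.analyse.extract_tags(sentence, topK=None)
```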
@@ -1,9 +1,10 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

-import jieba.posseg as pseg
-import collections
 import sys
+import collections
+from operator import itemgetter
+import jieba.posseg as pseg

 class UndirectWeightedGraph:
     d = 0.85
@@ -41,17 +42,25 @@ class UndirectWeightedGraph:
                max_rank = w

        for n, w in ws.items():
-            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0) * 100
+            # to unify the weights, don't *100.
+            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

        return ws


-def textrank(raw, topk=10):
+def textrank(sentence, topK=10, withWeight=False):
+    """
+    Extract keywords from sentence using TextRank algorithm.
+    Parameter:
+        - topK: return how many top keywords. `None` for all possible words.
+        - withWeight: if True, return a list of (word, weight);
+                      if False, return a list of words.
+    """
    pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
    g = UndirectWeightedGraph()
    cm = collections.defaultdict(int)
    span = 5
-    words = [x for x in pseg.cut(raw)]
+    words = list(pseg.cut(sentence))
    for i in xrange(len(words)):
        if words[i].flag in pos_filt:
            for j in xrange(i + 1, i + span):
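The only behavioural change in `rank()` is dropping the `* 100`, so the returned weights land in roughly (0, 1] with the top node at 1.0, which is exactly what the updated README example reflects. A quick worked instance of the same rescaling, with invented raw scores:

```python
# Invented raw TextRank scores for three nodes.
ws = {u"吉林": 0.92, u"欧亚": 0.80, u"商业": 0.31}
min_rank, max_rank = min(ws.values()), max(ws.values())

for n, w in ws.items():
    # Same formula as rank() after this commit, without the old *100.
    ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

print(ws)  # the top node maps to 1.0, the others to proportionally smaller values
```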
@@ -65,10 +74,16 @@ def textrank(raw, topk=10):
        g.addEdge(terms[0], terms[1], w)

    nodes_rank = g.rank()
-    nrs = sorted(nodes_rank.items(), key=lambda x: x[1], reverse=True)
-    return nrs[:topk]
+    if withWeight:
+        tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
+    else:
+        tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
+    if topK:
+        return tags[:topK]
+    else:
+        return tags

 if __name__ == '__main__':
    s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
-    for x, w in textrank(s):
+    for x, w in textrank(s, withWeight=True):
        print x, w
@@ -19,25 +19,22 @@ PrevStatus = {
 }

 def load_model():
-    _curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

    start_p = {}
    abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, mode='r') as f:
+    with open(abs_path, 'rb') as f:
        start_p = marshal.load(f)
-    f.closed

    trans_p = {}
    abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
        trans_p = marshal.load(f)
-    f.closed

    emit_p = {}
    abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
        emit_p = marshal.load(f)
-    f.closed

    return start_p, trans_p, emit_p
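Opening the marshalled model files with 'rb' instead of text mode matters mainly on Windows, where text mode translates line endings and can corrupt binary data; the dangling `f.closed` lines were no-ops (an attribute lookup, not a call) and are simply dropped. A tiny sketch of the binary round trip; the file name and probability values are made up for illustration:

```python
import marshal
import os
import tempfile

path = os.path.join(tempfile.gettempdir(), "prob_demo.p")
start_p = {'B': -0.26, 'E': -3.14, 'M': -3.14, 'S': -1.46}  # made-up values

# marshal data is binary: always write and read with 'wb' / 'rb'.
with open(path, 'wb') as f:
    marshal.dump(start_p, f)

with open(path, 'rb') as f:
    assert marshal.load(f) == start_p
```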
@@ -25,27 +25,24 @@ def load_model(f_name, isJython=True):
                continue
            word, _, tag = line.split(' ')
            result[word.decode('utf-8')] = tag
-    f.closed

    if not isJython:
        return result

    start_p = {}
    abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, mode='r') as f:
+    with open(abs_path, 'rb') as f:
        start_p = marshal.load(f)
-    f.closed

    trans_p = {}
    abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
        trans_p = marshal.load(f)
-    f.closed

    emit_p = {}
    abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
        emit_p = marshal.load(f)
-    f.closed

    state = {}
    abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)