https://github.com/fxsjy/jieba.git

commit 7a6caa0c3c (parent fd9f1f2c0e)
port extract_tags, etc. to jieba3k; add auto2to3 script
README.md (29 lines changed)
@@ -156,17 +156,16 @@ jieba.analyse.textrank(raw_text)
来自`__main__`的示例结果:

```
吉林 100.0
欧亚 86.4592606421
置业 55.3262889963
实现 52.0353476663
收入 37.9475518129
增资 35.5042189944
子公司 34.9286032861
全资 30.8154823412
城市 30.6031961172
商业 30.4779050167

吉林 1.0
欧亚 0.864834432786
置业 0.553465925497
实现 0.520660869531
收入 0.379699688954
增资 0.355086023683
子公司 0.349758490263
全资 0.308537396283
城市 0.306103738053
商业 0.304837414946
```
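The first block is the old output, where ranks were scaled by 100; the second is the new output, normalized so the top keyword scores 1.0. A minimal sketch of reproducing the new numbers on the jieba3k branch (illustrative, not part of this diff; it simply mirrors the `__main__` block of jieba/analyse/textrank.py further down):

```python
# Illustrative only - reproduces the normalized output above with the updated API.
import jieba.analyse

s = ("此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。"
     "吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。"
     "2013年,实现营业收入0万元,实现净利润-139.13万元。")

# withWeight=True yields (word, weight) pairs; the top-ranked word now gets 1.0
# instead of 100.0 because the ranks are no longer multiplied by 100.
for word, weight in jieba.analyse.textrank(s, topK=10, withWeight=True):
    print(word, weight)
```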

4) : 词性标注
@@ -344,6 +343,10 @@ https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
作者:falood
地址:https://github.com/falood/exjieba

结巴分词 R 版本
----------------
作者:qinwf
地址:https://github.com/qinwf/jiebaR

系统集成
========
@@ -411,9 +414,9 @@ seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode:", "/ ".join(seg_list)) # 全模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode:", "/ ".join(seg_list)) # 精确模式
print("Default Mode:", "/ ".join(seg_list)) # 默认模式

seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
seg_list = jieba.cut("他来到了网易杭研大厦")
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
jieba/__init__.py
@@ -13,6 +13,7 @@ import random
import threading
from functools import wraps
import logging
from hashlib import md5

DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
@@ -52,12 +53,10 @@ def gen_pfdict(f_name):
raise e
return pfdict, lfreq, ltotal

def initialize(*args):
global pfdict, FREQ, total, min_freq, initialized
if not args:
def initialize(dictionary=None):
global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK
if not dictionary:
dictionary = DICTIONARY
else:
dictionary = args[0]
with DICT_LOCK:
if initialized:
return
@@ -66,13 +65,13 @@ def initialize(*args):
pfdict = None
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

abs_path = os.path.join(_curpath,dictionary)
abs_path = os.path.join(_curpath, dictionary)
logger.debug("Building prefix dict from %s ..." % abs_path)
t1 = time.time()
if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
else: #custom dictionary
cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))
cache_file = os.path.join(tempfile.gettempdir(), "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())

load_from_cache_fail = True
if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
@@ -87,18 +86,18 @@ def initialize(*args):

if load_from_cache_fail:
pfdict,FREQ,total = gen_pfdict(abs_path)
FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.items()]) #normalize
FREQ = dict((k,log(float(v)/total)) for k,v in FREQ.items()) #normalize
min_freq = min(FREQ.values())
logger.debug("Dumping model to file cache %s" % cache_file)
try:
tmp_suffix = "."+str(random.random())
with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
fd, fpath = tempfile.mkstemp()
with os.fdopen(fd, 'wb') as temp_cache_file:
marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
if os.name == 'nt':
from shutil import move as replace_file
else:
replace_file = os.rename
replace_file(cache_file + tmp_suffix, cache_file)
replace_file(fpath, cache_file)
except:
logger.exception("Dump cache file failed.")
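The cache dump above swaps a hand-rolled random temp suffix for `tempfile.mkstemp`, following the usual write-to-temp-then-rename atomic update. A standalone sketch of that pattern (hypothetical payload and file name, not code from this commit):

```python
# Minimal sketch of the write-temp-then-rename pattern used for the cache dump.
# The payload and target path here are made up for illustration.
import marshal
import os
import tempfile

def dump_atomically(payload, target_path):
    fd, tmp_path = tempfile.mkstemp()            # unique temp file, no name races
    with os.fdopen(fd, 'wb') as tmp:
        marshal.dump(payload, tmp)               # write the complete blob first
    if os.name == 'nt':
        from shutil import move as replace_file  # plain rename cannot overwrite on Windows
    else:
        replace_file = os.rename
    replace_file(tmp_path, target_path)          # swap into place only when fully written

dump_atomically({"example": 1}, os.path.join(tempfile.gettempdir(), "jieba.cache.example"))
```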

@@ -136,12 +135,11 @@ def __cut_all(sentence):
old_j = j


def calc(sentence,DAG,idx,route):
def calc(sentence, DAG, idx, route):
N = len(sentence)
route[N] = (0.0, '')
for idx in range(N-1, -1, -1):
candidates = [(FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx]]
route[idx] = max(candidates)
route[idx] = max((FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx])

@require_initialized
def get_DAG(sentence):
@@ -166,7 +164,7 @@ def __cut_DAG_NO_HMM(sentence):
re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
DAG = get_DAG(sentence)
route = {}
calc(sentence, DAG, 0, route=route)
calc(sentence, DAG, 0, route)
x = 0
N = len(sentence)
buf = ''
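The one-line `calc` above is a right-to-left dynamic program: for each position it keeps the best (log-probability, word-end) pair reachable through the sentence DAG. A toy, self-contained sketch of the same recurrence (the frequencies and DAG below are invented, not jieba's real tables):

```python
# Toy illustration of the recurrence in calc(); all numbers here are invented.
sentence = "abcd"
FREQ = {"a": -3.0, "ab": -2.0, "abc": -4.0, "b": -3.5, "c": -3.0, "cd": -2.5, "d": -3.2}
min_freq = -10.0                                   # fallback log-probability
DAG = {0: [0, 1, 2], 1: [1], 2: [2, 3], 3: [3]}    # DAG[i] = candidate word end positions

route = {len(sentence): (0.0, '')}
for idx in range(len(sentence) - 1, -1, -1):
    route[idx] = max((FREQ.get(sentence[idx:x + 1], min_freq) + route[x + 1][0], x)
                     for x in DAG[idx])

print(route[0])   # (-4.5, 1): the best path starts with the two-character word "ab"
```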
jieba/analyse/__init__.py
@@ -1,6 +1,7 @@
#encoding=utf-8
import jieba
import os
from operator import itemgetter
try:
from .analyzer import ChineseAnalyzer
except ImportError:
@@ -26,9 +27,7 @@ class IDFLoader:
if self.path != new_idf_path:
content = open(new_idf_path, 'r', encoding='utf-8').read()
idf_freq = {}
lines = content.split('\n')
if lines and not lines[-1]:
lines.pop(-1)
lines = content.rstrip('\n').split('\n')
for line in lines:
word, freq = line.split(' ')
idf_freq[word] = float(freq)
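`set_new_path` above assumes the IDF file holds one `word IDF` pair per line, separated by a single space; a small sketch of parsing that format (words and values invented for illustration):

```python
# Sketch of the idf.txt line format expected by IDFLoader.set_new_path:
# "<word><space><idf value>" per line; the entries below are made up.
sample = "自然语言 11.85\n分词 9.32\n"

idf_freq = {}
for line in sample.rstrip('\n').split('\n'):
    word, freq = line.split(' ')
    idf_freq[word] = float(freq)

print(idf_freq)   # {'自然语言': 11.85, '分词': 9.32}
```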
@@ -60,27 +59,32 @@ def set_stop_words(stop_words_path):
STOP_WORDS.add(line)

def extract_tags(sentence, topK=20, withWeight=False):
global STOP_WORDS
"""
Extract keywords from sentence using TF-IDF algorithm.
Parameter:
- topK: return how many top keywords. `None` for all possible words.
- withWeight: if True, return a list of (word, weight);
if False, return a list of words.
"""
global STOP_WORDS, idf_loader

idf_freq, median_idf = idf_loader.get_idf()

words = jieba.cut(sentence)
freq = {}
for w in words:
if len(w.strip()) < 2:
continue
if w.lower() in STOP_WORDS:
if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
continue
freq[w] = freq.get(w, 0.0) + 1.0
total = sum(freq.values())
freq = [(k,v/total) for k,v in freq.items()]

tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
st_list = sorted(tf_idf_list, reverse=True)
for k in freq:
freq[k] *= idf_freq.get(k, median_idf) / total

if withWeight:
tags = st_list[:topK]
tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
else:
top_tuples = st_list[:topK]
tags = [a[1] for a in top_tuples]
return tags
tags = sorted(freq, key=freq.__getitem__, reverse=True)
if topK:
return tags[:topK]
else:
return tags
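Going by the new docstring, the reworked `extract_tags` can be exercised as follows (illustrative call, not part of this diff; actual results depend on the loaded dictionary and IDF file):

```python
# Illustrative use of the reworked TF-IDF keyword extraction on jieba3k.
import jieba.analyse

sentence = "我来到北京清华大学"

# Default: a plain list of up to topK keywords, strongest first.
print(jieba.analyse.extract_tags(sentence, topK=5))

# withWeight=True: a list of (word, weight) tuples instead.
for word, weight in jieba.analyse.extract_tags(sentence, topK=5, withWeight=True):
    print(word, weight)
```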
jieba/analyse/textrank.py
@@ -1,9 +1,10 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import jieba.posseg as pseg
import collections
import sys
import collections
from operator import itemgetter
import jieba.posseg as pseg

class UndirectWeightedGraph:
d = 0.85
@@ -41,17 +42,25 @@ class UndirectWeightedGraph:
max_rank = w

for n, w in ws.items():
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0) * 100
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

return ws


def textrank(raw, topk=10):
def textrank(sentence, topK=10, withWeight=False):
"""
Extract keywords from sentence using TextRank algorithm.
Parameter:
- topK: return how many top keywords. `None` for all possible words.
- withWeight: if True, return a list of (word, weight);
if False, return a list of words.
"""
pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
g = UndirectWeightedGraph()
cm = collections.defaultdict(int)
span = 5
words = [x for x in pseg.cut(raw)]
words = list(pseg.cut(sentence))
for i in range(len(words)):
if words[i].flag in pos_filt:
for j in range(i + 1, i + span):
@@ -65,10 +74,16 @@ def textrank(raw, topk=10):
g.addEdge(terms[0], terms[1], w)

nodes_rank = g.rank()
nrs = sorted(nodes_rank.items(), key=lambda x: x[1], reverse=True)
return nrs[:topk]
if withWeight:
tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
return tags[:topK]
else:
return tags

if __name__ == '__main__':
s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
for x, w in textrank(s):
for x, w in textrank(s, withWeight=True):
print(x, w)
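The `__main__` block demonstrates the weighted form; per the new docstring, the default call returns only the words. A short illustrative sketch (abridged sample text, not part of this diff):

```python
# Default textrank call: a plain list of the topK words, no weights attached.
import jieba.analyse

text = "吉林欧亚置业主要经营范围为房地产开发及百货零售等业务"   # shortened sample, illustration only
print(jieba.analyse.textrank(text, topK=5))                   # plain list of words, strongest first
print(jieba.analyse.textrank(text, topK=5, withWeight=True))  # list of (word, weight) pairs
```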
jieba/finalseg/__init__.py
@@ -18,25 +18,22 @@ PrevStatus = {
}

def load_model():
_curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

start_p = {}
abs_path = os.path.join(_curpath, PROB_START_P)
with open(abs_path, mode='rb') as f:
with open(abs_path, 'rb') as f:
start_p = marshal.load(f)
f.closed

trans_p = {}
abs_path = os.path.join(_curpath, PROB_TRANS_P)
with open(abs_path, 'rb') as f:
trans_p = marshal.load(f)
f.closed

emit_p = {}
abs_path = os.path.join(_curpath, PROB_EMIT_P)
with open(abs_path, 'rb') as f:
emit_p = marshal.load(f)
f.closed

return start_p, trans_p, emit_p
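The three probability tables are plain `marshal` dumps, so they round-trip with `marshal.dump`/`marshal.load`; a tiny sketch (file name and values invented for illustration):

```python
# Round-trip sketch of the marshal-serialized probability tables loaded above.
import marshal

start_p = {'B': -0.26, 'M': -3.14e+100, 'E': -3.14e+100, 'S': -1.46}   # made-up values

with open('prob_start_demo.p', 'wb') as f:
    marshal.dump(start_p, f)

with open('prob_start_demo.p', 'rb') as f:
    loaded = marshal.load(f)

assert loaded == start_p
```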
jieba/posseg/__init__.py
@@ -25,27 +25,23 @@ def load_model(f_name, isJython=True):
line = line.decode("utf-8")
word, _, tag = line.split(" ")
result[word] = tag
f.closed
if not isJython:
return result

start_p = {}
abs_path = os.path.join(_curpath, PROB_START_P)
with open(abs_path, mode='rb') as f:
with open(abs_path, 'rb') as f:
start_p = marshal.load(f)
f.closed

trans_p = {}
abs_path = os.path.join(_curpath, PROB_TRANS_P)
with open(abs_path, 'rb') as f:
trans_p = marshal.load(f)
f.closed

emit_p = {}
abs_path = os.path.join(_curpath, PROB_EMIT_P)
with open(abs_path, 'rb') as f:
emit_p = marshal.load(f)
f.closed

state = {}
abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)

jieba/posseg/prob_emit.py (178554 lines changed; diff suppressed because it is too large)

test/2to3.diff (new file, 450 lines)
@@ -0,0 +1,450 @@
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py
--- ./jieba/analyse/analyzer.py 2014-11-07 23:07:02.779210408 +0800
+++ ../jieba/jieba/analyse/analyzer.py 2014-11-07 23:07:02.079210422 +0800
@@ -1,4 +1,4 @@
-##encoding=utf-8
+#encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py
--- ./jieba/analyse/__init__.py 2014-11-07 23:07:02.879210406 +0800
+++ ../jieba/jieba/analyse/__init__.py 2014-11-07 23:16:27.171198767 +0800
@@ -25,7 +25,7 @@

def set_new_path(self, new_idf_path):
if self.path != new_idf_path:
- content = open(new_idf_path, 'rb').read().decode('utf-8')
+ content = open(new_idf_path, 'r', encoding='utf-8').read()
idf_freq = {}
lines = content.rstrip('\n').split('\n')
for line in lines:
@@ -81,7 +81,7 @@
freq[k] *= idf_freq.get(k, median_idf) / total

if withWeight:
- tags = sorted(list(freq.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(freq, key=freq.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py
--- ./jieba/analyse/textrank.py 2014-11-07 23:07:02.827210407 +0800
+++ ../jieba/jieba/analyse/textrank.py 2014-11-07 23:18:22.059196398 +0800
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
@@ -22,12 +22,12 @@
outSum = collections.defaultdict(float)

wsdef = 1.0 / len(self.graph)
- for n, out in list(self.graph.items()):
+ for n, out in self.graph.items():
ws[n] = wsdef
outSum[n] = sum((e[2] for e in out), 0.0)

for x in range(10): # 10 iters
- for n, inedges in list(self.graph.items()):
+ for n, inedges in self.graph.items():
s = 0
for e in inedges:
s += e[2] / outSum[e[1]] * ws[e[1]]
@@ -41,7 +41,7 @@
elif w > max_rank:
max_rank = w

- for n, w in list(ws.items()):
+ for n, w in ws.items():
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

@@ -70,12 +70,12 @@
continue
cm[(words[i].word, words[j].word)] += 1

- for terms, w in list(cm.items()):
+ for terms, w in cm.items():
g.addEdge(terms[0], terms[1], w)

nodes_rank = g.rank()
if withWeight:
- tags = sorted(list(nodes_rank.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py
--- ./jieba/finalseg/__init__.py 2014-11-07 23:07:03.147210400 +0800
+++ ../jieba/jieba/finalseg/__init__.py 2014-11-07 23:18:43.495195956 +0800
@@ -1,4 +1,3 @@
-
import re
import os
import marshal
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py
--- ./jieba/__init__.py 2014-11-07 23:07:02.751210408 +0800
+++ ../jieba/jieba/__init__.py 2014-11-07 23:22:34.963191182 +0800
@@ -1,4 +1,3 @@
-
__version__ = '0.34'
__license__ = 'MIT'

@@ -51,7 +50,7 @@
pfdict.add(word[:ch+1])
except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line))
- raise ValueError(e)
+ raise e
return pfdict, lfreq, ltotal

def initialize(dictionary=None):
@@ -78,7 +77,8 @@
if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
logger.debug("Loading model from cache %s" % cache_file)
try:
- pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
+ with open(cache_file, 'rb') as cf:
+ pfdict,FREQ,total,min_freq = marshal.load(cf)
# prevent conflict with old version
load_from_cache_fail = not isinstance(pfdict, set)
except:
@@ -228,11 +228,11 @@
'''The main function that segments an entire sentence that contains
Chinese characters into seperated words.
Parameter:
- - sentence: The str/unicode to be segmented.
+ - sentence: The str to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
'''
- if not isinstance(sentence, str):
+ if isinstance(sentence, bytes):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
@@ -338,8 +338,6 @@
global pool, cut, cut_for_search
if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system")
- if sys.version_info[0]==2 and sys.version_info[1]<6:
- raise Exception("jieba: the parallel feature needs Python version>2.5")
from multiprocessing import Pool, cpu_count
if processnum is None:
processnum = cpu_count()
@@ -392,12 +390,12 @@
def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end)
Parameter:
- - sentence: the unicode to be segmented.
+ - sentence: the str to be segmented.
- mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model.
"""
if not isinstance(unicode_sentence, str):
- raise Exception("jieba: the input parameter should be unicode.")
+ raise Exception("jieba: the input parameter should be str.")
start = 0
if mode == 'default':
for w in cut(unicode_sentence, HMM=HMM):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py
--- ./jieba/__main__.py 2014-11-07 23:07:02.563210412 +0800
+++ ../jieba/jieba/__main__.py 2014-11-07 23:07:02.079210422 +0800
@@ -40,7 +40,7 @@
ln = fp.readline()
while ln:
l = ln.rstrip('\r\n')
- print((delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8')))
+ print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))
ln = fp.readline()

fp.close()
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py
--- ./jieba/posseg/__init__.py 2014-11-07 23:07:03.047210402 +0800
+++ ../jieba/jieba/posseg/__init__.py 2014-11-07 23:19:40.883194772 +0800
@@ -1,4 +1,3 @@
-
import re
import os
from . import viterbi
@@ -18,14 +17,14 @@
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

result = {}
- with open(f_name, "r") as f:
+ with open(f_name, "rb") as f:
for line in f:
line = line.strip()
if not line:
continue
- word, _, tag = line.split(' ')
- result[word.decode('utf-8')] = tag
-
+ line = line.decode("utf-8")
+ word, _, tag = line.split(" ")
+ result[word] = tag
if not isJython:
return result

@@ -46,7 +45,7 @@

state = {}
abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
- with open(abs_path, 'r') as f:
+ with open(abs_path, 'rb') as f:
state = marshal.load(f)
f.closed

diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py
--- ./jieba/posseg/viterbi.py 2014-11-07 23:07:03.079210402 +0800
+++ ../jieba/jieba/posseg/viterbi.py 2014-11-07 23:07:02.095210422 +0800
@@ -3,14 +3,13 @@
MIN_INF = float("-inf")

def get_top_states(t_state_v, K=4):
- items = list(t_state_v.items())
- topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]
+ topK = sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:K]
return [x[0] for x in topK]

def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
mem_path = [{}]
- all_states = list(trans_p.keys())
+ all_states = trans_p.keys()
for y in states.get(obs[0], all_states): #init
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
mem_path[0][y] = ''
@@ -18,9 +17,9 @@
V.append({})
mem_path.append({})
#prev_states = get_top_states(V[t-1])
- prev_states = [x for x in list(mem_path[t-1].keys()) if len(trans_p[x]) > 0]
+ prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]

- prev_states_expect_next = set((y for x in prev_states for y in list(trans_p[x].keys())))
+ prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next

if not obs_states:
@@ -31,7 +30,7 @@
V[t][y] = prob
mem_path[t][y] = state

- last = [(V[-1][y], y) for y in list(mem_path[-1].keys())]
+ last = [(V[-1][y], y) for y in mem_path[-1].keys()]
#if len(last)==0:
#print obs
prob, state = max(last)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md
--- ./README.md 2014-11-07 23:07:02.067210423 +0800
+++ ../jieba/README.md 2014-11-07 23:24:49.263188412 +0800
@@ -4,6 +4,9 @@
"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
- _Scroll down for English documentation._

+注意!
+========
+这个branch `jieba3k` 是专门用于Python3.x的版本

特点
========
@@ -68,16 +71,16 @@
import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 精确模式
+print("Default Mode:", "/ ".join(seg_list)) # 精确模式

seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```

输出:
@@ -174,7 +177,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -203,7 +206,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```

```
@@ -219,7 +222,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```

```
@@ -408,16 +411,16 @@
import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 默认模式
+print("Default Mode:", "/ ".join(seg_list)) # 默认模式

seg_list = jieba.cut("他来到了网易杭研大厦")
-print ", ".join(seg_list)
+print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```

Output:
@@ -483,7 +486,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -512,7 +515,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```

```
@@ -528,7 +531,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```

```
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py
--- ./setup.py 2014-11-07 23:07:02.067210423 +0800
+++ ../jieba/setup.py 2014-11-07 23:07:02.095210422 +0800
@@ -1,5 +1,5 @@
from distutils.core import setup
-setup(name='jieba',
+setup(name='jieba3k',
version='0.34',
description='Chinese Words Segementation Utilities',
author='Sun, Junyi',
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py
--- ./test/extract_topic.py 2014-11-07 23:07:03.707210389 +0800
+++ ../jieba/test/extract_topic.py 2014-11-07 23:07:02.095210422 +0800
@@ -51,13 +51,13 @@
print("training...")

nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
-print(("done in %0.3fs." % (time.time() - t0)))
+print("done in %0.3fs." % (time.time() - t0))

# Inverse the vectorizer vocabulary to be able
feature_names = count_vect.get_feature_names()

for topic_idx, topic in enumerate(nmf.components_):
- print(("Topic #%d:" % topic_idx))
- print((" ".join([feature_names[i]
- for i in topic.argsort()[:-n_top_words - 1:-1]])))
+ print("Topic #%d:" % topic_idx)
+ print(" ".join([feature_names[i]
+ for i in topic.argsort()[:-n_top_words - 1:-1]]))
print("")
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py
--- ./test/jiebacmd.py 2014-11-07 23:07:03.211210399 +0800
+++ ../jieba/test/jiebacmd.py 2014-11-07 23:07:02.099210422 +0800
@@ -23,6 +23,6 @@
break
line = line.strip()
for word in jieba.cut(line):
- print(word.encode(default_encoding))
+ print(word)


diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py
--- ./test/jieba_test.py 2014-11-07 23:07:03.947210384 +0800
+++ ../jieba/test/jieba_test.py 2014-11-07 23:07:02.099210422 +0800
@@ -1,5 +1,6 @@
#-*-coding: utf-8 -*-
import sys
+import imp
sys.path.append("../")
import unittest
import types
@@ -97,7 +98,7 @@

class JiebaTestCase(unittest.TestCase):
def setUp(self):
- reload(jieba)
+ imp.reload(jieba)

def tearDown(self):
pass
@@ -151,7 +152,7 @@

def testTokenize(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'))
+ result = jieba.tokenize(content)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
@@ -180,7 +181,7 @@

def testTokenize_NOHMM(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'),HMM=False)
+ result = jieba.tokenize(content,HMM=False)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py
--- ./test/test_tokenize_no_hmm.py 2014-11-07 23:07:04.031210382 +0800
+++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-07 23:07:02.099210422 +0800
@@ -7,7 +7,6 @@

def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py
--- ./test/test_tokenize.py 2014-11-07 23:07:04.071210381 +0800
+++ ../jieba/test/test_tokenize.py 2014-11-07 23:07:02.099210422 +0800
@@ -7,7 +7,6 @@

def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
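Taken together, the patch pins down two Python 3 conventions that recur above: `jieba.cut`/`jieba.tokenize` take `str` (bytes are decoded as UTF-8), and dictionary views are passed straight to `sorted()` or loops without a `list()` wrapper. A quick illustrative check (not part of the patch):

```python
# Quick check of the Python 3 conventions the patch settles on (illustrative only).
import jieba

# tokenize() takes a str and yields (word, start, end) tuples.
for word, start, end in jieba.tokenize("永和服装饰品有限公司"):
    print("word %s\t\t start: %d \t\t end:%d" % (word, start, end))

# Dict views go straight into sorted() - no list() wrapper needed on Python 3.
freq = {"永和": 2, "服装": 1, "饰品": 1}
print(sorted(freq.items(), key=lambda kv: kv[1], reverse=True))
```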
test/auto2to3 (new executable file, 30 lines)
@@ -0,0 +1,30 @@
#!/bin/bash
# Set 2to3 path.
PYTHON2TO3=2to3
# Copy the python2 version.
echo Jieba 2to3 manual conversion tool
echo
if ! git rev-parse; then
exit 1
fi
echo Copying working directory to ../jieba2
if [ -d ../jieba2 ]; then
echo Found existing ../jieba2
read -p "Replace it with new one? (y/n) " -r
if ! [[ $REPLY =~ ^[Yy]$ ]]; then
echo Cancelled.
exit
else
rm -rf ../jieba2
fi
fi
git checkout master
cp -r . ../jieba2
git checkout jieba3k
cd ../jieba2
# Here starts auto conversion.
echo Converting jieba2 to Python3 ...
find . -type f -name '*.py' \! -path '*/build/*' \! -name 'prob_*.py' \! -name 'char_state_tab.py' -exec $PYTHON2TO3 -w -n {} +
find . -type f \! -path '*/build/*' -a \( -name 'prob_*.py' -o -name 'char_state_tab.py' \) -exec sed -i "s/u'\\\u/'\\\u/g" {} \;
patch -p0 <2to3.diff
echo Done. Compare jieba and jieba2 to manually port.
@@ -38,6 +38,6 @@ tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)

if withWeight is True:
for tag in tags:
print("tag: %s\t\t weight: %f" % (tag[1],tag[0]))
print("tag: %s\t\t weight: %f" % (tag[0],tag[1]))
else:
print(",".join(tags))