port extract_tags, etc to jieba3k; add auto2to3 script

Dingyuan Wang 2014-11-07 23:33:31 +08:00
parent fd9f1f2c0e
commit 7a6caa0c3c
12 changed files with 96481 additions and 95988 deletions

README.md

@@ -156,17 +156,16 @@ jieba.analyse.textrank(raw_text)
来自`__main__`的示例结果:
```
-吉林 100.0
-欧亚 86.4592606421
-置业 55.3262889963
-实现 52.0353476663
-收入 37.9475518129
-增资 35.5042189944
-子公司 34.9286032861
-全资 30.8154823412
-城市 30.6031961172
-商业 30.4779050167
+吉林 1.0
+欧亚 0.864834432786
+置业 0.553465925497
+实现 0.520660869531
+收入 0.379699688954
+增资 0.355086023683
+子公司 0.349758490263
+全资 0.308537396283
+城市 0.306103738053
+商业 0.304837414946
```
4) : 词性标注
@@ -344,6 +343,10 @@ https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
作者:falood
地址:https://github.com/falood/exjieba
+结巴分词 R 版本
+----------------
+作者:qinwf
+地址:https://github.com/qinwf/jiebaR
系统集成
========
@@ -411,9 +414,9 @@ seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode:", "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode:", "/ ".join(seg_list)) # 精确模式
print("Default Mode:", "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
seg_list = jieba.cut("他来到了网易杭研大厦")
print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
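
The first hunk above updates the sample TextRank output to the new 0-1 weight scale (the ranker no longer multiplies by 100); the code snippet in the second hunk only has its comments touched. For reference, that snippet runs as the following self-contained sketch; the exact segmentation depends on the bundled dictionary.

```python
# Runnable form of the README snippet above (Python 3 print syntax).
import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode:", "/ ".join(seg_list))      # full mode

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode:", "/ ".join(seg_list))   # default (accurate) mode

seg_list = jieba.cut("他来到了网易杭研大厦")
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search-engine mode
print(", ".join(seg_list))
```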

jieba/__init__.py

@@ -13,6 +13,7 @@ import random
import threading
from functools import wraps
import logging
+from hashlib import md5
DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
@@ -52,12 +53,10 @@ def gen_pfdict(f_name):
raise e
return pfdict, lfreq, ltotal
-def initialize(*args):
-global pfdict, FREQ, total, min_freq, initialized
-if not args:
+def initialize(dictionary=None):
+global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK
+if not dictionary:
dictionary = DICTIONARY
-else:
-dictionary = args[0]
with DICT_LOCK:
if initialized:
return
@@ -66,13 +65,13 @@ def initialize(*args):
pfdict = None
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-abs_path = os.path.join(_curpath,dictionary)
+abs_path = os.path.join(_curpath, dictionary)
logger.debug("Building prefix dict from %s ..." % abs_path)
t1 = time.time()
if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
else: #custom dictionary
-cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))
+cache_file = os.path.join(tempfile.gettempdir(), "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())
load_from_cache_fail = True
if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
@@ -87,18 +86,18 @@ def initialize(*args):
if load_from_cache_fail:
pfdict,FREQ,total = gen_pfdict(abs_path)
-FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.items()]) #normalize
+FREQ = dict((k,log(float(v)/total)) for k,v in FREQ.items()) #normalize
min_freq = min(FREQ.values())
logger.debug("Dumping model to file cache %s" % cache_file)
try:
-tmp_suffix = "."+str(random.random())
-with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
+fd, fpath = tempfile.mkstemp()
+with os.fdopen(fd, 'wb') as temp_cache_file:
marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
if os.name == 'nt':
from shutil import move as replace_file
else:
replace_file = os.rename
-replace_file(cache_file + tmp_suffix, cache_file)
+replace_file(fpath, cache_file)
except:
logger.exception("Dump cache file failed.")
@@ -136,12 +135,11 @@ def __cut_all(sentence):
old_j = j
-def calc(sentence,DAG,idx,route):
+def calc(sentence, DAG, idx, route):
N = len(sentence)
route[N] = (0.0, '')
for idx in range(N-1, -1, -1):
-candidates = [(FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx]]
-route[idx] = max(candidates)
+route[idx] = max((FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx])
@require_initialized
def get_DAG(sentence):
@@ -166,7 +164,7 @@ def __cut_DAG_NO_HMM(sentence):
re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
DAG = get_DAG(sentence)
route = {}
-calc(sentence, DAG, 0, route=route)
+calc(sentence, DAG, 0, route)
x = 0
N = len(sentence)
buf = ''
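
Two changes in this file are worth spelling out. The cache file name is now derived from an md5 of the dictionary path rather than hash(): Python 3 randomizes str hashes per process, so hash()-based names are not stable between runs, while the md5 name is. And the cache is written through tempfile.mkstemp() and then moved into place, so an interrupted dump cannot leave a truncated cache behind. A minimal sketch of the pattern (the helper name and sample data are illustrative, not jieba's own):

```python
import os
import marshal
import tempfile
from hashlib import md5
from shutil import move

def _dump_cache(abs_path, data):
    # Cache name derived from the dictionary path; md5 is stable across runs,
    # unlike hash(), which Python 3 randomizes per process for str.
    cache_file = os.path.join(
        tempfile.gettempdir(),
        "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())
    # Write to a mkstemp() file first (created in the same temp directory by
    # default), then move it into place so readers never see a partial dump.
    fd, fpath = tempfile.mkstemp()
    with os.fdopen(fd, 'wb') as temp_cache_file:
        marshal.dump(data, temp_cache_file)
    # shutil.move is used on Windows, where os.rename will not overwrite an
    # existing cache file; elsewhere a plain rename is enough.
    replace_file = move if os.name == 'nt' else os.rename
    replace_file(fpath, cache_file)
    return cache_file

# Hypothetical usage:
# _dump_cache("/path/to/dict.txt", ({"word"}, {"word": -8.0}, 60101967, -17.0))
```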

jieba/analyse/__init__.py

@@ -1,6 +1,7 @@
#encoding=utf-8
import jieba
import os
+from operator import itemgetter
try:
from .analyzer import ChineseAnalyzer
except ImportError:
@@ -26,9 +27,7 @@ class IDFLoader:
if self.path != new_idf_path:
content = open(new_idf_path, 'r', encoding='utf-8').read()
idf_freq = {}
-lines = content.split('\n')
-if lines and not lines[-1]:
-lines.pop(-1)
+lines = content.rstrip('\n').split('\n')
for line in lines:
word, freq = line.split(' ')
idf_freq[word] = float(freq)
@@ -60,27 +59,32 @@ def set_stop_words(stop_words_path):
STOP_WORDS.add(line)
def extract_tags(sentence, topK=20, withWeight=False):
-global STOP_WORDS
+"""
+Extract keywords from sentence using TF-IDF algorithm.
+Parameter:
+- topK: return how many top keywords. `None` for all possible words.
+- withWeight: if True, return a list of (word, weight);
+if False, return a list of words.
+"""
+global STOP_WORDS, idf_loader
idf_freq, median_idf = idf_loader.get_idf()
words = jieba.cut(sentence)
freq = {}
for w in words:
-if len(w.strip()) < 2:
-continue
-if w.lower() in STOP_WORDS:
+if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
continue
freq[w] = freq.get(w, 0.0) + 1.0
total = sum(freq.values())
-freq = [(k,v/total) for k,v in freq.items()]
-tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
-st_list = sorted(tf_idf_list, reverse=True)
+for k in freq:
+freq[k] *= idf_freq.get(k, median_idf) / total
if withWeight:
-tags = st_list[:topK]
+tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
else:
-top_tuples = st_list[:topK]
-tags = [a[1] for a in top_tuples]
-return tags
+tags = sorted(freq, key=freq.__getitem__, reverse=True)
+if topK:
+return tags[:topK]
+else:
+return tags
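
The rewritten ranking step above scales the term counts in place by idf/total and then sorts, instead of building a separate (weight, word) tuple list. As a standalone paraphrase, with a stand-in stop list and IDF table rather than jieba's own loader, it amounts to:

```python
# Standalone paraphrase of the new extract_tags ranking logic above.
# Stop list and IDF values here are toy stand-ins, not jieba's data.
from operator import itemgetter

def rank_tfidf(words, idf_freq, median_idf, stop_words, topK=20, withWeight=False):
    freq = {}
    for w in words:
        if len(w.strip()) < 2 or w.lower() in stop_words:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    # TF-IDF computed in place: count * idf / total.
    for k in freq:
        freq[k] *= idf_freq.get(k, median_idf) / total
    if withWeight:
        tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
    else:
        tags = sorted(freq, key=freq.__getitem__, reverse=True)
    return tags[:topK] if topK else tags

# Example with toy data:
print(rank_tfidf(["data", "data", "model", "the"],
                 idf_freq={"data": 1.5, "model": 2.0}, median_idf=1.8,
                 stop_words={"the"}, topK=2, withWeight=True))
```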

jieba/analyse/textrank.py

@@ -1,9 +1,10 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-import jieba.posseg as pseg
-import collections
import sys
+import collections
+from operator import itemgetter
+import jieba.posseg as pseg
class UndirectWeightedGraph:
d = 0.85
@@ -41,17 +42,25 @@ class UndirectWeightedGraph:
max_rank = w
for n, w in ws.items():
-ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0) * 100
+# to unify the weights, don't *100.
+ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
return ws
-def textrank(raw, topk=10):
+def textrank(sentence, topK=10, withWeight=False):
+"""
+Extract keywords from sentence using TextRank algorithm.
+Parameter:
+- topK: return how many top keywords. `None` for all possible words.
+- withWeight: if True, return a list of (word, weight);
+if False, return a list of words.
+"""
pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
g = UndirectWeightedGraph()
cm = collections.defaultdict(int)
span = 5
-words = [x for x in pseg.cut(raw)]
+words = list(pseg.cut(sentence))
for i in range(len(words)):
if words[i].flag in pos_filt:
for j in range(i + 1, i + span):
@@ -65,10 +74,16 @@ def textrank(raw, topk=10):
g.addEdge(terms[0], terms[1], w)
nodes_rank = g.rank()
-nrs = sorted(nodes_rank.items(), key=lambda x: x[1], reverse=True)
-return nrs[:topk]
+if withWeight:
+tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
+else:
+tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
+if topK:
+return tags[:topK]
+else:
+return tags
if __name__ == '__main__':
s = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"
-for x, w in textrank(s):
+for x, w in textrank(s, withWeight=True):
print(x, w)
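
With this change textrank() mirrors the extract_tags interface (topK, withWeight) and its weights fall in roughly the 0-1 range now that the *100 scaling is gone. A usage sketch; importing straight from the module avoids assuming how jieba.analyse re-exports the function, and the sample sentence is just illustrative input:

```python
# Usage sketch for the new textrank() signature defined above.
from jieba.analyse.textrank import textrank

s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。"

print(textrank(s, topK=5))                     # words only
for word, weight in textrank(s, topK=5, withWeight=True):
    print(word, weight)                        # weights now in roughly 0-1
```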

jieba/finalseg/__init__.py

@@ -18,25 +18,22 @@ PrevStatus = {
}
def load_model():
-_curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
start_p = {}
abs_path = os.path.join(_curpath, PROB_START_P)
-with open(abs_path, mode='rb') as f:
+with open(abs_path, 'rb') as f:
start_p = marshal.load(f)
-f.closed
trans_p = {}
abs_path = os.path.join(_curpath, PROB_TRANS_P)
with open(abs_path, 'rb') as f:
trans_p = marshal.load(f)
-f.closed
emit_p = {}
abs_path = os.path.join(_curpath, PROB_EMIT_P)
with open(abs_path, 'rb') as f:
emit_p = marshal.load(f)
-f.closed
return start_p, trans_p, emit_p
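
The dropped `f.closed` lines were no-op expression statements: the attribute is evaluated and the result discarded, and the `with` block has already closed the file by the time control leaves it. A two-line illustration of why they were safe to remove:

```python
import tempfile

with tempfile.TemporaryFile() as f:
    f.write(b"demo")
# The with-statement has already closed the file here; a bare "f.closed"
# statement would only evaluate to True and throw the value away.
print(f.closed)  # True
```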

jieba/posseg/__init__.py

@@ -25,27 +25,23 @@ def load_model(f_name, isJython=True):
line = line.decode("utf-8")
word, _, tag = line.split(" ")
result[word] = tag
-f.closed
if not isJython:
return result
start_p = {}
abs_path = os.path.join(_curpath, PROB_START_P)
-with open(abs_path, mode='rb') as f:
+with open(abs_path, 'rb') as f:
start_p = marshal.load(f)
-f.closed
trans_p = {}
abs_path = os.path.join(_curpath, PROB_TRANS_P)
with open(abs_path, 'rb') as f:
trans_p = marshal.load(f)
-f.closed
emit_p = {}
abs_path = os.path.join(_curpath, PROB_EMIT_P)
with open(abs_path, 'rb') as f:
emit_p = marshal.load(f)
-f.closed
state = {}
abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)

File diff suppressed because it is too large.

File diff suppressed because it is too large.

test/2to3.diff (new file, 450 lines)

@@ -0,0 +1,450 @@
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py
--- ./jieba/analyse/analyzer.py 2014-11-07 23:07:02.779210408 +0800
+++ ../jieba/jieba/analyse/analyzer.py 2014-11-07 23:07:02.079210422 +0800
@@ -1,4 +1,4 @@
-##encoding=utf-8
+#encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py
--- ./jieba/analyse/__init__.py 2014-11-07 23:07:02.879210406 +0800
+++ ../jieba/jieba/analyse/__init__.py 2014-11-07 23:16:27.171198767 +0800
@@ -25,7 +25,7 @@
def set_new_path(self, new_idf_path):
if self.path != new_idf_path:
- content = open(new_idf_path, 'rb').read().decode('utf-8')
+ content = open(new_idf_path, 'r', encoding='utf-8').read()
idf_freq = {}
lines = content.rstrip('\n').split('\n')
for line in lines:
@@ -81,7 +81,7 @@
freq[k] *= idf_freq.get(k, median_idf) / total
if withWeight:
- tags = sorted(list(freq.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(freq, key=freq.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py
--- ./jieba/analyse/textrank.py 2014-11-07 23:07:02.827210407 +0800
+++ ../jieba/jieba/analyse/textrank.py 2014-11-07 23:18:22.059196398 +0800
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
@@ -22,12 +22,12 @@
outSum = collections.defaultdict(float)
wsdef = 1.0 / len(self.graph)
- for n, out in list(self.graph.items()):
+ for n, out in self.graph.items():
ws[n] = wsdef
outSum[n] = sum((e[2] for e in out), 0.0)
for x in range(10): # 10 iters
- for n, inedges in list(self.graph.items()):
+ for n, inedges in self.graph.items():
s = 0
for e in inedges:
s += e[2] / outSum[e[1]] * ws[e[1]]
@@ -41,7 +41,7 @@
elif w > max_rank:
max_rank = w
- for n, w in list(ws.items()):
+ for n, w in ws.items():
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
@@ -70,12 +70,12 @@
continue
cm[(words[i].word, words[j].word)] += 1
- for terms, w in list(cm.items()):
+ for terms, w in cm.items():
g.addEdge(terms[0], terms[1], w)
nodes_rank = g.rank()
if withWeight:
- tags = sorted(list(nodes_rank.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py
--- ./jieba/finalseg/__init__.py 2014-11-07 23:07:03.147210400 +0800
+++ ../jieba/jieba/finalseg/__init__.py 2014-11-07 23:18:43.495195956 +0800
@@ -1,4 +1,3 @@
-
import re
import os
import marshal
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py
--- ./jieba/__init__.py 2014-11-07 23:07:02.751210408 +0800
+++ ../jieba/jieba/__init__.py 2014-11-07 23:22:34.963191182 +0800
@@ -1,4 +1,3 @@
-
__version__ = '0.34'
__license__ = 'MIT'
@@ -51,7 +50,7 @@
pfdict.add(word[:ch+1])
except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line))
- raise ValueError(e)
+ raise e
return pfdict, lfreq, ltotal
def initialize(dictionary=None):
@@ -78,7 +77,8 @@
if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
logger.debug("Loading model from cache %s" % cache_file)
try:
- pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
+ with open(cache_file, 'rb') as cf:
+ pfdict,FREQ,total,min_freq = marshal.load(cf)
# prevent conflict with old version
load_from_cache_fail = not isinstance(pfdict, set)
except:
@@ -228,11 +228,11 @@
'''The main function that segments an entire sentence that contains
Chinese characters into seperated words.
Parameter:
- - sentence: The str/unicode to be segmented.
+ - sentence: The str to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
'''
- if not isinstance(sentence, str):
+ if isinstance(sentence, bytes):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
@@ -338,8 +338,6 @@
global pool, cut, cut_for_search
if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system")
- if sys.version_info[0]==2 and sys.version_info[1]<6:
- raise Exception("jieba: the parallel feature needs Python version>2.5")
from multiprocessing import Pool, cpu_count
if processnum is None:
processnum = cpu_count()
@@ -392,12 +390,12 @@
def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end)
Parameter:
- - sentence: the unicode to be segmented.
+ - sentence: the str to be segmented.
- mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model.
"""
if not isinstance(unicode_sentence, str):
- raise Exception("jieba: the input parameter should be unicode.")
+ raise Exception("jieba: the input parameter should be str.")
start = 0
if mode == 'default':
for w in cut(unicode_sentence, HMM=HMM):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py
--- ./jieba/__main__.py 2014-11-07 23:07:02.563210412 +0800
+++ ../jieba/jieba/__main__.py 2014-11-07 23:07:02.079210422 +0800
@@ -40,7 +40,7 @@
ln = fp.readline()
while ln:
l = ln.rstrip('\r\n')
- print((delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8')))
+ print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))
ln = fp.readline()
fp.close()
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py
--- ./jieba/posseg/__init__.py 2014-11-07 23:07:03.047210402 +0800
+++ ../jieba/jieba/posseg/__init__.py 2014-11-07 23:19:40.883194772 +0800
@@ -1,4 +1,3 @@
-
import re
import os
from . import viterbi
@@ -18,14 +17,14 @@
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
result = {}
- with open(f_name, "r") as f:
+ with open(f_name, "rb") as f:
for line in f:
line = line.strip()
if not line:
continue
- word, _, tag = line.split(' ')
- result[word.decode('utf-8')] = tag
-
+ line = line.decode("utf-8")
+ word, _, tag = line.split(" ")
+ result[word] = tag
if not isJython:
return result
@@ -46,7 +45,7 @@
state = {}
abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
- with open(abs_path, 'r') as f:
+ with open(abs_path, 'rb') as f:
state = marshal.load(f)
f.closed
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py
--- ./jieba/posseg/viterbi.py 2014-11-07 23:07:03.079210402 +0800
+++ ../jieba/jieba/posseg/viterbi.py 2014-11-07 23:07:02.095210422 +0800
@@ -3,14 +3,13 @@
MIN_INF = float("-inf")
def get_top_states(t_state_v, K=4):
- items = list(t_state_v.items())
- topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]
+ topK = sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:K]
return [x[0] for x in topK]
def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
mem_path = [{}]
- all_states = list(trans_p.keys())
+ all_states = trans_p.keys()
for y in states.get(obs[0], all_states): #init
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
mem_path[0][y] = ''
@@ -18,9 +17,9 @@
V.append({})
mem_path.append({})
#prev_states = get_top_states(V[t-1])
- prev_states = [x for x in list(mem_path[t-1].keys()) if len(trans_p[x]) > 0]
+ prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]
- prev_states_expect_next = set((y for x in prev_states for y in list(trans_p[x].keys())))
+ prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next
if not obs_states:
@@ -31,7 +30,7 @@
V[t][y] = prob
mem_path[t][y] = state
- last = [(V[-1][y], y) for y in list(mem_path[-1].keys())]
+ last = [(V[-1][y], y) for y in mem_path[-1].keys()]
#if len(last)==0:
#print obs
prob, state = max(last)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md
--- ./README.md 2014-11-07 23:07:02.067210423 +0800
+++ ../jieba/README.md 2014-11-07 23:24:49.263188412 +0800
@@ -4,6 +4,9 @@
"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
- _Scroll down for English documentation._
+注意!
+========
+这个branch `jieba3k` 是专门用于Python3.x的版本
特点
========
@@ -68,16 +71,16 @@
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 精确模式
+print("Default Mode:", "/ ".join(seg_list)) # 精确模式
seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```
输出:
@@ -174,7 +177,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -203,7 +206,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -219,7 +222,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -408,16 +411,16 @@
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 默认模式
+print("Default Mode:", "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut("他来到了网易杭研大厦")
-print ", ".join(seg_list)
+print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```
Output:
@@ -483,7 +486,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -512,7 +515,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -528,7 +531,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py
--- ./setup.py 2014-11-07 23:07:02.067210423 +0800
+++ ../jieba/setup.py 2014-11-07 23:07:02.095210422 +0800
@@ -1,5 +1,5 @@
from distutils.core import setup
-setup(name='jieba',
+setup(name='jieba3k',
version='0.34',
description='Chinese Words Segementation Utilities',
author='Sun, Junyi',
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py
--- ./test/extract_topic.py 2014-11-07 23:07:03.707210389 +0800
+++ ../jieba/test/extract_topic.py 2014-11-07 23:07:02.095210422 +0800
@@ -51,13 +51,13 @@
print("training...")
nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
-print(("done in %0.3fs." % (time.time() - t0)))
+print("done in %0.3fs." % (time.time() - t0))
# Inverse the vectorizer vocabulary to be able
feature_names = count_vect.get_feature_names()
for topic_idx, topic in enumerate(nmf.components_):
- print(("Topic #%d:" % topic_idx))
- print((" ".join([feature_names[i]
- for i in topic.argsort()[:-n_top_words - 1:-1]])))
+ print("Topic #%d:" % topic_idx)
+ print(" ".join([feature_names[i]
+ for i in topic.argsort()[:-n_top_words - 1:-1]]))
print("")
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py
--- ./test/jiebacmd.py 2014-11-07 23:07:03.211210399 +0800
+++ ../jieba/test/jiebacmd.py 2014-11-07 23:07:02.099210422 +0800
@@ -23,6 +23,6 @@
break
line = line.strip()
for word in jieba.cut(line):
- print(word.encode(default_encoding))
+ print(word)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py
--- ./test/jieba_test.py 2014-11-07 23:07:03.947210384 +0800
+++ ../jieba/test/jieba_test.py 2014-11-07 23:07:02.099210422 +0800
@@ -1,5 +1,6 @@
#-*-coding: utf-8 -*-
import sys
+import imp
sys.path.append("../")
import unittest
import types
@@ -97,7 +98,7 @@
class JiebaTestCase(unittest.TestCase):
def setUp(self):
- reload(jieba)
+ imp.reload(jieba)
def tearDown(self):
pass
@@ -151,7 +152,7 @@
def testTokenize(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'))
+ result = jieba.tokenize(content)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
@@ -180,7 +181,7 @@
def testTokenize_NOHMM(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'),HMM=False)
+ result = jieba.tokenize(content,HMM=False)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py
--- ./test/test_tokenize_no_hmm.py 2014-11-07 23:07:04.031210382 +0800
+++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-07 23:07:02.099210422 +0800
@@ -7,7 +7,6 @@
def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py
--- ./test/test_tokenize.py 2014-11-07 23:07:04.071210381 +0800
+++ ../jieba/test/test_tokenize.py 2014-11-07 23:07:02.099210422 +0800
@@ -7,7 +7,6 @@
def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))

test/auto2to3 (new executable file, 30 lines)

@@ -0,0 +1,30 @@
#!/bin/bash
# Set 2to3 path.
PYTHON2TO3=2to3
# Copy the python2 version.
echo Jieba 2to3 manual conversion tool
echo
if ! git rev-parse; then
exit 1
fi
echo Copying working directory to ../jieba2
if [ -d ../jieba2 ]; then
echo Found existing ../jieba2
read -p "Replace it with new one? (y/n) " -r
if ! [[ $REPLY =~ ^[Yy]$ ]]; then
echo Cancelled.
exit
else
rm -rf ../jieba2
fi
fi
git checkout master
cp -r . ../jieba2
git checkout jieba3k
cd ../jieba2
# Here starts auto conversion.
echo Converting jieba2 to Python3 ...
find . -type f -name '*.py' \! -path '*/build/*' \! -name 'prob_*.py' \! -name 'char_state_tab.py' -exec $PYTHON2TO3 -w -n {} +
find . -type f \! -path '*/build/*' -a \( -name 'prob_*.py' -o -name 'char_state_tab.py' \) -exec sed -i "s/u'\\\u/'\\\u/g" {} \;
patch -p0 <2to3.diff
echo Done. Compare jieba and jieba2 to manually port.
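
The script copies the master branch to ../jieba2, runs 2to3 over the hand-written sources (skipping the generated prob_*.py and char_state_tab.py tables), rewrites the u'\u....' literals in those tables with sed, and finally applies 2to3.diff with patch. The sed pass corresponds roughly to the following Python sketch, which is an illustration rather than part of the repository:

```python
# Rough Python equivalent of the script's sed pass over prob_*.py and
# char_state_tab.py: drop the now-redundant u prefix from u'\uXXXX' literals.
import re

def strip_u_prefix(source_text):
    return re.sub(r"u'(\\u)", r"'\1", source_text)

print(strip_u_prefix(r"{u'\u4e00': 0.1, u'\u4e8c': 0.2}"))
# -> {'\u4e00': 0.1, '\u4e8c': 0.2}
```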

test/extract_tags_with_weight.py

@@ -38,6 +38,6 @@ tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)
if withWeight is True:
for tag in tags:
-print("tag: %s\t\t weight: %f" % (tag[1],tag[0]))
+print("tag: %s\t\t weight: %f" % (tag[0],tag[1]))
else:
print(",".join(tags))