diff --git a/Changelog b/Changelog index 671add0..be1aaa3 100644 --- a/Changelog +++ b/Changelog @@ -1,3 +1,6 @@ +2014-11-15: version 0.35.1 +1. 修复 Python 3.2 的兼容性问题 + 2014-11-13: version 0.35 1. 改进词典cache的dump和加载机制;by @gumblex 2. 提升关键词提取的性能; by @gumblex diff --git a/jieba/__init__.py b/jieba/__init__.py index 4e46fa3..27c9dc2 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -1,5 +1,5 @@ from __future__ import with_statement -__version__ = '0.34' +__version__ = '0.35' __license__ = 'MIT' import re @@ -78,7 +78,8 @@ def initialize(dictionary=None): if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path): logger.debug("Loading model from cache %s" % cache_file) try: - pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb')) + with open(cache_file, 'rb') as cf: + pfdict,FREQ,total,min_freq = marshal.load(cf) # prevent conflict with old version load_from_cache_fail = not isinstance(pfdict, set) except: diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index 30160d4..484874d 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -46,7 +46,7 @@ def load_model(f_name, isJython=True): state = {} abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P) - with open(abs_path, 'r') as f: + with open(abs_path, 'rb') as f: state = marshal.load(f) f.closed @@ -126,7 +126,7 @@ def __cut_detail(sentence): def __cut_DAG_NO_HMM(sentence): DAG = jieba.get_DAG(sentence) route = {} - jieba.calc(sentence, DAG, 0, route=route) + jieba.calc(sentence, DAG, route) x = 0 N = len(sentence) buf = u'' @@ -151,7 +151,7 @@ def __cut_DAG(sentence): DAG = jieba.get_DAG(sentence) route = {} - jieba.calc(sentence,DAG,0,route=route) + jieba.calc(sentence, DAG, route) x = 0 buf = u'' diff --git a/jieba/posseg/viterbi.py b/jieba/posseg/viterbi.py index 0130f5b..5a643fb 100644 --- a/jieba/posseg/viterbi.py +++ b/jieba/posseg/viterbi.py @@ -3,9 +3,7 @@ MIN_FLOAT = -3.14e100 MIN_INF = float("-inf") def get_top_states(t_state_v, K=4): - items = t_state_v.items() - topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K] - return [x[0] for x in topK] + return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K] def viterbi(obs, states, start_p, trans_p, emit_p): V = [{}] #tabular @@ -27,7 +25,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p): obs_states = prev_states_expect_next if prev_states_expect_next else all_states for y in obs_states: - prob, state = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states]) + prob, state = max((V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states) V[t][y] = prob mem_path[t][y] = state diff --git a/setup.py b/setup.py index 3e25168..10af9e9 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,85 @@ +# -*- coding: utf-8 -*- from distutils.core import setup +LONGDOC = u""" +jieba +===== + +“结巴”中文分词:做最好的 Python 中文分词组件 + +"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to +be the best Python Chinese word segmentation module. 
+ +完整文档见 ``README.md`` + +GitHub: https://github.com/fxsjy/jieba + +特点 +==== + +- 支持三种分词模式: + + - 精确模式,试图将句子最精确地切开,适合文本分析; + - 全模式,把句子中所有的可以成词的词语都扫描出来, + 速度非常快,但是不能解决歧义; + - 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。 + +- 支持繁体分词 +- 支持自定义词典 + +在线演示: http://jiebademo.ap01.aws.af.cm/ + +安装说明 +======== + +Python 2.x +---------- + +- 全自动安装: ``easy_install jieba`` 或者 ``pip install jieba`` +- 半自动安装:先下载 https://pypi.python.org/pypi/jieba/ ,解压后运行 + python setup.py install +- 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录 +- 通过 ``import jieba`` 来引用 + +Python 3.x +---------- + +见 https://pypi.python.org/pypi/jieba3k/ + +- 目前 master 分支是只支持 Python 2.x 的 +- Python 3.x 版本的分支也已经基本可用: + https://github.com/fxsjy/jieba/tree/jieba3k + +.. code:: bash + + git clone https://github.com/fxsjy/jieba.git + git checkout jieba3k + python setup.py install + +- 或使用pip3安装: pip3 install jieba3k + +""" + setup(name='jieba', version='0.35', description='Chinese Words Segementation Utilities', + long_description=LONGDOC, author='Sun, Junyi', author_email='ccnusjy@gmail.com', - url='http://github.com/fxsjy', + url='https://github.com/fxsjy/jieba', + license="MIT", + classifiers=[ + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + 'Natural Language :: Chinese (Simplified)', + 'Natural Language :: Chinese (Traditional)', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Topic :: Text Processing', + 'Topic :: Text Processing :: Indexing', + 'Topic :: Text Processing :: Linguistic', + ], + keywords='NLP,tokenizing,Chinese word segementation', packages=['jieba'], package_dir={'jieba':'jieba'}, package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']} diff --git a/test/2to3.diff b/test/2to3.diff new file mode 100644 index 0000000..2c4396f --- /dev/null +++ b/test/2to3.diff @@ -0,0 +1,522 @@ +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py +--- ./jieba/analyse/analyzer.py 2014-11-29 15:46:45.987925569 +0800 ++++ ../jieba/jieba/analyse/analyzer.py 2014-11-29 15:34:42.859932465 +0800 +@@ -1,4 +1,4 @@ +-##encoding=utf-8 ++#encoding=utf-8 + from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter + from whoosh.analysis import Tokenizer,Token + from whoosh.lang.porter import stem +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py +--- ./jieba/analyse/__init__.py 2014-11-29 15:46:46.139925567 +0800 ++++ ../jieba/jieba/analyse/__init__.py 2014-11-29 15:36:13.147931604 +0800 +@@ -26,7 +26,7 @@ + + def set_new_path(self, new_idf_path): + if self.path != new_idf_path: +- content = open(new_idf_path, 'rb').read().decode('utf-8') ++ content = open(new_idf_path, 'r', encoding='utf-8').read() + idf_freq = {} + lines = content.rstrip('\n').split('\n') + for line in lines: +@@ -93,7 +93,7 @@ + freq[k] *= idf_freq.get(k, median_idf) / total + + if withWeight: +- tags = sorted(list(freq.items()), key=itemgetter(1), reverse=True) ++ tags = sorted(freq.items(), key=itemgetter(1), reverse=True) + else: + tags = sorted(freq, key=freq.__getitem__, reverse=True) + if topK: +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py +--- ./jieba/analyse/textrank.py 2014-11-29 15:46:46.043925568 +0800 ++++ ../jieba/jieba/analyse/textrank.py 2014-11-29 
15:36:39.291931354 +0800 +@@ -1,4 +1,4 @@ +-#!/usr/bin/env python ++#!/usr/bin/env python3 + # -*- coding: utf-8 -*- + + import sys +@@ -22,12 +22,12 @@ + outSum = collections.defaultdict(float) + + wsdef = 1.0 / len(self.graph) +- for n, out in list(self.graph.items()): ++ for n, out in self.graph.items(): + ws[n] = wsdef + outSum[n] = sum((e[2] for e in out), 0.0) + + for x in range(10): # 10 iters +- for n, inedges in list(self.graph.items()): ++ for n, inedges in self.graph.items(): + s = 0 + for e in inedges: + s += e[2] / outSum[e[1]] * ws[e[1]] +@@ -41,7 +41,7 @@ + elif w > max_rank: + max_rank = w + +- for n, w in list(ws.items()): ++ for n, w in ws.items(): + # to unify the weights, don't *100. + ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0) + +@@ -72,12 +72,12 @@ + continue + cm[(words[i].word, words[j].word)] += 1 + +- for terms, w in list(cm.items()): ++ for terms, w in cm.items(): + g.addEdge(terms[0], terms[1], w) + + nodes_rank = g.rank() + if withWeight: +- tags = sorted(list(nodes_rank.items()), key=itemgetter(1), reverse=True) ++ tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True) + else: + tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True) + if topK: +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py +--- ./jieba/finalseg/__init__.py 2014-11-29 15:46:46.367925565 +0800 ++++ ../jieba/jieba/finalseg/__init__.py 2014-11-29 15:34:42.859932465 +0800 +@@ -1,4 +1,3 @@ +- + import re + import os + import marshal +@@ -89,7 +88,7 @@ + sentence = sentence.decode('utf-8') + except UnicodeDecodeError: + sentence = sentence.decode('gbk', 'ignore') +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)") ++ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)") + blocks = re_han.split(sentence) + for blk in blocks: + if re_han.match(blk): +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py +--- ./jieba/__init__.py 2014-11-29 15:46:45.955925569 +0800 ++++ ../jieba/jieba/__init__.py 2014-11-29 15:39:03.335929981 +0800 +@@ -1,4 +1,3 @@ +- + __version__ = '0.35' + __license__ = 'MIT' + +@@ -51,7 +50,7 @@ + pfdict.add(word[:ch+1]) + except ValueError as e: + logger.debug('%s at line %s %s' % (f_name, lineno, line)) +- raise ValueError(e) ++ raise e + return pfdict, lfreq, ltotal + + def initialize(dictionary=None): +@@ -229,11 +228,11 @@ + '''The main function that segments an entire sentence that contains + Chinese characters into seperated words. + Parameter: +- - sentence: The str/unicode to be segmented. ++ - sentence: The str to be segmented. + - cut_all: Model type. True for full pattern, False for accurate pattern. + - HMM: Whether to use the Hidden Markov Model. + ''' +- if not isinstance(sentence, str): ++ if isinstance(sentence, bytes): + try: + sentence = sentence.decode('utf-8') + except UnicodeDecodeError: +@@ -243,9 +242,9 @@ + # \r\n|\s : whitespace characters. Will not be handled. 
+ + if cut_all: +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U) ++ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U) + else: +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U) ++ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U) + blocks = re_han.split(sentence) + if cut_all: + cut_block = __cut_all +@@ -339,8 +338,6 @@ + global pool, cut, cut_for_search + if os.name == 'nt': + raise Exception("jieba: parallel mode only supports posix system") +- if sys.version_info[0]==2 and sys.version_info[1]<6: +- raise Exception("jieba: the parallel feature needs Python version>2.5") + from multiprocessing import Pool, cpu_count + if processnum is None: + processnum = cpu_count() +@@ -393,12 +390,12 @@ + def tokenize(unicode_sentence, mode="default", HMM=True): + """Tokenize a sentence and yields tuples of (word, start, end) + Parameter: +- - sentence: the unicode to be segmented. ++ - sentence: the str to be segmented. + - mode: "default" or "search", "search" is for finer segmentation. + - HMM: whether to use the Hidden Markov Model. + """ + if not isinstance(unicode_sentence, str): +- raise Exception("jieba: the input parameter should be unicode.") ++ raise Exception("jieba: the input parameter should be str.") + start = 0 + if mode == 'default': + for w in cut(unicode_sentence, HMM=HMM): +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py +--- ./jieba/__main__.py 2014-11-29 15:46:45.747925571 +0800 ++++ ../jieba/jieba/__main__.py 2014-11-29 15:34:42.859932465 +0800 +@@ -40,7 +40,7 @@ + ln = fp.readline() + while ln: + l = ln.rstrip('\r\n') +- print((delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8'))) ++ print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm))) + ln = fp.readline() + + fp.close() +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py +--- ./jieba/posseg/__init__.py 2014-11-29 15:46:46.271925566 +0800 ++++ ../jieba/jieba/posseg/__init__.py 2014-11-29 15:37:52.299930658 +0800 +@@ -1,4 +1,3 @@ +- + import re + import os + from . 
import viterbi +@@ -18,14 +17,14 @@ + _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + + result = {} +- with open(f_name, "r") as f: ++ with open(f_name, "rb") as f: + for line in f: + line = line.strip() + if not line: + continue +- word, _, tag = line.split(' ') +- result[word.decode('utf-8')] = tag +- ++ line = line.decode("utf-8") ++ word, _, tag = line.split(" ") ++ result[word] = tag + if not isJython: + return result + +@@ -105,8 +104,8 @@ + yield pair(sentence[next:], pos_list[next][1]) + + def __cut_detail(sentence): +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)") +- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+") ++ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)") ++ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+") + blocks = re_han.split(sentence) + for blk in blocks: + if re_han.match(blk): +@@ -130,7 +129,7 @@ + x = 0 + N = len(sentence) + buf = '' +- re_eng = re.compile(r'[a-zA-Z0-9]',re.U) ++ re_eng = re.compile('[a-zA-Z0-9]',re.U) + while x < N: + y = route[x][1]+1 + l_word = sentence[x:y] +@@ -195,8 +194,8 @@ + sentence = sentence.decode('utf-8') + except UnicodeDecodeError: + sentence = sentence.decode('gbk', 'ignore') +- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)") +- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+") ++ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)") ++ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+") + blocks = re_han.split(sentence) + if HMM: + __cut_blk = __cut_DAG +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py +--- ./jieba/posseg/viterbi.py 2014-11-29 15:46:46.303925566 +0800 ++++ ../jieba/jieba/posseg/viterbi.py 2014-11-29 15:38:28.527930313 +0800 +@@ -8,7 +8,7 @@ + def viterbi(obs, states, start_p, trans_p, emit_p): + V = [{}] #tabular + mem_path = [{}] +- all_states = list(trans_p.keys()) ++ all_states = trans_p.keys() + for y in states.get(obs[0], all_states): #init + V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT) + mem_path[0][y] = '' +@@ -16,9 +16,9 @@ + V.append({}) + mem_path.append({}) + #prev_states = get_top_states(V[t-1]) +- prev_states = [x for x in list(mem_path[t-1].keys()) if len(trans_p[x]) > 0] ++ prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0] + +- prev_states_expect_next = set((y for x in prev_states for y in list(trans_p[x].keys()))) ++ prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys())) + obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next + + if not obs_states: +@@ -29,7 +29,7 @@ + V[t][y] = prob + mem_path[t][y] = state + +- last = [(V[-1][y], y) for y in list(mem_path[-1].keys())] ++ last = [(V[-1][y], y) for y in mem_path[-1].keys()] + #if len(last)==0: + #print obs + prob, state = max(last) +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md +--- ./README.md 2014-11-29 15:46:08.487925926 +0800 ++++ ../jieba/README.md 2014-11-29 15:34:42.859932465 +0800 +@@ -4,6 +4,9 @@ + "Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module. + - _Scroll down for English documentation._ + ++注意! 
++======== ++这个branch `jieba3k` 是专门用于Python3.x的版本 + + 特点 + ======== +@@ -68,16 +71,16 @@ + import jieba + + seg_list = jieba.cut("我来到北京清华大学", cut_all=True) +-print "Full Mode:", "/ ".join(seg_list) # 全模式 ++print("Full Mode:", "/ ".join(seg_list)) # 全模式 + + seg_list = jieba.cut("我来到北京清华大学", cut_all=False) +-print "Default Mode:", "/ ".join(seg_list) # 精确模式 ++print("Default Mode:", "/ ".join(seg_list)) # 精确模式 + + seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式 +-print ", ".join(seg_list) ++print(", ".join(seg_list)) + + seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 +-print ", ".join(seg_list) ++print(", ".join(seg_list)) + ``` + + 输出: +@@ -174,7 +177,7 @@ + >>> import jieba.posseg as pseg + >>> words = pseg.cut("我爱北京天安门") + >>> for w in words: +-... print w.word, w.flag ++... print(w.word, w.flag) + ... + 我 r + 爱 v +@@ -203,7 +206,7 @@ + ```python + result = jieba.tokenize(u'永和服装饰品有限公司') + for tk in result: +- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) ++ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) + ``` + + ``` +@@ -219,7 +222,7 @@ + ```python + result = jieba.tokenize(u'永和服装饰品有限公司',mode='search') + for tk in result: +- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) ++ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) + ``` + + ``` +@@ -408,16 +411,16 @@ + import jieba + + seg_list = jieba.cut("我来到北京清华大学", cut_all=True) +-print "Full Mode:", "/ ".join(seg_list) # 全模式 ++print("Full Mode:", "/ ".join(seg_list)) # 全模式 + + seg_list = jieba.cut("我来到北京清华大学", cut_all=False) +-print "Default Mode:", "/ ".join(seg_list) # 默认模式 ++print("Default Mode:", "/ ".join(seg_list)) # 默认模式 + + seg_list = jieba.cut("他来到了网易杭研大厦") +-print ", ".join(seg_list) ++print(", ".join(seg_list)) + + seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 +-print ", ".join(seg_list) ++print(", ".join(seg_list)) + ``` + + Output: +@@ -483,7 +486,7 @@ + >>> import jieba.posseg as pseg + >>> words = pseg.cut("我爱北京天安门") + >>> for w in words: +-... print w.word, w.flag ++... print(w.word, w.flag) + ... 
+ 我 r + 爱 v +@@ -512,7 +515,7 @@ + ```python + result = jieba.tokenize(u'永和服装饰品有限公司') + for tk in result: +- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) ++ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) + ``` + + ``` +@@ -528,7 +531,7 @@ + ```python + result = jieba.tokenize(u'永和服装饰品有限公司',mode='search') + for tk in result: +- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) ++ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) + ``` + + ``` +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py +--- ./setup.py 2014-11-29 15:46:46.379925565 +0800 ++++ ../jieba/setup.py 2014-11-29 15:42:20.263928103 +0800 +@@ -11,7 +11,7 @@ + + 完整文档见 ``README.md`` + +-GitHub: https://github.com/fxsjy/jieba ++GitHub: https://github.com/fxsjy/jieba/tree/jieba3k + + 特点 + ==== +@@ -34,17 +34,11 @@ + Python 2.x + ---------- + +-- 全自动安装: ``easy_install jieba`` 或者 ``pip install jieba`` +-- 半自动安装:先下载 https://pypi.python.org/pypi/jieba/ ,解压后运行 +- python setup.py install +-- 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录 +-- 通过 ``import jieba`` 来引用 ++见 https://pypi.python.org/pypi/jieba/ + + Python 3.x + ---------- + +-见 https://pypi.python.org/pypi/jieba3k/ +- + - 目前 master 分支是只支持 Python 2.x 的 + - Python 3.x 版本的分支也已经基本可用: + https://github.com/fxsjy/jieba/tree/jieba3k +@@ -59,13 +53,13 @@ + + """ + +-setup(name='jieba', ++setup(name='jieba3k', + version='0.35.1', + description='Chinese Words Segementation Utilities', + long_description=LONGDOC, + author='Sun, Junyi', + author_email='ccnusjy@gmail.com', +- url='https://github.com/fxsjy/jieba', ++ url='https://github.com/fxsjy/jieba/tree/jieba3k', + license="MIT", + classifiers=[ + 'Intended Audience :: Developers', +@@ -73,9 +67,8 @@ + 'Operating System :: OS Independent', + 'Natural Language :: Chinese (Simplified)', + 'Natural Language :: Chinese (Traditional)', + 'Programming Language :: Python', +- 'Programming Language :: Python :: 2', ++ 'Programming Language :: Python :: 3', + 'Topic :: Text Processing', + 'Topic :: Text Processing :: Indexing', + 'Topic :: Text Processing :: Linguistic', +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py +--- ./test/extract_topic.py 2014-11-29 15:46:47.003925559 +0800 ++++ ../jieba/test/extract_topic.py 2014-11-29 15:34:42.919932464 +0800 +@@ -51,13 +51,13 @@ + print("training...") + + nmf = decomposition.NMF(n_components=n_topic).fit(tfidf) +-print(("done in %0.3fs." % (time.time() - t0))) ++print("done in %0.3fs." 
% (time.time() - t0)) + + # Inverse the vectorizer vocabulary to be able + feature_names = count_vect.get_feature_names() + + for topic_idx, topic in enumerate(nmf.components_): +- print(("Topic #%d:" % topic_idx)) +- print((" ".join([feature_names[i] +- for i in topic.argsort()[:-n_top_words - 1:-1]]))) ++ print("Topic #%d:" % topic_idx) ++ print(" ".join([feature_names[i] ++ for i in topic.argsort()[:-n_top_words - 1:-1]])) + print("") +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py +--- ./test/jiebacmd.py 2014-11-29 15:46:46.443925564 +0800 ++++ ../jieba/test/jiebacmd.py 2014-11-29 15:34:42.919932464 +0800 +@@ -23,6 +23,6 @@ + break + line = line.strip() + for word in jieba.cut(line): +- print(word.encode(default_encoding)) ++ print(word) + + +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py +--- ./test/jieba_test.py 2014-11-29 15:46:47.271925556 +0800 ++++ ../jieba/test/jieba_test.py 2014-11-29 15:34:42.919932464 +0800 +@@ -152,7 +152,7 @@ + #-*-coding: utf-8 -*- + import sys ++import imp + sys.path.append("../") + import unittest + import types +@@ -97,7 +98,7 @@ + + class JiebaTestCase(unittest.TestCase): + def setUp(self): +- reload(jieba) ++ imp.reload(jieba) + + def tearDown(self): + pass +@@ -151,7 +152,7 @@ + + def testTokenize(self): + for content in test_contents: +- result = jieba.tokenize(content.decode('utf-8')) ++ result = jieba.tokenize(content) + assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error" + result = list(result) + assert isinstance(result, list), "Test Tokenize error on content: %s" % content +@@ -181,7 +181,7 @@ + + def testTokenize_NOHMM(self): + for content in test_contents: +- result = jieba.tokenize(content.decode('utf-8'),HMM=False) ++ result = jieba.tokenize(content,HMM=False) + assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error" + result = list(result) + assert isinstance(result, list), "Test Tokenize error on content: %s" % content +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py +--- ./test/test_tokenize_no_hmm.py 2014-11-29 15:46:47.355925556 +0800 ++++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-29 15:34:42.919932464 +0800 +@@ -7,7 +7,6 @@ + + def cuttest(test_sent): + global g_mode +- test_sent = test_sent.decode('utf-8') + result = jieba.tokenize(test_sent,mode=g_mode,HMM=False) + for tk in result: + print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) +diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py +--- ./test/test_tokenize.py 2014-11-29 15:46:47.403925555 +0800 ++++ ../jieba/test/test_tokenize.py 2014-11-29 15:34:42.919932464 +0800 +@@ -7,7 +7,6 @@ + + def cuttest(test_sent): + global g_mode +- test_sent = test_sent.decode('utf-8') + result = jieba.tokenize(test_sent,mode=g_mode) + for tk in result: + print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) diff --git a/test/auto2to3 b/test/auto2to3 new file mode 100644 index 0000000..72ed37d --- /dev/null +++ b/test/auto2to3 @@ -0,0 +1,34 @@ +#!/bin/bash +# Set 2to3 path. +PYTHON2TO3=2to3 +# Copy the python2 version. +echo Jieba 2to3 manual conversion tool +echo +if ! 
git rev-parse; then + exit 1 +fi +echo Copying working directory to ../jieba2 +if [ -d ../jieba2 ]; then + echo Found existing ../jieba2 + read -p "Replace it with new one? (y/n) " -r + if ! [[ $REPLY =~ ^[Yy]$ ]]; then + echo Cancelled. + exit + else + rm -rf ../jieba2 + fi +fi +if ! git checkout jieba3k; then + exit 1 +fi +cp -r . ../jieba2 +cd ../jieba2 +if ! git checkout master; then + exit 1 +fi +# Here starts auto conversion. +echo Converting jieba2 to Python3 ... +find . -type f -name '*.py' \! -path '*/build/*' \! -name 'prob_*.py' \! -name 'char_state_tab.py' -exec $PYTHON2TO3 -w -n {} + +find . -type f \! -path '*/build/*' -a \( -name 'prob_*.py' -o -name 'char_state_tab.py' \) -exec sed -i "s/u'\\\u/'\\\u/g" {} \; +patch -p0 -s <../jieba/test/2to3.diff +echo Done. Compare jieba and jieba2 to manually port.
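
Notes on two of the changes above, with minimal standalone sketches. The file names and sample values in these sketches are invented for illustration; only the idioms themselves come from the patch.

The one-line rewrite of `get_top_states` in `jieba/posseg/viterbi.py` sorts the state dictionary's keys by their values directly instead of building an item list and using `operator.itemgetter`; the related `max(...)` edit only drops the square brackets so the candidates are passed to `max` as a generator, with identical results. A sketch of the same sorted-by-value idiom:

```python
# Sketch of the sorted-by-value idiom used by the new get_top_states.
# The score table here is made up; jieba ranks Viterbi state
# log-probabilities the same way.
def top_k_keys(score_table, k=4):
    # sorted() iterates over the dict's keys; score_table.__getitem__
    # supplies the value for each key, so keys come out ordered by
    # descending value.
    return sorted(score_table, key=score_table.__getitem__, reverse=True)[:k]

scores = {'B': -3.2, 'E': -1.5, 'M': -7.8, 'S': -0.9}
print(top_k_keys(scores, k=2))  # ['S', 'E']
```

The cache fix in `jieba/__init__.py` wraps `marshal.load` in a `with` block so the file handle is closed even when loading fails. A sketch of the dump/load round trip under that pattern, using a placeholder cache path and model tuple rather than jieba's real ones:

```python
import marshal
import os
import tempfile

# Placeholder cache location; jieba computes its own cache_file path.
cache_file = os.path.join(tempfile.gettempdir(), "demo_dict.cache")

# Dump a (prefix set, frequency dict, total, min_freq) style tuple.
with open(cache_file, "wb") as cf:
    marshal.dump(({"a", "ab"}, {"ab": 3}, 10, 0.1), cf)

# Load it back; the with-statement closes the handle even if marshal.load
# raises, which the old one-line open(...) call did not guarantee.
with open(cache_file, "rb") as cf:
    pfdict, FREQ, total, min_freq = marshal.load(cf)
```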