From bb1e6000c61247c6134f8edebfcedc4589ba9e46 Mon Sep 17 00:00:00 2001
From: Dingyuan Wang
Date: Sun, 19 Oct 2014 10:57:46 +0800
Subject: [PATCH] fix version; fix spaces at end of line

---
 Changelog                  | 14 +++++++-------
 jieba/__main__.py          |  4 +++-
 jieba/analyse/analyzer.py  |  2 +-
 jieba/finalseg/__init__.py |  8 ++++----
 jieba/posseg/__init__.py   | 31 ++++++++++++++++---------------
 setup.py                   | 18 +++++++++---------
 test/test_pos.py           |  4 ++--
 test/test_userdict.py      |  2 +-
 test/test_whoosh.py        | 14 +++++++-------
 test/test_whoosh_flie.py   |  6 +++---
 10 files changed, 53 insertions(+), 50 deletions(-)

diff --git a/Changelog b/Changelog
index 258dda2..99be4ee 100644
--- a/Changelog
+++ b/Changelog
@@ -13,10 +13,10 @@
 2013-07-01: version 0.31
 ========================
 1. 修改了代码缩进格式,遵循PEP8标准
-2. 支持Jython解析器,感谢 @piaolingxue
+2. 支持Jython解析器,感谢 @piaolingxue
 3. 修复中英混合词汇不能识别数字在前词语的Bug
-4. 部分代码重构,感谢 @chao78787
-5. 多进程并行分词模式下自动检测CPU个数设置合适的进程数,感谢@linkerlin
+4. 部分代码重构,感谢 @chao78787
+5. 多进程并行分词模式下自动检测CPU个数设置合适的进程数,感谢@linkerlin
 6. 修复了0.3版中jieba.extra_tags方法对whoosh模块的错误依赖
 
@@ -55,8 +55,8 @@
 2013-04-27: version 0.28
 ========================
 1) 新增词典lazy load功能,用户可以在'import jieba'后再改变词典的路径. 感谢hermanschaaf
-2) 显示词典加载异常时错误的词条信息. 感谢neuront
-3) 修正了词典被vim编辑后会加载失败的bug. 感谢neuront
+2) 显示词典加载异常时错误的词条信息. 感谢neuront
+3) 修正了词典被vim编辑后会加载失败的bug. 感谢neuront
 
 2013-04-22: version 0.27
 ========================
@@ -93,7 +93,7 @@
 2012-11-28: version 0.22
 ========================
 1) 新增jieba.cut_for_search方法, 该方法在精确分词的基础上对“长词”进行再次切分,适用于搜索引擎领域的分词,比精确分词模式有更高的召回率。
-2) 开始支持Python3.x版。 之前一直是只支持Python2.x系列,从这个版本起有一个单独的jieba3k
+2) 开始支持Python3.x版。 之前一直是只支持Python2.x系列,从这个版本起有一个单独的jieba3k
 
 
 2012-11-23: version 0.21
@@ -104,7 +104,7 @@
 2012-11-06: version 0.20
 ========================
-1) 新增词性标注功能
+1) 新增词性标注功能
 
 
 2012-10-25: version 0.19
diff --git a/jieba/__main__.py b/jieba/__main__.py
index b2bd203..d90096d 100644
--- a/jieba/__main__.py
+++ b/jieba/__main__.py
@@ -3,7 +3,7 @@
 import sys
 import jieba
 from argparse import ArgumentParser
-parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", version="Jieba " + jieba.__version__, epilog="If no filename specified, use STDIN instead.")
+parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", epilog="If no filename specified, use STDIN instead.")
 parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
                     nargs='?', const=' ',
                     help="use DELIM instead of ' / ' for word delimiter; use a space if it is without DELIM")
@@ -14,6 +14,8 @@
 parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
                     default=True, help="don't use the Hidden Markov Model")
 parser.add_argument("-q", "--quiet", action="store_true", default=False,
                     help="don't print loading messages to stderr")
+parser.add_argument("-V", '--version', action='version',
+                    version="Jieba " + jieba.__version__)
 parser.add_argument("filename", nargs='?', help="input file")
 args = parser.parse_args()
diff --git a/jieba/analyse/analyzer.py b/jieba/analyse/analyzer.py
index cc73589..d1b16b5 100644
--- a/jieba/analyse/analyzer.py
+++ b/jieba/analyse/analyzer.py
@@ -1,6 +1,6 @@
 ##encoding=utf-8
 from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
-from whoosh.analysis import Tokenizer,Token
+from whoosh.analysis import Tokenizer,Token
 from whoosh.lang.porter import stem
 
 import jieba
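Note on the jieba/__main__.py hunks above: argparse deprecated the
ArgumentParser(version=...) keyword in Python 2.7 and removed it in
Python 3, so the patch moves the version string to a "-V/--version"
argument with action='version'. A minimal standalone sketch of that
pattern (the "Jieba 0.33" string is illustrative, standing in for
"Jieba " + jieba.__version__ above):

    # Declare the version flag as an argument; action='version' prints
    # the given string and exits, replacing ArgumentParser(version=...).
    from argparse import ArgumentParser

    parser = ArgumentParser(description="Jieba command line interface.")
    parser.add_argument("-V", "--version", action="version",
                        version="Jieba 0.33")  # illustrative version string
    args = parser.parse_args()
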
diff --git a/jieba/finalseg/__init__.py b/jieba/finalseg/__init__.py
index 9426a72..fa47268 100644
--- a/jieba/finalseg/__init__.py
+++ b/jieba/finalseg/__init__.py
@@ -23,26 +23,26 @@ def load_model():
     start_p = {}
     abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, mode='rb') as f:
+    with open(abs_path, mode='rb') as f:
         start_p = marshal.load(f)
     f.closed
 
     trans_p = {}
     abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'rb') as f:
+    with open(abs_path, 'rb') as f:
         trans_p = marshal.load(f)
     f.closed
 
     emit_p = {}
     abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with file(abs_path, 'rb') as f:
+    with open(abs_path, 'rb') as f:
         emit_p = marshal.load(f)
     f.closed
 
     return start_p, trans_p, emit_p
 
 if sys.platform.startswith("java"):
-    start_P, trans_P, emit_P = load_model()
+    start_P, trans_P, emit_P = load_model()
 else:
     import prob_start,prob_trans,prob_emit
     start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P
diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py
index b45136e..a048d22 100644
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -15,40 +15,41 @@ PROB_EMIT_P = "prob_emit.p"
 CHAR_STATE_TAB_P = "char_state_tab.p"
 
 def load_model(f_name, isJython=True):
-    _curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
     result = {}
-    with file(f_name, "rb") as f:
-        for line in open(f_name,"rb"):
+    with open(f_name, "rb") as f:
+        for line in f:
             line = line.strip()
-            if line=="":continue
+            if not line:
+                continue
             word, _, tag = line.split(' ')
             result[word.decode('utf-8')] = tag
     f.closed
     if not isJython:
         return result
-    
+
     start_p = {}
     abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, mode='rb') as f:
+    with open(abs_path, mode='rb') as f:
         start_p = marshal.load(f)
     f.closed
-    
+
     trans_p = {}
     abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'rb') as f:
+    with open(abs_path, 'rb') as f:
         trans_p = marshal.load(f)
     f.closed
-    
+
     emit_p = {}
     abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with file(abs_path, 'rb') as f:
+    with open(abs_path, 'rb') as f:
         emit_p = marshal.load(f)
     f.closed
 
     state = {}
     abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
-    with file(abs_path, 'rb') as f:
+    with open(abs_path, 'rb') as f:
         state = marshal.load(f)
     f.closed
@@ -62,14 +63,14 @@ else:
     word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
 
 def makesure_userdict_loaded(fn):
-    
+
     @wraps(fn)
     def wrapped(*args,**kwargs):
-        if len(jieba.user_word_tag_tab)>0:
+        if jieba.user_word_tag_tab:
             word_tag_tab.update(jieba.user_word_tag_tab)
             jieba.user_word_tag_tab = {}
         return fn(*args,**kwargs)
-    
+
     return wrapped
 
 class pair(object):
@@ -152,7 +153,7 @@ def __cut_DAG_NO_HMM(sentence):
 
 def __cut_DAG(sentence):
     DAG = jieba.get_DAG(sentence)
     route = {}
-    
+
     jieba.calc(sentence,DAG,0,route=route)
 
     x = 0
diff --git a/setup.py b/setup.py
index a65e500..db06809 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,11 @@
-from distutils.core import setup
-setup(name='jieba',
-      version='0.33',
-      description='Chinese Words Segementation Utilities',
-      author='Sun, Junyi',
-      author_email='ccnusjy@gmail.com',
-      url='http://github.com/fxsjy',
-      packages=['jieba'],
+from distutils.core import setup
+setup(name='jieba',
+      version='0.33',
+      description='Chinese Words Segementation Utilities',
+      author='Sun, Junyi',
+      author_email='ccnusjy@gmail.com',
+      url='http://github.com/fxsjy',
+      packages=['jieba'],
       package_dir={'jieba':'jieba'},
       package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}
-)
+)
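Note on the load_model() hunks above: the model files are serialized
with marshal, whose streams are raw bytes, so they are opened in binary
mode ('rb'); text mode would corrupt the data on Windows under Python 2
and fail outright on Python 3. file() is also a Python-2-only builtin,
hence the switch to open(). A minimal round-trip sketch (the file name
mirrors PROB_START_P above; the dict entry is illustrative):

    # marshal round-trip: binary mode on both ends.
    import marshal

    with open("prob_start.p", "wb") as f:
        marshal.dump({"B": -0.26}, f)  # illustrative start-probability entry

    with open("prob_start.p", "rb") as f:
        start_p = marshal.load(f)
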
diff --git a/test/test_pos.py b/test/test_pos.py
index 5354a43..3815e72 100644
--- a/test/test_pos.py
+++ b/test/test_pos.py
@@ -6,7 +6,7 @@ import jieba.posseg as pseg
 
 def cuttest(test_sent):
     result = pseg.cut(test_sent)
     for w in result:
-        print w.word, "/", w.flag, ", ",
+        print w.word, "/", w.flag, ", ",
     print ""
@@ -95,4 +95,4 @@ if __name__ == "__main__":
     cuttest('AT&T是一件不错的公司,给你发offer了吗?')
     cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
     cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
-    cuttest('枪杆子中出政权')
\ No newline at end of file
+    cuttest('枪杆子中出政权')
diff --git a/test/test_userdict.py b/test/test_userdict.py
index e5a4727..0a3d1ee 100644
--- a/test/test_userdict.py
+++ b/test/test_userdict.py
@@ -14,7 +14,7 @@ for w in words:
 
 result = pseg.cut(test_sent)
 
 for w in result:
-    print w.word, "/", w.flag, ", ",
+    print w.word, "/", w.flag, ", ",
 
 print "\n========"
diff --git a/test/test_whoosh.py b/test/test_whoosh.py
index 9a7c033..4995139 100644
--- a/test/test_whoosh.py
+++ b/test/test_whoosh.py
@@ -5,7 +5,7 @@
 from whoosh.index import create_in,open_dir
 from whoosh.fields import *
 from whoosh.qparser import QueryParser
 
-from jieba.analyse import ChineseAnalyzer
+from jieba.analyse import ChineseAnalyzer
 
 analyzer = ChineseAnalyzer()
@@ -18,31 +18,31 @@ ix = create_in("tmp", schema) # for create new index
 writer = ix.writer()
 
 writer.add_document(
-    title=u"document1",
+    title=u"document1",
     path=u"/a",
     content=u"This is the first document we’ve added!"
 )
 
 writer.add_document(
-    title=u"document2",
+    title=u"document2",
     path=u"/b",
     content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
 )
 
 writer.add_document(
-    title=u"document3",
+    title=u"document3",
     path=u"/c",
     content=u"买水果然后来世博园。"
 )
 
 writer.add_document(
-    title=u"document4",
+    title=u"document4",
     path=u"/c",
     content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
 )
 
 writer.add_document(
-    title=u"document4",
+    title=u"document4",
     path=u"/c",
     content=u"咱俩交换一下吧。"
 )
@@ -55,7 +55,7 @@ for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交换"):
     print "result of ",keyword
     q = parser.parse(keyword)
     results = searcher.search(q)
-    for hit in results:
+    for hit in results:
         print hit.highlights("content")
     print "="*10
diff --git a/test/test_whoosh_flie.py b/test/test_whoosh_flie.py
index 3610b49..d403213 100644
--- a/test/test_whoosh_flie.py
+++ b/test/test_whoosh_flie.py
@@ -6,7 +6,7 @@
 from whoosh.index import create_in
 from whoosh.fields import *
 from whoosh.qparser import QueryParser
 
-from jieba.analyse import ChineseAnalyzer
+from jieba.analyse import ChineseAnalyzer
 
 analyzer = ChineseAnalyzer()
@@ -23,7 +23,7 @@ with open(file_name,"rb") as inf:
     for line in inf:
         i+=1
         writer.add_document(
-            title=u"line"+str(i),
+            title=u"line"+str(i),
             path=u"/a",
             content=line.decode('gbk','ignore')
         )
@@ -36,6 +36,6 @@ for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换"):
     print "result of ",keyword
     q = parser.parse(keyword)
     results = searcher.search(q)
-    for hit in results:
+    for hit in results:
         print hit.highlights("content")
     print "="*10
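For context, the two whoosh tests above exercise the same flow: jieba's
ChineseAnalyzer tokenizes Chinese text so whoosh can index, search and
highlight it. A condensed sketch of that flow (Python 2 to match the
tests; the Schema fields are assumptions inferred from the context
lines above, not copied from the test files):

    import os
    from whoosh.index import create_in
    from whoosh.fields import Schema, TEXT, ID
    from whoosh.qparser import QueryParser
    from jieba.analyse import ChineseAnalyzer

    analyzer = ChineseAnalyzer()
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    if not os.path.exists("tmp"):
        os.mkdir("tmp")  # create_in() requires an existing directory
    ix = create_in("tmp", schema)
    writer = ix.writer()
    writer.add_document(title=u"document3", path=u"/c",
                        content=u"买水果然后来世博园。")
    writer.commit()

    with ix.searcher() as searcher:
        parser = QueryParser("content", schema=ix.schema)
        for hit in searcher.search(parser.parse(u"水果")):
            print hit.highlights("content")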