fix version; fix spaces at end of line

Dingyuan Wang 2014-10-19 10:57:46 +08:00
parent 51df77831b
commit bb1e6000c6
10 changed files with 53 additions and 50 deletions

View File

@@ -13,10 +13,10 @@
2013-07-01: version 0.31
1. Changed the code indentation style to follow the PEP 8 standard
-2. Added support for the Jython interpreter; thanks @piaolingxue
+2. Added support for the Jython interpreter; thanks @piaolingxue
3. Fixed a bug where mixed Chinese-English words with leading digits could not be recognized
-4. Refactored part of the code; thanks @chao78787
-5. The multi-process parallel segmentation mode now detects the number of CPUs automatically to pick a suitable number of processes; thanks @linkerlin
+4. Refactored part of the code; thanks @chao78787
+5. The multi-process parallel segmentation mode now detects the number of CPUs automatically to pick a suitable number of processes; thanks @linkerlin
6. Fixed the incorrect dependency of the jieba.extra_tags method on the whoosh module in version 0.3
@@ -55,8 +55,8 @@
2013-04-27: version 0.28
========================
1) Added lazy loading of the dictionary; users can change the dictionary path after 'import jieba'. Thanks hermanschaaf
-2) Report the offending entry when an exception occurs while loading the dictionary. Thanks neuront
-3) Fixed a bug where the dictionary would fail to load after being edited with vim. Thanks neuront
+2) Report the offending entry when an exception occurs while loading the dictionary. Thanks neuront
+3) Fixed a bug where the dictionary would fail to load after being edited with vim. Thanks neuront
2013-04-22: version 0.27
========================
@@ -93,7 +93,7 @@
2012-11-28: version 0.22
========================
1) Added the jieba.cut_for_search method, which re-segments "long words" on top of accurate-mode segmentation; it is intended for search-engine tokenization and has higher recall than the accurate mode.
-2) Started supporting Python 3.x. Previously only the Python 2.x series was supported; from this version on there is a separate jieba3k branch.
+2) Started supporting Python 3.x. Previously only the Python 2.x series was supported; from this version on there is a separate jieba3k branch.
2012-11-23: version 0.21
@@ -104,7 +104,7 @@
2012-11-06: version 0.20
========================
-1) Added part-of-speech tagging
+1) Added part-of-speech tagging
2012-10-25: version 0.19

View File

@@ -3,7 +3,7 @@ import sys
import jieba
from argparse import ArgumentParser
-parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", version="Jieba " + jieba.__version__, epilog="If no filename specified, use STDIN instead.")
+parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", epilog="If no filename specified, use STDIN instead.")
parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
nargs='?', const=' ',
help="use DELIM instead of ' / ' for word delimiter; use a space if it is without DELIM")
@@ -14,6 +14,8 @@ parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
default=True, help="don't use the Hidden Markov Model")
parser.add_argument("-q", "--quiet", action="store_true", default=False,
help="don't print loading messages to stderr")
parser.add_argument("-V", '--version', action='version',
version="Jieba " + jieba.__version__)
parser.add_argument("filename", nargs='?', help="input file")
args = parser.parse_args()
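This hunk drops the version= keyword from the ArgumentParser constructor (argparse no longer accepts it) and exposes the version through a dedicated -V/--version argument instead. A minimal standalone sketch of that pattern, using a placeholder program name and version string rather than jieba's:

    from argparse import ArgumentParser

    parser = ArgumentParser(description="Demo command line interface.")
    # action='version' prints the version string and exits immediately.
    parser.add_argument("-V", "--version", action="version",
                        version="demo 0.1")
    parser.add_argument("filename", nargs="?", help="input file")
    args = parser.parse_args()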

View File

@@ -1,6 +1,6 @@
##encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
-from whoosh.analysis import Tokenizer,Token
+from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem
import jieba

View File

@@ -23,26 +23,26 @@ def load_model():
start_p = {}
abs_path = os.path.join(_curpath, PROB_START_P)
-with open(abs_path, mode='rb') as f:
+with open(abs_path, mode='r') as f:
start_p = marshal.load(f)
f.closed
trans_p = {}
abs_path = os.path.join(_curpath, PROB_TRANS_P)
-with open(abs_path, 'rb') as f:
+with open(abs_path, 'r') as f:
trans_p = marshal.load(f)
f.closed
emit_p = {}
abs_path = os.path.join(_curpath, PROB_EMIT_P)
-with file(abs_path, 'rb') as f:
+with open(abs_path, 'r') as f:
emit_p = marshal.load(f)
f.closed
return start_p, trans_p, emit_p
if sys.platform.startswith("java"):
-start_P, trans_P, emit_P = load_model()
+start_P, trans_P, emit_P = load_model()
else:
import prob_start,prob_trans,prob_emit
start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P
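For context: under Jython, finalseg loads its HMM start/transition/emission tables from marshal-serialized files instead of importing the generated prob_* modules. A rough sketch of that loading step with a hypothetical helper name; note that on Python 3 marshal.load needs a binary-mode file object, whereas on Python 2 (this codebase) the plain text mode used above also works:

    import marshal
    import os

    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

    def _load_table(filename):
        # Hypothetical helper: read one marshal-serialized probability table.
        # 'rb' keeps this working on both Python 2 and Python 3.
        with open(os.path.join(_curpath, filename), 'rb') as f:
            return marshal.load(f)

    # e.g. start_P = _load_table("prob_start.p")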

View File

@@ -15,40 +15,41 @@ PROB_EMIT_P = "prob_emit.p"
CHAR_STATE_TAB_P = "char_state_tab.p"
def load_model(f_name, isJython=True):
-_curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
result = {}
-with file(f_name, "rb") as f:
-for line in open(f_name,"rb"):
+with open(f_name, "r") as f:
+for line in f:
line = line.strip()
-if line=="":continue
+if not line:
+continue
word, _, tag = line.split(' ')
result[word.decode('utf-8')] = tag
f.closed
if not isJython:
return result
start_p = {}
abs_path = os.path.join(_curpath, PROB_START_P)
-with open(abs_path, mode='rb') as f:
+with open(abs_path, mode='r') as f:
start_p = marshal.load(f)
f.closed
trans_p = {}
abs_path = os.path.join(_curpath, PROB_TRANS_P)
-with open(abs_path, 'rb') as f:
+with open(abs_path, 'r') as f:
trans_p = marshal.load(f)
f.closed
emit_p = {}
abs_path = os.path.join(_curpath, PROB_EMIT_P)
-with file(abs_path, 'rb') as f:
+with open(abs_path, 'r') as f:
emit_p = marshal.load(f)
f.closed
state = {}
abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
-with file(abs_path, 'rb') as f:
+with open(abs_path, 'r') as f:
state = marshal.load(f)
f.closed
@@ -62,14 +63,14 @@ else:
word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
def makesure_userdict_loaded(fn):
@wraps(fn)
def wrapped(*args,**kwargs):
-if len(jieba.user_word_tag_tab)>0:
+if jieba.user_word_tag_tab:
word_tag_tab.update(jieba.user_word_tag_tab)
jieba.user_word_tag_tab = {}
return fn(*args,**kwargs)
return wrapped
class pair(object):
@@ -152,7 +153,7 @@ def __cut_DAG_NO_HMM(sentence):
def __cut_DAG(sentence):
DAG = jieba.get_DAG(sentence)
route = {}
jieba.calc(sentence,DAG,0,route=route)
x = 0
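The makesure_userdict_loaded decorator shown in the earlier hunk folds any pending user-defined word/tag entries into the POS table before the wrapped function runs; the edit simply replaces the len(...) > 0 test with a plain truthiness check, which is equivalent for a dict. A self-contained sketch of the same decorator pattern, with illustrative names rather than jieba's module attributes:

    from functools import wraps

    user_word_tag_tab = {}   # entries queued up by a user dictionary
    word_tag_tab = {}        # the table the tagger actually consults

    def makesure_userdict_loaded(fn):
        @wraps(fn)  # preserve the wrapped function's name and docstring
        def wrapped(*args, **kwargs):
            global user_word_tag_tab
            if user_word_tag_tab:  # an empty dict is falsy, so len(...) > 0 is redundant
                word_tag_tab.update(user_word_tag_tab)
                user_word_tag_tab = {}
            return fn(*args, **kwargs)
        return wrapped

    @makesure_userdict_loaded
    def cut(sentence):
        return sentence.split()  # stand-in for the real POS-tagging cut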

View File

@@ -1,11 +1,11 @@
-from distutils.core import setup
-setup(name='jieba',
-version='0.33',
-description='Chinese Words Segementation Utilities',
-author='Sun, Junyi',
-author_email='ccnusjy@gmail.com',
-url='http://github.com/fxsjy',
-packages=['jieba'],
+from distutils.core import setup
+setup(name='jieba',
+version='0.33',
+description='Chinese Words Segementation Utilities',
+author='Sun, Junyi',
+author_email='ccnusjy@gmail.com',
+url='http://github.com/fxsjy',
+packages=['jieba'],
package_dir={'jieba':'jieba'},
package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}
-)
+)

View File

@@ -6,7 +6,7 @@ import jieba.posseg as pseg
def cuttest(test_sent):
result = pseg.cut(test_sent)
for w in result:
-print w.word, "/", w.flag, ", ",
+print w.word, "/", w.flag, ", ",
print ""
@@ -95,4 +95,4 @@ if __name__ == "__main__":
cuttest('AT&T是一件不错的公司给你发offer了吗')
cuttest('C++和c#是什么关系11+122=133是吗PI=3.14159')
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
-cuttest('枪杆子中出政权')
+cuttest('枪杆子中出政权')

View File

@@ -14,7 +14,7 @@ for w in words:
result = pseg.cut(test_sent)
for w in result:
-print w.word, "/", w.flag, ", ",
+print w.word, "/", w.flag, ", ",
print "\n========"

View File

@@ -5,7 +5,7 @@ from whoosh.index import create_in,open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser
-from jieba.analyse import ChineseAnalyzer
+from jieba.analyse import ChineseAnalyzer
analyzer = ChineseAnalyzer()
@@ -18,31 +18,31 @@ ix = create_in("tmp", schema) # for create new index
writer = ix.writer()
writer.add_document(
title=u"document1",
title=u"document1",
path=u"/a",
content=u"This is the first document weve added!"
)
writer.add_document(
title=u"document2",
title=u"document2",
path=u"/b",
content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
)
writer.add_document(
title=u"document3",
title=u"document3",
path=u"/c",
content=u"买水果然后来世博园。"
)
writer.add_document(
title=u"document4",
title=u"document4",
path=u"/c",
content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
)
writer.add_document(
title=u"document4",
title=u"document4",
path=u"/c",
content=u"咱俩交换一下吧。"
)
@@ -55,7 +55,7 @@ for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交
print "result of ",keyword
q = parser.parse(keyword)
results = searcher.search(q)
-for hit in results:
+for hit in results:
print hit.highlights("content")
print "="*10

View File

@@ -6,7 +6,7 @@ from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser
-from jieba.analyse import ChineseAnalyzer
+from jieba.analyse import ChineseAnalyzer
analyzer = ChineseAnalyzer()
@@ -23,7 +23,7 @@ with open(file_name,"rb") as inf:
for line in inf:
i+=1
writer.add_document(
title=u"line"+str(i),
title=u"line"+str(i),
path=u"/a",
content=line.decode('gbk','ignore')
)
@@ -36,6 +36,6 @@ for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换"
print "result of ",keyword
q = parser.parse(keyword)
results = searcher.search(q)
-for hit in results:
+for hit in results:
print hit.highlights("content")
print "="*10