Mirror of https://github.com/fxsjy/jieba.git, synced 2025-07-10 00:01:33 +08:00
fix version; fix spaces at end of line
commit bb1e6000c6 (parent 51df77831b)
Changelog (14 changed lines)

@@ -13,10 +13,10 @@
 2013-07-01: version 0.31
 1. Reformatted code indentation to follow the PEP8 standard
-2. Added support for the Jython interpreter; thanks @piaolingxue
+2. Added support for the Jython interpreter; thanks @piaolingxue
 3. Fixed a bug where mixed Chinese-English words with a leading number were not recognized
-4. Partial code refactoring; thanks @chao78787
-5. Parallel segmentation mode now detects the number of CPUs automatically to pick a suitable process count; thanks @linkerlin
+4. Partial code refactoring; thanks @chao78787
+5. Parallel segmentation mode now detects the number of CPUs automatically to pick a suitable process count; thanks @linkerlin
 6. Fixed the incorrect dependency of jieba.extra_tags on the whoosh module in version 0.3

@@ -55,8 +55,8 @@
 2013-04-27: version 0.28
 ========================
 1) Added lazy loading for the dictionary; the dictionary path can now be changed after 'import jieba'. Thanks hermanschaaf
-2) Show the offending entry when the dictionary fails to load. Thanks neuront
-3) Fixed a bug where a dictionary edited with vim would fail to load. Thanks neuront
+2) Show the offending entry when the dictionary fails to load. Thanks neuront
+3) Fixed a bug where a dictionary edited with vim would fail to load. Thanks neuront

 2013-04-22: version 0.27
 ========================

@@ -93,7 +93,7 @@
 2012-11-28: version 0.22
 ========================
 1) Added the jieba.cut_for_search method, which re-splits "long words" on top of accurate-mode segmentation; intended for search-engine indexing, it gives higher recall than accurate mode.
-2) Started supporting Python 3.x; previously only the Python 2.x series was supported, and from this version on there is a separate jieba3k
+2) Started supporting Python 3.x; previously only the Python 2.x series was supported, and from this version on there is a separate jieba3k


 2012-11-23: version 0.21

@@ -104,7 +104,7 @@

 2012-11-06: version 0.20
 ========================
-1) Added part-of-speech tagging
+1) Added part-of-speech tagging


 2012-10-25: version 0.19
jieba/__main__.py

@@ -3,7 +3,7 @@ import sys
 import jieba
 from argparse import ArgumentParser

-parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", version="Jieba " + jieba.__version__, epilog="If no filename specified, use STDIN instead.")
+parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", epilog="If no filename specified, use STDIN instead.")
 parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
                     nargs='?', const=' ',
                     help="use DELIM instead of ' / ' for word delimiter; use a space if it is without DELIM")

@@ -14,6 +14,8 @@ parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
                     default=True, help="don't use the Hidden Markov Model")
 parser.add_argument("-q", "--quiet", action="store_true", default=False,
                     help="don't print loading messages to stderr")
+parser.add_argument("-V", '--version', action='version',
+                    version="Jieba " + jieba.__version__)
 parser.add_argument("filename", nargs='?', help="input file")

 args = parser.parse_args()
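The argparse change above is the "fix version" half of this commit: Python 2.7 deprecated the version= argument to ArgumentParser and Python 3 removed it, which is presumably why the version string moves to an explicit -V/--version flag with action='version'. A minimal standalone sketch of the portable spelling, with a hypothetical program name "demo":

    # Sketch: portable --version handling in argparse (hypothetical prog "demo").
    from argparse import ArgumentParser

    parser = ArgumentParser(prog="demo")
    # ArgumentParser(version=...) raises TypeError on Python 3; use a flag instead.
    parser.add_argument("-V", "--version", action="version", version="demo 1.0")
    args = parser.parse_args()  # "demo -V" prints "demo 1.0" and exits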
jieba/analyse/analyzer.py

@@ -1,6 +1,6 @@
 ##encoding=utf-8
 from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
-from whoosh.analysis import Tokenizer,Token
+from whoosh.analysis import Tokenizer,Token
 from whoosh.lang.porter import stem

 import jieba
jieba/finalseg/__init__.py

@@ -23,26 +23,26 @@ def load_model():

     start_p = {}
     abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, mode='rb') as f:
+    with open(abs_path, mode='r') as f:
         start_p = marshal.load(f)
     f.closed

     trans_p = {}
     abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
         trans_p = marshal.load(f)
     f.closed

     emit_p = {}
     abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with file(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
         emit_p = marshal.load(f)
     f.closed

     return start_p, trans_p, emit_p

 if sys.platform.startswith("java"):
-    start_P, trans_P, emit_P = load_model()
+    start_P, trans_P, emit_P = load_model()
 else:
     import prob_start,prob_trans,prob_emit
     start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P
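For context on the mode changes above: these .p tables are read back with marshal.load. On CPython 3 marshal requires a binary-mode file object, while on Python 2 and Jython (the branch this load_model serves) text mode chiefly differs on platforms that translate line endings. A small round-trip sketch, assuming a hypothetical file name "model.p":

    # Sketch: marshal round trip (hypothetical file "model.p").
    # CPython 3 requires binary mode ('wb'/'rb') for marshal.dump/load.
    import marshal

    table = {u"word": 0.5}
    with open("model.p", "wb") as f:
        marshal.dump(table, f)
    with open("model.p", "rb") as f:
        assert marshal.load(f) == table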
jieba/posseg/__init__.py

@@ -15,40 +15,41 @@ PROB_EMIT_P = "prob_emit.p"
 CHAR_STATE_TAB_P = "char_state_tab.p"

 def load_model(f_name, isJython=True):
-    _curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

     result = {}
-    with file(f_name, "rb") as f:
-        for line in open(f_name,"rb"):
+    with open(f_name, "r") as f:
+        for line in f:
             line = line.strip()
-            if line=="":continue
+            if not line:
+                continue
             word, _, tag = line.split(' ')
             result[word.decode('utf-8')] = tag
     f.closed
     if not isJython:
         return result

     start_p = {}
     abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, mode='rb') as f:
+    with open(abs_path, mode='r') as f:
         start_p = marshal.load(f)
     f.closed

     trans_p = {}
     abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
         trans_p = marshal.load(f)
     f.closed

     emit_p = {}
     abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with file(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
         emit_p = marshal.load(f)
     f.closed

     state = {}
     abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
-    with file(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
         state = marshal.load(f)
     f.closed
@@ -62,14 +63,14 @@ else:
     word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)

 def makesure_userdict_loaded(fn):

     @wraps(fn)
     def wrapped(*args,**kwargs):
-        if len(jieba.user_word_tag_tab)>0:
+        if jieba.user_word_tag_tab:
             word_tag_tab.update(jieba.user_word_tag_tab)
             jieba.user_word_tag_tab = {}
         return fn(*args,**kwargs)

     return wrapped

 class pair(object):
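makesure_userdict_loaded is a small lazy-merge decorator: before each decorated call it folds any pending user-dictionary tags into the main table, and functools.wraps keeps the wrapped function's name and docstring. A standalone sketch of the same pattern, using hypothetical stand-in names for jieba's module state:

    # Sketch: lazy-merge decorator; "pending" and "table" are stand-ins
    # for jieba.user_word_tag_tab and word_tag_tab.
    from functools import wraps

    pending = {"hello": "n"}
    table = {}

    def ensure_loaded(fn):
        @wraps(fn)  # preserve fn.__name__ / fn.__doc__ on the wrapper
        def wrapped(*args, **kwargs):
            if pending:  # truthiness check, as in the change above
                table.update(pending)
                pending.clear()
            return fn(*args, **kwargs)
        return wrapped

    @ensure_loaded
    def tag(word):
        return table.get(word, "x")

    print(tag("hello"))  # -> n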
@@ -152,7 +153,7 @@ def __cut_DAG_NO_HMM(sentence):
 def __cut_DAG(sentence):
     DAG = jieba.get_DAG(sentence)
     route = {}

     jieba.calc(sentence,DAG,0,route=route)

     x = 0
setup.py (18 changed lines)

@@ -1,11 +1,11 @@
-from distutils.core import setup
-setup(name='jieba',
-      version='0.33',
-      description='Chinese Words Segementation Utilities',
-      author='Sun, Junyi',
-      author_email='ccnusjy@gmail.com',
-      url='http://github.com/fxsjy',
-      packages=['jieba'],
+from distutils.core import setup
+setup(name='jieba',
+      version='0.33',
+      description='Chinese Words Segementation Utilities',
+      author='Sun, Junyi',
+      author_email='ccnusjy@gmail.com',
+      url='http://github.com/fxsjy',
+      packages=['jieba'],
       package_dir={'jieba':'jieba'},
       package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}
-)
+)
test/test_pos.py

@@ -6,7 +6,7 @@ import jieba.posseg as pseg
 def cuttest(test_sent):
     result = pseg.cut(test_sent)
     for w in result:
-        print w.word, "/", w.flag, ", ",
+        print w.word, "/", w.flag, ", ",
     print ""


@@ -95,4 +95,4 @@ if __name__ == "__main__":
     cuttest('AT&T是一件不错的公司,给你发offer了吗?')
     cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
     cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
-    cuttest('枪杆子中出政权')
+    cuttest('枪杆子中出政权')
test/test_userdict.py

@@ -14,7 +14,7 @@ for w in words:
 result = pseg.cut(test_sent)

 for w in result:
-    print w.word, "/", w.flag, ", ",
+    print w.word, "/", w.flag, ", ",

 print "\n========"
test/test_whoosh.py

@@ -5,7 +5,7 @@ from whoosh.index import create_in,open_dir
 from whoosh.fields import *
 from whoosh.qparser import QueryParser

-from jieba.analyse import ChineseAnalyzer
+from jieba.analyse import ChineseAnalyzer

 analyzer = ChineseAnalyzer()

@@ -18,31 +18,31 @@ ix = create_in("tmp", schema) # for create new index
 writer = ix.writer()

 writer.add_document(
-    title=u"document1",
+    title=u"document1",
     path=u"/a",
     content=u"This is the first document we’ve added!"
 )

 writer.add_document(
-    title=u"document2",
+    title=u"document2",
     path=u"/b",
     content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
 )

 writer.add_document(
-    title=u"document3",
+    title=u"document3",
     path=u"/c",
     content=u"买水果然后来世博园。"
 )

 writer.add_document(
-    title=u"document4",
+    title=u"document4",
     path=u"/c",
     content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
 )

 writer.add_document(
-    title=u"document4",
+    title=u"document4",
     path=u"/c",
     content=u"咱俩交换一下吧。"
 )

@@ -55,7 +55,7 @@ for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交
     print "result of ",keyword
     q = parser.parse(keyword)
     results = searcher.search(q)
-    for hit in results:
+    for hit in results:
         print hit.highlights("content")
     print "="*10
test/test_whoosh_file.py

@@ -6,7 +6,7 @@ from whoosh.index import create_in
 from whoosh.fields import *
 from whoosh.qparser import QueryParser

-from jieba.analyse import ChineseAnalyzer
+from jieba.analyse import ChineseAnalyzer

 analyzer = ChineseAnalyzer()

@@ -23,7 +23,7 @@ with open(file_name,"rb") as inf:
     for line in inf:
         i+=1
         writer.add_document(
-            title=u"line"+str(i),
+            title=u"line"+str(i),
             path=u"/a",
             content=line.decode('gbk','ignore')
         )

@@ -36,6 +36,6 @@ for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换"
     print "result of ",keyword
     q = parser.parse(keyword)
     results = searcher.search(q)
-    for hit in results:
+    for hit in results:
         print hit.highlights("content")
     print "="*10