mirror of https://github.com/fxsjy/jieba.git

fix version; fix spaces at end of line

parent 51df77831b
commit bb1e6000c6

Note: in the hunks below, lines marked with - / + were changed and unmarked lines are context. Hunks with no - / + markers showed no visible difference between their old and new sides; per the commit message, those changes only strip trailing whitespace.

Changelog: 14 lines changed
@@ -13,10 +13,10 @@

2013-07-01: version 0.31
1. Reformatted the code indentation to follow the PEP 8 standard
2. Added support for the Jython interpreter; thanks @piaolingxue
3. Fixed a bug where mixed Chinese-English words starting with a number were not recognized
4. Refactored parts of the code; thanks @chao78787
5. Parallel segmentation mode now auto-detects the number of CPUs to set a suitable process count; thanks @linkerlin
6. Fixed the erroneous dependency of the jieba.extra_tags method on the whoosh module, introduced in version 0.3


@@ -55,8 +55,8 @@
2013-04-27: version 0.28
========================
1) Added lazy loading of the dictionary; the dictionary path can now be changed after 'import jieba'. Thanks hermanschaaf
2) Report the offending entry when the dictionary fails to load. Thanks neuront
3) Fixed a bug where a dictionary edited with vim would fail to load. Thanks neuront

2013-04-22: version 0.27
========================

@@ -93,7 +93,7 @@
2012-11-28: version 0.22
========================
1) Added the jieba.cut_for_search method, which re-splits "long words" on top of accurate-mode segmentation; intended for search-engine indexing, it has higher recall than accurate mode.
2) Started supporting Python 3.x; previously only the Python 2.x series was supported. From this version on there is a separate jieba3k.


2012-11-23: version 0.21

@@ -104,7 +104,7 @@

2012-11-06: version 0.20
========================
1) Added part-of-speech tagging


2012-10-25: version 0.19
@@ -3,7 +3,7 @@ import sys
import jieba
from argparse import ArgumentParser

-parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", version="Jieba " + jieba.__version__, epilog="If no filename specified, use STDIN instead.")
+parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", epilog="If no filename specified, use STDIN instead.")
parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
                    nargs='?', const=' ',
                    help="use DELIM instead of ' / ' for word delimiter; use a space if it is without DELIM")

@@ -14,6 +14,8 @@ parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
                    default=True, help="don't use the Hidden Markov Model")
parser.add_argument("-q", "--quiet", action="store_true", default=False,
                    help="don't print loading messages to stderr")
+parser.add_argument("-V", '--version', action='version',
+                    version="Jieba " + jieba.__version__)
parser.add_argument("filename", nargs='?', help="input file")

args = parser.parse_args()
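These two hunks are the "fix version" part of the commit: the version string moves out of the ArgumentParser constructor and onto an explicit -V/--version option. Python 3's argparse no longer accepts a version= keyword on ArgumentParser (it raises TypeError); the supported pattern is an option with action='version', which prints the string and exits. A minimal standalone sketch of that pattern, with a made-up program name and version:

from argparse import ArgumentParser

parser = ArgumentParser(description="demo tool")  # hypothetical program
# action='version' prints the version string and exits immediately;
# passing version= to ArgumentParser() itself fails on Python 3.
parser.add_argument("-V", "--version", action="version", version="demo 1.0")
args = parser.parse_args()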
@@ -1,6 +1,6 @@
##encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem

import jieba
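This hunk only trims whitespace around the analyzer's imports. For context, the analyzer defined in this module is used by handing it to a whoosh TEXT field, the same pattern the test scripts later in this commit rely on. A minimal sketch, with example field names:

from whoosh.fields import Schema, TEXT, ID
from jieba.analyse import ChineseAnalyzer

# ChineseAnalyzer tokenizes with jieba so whoosh can index Chinese text.
analyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))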
@@ -23,26 +23,26 @@ def load_model():

    start_p = {}
    abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, mode='rb') as f:
+    with open(abs_path, mode='r') as f:
        start_p = marshal.load(f)
    f.closed

    trans_p = {}
    abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
        trans_p = marshal.load(f)
    f.closed

    emit_p = {}
    abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with file(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
        emit_p = marshal.load(f)
    f.closed

    return start_p, trans_p, emit_p

if sys.platform.startswith("java"):
    start_P, trans_P, emit_P = load_model()
else:
    import prob_start,prob_trans,prob_emit
    start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P
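Every change in this hunk (and the matching ones in the Jython posseg loader below) is the same edit: file(...) and 'rb' become open(...) and 'r'. The file() builtin no longer exists on Python 3, and on Python 2 under Unix text and binary mode behave identically, which is what the switch to 'r' relies on; marshal itself just (de)serializes one dict per .p file. (The stray f.closed lines are no-op attribute reads, not calls.) A self-contained round-trip sketch of the mechanism, with a made-up table:

import marshal

# Each .p model file is a single dict serialized with marshal.
table = {'B': -0.26, 'S': -1.0}  # sample values, not the real HMM start probabilities
with open('prob_start.p', 'wb') as f:
    marshal.dump(table, f)
with open('prob_start.p', 'rb') as f:  # binary mode is what marshal requires on Python 3
    loaded = marshal.load(f)
assert loaded == table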
@@ -15,40 +15,41 @@ PROB_EMIT_P = "prob_emit.p"
CHAR_STATE_TAB_P = "char_state_tab.p"

def load_model(f_name, isJython=True):
-    _curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

    result = {}
-    with file(f_name, "rb") as f:
-        for line in open(f_name,"rb"):
+    with open(f_name, "r") as f:
+        for line in f:
            line = line.strip()
-            if line=="":continue
+            if not line:
+                continue
            word, _, tag = line.split(' ')
            result[word.decode('utf-8')] = tag
    f.closed
    if not isJython:
        return result

    start_p = {}
    abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, mode='rb') as f:
+    with open(abs_path, mode='r') as f:
        start_p = marshal.load(f)
    f.closed

    trans_p = {}
    abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
        trans_p = marshal.load(f)
    f.closed

    emit_p = {}
    abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with file(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
        emit_p = marshal.load(f)
    f.closed

    state = {}
    abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
-    with file(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
        state = marshal.load(f)
    f.closed

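Besides the open() changes, this hunk fixes a real bug in the dictionary parse: the old code opened f_name twice, once for the with block and again, unmanaged, in the for loop, while the new code iterates the handle the with block owns. Each dictionary line is "word frequency pos_tag", and only the word-to-tag mapping is kept. A tiny self-contained sketch of that parse (sample entries made up; the real code reads bytes and decodes each word from UTF-8):

import io

# Stand-in for the dictionary file: one "word frequency pos_tag" entry per line.
sample = io.StringIO(u"永和 10 ns\n\n九寨沟 8 ns\n")

result = {}
for line in sample:       # iterate the single open handle
    line = line.strip()
    if not line:          # skip blank lines, as the new code does
        continue
    word, _, tag = line.split(' ')
    result[word] = tag

print(result)             # {u'永和': u'ns', u'九寨沟': u'ns'}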
@@ -62,14 +63,14 @@ else:
    word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)

def makesure_userdict_loaded(fn):

    @wraps(fn)
    def wrapped(*args,**kwargs):
-        if len(jieba.user_word_tag_tab)>0:
+        if jieba.user_word_tag_tab:
            word_tag_tab.update(jieba.user_word_tag_tab)
            jieba.user_word_tag_tab = {}
        return fn(*args,**kwargs)

    return wrapped

class pair(object):
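The one behavioral line here replaces len(d) > 0 with plain truthiness before merging any pending user-dictionary tags; for a dict the two tests are equivalent, and the bare form is the idiomatic spelling. A two-assert check:

d = {}
assert bool(d) == (len(d) > 0)   # both False while the dict is empty
d['word'] = 'tag'
assert bool(d) == (len(d) > 0)   # both True once it has entries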
@@ -152,7 +153,7 @@ def __cut_DAG_NO_HMM(sentence):
def __cut_DAG(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}

    jieba.calc(sentence,DAG,0,route=route)

    x = 0
setup.py: 18 lines changed

@@ -1,11 +1,11 @@
from distutils.core import setup
setup(name='jieba',
      version='0.33',
      description='Chinese Words Segementation Utilities',
      author='Sun, Junyi',
      author_email='ccnusjy@gmail.com',
      url='http://github.com/fxsjy',
      packages=['jieba'],
      package_dir={'jieba':'jieba'},
      package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}
)
@@ -6,7 +6,7 @@ import jieba.posseg as pseg
def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for w in result:
        print w.word, "/", w.flag, ", ",
    print ""


@@ -95,4 +95,4 @@ if __name__ == "__main__":
    cuttest('AT&T是一件不错的公司,给你发offer了吗?')
    cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
    cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
    cuttest('枪杆子中出政权')
@@ -14,7 +14,7 @@ for w in words:
result = pseg.cut(test_sent)

for w in result:
    print w.word, "/", w.flag, ", ",

print "\n========"

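Both test hunks drive jieba.posseg: pseg.cut() yields pair objects (the class shown earlier in this commit) whose .word and .flag attributes hold the token and its part-of-speech tag, and the scripts print them with Python 2 print statements. A hypothetical Python 3 rendering of the same loop, with a sample sentence:

import jieba.posseg as pseg

# Each item from pseg.cut() is a pair with .word (token) and .flag (POS tag).
for w in pseg.cut(u"我爱北京天安门"):
    print(w.word, "/", w.flag, ", ", end="")
print("")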
@@ -5,7 +5,7 @@ from whoosh.index import create_in,open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser

from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()

@@ -18,31 +18,31 @@ ix = create_in("tmp", schema) # for create new index
writer = ix.writer()

writer.add_document(
    title=u"document1",
    path=u"/a",
    content=u"This is the first document we’ve added!"
)

writer.add_document(
    title=u"document2",
    path=u"/b",
    content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
)

writer.add_document(
    title=u"document3",
    path=u"/c",
    content=u"买水果然后来世博园。"
)

writer.add_document(
    title=u"document4",
    path=u"/c",
    content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
)

writer.add_document(
    title=u"document4",
    path=u"/c",
    content=u"咱俩交换一下吧。"
)

@@ -55,7 +55,7 @@ for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交
    print "result of ",keyword
    q = parser.parse(keyword)
    results = searcher.search(q)
    for hit in results:
        print hit.highlights("content")
    print "="*10
@@ -6,7 +6,7 @@ from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser

from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()

@@ -23,7 +23,7 @@ with open(file_name,"rb") as inf:
    for line in inf:
        i+=1
        writer.add_document(
            title=u"line"+str(i),
            path=u"/a",
            content=line.decode('gbk','ignore')
        )

@@ -36,6 +36,6 @@ for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换"
    print "result of ",keyword
    q = parser.parse(keyword)
    results = searcher.search(q)
    for hit in results:
        print hit.highlights("content")
    print "="*10