Merge branch 'master' into jieba3k

Conflicts:
	Changelog
	jieba/__init__.py
	jieba/finalseg/__init__.py
	jieba/posseg/__init__.py
	setup.py
	test/parallel/test_file.py
	test/test_file.py
This commit is contained in:
ZoeyYoung 2013-08-21 13:55:21 +08:00
commit 2857ae45cc
17 changed files with 144 additions and 26 deletions

4
.gitignore vendored
View File

@ -164,3 +164,7 @@ pip-log.txt
*.log *.log
test/tmp/* test/tmp/*
#jython
*.class
MANIFEST

View File

@ -1,3 +1,20 @@
2013-07-01: version 0.31
1. 修改了代码缩进格式，遵循PEP8标准
2. 支持Jython解析器，感谢 @piaolingxue
3. 修复中英混合词汇不能识别数字在前词语的Bug
4. 部分代码重构，感谢 @chao78787
5. 多进程并行分词模式下自动检测CPU个数设置合适的进程数，感谢 @linkerlin
6. 修复了0.3版中jieba.extra_tags方法对whoosh模块的错误依赖
2013-07-01: version 0.30
==========================
1) 新增jieba.tokenize方法返回每个词的起始位置
2) 新增ChineseAnalyzer，用于支持whoosh搜索引擎
3) 添加了更多的中英混合词汇
4) 修改了一些py文件的加载方法，从而支持py2exe，cxfree打包为exe
2013-06-17: version 0.29.1 2013-06-17: version 0.29.1
========================== ==========================
1) 优化了viterbi算法的代码分词速度提升15% 1) 优化了viterbi算法的代码分词速度提升15%

20
LICENSE Normal file
View File

@ -0,0 +1,20 @@
The MIT License (MIT)
Copyright (c) 2013 Sun Junyi
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

2
MANIFEST.in Normal file
View File

@ -0,0 +1,2 @@
graft README.md
graft Changelog

View File

@ -1,4 +1,7 @@
from __future__ import with_statement from __future__ import with_statement
__version__ = '0.31'
__license__ = 'MIT'
import re import re
import math import math
@ -299,13 +302,17 @@ def __lcut_all(sentence):
def __lcut_for_search(sentence): def __lcut_for_search(sentence):
return list(__ref_cut_for_search(sentence)) return list(__ref_cut_for_search(sentence))
@require_initialized @require_initialized
def enable_parallel(processnum): def enable_parallel(processnum=None):
global pool,cut,cut_for_search global pool,cut,cut_for_search
if os.name=='nt': if os.name=='nt':
raise Exception("parallel mode only supports posix system") raise Exception("parallel mode only supports posix system")
if sys.version_info[0]==2 and sys.version_info[1]<6:
from multiprocessing import Pool raise Exception("jieba: the parallel feature needs Python version>2.5 ")
from multiprocessing import Pool,cpu_count
if processnum==None:
processnum = cpu_count()
pool = Pool(processnum) pool = Pool(processnum)
def pcut(sentence,cut_all=False): def pcut(sentence,cut_all=False):

View File

@ -1,12 +1,15 @@
import re import re
import os import os
from math import log import marshal
from . import prob_start import sys
from . import prob_trans
from . import prob_emit
MIN_FLOAT=-3.14e100 MIN_FLOAT=-3.14e100
PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p"
PROB_EMIT_P = "prob_emit.p"
PrevStatus = { PrevStatus = {
'B':('E','S'), 'B':('E','S'),
'M':('M','B'), 'M':('M','B'),
@ -14,6 +17,35 @@ PrevStatus = {
'E':('B','M') 'E':('B','M')
} }
def load_model():
    """Load the three probability tables from marshalled files.

    The files named by PROB_START_P, PROB_TRANS_P and PROB_EMIT_P are read
    from the directory that contains this module.

    Returns:
        A ``(start_p, trans_p, emit_p)`` tuple holding the objects stored in
        the three files, in that order.

    Raises:
        IOError / OSError: if one of the files cannot be opened.
        EOFError, ValueError, TypeError: if ``marshal`` cannot decode a file.
    """
    _curpath = os.path.normpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))

    def _load(file_name):
        # ``open`` rather than ``file``: the ``file`` builtin exists only in
        # Python 2, so calling it raises NameError on Python 3.
        # NOTE(review): the marshal format is interpreter-version specific;
        # confirm the .p files were written by a compatible interpreter.
        with open(os.path.join(_curpath, file_name), 'rb') as f:
            return marshal.load(f)

    # Tuple elements are evaluated left to right, so the files are still read
    # in start / trans / emit order.
    return _load(PROB_START_P), _load(PROB_TRANS_P), _load(PROB_EMIT_P)
# On Jython (sys.platform starts with "java") the probability tables are read
# from the marshalled ".p" files; on every other interpreter they are taken
# from the sibling modules of this package.
if sys.platform.startswith("java"):
    start_P, trans_P, emit_P = load_model()
else:
    # Explicit relative import: a bare ``import prob_start`` is an implicit
    # relative import, which Python 3 does not resolve inside a package.
    from . import prob_start, prob_trans, prob_emit
    start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P
def viterbi(obs, states, start_p, trans_p, emit_p): def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular V = [{}] #tabular
path = {} path = {}
@ -36,7 +68,8 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
def __cut(sentence): def __cut(sentence):
prob, pos_list = viterbi(sentence,('B','M','E','S'), prob_start.P, prob_trans.P, prob_emit.P) global emit_P
prob, pos_list = viterbi(sentence,('B','M','E','S'), start_P, trans_P, emit_P)
begin, next = 0,0 begin, next = 0,0
#print pos_list, sentence #print pos_list, sentence
for i,char in enumerate(sentence): for i,char in enumerate(sentence):

BIN
jieba/finalseg/prob_emit.p Normal file

Binary file not shown.

BIN
jieba/finalseg/prob_start.p Normal file

Binary file not shown.

BIN
jieba/finalseg/prob_trans.p Normal file

Binary file not shown.

View File

@ -3,29 +3,62 @@ import os
from . import viterbi from . import viterbi
import jieba import jieba
import sys import sys
from . import prob_start import marshal
from . import prob_trans
from . import prob_emit
from . import char_state_tab
default_encoding = sys.getfilesystemencoding() default_encoding = sys.getfilesystemencoding()
def load_model(f_name): PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p"
PROB_EMIT_P = "prob_emit.p"
CHAR_STATE_TAB_P = "char_state_tab.p"
def load_model(f_name, isJython=True):
    """Load the word-to-tag dictionary and, optionally, the marshalled tables.

    Args:
        f_name: Path of the dictionary file. It is read as UTF-8; every
            non-blank line must consist of exactly three fields separated by
            single spaces, of which the first (word) and third (tag) are kept.
        isJython: When false, only the dictionary is loaded. When true, the
            tables stored in CHAR_STATE_TAB_P, PROB_START_P, PROB_TRANS_P and
            PROB_EMIT_P (located next to this module) are loaded as well.

    Returns:
        The ``{word: tag}`` dict when ``isJython`` is false, otherwise the
        tuple ``(state, start_p, trans_p, emit_p, result)`` where ``result``
        is that same dict.

    Raises:
        IOError / OSError: if a file cannot be opened.
        ValueError: if a dictionary line does not have exactly three fields.
        EOFError, ValueError, TypeError: if ``marshal`` cannot decode a file.
    """
    _curpath = os.path.normpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))

    result = {}
    # Open the dictionary once and iterate that handle. ``open`` replaces the
    # Python-2-only ``file`` builtin.
    with open(f_name, "rb") as f:
        for line in f:
            line = line.strip()
            # Truthiness works for both bytes and str; comparing bytes to ""
            # is never true on Python 3, so blank lines were not skipped.
            if not line:
                continue
            word, _, tag = line.decode("utf-8").split(" ")
            result[word] = tag
    if not isJython:
        return result

    def _load(file_name):
        # NOTE(review): the marshal format is interpreter-version specific;
        # confirm the .p files were written by a compatible interpreter.
        with open(os.path.join(_curpath, file_name), 'rb') as fh:
            return marshal.load(fh)

    # Same read order as before: start, trans, emit, then the state table.
    start_p = _load(PROB_START_P)
    trans_p = _load(PROB_TRANS_P)
    emit_p = _load(PROB_EMIT_P)
    state = _load(CHAR_STATE_TAB_P)

    return state, start_p, trans_p, emit_p, result
# On Jython (sys.platform starts with "java") every table, including the
# word-to-tag dictionary, comes from load_model(); on other interpreters the
# probability tables are taken from the sibling modules of this package and
# load_model() supplies only the dictionary.
if sys.platform.startswith("java"):
    char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
else:
    # Explicit relative import: a bare ``import char_state_tab`` is an
    # implicit relative import, which Python 3 does not resolve in a package.
    from . import char_state_tab, prob_start, prob_trans, prob_emit
    char_state_tab_P, start_P, trans_P, emit_P = char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P
    word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
if jieba.user_word_tag_tab: if jieba.user_word_tag_tab:
word_tag_tab.update(jieba.user_word_tag_tab) word_tag_tab.update(jieba.user_word_tag_tab)
@ -48,7 +81,7 @@ class pair(object):
return self.__unicode__().encode(arg) return self.__unicode__().encode(arg)
def __cut(sentence): def __cut(sentence):
prob, pos_list = viterbi.viterbi(sentence,char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P) prob, pos_list = viterbi.viterbi(sentence,char_state_tab_P, start_P, trans_P, emit_P)
begin, next = 0,0 begin, next = 0,0
for i,char in enumerate(sentence): for i,char in enumerate(sentence):

Binary file not shown.

BIN
jieba/posseg/prob_emit.p Normal file

Binary file not shown.

BIN
jieba/posseg/prob_start.p Normal file

Binary file not shown.

BIN
jieba/posseg/prob_trans.p Normal file

Binary file not shown.

View File

@ -1,6 +1,6 @@
from distutils.core import setup from distutils.core import setup
setup(name='jieba', setup(name='jieba',
version='0.29.1', version='0.31',
description='Chinese Words Segementation Utilities', description='Chinese Words Segementation Utilities',
author='Sun, Junyi', author='Sun, Junyi',
author_email='ccnusjy@gmail.com', author_email='ccnusjy@gmail.com',

View File

@ -2,18 +2,18 @@ import sys,time
import sys import sys
sys.path.append("../../") sys.path.append("../../")
import jieba import jieba
jieba.enable_parallel(4)
jieba.enable_parallel()
url = sys.argv[1] url = sys.argv[1]
content = open(url,"rb").read() content = open(url,"rb").read()
t1 = time.time() t1 = time.time()
words = list(jieba.cut(content)) words = "/ ".join(jieba.cut(content))
t2 = time.time() t2 = time.time()
tm_cost = t2-t1 tm_cost = t2-t1
log_f = open("1.log","wb") log_f = open("1.log","wb")
for w in words: log_f.write(words.encode('utf-8'))
log_f.write(w.encode("utf-8"))
print('speed' , len(content)/tm_cost, " bytes/second") print('speed' , len(content)/tm_cost, " bytes/second")

View File

@ -7,12 +7,14 @@ jieba.initialize()
url = sys.argv[1] url = sys.argv[1]
content = open(url,"rb").read() content = open(url,"rb").read()
t1 = time.time() t1 = time.time()
words = list(jieba.cut(content)) words = "/ ".join(jieba.cut(content))
t2 = time.time() t2 = time.time()
tm_cost = t2-t1 tm_cost = t2-t1
log_f = open("1.log","wb") log_f = open("1.log","wb")
log_f.write(words.encode('utf-8'))
log_f.write(bytes("/ ".join(words),'utf-8')) log_f.write(bytes("/ ".join(words),'utf-8'))