Merge branch 'master' into jieba3k

Conflicts: Changelog jieba/__init__.py jieba/finalseg/__init__.py jieba/posseg/__init__.py setup.py test/parallel/test_file.py test/test_file.py
2025-07-10 00:01:33 +08:00 · 2013-08-21 13:55:21 +08:00 · 2013-08-21 13:55:21 +08:00 · 2857ae45cc
commit 2857ae45cc
parent 6549deabbd 3667a4ab01
17 changed files with 144 additions and 26 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -164,3 +164,7 @@ pip-log.txt
 *.log
 test/tmp/*
 #jython
 *.class
 MANIFEST
--- a/Changelog
+++ b/Changelog
@@ -1,3 +1,20 @@
 2013-07-01: version 0.31
 1. 修改了代码缩进格式，遵循PEP8标准
 2. 支持Jython解析器，感谢 @piaolingxue 
 3. 修复中英混合词汇不能识别数字在前词语的Bug
 4. 部分代码重构，感谢 @chao78787 
 5. 多进程并行分词模式下自动检测CPU个数设置合适的进程数，感谢@linkerlin 
 6. 修复了0.3版中jieba.extra_tags方法对whoosh模块的错误依赖
 2013-07-01: version 0.30
 ==========================
 1) 新增jieba.tokenize方法，返回每个词的起始位置
 2) 新增ChineseAnalyzer，用于支持whoosh搜索引擎
 3）添加了更多的中英混合词汇
 4）修改了一些py文件的加载方法，从而支持py2exe,cxfree打包为exe
 2013-06-17: version 0.29.1
 ==========================
 1) 优化了viterbi算法的代码，分词速度提升15%
--- a/LICENSE
+++ b/LICENSE
@@ -0,0 +1,20 @@
 The MIT License (MIT)
 Copyright (c) 2013 Sun Junyi
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
 the Software without restriction, including without limitation the rights to
 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 the Software, and to permit persons to whom the Software is furnished to do so,
 subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
 graft README.md
 graft Changelog
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -1,4 +1,7 @@
 from __future__ import with_statement
 __version__ = '0.31'
 __license__ = 'MIT'
 import re
 import math
@@ -299,13 +302,17 @@ def __lcut_all(sentence):
 def __lcut_for_search(sentence):
    return list(__ref_cut_for_search(sentence))
@require_initialized
-def enable_parallel(processnum):
+def enable_parallel(processnum=None):
    global pool,cut,cut_for_search
    if os.name=='nt':
        raise Exception("parallel mode only supports posix system")
-
+    if sys.version_info[0]==2 and sys.version_info[1]<6:
-    from multiprocessing import Pool
+        raise Exception("jieba: the parallel feature needs Python version>2.5 ")
    from multiprocessing import Pool,cpu_count
    if processnum==None:
        processnum = cpu_count()
    pool = Pool(processnum)
    def pcut(sentence,cut_all=False):
--- a/jieba/finalseg/__init__.py
+++ b/jieba/finalseg/__init__.py
@@ -1,12 +1,15 @@
 import re
 import os
-from math import log
+import marshal
-from . import prob_start
+import sys
 from . import prob_trans
 from . import prob_emit
 MIN_FLOAT=-3.14e100
 PROB_START_P = "prob_start.p"
 PROB_TRANS_P = "prob_trans.p"
 PROB_EMIT_P = "prob_emit.p"
 PrevStatus = {
    'B':('E','S'),
    'M':('M','B'),
@@ -14,6 +17,35 @@ PrevStatus = {
    'E':('B','M')
 }
 def load_model():
    _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) )  )
    start_p = {}
    abs_path = os.path.join(_curpath, PROB_START_P)
    with open(abs_path, mode='rb') as f:
        start_p = marshal.load(f)
    f.closed
    trans_p = {}
    abs_path = os.path.join(_curpath, PROB_TRANS_P)
    with open(abs_path, 'rb') as f:
        trans_p = marshal.load(f)
    f.closed
    emit_p = {}
    abs_path = os.path.join(_curpath, PROB_EMIT_P)
    with file(abs_path, 'rb') as f:
        emit_p = marshal.load(f)
    f.closed
    return start_p, trans_p, emit_p
 if sys.platform.startswith("java"):
    start_P, trans_P, emit_P = load_model()    
 else:
    import prob_start,prob_trans,prob_emit
    start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P
 def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}] #tabular
    path = {}
@@ -36,7 +68,8 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
 def __cut(sentence):
-    prob, pos_list =  viterbi(sentence,('B','M','E','S'), prob_start.P, prob_trans.P, prob_emit.P)
+    global emit_P
    prob, pos_list =  viterbi(sentence,('B','M','E','S'), start_P, trans_P, emit_P)
    begin, next = 0,0
    #print pos_list, sentence
    for i,char in enumerate(sentence):
--- a/jieba/finalseg/prob_emit.p
+++ b/jieba/finalseg/prob_emit.p
--- a/jieba/finalseg/prob_start.p
+++ b/jieba/finalseg/prob_start.p
--- a/jieba/finalseg/prob_trans.p
+++ b/jieba/finalseg/prob_trans.p
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -3,29 +3,62 @@ import os
 from . import viterbi
 import jieba
 import sys
-from . import prob_start
+import marshal
 from . import prob_trans
 from . import prob_emit
 from . import char_state_tab
 default_encoding = sys.getfilesystemencoding()
-def load_model(f_name):
+PROB_START_P = "prob_start.p"
 PROB_TRANS_P = "prob_trans.p"
 PROB_EMIT_P = "prob_emit.p"
 CHAR_STATE_TAB_P = "char_state_tab.p"
 def load_model(f_name,isJython=True):
    _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) )  )
-    prob_p_path = os.path.join(_curpath,f_name)
+
-    if f_name.endswith(".py"):
+    result = {}
-        return eval(open(prob_p_path,"rb").read())
+    with file(f_name, "rb") as f:
    else:
        result = {}
        for line in open(f_name,"rb"):
            line = line.strip()
            if line=="":continue
            line = line.decode("utf-8")
            word, _, tag = line.split(" ")
            result[word]=tag
    f.closed
    if not isJython:
        return result
    start_p = {}
    abs_path = os.path.join(_curpath, PROB_START_P)
    with open(abs_path, mode='rb') as f:
        start_p = marshal.load(f)
    f.closed
    trans_p = {}
    abs_path = os.path.join(_curpath, PROB_TRANS_P)
    with open(abs_path, 'rb') as f:
        trans_p = marshal.load(f)
    f.closed
    emit_p = {}
    abs_path = os.path.join(_curpath, PROB_EMIT_P)
    with file(abs_path, 'rb') as f:
        emit_p = marshal.load(f)
    f.closed
-word_tag_tab = load_model(jieba.get_abs_path_dict())
+    state = {}
    abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
    with file(abs_path, 'rb') as f:
        state = marshal.load(f)
    f.closed
    return state, start_p, trans_p, emit_p, result
 if sys.platform.startswith("java"):
    char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
 else:
    import char_state_tab, prob_start, prob_trans, prob_emit
    char_state_tab_P, start_P, trans_P, emit_P = char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P
    word_tag_tab = load_model(jieba.get_abs_path_dict(),isJython=False)
 if jieba.user_word_tag_tab:
    word_tag_tab.update(jieba.user_word_tag_tab)
@@ -48,7 +81,7 @@ class pair(object):
        return self.__unicode__().encode(arg)
 def __cut(sentence):
-    prob, pos_list =  viterbi.viterbi(sentence,char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P)
+    prob, pos_list =  viterbi.viterbi(sentence,char_state_tab_P, start_P, trans_P, emit_P)
    begin, next = 0,0
    for i,char in enumerate(sentence):
--- a/jieba/posseg/char_state_tab.p
+++ b/jieba/posseg/char_state_tab.p
--- a/jieba/posseg/prob_emit.p
+++ b/jieba/posseg/prob_emit.p
--- a/jieba/posseg/prob_start.p
+++ b/jieba/posseg/prob_start.p
--- a/jieba/posseg/prob_trans.p
+++ b/jieba/posseg/prob_trans.p
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
 from distutils.core import setup  
 setup(name='jieba',  
-      version='0.29.1',  
+      version='0.31',  
      description='Chinese Words Segementation Utilities',  
      author='Sun, Junyi',  
      author_email='ccnusjy@gmail.com',  
--- a/test/parallel/test_file.py
+++ b/test/parallel/test_file.py
@@ -2,18 +2,18 @@ import sys,time
 import sys
 sys.path.append("../../")
 import jieba
-jieba.enable_parallel(4)
+
 jieba.enable_parallel()
 url = sys.argv[1]
 content = open(url,"rb").read()
 t1 = time.time()
-words = list(jieba.cut(content))
+words = "/ ".join(jieba.cut(content))
 t2 = time.time()
 tm_cost = t2-t1
 log_f = open("1.log","wb")
-for w in words:
+log_f.write(words.encode('utf-8'))
    log_f.write(w.encode("utf-8"))
 print('speed' , len(content)/tm_cost, " bytes/second")
--- a/test/test_file.py
+++ b/test/test_file.py
@@ -7,12 +7,14 @@ jieba.initialize()
 url = sys.argv[1]
 content = open(url,"rb").read()
 t1 = time.time()
-words = list(jieba.cut(content))
+words = "/ ".join(jieba.cut(content))
 t2 = time.time()
 tm_cost = t2-t1
 log_f = open("1.log","wb")
 log_f.write(words.encode('utf-8'))
 log_f.write(bytes("/ ".join(words),'utf-8'))