update to v0.33

2025-07-10 00:01:33 +08:00 · 2014-09-06 23:28:47 +08:00 · 2014-09-06 23:28:47 +08:00 · 6fad5fbb2c
commit 6fad5fbb2c
parent 6eb43acc10
32 changed files with 176556 additions and 130 deletions
--- a/6
+++ b/6
@ -1,3 +1,9 @@
 2014-08-31: version 0.33
 1. 支持自定义stop words; by @fukuball
 2. 支持自定义idf词典; by @fukuball
 3. 修复自定义词典的词性不能正常显示的bug; by @ShuraChow
 2014-02-07: version 0.32
 1. 新增分词选项：可以关闭新词发现功能；详见：https://github.com/fxsjy/jieba/blob/master/test/test_no_hmm.py#L8
 2. 修复posseg子模块的Bug；详见: https://github.com/fxsjy/jieba/issues/111 https://github.com/fxsjy/jieba/issues/132
--- a/README.md
+++ b/README.md
@ -8,7 +8,6 @@ jieba
 注意！
 ========
 这个branch `jieba3k`是专门用于Python3.x的版本
 =======
 Feature
@ -38,7 +37,7 @@ Python 2.x 下的安装
 * 全自动安装：`easy_install jieba` 或者 `pip install jieba`
 * 半自动安装：先下载 http://pypi.python.org/pypi/jieba/ ，解压后运行 python setup.py install
 * 手动安装：将 jieba 目录放置于当前目录或者 site-packages 目录
-* 通过import jieba 来引用 （第一次import时需要构建Trie树，需要几秒时间）
+* 通过 import jieba 来引用
 Python 3.x 下的安装
@ -49,6 +48,8 @@ Python 3.x 下的安装
        git clone https://github.com/fxsjy/jieba.git
        git checkout jieba3k
        python setup.py install
 * 或使用pip3安装： pip3 install jieba3k
 结巴分词 Java 版本
 ================
@ -94,11 +95,9 @@ Algorithm
 	seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
 	print("Default Mode:", "/ ".join(seg_list)) # 精确模式
 	seg_list = jieba.cut("他来到了网易杭研大厦")  # 默认是精确模式
 	print(", ".join(seg_list))
 	seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造")  # 搜索引擎模式
 	print(", ".join(seg_list))
@ -142,6 +141,18 @@ Output:
 	https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
 关键词提取所使用逆向文件频率（IDF）文本语料库可以切换成自定义语料库的路径
 * 用法： jieba.analyse.set_idf_path(file_name) # file_name为自定义语料库的路径
 * 自定义语料库示例：https://github.com/fxsjy/jieba/blob/master/extra_dict/idf.txt.big
 * 用法示例：https://github.com/fxsjy/jieba/blob/master/test/extract_tags_idfpath.py
 关键词提取所使用停止词（Stop Words）文本语料库可以切换成自定义语料库的路径
 * 用法： jieba.analyse.set_stop_words(file_name) # file_name为自定义语料库的路径
 * 自定义语料库示例：https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt
 * 用法示例：https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py
 功能 4) : 词性标注
 ================
 * 标注句子分词后每个词的词性，采用和 ictclas 兼容的标记法
@ -173,7 +184,7 @@ Output:
 功能 6) : Tokenize：返回词语在原文的起始位置
 ============================================
-* 注意，输入参数只接受unicode
+* 注意，输入参数只接受 str
 * 默认模式
 ```python
@ -335,9 +346,9 @@ Function 2): Add a custom dictionary
 		李小福 2
 		创新办 3
-		之前： 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
+		[Before]： 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
-		加载自定义词库后：　李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
+		[After]：　李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
 Function 3): Keyword Extraction
 ================
@ -349,6 +360,18 @@ Code sample (keyword extraction)
 	https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
 Developers can specify their own custom IDF corpus in jieba keyword extraction
 * Usage： `jieba.analyse.set_idf_path(file_name) # file_name is a custom corpus path`
 * Custom Corpus Sample：https://github.com/fxsjy/jieba/blob/master/extra_dict/idf.txt.big
 * Sample Code：https://github.com/fxsjy/jieba/blob/master/test/extract_tags_idfpath.py
 Developers can specify their own custom stop words corpus in jieba keyword extraction
 * Usage： `jieba.analyse.set_stop_words(file_name) # file_name is a custom corpus path`
 * Custom Corpus Sample：https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt
 * Sample Code：https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py
 Using Other Dictionaries
 ========
 It is possible to supply Jieba with your own custom dictionary, and there are also two dictionaries readily available for download:
--- a/extra_dict/idf.txt.big
+++ b/extra_dict/idf.txt.big
--- a/extra_dict/stop_words.txt
+++ b/extra_dict/stop_words.txt
@ -0,0 +1,51 @@
 the
 of
 is
 and
 to
 in
 that
 we
 for
 an
 are
 by
 be
 as
 on
 with
 can
 if
 from
 which
 you
 it
 this
 then
 at
 have
 all
 not
 one
 has
 or
 that
 的
 了
 和
 是
 就
 都
 而
 及
 與
 著
 或
 一個
 沒有
 我們
 你們
 妳們
 他們
 她們
 是否
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@ -91,8 +91,8 @@ def initialize(*args):
        if load_from_cache_fail:
            trie,FREQ,total = gen_trie(abs_path)
-            FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.items()]) #normalize
+            FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize
-            min_freq = min(FREQ.values())
+            min_freq = min(FREQ.itervalues())
            logger.debug("dumping model to file cache %s" % cache_file)
            try:
                tmp_suffix = "."+str(random.random())
@ -131,7 +131,7 @@ def require_initialized(fn):
 def __cut_all(sentence):
    dag = get_DAG(sentence)
    old_j = -1
-    for k,L in dag.items():
+    for k,L in dag.iteritems():
        if len(L)==1 and k>old_j:
            yield sentence[k:L[0]+1]
            old_j = L[0]
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@ -1,3 +1,4 @@
 #encoding=utf-8
 import jieba
 import os
 try:
@ -6,29 +7,56 @@ except ImportError:
    pass
 _curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) )  )
-f_name = os.path.join(_curpath,"idf.txt")
+abs_path = os.path.join(_curpath, "idf.txt")
 content = open(f_name,'rb').read().decode('utf-8')
 IDF_DICTIONARY = abs_path
 STOP_WORDS = set([
    "the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
 ])
 def set_idf_path(idf_path):
     """Record a custom IDF dictionary path in the module-level IDF_DICTIONARY.

     ``idf_path`` is joined onto the current working directory and
     normalised before it is stored.

     Raises:
         Exception: if the resolved path does not exist.
     """
     global IDF_DICTIONARY
     resolved = os.path.join(os.getcwd(), idf_path)
     resolved = os.path.normpath(resolved)
     if os.path.exists(resolved):
         IDF_DICTIONARY = resolved
         return None
     raise Exception("jieba: path does not exist:" + resolved)
 def get_idf(abs_path):
     """Load an IDF table from a UTF-8 encoded text file.

     Every non-blank line must hold a word and its IDF weight separated by
     a single space.

     Args:
         abs_path: path of the IDF file to read.

     Returns:
         A ``(idf_freq, median_idf)`` tuple: the word -> weight dict, and the
         weight found at index ``len(idf_freq) // 2`` of the sorted weights.

     Raises:
         ValueError: if the file holds no entries, or a line does not split
             into exactly one word and one numeric weight.
     """
     # Read through a context manager so the handle is closed right away.
     with open(abs_path, 'rb') as idf_file:
         content = idf_file.read().decode('utf-8')
     idf_freq = {}
     for line in content.split('\n'):
         line = line.strip()
         if not line:
             # A trailing newline produces an empty last element; skip it
             # instead of failing on the two-value unpack below.
             continue
         word, freq = line.split(' ')
         idf_freq[word] = float(freq)
     if not idf_freq:
         raise ValueError("jieba: idf file has no entries:" + abs_path)
     # Floor division: a float index raises TypeError on Python 3.
     median_idf = sorted(idf_freq.values())[len(idf_freq) // 2]
     return idf_freq, median_idf
-median_idf = sorted(idf_freq.values())[int(len(idf_freq)/2)]
+def set_stop_words(stop_words_path):
-stop_words= set([
+    global STOP_WORDS
-"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
+    abs_path = os.path.normpath( os.path.join( os.getcwd(), stop_words_path )  )
-])
+    if not os.path.exists(abs_path):
        raise Exception("jieba: path does not exist:" + abs_path)
    content = open(abs_path,'rb').read().decode('utf-8')
    lines = content.split('\n')
    for line in lines:
        STOP_WORDS.add(line)
    return
 def extract_tags(sentence,topK=20):
    global IDF_DICTIONARY
    global STOP_WORDS
    idf_freq, median_idf = get_idf(IDF_DICTIONARY)
    words = jieba.cut(sentence)
    freq = {}
    for w in words:
        if len(w.strip())<2: continue
-        if w.lower() in stop_words: continue
+        if w.lower() in STOP_WORDS: continue
        freq[w]=freq.get(w,0.0)+1.0
    total = sum(freq.values())
-    freq = [(k,v/total) for k,v in freq.items()]
+    freq = [(k,v/total) for k,v in freq.iteritems()]
    tf_idf_list = [(v * idf_freq.get(k,median_idf),k) for k,v in freq]
    st_list = sorted(tf_idf_list,reverse=True)
--- a/setup.py
+++ b/setup.py
@ -1,6 +1,6 @@
 from distutils.core import setup  
 setup(name='jieba3k',  
-      version='0.32',  
+      version='0.33',  
      description='Chinese Words Segementation Utilities',  
      author='Sun, Junyi',  
      author_email='ccnusjy@gmail.com',  
--- a/test/extract_tags_idfpath.py
+++ b/test/extract_tags_idfpath.py
@ -0,0 +1,32 @@
 # Demo script: print the top-K keywords of a text file, using a custom IDF
 # corpus for the keyword weighting.
 import sys
 sys.path.append('../')  # also search the parent directory for packages
 import jieba
 import jieba.analyse
 from optparse import OptionParser
 USAGE = "usage:    python extract_tags_idfpath.py [file name] -k [top k]"
 parser = OptionParser(USAGE)
 parser.add_option("-k", dest="topK")
 opt, args = parser.parse_args()
 if not args:
     # The input file name is the one mandatory positional argument.
     print(USAGE)
     sys.exit(1)
 file_name = args[0]
 # Default to 10 keywords when -k is not supplied.
 topK = 10 if opt.topK is None else int(opt.topK)
 # Read inside a with-block so the file handle is closed before extraction.
 with open(file_name, 'rb') as source_file:
     content = source_file.read()
 jieba.analyse.set_idf_path("../extra_dict/idf.txt.big")
 tags = jieba.analyse.extract_tags(content, topK=topK)
 print(",".join(tags))
--- a/test/extract_tags_stop_words.py
+++ b/test/extract_tags_stop_words.py
@ -0,0 +1,33 @@
 # Demo script: print the top-K keywords of a text file, using a custom
 # stop-word list together with a custom IDF corpus.
 import sys
 sys.path.append('../')  # also search the parent directory for packages
 import jieba
 import jieba.analyse
 from optparse import OptionParser
 USAGE = "usage:    python extract_tags_stop_words.py [file name] -k [top k]"
 parser = OptionParser(USAGE)
 parser.add_option("-k", dest="topK")
 opt, args = parser.parse_args()
 if not args:
     # The input file name is the one mandatory positional argument.
     print(USAGE)
     sys.exit(1)
 file_name = args[0]
 # Default to 10 keywords when -k is not supplied.
 topK = 10 if opt.topK is None else int(opt.topK)
 # Read inside a with-block so the file handle is closed before extraction.
 with open(file_name, 'rb') as source_file:
     content = source_file.read()
 jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
 jieba.analyse.set_idf_path("../extra_dict/idf.txt.big")
 tags = jieba.analyse.extract_tags(content, topK=topK)
 print(",".join(tags))
--- a/test/extract_topic.py
+++ b/test/extract_topic.py
@ -12,7 +12,7 @@ import os
 import random
 if len(sys.argv)<2:
-    print "usage: extract_topic.py directory [n_topic] [n_top_words]"
+    print("usage: extract_topic.py directory [n_topic] [n_top_words]")
    sys.exit(0)
 n_topic = 10
@ -28,27 +28,27 @@ count_vect = CountVectorizer()
 docs = []
 pattern = os.path.join(sys.argv[1],"*.txt") 
-print "read "+pattern
+print("read "+pattern)
 for f_name in glob.glob(pattern):
    with open(f_name) as f:
-        print "read file:", f_name
+        print("read file:", f_name)
        for line in f: #one line as a document
            words = " ".join(jieba.cut(line))
            docs.append(words)
 random.shuffle(docs)
-print "read done."
+print("read done.")
-print "transform"
+print("transform")
 counts = count_vect.fit_transform(docs)
 tfidf = TfidfTransformer().fit_transform(counts)
-print tfidf.shape
+print(tfidf.shape)
 t0 = time.time()
-print "training..."
+print("training...")
 nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
 print("done in %0.3fs." % (time.time() - t0))
--- a/test/jieba_test.py
+++ b/test/jieba_test.py
@ -1,7 +1,7 @@
 #-*-coding: utf-8 -*-
 import sys
 import imp
 sys.path.append("../")
 from imp import reload
 import unittest
 import types
 import jieba
@ -98,7 +98,7 @@ test_contents = [
 class JiebaTestCase(unittest.TestCase):
    def setUp(self):
-        reload(jieba)
+        imp.reload(jieba)
    def tearDown(self):
        pass
--- a/test/jiebacmd.py
+++ b/test/jiebacmd.py
@ -23,6 +23,6 @@ while True:
        break
    line = line.strip()
    for word in jieba.cut(line):
-        print(word.encode(default_encoding))
+        print(word)
--- a/test/parallel/test.py
+++ b/test/parallel/test.py
@ -6,7 +6,9 @@ jieba.enable_parallel(4)
 def cuttest(test_sent):
    result = jieba.cut(test_sent)
-    print( "/ ".join(result) ) 
+    for word in result:
        print(word, "/", end=' ') 
    print("")
 if __name__ == "__main__":
--- a/test/parallel/test2.py
+++ b/test/parallel/test2.py
@ -6,7 +6,9 @@ jieba.enable_parallel(4)
 def cuttest(test_sent):
    result = jieba.cut(test_sent,cut_all=True)
-    print("/ ".join(result))
+    for word in result:
        print(word, "/", end=' ') 
    print("")
 if __name__ == "__main__":
--- a/test/parallel/test_cut_for_search.py
+++ b/test/parallel/test_cut_for_search.py
@ -6,7 +6,9 @@ jieba.enable_parallel(4)
 def cuttest(test_sent):
    result = jieba.cut_for_search(test_sent)
-    print("/ ".join(result))
+    for word in result:
        print(word, "/", end=' ') 
    print("")
 if __name__ == "__main__":
--- a/test/parallel/test_file.py
+++ b/test/parallel/test_file.py
@ -1,3 +1,4 @@
 import urllib.request, urllib.error, urllib.parse
 import sys,time
 import sys
 sys.path.append("../../")
@ -6,16 +7,15 @@ import jieba
 jieba.enable_parallel()
 url = sys.argv[1]
-with open(url,"rb") as content:
+content = open(url,"rb").read()
    content = content.read()
 t1 = time.time()
 words = "/ ".join(jieba.cut(content))
 t2 = time.time()
 tm_cost = t2-t1
    print('cost',tm_cost)
    print('speed' , len(content)/tm_cost, " bytes/second")
-with open("1.log","wb") as log_f:
+log_f = open("1.log","wb")
 log_f.write(words.encode('utf-8'))
 print('speed' , len(content)/tm_cost, " bytes/second")
--- a/test/parallel/test_pos.py
+++ b/test/parallel/test_pos.py
@ -8,7 +8,7 @@ import jieba.posseg as pseg
 def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for w in result:
-        sys.stdout.write(w.word+ "/"+ w.flag + ", ") 
+        print(w.word, "/", w.flag, ", ", end=' ')  
    print("")
--- a/test/parallel/test_pos_file.py
+++ b/test/parallel/test_pos_file.py
@ -1,4 +1,4 @@
-import urllib2
+import urllib.request, urllib.error, urllib.parse
 import sys,time
 import sys
 sys.path.append("../../")
@ -16,7 +16,7 @@ tm_cost = t2-t1
 log_f = open("1.log","wb")
 for w in words:
-    print >> log_f, w.encode("utf-8"), "/" ,
+    print(w.encode("utf-8"), "/", end=' ', file=log_f)
-print 'speed' , len(content)/tm_cost, " bytes/second"
+print('speed' , len(content)/tm_cost, " bytes/second")
--- a/test/test.py
+++ b/test/test.py
@ -3,6 +3,7 @@ import sys
 sys.path.append("../")
 import jieba
 def cuttest(test_sent):
    result = jieba.cut(test_sent)
    print(" / ".join(result))
--- a/test/test_cut_for_search.py
+++ b/test/test_cut_for_search.py
@ -5,8 +5,9 @@ import jieba
 def cuttest(test_sent):
    result = jieba.cut_for_search(test_sent)
-    print("/ ".join(result))
+    for word in result:
-
+        print(word, "/", end=' ') 
    print("")
 if __name__ == "__main__":
--- a/test/test_cutall.py
+++ b/test/test_cutall.py
@ -5,7 +5,9 @@ import jieba
 def cuttest(test_sent):
    result = jieba.cut(test_sent,cut_all=True)
-    print("/ ".join(result))
+    for word in result:
        print(word, "/", end=' ') 
    print("")
 if __name__ == "__main__":
--- a/test/test_file.py
+++ b/test/test_file.py
@ -1,3 +1,4 @@
 import urllib.request, urllib.error, urllib.parse
 import sys,time
 import sys
 sys.path.append("../")
@ -5,15 +6,17 @@ import jieba
 jieba.initialize()
 url = sys.argv[1]
-with open(url,"rb") as content:
+content = open(url,"rb").read()
    content = content.read()
 t1 = time.time()
 words = "/ ".join(jieba.cut(content))
 t2 = time.time()
 tm_cost = t2-t1
 log_f = open("1.log","wb")
 log_f.write(words.encode('utf-8'))
 log_f.close()
 print('cost',tm_cost)
 print('speed' , len(content)/tm_cost, " bytes/second")
 with open("1.log","wb") as log_f:
    log_f.write(words.encode('utf-8'))
    log_f.write(bytes("/ ".join(words),'utf-8'))
--- a/test/test_pos.py
+++ b/test/test_pos.py
@ -6,7 +6,7 @@ import jieba.posseg as pseg
 def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for w in result:
-        sys.stdout.write(w.word+ "/"+ w.flag + ", ") 
+        print(w.word, "/", w.flag, ", ", end=' ')  
    print("")
--- a/test/test_pos_file.py
+++ b/test/test_pos_file.py
@ -1,3 +1,4 @@
 import urllib.request, urllib.error, urllib.parse
 import sys,time
 import sys
 sys.path.append("../")
@ -15,7 +16,7 @@ tm_cost = t2-t1
 log_f = open("1.log","wb")
 for w in words:
-    log_f.write(bytes(w.word+"/"+w.flag+" ",'utf-8'))
+    print(w.encode("utf-8"), "/", end=' ', file=log_f)
 print('speed' , len(content)/tm_cost, " bytes/second")
--- a/test/test_userdict.py
+++ b/test/test_userdict.py
@ -14,7 +14,7 @@ for w in words:
 result = pseg.cut(test_sent)
 for w in result:
-    print(w.word, "/", w.flag, ", ")
+    print(w.word, "/", w.flag, ", ", end=' ')
 print("\n========")
--- a/test/test_whoosh.py
+++ b/test/test_whoosh.py
@ -59,5 +59,5 @@ for keyword in ("水果世博园","你","first","中文","交换机","交换"):
        print(hit.highlights("content"))
    print("="*10)
-for t in analyzer("我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream."):
+for t in analyzer("我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"):
    print(t.text)
--- a/test/test_whoosh_flie.py
+++ b/test/test_whoosh_flie.py
@ -23,8 +23,8 @@ with open(file_name,"rb") as inf:
    for line in inf:
        i+=1
        writer.add_document(
-            title=u"line"+str(i), 
+            title="line"+str(i), 
-            path=u"/a",
+            path="/a",
            content=line.decode('gbk','ignore')
        )
 writer.commit()
@ -32,10 +32,10 @@ writer.commit()
 searcher = ix.searcher()
 parser = QueryParser("content", schema=ix.schema)
-for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换"):
+for keyword in ("水果小姐","你","first","中文","交换机","交换"):
-    print "result of ",keyword
+    print("result of ",keyword)
    q = parser.parse(keyword)
    results = searcher.search(q)
    for hit in results:  
-        print hit.highlights("content")
+        print(hit.highlights("content"))
-    print "="*10
+    print("="*10)
--- a/test/test_whoosh_flie_read.py
+++ b/test/test_whoosh_flie_read.py
@ -18,10 +18,10 @@ ix = open_dir("tmp")
 searcher = ix.searcher()
 parser = QueryParser("content", schema=ix.schema)
-for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换",u"少林",u"乔峰"):
+for keyword in ("水果小姐","你","first","中文","交换机","交换","少林","乔峰"):
-    print "result of ",keyword
+    print("result of ",keyword)
    q = parser.parse(keyword)
    results = searcher.search(q)
    for hit in results:  
-        print hit.highlights("content")
+        print(hit.highlights("content"))
-    print "="*10
+    print("="*10)