merge master to jieba3k

commit be1686654d by fxsjy, 2013-06-08 11:18:56 +08:00
12 changed files with 241210 additions and 240961 deletions

View File

@@ -1,6 +1,21 @@
2013-04-27: version 0.28.1
2013-06-07: version 0.29
==========================
1) hotfix. Fixed a bug in English handling under full mode.
1) Improved the accuracy of named-entity recognition in the finalseg submodule
2) Fixed a number of bad cases
2013-06-01: version 0.28.4
==========================
1) Fixed a number of bad cases
2) Added a wraps decorator, by @cloudaice
3) Added unit tests, by @cloudaice
2013-05-02: version 0.28.3
==========================
1) Fixed an error when generating the temporary cache file under the PyPy interpreter
2013-04-28: version 0.28.2
==========================
1) Fixed a bug with default-argument binding in the initialize function.
2013-04-27: version 0.28
========================
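The 0.28.2 entry refers to Python evaluating default arguments once, at function-definition time, which is what the initialize() change further down (from a `dictionary=DICTIONARY` default to `*args` read at call time) works around. A minimal sketch of the pitfall, with illustrative names rather than jieba's actual module state:

```python
DICTIONARY = "dict.txt"

def initialize_bad(dictionary=DICTIONARY):
    # The default is bound when the function is defined, so a later change
    # to the global DICTIONARY (e.g. via set_dictionary) is silently ignored.
    print("loading", dictionary)

def initialize_good(*args):
    # Reading the global at call time picks up the change.
    dictionary = args[0] if args else DICTIONARY
    print("loading", dictionary)

DICTIONARY = "big_dict.txt"
initialize_bad()    # still loads dict.txt
initialize_good()   # loads big_dict.txt
```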

File diff suppressed because it is too large

View File

@@ -11,6 +11,7 @@ import marshal
from math import log
import random
import threading
from functools import wraps
DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
@@ -45,8 +46,12 @@ def gen_trie(f_name):
raise e
return trie, lfreq,ltotal
def initialize(dictionary=DICTIONARY):
def initialize(*args):
global trie, FREQ, total, min_freq, initialized
if len(args)==0:
dictionary = DICTIONARY
else:
dictionary = args[0]
with DICT_LOCK:
if initialized:
return
@@ -78,7 +83,8 @@ def initialize(dictionary=DICTIONARY):
min_freq = min(FREQ.values())
print("dumping model to file cache " + cache_file, file=sys.stderr)
tmp_suffix = "."+str(random.random())
marshal.dump((trie,FREQ,total,min_freq),open(cache_file+tmp_suffix,'wb'))
with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
marshal.dump((trie,FREQ,total,min_freq),temp_cache_file)
if os.name=='nt':
import shutil
replace_file = shutil.move
@@ -94,7 +100,8 @@ def initialize(dictionary=DICTIONARY):
def require_initialized(fn):
global initialized,DICTIONARY
@wraps(fn)
def wrapped(*args, **kwargs):
if initialized:
return fn(*args, **kwargs)
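The @wraps(fn) added here (the @cloudaice change noted for 0.28.4) keeps the decorated function's name and docstring intact on the wrapper. A toy stand-in for the pattern, not the real jieba internals:

```python
from functools import wraps

initialized = False

def initialize():
    global initialized
    initialized = True

def require_initialized(fn):
    @wraps(fn)  # preserves fn.__name__ and fn.__doc__ on the wrapper
    def wrapped(*args, **kwargs):
        if not initialized:
            initialize()
        return fn(*args, **kwargs)
    return wrapped

@require_initialized
def cut(sentence):
    """Toy stand-in for jieba.cut."""
    return sentence.split()

print(cut.__name__)  # 'cut' rather than 'wrapped', thanks to @wraps
```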
@@ -171,9 +178,13 @@ def __cut_DAG(sentence):
yield buf
buf=''
else:
regognized = finalseg.cut(buf)
for t in regognized:
yield t
if not (buf in FREQ):
regognized = finalseg.cut(buf)
for t in regognized:
yield t
else:
for elem in buf:
yield elem
buf=''
yield l_word
x =y
@@ -182,14 +193,19 @@ def __cut_DAG(sentence):
if len(buf)==1:
yield buf
else:
regognized = finalseg.cut(buf)
for t in regognized:
yield t
if not (buf in FREQ):
regognized = finalseg.cut(buf)
for t in regognized:
yield t
else:
for elem in buf:
yield elem
def cut(sentence,cut_all=False):
if( type(sentence) is bytes):
try:
sentence = sentence.decode('utf-8')
except:
except UnicodeDecodeError:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
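The cache dump above now goes through a `with` block, so the temporary file is flushed and closed before it is moved into place (the PyPy fix noted for 0.28.3); on Windows, `shutil.move` is used because `os.rename` will not overwrite an existing target there. A self-contained sketch of the same write-temp-then-rename pattern, with stand-in data instead of the real trie:

```python
import marshal
import os
import random
import shutil
import tempfile

def dump_cache(obj, cache_file):
    # Write to a uniquely named temporary file first; the with block
    # guarantees the data is flushed and the handle closed before the move.
    tmp_path = cache_file + "." + str(random.random())
    with open(tmp_path, "wb") as f:
        marshal.dump(obj, f)
    # Then move it into place. os.rename refuses to overwrite an existing
    # target on Windows, hence shutil.move there.
    replace_file = shutil.move if os.name == "nt" else os.rename
    replace_file(tmp_path, cache_file)

cache = os.path.join(tempfile.gettempdir(), "jieba_demo.cache")
dump_cache(({"中": 1}, 100, 0.5), cache)
with open(cache, "rb") as f:
    print(marshal.load(f))
```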

View File

@@ -1,3 +1,10 @@
AT&T 3 nz
B超 3 n
c# 3 nz
C# 3 nz
c++ 3 nz
C++ 3 nz
T恤 4 n
一 217830 m
一一 1670 m
一一二 11 m
@@ -9162,6 +9169,7 @@
不弱 3 a
不强 3 v
不强不弱 3 l
不归路 3 i
不归零制 3 i
不当 657 d
不当不正 3 i
@@ -13936,7 +13944,7 @@
中准价 3 nz
中凯 3 nz
中凯文化 3 ns
中出 3 nt
中出 3 vn
中分 3 n
中切 3 ns
中列明 3 ns
@@ -124944,6 +124952,7 @@
屋顶风机 3 n
屋项 3 n
屌 38 zg
屌丝 13 n
屍 27 zg
屎 523 n
屎壳郎 7 n
@@ -138428,7 +138437,6 @@
张丁华 4 nr
张万仙 2 nr
张万年 64 nr
张三 288 nr
张三丰 436 nr
张三之 2 nr
张三影 2 nr
@@ -146275,7 +146283,7 @@
性丑闻 3 i
性事 15 n
性亢奋 3 i
性交 193 n
性交 29 n
性交关系 3 n
性交图 3 n
性交时 3 n
@@ -245163,6 +245171,8 @@
皂鞋 3 n
的 318825 uj
的一确二 3 l
的哥 63 n
的士 20 n
的士高 3 n
的的确确 64 d
的确 2135 d
@@ -245173,8 +245183,6 @@
的里雅斯特 23 ns
的里雅斯特市 3 ns
的黎波里 62 ns
的哥 63 n
的士 20 n
皆 7511 d
皆佳 3 nrt
皆准 3 i
@@ -275108,6 +275116,7 @@
美男计 3 nz
美登木 5 nr
美登素 5 nr
美的 230 nr
美目 3 n
美目盼兮 6 i
美眉 44 ns
@@ -358737,6 +358746,7 @@
高妈 3 nr
高妙 16 a
高妹 3 n
高富帅 13 n
高姓大名 3 nr
高姿态 10 l
高威达 3 nr
@@ -367421,10 +367431,3 @@
龟龙麟凤 3 ns
龠 5 g
龢 732 zg
B超 3 n
T恤 4 n
C++ 3 nz
c++ 3 nz
C# 3 nz
c# 3 nz
AT&T 3 nz

View File

@@ -1,4 +1,4 @@
{'B': {'E': -0.16037786260859094, 'M': -1.9093400568760384},
{'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}
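These values look like the natural-log transition probabilities of the finalseg HMM over the B/M/E/S character tags; the retrained first row moves P(B→E)/P(B→M) from roughly 0.85/0.15 to 0.6/0.4. A quick sanity check of that reading (assuming the values really are log probabilities):

```python
import math

old = {'E': -0.16037786260859094, 'M': -1.9093400568760384}
new = {'E': -0.510825623765990, 'M': -0.916290731874155}

for name, row in (("old", old), ("new", new)):
    probs = {k: round(math.exp(v), 3) for k, v in row.items()}
    print(name, probs, "sum =", round(sum(probs.values()), 3))
# old {'E': 0.852, 'M': 0.148} sum = 1.0
# new {'E': 0.6, 'M': 0.4} sum = 1.0
```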

View File

@@ -106,9 +106,13 @@ def __cut_DAG(sentence):
yield pair(buf,word_tag_tab.get(buf,'x'))
buf=''
else:
regognized = __cut_detail(buf)
for t in regognized:
yield t
if not (buf in jieba.FREQ):
regognized = __cut_detail(buf)
for t in regognized:
yield t
else:
for elem in buf:
yield pair(elem,word_tag_tab.get(elem,'x'))
buf=''
yield pair(l_word,word_tag_tab.get(l_word,'x'))
x =y
@@ -117,10 +121,13 @@ def __cut_DAG(sentence):
if len(buf)==1:
yield pair(buf,word_tag_tab.get(buf,'x'))
else:
regognized = __cut_detail(buf)
for t in regognized:
yield t
if not (buf in jieba.FREQ):
regognized = __cut_detail(buf)
for t in regognized:
yield t
else:
for elem in buf:
yield pair(elem,word_tag_tab.get(elem,'x'))
def __cut_internal(sentence):
if not ( type(sentence) is str):
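The posseg change mirrors the __cut_DAG change above: an unmatched buffer is only handed to __cut_detail when it is not already in the dictionary, otherwise it is emitted character by character, and every yielded item is a pair carrying the word and its tag. For orientation, the pairs expose .word and .flag, as the test suite further down also relies on:

```python
import jieba.posseg as pseg

# Prints each segment together with its part-of-speech flag.
for w in pseg.cut("我爱北京天安门"):
    print(w.word, w.flag)
```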

View File

@@ -1,6 +1,6 @@
from distutils.core import setup
setup(name='jieba',
version='0.28.1',
version='0.29',
description='Chinese Word Segmentation Utilities',
author='Sun, Junyi',
author_email='ccnusjy@gmail.com',

View File

@@ -5,10 +5,10 @@ import jieba
import jieba.analyse
from optparse import OptionParser
USAGE ="usage: python extract_tags.py [file name] -k [top k]"
USAGE = "usage: python extract_tags.py [file name] -k [top k]"
parser = OptionParser(USAGE)
parser.add_option("-k",dest="topK")
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()
@@ -18,15 +18,14 @@ if len(args) <1:
file_name = args[0]
if opt.topK==None:
topK=10
if opt.topK is None:
topK = 10
else:
topK = int(opt.topK)
topK = int(opt.topK)
content = open(file_name, 'rb').read()
content = open(file_name,'rb').read()
tags = jieba.analyse.extract_tags(content,topK=topK)
tags = jieba.analyse.extract_tags(content, topK=topK)
print(",".join(tags))

test/extract_topic.py (new file, 63 lines)
View File

@@ -0,0 +1,63 @@
import sys
sys.path.append("../")
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import decomposition
import jieba
import time
import glob
import sys
import os
import random
if len(sys.argv)<2:
print "usage: extract_topic.py directory [n_topic] [n_top_words]"
sys.exit(0)
n_topic = 10
n_top_words = 25
if len(sys.argv)>2:
n_topic = int(sys.argv[2])
if len(sys.argv)>3:
n_top_words = int(sys.argv[3])
count_vect = CountVectorizer()
docs = []
pattern = os.path.join(sys.argv[1],"*.txt")
print "read "+pattern
for f_name in glob.glob(pattern):
with open(f_name) as f:
print "read file:", f_name
for line in f: #one line as a document
words = " ".join(jieba.cut(line))
docs.append(words)
random.shuffle(docs)
print "read done."
print "transform"
counts = count_vect.fit_transform(docs)
tfidf = TfidfTransformer().fit_transform(counts)
print(tfidf.shape)
t0 = time.time()
print "training..."
nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
print("done in %0.3fs." % (time.time() - t0))
# Inverse the vectorizer vocabulary to be able
feature_names = count_vect.get_feature_names()
for topic_idx, topic in enumerate(nmf.components_):
print("Topic #%d:" % topic_idx)
print(" ".join([feature_names[i]
for i in topic.argsort()[:-n_top_words - 1:-1]]))
print("")

test/jieba_test.py (new file, 153 lines)
View File

@@ -0,0 +1,153 @@
#-*-coding: utf-8 -*-
import sys
sys.path.append("../")
import unittest
import types
import importlib
import jieba
jieba.initialize()
test_contents = [
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。",
"雷猴回归人间。",
"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
"我需要廉租房",
"永和服装饰品有限公司",
"我爱北京天安门",
"abc",
"隐马尔可夫",
"雷猴是个好网站",
"“Microsoft”一词由“MICROcomputer微型计算机”和“SOFTware软件”两部分组成",
"草泥马和欺实马是今年的流行词汇",
"伊藤洋华堂总府店",
"中国科学院计算技术研究所",
"罗密欧与朱丽叶",
"我购买了道具和服装",
"PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍",
"湖北省石首市",
"湖北省十堰市",
"总经理完成了这件事情",
"电脑修好了",
"做好了这件事情就一了百了了",
"人们审美的观点是不同的",
"我们买了一个美的空调",
"线程初始化时我们要注意",
"一个分子是由好多原子组织成的",
"祝你马到功成",
"他掉进了无底洞里",
"中国的首都是北京",
"孙君意",
"外交部发言人马朝旭",
"领导人会议和第四届东亚峰会",
"在过去的这五年",
"还需要很长的路要走",
"60周年首都阅兵",
"你好人们审美的观点是不同的",
"买水果然后来世博园",
"买水果然后去世博园",
"但是后来我才知道你是对的",
"存在即合理",
"的的的的的在的的的的就以和和和",
"I love你不以为耻反以为rong",
"",
"",
"hello你好人们审美的观点是不同的",
"很好但主要是基于网页形式",
"hello你好人们审美的观点是不同的",
"为什么我不能拥有想要的生活",
"后来我才",
"此次来中国是为了",
"使用了它就可以解决一些问题",
",使用了它就可以解决一些问题",
"其实使用了它就可以解决一些问题",
"好人使用了它就可以解决一些问题",
"是因为和国家",
"老年搜索还支持",
"干脆就把那部蒙人的闲法给废了拉倒RT @laoshipukong : 27日全国人大常委会第三次审议侵权责任法草案删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ",
"",
"",
"他说的确实在理",
"长春市长春节讲话",
"结婚的和尚未结婚的",
"结合成分子时",
"旅游和服务是最好的",
"这件事情的确是我的错",
"供大家参考指正",
"哈尔滨政府公布塌桥原因",
"我在机场入口处",
"邢永臣摄影报道",
"BP神经网络如何训练才能在分类时增加区分度",
"南京市长江大桥",
"应一些使用者的建议也为了便于利用NiuTrans用于SMT研究",
'长春市长春药店',
'邓颖超生前最喜欢的衣服',
'胡锦涛是热爱世界和平的政治局常委',
'程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪',
'一次性交多少钱',
'两块五一套,三块八一斤,四块七一本,五块六一条',
'小和尚留了一个像大和尚一样的和尚头',
'我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站',
'张晓梅去人民医院做了个B超然后去买了件T恤',
'AT&T是一件不错的公司给你发offer了吗',
'C++和c#是什么关系11+122=133是吗PI=3.14159',
'你认识那个和主席握手的的哥吗?他开一辆黑色的士。',
'枪杆子中出政权']
class JiebaTestCase(unittest.TestCase):
def setUp(self):
importlib.reload(jieba)  # reload is not a builtin in Python 3
def tearDown(self):
pass
def testDefaultCut(self):
for content in test_contents:
result = jieba.cut(content)
assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
result = list(result)
assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
print(" , ".join(result), file=sys.stderr)
print("testDefaultCut", file=sys.stderr)
def testCutAll(self):
for content in test_contents:
result = jieba.cut(content, cut_all=True)
assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
result = list(result)
assert isinstance(result, list), "Test CutAll error on content: %s" % content
print(" , ".join(result), file=sys.stderr)
print("testCutAll", file=sys.stderr)
def testSetDictionary(self):
jieba.set_dictionary("foobar.txt")
for content in test_contents:
result = jieba.cut(content)
assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
result = list(result)
assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
print(" , ".join(result), file=sys.stderr)
print("testSetDictionary", file=sys.stderr)
def testCutForSearch(self):
for content in test_contents:
result = jieba.cut_for_search(content)
assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
result = list(result)
assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
print(" , ".join(result), file=sys.stderr)
print("testCutForSearch", file=sys.stderr)
def testPosseg(self):
import jieba.posseg as pseg
for content in test_contents:
result = pseg.cut(content)
assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
result = list(result)
assert isinstance(result, list), "Test Posseg error on content: %s" % content
print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
print("testPosseg", file=sys.stderr)
if __name__ == "__main__":
unittest.main()
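A minimal way to drive this suite programmatically, assuming jieba_test.py is importable (for example when run from the test/ directory):

```python
import unittest

# Load and run all JiebaTestCase tests with verbose output.
suite = unittest.defaultTestLoader.loadTestsFromName("jieba_test")
unittest.TextTestRunner(verbosity=2).run(suite)
```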

View File

@@ -5,7 +5,8 @@ import jieba
jieba.load_userdict("userdict.txt")
import jieba.posseg as pseg
test_sent = "李小福是创新办主任也是云计算方面的专家"
test_sent = "李小福是创新办主任也是云计算方面的专家;"
test_sent += "例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
words = jieba.cut(test_sent)
for w in words:
print(w)

View File

@@ -2,4 +2,5 @@
李小福 2 nr
创新办 3 i
easy_install 3 eng
好用 300
好用 300
韩玉赏鉴 3 nz
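Each userdict.txt line is `word frequency [part-of-speech tag]`, with the tag optional (as in the 好用 300 entry). A short sketch of loading it, assuming the file sits next to the script as in the test above:

```python
import jieba

jieba.load_userdict("userdict.txt")  # one "word freq [tag]" entry per line
words = jieba.cut("李小福是创新办主任也是云计算方面的专家")
print(" / ".join(words))
# With the custom entries loaded, the intent of the test is that words such
# as 李小福 and 创新办 come out as single tokens.
```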