merge master to jieba3k

commit be1686654d by fxsjy, 2013-06-08 11:18:56 +08:00
12 changed files with 241210 additions and 240961 deletions

View File

@@ -1,6 +1,21 @@
2013-04-27: version 0.28.1
2013-06-07: version 0.29
==========================
1) hotfix. Fixed a bug in English handling under full mode.
1) Improved the accuracy of named-entity recognition in the finalseg submodule
2) Fixed a number of bad cases
2013-06-01: version 0.28.4
==========================
1) Fixed a number of bad cases
2) Added a wraps decorator, by @cloudaice
3) Added unit tests, by @cloudaice
2013-05-02: version 0.28.3
==========================
1) Fixed an error when generating the temporary cache file under the PyPy interpreter
2013-04-28: version 0.28.2
==========================
1) Fixed a bug with default-argument binding in the initialize function.
2013-04-27: version 0.28
========================
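The 0.28.2 entry refers to Python evaluating default arguments once, at function-definition time, which is what the initialize() change further down (from a `dictionary=DICTIONARY` default to `*args` read at call time) works around. A minimal sketch of the pitfall, with illustrative names rather than jieba's actual module state:

```python
DICTIONARY = "dict.txt"

def initialize_bad(dictionary=DICTIONARY):
    # The default is bound when the function is defined, so a later change
    # to the global DICTIONARY (e.g. via set_dictionary) is silently ignored.
    print("loading", dictionary)

def initialize_good(*args):
    # Reading the global at call time picks up the change.
    dictionary = args[0] if args else DICTIONARY
    print("loading", dictionary)

DICTIONARY = "big_dict.txt"
initialize_bad()    # still loads dict.txt
initialize_good()   # loads big_dict.txt
```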

File diff suppressed because it is too large

View File

@@ -11,6 +11,7 @@ import marshal
from math import log
import random
import threading
from functools import wraps
DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
@@ -45,8 +46,12 @@ def gen_trie(f_name):
raise e
return trie, lfreq,ltotal
def initialize(dictionary=DICTIONARY):
def initialize(*args):
global trie, FREQ, total, min_freq, initialized
if len(args)==0:
dictionary = DICTIONARY
else:
dictionary = args[0]
with DICT_LOCK:
if initialized:
return
@@ -78,7 +83,8 @@ def initialize(dictionary=DICTIONARY):
min_freq = min(FREQ.values())
print("dumping model to file cache " + cache_file, file=sys.stderr)
tmp_suffix = "."+str(random.random())
marshal.dump((trie,FREQ,total,min_freq),open(cache_file+tmp_suffix,'wb'))
with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
marshal.dump((trie,FREQ,total,min_freq),temp_cache_file)
if os.name=='nt':
import shutil
replace_file = shutil.move
@@ -94,7 +100,8 @@ def initialize(dictionary=DICTIONARY):
def require_initialized(fn):
global initialized,DICTIONARY
@wraps(fn)
def wrapped(*args, **kwargs):
if initialized:
return fn(*args, **kwargs)
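The @wraps(fn) added here (the @cloudaice change noted for 0.28.4) keeps the decorated function's name and docstring intact on the wrapper. A toy stand-in for the pattern, not the real jieba internals:

```python
from functools import wraps

initialized = False

def initialize():
    global initialized
    initialized = True

def require_initialized(fn):
    @wraps(fn)  # preserves fn.__name__ and fn.__doc__ on the wrapper
    def wrapped(*args, **kwargs):
        if not initialized:
            initialize()
        return fn(*args, **kwargs)
    return wrapped

@require_initialized
def cut(sentence):
    """Toy stand-in for jieba.cut."""
    return sentence.split()

print(cut.__name__)  # 'cut' rather than 'wrapped', thanks to @wraps
```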
@@ -171,9 +178,13 @@ def __cut_DAG(sentence):
yield buf
buf=''
else:
regognized = finalseg.cut(buf)
for t in regognized:
yield t
if not (buf in FREQ):
regognized = finalseg.cut(buf)
for t in regognized:
yield t
else:
for elem in buf:
yield elem
buf=''
yield l_word
x =y
@@ -182,14 +193,19 @@ def __cut_DAG(sentence):
if len(buf)==1:
yield buf
else:
regognized = finalseg.cut(buf)
for t in regognized:
yield t
if not (buf in FREQ):
regognized = finalseg.cut(buf)
for t in regognized:
yield t
else:
for elem in buf:
yield elem
def cut(sentence,cut_all=False):
if( type(sentence) is bytes):
try:
sentence = sentence.decode('utf-8')
except:
except UnicodeDecodeError:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
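The cache dump above now goes through a `with` block, so the temporary file is flushed and closed before it is moved into place (the PyPy fix noted for 0.28.3); on Windows, `shutil.move` is used because `os.rename` will not overwrite an existing target there. A self-contained sketch of the same write-temp-then-rename pattern, with stand-in data instead of the real trie:

```python
import marshal
import os
import random
import shutil
import tempfile

def dump_cache(obj, cache_file):
    # Write to a uniquely named temporary file first; the with block
    # guarantees the data is flushed and the handle closed before the move.
    tmp_path = cache_file + "." + str(random.random())
    with open(tmp_path, "wb") as f:
        marshal.dump(obj, f)
    # Then move it into place. os.rename refuses to overwrite an existing
    # target on Windows, hence shutil.move there.
    replace_file = shutil.move if os.name == "nt" else os.rename
    replace_file(tmp_path, cache_file)

cache = os.path.join(tempfile.gettempdir(), "jieba_demo.cache")
dump_cache(({"中": 1}, 100, 0.5), cache)
with open(cache, "rb") as f:
    print(marshal.load(f))
```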

View File

@@ -1,3 +1,10 @@
AT&T 3 nz
B超 3 n
c# 3 nz
C# 3 nz
c++ 3 nz
C++ 3 nz
T恤 4 n
一 217830 m
一一 1670 m
一一二 11 m
@@ -9162,6 +9169,7 @@
不弱 3 a
不强 3 v
不强不弱 3 l
不归路 3 i
不归零制 3 i
不当 657 d
不当不正 3 i
@@ -13936,7 +13944,7 @@
中准价 3 nz
中凯 3 nz
中凯文化 3 ns
中出 3 nt
中出 3 vn
中分 3 n
中切 3 ns
中列明 3 ns
@@ -124944,6 +124952,7 @@
屋顶风机 3 n
屋项 3 n
屌 38 zg
屌丝 13 n
屍 27 zg
屎 523 n
屎壳郎 7 n
@@ -138428,7 +138437,6 @@
张丁华 4 nr
张万仙 2 nr
张万年 64 nr
张三 288 nr
张三丰 436 nr
张三之 2 nr
张三影 2 nr
@@ -146275,7 +146283,7 @@
性丑闻 3 i
性事 15 n
性亢奋 3 i
性交 193 n
性交 29 n
性交关系 3 n
性交图 3 n
性交时 3 n
@@ -245163,6 +245171,8 @@
皂鞋 3 n
的 318825 uj
的一确二 3 l
的哥 63 n
的士 20 n
的士高 3 n
的的确确 64 d
的确 2135 d
@@ -245173,8 +245183,6 @@
的里雅斯特 23 ns
的里雅斯特市 3 ns
的黎波里 62 ns
的哥 63 n
的士 20 n
皆 7511 d
皆佳 3 nrt
皆准 3 i
@@ -275108,6 +275116,7 @@
美男计 3 nz
美登木 5 nr
美登素 5 nr
美的 230 nr
美目 3 n
美目盼兮 6 i
美眉 44 ns
@@ -358737,6 +358746,7 @@
高妈 3 nr
高妙 16 a
高妹 3 n
高富帅 13 n
高姓大名 3 nr
高姿态 10 l
高威达 3 nr
@@ -367421,10 +367431,3 @@
龟龙麟凤 3 ns
龠 5 g
龢 732 zg
B超 3 n
T恤 4 n
C++ 3 nz
c++ 3 nz
C# 3 nz
c# 3 nz
AT&T 3 nz

View File

@@ -1,4 +1,4 @@
{'B': {'E': -0.16037786260859094, 'M': -1.9093400568760384},
{'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}
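These values look like the natural-log transition probabilities of the finalseg HMM over the B/M/E/S character tags; the retrained first row moves P(B→E)/P(B→M) from roughly 0.85/0.15 to 0.6/0.4. A quick sanity check of that reading (assuming the values really are log probabilities):

```python
import math

old = {'E': -0.16037786260859094, 'M': -1.9093400568760384}
new = {'E': -0.510825623765990, 'M': -0.916290731874155}

for name, row in (("old", old), ("new", new)):
    probs = {k: round(math.exp(v), 3) for k, v in row.items()}
    print(name, probs, "sum =", round(sum(probs.values()), 3))
# old {'E': 0.852, 'M': 0.148} sum = 1.0
# new {'E': 0.6, 'M': 0.4} sum = 1.0
```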

View File

@@ -106,9 +106,13 @@ def __cut_DAG(sentence):
yield pair(buf,word_tag_tab.get(buf,'x'))
buf=''
else:
regognized = __cut_detail(buf)
for t in regognized:
yield t
if not (buf in jieba.FREQ):
regognized = __cut_detail(buf)
for t in regognized:
yield t
else:
for elem in buf:
yield pair(elem,word_tag_tab.get(elem,'x'))
buf=''
yield pair(l_word,word_tag_tab.get(l_word,'x'))
x =y
@@ -117,10 +121,13 @@ def __cut_DAG(sentence):
if len(buf)==1:
yield pair(buf,word_tag_tab.get(buf,'x'))
else:
regognized = __cut_detail(buf)
for t in regognized:
yield t
if not (buf in jieba.FREQ):
regognized = __cut_detail(buf)
for t in regognized:
yield t
else:
for elem in buf:
yield pair(elem,word_tag_tab.get(elem,'x'))
def __cut_internal(sentence):
if not ( type(sentence) is str):
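The posseg change mirrors the __cut_DAG change above: an unmatched buffer is only handed to __cut_detail when it is not already in the dictionary, otherwise it is emitted character by character, and every yielded item is a pair carrying the word and its tag. For orientation, the pairs expose .word and .flag, as the test suite further down also relies on:

```python
import jieba.posseg as pseg

# Prints each segment together with its part-of-speech flag.
for w in pseg.cut("我爱北京天安门"):
    print(w.word, w.flag)
```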

View File

@@ -1,6 +1,6 @@
from distutils.core import setup
setup(name='jieba',
version='0.28.1',
version='0.29',
description='Chinese Word Segmentation Utilities',
author='Sun, Junyi',
author_email='ccnusjy@gmail.com',

View File

@@ -5,10 +5,10 @@ import jieba
import jieba.analyse
from optparse import OptionParser
USAGE ="usage: python extract_tags.py [file name] -k [top k]"
USAGE = "usage: python extract_tags.py [file name] -k [top k]"
parser = OptionParser(USAGE)
parser.add_option("-k",dest="topK")
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()
@@ -18,15 +18,14 @@ if len(args) <1:
file_name = args[0]
if opt.topK==None:
topK=10
if opt.topK is None:
topK = 10
else:
topK = int(opt.topK)
topK = int(opt.topK)
content = open(file_name, 'rb').read()
content = open(file_name,'rb').read()
tags = jieba.analyse.extract_tags(content,topK=topK)
tags = jieba.analyse.extract_tags(content, topK=topK)
print(",".join(tags))

test/extract_topic.py (new file, 63 lines)
View File

@@ -0,0 +1,63 @@
import sys
sys.path.append("../")
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import decomposition
import jieba
import time
import glob
import sys
import os
import random
if len(sys.argv)<2:
print "usage: extract_topic.py directory [n_topic] [n_top_words]"
sys.exit(0)
n_topic = 10
n_top_words = 25
if len(sys.argv)>2:
n_topic = int(sys.argv[2])
if len(sys.argv)>3:
n_top_words = int(sys.argv[3])
count_vect = CountVectorizer()
docs = []
pattern = os.path.join(sys.argv[1],"*.txt")
print "read "+pattern
for f_name in glob.glob(pattern):
with open(f_name) as f:
print "read file:", f_name
for line in f: #one line as a document
words = " ".join(jieba.cut(line))
docs.append(words)
random.shuffle(docs)
print "read done."
print "transform"
counts = count_vect.fit_transform(docs)
tfidf = TfidfTransformer().fit_transform(counts)
print(tfidf.shape)
t0 = time.time()
print "training..."
nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
print("done in %0.3fs." % (time.time() - t0))
# Inverse the vectorizer vocabulary to be able
feature_names = count_vect.get_feature_names()
for topic_idx, topic in enumerate(nmf.components_):
print("Topic #%d:" % topic_idx)
print(" ".join([feature_names[i]
for i in topic.argsort()[:-n_top_words - 1:-1]]))
print("")

test/jieba_test.py (new file, 153 lines)
View File

@@ -0,0 +1,153 @@
#-*-coding: utf-8 -*-
import sys
sys.path.append("../")
import unittest
import types
import importlib
import jieba
jieba.initialize()
test_contents = [
"这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。",
"我不喜欢日本和服。",
"雷猴回归人间。",
"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
"我需要廉租房",
"永和服装饰品有限公司",
"我爱北京天安门",
"abc",
"隐马尔可夫",
"雷猴是个好网站",
"“Microsoft”一词由“MICROcomputer微型计算机”和“SOFTware软件”两部分组成",
"草泥马和欺实马是今年的流行词汇",
"伊藤洋华堂总府店",
"中国科学院计算技术研究所",
"罗密欧与朱丽叶",
"我购买了道具和服装",
"PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍",
"湖北省石首市",
"湖北省十堰市",
"总经理完成了这件事情",
"电脑修好了",
"做好了这件事情就一了百了了",
"人们审美的观点是不同的",
"我们买了一个美的空调",
"线程初始化时我们要注意",
"一个分子是由好多原子组织成的",
"祝你马到功成",
"他掉进了无底洞里",
"中国的首都是北京",
"孙君意",
"外交部发言人马朝旭",
"领导人会议和第四届东亚峰会",
"在过去的这五年",
"还需要很长的路要走",
"60周年首都阅兵",
"你好人们审美的观点是不同的",
"买水果然后来世博园",
"买水果然后去世博园",
"但是后来我才知道你是对的",
"存在即合理",
"的的的的的在的的的的就以和和和",
"I love你不以为耻反以为rong",
"",
"",
"hello你好人们审美的观点是不同的",
"很好但主要是基于网页形式",
"hello你好人们审美的观点是不同的",
"为什么我不能拥有想要的生活",
"后来我才",
"此次来中国是为了",
"使用了它就可以解决一些问题",
",使用了它就可以解决一些问题",
"其实使用了它就可以解决一些问题",
"好人使用了它就可以解决一些问题",
"是因为和国家",
"老年搜索还支持",
"干脆就把那部蒙人的闲法给废了拉倒RT @laoshipukong : 27日全国人大常委会第三次审议侵权责任法草案删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ",
"",
"",
"他说的确实在理",
"长春市长春节讲话",
"结婚的和尚未结婚的",
"结合成分子时",
"旅游和服务是最好的",
"这件事情的确是我的错",
"供大家参考指正",
"哈尔滨政府公布塌桥原因",
"我在机场入口处",
"邢永臣摄影报道",
"BP神经网络如何训练才能在分类时增加区分度",
"南京市长江大桥",
"应一些使用者的建议也为了便于利用NiuTrans用于SMT研究",
'长春市长春药店',
'邓颖超生前最喜欢的衣服',
'胡锦涛是热爱世界和平的政治局常委',
'程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪',
'一次性交多少钱',
'两块五一套,三块八一斤,四块七一本,五块六一条',
'小和尚留了一个像大和尚一样的和尚头',
'我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站',
'张晓梅去人民医院做了个B超然后去买了件T恤',
'AT&T是一件不错的公司给你发offer了吗',
'C++和c#是什么关系11+122=133是吗PI=3.14159',
'你认识那个和主席握手的的哥吗?他开一辆黑色的士。',
'枪杆子中出政权']
class JiebaTestCase(unittest.TestCase):
def setUp(self):
importlib.reload(jieba)  # reload is not a builtin in Python 3
def tearDown(self):
pass
def testDefaultCut(self):
for content in test_contents:
result = jieba.cut(content)
assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
result = list(result)
assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
print(" , ".join(result), file=sys.stderr)
print("testDefaultCut", file=sys.stderr)
def testCutAll(self):
for content in test_contents:
result = jieba.cut(content, cut_all=True)
assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
result = list(result)
assert isinstance(result, list), "Test CutAll error on content: %s" % content
print(" , ".join(result), file=sys.stderr)
print("testCutAll", file=sys.stderr)
def testSetDictionary(self):
jieba.set_dictionary("foobar.txt")
for content in test_contents:
result = jieba.cut(content)
assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
result = list(result)
assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
print(" , ".join(result), file=sys.stderr)
print("testSetDictionary", file=sys.stderr)
def testCutForSearch(self):
for content in test_contents:
result = jieba.cut_for_search(content)
assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
result = list(result)
assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
print(" , ".join(result), file=sys.stderr)
print("testCutForSearch", file=sys.stderr)
def testPosseg(self):
import jieba.posseg as pseg
for content in test_contents:
result = pseg.cut(content)
assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
result = list(result)
assert isinstance(result, list), "Test Posseg error on content: %s" % content
print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
print("testPosseg", file=sys.stderr)
if __name__ == "__main__":
unittest.main()
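A minimal way to drive this suite programmatically, assuming jieba_test.py is importable (for example when run from the test/ directory):

```python
import unittest

# Load and run all JiebaTestCase tests with verbose output.
suite = unittest.defaultTestLoader.loadTestsFromName("jieba_test")
unittest.TextTestRunner(verbosity=2).run(suite)
```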

View File

@@ -5,7 +5,8 @@ import jieba
jieba.load_userdict("userdict.txt")
import jieba.posseg as pseg
test_sent = "李小福是创新办主任也是云计算方面的专家"
test_sent = "李小福是创新办主任也是云计算方面的专家;"
test_sent += "例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
words = jieba.cut(test_sent)
for w in words:
print(w)

View File

@@ -2,4 +2,5 @@
李小福 2 nr
创新办 3 i
easy_install 3 eng
好用 300
好用 300
韩玉赏鉴 3 nz
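Each userdict.txt line is `word frequency [part-of-speech tag]`, with the tag optional (as in the 好用 300 entry). A short sketch of loading it, assuming the file sits next to the script as in the test above:

```python
import jieba

jieba.load_userdict("userdict.txt")  # one "word freq [tag]" entry per line
words = jieba.cut("李小福是创新办主任也是云计算方面的专家")
print(" / ".join(words))
# With the custom entries loaded, the intent of the test is that words such
# as 李小福 and 创新办 come out as single tokens.
```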