first py3k version of jieba

Author: Sun Junyi
Date: 2012-11-28 10:50:40 +08:00
Commit: 9c07d80edb (parent e0bd9a6a50)
13 changed files with 41 additions and 78 deletions
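The port applies the standard Python 2 → 3 idioms throughout: print statements become the print() function, iteritems()/itervalues() become items()/values(), xrange becomes range, the unicode type check becomes a bytes check (every Python 3 str is already Unicode), ur"..." literals become plain r"...", and intra-package imports become explicit relative imports. A minimal runnable sketch of these idioms (illustrative only, not lifted from the diff):

    import sys

    d = {"a": 1, "b": 2}
    for k, v in d.items():            # py2: d.iteritems()
        print(k, v, file=sys.stderr)  # py2: print >> sys.stderr, k, v

    for i in range(3):                # py2: xrange(3)
        pass

    s = b"\xe4\xb8\xad\xe6\x96\x87"   # UTF-8 bytes for Chinese text
    if type(s) is bytes:              # py2 checked: not (type(s) is unicode)
        s = s.decode("utf-8")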

View File

@@ -2,10 +2,8 @@ import re
 import math
 import os,sys
 import pprint
-import finalseg
+from . import finalseg
 import time
-import tempfile
-import marshal
 
 FREQ = {}
 total =0.0
@@ -31,33 +29,23 @@ def gen_trie(f_name):
 _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
-print >> sys.stderr, "Building Trie..."
+print("Building Trie...",file=sys.stderr)
 t1 = time.time()
-cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache")
-load_from_cache_fail = True
-if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(os.path.join(_curpath,"dict.txt")):
-    print >> sys.stderr, "loading model from cache"
-    try:
-        trie,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
-        load_from_cache_fail = False
-    except:
-        load_from_cache_fail = True
-if load_from_cache_fail:
-    trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt"))
-    FREQ = dict([(k,float(v)/total) for k,v in FREQ.iteritems()]) #normalize
-    min_freq = min(FREQ.itervalues())
-    print >> sys.stderr, "dumping model to file cache"
-    marshal.dump((trie,FREQ,total,min_freq),open(cache_file,'wb'))
-print >> sys.stderr, "loading model cost ", time.time() - t1, "seconds."
-print >> sys.stderr, "Trie has been built succesfully."
+trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt"))
+FREQ = dict([(k,float(v)/total) for k,v in FREQ.items()]) #normalize
+min_freq = min(FREQ.values())
+print("dumping model to file cache",file=sys.stderr)
+print("loading model cost ", time.time() - t1, "seconds." ,file=sys.stderr)
+print("Trie has been built succesfully.",file=sys.stderr)
 
 def __cut_all(sentence):
     dag = get_DAG(sentence)
     old_j = -1
-    for k,L in dag.iteritems():
+    for k,L in dag.items():
         if len(L)==1 and k>old_j:
             yield sentence[k:L[0]+1]
             old_j = L[0]
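Note that this hunk also drops the marshal cache: the Python 2 version reused a pre-built model from jieba.cache in the temp directory whenever it was newer than dict.txt, while the Python 3 port always rebuilds the trie at import time (leaving the "dumping model to file cache" message orphaned). Roughly, the removed logic followed this pattern (a sketch; load_or_build is a hypothetical wrapper, not a function in the source):

    import marshal, os, tempfile

    def load_or_build(dict_path, gen_trie):
        cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
        # Reuse the cache only if it is newer than the dictionary it was built from.
        if os.path.exists(cache_file) and \
                os.path.getmtime(cache_file) > os.path.getmtime(dict_path):
            return marshal.load(open(cache_file, "rb"))
        trie, FREQ, total = gen_trie(dict_path)
        FREQ = {k: float(v) / total for k, v in FREQ.items()}  # normalize
        min_freq = min(FREQ.values())
        marshal.dump((trie, FREQ, total, min_freq), open(cache_file, "wb"))
        return trie, FREQ, total, min_freq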
@@ -70,7 +58,7 @@ def __cut_all(sentence):
 
 def calc(sentence,DAG,idx,route):
     N = len(sentence)
     route[N] = (1.0,'')
-    for idx in xrange(N-1,-1,-1):
+    for idx in range(N-1,-1,-1):
         candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) * route[x+1][0],x ) for x in DAG[idx] ]
         route[idx] = max(candidates)
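calc picks a max-probability segmentation by dynamic programming from right to left: route[idx] stores the best (score, word-end) pair for the suffix starting at idx, scoring each candidate word by its frequency times the best score of whatever follows it. A self-contained sketch with a toy dictionary (hypothetical FREQ and DAG values, not jieba's):

    FREQ, min_freq = {"北京": 0.01, "大学": 0.008, "北京大学": 0.005}, 1e-9
    sentence = "北京大学"
    DAG = {0: [1, 3], 1: [1], 2: [3], 3: [3]}  # index -> possible word-end indices
    route = {len(sentence): (1.0, '')}
    for idx in range(len(sentence) - 1, -1, -1):
        route[idx] = max((FREQ.get(sentence[idx:x + 1], min_freq) * route[x + 1][0], x)
                         for x in DAG[idx])
    print(route[0])  # best score and end index of the first word: picks 北京大学 here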
@@ -96,7 +84,7 @@ def get_DAG(sentence):
                 p = trie
                 i+=1
                 j=i
-    for i in xrange(len(sentence)):
+    for i in range(len(sentence)):
         if not i in DAG:
             DAG[i] =[i]
     return DAG
@@ -136,12 +124,12 @@ def __cut_DAG(sentence):
 
 def cut(sentence,cut_all=False):
-    if not ( type(sentence) is unicode):
+    if( type(sentence) is bytes):
         try:
             sentence = sentence.decode('utf-8')
         except:
             sentence = sentence.decode('gbk','ignore')
-    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
+    re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n]")
     blocks = re_han.split(sentence)
     cut_block = __cut_DAG
     if cut_all:
@@ -161,12 +149,12 @@ def cut_for_search(sentence):
     words = cut(sentence)
     for w in words:
         if len(w)>2:
-            for i in xrange(len(w)-1):
+            for i in range(len(w)-1):
                 gram2 = w[i:i+2]
                 if gram2 in FREQ:
                     yield gram2
         if len(w)>3:
-            for i in xrange(len(w)-2):
+            for i in range(len(w)-2):
                 gram3 = w[i:i+3]
                 if gram3 in FREQ:
                     yield gram3

View File

@@ -19,7 +19,7 @@ def extract_tags(sentence,topK=20):
         if len(w.strip())<2: continue
         freq[w]=freq.get(w,0.0)+1.0
     total = sum(freq.values())
-    freq = [(k,v/total) for k,v in freq.iteritems()]
+    freq = [(k,v/total) for k,v in freq.items()]
     tf_idf_list = [(v * idf_freq.get(k,max_idf),k) for k,v in freq]
     st_list = sorted(tf_idf_list,reverse=True)
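extract_tags thus ranks words by plain TF-IDF: normalized in-document frequency times a per-word IDF, with max_idf standing in for words absent from the IDF table. A toy illustration (hypothetical idf_freq values):

    idf_freq, max_idf = {"北京": 5.0, "大学": 4.0}, 12.0  # hypothetical IDF table
    freq = {"北京": 2.0, "大学": 1.0, "清华": 1.0}        # raw counts from the cut() loop
    total = sum(freq.values())
    tf_idf_list = [((v / total) * idf_freq.get(k, max_idf), k) for k, v in freq.items()]
    print(sorted(tf_idf_list, reverse=True))  # unseen "清华" falls back to max_idf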

View File

@@ -55,7 +55,7 @@ def cut(sentence):
         sentence = sentence.decode('utf-8')
     except:
         sentence = sentence.decode('gbk','ignore')
-    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
+    re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n]")
     blocks = re_han.split(sentence)
     for blk in blocks:
         if re_han.match(blk):

View File

@@ -1,6 +1,6 @@
 import re
 import os
-import viterbi
+from . import viterbi
 import jieba
 import sys
 default_encoding = sys.getfilesystemencoding()
@@ -15,8 +15,9 @@ def load_model(f_name):
     for line in open(prob_p_path,"rb"):
         line = line.strip()
         if line=="":continue
-        word, _, tag = line.split(' ')
-        result[word.decode('utf-8')]=tag
+        line = line.decode("utf-8")
+        word, _, tag = line.split(" ")
+        result[word]=tag
     return result
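The decode-before-split order matters in Python 3: iterating a file opened in "rb" mode yields bytes, and bytes cannot be split with a str separator. A quick runnable illustration of why the old line fails:

    line = b"word 3 tag"
    try:
        line.split(' ')  # py3: TypeError, bytes need a bytes separator
    except TypeError as e:
        print(e)
    word, _, tag = line.decode("utf-8").split(" ")  # decode first, then split
    print(word, tag)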
@@ -95,13 +96,13 @@ def __cut_DAG(sentence):
 
 def cut(sentence):
-    if not ( type(sentence) is unicode):
+    if ( type(sentence) is bytes):
         try:
             sentence = sentence.decode('utf-8')
         except:
             sentence = sentence.decode('gbk','ignore')
-    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n%]")
-    re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
+    re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n%]")
+    re_eng,re_num = re.compile(r"[a-zA-Z+#]+"), re.compile(r"[0-9]+")
     blocks = re_han.split(sentence)
     for blk in blocks:

View File

@ -5,13 +5,13 @@ sys.path.append("../")
import jieba import jieba
seg_list = jieba.cut("我来到北京清华大学",cut_all=True) seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
print "Full Mode:", "/ ".join(seg_list) #全模式 print("Full Mode:", "/ ".join(seg_list)) #全模式
seg_list = jieba.cut("我来到北京清华大学",cut_all=False) seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
print "Default Mode:", "/ ".join(seg_list) #默认模式 print("Default Mode:", "/ ".join(seg_list)) #默认模式
seg_list = jieba.cut("他来到了网易杭研大厦") seg_list = jieba.cut("他来到了网易杭研大厦")
print ", ".join(seg_list) print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式 seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
print ", ".join(seg_list) print(", ".join(seg_list))

View File

@@ -13,7 +13,7 @@ opt, args = parser.parse_args()
 
 if len(args) <1:
-    print USAGE
+    print(USAGE)
     sys.exit(1)
 
 file_name = args[0]
@@ -28,6 +28,6 @@ content = open(file_name,'rb').read()
 tags = jieba.analyse.extract_tags(content,topK=topK)
 
-print ",".join(tags)
+print(",".join(tags))

View File

@@ -23,6 +23,6 @@ while True:
         break
     line = line.strip()
     for word in jieba.cut(line):
-        print word.encode(default_encoding)
+        print(word.encode(default_encoding))

View File

@@ -5,9 +5,7 @@ import jieba
 
 def cuttest(test_sent):
     result = jieba.cut(test_sent)
-    for word in result:
-        print word, "/",
-    print ""
+    print("/ ".join(result))
 
 if __name__ == "__main__":

View File

@@ -5,9 +5,7 @@ import jieba
 
 def cuttest(test_sent):
     result = jieba.cut(test_sent,cut_all=True)
-    for word in result:
-        print word, "/",
-    print ""
+    print("/ ".join(result))
 
 if __name__ == "__main__":

View File

@@ -5,9 +5,8 @@ import jieba
 
 def cuttest(test_sent):
     result = jieba.cut_for_search(test_sent)
-    for word in result:
-        print word, "/",
-    print ""
+    print("/ ".join(result))
 
 if __name__ == "__main__":

View File

@@ -1,4 +1,3 @@
-import urllib2
 import sys,time
 import sys
 sys.path.append("../")
@@ -13,8 +12,8 @@ t2 = time.time()
 tm_cost = t2-t1
 
 log_f = open("1.log","wb")
-for w in words:
-    print >> log_f, w.encode("gbk"), "/" ,
-print 'speed' , len(content)/tm_cost, " bytes/second"
+log_f.write(bytes("/ ".join(words),'utf-8'))
+print('speed' , len(content)/tm_cost, " bytes/second")
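A file opened in "wb" mode accepts only bytes in Python 3, and the print >> log_f chevron syntax is gone, so the port joins all the words and encodes once instead of printing word by word. A minimal sketch of the pattern (placeholder words):

    words = ["我", "来到", "北京"]  # placeholder segmentation output
    with open("1.log", "wb") as log_f:  # binary mode expects bytes
        log_f.write(bytes("/ ".join(words), "utf-8"))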

View File

@@ -6,8 +6,8 @@ import jieba.posseg as pseg
 
 def cuttest(test_sent):
     result = pseg.cut(test_sent)
     for w in result:
-        print w.word, "/", w.flag, ", ",
-    print ""
+        sys.stdout.write(w.word+ "/"+ w.flag + ", ")
+    print("")
 
 if __name__ == "__main__":

View File

@@ -1,20 +0,0 @@
-import urllib2
-import sys,time
-import sys
-sys.path.append("../")
-import jieba.posseg as pseg
-url = sys.argv[1]
-content = open(url,"rb").read()
-t1 = time.time()
-words = list(pseg.cut(content))
-t2 = time.time()
-tm_cost = t2-t1
-log_f = open("1.log","wb")
-for w in words:
-    print >> log_f, w.encode("gbk"), "/" ,
-print 'speed' , len(content)/tm_cost, " bytes/second"