mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
first py3k version of jieba
This commit is contained in:
parent
e0bd9a6a50
commit
9c07d80edb
@ -2,10 +2,8 @@ import re
|
|||||||
import math
|
import math
|
||||||
import os,sys
|
import os,sys
|
||||||
import pprint
|
import pprint
|
||||||
import finalseg
|
from . import finalseg
|
||||||
import time
|
import time
|
||||||
import tempfile
|
|
||||||
import marshal
|
|
||||||
|
|
||||||
FREQ = {}
|
FREQ = {}
|
||||||
total =0.0
|
total =0.0
|
||||||
@ -31,33 +29,23 @@ def gen_trie(f_name):
|
|||||||
|
|
||||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||||
|
|
||||||
print >> sys.stderr, "Building Trie..."
|
print("Building Trie...",file=sys.stderr)
|
||||||
|
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache")
|
|
||||||
load_from_cache_fail = True
|
|
||||||
if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(os.path.join(_curpath,"dict.txt")):
|
|
||||||
print >> sys.stderr, "loading model from cache"
|
|
||||||
try:
|
|
||||||
trie,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
|
|
||||||
load_from_cache_fail = False
|
|
||||||
except:
|
|
||||||
load_from_cache_fail = True
|
|
||||||
|
|
||||||
if load_from_cache_fail:
|
trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt"))
|
||||||
trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt"))
|
FREQ = dict([(k,float(v)/total) for k,v in FREQ.items()]) #normalize
|
||||||
FREQ = dict([(k,float(v)/total) for k,v in FREQ.iteritems()]) #normalize
|
min_freq = min(FREQ.values())
|
||||||
min_freq = min(FREQ.itervalues())
|
print("dumping model to file cache",file=sys.stderr)
|
||||||
print >> sys.stderr, "dumping model to file cache"
|
|
||||||
marshal.dump((trie,FREQ,total,min_freq),open(cache_file,'wb'))
|
|
||||||
|
|
||||||
print >> sys.stderr, "loading model cost ", time.time() - t1, "seconds."
|
print("loading model cost ", time.time() - t1, "seconds." ,file=sys.stderr)
|
||||||
print >> sys.stderr, "Trie has been built succesfully."
|
print("Trie has been built succesfully.",file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
def __cut_all(sentence):
|
def __cut_all(sentence):
|
||||||
dag = get_DAG(sentence)
|
dag = get_DAG(sentence)
|
||||||
old_j = -1
|
old_j = -1
|
||||||
for k,L in dag.iteritems():
|
for k,L in dag.items():
|
||||||
if len(L)==1 and k>old_j:
|
if len(L)==1 and k>old_j:
|
||||||
yield sentence[k:L[0]+1]
|
yield sentence[k:L[0]+1]
|
||||||
old_j = L[0]
|
old_j = L[0]
|
||||||
@ -70,7 +58,7 @@ def __cut_all(sentence):
|
|||||||
def calc(sentence,DAG,idx,route):
|
def calc(sentence,DAG,idx,route):
|
||||||
N = len(sentence)
|
N = len(sentence)
|
||||||
route[N] = (1.0,'')
|
route[N] = (1.0,'')
|
||||||
for idx in xrange(N-1,-1,-1):
|
for idx in range(N-1,-1,-1):
|
||||||
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) * route[x+1][0],x ) for x in DAG[idx] ]
|
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) * route[x+1][0],x ) for x in DAG[idx] ]
|
||||||
route[idx] = max(candidates)
|
route[idx] = max(candidates)
|
||||||
|
|
||||||
@ -96,7 +84,7 @@ def get_DAG(sentence):
|
|||||||
p = trie
|
p = trie
|
||||||
i+=1
|
i+=1
|
||||||
j=i
|
j=i
|
||||||
for i in xrange(len(sentence)):
|
for i in range(len(sentence)):
|
||||||
if not i in DAG:
|
if not i in DAG:
|
||||||
DAG[i] =[i]
|
DAG[i] =[i]
|
||||||
return DAG
|
return DAG
|
||||||
@ -136,12 +124,12 @@ def __cut_DAG(sentence):
|
|||||||
|
|
||||||
|
|
||||||
def cut(sentence,cut_all=False):
|
def cut(sentence,cut_all=False):
|
||||||
if not ( type(sentence) is unicode):
|
if( type(sentence) is bytes):
|
||||||
try:
|
try:
|
||||||
sentence = sentence.decode('utf-8')
|
sentence = sentence.decode('utf-8')
|
||||||
except:
|
except:
|
||||||
sentence = sentence.decode('gbk','ignore')
|
sentence = sentence.decode('gbk','ignore')
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
|
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n]")
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
cut_block = __cut_DAG
|
cut_block = __cut_DAG
|
||||||
if cut_all:
|
if cut_all:
|
||||||
@ -161,12 +149,12 @@ def cut_for_search(sentence):
|
|||||||
words = cut(sentence)
|
words = cut(sentence)
|
||||||
for w in words:
|
for w in words:
|
||||||
if len(w)>2:
|
if len(w)>2:
|
||||||
for i in xrange(len(w)-1):
|
for i in range(len(w)-1):
|
||||||
gram2 = w[i:i+2]
|
gram2 = w[i:i+2]
|
||||||
if gram2 in FREQ:
|
if gram2 in FREQ:
|
||||||
yield gram2
|
yield gram2
|
||||||
if len(w)>3:
|
if len(w)>3:
|
||||||
for i in xrange(len(w)-2):
|
for i in range(len(w)-2):
|
||||||
gram3 = w[i:i+3]
|
gram3 = w[i:i+3]
|
||||||
if gram3 in FREQ:
|
if gram3 in FREQ:
|
||||||
yield gram3
|
yield gram3
|
||||||
|
@ -19,7 +19,7 @@ def extract_tags(sentence,topK=20):
|
|||||||
if len(w.strip())<2: continue
|
if len(w.strip())<2: continue
|
||||||
freq[w]=freq.get(w,0.0)+1.0
|
freq[w]=freq.get(w,0.0)+1.0
|
||||||
total = sum(freq.values())
|
total = sum(freq.values())
|
||||||
freq = [(k,v/total) for k,v in freq.iteritems()]
|
freq = [(k,v/total) for k,v in freq.items()]
|
||||||
|
|
||||||
tf_idf_list = [(v * idf_freq.get(k,max_idf),k) for k,v in freq]
|
tf_idf_list = [(v * idf_freq.get(k,max_idf),k) for k,v in freq]
|
||||||
st_list = sorted(tf_idf_list,reverse=True)
|
st_list = sorted(tf_idf_list,reverse=True)
|
||||||
|
@ -55,7 +55,7 @@ def cut(sentence):
|
|||||||
sentence = sentence.decode('utf-8')
|
sentence = sentence.decode('utf-8')
|
||||||
except:
|
except:
|
||||||
sentence = sentence.decode('gbk','ignore')
|
sentence = sentence.decode('gbk','ignore')
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
|
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n]")
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
for blk in blocks:
|
for blk in blocks:
|
||||||
if re_han.match(blk):
|
if re_han.match(blk):
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
import viterbi
|
from . import viterbi
|
||||||
import jieba
|
import jieba
|
||||||
import sys
|
import sys
|
||||||
default_encoding = sys.getfilesystemencoding()
|
default_encoding = sys.getfilesystemencoding()
|
||||||
@ -15,8 +15,9 @@ def load_model(f_name):
|
|||||||
for line in open(prob_p_path,"rb"):
|
for line in open(prob_p_path,"rb"):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if line=="":continue
|
if line=="":continue
|
||||||
word, _, tag = line.split(' ')
|
line = line.decode("utf-8")
|
||||||
result[word.decode('utf-8')]=tag
|
word, _, tag = line.split(" ")
|
||||||
|
result[word]=tag
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
@ -95,13 +96,13 @@ def __cut_DAG(sentence):
|
|||||||
|
|
||||||
|
|
||||||
def cut(sentence):
|
def cut(sentence):
|
||||||
if not ( type(sentence) is unicode):
|
if ( type(sentence) is bytes):
|
||||||
try:
|
try:
|
||||||
sentence = sentence.decode('utf-8')
|
sentence = sentence.decode('utf-8')
|
||||||
except:
|
except:
|
||||||
sentence = sentence.decode('gbk','ignore')
|
sentence = sentence.decode('gbk','ignore')
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n%]")
|
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"[^a-zA-Z0-9+#\n%]")
|
||||||
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
|
re_eng,re_num = re.compile(r"[a-zA-Z+#]+"), re.compile(r"[0-9]+")
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
|
|
||||||
for blk in blocks:
|
for blk in blocks:
|
||||||
|
@ -5,13 +5,13 @@ sys.path.append("../")
|
|||||||
import jieba
|
import jieba
|
||||||
|
|
||||||
seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
|
seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
|
||||||
print "Full Mode:", "/ ".join(seg_list) #全模式
|
print("Full Mode:", "/ ".join(seg_list)) #全模式
|
||||||
|
|
||||||
seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
|
seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
|
||||||
print "Default Mode:", "/ ".join(seg_list) #默认模式
|
print("Default Mode:", "/ ".join(seg_list)) #默认模式
|
||||||
|
|
||||||
seg_list = jieba.cut("他来到了网易杭研大厦")
|
seg_list = jieba.cut("他来到了网易杭研大厦")
|
||||||
print ", ".join(seg_list)
|
print(", ".join(seg_list))
|
||||||
|
|
||||||
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
|
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
|
||||||
print ", ".join(seg_list)
|
print(", ".join(seg_list))
|
||||||
|
@ -13,7 +13,7 @@ opt, args = parser.parse_args()
|
|||||||
|
|
||||||
|
|
||||||
if len(args) <1:
|
if len(args) <1:
|
||||||
print USAGE
|
print(USAGE)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
file_name = args[0]
|
file_name = args[0]
|
||||||
@ -28,6 +28,6 @@ content = open(file_name,'rb').read()
|
|||||||
|
|
||||||
tags = jieba.analyse.extract_tags(content,topK=topK)
|
tags = jieba.analyse.extract_tags(content,topK=topK)
|
||||||
|
|
||||||
print ",".join(tags)
|
print(",".join(tags))
|
||||||
|
|
||||||
|
|
||||||
|
@ -23,6 +23,6 @@ while True:
|
|||||||
break
|
break
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
for word in jieba.cut(line):
|
for word in jieba.cut(line):
|
||||||
print word.encode(default_encoding)
|
print(word.encode(default_encoding))
|
||||||
|
|
||||||
|
|
||||||
|
@ -5,9 +5,7 @@ import jieba
|
|||||||
|
|
||||||
def cuttest(test_sent):
|
def cuttest(test_sent):
|
||||||
result = jieba.cut(test_sent)
|
result = jieba.cut(test_sent)
|
||||||
for word in result:
|
print("/ ".join(result))
|
||||||
print word, "/",
|
|
||||||
print ""
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -5,9 +5,7 @@ import jieba
|
|||||||
|
|
||||||
def cuttest(test_sent):
|
def cuttest(test_sent):
|
||||||
result = jieba.cut(test_sent,cut_all=True)
|
result = jieba.cut(test_sent,cut_all=True)
|
||||||
for word in result:
|
print("/ ".join(result))
|
||||||
print word, "/",
|
|
||||||
print ""
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -5,9 +5,8 @@ import jieba
|
|||||||
|
|
||||||
def cuttest(test_sent):
|
def cuttest(test_sent):
|
||||||
result = jieba.cut_for_search(test_sent)
|
result = jieba.cut_for_search(test_sent)
|
||||||
for word in result:
|
print("/ ".join(result))
|
||||||
print word, "/",
|
|
||||||
print ""
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
import urllib2
|
|
||||||
import sys,time
|
import sys,time
|
||||||
import sys
|
import sys
|
||||||
sys.path.append("../")
|
sys.path.append("../")
|
||||||
@ -13,8 +12,8 @@ t2 = time.time()
|
|||||||
tm_cost = t2-t1
|
tm_cost = t2-t1
|
||||||
|
|
||||||
log_f = open("1.log","wb")
|
log_f = open("1.log","wb")
|
||||||
for w in words:
|
|
||||||
print >> log_f, w.encode("gbk"), "/" ,
|
|
||||||
|
|
||||||
print 'speed' , len(content)/tm_cost, " bytes/second"
|
log_f.write(bytes("/ ".join(words),'utf-8'))
|
||||||
|
|
||||||
|
print('speed' , len(content)/tm_cost, " bytes/second")
|
||||||
|
|
||||||
|
@ -6,8 +6,8 @@ import jieba.posseg as pseg
|
|||||||
def cuttest(test_sent):
|
def cuttest(test_sent):
|
||||||
result = pseg.cut(test_sent)
|
result = pseg.cut(test_sent)
|
||||||
for w in result:
|
for w in result:
|
||||||
print w.word, "/", w.flag, ", ",
|
sys.stdout.write(w.word+ "/"+ w.flag + ", ")
|
||||||
print ""
|
print("")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -1,20 +0,0 @@
|
|||||||
import urllib2
|
|
||||||
import sys,time
|
|
||||||
import sys
|
|
||||||
sys.path.append("../")
|
|
||||||
import jieba.posseg as pseg
|
|
||||||
|
|
||||||
url = sys.argv[1]
|
|
||||||
content = open(url,"rb").read()
|
|
||||||
t1 = time.time()
|
|
||||||
words = list(pseg.cut(content))
|
|
||||||
|
|
||||||
t2 = time.time()
|
|
||||||
tm_cost = t2-t1
|
|
||||||
|
|
||||||
log_f = open("1.log","wb")
|
|
||||||
for w in words:
|
|
||||||
print >> log_f, w.encode("gbk"), "/" ,
|
|
||||||
|
|
||||||
print 'speed' , len(content)/tm_cost, " bytes/second"
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user