# mirror of https://github.com/fxsjy/jieba.git
# synced 2025-07-10 00:01:33 +08:00
import re
|
|
import math
|
|
import os,sys
|
|
import pprint
|
|
import finalseg
|
|
import time
|
|
|
|
# word -> frequency table, filled by gen_trie() with raw counts and later
# normalized in place to probabilities at module init time.
FREQ = {}
# Running sum of all raw frequencies; used as the normalization denominator.
total = 0.0
def gen_trie(f_name):
    """Load the dictionary file and build a character-level prefix trie.

    Each line of the file is expected to be "<word> <frequency>" (UTF-8).

    Side effects: fills the module-level FREQ dict with the raw
    frequencies and accumulates their sum into the module-level `total`.

    Returns the trie: nested dicts keyed by single characters, where an
    '' key on a node marks the end of a complete dictionary word.
    """
    global total
    trie = {}
    # Close the file deterministically (the original leaked the handle).
    with open(f_name, 'rb') as f:
        content = f.read().decode('utf-8')
    for line in content.split("\n"):
        if not line:
            # Skip blank lines — in particular the empty string produced
            # by a trailing newline, which previously crashed the unpack.
            continue
        word, freq = line.split(" ")
        freq = float(freq)
        FREQ[word] = freq
        total += freq
        p = trie
        for c in word:
            if c not in p:
                p[c] = {}
            p = p[c]
        p[''] = ''  # ending flag
    return trie
# ---- module initialization: build the trie from the bundled dictionary ----
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

print >> sys.stderr, "Building Trie..."
t1 = time.time()
trie = gen_trie(os.path.join(_curpath, "dict.txt"))
# Turn the raw counts in FREQ into probabilities.
FREQ = dict((k, float(v) / total) for k, v in FREQ.iteritems())  # normalize
# Smallest observed probability; used as the score for unseen words.
min_freq = min(FREQ.itervalues())
print >> sys.stderr, "loading model cost ", time.time() - t1, "seconds."
print >> sys.stderr, "Trie has been built succesfully."
def __cut_all(sentence):
    """Full mode: yield every substring of `sentence` that is a trie word.

    Walks the module-level `trie` with two cursors: `i` marks the start
    of the current candidate and `j` scans forward one character at a
    time.  Whenever the walk reaches a node carrying the '' end-of-word
    flag, sentence[i:j+1] is a complete dictionary word and is yielded.
    Note: characters that never begin a dictionary word are skipped
    silently — single out-of-vocabulary characters are NOT yielded.
    """
    N = len(sentence)
    i,j=0,0
    p = trie  # current trie node for the walk starting at i
    while i<N:
        c = sentence[j]
        if c in p:
            p = p[c]
            if '' in p:
                # a complete word ends at position j
                yield sentence[i:j+1]
            j+=1
            if j>=N:
                # ran off the end: restart the scan one character later
                i+=1
                j=i
                p=trie
        else:
            # no continuation from this node: restart one character later
            p = trie
            i+=1
            j=i
def calc(sentence, DAG, idx, route):
    """Fill `route` with the most probable segmentation path via DP.

    Works right to left: route[i] becomes a tuple
    (best probability of segmenting sentence[i:], end index of the first
    word on that best path).  Words absent from FREQ are scored with the
    module-level min_freq fallback.

    Note: `idx` is unused — the original immediately shadowed it with
    the loop variable; it is kept only for interface compatibility.
    """
    N = len(sentence)
    route[N] = (1.0, '')  # base case: the empty suffix
    for i in xrange(N - 1, -1, -1):
        candidates = [(FREQ.get(sentence[i:x + 1], min_freq) * route[x + 1][0], x)
                      for x in DAG[i]]
        # max on (probability, end-index) tuples; ties go to the longer word
        route[i] = max(candidates)
def __cut_DAG(sentence):
    """Accurate mode: segment `sentence` with a word DAG + dynamic programming.

    Builds a DAG mapping each start index i to the end indices j such
    that sentence[i:j+1] is a dictionary word, picks the most probable
    path with calc(), then yields the words along that path.  Runs of
    consecutive single characters (likely unknown words) are buffered
    and handed to finalseg.__cut for HMM-based recognition.
    """
    N = len(sentence)
    i,j=0,0
    p = trie
    DAG = {}
    # ---- step 1: build the DAG with the same trie scan as __cut_all ----
    while i<N:
        c = sentence[j]
        if c in p:
            p = p[c]
            if '' in p:
                if not i in DAG:
                    DAG[i]=[]
                # dictionary word sentence[i:j+1] ends at j
                DAG[i].append(j)
            j+=1
            if j>=N:
                # ran off the end: restart the scan one character later
                i+=1
                j=i
                p=trie
        else:
            # no continuation: restart one character later
            p = trie
            i+=1
            j=i
    # every position can at least stand alone as a single character
    for i in xrange(len(sentence)):
        if not i in DAG:
            DAG[i] =[i]
    #pprint.pprint(DAG)
    # ---- step 2: best path via dynamic programming ----
    route ={}
    calc(sentence,DAG,0,route=route)
    # ---- step 3: walk the best path, buffering single chars for the HMM ----
    x = 0
    buf =u''  # pending run of consecutive single characters
    while x<N:
        y = route[x][1]+1  # end of the word chosen at position x
        l_word = sentence[x:y]
        if y-x==1:
            buf+= l_word  # single char: accumulate, emit later
        else:
            if len(buf)>0:
                if len(buf)==1:
                    # a lone buffered char: emit as-is
                    yield buf
                    buf=u''
                else:
                    # several unknown chars in a row: let the HMM split them
                    regognized = finalseg.__cut(buf)
                    for t in regognized:
                        yield t
                    buf=u''
            yield l_word
        x =y

    # flush any trailing single-character run
    if len(buf)>0:
        if len(buf)==1:
            yield buf
        else:
            regognized = finalseg.__cut(buf)
            for t in regognized:
                yield t
def cut(sentence,cut_all=False):
|
|
if not ( type(sentence) is unicode):
|
|
try:
|
|
sentence = sentence.decode('utf-8')
|
|
except:
|
|
sentence = sentence.decode('gbk','ignore')
|
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
|
|
blocks = re_han.split(sentence)
|
|
cut_block = __cut_DAG
|
|
if cut_all:
|
|
cut_block = __cut_all
|
|
for blk in blocks:
|
|
if re_han.match(blk):
|
|
#pprint.pprint(__cut_DAG(blk))
|
|
for word in cut_block(blk):
|
|
yield word
|
|
else:
|
|
tmp = re_skip.split(blk)
|
|
for x in tmp:
|
|
if x!="":
|
|
yield x
|