mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-24 00:00:05 +08:00
121 lines
2.7 KiB
Python
121 lines
2.7 KiB
Python
import re
|
|
import os
|
|
import viterbi
|
|
import jieba
|
|
import sys
|
|
default_encoding = sys.getfilesystemencoding()
|
|
|
|
def load_model(f_name):
|
|
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
|
prob_p_path = os.path.join(_curpath,f_name)
|
|
if f_name.endswith(".py"):
|
|
return eval(open(prob_p_path,"rb").read())
|
|
else:
|
|
result = {}
|
|
for line in open(prob_p_path,"rb"):
|
|
line = line.strip()
|
|
if line=="":continue
|
|
word, tag = line.split(' ')
|
|
result[word.decode('utf-8')]=tag
|
|
return result
|
|
|
|
|
|
prob_start = load_model("prob_start.py")
|
|
prob_trans = load_model("prob_trans.py")
|
|
prob_emit = load_model("prob_emit.py")
|
|
char_state_tab = load_model("char_state_tab.py")
|
|
word_tag_tab = load_model("tags.txt")
|
|
|
|
class pair(object):
|
|
def __init__(self,word,flag):
|
|
self.word = word
|
|
self.flag = flag
|
|
|
|
def __unicode__(self):
|
|
return self.word+u"/"+self.flag
|
|
|
|
def __repr__(self):
|
|
return self.__str__()
|
|
|
|
def __str__(self):
|
|
return self.__unicode__().encode(default_encoding)
|
|
|
|
def encode(self,arg):
|
|
return self.__unicode__().encode(arg)
|
|
|
|
def __cut(sentence):
|
|
prob, pos_list = viterbi.viterbi(sentence,char_state_tab, prob_start, prob_trans, prob_emit)
|
|
begin, next = 0,0
|
|
|
|
for i,char in enumerate(sentence):
|
|
pos = pos_list[i][0]
|
|
if pos=='B':
|
|
begin = i
|
|
elif pos=='E':
|
|
yield pair(sentence[begin:i+1], pos_list[i][1])
|
|
next = i+1
|
|
elif pos=='S':
|
|
yield pair(char,pos_list[i][1])
|
|
next = i+1
|
|
if next<len(sentence):
|
|
yield pair(sentence[next:], pos_list[next][1] )
|
|
|
|
def __cut_DAG(sentence):
|
|
DAG = jieba.get_DAG(sentence)
|
|
route ={}
|
|
jieba.calc(sentence,DAG,0,route=route)
|
|
x = 0
|
|
buf =u''
|
|
N = len(sentence)
|
|
while x<N:
|
|
y = route[x][1]+1
|
|
l_word = sentence[x:y]
|
|
if y-x==1:
|
|
buf+= l_word
|
|
else:
|
|
if len(buf)>0:
|
|
if len(buf)==1:
|
|
yield pair(buf,word_tag_tab.get(buf,'x'))
|
|
buf=u''
|
|
else:
|
|
regognized = __cut(buf)
|
|
for t in regognized:
|
|
yield t
|
|
buf=u''
|
|
yield pair(l_word,word_tag_tab.get(l_word,'x'))
|
|
x =y
|
|
|
|
if len(buf)>0:
|
|
if len(buf)==1:
|
|
yield pair(buf,word_tag_tab.get(buf,'x'))
|
|
else:
|
|
regognized = __cut(buf)
|
|
for t in regognized:
|
|
yield t
|
|
|
|
|
|
def cut(sentence):
|
|
if not ( type(sentence) is unicode):
|
|
try:
|
|
sentence = sentence.decode('utf-8')
|
|
except:
|
|
sentence = sentence.decode('gbk','ignore')
|
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n%]")
|
|
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
|
|
blocks = re_han.split(sentence)
|
|
|
|
for blk in blocks:
|
|
if re_han.match(blk):
|
|
for word in __cut_DAG(blk):
|
|
yield word
|
|
else:
|
|
tmp = re_skip.split(blk)
|
|
for x in tmp:
|
|
if x!="":
|
|
if re_num.match(x):
|
|
yield pair(x,'m')
|
|
elif re_eng.match(x):
|
|
yield pair(x,'eng')
|
|
else:
|
|
yield pair(x,'x')
|