mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-24 00:00:05 +08:00
improve speed of tagging
This commit is contained in:
parent
1a2a64a13f
commit
59c3efeb2f
@ -11,9 +11,12 @@ def load_model(f_name):
|
|||||||
if f_name.endswith(".py"):
|
if f_name.endswith(".py"):
|
||||||
return eval(open(prob_p_path,"rb").read())
|
return eval(open(prob_p_path,"rb").read())
|
||||||
else:
|
else:
|
||||||
result = set()
|
result = {}
|
||||||
for line in open(prob_p_path,"rb"):
|
for line in open(prob_p_path,"rb"):
|
||||||
result.add(line.strip().decode('utf-8'))
|
line = line.strip()
|
||||||
|
if line=="":continue
|
||||||
|
word, tag = line.split(' ')
|
||||||
|
result[word.decode('utf-8')]=tag
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
@ -21,6 +24,7 @@ prob_start = load_model("prob_start.py")
|
|||||||
prob_trans = load_model("prob_trans.py")
|
prob_trans = load_model("prob_trans.py")
|
||||||
prob_emit = load_model("prob_emit.py")
|
prob_emit = load_model("prob_emit.py")
|
||||||
char_state_tab = load_model("char_state_tab.py")
|
char_state_tab = load_model("char_state_tab.py")
|
||||||
|
word_tag_tab = load_model("tags.txt")
|
||||||
|
|
||||||
class pair(object):
|
class pair(object):
|
||||||
def __init__(self,word,flag):
|
def __init__(self,word,flag):
|
||||||
@ -39,18 +43,8 @@ class pair(object):
|
|||||||
def encode(self,arg):
|
def encode(self,arg):
|
||||||
return self.__unicode__().encode(arg)
|
return self.__unicode__().encode(arg)
|
||||||
|
|
||||||
def __cut(sentence,tags_limited=False):
|
def __cut(sentence):
|
||||||
limit_tags = None
|
prob, pos_list = viterbi.viterbi(sentence,char_state_tab, prob_start, prob_trans, prob_emit)
|
||||||
if tags_limited:
|
|
||||||
limit_tags = []
|
|
||||||
if len(sentence)==1:
|
|
||||||
limit_tags = ['S']
|
|
||||||
else:
|
|
||||||
limit_tags.append('B')
|
|
||||||
for i in xrange(len(sentence)-2):
|
|
||||||
limit_tags.append('M')
|
|
||||||
limit_tags.append('E')
|
|
||||||
prob, pos_list = viterbi.viterbi(sentence,char_state_tab, prob_start, prob_trans, prob_emit,limit_tags)
|
|
||||||
begin, next = 0,0
|
begin, next = 0,0
|
||||||
|
|
||||||
for i,char in enumerate(sentence):
|
for i,char in enumerate(sentence):
|
||||||
@ -81,20 +75,19 @@ def __cut_DAG(sentence):
|
|||||||
else:
|
else:
|
||||||
if len(buf)>0:
|
if len(buf)>0:
|
||||||
if len(buf)==1:
|
if len(buf)==1:
|
||||||
yield list(__cut(buf))[0]
|
yield pair(buf,word_tag_tab.get(buf,'x'))
|
||||||
buf=u''
|
buf=u''
|
||||||
else:
|
else:
|
||||||
regognized = __cut(buf)
|
regognized = __cut(buf)
|
||||||
for t in regognized:
|
for t in regognized:
|
||||||
yield t
|
yield t
|
||||||
buf=u''
|
buf=u''
|
||||||
for w in __cut(l_word,tags_limited=True):
|
yield pair(l_word,word_tag_tab.get(l_word,'x'))
|
||||||
yield w
|
|
||||||
x =y
|
x =y
|
||||||
|
|
||||||
if len(buf)>0:
|
if len(buf)>0:
|
||||||
if len(buf)==1:
|
if len(buf)==1:
|
||||||
yield list(__cut(buf))[0]
|
yield pair(buf,word_tag_tab.get(buf,'x'))
|
||||||
else:
|
else:
|
||||||
regognized = __cut(buf)
|
regognized = __cut(buf)
|
||||||
for t in regognized:
|
for t in regognized:
|
||||||
@ -108,6 +101,7 @@ def cut(sentence):
|
|||||||
except:
|
except:
|
||||||
sentence = sentence.decode('gbk','ignore')
|
sentence = sentence.decode('gbk','ignore')
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n%]")
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n%]")
|
||||||
|
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
|
|
||||||
for blk in blocks:
|
for blk in blocks:
|
||||||
@ -118,9 +112,9 @@ def cut(sentence):
|
|||||||
tmp = re_skip.split(blk)
|
tmp = re_skip.split(blk)
|
||||||
for x in tmp:
|
for x in tmp:
|
||||||
if x!="":
|
if x!="":
|
||||||
if re.match(ur"[0-9]+",x):
|
if re_num.match(x):
|
||||||
yield pair(x,'m')
|
yield pair(x,'m')
|
||||||
elif re.match(ur"[a-zA-Z+#]+",x):
|
elif re_eng.match(x):
|
||||||
yield pair(x,'eng')
|
yield pair(x,'eng')
|
||||||
else:
|
else:
|
||||||
yield pair(x,'x')
|
yield pair(x,'x')
|
||||||
|
450246
jieba/posseg/near_char_tab.txt
Normal file
450246
jieba/posseg/near_char_tab.txt
Normal file
File diff suppressed because it is too large
Load Diff
406958
jieba/posseg/tags.txt
Normal file
406958
jieba/posseg/tags.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -5,7 +5,7 @@ def get_top_states(t_state_v,K=4):
|
|||||||
topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K]
|
topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K]
|
||||||
return [x[0] for x in topK]
|
return [x[0] for x in topK]
|
||||||
|
|
||||||
def viterbi(obs, states, start_p, trans_p, emit_p,limit_tags):
|
def viterbi(obs, states, start_p, trans_p, emit_p):
|
||||||
V = [{}] #tabular
|
V = [{}] #tabular
|
||||||
mem_path = [{}]
|
mem_path = [{}]
|
||||||
all_states = trans_p.keys()
|
all_states = trans_p.keys()
|
||||||
@ -17,16 +17,11 @@ def viterbi(obs, states, start_p, trans_p, emit_p,limit_tags):
|
|||||||
mem_path.append({})
|
mem_path.append({})
|
||||||
prev_states = get_top_states(V[t-1])
|
prev_states = get_top_states(V[t-1])
|
||||||
prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
|
prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
|
||||||
tmp = prev_states
|
|
||||||
if limit_tags:
|
|
||||||
prev_states = [x for x in prev_states if x[0]==limit_tags[t-1]]
|
|
||||||
if len(prev_states)==0:
|
|
||||||
prev_states = tmp
|
|
||||||
prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) )
|
prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) )
|
||||||
obs_states = states.get(obs[t],all_states)
|
obs_states = states.get(obs[t],all_states)
|
||||||
obs_states = set(obs_states) & set(prev_states_expect_next)
|
obs_states = set(obs_states) & set(prev_states_expect_next)
|
||||||
if limit_tags:
|
|
||||||
obs_states = [x for x in obs_states if x[0]==limit_tags[t]]
|
|
||||||
if len(obs_states)==0: obs_states = all_states
|
if len(obs_states)==0: obs_states = all_states
|
||||||
for y in obs_states:
|
for y in obs_states:
|
||||||
(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in prev_states])
|
(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in prev_states])
|
||||||
|
1990
test/1.log
Normal file
1990
test/1.log
Normal file
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user