mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
improve POS tagging
This commit is contained in:
parent
7612a62115
commit
90cd4b3014
@ -82,7 +82,7 @@ def calc(sentence,DAG,idx,route):
|
|||||||
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) * route[x+1][0],x ) for x in DAG[idx] ]
|
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) * route[x+1][0],x ) for x in DAG[idx] ]
|
||||||
route[idx] = max(candidates)
|
route[idx] = max(candidates)
|
||||||
|
|
||||||
def __cut_DAG(sentence):
|
def get_DAG(sentence):
|
||||||
N = len(sentence)
|
N = len(sentence)
|
||||||
i,j=0,0
|
i,j=0,0
|
||||||
p = trie
|
p = trie
|
||||||
@ -107,11 +107,15 @@ def __cut_DAG(sentence):
|
|||||||
for i in xrange(len(sentence)):
|
for i in xrange(len(sentence)):
|
||||||
if not i in DAG:
|
if not i in DAG:
|
||||||
DAG[i] =[i]
|
DAG[i] =[i]
|
||||||
#pprint.pprint(DAG)
|
return DAG
|
||||||
|
|
||||||
|
def __cut_DAG(sentence):
|
||||||
|
DAG = get_DAG(sentence)
|
||||||
route ={}
|
route ={}
|
||||||
calc(sentence,DAG,0,route=route)
|
calc(sentence,DAG,0,route=route)
|
||||||
x = 0
|
x = 0
|
||||||
buf =u''
|
buf =u''
|
||||||
|
N = len(sentence)
|
||||||
while x<N:
|
while x<N:
|
||||||
y = route[x][1]+1
|
y = route[x][1]+1
|
||||||
l_word = sentence[x:y]
|
l_word = sentence[x:y]
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
import viterbi
|
import viterbi
|
||||||
|
import jieba
|
||||||
|
|
||||||
|
|
||||||
def load_model(f_name):
|
def load_model(f_name):
|
||||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||||
@ -19,9 +21,32 @@ prob_trans = load_model("prob_trans.py")
|
|||||||
prob_emit = load_model("prob_emit.py")
|
prob_emit = load_model("prob_emit.py")
|
||||||
char_state_tab = load_model("char_state_tab.py")
|
char_state_tab = load_model("char_state_tab.py")
|
||||||
|
|
||||||
|
class pair(object):
    """A segmented word together with the part-of-speech flag assigned to it.

    Attributes:
        word: the token text.
        flag: the tag attached to the token (e.g. 'm', 'eng', 'x').
    """

    def __init__(self,word,flag):
        self.word = word
        self.flag = flag

    def __unicode__(self):
        """Return the token rendered as ``word/flag``."""
        return self.word+u"/"+self.flag

    def __repr__(self):
        """Return ``word/flag`` as a native ``str``.

        Python 2 requires ``__repr__`` to return a byte string; a unicode
        result is implicitly encoded as ASCII, which raises
        UnicodeEncodeError for non-ASCII words. Escape such characters
        instead of failing. On Python 3 the text is returned unchanged.
        """
        text = self.__unicode__()
        if not isinstance(text, str):
            # Only reachable on Python 2, where ``str`` is the byte-string type.
            text = text.encode("ascii", "backslashreplace")
        return text

    def encode(self,arg):
        """Return the ``word/flag`` text encoded with the codec named by ``arg``."""
        return self.__unicode__().encode(arg)
|
||||||
|
|
||||||
|
def __cut(sentence,tags_limited=False):
|
||||||
|
limit_tags = None
|
||||||
|
if tags_limited:
|
||||||
|
limit_tags = []
|
||||||
|
if len(sentence)==1:
|
||||||
|
limit_tags = ['S']
|
||||||
|
else:
|
||||||
|
limit_tags.append('B')
|
||||||
|
for i in xrange(len(sentence)-2):
|
||||||
|
limit_tags.append('M')
|
||||||
|
limit_tags.append('E')
|
||||||
|
prob, pos_list = viterbi.viterbi(sentence,char_state_tab, prob_start, prob_trans, prob_emit,limit_tags)
|
||||||
begin, next = 0,0
|
begin, next = 0,0
|
||||||
|
|
||||||
for i,char in enumerate(sentence):
|
for i,char in enumerate(sentence):
|
||||||
@ -29,13 +54,48 @@ def __cut(sentence):
|
|||||||
if pos=='B':
|
if pos=='B':
|
||||||
begin = i
|
begin = i
|
||||||
elif pos=='E':
|
elif pos=='E':
|
||||||
yield sentence[begin:i+1]+"/"+pos_list[i][1]
|
yield pair(sentence[begin:i+1], pos_list[i][1])
|
||||||
next = i+1
|
next = i+1
|
||||||
elif pos=='S':
|
elif pos=='S':
|
||||||
yield char+"/"+pos_list[i][1]
|
yield pair(char,pos_list[i][1])
|
||||||
next = i+1
|
next = i+1
|
||||||
if next<len(sentence):
|
if next<len(sentence):
|
||||||
yield sentence[next:]+"/"+pos_list[next][1]
|
yield pair(sentence[next:], pos_list[next][1] )
|
||||||
|
|
||||||
|
def __cut_DAG(sentence):
    """Yield tagged tokens for ``sentence`` using the dictionary route.

    The route computed by ``jieba.calc`` gives, for each start index, the end
    index of the chosen piece. Pieces longer than one character are tagged
    with ``__cut(..., tags_limited=True)``. Consecutive single-character
    pieces are collected into a buffer and tagged together by a plain
    ``__cut`` call when the next multi-character piece (or the end of the
    sentence) is reached.
    """
    def tag_pending(chars):
        # A one-character buffer contributes only the first token that
        # __cut produces; longer buffers contribute every token, lazily.
        if len(chars)==1:
            return [list(__cut(chars))[0]]
        return __cut(chars)

    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence,DAG,0,route=route)

    pending = u''
    start = 0
    total = len(sentence)
    while start<total:
        # route[start][1] is the index of the last character of the piece.
        stop = route[start][1]+1
        piece = sentence[start:stop]
        if stop-start==1:
            pending += piece
        else:
            # Flush buffered single characters before the multi-char piece
            # so tokens come out in sentence order.
            if len(pending)>0:
                for token in tag_pending(pending):
                    yield token
                pending = u''
            for token in __cut(piece,tags_limited=True):
                yield token
        start = stop

    # Trailing single characters that were never followed by a longer piece.
    if len(pending)>0:
        for token in tag_pending(pending):
            yield token
|
||||||
|
|
||||||
|
|
||||||
def cut(sentence):
|
def cut(sentence):
|
||||||
if not ( type(sentence) is unicode):
|
if not ( type(sentence) is unicode):
|
||||||
@ -48,10 +108,15 @@ def cut(sentence):
|
|||||||
|
|
||||||
for blk in blocks:
|
for blk in blocks:
|
||||||
if re_han.match(blk):
|
if re_han.match(blk):
|
||||||
for word in __cut(blk):
|
for word in __cut_DAG(blk):
|
||||||
yield word
|
yield word
|
||||||
else:
|
else:
|
||||||
tmp = re_skip.split(blk)
|
tmp = re_skip.split(blk)
|
||||||
for x in tmp:
|
for x in tmp:
|
||||||
if x!="":
|
if x!="":
|
||||||
yield x
|
if re.match(ur"[0-9]+",x):
|
||||||
|
yield pair(x,'m')
|
||||||
|
elif re.match(ur"[a-zA-Z+#]+",x):
|
||||||
|
yield pair(x,'eng')
|
||||||
|
else:
|
||||||
|
yield pair(x,'x')
|
||||||
|
@ -5,7 +5,7 @@ def get_top_states(t_state_v,K=4):
|
|||||||
topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K]
|
topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K]
|
||||||
return [x[0] for x in topK]
|
return [x[0] for x in topK]
|
||||||
|
|
||||||
def viterbi(obs, states, start_p, trans_p, emit_p):
|
def viterbi(obs, states, start_p, trans_p, emit_p,limit_tags):
|
||||||
V = [{}] #tabular
|
V = [{}] #tabular
|
||||||
mem_path = [{}]
|
mem_path = [{}]
|
||||||
all_states = trans_p.keys()
|
all_states = trans_p.keys()
|
||||||
@ -15,19 +15,24 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
|
|||||||
for t in range(1,len(obs)):
|
for t in range(1,len(obs)):
|
||||||
V.append({})
|
V.append({})
|
||||||
mem_path.append({})
|
mem_path.append({})
|
||||||
prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
|
|
||||||
#print get_top_states(V[t-1])
|
|
||||||
prev_states = get_top_states(V[t-1])
|
prev_states = get_top_states(V[t-1])
|
||||||
|
prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
|
||||||
|
tmp = prev_states
|
||||||
|
if limit_tags:
|
||||||
|
prev_states = [x for x in prev_states if x[0]==limit_tags[t-1]]
|
||||||
|
if len(prev_states)==0:
|
||||||
|
prev_states = tmp
|
||||||
prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) )
|
prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) )
|
||||||
obs_states = states.get(obs[t],all_states)
|
obs_states = states.get(obs[t],all_states)
|
||||||
obs_states = set(obs_states) & set(prev_states_expect_next)
|
obs_states = set(obs_states) & set(prev_states_expect_next)
|
||||||
|
if limit_tags:
|
||||||
|
obs_states = [x for x in obs_states if x[0]==limit_tags[t]]
|
||||||
if len(obs_states)==0: obs_states = all_states
|
if len(obs_states)==0: obs_states = all_states
|
||||||
for y in obs_states:
|
for y in obs_states:
|
||||||
(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in prev_states])
|
(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in prev_states])
|
||||||
V[t][y] =prob
|
V[t][y] =prob
|
||||||
mem_path[t][y] = state
|
mem_path[t][y] = state
|
||||||
|
|
||||||
|
|
||||||
last = [(V[-1][y], y) for y in mem_path[-1].keys() ]
|
last = [(V[-1][y], y) for y in mem_path[-1].keys() ]
|
||||||
#if len(last)==0:
|
#if len(last)==0:
|
||||||
#print obs
|
#print obs
|
||||||
|
2
setup.py
2
setup.py
@ -7,5 +7,5 @@ setup(name='jieba',
|
|||||||
url='http://github.com/fxsjy',
|
url='http://github.com/fxsjy',
|
||||||
packages=['jieba'],
|
packages=['jieba'],
|
||||||
package_dir={'jieba':'jieba'},
|
package_dir={'jieba':'jieba'},
|
||||||
package_data={'jieba':['*.*','finalseg/*']}
|
package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}
|
||||||
)
|
)
|
||||||
|
2
test/test.txt
Normal file
2
test/test.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
西三旗硅谷先锋小区半地下室出租,便宜可合租硅谷
|
||||||
|
工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作
|
20
test/test_file.py
Normal file
20
test/test_file.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
import urllib2
|
||||||
|
import sys,time
|
||||||
|
import sys
|
||||||
|
sys.path.append("../")
|
||||||
|
import jieba
|
||||||
|
|
||||||
|
url = sys.argv[1]
|
||||||
|
content = open(url,"rb").read()
|
||||||
|
t1 = time.time()
|
||||||
|
words = list(jieba.cut(content))
|
||||||
|
|
||||||
|
t2 = time.time()
|
||||||
|
tm_cost = t2-t1
|
||||||
|
|
||||||
|
log_f = open("1.log","wb")
|
||||||
|
for w in words:
|
||||||
|
print >> log_f, w.encode("gbk"), "/" ,
|
||||||
|
|
||||||
|
print 'speed' , len(content)/tm_cost, " bytes/second"
|
||||||
|
|
20
test/test_pos_file.py
Normal file
20
test/test_pos_file.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
import urllib2
|
||||||
|
import sys,time
|
||||||
|
import sys
|
||||||
|
sys.path.append("../")
|
||||||
|
import jieba.posseg as pseg
|
||||||
|
|
||||||
|
url = sys.argv[1]
|
||||||
|
content = open(url,"rb").read()
|
||||||
|
t1 = time.time()
|
||||||
|
words = list(pseg.cut(content))
|
||||||
|
|
||||||
|
t2 = time.time()
|
||||||
|
tm_cost = t2-t1
|
||||||
|
|
||||||
|
log_f = open("1.log","wb")
|
||||||
|
for w in words:
|
||||||
|
print >> log_f, w.encode("gbk"), "/" ,
|
||||||
|
|
||||||
|
print 'speed' , len(content)/tm_cost, " bytes/second"
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user