mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
improve POS tagging
This commit is contained in:
parent
7612a62115
commit
90cd4b3014
@ -82,7 +82,7 @@ def calc(sentence,DAG,idx,route):
|
||||
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) * route[x+1][0],x ) for x in DAG[idx] ]
|
||||
route[idx] = max(candidates)
|
||||
|
||||
def __cut_DAG(sentence):
|
||||
def get_DAG(sentence):
|
||||
N = len(sentence)
|
||||
i,j=0,0
|
||||
p = trie
|
||||
@ -107,11 +107,15 @@ def __cut_DAG(sentence):
|
||||
for i in xrange(len(sentence)):
|
||||
if not i in DAG:
|
||||
DAG[i] =[i]
|
||||
#pprint.pprint(DAG)
|
||||
return DAG
|
||||
|
||||
def __cut_DAG(sentence):
|
||||
DAG = get_DAG(sentence)
|
||||
route ={}
|
||||
calc(sentence,DAG,0,route=route)
|
||||
x = 0
|
||||
buf =u''
|
||||
N = len(sentence)
|
||||
while x<N:
|
||||
y = route[x][1]+1
|
||||
l_word = sentence[x:y]
|
||||
|
@ -1,6 +1,8 @@
|
||||
import re
|
||||
import os
|
||||
import viterbi
|
||||
import jieba
|
||||
|
||||
|
||||
def load_model(f_name):
|
||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||
@ -19,9 +21,32 @@ prob_trans = load_model("prob_trans.py")
|
||||
prob_emit = load_model("prob_emit.py")
|
||||
char_state_tab = load_model("char_state_tab.py")
|
||||
|
||||
class pair(object):
    """A segmented word together with its part-of-speech tag.

    Attributes:
        word: the word text.
        flag: the part-of-speech tag assigned to ``word``.
    """

    def __init__(self, word, flag):
        self.word = word
        self.flag = flag

    def __unicode__(self):
        """Return the text form ``word/flag``."""
        return self.word + u"/" + self.flag

    def __str__(self):
        """Return ``word/flag`` as a native ``str``.

        On Python 2 the native ``str`` is a byte string, so the unicode
        text is encoded (UTF-8 is used here); where ``str`` is already
        text it is returned unchanged.
        """
        text = self.__unicode__()
        if isinstance(text, str):
            return text
        return text.encode("utf-8")

    def __repr__(self):
        # __repr__ must return a native str.  Returning the unicode object
        # directly made Python 2 encode it implicitly as ASCII, which raised
        # UnicodeEncodeError for any non-ASCII (e.g. Chinese) word.
        return self.__str__()

    def encode(self, arg):
        """Return ``word/flag`` encoded with the codec named by ``arg``."""
        return self.__unicode__().encode(arg)
|
||||
|
||||
def __cut(sentence,tags_limited=False):
|
||||
limit_tags = None
|
||||
if tags_limited:
|
||||
limit_tags = []
|
||||
if len(sentence)==1:
|
||||
limit_tags = ['S']
|
||||
else:
|
||||
limit_tags.append('B')
|
||||
for i in xrange(len(sentence)-2):
|
||||
limit_tags.append('M')
|
||||
limit_tags.append('E')
|
||||
prob, pos_list = viterbi.viterbi(sentence,char_state_tab, prob_start, prob_trans, prob_emit,limit_tags)
|
||||
begin, next = 0,0
|
||||
|
||||
for i,char in enumerate(sentence):
|
||||
@ -29,13 +54,48 @@ def __cut(sentence):
|
||||
if pos=='B':
|
||||
begin = i
|
||||
elif pos=='E':
|
||||
yield sentence[begin:i+1]+"/"+pos_list[i][1]
|
||||
yield pair(sentence[begin:i+1], pos_list[i][1])
|
||||
next = i+1
|
||||
elif pos=='S':
|
||||
yield char+"/"+pos_list[i][1]
|
||||
yield pair(char,pos_list[i][1])
|
||||
next = i+1
|
||||
if next<len(sentence):
|
||||
yield sentence[next:]+"/"+pos_list[next][1]
|
||||
yield pair(sentence[next:], pos_list[next][1] )
|
||||
|
||||
def _cut_pending(buf):
    """Yield the tagged words for a buffered run of single characters.

    Yields nothing when ``buf`` is empty; otherwise delegates to ``__cut``
    without tag limits.
    """
    if buf:
        for tagged in __cut(buf):
            yield tagged


def __cut_DAG(sentence):
    """Segment and tag ``sentence`` using the dictionary route.

    The route computed by ``jieba.calc`` over ``jieba.get_DAG(sentence)``
    is walked from left to right.  Consecutive single-character steps are
    collected in a buffer and tagged together by ``__cut``; every
    multi-character step is tagged by ``__cut`` with ``tags_limited=True``.

    Yields:
        The items produced by ``__cut``, in sentence order.
    """
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)
    x = 0
    buf = u''
    N = len(sentence)
    while x < N:
        # route[x][1] is the index of the last character of the step
        # chosen at position x.
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            # Single character: defer tagging until the run ends.
            buf += l_word
        else:
            # Flush the pending single-character run before the word so
            # that output stays in sentence order.
            for t in _cut_pending(buf):
                yield t
            buf = u''
            for w in __cut(l_word, tags_limited=True):
                yield w
        x = y

    # Flush whatever single-character run is left at the end.
    for t in _cut_pending(buf):
        yield t
|
||||
|
||||
|
||||
def cut(sentence):
|
||||
if not ( type(sentence) is unicode):
|
||||
@ -48,10 +108,15 @@ def cut(sentence):
|
||||
|
||||
for blk in blocks:
|
||||
if re_han.match(blk):
|
||||
for word in __cut(blk):
|
||||
for word in __cut_DAG(blk):
|
||||
yield word
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if x!="":
|
||||
yield x
|
||||
if re.match(ur"[0-9]+",x):
|
||||
yield pair(x,'m')
|
||||
elif re.match(ur"[a-zA-Z+#]+",x):
|
||||
yield pair(x,'eng')
|
||||
else:
|
||||
yield pair(x,'x')
|
||||
|
@ -5,7 +5,7 @@ def get_top_states(t_state_v,K=4):
|
||||
topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K]
|
||||
return [x[0] for x in topK]
|
||||
|
||||
def viterbi(obs, states, start_p, trans_p, emit_p):
|
||||
def viterbi(obs, states, start_p, trans_p, emit_p,limit_tags):
|
||||
V = [{}] #tabular
|
||||
mem_path = [{}]
|
||||
all_states = trans_p.keys()
|
||||
@ -15,19 +15,24 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
|
||||
for t in range(1,len(obs)):
|
||||
V.append({})
|
||||
mem_path.append({})
|
||||
prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
|
||||
#print get_top_states(V[t-1])
|
||||
prev_states = get_top_states(V[t-1])
|
||||
prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
|
||||
tmp = prev_states
|
||||
if limit_tags:
|
||||
prev_states = [x for x in prev_states if x[0]==limit_tags[t-1]]
|
||||
if len(prev_states)==0:
|
||||
prev_states = tmp
|
||||
prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) )
|
||||
obs_states = states.get(obs[t],all_states)
|
||||
obs_states = set(obs_states) & set(prev_states_expect_next)
|
||||
if limit_tags:
|
||||
obs_states = [x for x in obs_states if x[0]==limit_tags[t]]
|
||||
if len(obs_states)==0: obs_states = all_states
|
||||
for y in obs_states:
|
||||
(prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in prev_states])
|
||||
V[t][y] =prob
|
||||
mem_path[t][y] = state
|
||||
|
||||
|
||||
last = [(V[-1][y], y) for y in mem_path[-1].keys() ]
|
||||
#if len(last)==0:
|
||||
#print obs
|
||||
|
2
setup.py
2
setup.py
@ -7,5 +7,5 @@ setup(name='jieba',
|
||||
url='http://github.com/fxsjy',
|
||||
packages=['jieba'],
|
||||
package_dir={'jieba':'jieba'},
|
||||
package_data={'jieba':['*.*','finalseg/*']}
|
||||
package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}
|
||||
)
|
||||
|
2
test/test.txt
Normal file
2
test/test.txt
Normal file
@ -0,0 +1,2 @@
|
||||
西三旗硅谷先锋小区半地下室出租,便宜可合租硅谷
|
||||
工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作
|
20
test/test_file.py
Normal file
20
test/test_file.py
Normal file
@ -0,0 +1,20 @@
|
||||
import urllib2
|
||||
import sys,time
|
||||
import sys
|
||||
sys.path.append("../")
|
||||
import jieba
|
||||
|
||||
# Usage: python test_file.py <text-file>
# Segments the given file with jieba.cut, writes the words to 1.log
# (GBK-encoded) and prints the throughput.
if len(sys.argv) < 2:
    sys.exit("usage: %s <text-file>" % sys.argv[0])
path = sys.argv[1]

# Read the whole input up front so that only segmentation is timed.
with open(path, "rb") as src:
    content = src.read()

t1 = time.time()
words = list(jieba.cut(content))
t2 = time.time()
tm_cost = t2 - t1

# The context manager guarantees the log is flushed and closed.
with open("1.log", "wb") as log_f:
    for w in words:
        print >> log_f, w.encode("gbk"), "/" ,

# A very small input can finish within the clock resolution; avoid
# dividing by a zero elapsed time.
if tm_cost > 0:
    speed = len(content) / tm_cost
else:
    speed = float("inf")
sys.stdout.write("speed %s  bytes/second\n" % speed)
|
||||
|
20
test/test_pos_file.py
Normal file
20
test/test_pos_file.py
Normal file
@ -0,0 +1,20 @@
|
||||
import urllib2
|
||||
import sys,time
|
||||
import sys
|
||||
sys.path.append("../")
|
||||
import jieba.posseg as pseg
|
||||
|
||||
# Usage: python test_pos_file.py <text-file>
# Segments and POS-tags the given file with jieba.posseg.cut, writes the
# tagged words to 1.log (GBK-encoded) and prints the throughput.
if len(sys.argv) < 2:
    sys.exit("usage: %s <text-file>" % sys.argv[0])
path = sys.argv[1]

# Read the whole input up front so that only tagging is timed.
with open(path, "rb") as src:
    content = src.read()

t1 = time.time()
words = list(pseg.cut(content))
t2 = time.time()
tm_cost = t2 - t1

# The context manager guarantees the log is flushed and closed.
with open("1.log", "wb") as log_f:
    for w in words:
        print >> log_f, w.encode("gbk"), "/" ,

# A very small input can finish within the clock resolution; avoid
# dividing by a zero elapsed time.
if tm_cost > 0:
    speed = len(content) / tm_cost
else:
    speed = float("inf")
sys.stdout.write("speed %s  bytes/second\n" % speed)
|
||||
|
Loading…
x
Reference in New Issue
Block a user