mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
Handle mixed English-Chinese words in POS tagging
This commit is contained in:
parent
a8ae0398b4
commit
06ebc6f71c
@ -3,6 +3,7 @@ import os
|
||||
import viterbi
|
||||
import jieba
|
||||
import sys
|
||||
|
||||
default_encoding = sys.getfilesystemencoding()
|
||||
|
||||
def load_model(f_name):
|
||||
@ -60,10 +61,31 @@ def __cut(sentence):
|
||||
if next<len(sentence):
|
||||
yield pair(sentence[next:], pos_list[next][1] )
|
||||
|
||||
def __cut_detail(sentence):
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\r\n]")
|
||||
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
|
||||
blocks = re_han.split(sentence)
|
||||
for blk in blocks:
|
||||
if re_han.match(blk):
|
||||
for word in __cut(blk):
|
||||
yield word
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if x!="":
|
||||
if re_num.match(x):
|
||||
yield pair(x,'m')
|
||||
elif re_eng.match(x):
|
||||
yield pair(x,'eng')
|
||||
else:
|
||||
yield pair(x,'x')
|
||||
|
||||
def __cut_DAG(sentence):
|
||||
DAG = jieba.get_DAG(sentence)
|
||||
route ={}
|
||||
|
||||
jieba.calc(sentence,DAG,0,route=route)
|
||||
|
||||
x = 0
|
||||
buf =u''
|
||||
N = len(sentence)
|
||||
@ -78,7 +100,7 @@ def __cut_DAG(sentence):
|
||||
yield pair(buf,word_tag_tab.get(buf,'x'))
|
||||
buf=u''
|
||||
else:
|
||||
regognized = __cut(buf)
|
||||
regognized = __cut_detail(buf)
|
||||
for t in regognized:
|
||||
yield t
|
||||
buf=u''
|
||||
@ -89,7 +111,7 @@ def __cut_DAG(sentence):
|
||||
if len(buf)==1:
|
||||
yield pair(buf,word_tag_tab.get(buf,'x'))
|
||||
else:
|
||||
regognized = __cut(buf)
|
||||
regognized = __cut_detail(buf)
|
||||
for t in regognized:
|
||||
yield t
|
||||
|
||||
@ -100,10 +122,9 @@ def cut(sentence):
|
||||
sentence = sentence.decode('utf-8')
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n%]")
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
|
||||
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
|
||||
blocks = re_han.split(sentence)
|
||||
|
||||
for blk in blocks:
|
||||
if re_han.match(blk):
|
||||
for word in __cut_DAG(blk):
|
||||
|
@ -90,4 +90,5 @@ if __name__ == "__main__":
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
Loading…
x
Reference in New Issue
Block a user