mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
remove useless data & code
This commit is contained in:
parent
051f43c1d7
commit
7612a62115
@ -18,17 +18,6 @@ prob_start = load_model("prob_start.py")
|
|||||||
prob_trans = load_model("prob_trans.py")
|
prob_trans = load_model("prob_trans.py")
|
||||||
prob_emit = load_model("prob_emit.py")
|
prob_emit = load_model("prob_emit.py")
|
||||||
char_state_tab = load_model("char_state_tab.py")
|
char_state_tab = load_model("char_state_tab.py")
|
||||||
near_char_tab = load_model("near_char_tab.txt")
|
|
||||||
|
|
||||||
|
|
||||||
def __raw_seg(sentence):
|
|
||||||
i,j =0,0
|
|
||||||
while j<len(sentence)-1:
|
|
||||||
if not ( sentence[j:j+2] in near_char_tab):
|
|
||||||
yield sentence[i:j+1]
|
|
||||||
i=j+1
|
|
||||||
j+=1
|
|
||||||
yield sentence[i:j+1]
|
|
||||||
|
|
||||||
|
|
||||||
def __cut(sentence):
|
def __cut(sentence):
|
||||||
@ -48,7 +37,7 @@ def __cut(sentence):
|
|||||||
if next<len(sentence):
|
if next<len(sentence):
|
||||||
yield sentence[next:]+"/"+pos_list[next][1]
|
yield sentence[next:]+"/"+pos_list[next][1]
|
||||||
|
|
||||||
def cut(sentence,find_new_word=True):
|
def cut(sentence):
|
||||||
if not ( type(sentence) is unicode):
|
if not ( type(sentence) is unicode):
|
||||||
try:
|
try:
|
||||||
sentence = sentence.decode('utf-8')
|
sentence = sentence.decode('utf-8')
|
||||||
@ -56,14 +45,10 @@ def cut(sentence,find_new_word=True):
|
|||||||
sentence = sentence.decode('gbk','ignore')
|
sentence = sentence.decode('gbk','ignore')
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n%]")
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n%]")
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
if find_new_word:
|
|
||||||
detail_seg = lambda x: (x,)
|
|
||||||
else:
|
|
||||||
detail_seg = __raw_seg
|
|
||||||
for blk in blocks:
|
for blk in blocks:
|
||||||
if re_han.match(blk):
|
if re_han.match(blk):
|
||||||
for lb in detail_seg(blk):
|
for word in __cut(blk):
|
||||||
for word in __cut(lb):
|
|
||||||
yield word
|
yield word
|
||||||
else:
|
else:
|
||||||
tmp = re_skip.split(blk)
|
tmp = re_skip.split(blk)
|
||||||
|
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user