mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-24 00:00:05 +08:00
Support mixed English-Chinese words, such as B超
This commit is contained in:
parent
e0bd9a6a50
commit
379cd4933a
@ -9,6 +9,7 @@ import marshal
|
||||
|
||||
FREQ = {}
|
||||
total =0.0
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
|
||||
|
||||
def gen_trie(f_name):
|
||||
lfreq = {}
|
||||
@ -119,7 +120,7 @@ def __cut_DAG(sentence):
|
||||
yield buf
|
||||
buf=u''
|
||||
else:
|
||||
regognized = finalseg.__cut(buf)
|
||||
regognized = finalseg.cut(buf)
|
||||
for t in regognized:
|
||||
yield t
|
||||
buf=u''
|
||||
@ -130,7 +131,7 @@ def __cut_DAG(sentence):
|
||||
if len(buf)==1:
|
||||
yield buf
|
||||
else:
|
||||
regognized = finalseg.__cut(buf)
|
||||
regognized = finalseg.cut(buf)
|
||||
for t in regognized:
|
||||
yield t
|
||||
|
||||
@ -141,7 +142,7 @@ def cut(sentence,cut_all=False):
|
||||
sentence = sentence.decode('utf-8')
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
|
||||
|
||||
blocks = re_han.split(sentence)
|
||||
cut_block = __cut_DAG
|
||||
if cut_all:
|
||||
|
@ -283173,7 +283173,7 @@
|
||||
自学辅导 3 n
|
||||
自守 3 v
|
||||
自定 3 d
|
||||
自定义 3 l
|
||||
自定义 13 l
|
||||
自定义词 3 n
|
||||
自审 6 v
|
||||
自宫 3 n
|
||||
@ -367420,3 +367420,5 @@
|
||||
龟龙麟凤 3 ns
|
||||
龠 5 g
|
||||
龢 732 zg
|
||||
B超 3 n
|
||||
T恤 4 n
|
12003
test/1.log
Normal file
12003
test/1.log
Normal file
File diff suppressed because it is too large
Load Diff
12003
test/tlbb.txt
Normal file
12003
test/tlbb.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user