mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-24 00:00:05 +08:00
Support mixed English-Chinese words, such as B超
This commit is contained in:
parent
e0bd9a6a50
commit
379cd4933a
@ -9,6 +9,7 @@ import marshal
|
|||||||
|
|
||||||
FREQ = {}
|
FREQ = {}
|
||||||
total =0.0
|
total =0.0
|
||||||
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
|
||||||
|
|
||||||
def gen_trie(f_name):
|
def gen_trie(f_name):
|
||||||
lfreq = {}
|
lfreq = {}
|
||||||
@ -119,7 +120,7 @@ def __cut_DAG(sentence):
|
|||||||
yield buf
|
yield buf
|
||||||
buf=u''
|
buf=u''
|
||||||
else:
|
else:
|
||||||
regognized = finalseg.__cut(buf)
|
regognized = finalseg.cut(buf)
|
||||||
for t in regognized:
|
for t in regognized:
|
||||||
yield t
|
yield t
|
||||||
buf=u''
|
buf=u''
|
||||||
@ -130,7 +131,7 @@ def __cut_DAG(sentence):
|
|||||||
if len(buf)==1:
|
if len(buf)==1:
|
||||||
yield buf
|
yield buf
|
||||||
else:
|
else:
|
||||||
regognized = finalseg.__cut(buf)
|
regognized = finalseg.cut(buf)
|
||||||
for t in regognized:
|
for t in regognized:
|
||||||
yield t
|
yield t
|
||||||
|
|
||||||
@ -141,7 +142,7 @@ def cut(sentence,cut_all=False):
|
|||||||
sentence = sentence.decode('utf-8')
|
sentence = sentence.decode('utf-8')
|
||||||
except:
|
except:
|
||||||
sentence = sentence.decode('gbk','ignore')
|
sentence = sentence.decode('gbk','ignore')
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
|
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
cut_block = __cut_DAG
|
cut_block = __cut_DAG
|
||||||
if cut_all:
|
if cut_all:
|
||||||
|
@ -283173,7 +283173,7 @@
|
|||||||
自学辅导 3 n
|
自学辅导 3 n
|
||||||
自守 3 v
|
自守 3 v
|
||||||
自定 3 d
|
自定 3 d
|
||||||
自定义 3 l
|
自定义 13 l
|
||||||
自定义词 3 n
|
自定义词 3 n
|
||||||
自审 6 v
|
自审 6 v
|
||||||
自宫 3 n
|
自宫 3 n
|
||||||
@ -367419,4 +367419,6 @@
|
|||||||
龟龙片甲 3 nz
|
龟龙片甲 3 nz
|
||||||
龟龙麟凤 3 ns
|
龟龙麟凤 3 ns
|
||||||
龠 5 g
|
龠 5 g
|
||||||
龢 732 zg
|
龢 732 zg
|
||||||
|
B超 3 n
|
||||||
|
T恤 4 n
|
12003
test/1.log
Normal file
12003
test/1.log
Normal file
File diff suppressed because it is too large
Load Diff
12003
test/tlbb.txt
Normal file
12003
test/tlbb.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user