mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
parent
4eef868338
commit
b4dd5b58f3
@ -40,7 +40,7 @@ re_eng = re.compile('[a-zA-Z0-9]', re.U)
|
|||||||
|
|
||||||
# \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
|
# \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
|
||||||
# \r\n|\s : whitespace characters. Will not be handled.
|
# \r\n|\s : whitespace characters. Will not be handled.
|
||||||
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", re.U)
|
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", re.U)
|
||||||
re_skip_default = re.compile("(\r\n|\s)", re.U)
|
re_skip_default = re.compile("(\r\n|\s)", re.U)
|
||||||
re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
|
re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
|
||||||
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
|
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
|
||||||
@ -409,6 +409,8 @@ class Tokenizer(object):
|
|||||||
wfrag = word[:ch + 1]
|
wfrag = word[:ch + 1]
|
||||||
if wfrag not in self.FREQ:
|
if wfrag not in self.FREQ:
|
||||||
self.FREQ[wfrag] = 0
|
self.FREQ[wfrag] = 0
|
||||||
|
if freq == 0:
|
||||||
|
finalseg.add_force_split(word)
|
||||||
|
|
||||||
def del_word(self, word):
|
def del_word(self, word):
|
||||||
"""
|
"""
|
||||||
|
@ -19,7 +19,7 @@ PrevStatus = {
|
|||||||
'E': 'BM'
|
'E': 'BM'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Force_Split_Words = set([])
|
||||||
def load_model():
|
def load_model():
|
||||||
start_p = pickle.load(get_module_res("finalseg", PROB_START_P))
|
start_p = pickle.load(get_module_res("finalseg", PROB_START_P))
|
||||||
trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P))
|
trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P))
|
||||||
@ -75,16 +75,24 @@ def __cut(sentence):
|
|||||||
yield sentence[nexti:]
|
yield sentence[nexti:]
|
||||||
|
|
||||||
re_han = re.compile("([\u4E00-\u9FD5]+)")
|
re_han = re.compile("([\u4E00-\u9FD5]+)")
|
||||||
re_skip = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
|
re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)")
|
||||||
|
|
||||||
|
|
||||||
|
def add_force_split(word):
|
||||||
|
global Force_Split_Words
|
||||||
|
Force_Split_Words.add(word)
|
||||||
|
|
||||||
def cut(sentence):
|
def cut(sentence):
|
||||||
sentence = strdecode(sentence)
|
sentence = strdecode(sentence)
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
for blk in blocks:
|
for blk in blocks:
|
||||||
if re_han.match(blk):
|
if re_han.match(blk):
|
||||||
for word in __cut(blk):
|
for word in __cut(blk):
|
||||||
|
if word not in Force_Split_Words:
|
||||||
yield word
|
yield word
|
||||||
|
else:
|
||||||
|
for c in word:
|
||||||
|
yield c
|
||||||
else:
|
else:
|
||||||
tmp = re_skip.split(blk)
|
tmp = re_skip.split(blk)
|
||||||
for x in tmp:
|
for x in tmp:
|
||||||
|
@ -98,3 +98,5 @@ if __name__ == "__main__":
|
|||||||
cuttest('张三风同学走上了不归路')
|
cuttest('张三风同学走上了不归路')
|
||||||
cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。')
|
cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。')
|
||||||
cuttest('在1号店能买到小S和大S八卦的书,还有3D电视。')
|
cuttest('在1号店能买到小S和大S八卦的书,还有3D电视。')
|
||||||
|
jieba.del_word('很赞')
|
||||||
|
cuttest('看上去iphone8手机样式很赞,售价699美元,销量涨了5%么?')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user