Bug fix for issues #511 and #512.

This commit is contained in:
sunjunyi01 2017-08-28 21:10:50 +08:00
parent 4eef868338
commit b4dd5b58f3
3 changed files with 16 additions and 4 deletions

View File

@ -40,7 +40,7 @@ re_eng = re.compile('[a-zA-Z0-9]', re.U)
# \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han # \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled. # \r\n|\s : whitespace characters. Will not be handled.
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", re.U) re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", re.U)
re_skip_default = re.compile("(\r\n|\s)", re.U) re_skip_default = re.compile("(\r\n|\s)", re.U)
re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U) re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U) re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
@ -409,6 +409,8 @@ class Tokenizer(object):
wfrag = word[:ch + 1] wfrag = word[:ch + 1]
if wfrag not in self.FREQ: if wfrag not in self.FREQ:
self.FREQ[wfrag] = 0 self.FREQ[wfrag] = 0
if freq == 0:
finalseg.add_force_split(word)
def del_word(self, word): def del_word(self, word):
""" """

View File

@ -19,7 +19,7 @@ PrevStatus = {
'E': 'BM' 'E': 'BM'
} }
Force_Split_Words = set([])
def load_model(): def load_model():
start_p = pickle.load(get_module_res("finalseg", PROB_START_P)) start_p = pickle.load(get_module_res("finalseg", PROB_START_P))
trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P)) trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P))
@ -75,16 +75,24 @@ def __cut(sentence):
yield sentence[nexti:] yield sentence[nexti:]
re_han = re.compile("([\u4E00-\u9FD5]+)") re_han = re.compile("([\u4E00-\u9FD5]+)")
re_skip = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)") re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)")
def add_force_split(word):
    """Register *word* in the module-level force-split set.

    Words recorded here are later emitted character by character by
    ``cut`` instead of as a single token.
    """
    global Force_Split_Words
    Force_Split_Words.add(word)
def cut(sentence): def cut(sentence):
sentence = strdecode(sentence) sentence = strdecode(sentence)
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
for blk in blocks: for blk in blocks:
if re_han.match(blk): if re_han.match(blk):
for word in __cut(blk): for word in __cut(blk):
if word not in Force_Split_Words:
yield word yield word
else:
for c in word:
yield c
else: else:
tmp = re_skip.split(blk) tmp = re_skip.split(blk)
for x in tmp: for x in tmp:

View File

@ -98,3 +98,5 @@ if __name__ == "__main__":
cuttest('张三风同学走上了不归路') cuttest('张三风同学走上了不归路')
cuttest('阿Q腰间挂着BB机手里拿着大哥大我一般吃饭不AA制的。') cuttest('阿Q腰间挂着BB机手里拿着大哥大我一般吃饭不AA制的。')
cuttest('在1号店能买到小S和大S八卦的书还有3D电视。') cuttest('在1号店能买到小S和大S八卦的书还有3D电视。')
jieba.del_word('很赞')
cuttest('看上去iphone8手机样式很赞,售价699美元,销量涨了5%么?')