mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
fix issue #798
This commit is contained in:
parent
0489a6979e
commit
97c32464e1
@ -46,8 +46,6 @@ re_eng = re.compile('[a-zA-Z0-9]', re.U)
|
||||
# Regular expressions used to split input text into blocks before segmentation.
# They are written as raw string literals so that regex escapes such as \. \-
# and \s reach the `re` module untouched instead of being treated by Python as
# (invalid) string escape sequences.

# Default mode: a run of characters in U+4E00-U+9FD5, ASCII letters/digits,
# or any of the symbols + # & . _ % -
re_han_default = re.compile(r"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)

# Default mode: a CRLF pair or any single whitespace character.
re_skip_default = re.compile(r"(\r\n|\s)", re.U)
# cut_all mode: a run of characters in U+4E00-U+9FD5 only.
re_han_cut_all = re.compile(r"([\u4E00-\u9FD5]+)", re.U)
# cut_all mode: any single character that is not an ASCII letter/digit,
# '+', '#' or a newline.
re_skip_cut_all = re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
|
||||
|
||||
def setLogLevel(log_level):
|
||||
global logger
|
||||
@ -200,8 +198,20 @@ class Tokenizer(object):
|
||||
def __cut_all(self, sentence):
|
||||
dag = self.get_DAG(sentence)
|
||||
old_j = -1
|
||||
eng_scan = 0
|
||||
eng_buf = u''
|
||||
for k, L in iteritems(dag):
|
||||
if eng_scan==1 and not re_eng.match(sentence[k]):
|
||||
eng_scan = 0
|
||||
yield eng_buf
|
||||
if len(L) == 1 and k > old_j:
|
||||
if re_eng.match(sentence[k]):
|
||||
if eng_scan == 0:
|
||||
eng_scan = 1
|
||||
eng_buf = sentence[k]
|
||||
else:
|
||||
eng_buf += sentence[k]
|
||||
if eng_scan == 0:
|
||||
yield sentence[k:L[0] + 1]
|
||||
old_j = L[0]
|
||||
else:
|
||||
@ -209,6 +219,8 @@ class Tokenizer(object):
|
||||
if j > k:
|
||||
yield sentence[k:j + 1]
|
||||
old_j = j
|
||||
if eng_scan==1:
|
||||
yield eng_buf
|
||||
|
||||
def __cut_DAG_NO_HMM(self, sentence):
|
||||
DAG = self.get_DAG(sentence)
|
||||
@ -299,10 +311,6 @@ class Tokenizer(object):
|
||||
continue
|
||||
yield sent
|
||||
return
|
||||
if cut_all:
|
||||
re_han = re_han_cut_all
|
||||
re_skip = re_skip_cut_all
|
||||
else:
|
||||
re_han = re_han_default
|
||||
re_skip = re_skip_default
|
||||
if cut_all:
|
||||
|
@ -96,3 +96,5 @@ if __name__ == "__main__":
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||
jieba.add_word('超敏C反应蛋白')
|
||||
cuttest('超敏C反应蛋白是什么, java好学吗?,小潘老板都学Python')
|
||||
|
Loading…
x
Reference in New Issue
Block a user