Mirror of https://github.com/fxsjy/jieba.git (last synced 2025-07-10 00:01:33 +08:00)
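Fix issue #798: in cut_all mode, buffer consecutive ASCII letters/digits and emit them as a single token; cut() now uses the default han/skip regexes for both modes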
This commit is contained in:
parent 0489a6979e
commit 97c32464e1
@ -46,8 +46,6 @@ re_eng = re.compile('[a-zA-Z0-9]', re.U)
|
|||||||
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
|
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
|
||||||
|
|
||||||
re_skip_default = re.compile("(\r\n|\s)", re.U)
|
re_skip_default = re.compile("(\r\n|\s)", re.U)
|
||||||
re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
|
|
||||||
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
|
|
||||||
|
|
||||||
def setLogLevel(log_level):
|
def setLogLevel(log_level):
|
||||||
global logger
|
global logger
|
||||||
@ -200,15 +198,29 @@ class Tokenizer(object):
|
|||||||
def __cut_all(self, sentence):
|
def __cut_all(self, sentence):
|
||||||
dag = self.get_DAG(sentence)
|
dag = self.get_DAG(sentence)
|
||||||
old_j = -1
|
old_j = -1
|
||||||
|
eng_scan = 0
|
||||||
|
eng_buf = u''
|
||||||
for k, L in iteritems(dag):
|
for k, L in iteritems(dag):
|
||||||
|
if eng_scan==1 and not re_eng.match(sentence[k]):
|
||||||
|
eng_scan = 0
|
||||||
|
yield eng_buf
|
||||||
if len(L) == 1 and k > old_j:
|
if len(L) == 1 and k > old_j:
|
||||||
yield sentence[k:L[0] + 1]
|
if re_eng.match(sentence[k]):
|
||||||
|
if eng_scan == 0:
|
||||||
|
eng_scan = 1
|
||||||
|
eng_buf = sentence[k]
|
||||||
|
else:
|
||||||
|
eng_buf += sentence[k]
|
||||||
|
if eng_scan == 0:
|
||||||
|
yield sentence[k:L[0] + 1]
|
||||||
old_j = L[0]
|
old_j = L[0]
|
||||||
else:
|
else:
|
||||||
for j in L:
|
for j in L:
|
||||||
if j > k:
|
if j > k:
|
||||||
yield sentence[k:j + 1]
|
yield sentence[k:j + 1]
|
||||||
old_j = j
|
old_j = j
|
||||||
|
if eng_scan==1:
|
||||||
|
yield eng_buf
|
||||||
|
|
||||||
def __cut_DAG_NO_HMM(self, sentence):
|
def __cut_DAG_NO_HMM(self, sentence):
|
||||||
DAG = self.get_DAG(sentence)
|
DAG = self.get_DAG(sentence)
|
||||||
@ -299,12 +311,8 @@ class Tokenizer(object):
|
|||||||
continue
|
continue
|
||||||
yield sent
|
yield sent
|
||||||
return
|
return
|
||||||
if cut_all:
|
re_han = re_han_default
|
||||||
re_han = re_han_cut_all
|
re_skip = re_skip_default
|
||||||
re_skip = re_skip_cut_all
|
|
||||||
else:
|
|
||||||
re_han = re_han_default
|
|
||||||
re_skip = re_skip_default
|
|
||||||
if cut_all:
|
if cut_all:
|
||||||
cut_block = self.__cut_all
|
cut_block = self.__cut_all
|
||||||
elif HMM:
|
elif HMM:
|
||||||
|
@ -96,3 +96,5 @@ if __name__ == "__main__":
|
|||||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||||
|
jieba.add_word('超敏C反应蛋白')
|
||||||
|
cuttest('超敏C反应蛋白是什么, java好学吗?,小潘老板都学Python')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user