mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
punctuation
This commit is contained in:
parent
8e49199993
commit
7d227da5c4
@ -65,7 +65,7 @@ def __cut(sentence):
|
|||||||
yield pair(sentence[next:], pos_list[next][1] )
|
yield pair(sentence[next:], pos_list[next][1] )
|
||||||
|
|
||||||
def __cut_detail(sentence):
|
def __cut_detail(sentence):
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\r\n]")
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([a-zA-Z0-9]+)")
|
||||||
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
|
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
for blk in blocks:
|
for blk in blocks:
|
||||||
@ -125,7 +125,7 @@ def cut(sentence):
|
|||||||
sentence = sentence.decode('utf-8')
|
sentence = sentence.decode('utf-8')
|
||||||
except:
|
except:
|
||||||
sentence = sentence.decode('gbk','ignore')
|
sentence = sentence.decode('gbk','ignore')
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"[ ]")
|
||||||
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
|
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
for blk in blocks:
|
for blk in blocks:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user