mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
make a serial white spaces seperated
This commit is contained in:
parent
a1ad2cbd55
commit
d63140fe5e
@ -212,7 +212,7 @@ def cut(sentence,cut_all=False):
|
||||
sentence = sentence.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\s+)", re.U)
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
|
||||
if cut_all:
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
|
||||
blocks = re_han.split(sentence)
|
||||
|
@ -131,7 +131,7 @@ def __cut_internal(sentence):
|
||||
sentence = sentence.decode('utf-8')
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\s+)")
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)")
|
||||
re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
|
||||
blocks = re_han.split(sentence)
|
||||
for blk in blocks:
|
||||
|
Loading…
x
Reference in New Issue
Block a user