make a serial white spaces seperated

This commit is contained in:
Sun Junyi 2013-07-10 17:27:47 +08:00
parent a1ad2cbd55
commit d63140fe5e
2 changed files with 2 additions and 2 deletions

View File

@ -212,7 +212,7 @@ def cut(sentence,cut_all=False):
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\s+)", re.U)
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
if cut_all:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
blocks = re_han.split(sentence)

View File

@ -131,7 +131,7 @@ def __cut_internal(sentence):
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\s+)")
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)")
re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
blocks = re_han.split(sentence)
for blk in blocks: