From d63140fe5e0bd5e7003f38040e26ca6533a5faa6 Mon Sep 17 00:00:00 2001 From: Sun Junyi Date: Wed, 10 Jul 2013 17:27:47 +0800 Subject: [PATCH] make a series of white spaces separated --- jieba/__init__.py | 2 +- jieba/posseg/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jieba/__init__.py b/jieba/__init__.py index fe3988c..0ee4320 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -212,7 +212,7 @@ def cut(sentence,cut_all=False): sentence = sentence.decode('utf-8') except UnicodeDecodeError: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\s+)", re.U) + re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U) if cut_all: re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U) blocks = re_han.split(sentence) diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index 70aacf4..69f47da 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -131,7 +131,7 @@ def __cut_internal(sentence): sentence = sentence.decode('utf-8') except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\s+)") + re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)") re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+") blocks = re_han.split(sentence) for blk in blocks: