From 7d227da5c4c44f0a70b107eca04904ecc1b18be1 Mon Sep 17 00:00:00 2001 From: Sun Junyi Date: Fri, 5 Apr 2013 22:49:16 +0800 Subject: [PATCH] punctuation --- jieba/posseg/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index c3cae77..ef247e1 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -65,7 +65,7 @@ def __cut(sentence): yield pair(sentence[next:], pos_list[next][1] ) def __cut_detail(sentence): - re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\r\n]") + re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([a-zA-Z0-9]+)") re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+") blocks = re_han.split(sentence) for blk in blocks: @@ -125,7 +125,7 @@ def cut(sentence): sentence = sentence.decode('utf-8') except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]") + re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"[ ]") re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+") blocks = re_han.split(sentence) for blk in blocks: