diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index 982c132..09df9e4 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -66,7 +66,7 @@ def __cut(sentence): def __cut_detail(sentence): re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)") - re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+") + re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+") blocks = re_han.split(sentence) for blk in blocks: if re_han.match(blk): @@ -125,8 +125,8 @@ def cut(sentence): sentence = sentence.decode('utf-8') except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"(\s+)") - re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+") + re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile(ur"(\s+)") + re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+") blocks = re_han.split(sentence) for blk in blocks: if re_han.match(blk): diff --git a/test/test_pos.py b/test/test_pos.py index ce11410..5e2862f 100644 --- a/test/test_pos.py +++ b/test/test_pos.py @@ -91,4 +91,6 @@ if __name__ == "__main__": cuttest('两块五一套,三块八一斤,四块七一本,五块六一条') cuttest('小和尚留了一个像大和尚一样的和尚头') cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站') - cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') \ No newline at end of file + cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') + cuttest('AT&T是一件不错的公司,给你发offer了吗?') + cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159') \ No newline at end of file