diff --git a/jieba/__init__.py b/jieba/__init__.py index 4717da5..b935985 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -153,7 +153,7 @@ def cut(sentence,cut_all=False): sentence = sentence.decode('utf-8') except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]") + re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"[ ]") if cut_all: re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]") blocks = re_han.split(sentence) diff --git a/jieba/dict.txt b/jieba/dict.txt index a0923a8..f4b81ef 100644 --- a/jieba/dict.txt +++ b/jieba/dict.txt @@ -367421,4 +367421,9 @@ 龠 5 g 龢 732 zg B超 3 n -T恤 4 n \ No newline at end of file +T恤 4 n +C++ 3 nz +c++ 3 nz +C# 3 nz +c# 3 nz +AT&T 3 nz \ No newline at end of file diff --git a/jieba/finalseg/__init__.py b/jieba/finalseg/__init__.py index fec7767..2ba75d2 100644 --- a/jieba/finalseg/__init__.py +++ b/jieba/finalseg/__init__.py @@ -59,7 +59,7 @@ def cut(sentence): sentence = sentence.decode('utf-8') except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]") + re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([a-zA-Z0-9]+)") blocks = re_han.split(sentence) for blk in blocks: if re_han.match(blk): diff --git a/test/test.py b/test/test.py index f33c3f0..884b40a 100644 --- a/test/test.py +++ b/test/test.py @@ -91,4 +91,6 @@ if __name__ == "__main__": cuttest('两块五一套,三块八一斤,四块七一本,五块六一条') cuttest('小和尚留了一个像大和尚一样的和尚头') cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站') - cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') \ No newline at end of file + cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') + cuttest('AT&T是一件不错的公司,给你发offer了吗?') + cuttest('C++和c#是什么关系?11+122=133,是吗?') \ No newline at end of file