keep punctuation marks

This commit is contained in:
Sun Junyi 2013-04-05 21:48:36 +08:00
parent 58c363655c
commit 8e49199993
4 changed files with 11 additions and 4 deletions

View File

@@ -153,7 +153,7 @@ def cut(sentence,cut_all=False):
 sentence = sentence.decode('utf-8')
 except:
 sentence = sentence.decode('gbk','ignore')
-re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
+re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"[ ]")
 if cut_all:
 re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
 blocks = re_han.split(sentence)

View File

@@ -367422,3 +367422,8 @@
 龢 732 zg
 B超 3 n
 T恤 4 n
+C++ 3 nz
+c++ 3 nz
+C# 3 nz
+c# 3 nz
+AT&T 3 nz

View File

@@ -59,7 +59,7 @@ def cut(sentence):
 sentence = sentence.decode('utf-8')
 except:
 sentence = sentence.decode('gbk','ignore')
-re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
+re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([a-zA-Z0-9]+)")
 blocks = re_han.split(sentence)
 for blk in blocks:
 if re_han.match(blk):

View File

@@ -92,3 +92,5 @@ if __name__ == "__main__":
 cuttest('小和尚留了一个像大和尚一样的和尚头')
 cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
 cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
+cuttest('AT&T是一件不错的公司给你发offer了吗')
+cuttest('C++和c#是什么关系11+122=133是吗')