Fix bug: handle decimals and mixed digit-English tokens

This commit is contained in:
Sun Junyi 2013-07-05 16:16:49 +08:00
parent ba5114dc95
commit 9d0ea771a5
2 changed files with 2 additions and 2 deletions

View File

@@ -58,7 +58,7 @@ def cut(sentence):
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)")
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"(\d+\.\d+|[a-zA-Z0-9]+)")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):

View File

@@ -97,4 +97,4 @@ if __name__ == "__main__":
cuttest('枪杆子中出政权')
cuttest('张三风同学走上了不归路')
cuttest('阿Q腰间挂着BB机手里拿着大哥大我一般吃饭不AA制的。')
cuttest('在1号店能买到小S和大S八卦的书')
cuttest('在1号店能买到小S和大S八卦的书还有3D电视')