Support decimal points in numeric tokens (e.g. 3.14159)

This commit is contained in:
Sun Junyi 2013-04-08 09:53:04 +08:00
parent 72fff6c8e2
commit 94ad7e7035
2 changed files with 6 additions and 4 deletions

View File

@ -66,7 +66,7 @@ def __cut(sentence):
def __cut_detail(sentence): def __cut_detail(sentence):
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)") re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)")
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+") re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
for blk in blocks: for blk in blocks:
if re_han.match(blk): if re_han.match(blk):
@ -125,8 +125,8 @@ def cut(sentence):
sentence = sentence.decode('utf-8') sentence = sentence.decode('utf-8')
except: except:
sentence = sentence.decode('gbk','ignore') sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"(\s+)") re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile(ur"(\s+)")
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+") re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
for blk in blocks: for blk in blocks:
if re_han.match(blk): if re_han.match(blk):

View File

@ -91,4 +91,6 @@ if __name__ == "__main__":
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条') cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
cuttest('小和尚留了一个像大和尚一样的和尚头') cuttest('小和尚留了一个像大和尚一样的和尚头')
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站') cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤') cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
cuttest('AT&T是一件不错的公司给你发offer了吗')
cuttest('C++和c#是什么关系11+122=133是吗PI=3.14159')