punctuation

This commit is contained in:
Sun Junyi 2013-04-05 22:49:16 +08:00
parent 8e49199993
commit 7d227da5c4

View File

@@ -65,7 +65,7 @@ def __cut(sentence):
yield pair(sentence[next:], pos_list[next][1] ) yield pair(sentence[next:], pos_list[next][1] )
def __cut_detail(sentence): def __cut_detail(sentence):
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\r\n]") re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([a-zA-Z0-9]+)")
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+") re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
for blk in blocks: for blk in blocks:
@@ -125,7 +125,7 @@ def cut(sentence):
sentence = sentence.decode('utf-8') sentence = sentence.decode('utf-8')
except: except:
sentence = sentence.decode('gbk','ignore') sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]") re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"[ ]")
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+") re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
for blk in blocks: for blk in blocks: