mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
Fix bug: correctly handle decimal numbers and mixed digit–English tokens (e.g. "3D")
This commit is contained in:
parent
ba5114dc95
commit
9d0ea771a5
@ -58,7 +58,7 @@ def cut(sentence):
|
|||||||
sentence = sentence.decode('utf-8')
|
sentence = sentence.decode('utf-8')
|
||||||
except:
|
except:
|
||||||
sentence = sentence.decode('gbk','ignore')
|
sentence = sentence.decode('gbk','ignore')
|
||||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)")
|
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"(\d+\.\d+|[a-zA-Z0-9]+)")
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
for blk in blocks:
|
for blk in blocks:
|
||||||
if re_han.match(blk):
|
if re_han.match(blk):
|
||||||
|
@ -97,4 +97,4 @@ if __name__ == "__main__":
|
|||||||
cuttest('枪杆子中出政权')
|
cuttest('枪杆子中出政权')
|
||||||
cuttest('张三风同学走上了不归路')
|
cuttest('张三风同学走上了不归路')
|
||||||
cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。')
|
cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。')
|
||||||
cuttest('在1号店能买到小S和大S八卦的书。')
|
cuttest('在1号店能买到小S和大S八卦的书,还有3D电视。')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user