mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
keep punctuation marks
This commit is contained in:
parent
58c363655c
commit
8e49199993
@ -153,7 +153,7 @@ def cut(sentence,cut_all=False):
|
||||
sentence = sentence.decode('utf-8')
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"[ ]")
|
||||
if cut_all:
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
|
||||
blocks = re_han.split(sentence)
|
||||
|
@ -367421,4 +367421,9 @@
|
||||
龠 5 g
|
||||
龢 732 zg
|
||||
B超 3 n
|
||||
T恤 4 n
|
||||
T恤 4 n
|
||||
C++ 3 nz
|
||||
c++ 3 nz
|
||||
C# 3 nz
|
||||
c# 3 nz
|
||||
AT&T 3 nz
|
@ -59,7 +59,7 @@ def cut(sentence):
|
||||
sentence = sentence.decode('utf-8')
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([a-zA-Z0-9]+)")
|
||||
blocks = re_han.split(sentence)
|
||||
for blk in blocks:
|
||||
if re_han.match(blk):
|
||||
|
@ -91,4 +91,6 @@ if __name__ == "__main__":
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?')
|
Loading…
x
Reference in New Issue
Block a user