mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
merge changes from master branch
This commit is contained in:
commit
6da857b554
@ -157,7 +157,7 @@ def cut(sentence,cut_all=False):
|
|||||||
except:
|
except:
|
||||||
sentence = sentence.decode('gbk','ignore')
|
sentence = sentence.decode('gbk','ignore')
|
||||||
|
|
||||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile("(\s+)")
|
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
|
||||||
|
|
||||||
if cut_all:
|
if cut_all:
|
||||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]")
|
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]")
|
||||||
@ -175,7 +175,8 @@ def cut(sentence,cut_all=False):
|
|||||||
tmp = re_skip.split(blk)
|
tmp = re_skip.split(blk)
|
||||||
for x in tmp:
|
for x in tmp:
|
||||||
if re_skip.match(x):
|
if re_skip.match(x):
|
||||||
yield x
|
if x.strip(' ')!='':
|
||||||
|
yield x
|
||||||
else:
|
else:
|
||||||
for xx in x:
|
for xx in x:
|
||||||
yield xx
|
yield xx
|
||||||
|
@ -245161,10 +245161,9 @@
|
|||||||
皂隶 96 n
|
皂隶 96 n
|
||||||
皂靴 10 n
|
皂靴 10 n
|
||||||
皂鞋 3 n
|
皂鞋 3 n
|
||||||
的 3188252 uj
|
的 318825 uj
|
||||||
的一确二 3 l
|
的一确二 3 l
|
||||||
的士高 3 n
|
的士高 3 n
|
||||||
的歌者 3 n
|
|
||||||
的的确确 64 d
|
的的确确 64 d
|
||||||
的确 2135 d
|
的确 2135 d
|
||||||
的确如此 31 l
|
的确如此 31 l
|
||||||
@ -245174,6 +245173,8 @@
|
|||||||
的里雅斯特 23 ns
|
的里雅斯特 23 ns
|
||||||
的里雅斯特市 3 ns
|
的里雅斯特市 3 ns
|
||||||
的黎波里 62 ns
|
的黎波里 62 ns
|
||||||
|
的哥 63 n
|
||||||
|
的士 20 n
|
||||||
皆 7511 d
|
皆 7511 d
|
||||||
皆佳 3 nrt
|
皆佳 3 nrt
|
||||||
皆准 3 i
|
皆准 3 i
|
||||||
|
@ -129,7 +129,7 @@ def cut(sentence):
|
|||||||
except:
|
except:
|
||||||
sentence = sentence.decode('gbk','ignore')
|
sentence = sentence.decode('gbk','ignore')
|
||||||
|
|
||||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile("(\s+)")
|
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
|
||||||
re_eng,re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
|
re_eng,re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
|
||||||
|
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
@ -141,7 +141,8 @@ def cut(sentence):
|
|||||||
tmp = re_skip.split(blk)
|
tmp = re_skip.split(blk)
|
||||||
for x in tmp:
|
for x in tmp:
|
||||||
if re_skip.match(x):
|
if re_skip.match(x):
|
||||||
yield pair(x,'')
|
if x.strip(' ')!='':
|
||||||
|
yield pair(x,'')
|
||||||
else:
|
else:
|
||||||
for xx in x:
|
for xx in x:
|
||||||
if re_num.match(xx):
|
if re_num.match(xx):
|
||||||
|
@ -92,3 +92,4 @@ if __name__ == "__main__":
|
|||||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||||
|
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||||
|
@ -94,4 +94,4 @@ if __name__ == "__main__":
|
|||||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||||
|
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||||
|
@ -14,3 +14,13 @@ result = pseg.cut(test_sent)
|
|||||||
|
|
||||||
for w in result:
|
for w in result:
|
||||||
print(w.word, "/", w.flag, ", ")
|
print(w.word, "/", w.flag, ", ")
|
||||||
|
|
||||||
|
print("\n========")
|
||||||
|
|
||||||
|
terms = jieba.cut('easy_install is great')
|
||||||
|
for t in terms:
|
||||||
|
print(t)
|
||||||
|
print('-------------------------')
|
||||||
|
terms = jieba.cut('python 的正则表达式是好用的')
|
||||||
|
for t in terms:
|
||||||
|
print(t)
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
云计算 5
|
云计算 5
|
||||||
李小福 2 nr
|
李小福 2 nr
|
||||||
创新办 3 i
|
创新办 3 i
|
||||||
|
easy_install 3 eng
|
||||||
|
好用 300
|
Loading…
x
Reference in New Issue
Block a user