mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
merge changes from master branch
This commit is contained in:
commit
6da857b554
@ -157,7 +157,7 @@ def cut(sentence,cut_all=False):
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile("(\s+)")
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
|
||||
|
||||
if cut_all:
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]")
|
||||
@ -175,7 +175,8 @@ def cut(sentence,cut_all=False):
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if re_skip.match(x):
|
||||
yield x
|
||||
if x.strip(' ')!='':
|
||||
yield x
|
||||
else:
|
||||
for xx in x:
|
||||
yield xx
|
||||
|
@ -245161,10 +245161,9 @@
|
||||
皂隶 96 n
|
||||
皂靴 10 n
|
||||
皂鞋 3 n
|
||||
的 3188252 uj
|
||||
的 318825 uj
|
||||
的一确二 3 l
|
||||
的士高 3 n
|
||||
的歌者 3 n
|
||||
的的确确 64 d
|
||||
的确 2135 d
|
||||
的确如此 31 l
|
||||
@ -245174,6 +245173,8 @@
|
||||
的里雅斯特 23 ns
|
||||
的里雅斯特市 3 ns
|
||||
的黎波里 62 ns
|
||||
的哥 63 n
|
||||
的士 20 n
|
||||
皆 7511 d
|
||||
皆佳 3 nrt
|
||||
皆准 3 i
|
||||
|
@ -129,7 +129,7 @@ def cut(sentence):
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\.]+)"), re.compile("(\s+)")
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
|
||||
re_eng,re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
|
||||
|
||||
blocks = re_han.split(sentence)
|
||||
@ -141,7 +141,8 @@ def cut(sentence):
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if re_skip.match(x):
|
||||
yield pair(x,'')
|
||||
if x.strip(' ')!='':
|
||||
yield pair(x,'')
|
||||
else:
|
||||
for xx in x:
|
||||
if re_num.match(xx):
|
||||
|
@ -92,3 +92,4 @@ if __name__ == "__main__":
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||
|
@ -94,4 +94,4 @@ if __name__ == "__main__":
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||
|
||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||
|
@ -14,3 +14,13 @@ result = pseg.cut(test_sent)
|
||||
|
||||
for w in result:
|
||||
print(w.word, "/", w.flag, ", ")
|
||||
|
||||
print("\n========")
|
||||
|
||||
terms = jieba.cut('easy_install is great')
|
||||
for t in terms:
|
||||
print(t)
|
||||
print('-------------------------')
|
||||
terms = jieba.cut('python 的正则表达式是好用的')
|
||||
for t in terms:
|
||||
print(t)
|
||||
|
@ -1,3 +1,5 @@
|
||||
云计算 5
|
||||
云计算 5
|
||||
李小福 2 nr
|
||||
创新办 3 i
|
||||
创新办 3 i
|
||||
easy_install 3 eng
|
||||
好用 300
|
Loading…
x
Reference in New Issue
Block a user