mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
add 'search mode' for jieba.tokenize
This commit is contained in:
parent
237dc6625e
commit
f08690a2df
@ -335,11 +335,29 @@ def get_abs_path_dict():
|
|||||||
abs_path = os.path.join(_curpath,DICTIONARY)
|
abs_path = os.path.join(_curpath,DICTIONARY)
|
||||||
return abs_path
|
return abs_path
|
||||||
|
|
||||||
def tokenize(unicode_sentence, mode="default"):
    """Tokenize a sentence, yielding (word, start, end) position triples.

    Parameters:
        unicode_sentence: text to tokenize; must be a ``unicode`` object
            (this module is Python 2 code).
        mode: "default" yields each cut word exactly once with its
            character offsets; "search" additionally yields in-dictionary
            2-gram and 3-gram sub-words of longer words, giving the finer
            granularity useful for search-engine indexing.

    Yields:
        (word, start_offset, end_offset) tuples; offsets are character
        positions into ``unicode_sentence``.

    Raises:
        ValueError: if the input is not unicode.  (ValueError subclasses
            Exception, so existing callers catching Exception still work.)
    """
    if not isinstance(unicode_sentence, unicode):
        raise ValueError("jieba: the input parameter should be unicode.")
    start = 0
    if mode == 'default':
        for w in cut(unicode_sentence):
            width = len(w)
            yield (w, start, start + width)
            start += width
    else:
        for w in cut(unicode_sentence):
            width = len(w)
            # For longer words, also emit sub-words that appear in the
            # frequency dictionary so a search engine can match on
            # finer-grained terms.  Offsets stay relative to the full
            # sentence (start + i).
            if width > 2:
                for i in xrange(width - 1):
                    gram2 = w[i:i + 2]
                    if gram2 in FREQ:
                        yield (gram2, start + i, start + i + 2)
            if width > 3:
                for i in xrange(width - 2):
                    gram3 = w[i:i + 3]
                    if gram3 in FREQ:
                        yield (gram3, start + i, start + i + 3)
            # The whole word is always yielded last, after its sub-grams.
            yield (w, start, start + width)
            start += width
|
@ -3,15 +3,19 @@ import sys
|
|||||||
sys.path.append("../")
|
sys.path.append("../")
|
||||||
import jieba
|
import jieba
|
||||||
|
|
||||||
|
g_mode="default"
|
||||||
|
|
||||||
def cuttest(test_sent):
|
def cuttest(test_sent):
|
||||||
|
global g_mode
|
||||||
test_sent = test_sent.decode('utf-8')
|
test_sent = test_sent.decode('utf-8')
|
||||||
result = jieba.tokenize(test_sent)
|
result = jieba.tokenize(test_sent,mode=g_mode)
|
||||||
for tk in result:
|
for tk in result:
|
||||||
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
|
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
for m in ("default","search"):
|
||||||
|
g_mode = m
|
||||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||||
cuttest("我不喜欢日本和服。")
|
cuttest("我不喜欢日本和服。")
|
||||||
cuttest("雷猴回归人间。")
|
cuttest("雷猴回归人间。")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user