mirror of https://github.com/fxsjy/jieba.git
add 'search mode' for jieba.tokenize
commit f08690a2df
parent 237dc6625e

@@ -335,11 +335,29 @@ def get_abs_path_dict():
     abs_path = os.path.join(_curpath,DICTIONARY)
     return abs_path
 
-def tokenize(unicode_sentence):
+def tokenize(unicode_sentence,mode="default"):
+    #mode ("default" or "search")
     if not isinstance(unicode_sentence, unicode):
         raise Exception("jieba: the input parameter should unicode.")
     start = 0
-    for w in cut(unicode_sentence):
-        width = len(w)
-        yield (w,start,start+width)
-        start+=width
+    if mode=='default':
+        for w in cut(unicode_sentence):
+            width = len(w)
+            yield (w,start,start+width)
+            start+=width
+    else:
+        for w in cut(unicode_sentence):
+            width = len(w)
+            if len(w)>2:
+                for i in xrange(len(w)-1):
+                    gram2 = w[i:i+2]
+                    if gram2 in FREQ:
+                        yield (gram2,start+i,start+i+2)
+            if len(w)>3:
+                for i in xrange(len(w)-2):
+                    gram3 = w[i:i+3]
+                    if gram3 in FREQ:
+                        yield (gram3,start+i,start+i+3)
+            yield (w,start,start+width)
+            start+=width
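In search mode, tokenize() still yields every segmented word with its (start, end) offsets, but before each word it also scans the word for 2- and 3-character substrings that are themselves entries in the frequency dictionary FREQ, and yields those with their offsets too, giving finer-grained terms for a search index. The standalone sketch below restates that logic in modern Python 3; `words` and `freq` are illustrative stand-ins for the output of jieba.cut() and jieba's FREQ dictionary, not part of jieba's API.

    # A minimal Python 3 sketch of the search-mode logic added above.
    # `words` and `freq` are stand-ins for the output of jieba.cut()
    # and jieba's FREQ dictionary; they are assumptions, not jieba APIs.

    def tokenize_search(words, freq):
        """Yield (token, start, end), emitting in-dictionary 2-grams
        and 3-grams inside each word before the word itself."""
        start = 0
        for w in words:
            width = len(w)
            if width > 2:
                for i in range(width - 1):
                    gram2 = w[i:i + 2]
                    if gram2 in freq:
                        yield (gram2, start + i, start + i + 2)
            if width > 3:
                for i in range(width - 2):
                    gram3 = w[i:i + 3]
                    if gram3 in freq:
                        yield (gram3, start + i, start + i + 3)
            yield (w, start, start + width)
            start += width

    # Toy run: one 4-character "word" whose halves are dictionary entries.
    words = ["中国科学"]
    freq = {"中国", "科学"}
    print(list(tokenize_search(words, freq)))
    # [('中国', 0, 2), ('科学', 2, 4), ('中国科学', 0, 4)]

Note that the full word is always yielded last with its original span, so search mode is strictly additive: the coarse tokens and their offsets are unchanged, and the extra n-grams only add recall when the output is used to build an index.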
@@ -3,15 +3,19 @@ import sys
 sys.path.append("../")
 import jieba
 
+g_mode="default"
 
 def cuttest(test_sent):
+    global g_mode
     test_sent = test_sent.decode('utf-8')
-    result = jieba.tokenize(test_sent)
+    result = jieba.tokenize(test_sent,mode=g_mode)
     for tk in result:
         print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
 
 
 if __name__ == "__main__":
-    cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
-    cuttest("我不喜欢日本和服。")
-    cuttest("雷猴回归人间。")
+    for m in ("default","search"):
+        g_mode = m
+        cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
+        cuttest("我不喜欢日本和服。")
+        cuttest("雷猴回归人间。")
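The test driver is Python 2 (print statements, an explicit .decode('utf-8')). For reference, a Python 3 equivalent might look like the sketch below; it assumes a current jieba release, where tokenize() still accepts mode="default" or mode="search" and str is already unicode, so no decoding step is needed.

    # Python 3 rendering of the test above (assumes `pip install jieba`).
    import jieba

    def cuttest(test_sent, mode):
        # str is unicode in Python 3, so no .decode('utf-8') is required
        for word, start, end in jieba.tokenize(test_sent, mode=mode):
            print("word %s\t\t start: %d \t\t end: %d" % (word, start, end))

    if __name__ == "__main__":
        for m in ("default", "search"):
            print("--- mode: %s ---" % m)
            cuttest("我叫孙悟空,我爱北京,我爱Python和C++。", m)

Running both modes over the same sentence makes the difference visible: default mode prints each word once with contiguous spans, while search mode additionally prints the in-dictionary sub-words with overlapping spans.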