mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
Add new method cut_for_search(sentence), which improves recall when building a search engine's inverted index
This commit is contained in:
parent
d421fe5e7e
commit
5ce72e76b1
@ -157,6 +157,21 @@ def cut(sentence,cut_all=False):
|
|||||||
if x!="":
|
if x!="":
|
||||||
yield x
|
yield x
|
||||||
|
|
||||||
|
def cut_for_search(sentence):
    """Segment *sentence* and also emit shorter known sub-words.

    Every token produced by ``cut(sentence)`` is yielded. Before a token
    longer than 2 characters, each of its 2-character substrings that is
    present in ``FREQ`` is yielded (left to right); before a token longer
    than 3 characters, each of its 3-character substrings present in
    ``FREQ`` is yielded as well. The full token always comes last.
    """
    for token in cut(sentence):
        token_len = len(token)
        # Window sizes are processed in order, so all 2-grams of a token
        # are emitted before any of its 3-grams.
        for size in (2, 3):
            if token_len <= size:
                # A token must be strictly longer than the window to be split.
                continue
            for start in xrange(token_len - size + 1):
                piece = token[start:start + size]
                if piece in FREQ:
                    yield piece
        yield token
|
||||||
|
|
||||||
def load_userdict(f):
|
def load_userdict(f):
|
||||||
global trie,total,FREQ
|
global trie,total,FREQ
|
||||||
if isinstance(f, (str, unicode)):
|
if isinstance(f, (str, unicode)):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user