def cut_for_search(sentence):
    """Segment *sentence* and also emit shorter dictionary sub-words.

    Intended to improve recall when the output feeds a search-engine index.

    Each word produced by ``cut(sentence)`` is processed in turn:

    * if the word is longer than 2 characters, every 2-character window
      that is a key of ``FREQ`` is yielded, left to right;
    * if the word is longer than 3 characters, every 3-character window
      that is a key of ``FREQ`` is yielded, left to right;
    * finally the word itself is yielded.

    This is a generator; nothing is computed until it is iterated.
    """
    for token in cut(sentence):
        size = len(token)
        # Window widths are tried in ascending order so that all 2-grams
        # of a word precede its 3-grams, and both precede the full word.
        for width in (2, 3):
            if size <= width:
                # Only words strictly longer than the window are split.
                continue
            for start in xrange(size - width + 1):
                piece = token[start:start + width]
                if piece in FREQ:
                    yield piece
        yield token