Add better support for English via stemming. Example: input 'this is interesting and interested me' --> output 'this interest interest', where the stem 'interest' matches both 'interesting' and 'interested'.

This commit is contained in:
gan 2013-09-09 11:54:30 +08:00
parent 7e7fcc1184
commit 31d5845535
2 changed files with 6 additions and 4 deletions

View File

@ -1,6 +1,7 @@
#encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem
import jieba
import re
@ -29,5 +30,6 @@ class ChineseTokenizer(Tokenizer):
token.endchar = stop_pos
yield token
def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1):
    """Compose the analyzer chain: tokenizer, lowercasing, stop filtering.

    stoplist and minsize are handed unchanged to StopFilter.
    Returns the object produced by piping the three components together.
    """
    lowered = ChineseTokenizer() | LowercaseFilter()
    return lowered | StopFilter(stoplist=stoplist, minsize=minsize)
def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1,stemfn=stem,cachesize=50000):
    """Compose the analyzer chain: tokenizer, lowercasing, stop filtering, stemming.

    stoplist and minsize are handed unchanged to StopFilter; stemfn and
    cachesize are handed unchanged to StemFilter (whose ignore argument is
    always None here). Returns the object produced by piping the four
    components together in that order.
    """
    # Built one stage at a time; the order matches the single chained
    # expression "tokenizer | lowercase | stop | stem".
    pipeline = ChineseTokenizer() | LowercaseFilter()
    pipeline = pipeline | StopFilter(stoplist=stoplist, minsize=minsize)
    return pipeline | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize)

View File

@ -59,5 +59,5 @@ for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交
print hit.highlights("content")
print "="*10
for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream."):
for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"):
print t.text