Merge pull request #106 from jannson/master

Add better support for English in ChineseAnalyzer
Sun Junyi 2013-09-16 23:58:46 -07:00
commit fd96527f71
2 changed files with 6 additions and 4 deletions

View File

@@ -1,6 +1,7 @@
 #encoding=utf-8
-from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter
+from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
 from whoosh.analysis import Tokenizer,Token
+from whoosh.lang.porter import stem
 import jieba
 import re
@@ -29,5 +30,6 @@ class ChineseTokenizer(Tokenizer):
             token.endchar = stop_pos
             yield token

-def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1):
-    return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)
+def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1,stemfn=stem,cachesize=50000):
+    return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)\
+        |StemFilter(stemfn=stemfn, ignore=None,cachesize=cachesize)
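With this change, English tokens pass through a Porter StemFilter after lowercasing and stop-word removal, so inflected forms normalize to a common stem. A minimal sketch of the new pipeline in use (the import path follows jieba's whoosh test; the sample sentence is made up):

#encoding=utf-8
from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()
# Mixed Chinese/English input; English words are lowercased, stop-filtered,
# and stemmed, while Chinese tokens pass through the stemmer unchanged.
for t in analyzer(u"Searching interested users is interesting; 搜索很有趣"):
    print t.text

Because stemming happens at analysis time, the same normalization applies when indexing and when parsing queries, so a query for "interest" can match documents containing "interesting" or "interested".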

View File

@@ -59,5 +59,5 @@ for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交
         print hit.highlights("content")
 print "="*10
-for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream."):
+for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is interesting and interested me a lot"):
     print t.text
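The extra English clause exercises the new stemming stage. For reference, a quick check against the stem function the analyzer now imports (a sketch; outputs assume Whoosh's standard Porter rules):

from whoosh.lang.porter import stem

for w in ("interesting", "interested", "interest"):
    print w, "->", stem(w)
# All three should print the same stem, "interest", so the two English
# words added to the test string collapse to one index term.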