diff --git a/jieba/analyse/analyzer.py b/jieba/analyse/analyzer.py
index 54ea337..f1f7feb 100644
--- a/jieba/analyse/analyzer.py
+++ b/jieba/analyse/analyzer.py
@@ -1,6 +1,7 @@
 #encoding=utf-8
-from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter
+from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
 from whoosh.analysis import Tokenizer,Token
+from whoosh.lang.porter import stem
 import jieba
 import re
@@ -29,5 +30,6 @@ class ChineseTokenizer(Tokenizer):
             token.endchar = stop_pos
             yield token
 
-def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1):
-    return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)
\ No newline at end of file
+def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1,stemfn=stem,cachesize=50000):
+    return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)\
+        |StemFilter(stemfn=stemfn, ignore=None,cachesize=cachesize)
diff --git a/test/test_whoosh.py b/test/test_whoosh.py
index 1d7f747..9a7c033 100644
--- a/test/test_whoosh.py
+++ b/test/test_whoosh.py
@@ -59,5 +59,5 @@ for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交
         print hit.highlights("content")
     print "="*10
 
-for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream."):
+for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is interesting and interested me a lot"):
     print t.text
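
Below is a minimal usage sketch, not part of the patch, of how the extended analyzer might be exercised once jieba and Whoosh are installed. With StemFilter appended to the pipeline, English tokens are reduced to their Porter stems, so "interesting" and "interested" should both be emitted as "interest", while Chinese tokens pass through unchanged.

#encoding=utf-8
# Minimal sketch (assumes jieba and Whoosh are installed); Python 2,
# matching the print statements used in test/test_whoosh.py.
from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()
# The new StemFilter stage should reduce both English words below
# to the common Porter stem "interest".
for t in analyzer(u"this is interesting and interested me a lot"):
    print t.text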