From 31d58455353cdb6d42bc39fb66a57ee66edc2437 Mon Sep 17 00:00:00 2001 From: gan Date: Mon, 9 Sep 2013 11:54:30 +0800 Subject: [PATCH] Add better support for English by stemming tokens. For example, the input 'this is interesting and interested me' produces the output 'this interest interest', where 'interest' matches both 'interesting' and 'interested'. --- jieba/analyse/analyzer.py | 8 +++++--- test/test_whoosh.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/jieba/analyse/analyzer.py b/jieba/analyse/analyzer.py index 54ea337..f1f7feb 100644 --- a/jieba/analyse/analyzer.py +++ b/jieba/analyse/analyzer.py @@ -1,6 +1,7 @@ #encoding=utf-8 -from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter +from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter from whoosh.analysis import Tokenizer,Token +from whoosh.lang.porter import stem import jieba import re @@ -29,5 +30,6 @@ class ChineseTokenizer(Tokenizer): token.endchar = stop_pos yield token -def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1): - return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize) \ No newline at end of file +def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1,stemfn=stem,cachesize=50000): + return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)\ + |StemFilter(stemfn=stemfn, ignore=None,cachesize=cachesize) diff --git a/test/test_whoosh.py b/test/test_whoosh.py index 1d7f747..9a7c033 100644 --- a/test/test_whoosh.py +++ b/test/test_whoosh.py @@ -59,5 +59,5 @@ for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交 print hit.highlights("content") print "="*10 -for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream."): +for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"): print t.text