mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
Add better support for English by stemming tokens. For example, the input 'this is interesting and interested me' produces the output 'this interest interest', because 'interesting' and 'interested' are both reduced to the stem 'interest'.
This commit is contained in:
parent
7e7fcc1184
commit
31d5845535
@ -1,6 +1,7 @@
|
||||
#encoding=utf-8
|
||||
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter
|
||||
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
|
||||
from whoosh.analysis import Tokenizer,Token
|
||||
from whoosh.lang.porter import stem
|
||||
|
||||
import jieba
|
||||
import re
|
||||
@ -29,5 +30,6 @@ class ChineseTokenizer(Tokenizer):
|
||||
token.endchar = stop_pos
|
||||
yield token
|
||||
|
||||
def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1):
|
||||
return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)
|
||||
def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1,stemfn=stem,cachesize=50000):
|
||||
return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)\
|
||||
|StemFilter(stemfn=stemfn, ignore=None,cachesize=cachesize)
|
||||
|
@ -59,5 +59,5 @@ for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交
|
||||
print hit.highlights("content")
|
||||
print "="*10
|
||||
|
||||
for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream."):
|
||||
for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"):
|
||||
print t.text
|
||||
|
Loading…
x
Reference in New Issue
Block a user