mirror of https://github.com/fxsjy/jieba.git
add ChineseAnalyzer for whoosh search engine
parent f08690a2df
commit efc784312c
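The commit adds a jieba-backed analyzer for Whoosh: a ChineseTokenizer that segments text with jieba's search mode, a ChineseAnalyzer factory exported through jieba.analyse, and two test scripts. A minimal usage sketch, not part of the diff, under the same assumptions the new test files make (Python 2, Whoosh installed); the index directory name, sample document and query are illustrative only:

# Sketch (not part of the diff): index one document with the new analyzer
# and query it back. "tmp" and the sample text mirror the test scripts below.
import os
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer

if not os.path.exists("tmp"):
    os.mkdir("tmp")

schema = Schema(path=ID(stored=True), content=TEXT(stored=True, analyzer=ChineseAnalyzer()))
ix = create_in("tmp", schema)
writer = ix.writer()
writer.add_document(path=u"/a", content=u"我爱北京天安门")
writer.commit()

searcher = ix.searcher()
query = QueryParser("content", schema=ix.schema).parse(u"北京")
for hit in searcher.search(query):
    print hit["path"]   # expected to print /a if u"北京" was indexed as a token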
@@ -1,5 +1,6 @@
import jieba
import os
from analyzer import ChineseAnalyzer

_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
f_name = os.path.join(_curpath,"idf.txt")
jieba/analyse/analyzer.py (new file, 33 lines)
@@ -0,0 +1,33 @@
#encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter
from whoosh.analysis import Tokenizer,Token

import jieba
import re

STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
                        'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
                        'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
                        'to', 'us', 'we', 'when', 'will', 'with', 'yet',
                        'you', 'your',u'的',u'了',u'和'))

accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")

class ChineseTokenizer(Tokenizer):
    def __call__(self,text,**kargs):
        words = jieba.tokenize(text,mode="search")
        token = Token()
        for (w,start_pos,stop_pos) in words:
            if not accepted_chars.match(w):
                if len(w)>1:
                    pass
                else:
                    continue
            token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token

def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1):
    return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)
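For reference, ChineseTokenizer iterates over jieba's search-mode output, where each item is a (word, start, end) tuple, and drops single-character tokens that are not Chinese. Search mode also emits the shorter dictionary words contained in long ones, which is why the test below queries both u"交换机" and u"交换". A minimal sketch of that input, assuming jieba.tokenize behaves as it is used above; the sample sentence is illustrative only:

# Sketch (not part of the diff): what ChineseTokenizer consumes.
import jieba

for w, start, end in jieba.tokenize(u"我爱北京天安门", mode="search"):
    print w, start, end   # e.g. 我 0 1, 爱 1 2, 北京 2 4, ...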
test/test_whoosh.py (new file, 59 lines)
@@ -0,0 +1,59 @@
# -*- coding: UTF-8 -*-
import sys
sys.path.append("../")
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser

from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()

schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
ix = create_in("tmp", schema)
writer = ix.writer()

writer.add_document(
    title=u"document1",
    path=u"/a",
    content=u"This is the first document we’ve added!"
)

writer.add_document(
    title=u"document2",
    path=u"/b",
    content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
)

writer.add_document(
    title=u"document3",
    path=u"/c",
    content=u"买水果然后来世博园。"
)

writer.add_document(
    title=u"document4",
    path=u"/c",
    content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
)

writer.add_document(
    title=u"document4",
    path=u"/c",
    content=u"咱俩交换一下吧。"
)

writer.commit()
searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)

for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交换"):
    print "result of ",keyword
    q = parser.parse(keyword)
    results = searcher.search(q)
    for hit in results:
        print hit.highlights("content")
    print "="*10

for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft"):
    print t.text
test/test_whoosh_flie.py (new file, 38 lines)
@@ -0,0 +1,38 @@
# -*- coding: UTF-8 -*-
import sys
sys.path.append("../")
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser

from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()

schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
ix = create_in("tmp", schema)
writer = ix.writer()

file_name = sys.argv[1]

with open(file_name,"rb") as inf:
    i=0
    for line in inf:
        i+=1
        writer.add_document(
            title=u"line"+str(i),
            path=u"/a",
            content=line.decode('gbk','ignore')
        )
writer.commit()

searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)

for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换"):
    print "result of ",keyword
    q = parser.parse(keyword)
    results = searcher.search(q)
    for hit in results:
        print hit.highlights("content")
    print "="*10
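A usage note, not stated in the diff: the second script takes the path of a GBK-encoded text file as its only command-line argument, e.g. python test_whoosh_flie.py corpus_gbk.txt (the file name here is only an illustration), and indexes the file line by line. Both test scripts expect Python 2 with Whoosh installed and an existing tmp directory for the index.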