diff --git a/.gitignore b/.gitignore index 90f38e8..d5e1f8e 100644 --- a/.gitignore +++ b/.gitignore @@ -162,4 +162,5 @@ pip-log.txt # Mac crap .DS_Store *.log +test/tmp/* diff --git a/jieba/__init__.py b/jieba/__init__.py index bac3075..95c76fe 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -1,7 +1,9 @@ from __future__ import with_statement import re + import math -import os,sys +import os +import sys import pprint from . import finalseg import time @@ -103,16 +105,18 @@ def initialize(*args): def require_initialized(fn): - global initialized,DICTIONARY - - @wraps(fn) - def wrapped(*args, **kwargs): - if initialized: - return fn(*args, **kwargs) - else: - initialize(DICTIONARY) - return fn(*args, **kwargs) - return wrapped + global initialized,DICTIONARY + + @wraps(fn) + def wrapped(*args, **kwargs): + if initialized: + return fn(*args, **kwargs) + else: + initialize(DICTIONARY) + return fn(*args, **kwargs) + + return wrapped + def __cut_all(sentence): dag = get_DAG(sentence) @@ -212,7 +216,8 @@ def cut(sentence,cut_all=False): except UnicodeDecodeError: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)") + + re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)") if cut_all: re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]") @@ -223,9 +228,9 @@ def cut(sentence,cut_all=False): cut_block = __cut_all for blk in blocks: if re_han.match(blk): - #pprint.pprint(__cut_DAG(blk)) - for word in cut_block(blk): - yield word + #pprint.pprint(__cut_DAG(blk)) + for word in cut_block(blk): + yield word else: tmp = re_skip.split(blk) for x in tmp: diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py index 7d3fdb0..4e7f46a 100644 --- a/jieba/analyse/__init__.py +++ b/jieba/analyse/__init__.py @@ -1,6 +1,11 @@ import jieba import os +try: + from analyzer import ChineseAnalyzer +except ImportError: + pass + _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) ) f_name = os.path.join(_curpath,"idf.txt") content = open(f_name,'rb').read().decode('utf-8') diff --git a/jieba/finalseg/__init__.py b/jieba/finalseg/__init__.py index 6bc8759..8590743 100644 --- a/jieba/finalseg/__init__.py +++ b/jieba/finalseg/__init__.py @@ -59,7 +59,7 @@ def cut(sentence): except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)") + re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)") blocks = re_han.split(sentence) for blk in blocks: diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index 05c4567..bc680df 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -135,7 +135,7 @@ def __cut_internal(sentence): except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)") + re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)") re_eng,re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+") blocks = re_han.split(sentence) diff --git a/test/test.py b/test/test.py index af7d94f..9bdf1f4 100644 --- a/test/test.py +++ b/test/test.py @@ -96,4 +96,4 @@ if __name__ == "__main__": cuttest('枪杆子中出政权') cuttest('张三风同学走上了不归路') cuttest('阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。') - cuttest('在1号店能买到小S和大S八卦的书。') \ No newline at end of file + cuttest('在1号店能买到小S和大S八卦的书,还有3D电视。') diff --git a/test/test_whoosh.py b/test/test_whoosh.py index c1c9a7a..b72a967 100644 --- a/test/test_whoosh.py +++ b/test/test_whoosh.py @@ -1,5 +1,5 @@ # -*- coding: UTF-8 -*- -import sys +import sys,os sys.path.append("../") from whoosh.index import create_in,open_dir from whoosh.fields import * @@ -10,6 +10,9 @@ from jieba.analyse.analyzer import ChineseAnalyzer analyzer = ChineseAnalyzer() schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer)) +if not os.path.exists("tmp"): + os.mkdir("tmp") + ix = create_in("tmp", schema) # for create new index #ix = open_dir("tmp", schema=schema) # for read only writer = ix.writer() diff --git a/test/test_whoosh_flie.py b/test/test_whoosh_flie.py index c843d6b..3610b49 100644 --- a/test/test_whoosh_flie.py +++ b/test/test_whoosh_flie.py @@ -1,5 +1,6 @@ # -*- coding: UTF-8 -*- import sys +import os sys.path.append("../") from whoosh.index import create_in from whoosh.fields import * @@ -10,6 +11,8 @@ from jieba.analyse import ChineseAnalyzer analyzer = ChineseAnalyzer() schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer)) +if not os.path.exists("tmp"): + os.mkdir("tmp") ix = create_in("tmp", schema) writer = ix.writer() diff --git a/test/test_whoosh_flie_read.py b/test/test_whoosh_flie_read.py new file mode 100644 index 0000000..4528ae4 --- /dev/null +++ b/test/test_whoosh_flie_read.py @@ -0,0 +1,27 @@ +# -*- coding: UTF-8 -*- +import sys +import os +sys.path.append("../") +from whoosh.index import create_in,open_dir +from whoosh.fields import * +from whoosh.qparser import QueryParser + +from jieba.analyse import ChineseAnalyzer + +analyzer = ChineseAnalyzer() + +schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer)) +if not os.path.exists("tmp"): + os.mkdir("tmp") +ix = open_dir("tmp") + +searcher = ix.searcher() +parser = QueryParser("content", schema=ix.schema) + +for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换",u"少林",u"乔峰"): + print "result of ",keyword + q = parser.parse(keyword) + results = searcher.search(q) + for hit in results: + print hit.highlights("content") + print "="*10