merge change from master

2025-07-10 00:01:33 +08:00 · 2013-07-16 11:06:41 +08:00 · 2013-07-16 11:06:41 +08:00 · 6549deabbd
commit 6549deabbd
parent 7334bedf5c d691d91674
9 changed files with 63 additions and 19 deletions
--- a/.gitignore
+++ b/.gitignore
@ -162,4 +162,5 @@ pip-log.txt
 # Mac crap
 .DS_Store
 *.log
 test/tmp/*
--- a/jieba/init.py
+++ b/jieba/init.py
@ -1,7 +1,9 @@
 from __future__ import with_statement
 import re
 import math
-import os,sys
+import os
 import sys
 import pprint
 from . import finalseg
 import time
@ -112,8 +114,10 @@ def require_initialized(fn):
        else:
            initialize(DICTIONARY)
            return fn(*args, **kwargs)
    return wrapped
 def __cut_all(sentence):
    dag = get_DAG(sentence)
    old_j = -1
@ -212,7 +216,8 @@ def cut(sentence,cut_all=False):
        except UnicodeDecodeError:
            sentence = sentence.decode('gbk','ignore')
-    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
+
    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
    if cut_all:
        re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]")
--- a/jieba/analyse/init.py
+++ b/jieba/analyse/init.py
@ -1,6 +1,11 @@
 import jieba
 import os
 try:
 	from analyzer import ChineseAnalyzer
 except ImportError:
 	pass
 _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) )  )
 f_name = os.path.join(_curpath,"idf.txt")
 content = open(f_name,'rb').read().decode('utf-8')
--- a/jieba/finalseg/init.py
+++ b/jieba/finalseg/init.py
@ -59,7 +59,7 @@ def cut(sentence):
        except:
            sentence = sentence.decode('gbk','ignore')
-    re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
+    re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
    blocks = re_han.split(sentence)
    for blk in blocks:
--- a/jieba/posseg/init.py
+++ b/jieba/posseg/init.py
@ -135,7 +135,7 @@ def __cut_internal(sentence):
        except:
            sentence = sentence.decode('gbk','ignore')
-    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
+    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
    re_eng,re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
    blocks = re_han.split(sentence)
--- a/test/test.py
+++ b/test/test.py
@ -96,4 +96,4 @@ if __name__ == "__main__":
    cuttest('枪杆子中出政权')
    cuttest('张三风同学走上了不归路')
    cuttest('阿Q腰间挂着BB机手里拿着大哥大，说：我一般吃饭不AA制的。')
-    cuttest('在1号店能买到小S和大S八卦的书。')
+    cuttest('在1号店能买到小S和大S八卦的书，还有3D电视。')
--- a/test/test_whoosh.py
+++ b/test/test_whoosh.py
@ -1,5 +1,5 @@
 # -*- coding: UTF-8 -*-
-import sys
+import sys,os
 sys.path.append("../")
 from whoosh.index import create_in,open_dir
 from whoosh.fields import *
@ -10,6 +10,9 @@ from jieba.analyse.analyzer import ChineseAnalyzer
 analyzer = ChineseAnalyzer()
 schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
 if not os.path.exists("tmp"):
    os.mkdir("tmp")
 ix = create_in("tmp", schema) # for create new index
 #ix = open_dir("tmp", schema=schema) # for read only
 writer = ix.writer()
--- a/test/test_whoosh_flie.py
+++ b/test/test_whoosh_flie.py
@ -1,5 +1,6 @@
 # -*- coding: UTF-8 -*-
 import sys
 import os
 sys.path.append("../")
 from whoosh.index import create_in
 from whoosh.fields import *
@ -10,6 +11,8 @@ from jieba.analyse import ChineseAnalyzer
 analyzer = ChineseAnalyzer()
 schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
 if not os.path.exists("tmp"):
    os.mkdir("tmp")
 ix = create_in("tmp", schema)
 writer = ix.writer()
--- a/test/test_whoosh_flie_read.py
+++ b/test/test_whoosh_flie_read.py
@ -0,0 +1,27 @@
 # -*- coding: UTF-8 -*-
 import sys
 import os
 sys.path.append("../")
 from whoosh.index import create_in,open_dir
 from whoosh.fields import *
 from whoosh.qparser import QueryParser
 from jieba.analyse import ChineseAnalyzer 
 # Read-side demo script: open the Whoosh index stored in "tmp" and print
 # highlighted matches from the "content" field for a few sample keywords.
 analyzer = ChineseAnalyzer()
 # Schema with stored title/path/content; "content" is analyzed by ChineseAnalyzer.
 # NOTE(review): `schema` is only assigned here -- the parser below reads
 # ix.schema from the opened index instead.
 schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
 # Ensure the "tmp" directory exists before opening it.
 # NOTE(review): if "tmp" had to be created here it contains no index yet, so
 # open_dir presumably fails; test_whoosh_flie.py calls create_in("tmp", ...)
 # and likely needs to be run first -- confirm.
 if not os.path.exists("tmp"):
    os.mkdir("tmp")
 # Open the index in "tmp"; this script only searches, it never creates a writer.
 ix = open_dir("tmp")
 searcher = ix.searcher()
 # Parse query strings against the "content" field using the index's own schema.
 parser = QueryParser("content", schema=ix.schema)
 # For each sample keyword: parse it, search, and print each hit's highlights.
 for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换",u"少林",u"乔峰"):
    print "result of ",keyword
    q = parser.parse(keyword)
    results = searcher.search(q)
    for hit in results:  
        print hit.highlights("content")
    # Separator line between the output of consecutive keywords.
    print "="*10