merge change from master

2025-07-10 00:01:33 +08:00 · 2013-07-16 11:06:41 +08:00 · 2013-07-16 11:06:41 +08:00 · 6549deabbd
commit 6549deabbd
parent 7334bedf5c d691d91674
9 changed files with 63 additions and 19 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -162,4 +162,5 @@ pip-log.txt
 # Mac crap
 .DS_Store
 *.log
+test/tmp/*

--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -1,7 +1,9 @@
 from __future__ import with_statement
 import re
+
 import math
-import os,sys
+import os
+import sys
 import pprint
 from . import finalseg
 import time
@@ -103,16 +105,18 @@ def initialize(*args):


 def require_initialized(fn):
-        global initialized,DICTIONARY
+    global initialized,DICTIONARY
+
+    @wraps(fn)
+    def wrapped(*args, **kwargs):
+        if initialized:
+            return fn(*args, **kwargs)
+        else:
+            initialize(DICTIONARY)
+            return fn(*args, **kwargs)
+
+    return wrapped

-        @wraps(fn)
-        def wrapped(*args, **kwargs):
-            if initialized:
-                return fn(*args, **kwargs)
-            else:
-                initialize(DICTIONARY)
-                return fn(*args, **kwargs)
-        return wrapped

 def __cut_all(sentence):
    dag = get_DAG(sentence)
@@ -212,7 +216,8 @@ def cut(sentence,cut_all=False):
        except UnicodeDecodeError:
            sentence = sentence.decode('gbk','ignore')

-    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
+
+    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")

    if cut_all:
        re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]")
@@ -223,9 +228,9 @@ def cut(sentence,cut_all=False):
        cut_block = __cut_all
    for blk in blocks:
        if re_han.match(blk):
-                #pprint.pprint(__cut_DAG(blk))
-                for word in cut_block(blk):
-                    yield word
+            #pprint.pprint(__cut_DAG(blk))
+            for word in cut_block(blk):
+                yield word
        else:
            tmp = re_skip.split(blk)
            for x in tmp:
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -1,6 +1,11 @@
 import jieba
 import os

+try:
+	from analyzer import ChineseAnalyzer
+except ImportError:
+	pass
+
 _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) )  )
 f_name = os.path.join(_curpath,"idf.txt")
 content = open(f_name,'rb').read().decode('utf-8')
--- a/jieba/finalseg/__init__.py
+++ b/jieba/finalseg/__init__.py
@@ -59,7 +59,7 @@ def cut(sentence):
        except:
            sentence = sentence.decode('gbk','ignore')

-    re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
+    re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")

    blocks = re_han.split(sentence)
    for blk in blocks:
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -135,7 +135,7 @@ def __cut_internal(sentence):
        except:
            sentence = sentence.decode('gbk','ignore')

-    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
+    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
    re_eng,re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")

    blocks = re_han.split(sentence)
--- a/test/test.py
+++ b/test/test.py
@@ -96,4 +96,4 @@ if __name__ == "__main__":
    cuttest('枪杆子中出政权')
    cuttest('张三风同学走上了不归路')
    cuttest('阿Q腰间挂着BB机手里拿着大哥大，说：我一般吃饭不AA制的。')
-    cuttest('在1号店能买到小S和大S八卦的书。')
+    cuttest('在1号店能买到小S和大S八卦的书，还有3D电视。')
--- a/test/test_whoosh.py
+++ b/test/test_whoosh.py
@@ -1,5 +1,5 @@
 # -*- coding: UTF-8 -*-
-import sys
+import sys,os
 sys.path.append("../")
 from whoosh.index import create_in,open_dir
 from whoosh.fields import *
@@ -10,6 +10,9 @@ from jieba.analyse.analyzer import ChineseAnalyzer
 analyzer = ChineseAnalyzer()

 schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
+if not os.path.exists("tmp"):
+    os.mkdir("tmp")
+
 ix = create_in("tmp", schema) # for create new index
 #ix = open_dir("tmp", schema=schema) # for read only
 writer = ix.writer()
--- a/test/test_whoosh_flie.py
+++ b/test/test_whoosh_flie.py
@@ -1,5 +1,6 @@
 # -*- coding: UTF-8 -*-
 import sys
+import os
 sys.path.append("../")
 from whoosh.index import create_in
 from whoosh.fields import *
@@ -10,6 +11,8 @@ from jieba.analyse import ChineseAnalyzer
 analyzer = ChineseAnalyzer()

 schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
+if not os.path.exists("tmp"):
+    os.mkdir("tmp")
 ix = create_in("tmp", schema)
 writer = ix.writer()

--- a/test/test_whoosh_flie_read.py
+++ b/test/test_whoosh_flie_read.py
@@ -0,0 +1,27 @@
+# -*- coding: UTF-8 -*-
+import sys
+import os
+sys.path.append("../")
+from whoosh.index import create_in,open_dir
+from whoosh.fields import *
+from whoosh.qparser import QueryParser
+
+from jieba.analyse import ChineseAnalyzer 
+
+analyzer = ChineseAnalyzer()
+
+schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
+if not os.path.exists("tmp"):
+    os.mkdir("tmp")
+ix = open_dir("tmp")
+
+searcher = ix.searcher()
+parser = QueryParser("content", schema=ix.schema)
+
+for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换",u"少林",u"乔峰"):
+    print "result of ",keyword
+    q = parser.parse(keyword)
+    results = searcher.search(q)
+    for hit in results:  
+        print hit.highlights("content")
+    print "="*10