merge from upstream

2025-07-10 00:01:33 +08:00 · 2014-11-15 14:06:03 +08:00 · 2014-11-15 14:06:03 +08:00 · 6b0da06481
commit 6b0da06481
parent a5ecf70f71 5c487dbcba
6 changed files with 17 additions and 14 deletions
--- a/11
+++ b/11
@@ -1,8 +1,11 @@
 2014-11-15: version 0.35.1
 1) fix Python 3.2的兼容性问题
 2014-11-13: version 0.35
-1. 改进词典cache的dump和加载机制；by @gumblex
+1) 改进词典cache的dump和加载机制；by @gumblex
-2. 提升关键词提取的性能; by @gumblex
+2）提升关键词提取的性能; by @gumblex
-3. 关键词提取新增基于textrank算法的子模块; by @singlee
+3）关键词提取新增基于textrank算法的子模块; by @singlee
-4. 修复自定义stopwords功能的bug; by @walkskyer
+4）修复自定义stopwords功能的bug; by @walkskyer
 2014-10-20: version 0.34
--- a/jieba/__init__.py
+++ b/jieba/__init__.py
@@ -242,9 +242,9 @@ def cut(sentence, cut_all=False, HMM=True):
    # \r\n|\s : whitespace characters. Will not be handled.
    if cut_all:
-        re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
+        re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
    else:
-        re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U)
+        re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
    blocks = re_han.split(sentence)
    if cut_all:
        cut_block = __cut_all
--- a/jieba/finalseg/__init__.py
+++ b/jieba/finalseg/__init__.py
@@ -88,7 +88,7 @@ def cut(sentence):
            sentence = sentence.decode('utf-8')
        except UnicodeDecodeError:
            sentence = sentence.decode('gbk', 'ignore')
-    re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)")
+    re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
    blocks = re_han.split(sentence)
    for blk in blocks:
        if re_han.match(blk):
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -104,8 +104,8 @@ def __cut(sentence):
        yield pair(sentence[next:], pos_list[next][1])
 def __cut_detail(sentence):
-    re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
+    re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
-    re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+    re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
    blocks = re_han.split(sentence)
    for blk in blocks:
        if re_han.match(blk):
@@ -129,7 +129,7 @@ def __cut_DAG_NO_HMM(sentence):
    x = 0
    N = len(sentence)
    buf = ''
-    re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
+    re_eng = re.compile('[a-zA-Z0-9]',re.U)
    while x < N:
        y = route[x][1]+1
        l_word = sentence[x:y]
@@ -194,8 +194,8 @@ def __cut_internal(sentence, HMM=True):
            sentence = sentence.decode('utf-8')
        except UnicodeDecodeError:
            sentence = sentence.decode('gbk', 'ignore')
-    re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)")
+    re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
-    re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+    re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
    blocks = re_han.split(sentence)
    if HMM:
        __cut_blk = __cut_DAG
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
 from distutils.core import setup
 setup(name='jieba3k',
-      version='0.35',
+      version='0.35.1',
      description='Chinese Words Segementation Utilities',
      author='Sun, Junyi',
      author_email='ccnusjy@gmail.com',