mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
merge from upstream
This commit is contained in:
commit
6b0da06481
11
Changelog
11
Changelog
@ -1,8 +1,11 @@
|
|||||||
|
2014-11-15: version 0.35.1
|
||||||
|
1) 修复 Python 3.2 的兼容性问题
|
||||||
|
|
||||||
2014-11-13: version 0.35
|
2014-11-13: version 0.35
|
||||||
1. 改进词典cache的dump和加载机制;by @gumblex
|
1) 改进词典cache的dump和加载机制; by @gumblex
|
||||||
2. 提升关键词提取的性能; by @gumblex
|
2) 提升关键词提取的性能; by @gumblex
|
||||||
3. 关键词提取新增基于textrank算法的子模块; by @singlee
|
3) 关键词提取新增基于textrank算法的子模块; by @singlee
|
||||||
4. 修复自定义stopwords功能的bug; by @walkskyer
|
4) 修复自定义stopwords功能的bug; by @walkskyer
|
||||||
|
|
||||||
|
|
||||||
2014-10-20: version 0.34
|
2014-10-20: version 0.34
|
||||||
|
@ -242,9 +242,9 @@ def cut(sentence, cut_all=False, HMM=True):
|
|||||||
# \r\n|\s : whitespace characters. Will not be handled.
|
# \r\n|\s : whitespace characters. Will not be handled.
|
||||||
|
|
||||||
if cut_all:
|
if cut_all:
|
||||||
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
|
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
|
||||||
else:
|
else:
|
||||||
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U)
|
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
if cut_all:
|
if cut_all:
|
||||||
cut_block = __cut_all
|
cut_block = __cut_all
|
||||||
|
@ -88,7 +88,7 @@ def cut(sentence):
|
|||||||
sentence = sentence.decode('utf-8')
|
sentence = sentence.decode('utf-8')
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
sentence = sentence.decode('gbk', 'ignore')
|
sentence = sentence.decode('gbk', 'ignore')
|
||||||
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)")
|
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
for blk in blocks:
|
for blk in blocks:
|
||||||
if re_han.match(blk):
|
if re_han.match(blk):
|
||||||
|
@ -104,8 +104,8 @@ def __cut(sentence):
|
|||||||
yield pair(sentence[next:], pos_list[next][1])
|
yield pair(sentence[next:], pos_list[next][1])
|
||||||
|
|
||||||
def __cut_detail(sentence):
|
def __cut_detail(sentence):
|
||||||
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
|
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
|
||||||
re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
|
re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
for blk in blocks:
|
for blk in blocks:
|
||||||
if re_han.match(blk):
|
if re_han.match(blk):
|
||||||
@ -129,7 +129,7 @@ def __cut_DAG_NO_HMM(sentence):
|
|||||||
x = 0
|
x = 0
|
||||||
N = len(sentence)
|
N = len(sentence)
|
||||||
buf = ''
|
buf = ''
|
||||||
re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
|
re_eng = re.compile('[a-zA-Z0-9]',re.U)
|
||||||
while x < N:
|
while x < N:
|
||||||
y = route[x][1]+1
|
y = route[x][1]+1
|
||||||
l_word = sentence[x:y]
|
l_word = sentence[x:y]
|
||||||
@ -194,8 +194,8 @@ def __cut_internal(sentence, HMM=True):
|
|||||||
sentence = sentence.decode('utf-8')
|
sentence = sentence.decode('utf-8')
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
sentence = sentence.decode('gbk', 'ignore')
|
sentence = sentence.decode('gbk', 'ignore')
|
||||||
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)")
|
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
|
||||||
re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
|
re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
|
||||||
blocks = re_han.split(sentence)
|
blocks = re_han.split(sentence)
|
||||||
if HMM:
|
if HMM:
|
||||||
__cut_blk = __cut_DAG
|
__cut_blk = __cut_DAG
|
||||||
|
2
setup.py
2
setup.py
@ -1,6 +1,6 @@
|
|||||||
from distutils.core import setup
|
from distutils.core import setup
|
||||||
setup(name='jieba3k',
|
setup(name='jieba3k',
|
||||||
version='0.35',
|
version='0.35.1',
|
||||||
description='Chinese Words Segementation Utilities',
|
description='Chinese Words Segementation Utilities',
|
||||||
author='Sun, Junyi',
|
author='Sun, Junyi',
|
||||||
author_email='ccnusjy@gmail.com',
|
author_email='ccnusjy@gmail.com',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user