Compare commits

..

No commits in common. "master" and "v0.41" have entirely different histories.

37 changed files with 73 additions and 72 deletions

View File

@@ -1,11 +1,3 @@
2019-1-20: version 0.42.1
1. 修复setup.py在python2.7版本无法工作的问题 (issue #809)
2019-1-13: version 0.42
1. 修复paddle模式空字符串coredump问题 @JesseyXujin
2. 修复cut_all模式切分丢字问题 @fxsjy
3. paddle安装检测优化 @vissssa
2019-1-8: version 0.41
1. 开启paddle模式更友好
2. 修复cut_all模式不支持中英混合词的bug

View File

@@ -13,7 +13,7 @@ jieba
* 精确模式,试图将句子最精确地切开,适合文本分析;
* 全模式,把句子中所有的可以成词的词语都扫描出来, 速度非常快,但是不能解决歧义;
* 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。
* paddle模式利用PaddlePaddle深度学习框架训练序列标注双向GRU网络模型实现分词。同时支持词性标注。paddle模式使用需安装paddlepaddle-tiny`pip install paddlepaddle-tiny==1.6.1`。目前paddle模式支持jieba v0.40及以上版本。jieba v0.40以下版本请升级jieba`pip install jieba --upgrade` 。[PaddlePaddle官网](https://www.paddlepaddle.org.cn/)
* paddle模式利用PaddlePaddle深度学习框架训练序列标注双向GRU网络模型实现分词。同时支持词性标注。paddle模式使用需安装paddlepaddle-tiny`pip install paddlepaddle-tiny==1.6.1`。目前paddle模式支持jieba v0.40及以上版本。jieba v0.40以下版本请升级jieba`pip install jieba --upgrade` 。[PaddlePaddle官网](www.paddlepaddle.org.cn)
* 支持繁体分词
* 支持自定义词典
* MIT 授权协议

View File

@@ -1,24 +1,26 @@
from __future__ import absolute_import, unicode_literals
__version__ = '0.42.1'
__version__ = '0.41'
__license__ = 'MIT'
import marshal
import re
import os
import sys
import time
import logging
import marshal
import tempfile
import threading
import time
from hashlib import md5
from math import log
from . import finalseg
from hashlib import md5
from ._compat import *
from . import finalseg
if os.name == 'nt':
from shutil import move as _replace_file
else:
_replace_file = os.rename
_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))
DEFAULT_DICT = None
@@ -45,11 +47,10 @@ re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
re_skip_default = re.compile("(\r\n|\s)", re.U)
def setLogLevel(log_level):
global logger
default_logger.setLevel(log_level)
class Tokenizer(object):
def __init__(self, dictionary=DEFAULT_DICT):
@@ -68,8 +69,7 @@ class Tokenizer(object):
def __repr__(self):
return '<Tokenizer dictionary=%r>' % self.dictionary
@staticmethod
def gen_pfdict(f):
def gen_pfdict(self, f):
lfreq = {}
ltotal = 0
f_name = resolve_filename(f)
@@ -128,7 +128,7 @@ class Tokenizer(object):
load_from_cache_fail = True
if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
default_logger.debug(
"Loading model from cache %s" % cache_file)
try:
@@ -201,26 +201,25 @@ class Tokenizer(object):
eng_scan = 0
eng_buf = u''
for k, L in iteritems(dag):
if eng_scan == 1 and not re_eng.match(sentence[k]):
if eng_scan==1 and not re_eng.match(sentence[k]):
eng_scan = 0
yield eng_buf
if len(L) == 1 and k > old_j:
word = sentence[k:L[0] + 1]
if re_eng.match(word):
if re_eng.match(sentence[k]):
if eng_scan == 0:
eng_scan = 1
eng_buf = word
eng_buf = sentence[k]
else:
eng_buf += word
eng_buf += sentence[k]
if eng_scan == 0:
yield word
yield sentence[k:L[0] + 1]
old_j = L[0]
else:
for j in L:
if j > k:
yield sentence[k:j + 1]
old_j = j
if eng_scan == 1:
if eng_scan==1:
yield eng_buf
def __cut_DAG_NO_HMM(self, sentence):
@@ -286,8 +285,8 @@ class Tokenizer(object):
for elem in buf:
yield elem
def cut(self, sentence, cut_all=False, HMM=True, use_paddle=False):
"""
def cut(self, sentence, cut_all = False, HMM = True,use_paddle = False):
'''
The main function that segments an entire sentence that contains
Chinese characters into separated words.
@@ -295,12 +294,14 @@ class Tokenizer(object):
- sentence: The str(unicode) to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
"""
is_paddle_installed = check_paddle_install['is_paddle_installed']
'''
is_paddle_installed = False
if use_paddle == True:
is_paddle_installed = check_paddle_install()
sentence = strdecode(sentence)
if use_paddle and is_paddle_installed:
# if sentence is null, it will raise core exception in paddle.
if sentence is None or len(sentence) == 0:
if use_paddle == True and is_paddle_installed == True:
if sentence is None or sentence == "" or sentence == u"":
yield sentence
return
import jieba.lac_small.predict as predict
results = predict.get_sent(sentence)

View File

@@ -1,55 +1,49 @@
# -*- coding: utf-8 -*-
import logging
import os
import sys
import logging
log_console = logging.StreamHandler(sys.stderr)
default_logger = logging.getLogger(__name__)
default_logger.setLevel(logging.DEBUG)
def setLogLevel(log_level):
global logger
default_logger.setLevel(log_level)
check_paddle_install = {'is_paddle_installed': False}
try:
import pkg_resources
get_module_res = lambda *res: pkg_resources.resource_stream(__name__,
os.path.join(*res))
except ImportError:
get_module_res = lambda *res: open(os.path.normpath(os.path.join(
os.getcwd(), os.path.dirname(__file__), *res)), 'rb')
os.getcwd(), os.path.dirname(__file__), *res)), 'rb')
def enable_paddle():
import_paddle_check = False
try:
import paddle
except ImportError:
default_logger.debug("Installing paddle-tiny, please wait a minute......")
os.system("pip install paddlepaddle-tiny")
try:
import paddle
except ImportError:
default_logger.debug(
"Import paddle error, please use command to install: pip install paddlepaddle-tiny==1.6.1."
"Now, back to jieba basic cut......")
try:
import paddle
except ImportError:
default_logger.debug("Import paddle error, please use command to install: pip install paddlepaddle-tiny==1.6.1."
"Now, back to jieba basic cut......")
if paddle.__version__ < '1.6.1':
default_logger.debug("Find your own paddle version doesn't satisfy the minimum requirement (1.6.1), "
"please install paddle tiny by 'pip install --upgrade paddlepaddle-tiny', "
"or upgrade paddle full version by "
"'pip install --upgrade paddlepaddle (-gpu for GPU version)' ")
"or upgrade paddle full version by 'pip install --upgrade paddlepaddle (-gpu for GPU version)' ")
else:
try:
import jieba.lac_small.predict as predict
import_paddle_check = True
default_logger.debug("Paddle enabled successfully......")
check_paddle_install['is_paddle_installed'] = True
except ImportError:
default_logger.debug("Import error, cannot find paddle.fluid and jieba.lac_small.predict module. "
"Now, back to jieba basic cut......")
"Now, back to jieba basic cut......")
PY2 = sys.version_info[0] == 2
@@ -72,7 +66,6 @@ else:
itervalues = lambda d: iter(d.values())
iteritems = lambda d: iter(d.items())
def strdecode(sentence):
if not isinstance(sentence, text_type):
try:
@@ -81,9 +74,25 @@ def strdecode(sentence):
sentence = sentence.decode('gbk', 'ignore')
return sentence
def resolve_filename(f):
try:
return f.name
except AttributeError:
return repr(f)
def check_paddle_install():
is_paddle_installed = False
try:
import paddle
if paddle.__version__ >= '1.6.1':
is_paddle_installed = True
else:
is_paddle_installed = False
default_logger.debug("Check the paddle version is not correct, the current version is "+ paddle.__version__+","
"please use command to install paddle: pip uninstall paddlepaddle(-gpu), "
"pip install paddlepaddle-tiny==1.6.1. Now, back to jieba basic cut......")
except ImportError:
default_logger.debug("Import paddle error, back to jieba basic cut......")
is_paddle_installed = False
return is_paddle_installed

0
jieba/lac_small/__init__.py Normal file → Executable file
View File

0
jieba/lac_small/creator.py Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/crfw Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/fc_0.b_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/fc_0.w_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/fc_1.b_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/fc_1.w_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/fc_2.b_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/fc_2.w_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/fc_3.b_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/fc_3.w_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/fc_4.b_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/fc_4.w_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/gru_0.b_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/gru_0.w_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/gru_1.b_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/gru_1.w_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/gru_2.b_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/gru_2.w_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/gru_3.b_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/gru_3.w_0 Normal file → Executable file
View File

0
jieba/lac_small/model_baseline/word_emb Normal file → Executable file
View File

0
jieba/lac_small/nets.py Normal file → Executable file
View File

0
jieba/lac_small/predict.py Normal file → Executable file
View File

0
jieba/lac_small/reader_small.py Normal file → Executable file
View File

0
jieba/lac_small/tag.dic Normal file → Executable file
View File

0
jieba/lac_small/utils.py Normal file → Executable file
View File

0
jieba/lac_small/word.dic Normal file → Executable file
View File

View File

@ -1,11 +1,11 @@
from __future__ import absolute_import, unicode_literals
import pickle
import os
import re
import sys
import jieba
from .viterbi import viterbi
import pickle
from .._compat import *
from .viterbi import viterbi
PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p"
@@ -252,7 +252,6 @@ class POSTokenizer(object):
def lcut(self, *args, **kwargs):
return list(self.cut(*args, **kwargs))
# default Tokenizer instance
dt = POSTokenizer(jieba.dt)
@@ -277,17 +276,19 @@ def cut(sentence, HMM=True, use_paddle=False):
Note that this only works using dt, custom POSTokenizer
instances are not supported.
"""
is_paddle_installed = check_paddle_install['is_paddle_installed']
if use_paddle and is_paddle_installed:
# if sentence is null, it will raise core exception in paddle.
is_paddle_installed = False
if use_paddle == True:
is_paddle_installed = check_paddle_install()
if use_paddle==True and is_paddle_installed == True:
if sentence is None or sentence == "" or sentence == u"":
yield pair(None, None)
return
import jieba.lac_small.predict as predict
sents, tags = predict.get_result(strdecode(sentence))
for i, sent in enumerate(sents):
sents,tags = predict.get_result(strdecode(sentence))
for i,sent in enumerate(sents):
if sent is None or tags[i] is None:
continue
yield pair(sent, tags[i])
yield pair(sent,tags[i])
return
global dt
if jieba.pool is None:

View File

@@ -43,7 +43,7 @@ GitHub: https://github.com/fxsjy/jieba
"""
setup(name='jieba',
version='0.42.1',
version='0.41',
description='Chinese Words Segmentation Utilities',
long_description=LONGDOC,
author='Sun, Junyi',
@@ -71,5 +71,5 @@ setup(name='jieba',
keywords='NLP,tokenizing,Chinese word segementation',
packages=['jieba'],
package_dir={'jieba':'jieba'},
package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*', 'lac_small/*.py','lac_small/*.dic', 'lac_small/model_baseline/*']}
package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*', 'lac_small/*','lac_small/model_baseline/*']}
)

View File

@@ -98,4 +98,3 @@ if __name__ == "__main__":
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
jieba.add_word('超敏C反应蛋白')
cuttest('超敏C反应蛋白是什么, java好学吗?,小潘老板都学Python')
cuttest('steel健身爆发力运动兴奋补充剂')

View File

@@ -2,7 +2,7 @@
import sys
sys.path.append("../")
import jieba
jieba.enable_paddle()
def cuttest(test_sent):
result = jieba.cut(test_sent, use_paddle=True)

View File

@@ -2,8 +2,7 @@
import sys
sys.path.append("../")
import jieba.posseg as pseg
import jieba
jieba.enable_paddle()
def cuttest(test_sent):
result = pseg.cut(test_sent, use_paddle=True)