mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-24 00:00:05 +08:00
Compare commits
16 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
67fa2e36e7 | ||
|
1e20c89b66 | ||
|
5704e23bbf | ||
|
aa65031788 | ||
|
2eb11c8028 | ||
|
d703bce302 | ||
|
dc2b788eb3 | ||
|
0868c323d9 | ||
|
eb37e048da | ||
|
381b0691ac | ||
|
97c32464e1 | ||
|
0489a6979e | ||
|
30ea8f929e | ||
|
0b74b6c2de | ||
|
2fdee89883 | ||
|
17bab6a2d1 |
12
Changelog
12
Changelog
@ -1,3 +1,15 @@
|
|||||||
|
2020-1-20: version 0.42.1
|
||||||
|
1. 修复setup.py在python2.7版本无法工作的问题 (issue #809)
|
||||||
|
|
||||||
|
2020-1-13: version 0.42
|
||||||
|
1. 修复paddle模式空字符串coredump问题 @JesseyXujin
|
||||||
|
2. 修复cut_all模式切分丢字问题 @fxsjy
|
||||||
|
3. paddle安装检测优化 @vissssa
|
||||||
|
|
||||||
|
2020-1-8: version 0.41
|
||||||
|
1. 开启paddle模式更友好
|
||||||
|
2. 修复cut_all模式不支持中英混合词的bug
|
||||||
|
|
||||||
2019-12-25: version 0.40
|
2019-12-25: version 0.40
|
||||||
1. 支持基于paddle的深度学习分词模式(use_paddle=True); by @JesseyXujin, @xyzhou-puck
|
1. 支持基于paddle的深度学习分词模式(use_paddle=True); by @JesseyXujin, @xyzhou-puck
|
||||||
2. 修复自定义Tokenizer实例的add_word方法指向全局的问题; by @linhx13
|
2. 修复自定义Tokenizer实例的add_word方法指向全局的问题; by @linhx13
|
||||||
|
15
README.md
Executable file → Normal file
15
README.md
Executable file → Normal file
@ -13,7 +13,7 @@ jieba
|
|||||||
* 精确模式,试图将句子最精确地切开,适合文本分析;
|
* 精确模式,试图将句子最精确地切开,适合文本分析;
|
||||||
* 全模式,把句子中所有的可以成词的词语都扫描出来, 速度非常快,但是不能解决歧义;
|
* 全模式,把句子中所有的可以成词的词语都扫描出来, 速度非常快,但是不能解决歧义;
|
||||||
* 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。
|
* 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。
|
||||||
* paddle模式,利用paddlepaddle深度学习框架,训练序列标注(双向GRU)网络模型实现分词。同时支持词性标注。如需使用,请先安装paddlepaddle-tiny,`pip install paddlepaddle-tiny==1.6.1`。 (www.paddlepaddle.org)
|
* paddle模式,利用PaddlePaddle深度学习框架,训练序列标注(双向GRU)网络模型实现分词。同时支持词性标注。paddle模式使用需安装paddlepaddle-tiny,`pip install paddlepaddle-tiny==1.6.1`。目前paddle模式支持jieba v0.40及以上版本。jieba v0.40以下版本,请升级jieba,`pip install jieba --upgrade` 。[PaddlePaddle官网](https://www.paddlepaddle.org.cn/)
|
||||||
* 支持繁体分词
|
* 支持繁体分词
|
||||||
* 支持自定义词典
|
* 支持自定义词典
|
||||||
* MIT 授权协议
|
* MIT 授权协议
|
||||||
@ -39,7 +39,7 @@ jieba
|
|||||||
=======
|
=======
|
||||||
1. 分词
|
1. 分词
|
||||||
--------
|
--------
|
||||||
* `jieba.cut` 方法接受四个输入参数: 需要分词的字符串;cut_all 参数用来控制是否采用全模式;HMM 参数用来控制是否使用 HMM 模型;use_paddle 参数用来控制是否使用paddle模式下的分词模式(如需使用,安装paddlepaddle-tiny,`pip install paddlepaddle-tiny==1.6.1` );
|
* `jieba.cut` 方法接受四个输入参数: 需要分词的字符串;cut_all 参数用来控制是否采用全模式;HMM 参数用来控制是否使用 HMM 模型;use_paddle 参数用来控制是否使用paddle模式下的分词模式,paddle模式采用延迟加载方式,通过enable_paddle接口安装paddlepaddle-tiny,并且import相关代码;
|
||||||
* `jieba.cut_for_search` 方法接受两个参数:需要分词的字符串;是否使用 HMM 模型。该方法适合用于搜索引擎构建倒排索引的分词,粒度比较细
|
* `jieba.cut_for_search` 方法接受两个参数:需要分词的字符串;是否使用 HMM 模型。该方法适合用于搜索引擎构建倒排索引的分词,粒度比较细
|
||||||
* 待分词的字符串可以是 unicode 或 UTF-8 字符串、GBK 字符串。注意:不建议直接输入 GBK 字符串,可能无法预料地错误解码成 UTF-8
|
* 待分词的字符串可以是 unicode 或 UTF-8 字符串、GBK 字符串。注意:不建议直接输入 GBK 字符串,可能无法预料地错误解码成 UTF-8
|
||||||
* `jieba.cut` 以及 `jieba.cut_for_search` 返回的结构都是一个可迭代的 generator,可以使用 for 循环来获得分词后得到的每一个词语(unicode),或者用
|
* `jieba.cut` 以及 `jieba.cut_for_search` 返回的结构都是一个可迭代的 generator,可以使用 for 循环来获得分词后得到的每一个词语(unicode),或者用
|
||||||
@ -52,8 +52,11 @@ jieba
|
|||||||
# encoding=utf-8
|
# encoding=utf-8
|
||||||
import jieba
|
import jieba
|
||||||
|
|
||||||
seg_list = jieba.cut("我来到北京清华大学", use_paddle=True)
|
jieba.enable_paddle()# 启动paddle模式。 0.40版之后开始支持,早期版本不支持
|
||||||
print("Paddle Mode: " + "/ ".join(seg_list)) # paddle模式
|
strs=["我来到北京清华大学","乒乓球拍卖完了","中国科学技术大学"]
|
||||||
|
for str in strs:
|
||||||
|
seg_list = jieba.cut(str,use_paddle=True) # 使用paddle模式
|
||||||
|
print("Paddle Mode: " + '/'.join(list(seg_list)))
|
||||||
|
|
||||||
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
|
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
|
||||||
print("Full Mode: " + "/ ".join(seg_list)) # 全模式
|
print("Full Mode: " + "/ ".join(seg_list)) # 全模式
|
||||||
@ -190,12 +193,14 @@ https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
|
|||||||
-----------
|
-----------
|
||||||
* `jieba.posseg.POSTokenizer(tokenizer=None)` 新建自定义分词器,`tokenizer` 参数可指定内部使用的 `jieba.Tokenizer` 分词器。`jieba.posseg.dt` 为默认词性标注分词器。
|
* `jieba.posseg.POSTokenizer(tokenizer=None)` 新建自定义分词器,`tokenizer` 参数可指定内部使用的 `jieba.Tokenizer` 分词器。`jieba.posseg.dt` 为默认词性标注分词器。
|
||||||
* 标注句子分词后每个词的词性,采用和 ictclas 兼容的标记法。
|
* 标注句子分词后每个词的词性,采用和 ictclas 兼容的标记法。
|
||||||
* 除了jieba默认分词模式,提供paddle模式下的词性标注功能。如需使用,请先安装paddlepaddle-tiny,`pip install paddlepaddle-tiny==1.6.1`。
|
* 除了jieba默认分词模式,提供paddle模式下的词性标注功能。paddle模式采用延迟加载方式,通过enable_paddle()安装paddlepaddle-tiny,并且import相关代码;
|
||||||
* 用法示例
|
* 用法示例
|
||||||
|
|
||||||
```pycon
|
```pycon
|
||||||
|
>>> import jieba
|
||||||
>>> import jieba.posseg as pseg
|
>>> import jieba.posseg as pseg
|
||||||
>>> words = pseg.cut("我爱北京天安门") #jieba默认模式
|
>>> words = pseg.cut("我爱北京天安门") #jieba默认模式
|
||||||
|
>>> jieba.enable_paddle() #启动paddle模式。 0.40版之后开始支持,早期版本不支持
|
||||||
>>> words = pseg.cut("我爱北京天安门",use_paddle=True) #paddle模式
|
>>> words = pseg.cut("我爱北京天安门",use_paddle=True) #paddle模式
|
||||||
>>> for word, flag in words:
|
>>> for word, flag in words:
|
||||||
... print('%s %s' % (word, flag))
|
... print('%s %s' % (word, flag))
|
||||||
|
68
jieba/__init__.py
Executable file → Normal file
68
jieba/__init__.py
Executable file → Normal file
@ -1,26 +1,24 @@
|
|||||||
from __future__ import absolute_import, unicode_literals
|
from __future__ import absolute_import, unicode_literals
|
||||||
__version__ = '0.40'
|
|
||||||
|
__version__ = '0.42.1'
|
||||||
__license__ = 'MIT'
|
__license__ = 'MIT'
|
||||||
|
|
||||||
import re
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
import logging
|
|
||||||
import marshal
|
import marshal
|
||||||
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
import threading
|
import threading
|
||||||
from math import log
|
import time
|
||||||
from hashlib import md5
|
from hashlib import md5
|
||||||
from ._compat import *
|
from math import log
|
||||||
|
|
||||||
from . import finalseg
|
from . import finalseg
|
||||||
|
from ._compat import *
|
||||||
|
|
||||||
if os.name == 'nt':
|
if os.name == 'nt':
|
||||||
from shutil import move as _replace_file
|
from shutil import move as _replace_file
|
||||||
else:
|
else:
|
||||||
_replace_file = os.rename
|
_replace_file = os.rename
|
||||||
|
|
||||||
|
|
||||||
_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))
|
_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))
|
||||||
|
|
||||||
DEFAULT_DICT = None
|
DEFAULT_DICT = None
|
||||||
@ -46,13 +44,12 @@ re_eng = re.compile('[a-zA-Z0-9]', re.U)
|
|||||||
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
|
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
|
||||||
|
|
||||||
re_skip_default = re.compile("(\r\n|\s)", re.U)
|
re_skip_default = re.compile("(\r\n|\s)", re.U)
|
||||||
re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
|
|
||||||
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
|
|
||||||
|
|
||||||
def setLogLevel(log_level):
|
def setLogLevel(log_level):
|
||||||
global logger
|
|
||||||
default_logger.setLevel(log_level)
|
default_logger.setLevel(log_level)
|
||||||
|
|
||||||
|
|
||||||
class Tokenizer(object):
|
class Tokenizer(object):
|
||||||
|
|
||||||
def __init__(self, dictionary=DEFAULT_DICT):
|
def __init__(self, dictionary=DEFAULT_DICT):
|
||||||
@ -71,7 +68,8 @@ class Tokenizer(object):
|
|||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<Tokenizer dictionary=%r>' % self.dictionary
|
return '<Tokenizer dictionary=%r>' % self.dictionary
|
||||||
|
|
||||||
def gen_pfdict(self, f):
|
@staticmethod
|
||||||
|
def gen_pfdict(f):
|
||||||
lfreq = {}
|
lfreq = {}
|
||||||
ltotal = 0
|
ltotal = 0
|
||||||
f_name = resolve_filename(f)
|
f_name = resolve_filename(f)
|
||||||
@ -130,7 +128,7 @@ class Tokenizer(object):
|
|||||||
|
|
||||||
load_from_cache_fail = True
|
load_from_cache_fail = True
|
||||||
if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
|
if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
|
||||||
os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
|
os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
|
||||||
default_logger.debug(
|
default_logger.debug(
|
||||||
"Loading model from cache %s" % cache_file)
|
"Loading model from cache %s" % cache_file)
|
||||||
try:
|
try:
|
||||||
@ -200,15 +198,30 @@ class Tokenizer(object):
|
|||||||
def __cut_all(self, sentence):
|
def __cut_all(self, sentence):
|
||||||
dag = self.get_DAG(sentence)
|
dag = self.get_DAG(sentence)
|
||||||
old_j = -1
|
old_j = -1
|
||||||
|
eng_scan = 0
|
||||||
|
eng_buf = u''
|
||||||
for k, L in iteritems(dag):
|
for k, L in iteritems(dag):
|
||||||
|
if eng_scan == 1 and not re_eng.match(sentence[k]):
|
||||||
|
eng_scan = 0
|
||||||
|
yield eng_buf
|
||||||
if len(L) == 1 and k > old_j:
|
if len(L) == 1 and k > old_j:
|
||||||
yield sentence[k:L[0] + 1]
|
word = sentence[k:L[0] + 1]
|
||||||
|
if re_eng.match(word):
|
||||||
|
if eng_scan == 0:
|
||||||
|
eng_scan = 1
|
||||||
|
eng_buf = word
|
||||||
|
else:
|
||||||
|
eng_buf += word
|
||||||
|
if eng_scan == 0:
|
||||||
|
yield word
|
||||||
old_j = L[0]
|
old_j = L[0]
|
||||||
else:
|
else:
|
||||||
for j in L:
|
for j in L:
|
||||||
if j > k:
|
if j > k:
|
||||||
yield sentence[k:j + 1]
|
yield sentence[k:j + 1]
|
||||||
old_j = j
|
old_j = j
|
||||||
|
if eng_scan == 1:
|
||||||
|
yield eng_buf
|
||||||
|
|
||||||
def __cut_DAG_NO_HMM(self, sentence):
|
def __cut_DAG_NO_HMM(self, sentence):
|
||||||
DAG = self.get_DAG(sentence)
|
DAG = self.get_DAG(sentence)
|
||||||
@ -273,8 +286,8 @@ class Tokenizer(object):
|
|||||||
for elem in buf:
|
for elem in buf:
|
||||||
yield elem
|
yield elem
|
||||||
|
|
||||||
def cut(self, sentence, cut_all = False, HMM = True,use_paddle = False):
|
def cut(self, sentence, cut_all=False, HMM=True, use_paddle=False):
|
||||||
'''
|
"""
|
||||||
The main function that segments an entire sentence that contains
|
The main function that segments an entire sentence that contains
|
||||||
Chinese characters into separated words.
|
Chinese characters into separated words.
|
||||||
|
|
||||||
@ -282,15 +295,12 @@ class Tokenizer(object):
|
|||||||
- sentence: The str(unicode) to be segmented.
|
- sentence: The str(unicode) to be segmented.
|
||||||
- cut_all: Model type. True for full pattern, False for accurate pattern.
|
- cut_all: Model type. True for full pattern, False for accurate pattern.
|
||||||
- HMM: Whether to use the Hidden Markov Model.
|
- HMM: Whether to use the Hidden Markov Model.
|
||||||
'''
|
"""
|
||||||
is_paddle_installed = False
|
is_paddle_installed = check_paddle_install['is_paddle_installed']
|
||||||
if use_paddle == True:
|
|
||||||
import_paddle_check = import_paddle()
|
|
||||||
is_paddle_installed = check_paddle_install()
|
|
||||||
sentence = strdecode(sentence)
|
sentence = strdecode(sentence)
|
||||||
if use_paddle == True and is_paddle_installed == True and import_paddle_check == True:
|
if use_paddle and is_paddle_installed:
|
||||||
if sentence is None or sentence == "" or sentence == u"":
|
# if sentence is null, it will raise core exception in paddle.
|
||||||
yield sentence
|
if sentence is None or len(sentence) == 0:
|
||||||
return
|
return
|
||||||
import jieba.lac_small.predict as predict
|
import jieba.lac_small.predict as predict
|
||||||
results = predict.get_sent(sentence)
|
results = predict.get_sent(sentence)
|
||||||
@ -299,12 +309,8 @@ class Tokenizer(object):
|
|||||||
continue
|
continue
|
||||||
yield sent
|
yield sent
|
||||||
return
|
return
|
||||||
if cut_all:
|
re_han = re_han_default
|
||||||
re_han = re_han_cut_all
|
re_skip = re_skip_default
|
||||||
re_skip = re_skip_cut_all
|
|
||||||
else:
|
|
||||||
re_han = re_han_default
|
|
||||||
re_skip = re_skip_default
|
|
||||||
if cut_all:
|
if cut_all:
|
||||||
cut_block = self.__cut_all
|
cut_block = self.__cut_all
|
||||||
elif HMM:
|
elif HMM:
|
||||||
|
61
jieba/_compat.py
Executable file → Normal file
61
jieba/_compat.py
Executable file → Normal file
@ -1,38 +1,54 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import imp
|
|
||||||
import logging
|
|
||||||
|
|
||||||
log_console = logging.StreamHandler(sys.stderr)
|
log_console = logging.StreamHandler(sys.stderr)
|
||||||
default_logger = logging.getLogger(__name__)
|
default_logger = logging.getLogger(__name__)
|
||||||
default_logger.setLevel(logging.DEBUG)
|
default_logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
|
||||||
def setLogLevel(log_level):
|
def setLogLevel(log_level):
|
||||||
global logger
|
|
||||||
default_logger.setLevel(log_level)
|
default_logger.setLevel(log_level)
|
||||||
|
|
||||||
|
|
||||||
|
check_paddle_install = {'is_paddle_installed': False}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
|
|
||||||
get_module_res = lambda *res: pkg_resources.resource_stream(__name__,
|
get_module_res = lambda *res: pkg_resources.resource_stream(__name__,
|
||||||
os.path.join(*res))
|
os.path.join(*res))
|
||||||
except ImportError:
|
except ImportError:
|
||||||
get_module_res = lambda *res: open(os.path.normpath(os.path.join(
|
get_module_res = lambda *res: open(os.path.normpath(os.path.join(
|
||||||
os.getcwd(), os.path.dirname(__file__), *res)), 'rb')
|
os.getcwd(), os.path.dirname(__file__), *res)), 'rb')
|
||||||
|
|
||||||
|
|
||||||
def import_paddle():
|
def enable_paddle():
|
||||||
import_paddle_check = False
|
|
||||||
try:
|
try:
|
||||||
import paddle
|
import paddle
|
||||||
if paddle.__version__ >= '1.6.1' or paddle.__version__ >= u'1.6.1':
|
|
||||||
import paddle.fluid as fluid
|
|
||||||
import jieba.lac_small.predict as predict
|
|
||||||
import_paddle_check = True
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
default_logger.debug("Import paddle error, please use command to install: pip install paddlepaddle-tiny==1.6.1. Now, back to jieba basic cut......")
|
default_logger.debug("Installing paddle-tiny, please wait a minute......")
|
||||||
return False
|
os.system("pip install paddlepaddle-tiny")
|
||||||
return import_paddle_check
|
try:
|
||||||
|
import paddle
|
||||||
|
except ImportError:
|
||||||
|
default_logger.debug(
|
||||||
|
"Import paddle error, please use command to install: pip install paddlepaddle-tiny==1.6.1."
|
||||||
|
"Now, back to jieba basic cut......")
|
||||||
|
if paddle.__version__ < '1.6.1':
|
||||||
|
default_logger.debug("Find your own paddle version doesn't satisfy the minimum requirement (1.6.1), "
|
||||||
|
"please install paddle tiny by 'pip install --upgrade paddlepaddle-tiny', "
|
||||||
|
"or upgrade paddle full version by "
|
||||||
|
"'pip install --upgrade paddlepaddle (-gpu for GPU version)' ")
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
import jieba.lac_small.predict as predict
|
||||||
|
default_logger.debug("Paddle enabled successfully......")
|
||||||
|
check_paddle_install['is_paddle_installed'] = True
|
||||||
|
except ImportError:
|
||||||
|
default_logger.debug("Import error, cannot find paddle.fluid and jieba.lac_small.predict module. "
|
||||||
|
"Now, back to jieba basic cut......")
|
||||||
|
|
||||||
|
|
||||||
PY2 = sys.version_info[0] == 2
|
PY2 = sys.version_info[0] == 2
|
||||||
@ -56,6 +72,7 @@ else:
|
|||||||
itervalues = lambda d: iter(d.values())
|
itervalues = lambda d: iter(d.values())
|
||||||
iteritems = lambda d: iter(d.items())
|
iteritems = lambda d: iter(d.items())
|
||||||
|
|
||||||
|
|
||||||
def strdecode(sentence):
|
def strdecode(sentence):
|
||||||
if not isinstance(sentence, text_type):
|
if not isinstance(sentence, text_type):
|
||||||
try:
|
try:
|
||||||
@ -64,25 +81,9 @@ def strdecode(sentence):
|
|||||||
sentence = sentence.decode('gbk', 'ignore')
|
sentence = sentence.decode('gbk', 'ignore')
|
||||||
return sentence
|
return sentence
|
||||||
|
|
||||||
|
|
||||||
def resolve_filename(f):
|
def resolve_filename(f):
|
||||||
try:
|
try:
|
||||||
return f.name
|
return f.name
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
return repr(f)
|
return repr(f)
|
||||||
|
|
||||||
|
|
||||||
def check_paddle_install():
|
|
||||||
is_paddle_installed = False
|
|
||||||
try:
|
|
||||||
import paddle
|
|
||||||
if imp.find_module('paddle') and (paddle.__version__ >= '1.6.1' or paddle.__version__ >= u'1.6.1'):
|
|
||||||
is_paddle_installed = True
|
|
||||||
elif paddle.__version__ < '1.6.1':
|
|
||||||
is_paddle_installed = False
|
|
||||||
default_logger.debug("Check the paddle version is not correct, please\
|
|
||||||
use command to install paddle: pip uninstall paddlepaddle(-gpu), \
|
|
||||||
pip install paddlepaddle-tiny==1.6.1. Now, back to jieba basic cut......")
|
|
||||||
except ImportError:
|
|
||||||
default_logger.debug("import paddle error, back to jieba basic cut......")
|
|
||||||
is_paddle_installed = False
|
|
||||||
return is_paddle_installed
|
|
||||||
|
0
jieba/lac_small/__init__.py
Executable file → Normal file
0
jieba/lac_small/__init__.py
Executable file → Normal file
0
jieba/lac_small/creator.py
Executable file → Normal file
0
jieba/lac_small/creator.py
Executable file → Normal file
0
jieba/lac_small/model_baseline/crfw
Executable file → Normal file
0
jieba/lac_small/model_baseline/crfw
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_0.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_0.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_0.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_0.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_1.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_1.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_1.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_1.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_2.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_2.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_2.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_2.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_3.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_3.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_3.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_3.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_4.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_4.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_4.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/fc_4.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_0.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_0.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_0.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_0.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_1.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_1.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_1.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_1.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_2.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_2.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_2.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_2.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_3.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_3.b_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_3.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/gru_3.w_0
Executable file → Normal file
0
jieba/lac_small/model_baseline/word_emb
Executable file → Normal file
0
jieba/lac_small/model_baseline/word_emb
Executable file → Normal file
0
jieba/lac_small/nets.py
Executable file → Normal file
0
jieba/lac_small/nets.py
Executable file → Normal file
0
jieba/lac_small/predict.py
Executable file → Normal file
0
jieba/lac_small/predict.py
Executable file → Normal file
2
jieba/lac_small/reader_small.py
Executable file → Normal file
2
jieba/lac_small/reader_small.py
Executable file → Normal file
@ -64,7 +64,7 @@ class Dataset(object):
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def vocab_size(self):
|
def vocab_size(self):
|
||||||
"""vocabuary size"""
|
"""vocabulary size"""
|
||||||
return max(self.word2id_dict.values()) + 1
|
return max(self.word2id_dict.values()) + 1
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
0
jieba/lac_small/tag.dic
Executable file → Normal file
0
jieba/lac_small/tag.dic
Executable file → Normal file
0
jieba/lac_small/utils.py
Executable file → Normal file
0
jieba/lac_small/utils.py
Executable file → Normal file
0
jieba/lac_small/word.dic
Executable file → Normal file
0
jieba/lac_small/word.dic
Executable file → Normal file
@ -1,11 +1,11 @@
|
|||||||
from __future__ import absolute_import, unicode_literals
|
from __future__ import absolute_import, unicode_literals
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import jieba
|
|
||||||
import pickle
|
import pickle
|
||||||
from .._compat import *
|
import re
|
||||||
|
|
||||||
|
import jieba
|
||||||
from .viterbi import viterbi
|
from .viterbi import viterbi
|
||||||
|
from .._compat import *
|
||||||
|
|
||||||
PROB_START_P = "prob_start.p"
|
PROB_START_P = "prob_start.p"
|
||||||
PROB_TRANS_P = "prob_trans.p"
|
PROB_TRANS_P = "prob_trans.p"
|
||||||
@ -252,6 +252,7 @@ class POSTokenizer(object):
|
|||||||
def lcut(self, *args, **kwargs):
|
def lcut(self, *args, **kwargs):
|
||||||
return list(self.cut(*args, **kwargs))
|
return list(self.cut(*args, **kwargs))
|
||||||
|
|
||||||
|
|
||||||
# default Tokenizer instance
|
# default Tokenizer instance
|
||||||
|
|
||||||
dt = POSTokenizer(jieba.dt)
|
dt = POSTokenizer(jieba.dt)
|
||||||
@ -276,20 +277,17 @@ def cut(sentence, HMM=True, use_paddle=False):
|
|||||||
Note that this only works using dt, custom POSTokenizer
|
Note that this only works using dt, custom POSTokenizer
|
||||||
instances are not supported.
|
instances are not supported.
|
||||||
"""
|
"""
|
||||||
is_paddle_installed = False
|
is_paddle_installed = check_paddle_install['is_paddle_installed']
|
||||||
if use_paddle == True:
|
if use_paddle and is_paddle_installed:
|
||||||
import_paddle_check = import_paddle()
|
# if sentence is null, it will raise core exception in paddle.
|
||||||
is_paddle_installed = check_paddle_install()
|
|
||||||
if use_paddle==True and is_paddle_installed == True and import_paddle_check == True:
|
|
||||||
if sentence is None or sentence == "" or sentence == u"":
|
if sentence is None or sentence == "" or sentence == u"":
|
||||||
yield pair(None, None)
|
|
||||||
return
|
return
|
||||||
import jieba.lac_small.predict as predict
|
import jieba.lac_small.predict as predict
|
||||||
sents,tags = predict.get_result(strdecode(sentence))
|
sents, tags = predict.get_result(strdecode(sentence))
|
||||||
for i,sent in enumerate(sents):
|
for i, sent in enumerate(sents):
|
||||||
if sent is None or tags[i] is None:
|
if sent is None or tags[i] is None:
|
||||||
continue
|
continue
|
||||||
yield pair(sent,tags[i])
|
yield pair(sent, tags[i])
|
||||||
return
|
return
|
||||||
global dt
|
global dt
|
||||||
if jieba.pool is None:
|
if jieba.pool is None:
|
||||||
@ -306,5 +304,7 @@ def cut(sentence, HMM=True, use_paddle=False):
|
|||||||
yield w
|
yield w
|
||||||
|
|
||||||
|
|
||||||
def lcut(sentence, HMM=True):
|
def lcut(sentence, HMM=True, use_paddle=False):
|
||||||
|
if use_paddle:
|
||||||
|
return list(cut(sentence, use_paddle=True))
|
||||||
return list(cut(sentence, HMM))
|
return list(cut(sentence, HMM))
|
||||||
|
4
setup.py
Executable file → Normal file
4
setup.py
Executable file → Normal file
@ -43,7 +43,7 @@ GitHub: https://github.com/fxsjy/jieba
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
setup(name='jieba',
|
setup(name='jieba',
|
||||||
version='0.40',
|
version='0.42.1',
|
||||||
description='Chinese Words Segmentation Utilities',
|
description='Chinese Words Segmentation Utilities',
|
||||||
long_description=LONGDOC,
|
long_description=LONGDOC,
|
||||||
author='Sun, Junyi',
|
author='Sun, Junyi',
|
||||||
@ -71,5 +71,5 @@ setup(name='jieba',
|
|||||||
keywords='NLP,tokenizing,Chinese word segementation',
|
keywords='NLP,tokenizing,Chinese word segementation',
|
||||||
packages=['jieba'],
|
packages=['jieba'],
|
||||||
package_dir={'jieba':'jieba'},
|
package_dir={'jieba':'jieba'},
|
||||||
package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*', 'lac_small/*','lac_small/model_baseline/*']}
|
package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*', 'lac_small/*.py','lac_small/*.dic', 'lac_small/model_baseline/*']}
|
||||||
)
|
)
|
||||||
|
@ -96,3 +96,6 @@ if __name__ == "__main__":
|
|||||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||||
|
jieba.add_word('超敏C反应蛋白')
|
||||||
|
cuttest('超敏C反应蛋白是什么, java好学吗?,小潘老板都学Python')
|
||||||
|
cuttest('steel健身爆发力运动兴奋补充剂')
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
import sys
|
import sys
|
||||||
sys.path.append("../")
|
sys.path.append("../")
|
||||||
import jieba
|
import jieba
|
||||||
|
jieba.enable_paddle()
|
||||||
|
|
||||||
def cuttest(test_sent):
|
def cuttest(test_sent):
|
||||||
result = jieba.cut(test_sent, use_paddle=True)
|
result = jieba.cut(test_sent, use_paddle=True)
|
||||||
|
@ -2,7 +2,8 @@
|
|||||||
import sys
|
import sys
|
||||||
sys.path.append("../")
|
sys.path.append("../")
|
||||||
import jieba.posseg as pseg
|
import jieba.posseg as pseg
|
||||||
|
import jieba
|
||||||
|
jieba.enable_paddle()
|
||||||
|
|
||||||
def cuttest(test_sent):
|
def cuttest(test_sent):
|
||||||
result = pseg.cut(test_sent, use_paddle=True)
|
result = pseg.cut(test_sent, use_paddle=True)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user