Mirror of https://github.com/fxsjy/jieba.git
Add paddle-based segmentation and POS tagging (#788)

* paddle cut release
* Update README.md to tell users to install paddlepaddle-tiny
* Remove the UTF-8 coding header from the two __init__.py files
* Polish README details
parent 38134ee20f
commit 5b3bb4b7f2
README.md (29 changed lines, Normal file → Executable file)
@@ -9,11 +9,11 @@ jieba

 Features
 ========
-* Supports three segmentation modes:
+* Supports four segmentation modes:
 * Accurate mode: tries to cut the sentence into the most precise segmentation; suitable for text analysis.
 * Full mode: scans out every word in the sentence that can form a word; very fast, but cannot resolve ambiguity.
 * Search-engine mode: on top of accurate mode, long words are split again to improve recall; suitable for tokenization in search engines.

+* Paddle mode: uses the paddlepaddle deep-learning framework and a trained sequence-labelling (bidirectional GRU) network model for segmentation; POS tagging is supported as well. To use it, first install paddlepaddle-tiny: `pip install paddlepaddle-tiny==1.6.1` (see the version-check sketch after this list).
 * Supports segmentation of traditional Chinese text
 * Supports custom dictionaries
 * MIT License
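The version pin in the paddle-mode bullet matters: the helper checks added in `jieba/_compat.py` only accept paddle 1.6.1. A hedged sketch (not part of the original README) of verifying this before requesting paddle mode:

```python
try:
    import paddle
    paddle_ok = (paddle.__version__ == '1.6.1')  # the exact version this patch checks for
except ImportError:
    paddle_ok = False
print("paddle mode available:", paddle_ok)
```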
@@ -33,6 +33,7 @@ jieba
 * Semi-automatic install: download http://pypi.python.org/pypi/jieba/ , unpack it, then run `python setup.py install`
 * Manual install: put the jieba directory in the current directory or in the site-packages directory
 * Import it with `import jieba`
+* To use paddle-mode segmentation and POS tagging, first install paddlepaddle-tiny: `pip install paddlepaddle-tiny==1.6.1`.

 Algorithm
 ========
@@ -44,7 +45,7 @@ jieba
 =======
 1. Segmentation
 --------
-* `jieba.cut` takes three input parameters: the string to segment; cut_all, which controls whether full mode is used; and HMM, which controls whether the HMM model is used
+* `jieba.cut` takes four input parameters: the string to segment; cut_all, which controls whether full mode is used; HMM, which controls whether the HMM model is used; and use_paddle, which controls whether paddle-mode segmentation is used (to use it, install paddlepaddle-tiny: `pip install paddlepaddle-tiny==1.6.1`)
 * `jieba.cut_for_search` takes two parameters: the string to segment, and whether to use the HMM model. It is meant for the segmentation used to build a search engine's inverted index, and its granularity is relatively fine (see the sketch after this list)
 * The string to segment can be a unicode or UTF-8 string, or a GBK string. Note: passing GBK strings directly is not recommended, as they may be mis-decoded as UTF-8 in unpredictable ways
 * `jieba.cut` and `jieba.cut_for_search` both return an iterable generator; you can use a for loop to get each segmented word (unicode), or use
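A small illustrative snippet (not from the original README) for the other calls in the list above; it assumes the standard jieba API:

```pycon
>>> import jieba
>>> "/ ".join(jieba.cut_for_search("小明硕士毕业于中国科学院计算所"))  # finer-grained cut for search engines
>>> jieba.lcut("我来到北京清华大学")  # like jieba.cut, but returns a list instead of a generator
```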
@@ -57,6 +58,9 @@ jieba
 # encoding=utf-8
 import jieba

+seg_list = jieba.cut("我来到北京清华大学", use_paddle=True)
+print("Paddle Mode: " + "/ ".join(seg_list))  # paddle mode
+
 seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
 print("Full Mode: " + "/ ".join(seg_list))  # full mode

@@ -192,11 +196,13 @@ https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
 -----------
 * `jieba.posseg.POSTokenizer(tokenizer=None)` creates a custom POS tokenizer; the `tokenizer` parameter specifies the internal `jieba.Tokenizer` to use. `jieba.posseg.dt` is the default POS-tagging tokenizer.
 * Tags every word of a segmented sentence with its part of speech, using a notation compatible with ictclas.
+* Besides the default jieba segmentation mode, paddle-mode POS tagging is provided. To use it, first install paddlepaddle-tiny: `pip install paddlepaddle-tiny==1.6.1`.
 * Usage example

 ```pycon
 >>> import jieba.posseg as pseg
->>> words = pseg.cut("我爱北京天安门")
+>>> words = pseg.cut("我爱北京天安门")  # default jieba mode
+>>> words = pseg.cut("我爱北京天安门", use_paddle=True)  # paddle mode
 >>> for word, flag in words:
 ...    print('%s %s' % (word, flag))
 ...
@@ -206,6 +212,21 @@ https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
 天安门 ns
 ```

+The paddle-mode POS tag mapping is as follows:
+
+The paddle-mode POS tags and named-entity category tags are listed in the table below: 24 POS tags (lowercase letters) and 4 named-entity category tags (uppercase letters).
+
+| Tag | Meaning | Tag | Meaning | Tag | Meaning | Tag | Meaning |
+| ---- | -------- | ---- | -------- | ---- | -------- | ---- | -------- |
+| n | common noun | f | locality noun | s | place noun | t | time |
+| nr | person name | ns | place name | nt | organization name | nw | work title |
+| nz | other proper noun | v | common verb | vd | verb-adverb | vn | verbal noun |
+| a | adjective | ad | adverbial adjective | an | nominal adjective | d | adverb |
+| m | numeral | q | classifier | r | pronoun | p | preposition |
+| c | conjunction | u | particle | xc | other function word | w | punctuation |
+| PER | person name | LOC | place name | ORG | organization name | TIME | time |
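For illustration only (not part of the original README), the uppercase named-entity tags from this table can be used to filter entities out of a paddle-mode result:

```pycon
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门", use_paddle=True)
>>> [(w, f) for w, f in words if f in ('PER', 'LOC', 'ORG', 'TIME')]  # keep only named-entity tags
```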

 5. Parallel segmentation
 -----------
 * How it works: split the target text by line, hand the lines to several Python processes to be segmented in parallel, then merge the results, which gives a considerable speed-up.

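A hedged sketch of the parallel API summarized above; `big_corpus.txt` is a placeholder file name, and parallel mode relies on multiprocessing, so it is not available on Windows:

```python
import jieba

jieba.enable_parallel(4)                      # segment with 4 worker processes
with open("big_corpus.txt", encoding="utf-8") as f:
    words = jieba.lcut(f.read())              # lcut returns a plain list of words
jieba.disable_parallel()                      # back to single-process mode
```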
jieba/__init__.py (15 changed lines, Normal file → Executable file)
@@ -20,6 +20,7 @@ if os.name == 'nt':
 else:
     _replace_file = os.rename

 _get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))

 DEFAULT_DICT = None
@@ -272,7 +273,7 @@ class Tokenizer(object):
                for elem in buf:
                    yield elem

-    def cut(self, sentence, cut_all=False, HMM=True):
+    def cut(self, sentence, cut_all=False, HMM=True, use_paddle=False):
        '''
        The main function that segments an entire sentence that contains
        Chinese characters into separated words.
@@ -282,8 +283,18 @@ class Tokenizer(object):
        - cut_all: Model type. True for full pattern, False for accurate pattern.
        - HMM: Whether to use the Hidden Markov Model.
        '''
+        is_paddle_installed = False
+        if use_paddle == True:
+            import_paddle_check = import_paddle()
+            is_paddle_installed = check_paddle_install()
        sentence = strdecode(sentence)

+        if use_paddle == True and is_paddle_installed == True and import_paddle_check == True:
+            results = predict.get_sent(sentence)
+            for sent in results:
+                if sent is None:
+                    continue
+                yield sent
+            return
        if cut_all:
            re_han = re_han_cut_all
            re_skip = re_skip_cut_all
jieba/_compat.py (49 changed lines, Normal file → Executable file)
@@ -1,6 +1,16 @@
 # -*- coding: utf-8 -*-
 import os
 import sys
+import imp
+import logging

+log_console = logging.StreamHandler(sys.stderr)
+default_logger = logging.getLogger(__name__)
+default_logger.setLevel(logging.DEBUG)

+def setLogLevel(log_level):
+    global logger
+    default_logger.setLevel(log_level)

 try:
     import pkg_resources
@@ -10,6 +20,29 @@ except ImportError:
     get_module_res = lambda *res: open(os.path.normpath(os.path.join(
         os.getcwd(), os.path.dirname(__file__), *res)), 'rb')

+try:
+    import paddle
+    if paddle.__version__ == '1.6.1':
+        import paddle.fluid as fluid
+        import jieba.lac_small.predict as predict
+except ImportError:
+    pass


+def import_paddle():
+    import_paddle_check = False
+    try:
+        import paddle
+        if paddle.__version__ == '1.6.1':
+            import paddle.fluid as fluid
+            import jieba.lac_small.predict as predict
+            import_paddle_check = True
+    except ImportError:
+        default_logger.debug("Import paddle error, please use command to install: pip install paddlepaddle-tiny==1.6.1. Now, back to jieba basic cut......")
+        return False
+    return import_paddle_check


 PY2 = sys.version_info[0] == 2

 default_encoding = sys.getfilesystemencoding()
@@ -44,3 +77,19 @@ def resolve_filename(f):
         return f.name
     except AttributeError:
         return repr(f)


+def check_paddle_install():
+    is_paddle_installed = False
+    try:
+        if imp.find_module('paddle') and paddle.__version__ == '1.6.1':
+            is_paddle_installed = True
+        elif paddle.__version__ != '1.6.1':
+            is_paddle_installed = False
+            default_logger.debug("Check the paddle version is not correct, please\
+                use command to install paddle: pip uninstall paddlepaddle(-gpu), \
+                pip install paddlepaddle-tiny==1.6.1. Now, back to jieba basic cut......")
+    except ImportError:
+        default_logger.debug("import paddle error, back to jieba basic cut......")
+        is_paddle_installed = False
+    return is_paddle_installed
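A hedged sketch (not part of the patch) of how the two helpers above are meant to be combined by their callers, such as `Tokenizer.cut()` in jieba/__init__.py and `jieba.posseg.cut()`: paddle mode is only taken when both checks succeed, otherwise the default cut is used. The helper name below is hypothetical.

```python
def paddle_available():
    """Illustrative only: True when paddle imports cleanly and the 1.6.1 version check passes."""
    return import_paddle() and check_paddle_install()
```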
jieba/lac_small/__init__.py (0 lines, new executable file)

jieba/lac_small/creator.py (46 lines, new executable file)
@@ -0,0 +1,46 @@
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Define the function to create lexical analysis model and model's data reader
"""
import sys
import os
import math

import paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer
import jieba.lac_small.nets as nets


def create_model(vocab_size, num_labels, mode='train'):
    """create lac model"""

    # model's input data
    words = fluid.data(name='words', shape=[-1, 1], dtype='int64', lod_level=1)
    targets = fluid.data(
        name='targets', shape=[-1, 1], dtype='int64', lod_level=1)

    # for inference process
    if mode == 'infer':
        crf_decode = nets.lex_net(
            words, vocab_size, num_labels, for_infer=True, target=None)
        return {
            "feed_list": [words],
            "words": words,
            "crf_decode": crf_decode,
        }
    return ret  # note: 'ret' is undefined here; jieba only exercises the 'infer' branch
New binary model files (executable; contents not shown):
BIN jieba/lac_small/model_baseline/crfw
BIN jieba/lac_small/model_baseline/fc_0.b_0
BIN jieba/lac_small/model_baseline/fc_0.w_0
BIN jieba/lac_small/model_baseline/fc_1.b_0
BIN jieba/lac_small/model_baseline/fc_1.w_0
BIN jieba/lac_small/model_baseline/fc_2.b_0
BIN jieba/lac_small/model_baseline/fc_2.w_0
BIN jieba/lac_small/model_baseline/fc_3.b_0
BIN jieba/lac_small/model_baseline/fc_3.w_0
BIN jieba/lac_small/model_baseline/fc_4.b_0
BIN jieba/lac_small/model_baseline/fc_4.w_0
BIN jieba/lac_small/model_baseline/gru_0.b_0
BIN jieba/lac_small/model_baseline/gru_0.w_0
BIN jieba/lac_small/model_baseline/gru_1.b_0
BIN jieba/lac_small/model_baseline/gru_1.w_0
BIN jieba/lac_small/model_baseline/gru_2.b_0
BIN jieba/lac_small/model_baseline/gru_2.w_0
BIN jieba/lac_small/model_baseline/gru_3.b_0
BIN jieba/lac_small/model_baseline/gru_3.w_0
BIN jieba/lac_small/model_baseline/word_emb
jieba/lac_small/nets.py (122 lines, new executable file)
@@ -0,0 +1,122 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# (Apache-2.0 license header, identical to creator.py)
"""
The function lex_net(args) define the lexical analysis network structure
"""
import sys
import os
import math

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def lex_net(word, vocab_size, num_labels, for_infer=True, target=None):
    """
    define the lexical analysis network structure
    word: stores the input of the model
    for_infer: a boolean value, indicating if the model to be created is for training or predicting.

    return:
        for infer: return the prediction
        otherwise: return the prediction
    """

    word_emb_dim = 128
    grnn_hidden_dim = 128
    bigru_num = 2
    emb_lr = 1.0
    crf_lr = 1.0
    init_bound = 0.1
    IS_SPARSE = True

    def _bigru_layer(input_feature):
        """
        define the bidirectional gru layer
        """
        pre_gru = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden_dim,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        pre_gru_r = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden_dim,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1)
        return bi_merge

    def _net_conf(word, target=None):
        """
        Configure the network
        """
        word_embedding = fluid.embedding(
            input=word,
            size=[vocab_size, word_emb_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(
                learning_rate=emb_lr,
                name="word_emb",
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound)))

        input_feature = word_embedding
        for i in range(bigru_num):
            bigru_output = _bigru_layer(input_feature)
            input_feature = bigru_output

        emission = fluid.layers.fc(
            size=num_labels,
            input=bigru_output,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        size = emission.shape[1]
        fluid.layers.create_parameter(
            shape=[size + 2, size], dtype=emission.dtype, name='crfw')
        crf_decode = fluid.layers.crf_decoding(
            input=emission, param_attr=fluid.ParamAttr(name='crfw'))

        return crf_decode

    return _net_conf(word)
jieba/lac_small/predict.py (82 lines, new executable file)
@@ -0,0 +1,82 @@
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# (Apache-2.0 license header, identical to creator.py)

import argparse
import os
import time
import sys

import paddle.fluid as fluid
import paddle

import jieba.lac_small.utils as utils
import jieba.lac_small.creator as creator
import jieba.lac_small.reader_small as reader_small
import numpy

word_emb_dim = 128
grnn_hidden_dim = 128
bigru_num = 2
use_cuda = False
basepath = os.path.abspath(__file__)
folder = os.path.dirname(basepath)
init_checkpoint = os.path.join(folder, "model_baseline")
batch_size = 1

# The inference program is built once at import time and the baseline checkpoint is loaded.
dataset = reader_small.Dataset()
infer_program = fluid.Program()
with fluid.program_guard(infer_program, fluid.default_startup_program()):
    with fluid.unique_name.guard():
        infer_ret = creator.create_model(dataset.vocab_size, dataset.num_labels, mode='infer')
        infer_program = infer_program.clone(for_test=True)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
utils.init_checkpoint(exe, init_checkpoint, infer_program)
results = []


def get_sent(str1):
    feed_data = dataset.get_vars(str1)
    a = numpy.array(feed_data).astype(numpy.int64)
    a = a.reshape(-1, 1)
    c = fluid.create_lod_tensor(a, [[a.shape[0]]], place)

    words, crf_decode = exe.run(
        infer_program,
        fetch_list=[infer_ret['words'], infer_ret['crf_decode']],
        feed={"words": c, },
        return_numpy=False,
        use_program_cache=True)
    sents = []
    sent, tag = utils.parse_result(words, crf_decode, dataset)
    sents = sents + sent
    return sents


def get_result(str1):
    feed_data = dataset.get_vars(str1)
    a = numpy.array(feed_data).astype(numpy.int64)
    a = a.reshape(-1, 1)
    c = fluid.create_lod_tensor(a, [[a.shape[0]]], place)

    words, crf_decode = exe.run(
        infer_program,
        fetch_list=[infer_ret['words'], infer_ret['crf_decode']],
        feed={"words": c, },
        return_numpy=False,
        use_program_cache=True)
    results = []
    results += utils.parse_result(words, crf_decode, dataset)
    return results
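A hedged usage sketch: jieba calls these two functions from `jieba.cut(..., use_paddle=True)` and `jieba.posseg.cut(..., use_paddle=True)`, but with paddlepaddle-tiny==1.6.1 installed they can also be exercised directly:

```python
import jieba.lac_small.predict as predict

words = predict.get_sent("我爱北京天安门")           # list of segmented words
words, tags = predict.get_result("我爱北京天安门")   # two parallel lists: words and their tags
```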
jieba/lac_small/reader_small.py (100 lines, new executable file)
@@ -0,0 +1,100 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# (Apache-2.0 license header, identical to creator.py)
"""
The file_reader converts raw corpus to input.
"""

import os
import __future__
import io
import paddle
import paddle.fluid as fluid


def load_kv_dict(dict_path,
                 reverse=False,
                 delimiter="\t",
                 key_func=None,
                 value_func=None):
    """
    Load key-value dict from file
    """
    result_dict = {}
    for line in io.open(dict_path, "r", encoding='utf8'):
        terms = line.strip("\n").split(delimiter)
        if len(terms) != 2:
            continue
        if reverse:
            value, key = terms
        else:
            key, value = terms
        if key in result_dict:
            raise KeyError("key duplicated with [%s]" % (key))
        if key_func:
            key = key_func(key)
        if value_func:
            value = value_func(value)
        result_dict[key] = value
    return result_dict


class Dataset(object):
    """data reader"""

    def __init__(self):
        # read dict
        basepath = os.path.abspath(__file__)
        folder = os.path.dirname(basepath)
        word_dict_path = os.path.join(folder, "word.dic")
        label_dict_path = os.path.join(folder, "tag.dic")
        self.word2id_dict = load_kv_dict(
            word_dict_path, reverse=True, value_func=int)
        self.id2word_dict = load_kv_dict(word_dict_path)
        self.label2id_dict = load_kv_dict(
            label_dict_path, reverse=True, value_func=int)
        self.id2label_dict = load_kv_dict(label_dict_path)

    @property
    def vocab_size(self):
        """vocabulary size"""
        return max(self.word2id_dict.values()) + 1

    @property
    def num_labels(self):
        """num_labels"""
        return max(self.label2id_dict.values()) + 1

    def word_to_ids(self, words):
        """convert word to word index"""
        word_ids = []
        for word in words:
            if word not in self.word2id_dict:
                word = "OOV"
            word_id = self.word2id_dict[word]
            word_ids.append(word_id)
        return word_ids

    def label_to_ids(self, labels):
        """convert label to label index"""
        label_ids = []
        for label in labels:
            if label not in self.label2id_dict:
                label = "O"
            label_id = self.label2id_dict[label]
            label_ids.append(label_id)
        return label_ids

    def get_vars(self, str1):
        words = str1.strip()
        word_ids = self.word_to_ids(words)
        return word_ids
jieba/lac_small/tag.dic (57 lines, new executable file)
@@ -0,0 +1,57 @@
0 a-B
1 a-I
2 ad-B
3 ad-I
4 an-B
5 an-I
6 c-B
7 c-I
8 d-B
9 d-I
10 f-B
11 f-I
12 m-B
13 m-I
14 n-B
15 n-I
16 nr-B
17 nr-I
18 ns-B
19 ns-I
20 nt-B
21 nt-I
22 nw-B
23 nw-I
24 nz-B
25 nz-I
26 p-B
27 p-I
28 q-B
29 q-I
30 r-B
31 r-I
32 s-B
33 s-I
34 t-B
35 t-I
36 u-B
37 u-I
38 v-B
39 v-I
40 vd-B
41 vd-I
42 vn-B
43 vn-I
44 w-B
45 w-I
46 xc-B
47 xc-I
48 PER-B
49 PER-I
50 LOC-B
51 LOC-I
52 ORG-B
53 ORG-I
54 TIME-B
55 TIME-I
56 O
jieba/lac_small/utils.py (142 lines, new executable file)
@@ -0,0 +1,142 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# (Apache-2.0 license header, identical to creator.py)
"""
util tools
"""
from __future__ import print_function
import os
import sys
import numpy as np
import paddle.fluid as fluid
import io


def str2bool(v):
    """
    argparse does not support True or False in python
    """
    return v.lower() in ("true", "t", "1")


def parse_result(words, crf_decode, dataset):
    """ parse result """
    offset_list = (crf_decode.lod())[0]
    words = np.array(words)
    crf_decode = np.array(crf_decode)
    batch_size = len(offset_list) - 1

    for sent_index in range(batch_size):
        begin, end = offset_list[sent_index], offset_list[sent_index + 1]
        sent = []
        for id in words[begin:end]:
            if dataset.id2word_dict[str(id[0])] == 'OOV':
                sent.append(' ')
            else:
                sent.append(dataset.id2word_dict[str(id[0])])
        tags = [
            dataset.id2label_dict[str(id[0])] for id in crf_decode[begin:end]
        ]

        sent_out = []
        tags_out = []
        parital_word = ""
        for ind, tag in enumerate(tags):
            # for the first word
            if parital_word == "":
                parital_word = sent[ind]
                tags_out.append(tag.split('-')[0])
                continue

            # for the beginning of word
            if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
                sent_out.append(parital_word)
                tags_out.append(tag.split('-')[0])
                parital_word = sent[ind]
                continue

            parital_word += sent[ind]

        # append the last word, except for len(tags)=0
        if len(sent_out) < len(tags_out):
            sent_out.append(parital_word)
    return sent_out, tags_out
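A standalone sketch (not part of the file) of the merging rule used in `parse_result()` above: a tag ending in "-B" starts a new word, "-I" continues the current one, and the prefix before the dash becomes the word's tag.

```python
chars = ['天', '安', '门', '很', '美']
tags = ['ns-B', 'ns-I', 'ns-I', 'd-B', 'a-B']

words, out_tags, buf = [], [], ""
for ch, tag in zip(chars, tags):
    if tag.endswith('-B') or not buf:
        if buf:
            words.append(buf)      # close the previous word
        buf = ch                   # start a new word
        out_tags.append(tag.split('-')[0])
    else:
        buf += ch                  # continue the current word
words.append(buf)
print(words, out_tags)             # ['天安门', '很', '美'] ['ns', 'd', 'a']
```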

def parse_padding_result(words, crf_decode, seq_lens, dataset):
    """ parse padding result """
    words = np.squeeze(words)
    batch_size = len(seq_lens)

    batch_out = []
    for sent_index in range(batch_size):

        sent = []
        for id in words[begin:end]:  # note: begin/end are not defined here; this helper appears unused by jieba
            if dataset.id2word_dict[str(id[0])] == 'OOV':
                sent.append(' ')
            else:
                sent.append(dataset.id2word_dict[str(id[0])])
        tags = [
            dataset.id2label_dict[str(id)]
            for id in crf_decode[sent_index][1:seq_lens[sent_index] - 1]
        ]

        sent_out = []
        tags_out = []
        parital_word = ""
        for ind, tag in enumerate(tags):
            # for the first word
            if parital_word == "":
                parital_word = sent[ind]
                tags_out.append(tag.split('-')[0])
                continue

            # for the beginning of word
            if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
                sent_out.append(parital_word)
                tags_out.append(tag.split('-')[0])
                parital_word = sent[ind]
                continue

            parital_word += sent[ind]

        # append the last word, except for len(tags)=0
        if len(sent_out) < len(tags_out):
            sent_out.append(parital_word)

        batch_out.append([sent_out, tags_out])
    return batch_out


def init_checkpoint(exe, init_checkpoint_path, main_program):
    """
    Init CheckPoint
    """
    assert os.path.exists(
        init_checkpoint_path), "[%s] cannot be found." % init_checkpoint_path

    def existed_persitables(var):
        """
        Whether the variable is persistable and exists in the checkpoint
        """
        if not fluid.io.is_persistable(var):
            return False
        return os.path.exists(os.path.join(init_checkpoint_path, var.name))

    fluid.io.load_vars(
        exe,
        init_checkpoint_path,
        main_program=main_program,
        predicate=existed_persitables)
jieba/lac_small/word.dic (20940 lines, new executable file; diff suppressed because it is too large)
jieba/posseg/__init__.py (13 changed lines, Normal file → Executable file)
@@ -269,13 +269,24 @@ def _lcut_internal_no_hmm(s):
     return dt._lcut_internal_no_hmm(s)


-def cut(sentence, HMM=True):
+def cut(sentence, HMM=True, use_paddle=False):
     """
     Global `cut` function that supports parallel processing.

     Note that this only works using dt, custom POSTokenizer
     instances are not supported.
     """
+    is_paddle_installed = False
+    if use_paddle == True:
+        import_paddle_check = import_paddle()
+        is_paddle_installed = check_paddle_install()
+    if use_paddle == True and is_paddle_installed == True and import_paddle_check == True:
+        sents, tags = predict.get_result(strdecode(sentence))
+        for i, sent in enumerate(sents):
+            if sent is None or tags[i] is None:
+                continue
+            yield pair(sent, tags[i])
+        return
     global dt
     if jieba.pool is None:
         for w in dt.cut(sentence, HMM=HMM):
setup.py (2 changed lines, Normal file → Executable file)
@@ -71,5 +71,5 @@ setup(name='jieba',
      keywords='NLP,tokenizing,Chinese word segementation',
      packages=['jieba'],
      package_dir={'jieba':'jieba'},
-      package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}
+      package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*', 'lac_small/*','lac_small/model_baseline/*']}
)