Mirror of https://github.com/fxsjy/jieba.git
complete the setup.py, fix #202 problem in posseg
Parent: 8a2e7f0e7e
Commit: 7b7c6955a9
jieba/__init__.py

@@ -1,5 +1,5 @@
 from __future__ import with_statement
-__version__ = '0.34'
+__version__ = '0.35'
 __license__ = 'MIT'
 
 import re
@@ -78,7 +78,8 @@ def initialize(dictionary=None):
        if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
            logger.debug("Loading model from cache %s" % cache_file)
            try:
-                pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
+                with open(cache_file, 'rb') as cf:
+                    pfdict,FREQ,total,min_freq = marshal.load(cf)
                # prevent conflict with old version
                load_from_cache_fail = not isinstance(pfdict, set)
            except:
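The second hunk above replaces `marshal.load(open(cache_file,'rb'))` with a `with` block, so the cache file handle is closed promptly even when unmarshalling fails, instead of being left to the garbage collector. Below is a minimal, self-contained sketch of that load-or-rebuild cache pattern; `load_or_build`, `build_dict` and the `demo.cache` filename are illustrative stand-ins, not jieba's actual API.

.. code:: python

    import marshal
    import os
    import tempfile

    def load_or_build(dict_path, build_dict):
        # Reuse the marshalled cache only if it is newer than the dictionary file.
        cache_path = os.path.join(tempfile.gettempdir(), "demo.cache")
        if os.path.exists(cache_path) and os.path.getmtime(cache_path) > os.path.getmtime(dict_path):
            try:
                # 'with' guarantees the handle is closed even if marshal.load raises.
                with open(cache_path, 'rb') as cf:
                    return marshal.load(cf)
            except Exception:
                pass  # corrupt or incompatible cache: fall through and rebuild
        data = build_dict(dict_path)
        with open(cache_path, 'wb') as cf:
            marshal.dump(data, cf)
        return data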
jieba/posseg/__init__.py

@@ -46,7 +46,7 @@ def load_model(f_name, isJython=True):
 
    state = {}
    abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
-    with open(abs_path, 'r') as f:
+    with open(abs_path, 'rb') as f:
        state = marshal.load(f)
    f.closed
 
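The hunk above switches the character state table from text mode to binary mode. `marshal` serializes to a binary format, so reading it through a text-mode handle can corrupt the stream (newline translation on Windows) and fails outright on Python 3. A small round-trip illustration follows; the `state.tab` filename and the toy table are invented for the example. The remaining hunks of this file, below, update two calls into `jieba.calc`.

.. code:: python

    import marshal

    # marshal data is binary: always dump and load through 'b'-mode handles.
    state = {'B': -0.26, 'E': -3.14, 'M': -3.14, 'S': -1.46}   # toy state table

    with open('state.tab', 'wb') as f:
        marshal.dump(state, f)

    with open('state.tab', 'rb') as f:   # 'rb', not 'r'
        loaded = marshal.load(f)

    assert loaded == state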
@@ -126,7 +126,7 @@ def __cut_detail(sentence):
 def __cut_DAG_NO_HMM(sentence):
     DAG = jieba.get_DAG(sentence)
     route = {}
-    jieba.calc(sentence, DAG, 0, route=route)
+    jieba.calc(sentence, DAG, route)
     x = 0
     N = len(sentence)
     buf = u''
@@ -151,7 +151,7 @@ def __cut_DAG(sentence):
     DAG = jieba.get_DAG(sentence)
     route = {}
 
-    jieba.calc(sentence,DAG,0,route=route)
+    jieba.calc(sentence, DAG, route)
 
     x = 0
     buf = u''
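Both `__cut_DAG_NO_HMM` and `__cut_DAG` above are updated to the new three-argument form `jieba.calc(sentence, DAG, route)`; the old calls still passed a start index and a `route` keyword that the function in the main module no longer accepts, which appears to be the posseg breakage referenced by #202. For orientation, here is a rough, self-contained sketch of the kind of dynamic program such a `calc` performs over the word DAG; the fallback log-frequency of -20.0 and the toy inputs are invented for the example and are not jieba's real data.

.. code:: python

    def calc(sentence, DAG, route, logfreq=None):
        """Fill route[i] = (best log score of sentence[i:], end index of the best first word)."""
        if logfreq is None:
            logfreq = {}
        N = len(sentence)
        route[N] = (0.0, 0)
        for i in range(N - 1, -1, -1):
            # choose the candidate word sentence[i:x+1] that maximizes its own score
            # plus the best score of the remainder of the sentence
            route[i] = max(
                (logfreq.get(sentence[i:x + 1], -20.0) + route[x + 1][0], x)
                for x in DAG[i]
            )

    # toy usage: "abc" where "ab" is a known word and single characters are fallbacks
    sentence = "abc"
    DAG = {0: [0, 1], 1: [1], 2: [2]}          # DAG[i] lists possible word end positions
    logfreq = {"ab": -5.0, "a": -9.0, "b": -9.0, "c": -8.0}
    route = {}
    calc(sentence, DAG, route, logfreq)
    print(route[0])    # (-13.0, 1): take "ab" first, then "c"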
jieba/posseg/viterbi.py

@@ -3,9 +3,7 @@ MIN_FLOAT = -3.14e100
 MIN_INF = float("-inf")
 
 def get_top_states(t_state_v, K=4):
-    items = t_state_v.items()
-    topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]
-    return [x[0] for x in topK]
+    return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K]
 
 def viterbi(obs, states, start_p, trans_p, emit_p):
     V = [{}] #tabular
@@ -27,7 +25,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
             obs_states = prev_states_expect_next if prev_states_expect_next else all_states
 
         for y in obs_states:
-            prob, state = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states])
+            prob, state = max((V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states)
             V[t][y] = prob
             mem_path[t][y] = state
 
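Removing the square brackets turns the argument of `max()` from a list comprehension into a generator expression: the candidate `(score, previous_state)` tuples are consumed one at a time rather than first being materialized as a temporary list, and the chosen maximum is identical. A tiny illustration with made-up scores:

.. code:: python

    scores = {'B': -0.5, 'E': -1.0, 'M': -2.0, 'S': -0.1}   # toy log-probabilities

    # list comprehension: builds the whole list before max() scans it
    prob1, state1 = max([(scores[y0], y0) for y0 in scores])

    # generator expression: same result, no intermediate list
    prob2, state2 = max((scores[y0], y0) for y0 in scores)

    assert (prob1, state1) == (prob2, state2) == (-0.1, 'S')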
setup.py (78 changed lines)

@@ -1,10 +1,86 @@
+# -*- coding: utf-8 -*-
 from distutils.core import setup
+LONGDOC = u"""
+jieba
+=====
+
+"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to
+be the best Python Chinese word segmentation module.
+
+The complete documentation is in ``README.md``.
+
+GitHub: https://github.com/fxsjy/jieba
+
+Features
+========
+
+- Three segmentation modes are supported:
+
+  - accurate mode, which tries to cut the sentence into the most precise
+    segmentation, suitable for text analysis;
+  - full mode, which scans out every fragment of the sentence that can
+    form a word; very fast, but it cannot resolve ambiguity;
+  - search-engine mode, which, on top of accurate mode, cuts long words
+    again to improve recall; suitable for search-engine tokenization.
+
+- Traditional Chinese segmentation is supported
+- Custom dictionaries are supported
+
+Online demo: http://jiebademo.ap01.aws.af.cm/
+
+Installation
+============
+
+Python 2.x
+----------
+
+- Fully automatic installation: ``easy_install jieba`` or ``pip install jieba``
+- Semi-automatic installation: download https://pypi.python.org/pypi/jieba/ ,
+  unpack it and run
+
+      python setup.py install
+
+- Manual installation: place the jieba directory in the current directory
+  or in site-packages
+- Use it via ``import jieba``
+
+Python 3.x
+----------
+
+See https://pypi.python.org/pypi/jieba3k/
+
+- The master branch currently supports Python 2.x only
+- The Python 3.x branch is also basically usable:
+  https://github.com/fxsjy/jieba/tree/jieba3k
+
+.. code:: bash
+
+    git clone https://github.com/fxsjy/jieba.git
+    git checkout jieba3k
+    python setup.py install
+
+- Or install with pip3: pip3 install jieba3k
+
+"""
+
 setup(name='jieba',
       version='0.35',
       description='Chinese Words Segementation Utilities',
+      long_description=LONGDOC,
       author='Sun, Junyi',
       author_email='ccnusjy@gmail.com',
-      url='http://github.com/fxsjy',
+      url='https://github.com/fxsjy/jieba',
+      license="MIT",
+      classifiers=[
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: OS Independent',
+        'Natural Language :: Chinese (Simplified)',
+        'Natural Language :: Chinese (Traditional)',
+        'Programming Language :: Cython',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 2',
+        'Topic :: Text Processing',
+        'Topic :: Text Processing :: Indexing',
+        'Topic :: Text Processing :: Linguistic',
+      ],
+      keywords='NLP,tokenizing,Chinese word segementation',
       packages=['jieba'],
       package_dir={'jieba':'jieba'},
       package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}
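The new setup.py mainly adds packaging metadata: a reStructuredText `long_description` for the PyPI page, an explicit `license`, Trove `classifiers`, `keywords`, and a corrected project `url`. One way to sanity-check such metadata from the repository root is sketched below, using only standard distutils (the `check` command merely warns about missing or malformed fields; paths and expected output assume this repository layout):

.. code:: python

    from distutils.core import run_setup

    # Parse and run `setup.py check`, then print the parsed name and version.
    dist = run_setup('setup.py', script_args=['check'])
    print(dist.metadata.name, dist.metadata.version)   # expected: jieba 0.35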