mirror of https://github.com/fxsjy/jieba.git (synced 2025-07-10 00:01:33 +08:00)

Commit: update jieba3k
commit c6b386f65b
parent 6b0da06481

.gitignore (vendored, 6 changed lines)
@@ -113,8 +113,10 @@ Generated_Code #added for RIA/Silverlight projects
 _UpgradeReport_Files/
 Backup*/
 UpgradeLog*.XML
+############
+## pycharm
+############
+.idea
 
 ############
 ## Windows
Changelog (10 changed lines)

@@ -1,11 +1,11 @@
 2014-11-15: version 0.35.1
-1) fix Python 3.2的兼容性问题
+1. 修复 Python 3.2 的兼容性问题
 
 2014-11-13: version 0.35
-1) 改进词典cache的dump和加载机制;by @gumblex
-2)提升关键词提取的性能; by @gumblex
-3)关键词提取新增基于textrank算法的子模块; by @singlee
-4)修复自定义stopwords功能的bug; by @walkskyer
+1. 改进词典cache的dump和加载机制;by @gumblex
+2. 提升关键词提取的性能; by @gumblex
+3. 关键词提取新增基于textrank算法的子模块; by @singlee
+4. 修复自定义stopwords功能的bug; by @walkskyer
 
 
 2014-10-20: version 0.34
jieba/__init__.py

@@ -1,4 +1,4 @@
-__version__ = '0.34'
+__version__ = '0.35'
 __license__ = 'MIT'
 
 import re
@@ -135,7 +135,7 @@ def __cut_all(sentence):
                     old_j = j
 
 
-def calc(sentence, DAG, idx, route):
+def calc(sentence, DAG, route):
     N = len(sentence)
     route[N] = (0.0, '')
     for idx in range(N-1, -1, -1):
@@ -164,7 +164,7 @@ def __cut_DAG_NO_HMM(sentence):
     re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
     DAG = get_DAG(sentence)
     route = {}
-    calc(sentence, DAG, 0, route)
+    calc(sentence, DAG, route)
     x = 0
     N = len(sentence)
     buf = ''
@@ -187,7 +187,7 @@ def __cut_DAG_NO_HMM(sentence):
 def __cut_DAG(sentence):
     DAG = get_DAG(sentence)
     route = {}
-    calc(sentence, DAG, 0, route=route)
+    calc(sentence, DAG, route=route)
     x = 0
     buf = ''
     N = len(sentence)
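The functional change in this file is the calc() signature: the leading idx argument was unused (the function drives idx in its own loop, as the context lines show), so callers now pass only the sentence, its DAG and the route dict to fill. A minimal sketch of the new calling convention (the sample sentence is arbitrary; get_DAG() and calc() are the module-level helpers visible in the hunks above):

    import jieba

    sentence = "我来到北京清华大学"
    DAG = jieba.get_DAG(sentence)      # {start index: [candidate word end indices], ...}
    route = {}
    jieba.calc(sentence, DAG, route)   # new signature: no unused idx argument

    # route[i] = (best log probability, end index of the word chosen at position i)
    x = 0
    while x < len(sentence):
        y = route[x][1] + 1
        print(sentence[x:y])
        x = y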
jieba/analyse/__init__.py

@@ -1,5 +1,6 @@
 #encoding=utf-8
 import jieba
+import jieba.posseg
 import os
 from operator import itemgetter
 try:
@@ -54,25 +55,36 @@ def set_stop_words(stop_words_path):
     if not os.path.exists(abs_path):
         raise Exception("jieba: path does not exist: " + abs_path)
     content = open(abs_path,'rb').read().decode('utf-8')
-    lines = content.replace("\r","").split('\n')
+    lines = content.replace("\r", "").split('\n')
     for line in lines:
         STOP_WORDS.add(line)
 
-def extract_tags(sentence, topK=20, withWeight=False):
+def extract_tags(sentence, topK=20, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
     """
     Extract keywords from sentence using TF-IDF algorithm.
     Parameter:
         - topK: return how many top keywords. `None` for all possible words.
         - withWeight: if True, return a list of (word, weight);
                       if False, return a list of words.
+        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
+                    if the POS of w is not in this list,it will be filtered.
     """
     global STOP_WORDS, idf_loader
 
     idf_freq, median_idf = idf_loader.get_idf()
 
+    if allowPOS:
+        allowPOS = frozenset(allowPOS)
+        words = jieba.posseg.cut(sentence)
+    else:
         words = jieba.cut(sentence)
     freq = {}
     for w in words:
+        if allowPOS:
+            if w.flag not in allowPOS:
+                continue
+            else:
+                w = w.word
         if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
             continue
         freq[w] = freq.get(w, 0.0) + 1.0
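extract_tags() gains an allowPOS filter: when the list is non-empty the function segments with jieba.posseg.cut() (hence the new import) and keeps only words whose POS flag is in the set; otherwise it falls back to plain jieba.cut() as before. An illustrative call against the new signature; the sample text is arbitrary:

    import jieba.analyse

    text = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元"

    # default filter: only ns/n/vn/v words are counted
    print(jieba.analyse.extract_tags(text, topK=5))

    # an empty allowPOS (or None) disables POS filtering and uses jieba.cut()
    print(jieba.analyse.extract_tags(text, topK=5, allowPOS=[]))

    # withWeight=True returns (word, weight) pairs instead of bare words
    for word, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
        print(word, weight)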
jieba/analyse/textrank.py

@@ -48,15 +48,17 @@ class UndirectWeightedGraph:
         return ws
 
 
-def textrank(sentence, topK=10, withWeight=False):
+def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
     """
     Extract keywords from sentence using TextRank algorithm.
     Parameter:
         - topK: return how many top keywords. `None` for all possible words.
         - withWeight: if True, return a list of (word, weight);
                       if False, return a list of words.
+        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
+                    if the POS of w is not in this list,it will be filtered.
     """
-    pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
+    pos_filt = frozenset(allowPOS)
     g = UndirectWeightedGraph()
     cm = collections.defaultdict(int)
     span = 5
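textrank() gets the same knob: the previously hard-coded pos_filt = frozenset(('ns', 'n', 'vn', 'v')) now comes from allowPOS, so callers can widen or narrow the part-of-speech filter. A short usage sketch; the sentence and the extra 'a' (adjective) tag are only examples:

    import jieba.analyse

    text = "吉林欧亚置业有限公司主要从事房地产开发和商业地产运营"

    # default: place names, nouns, verb-nouns and verbs only
    print(jieba.analyse.textrank(text, topK=5))

    # widen the filter to also keep adjectives
    print(jieba.analyse.textrank(text, topK=5, allowPOS=['ns', 'n', 'vn', 'v', 'a']))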
jieba/posseg/__init__.py

@@ -125,7 +125,7 @@ def __cut_detail(sentence):
 def __cut_DAG_NO_HMM(sentence):
     DAG = jieba.get_DAG(sentence)
     route = {}
-    jieba.calc(sentence, DAG, 0, route=route)
+    jieba.calc(sentence, DAG, route)
     x = 0
     N = len(sentence)
     buf = ''
@@ -150,7 +150,7 @@ def __cut_DAG(sentence):
     DAG = jieba.get_DAG(sentence)
     route = {}
 
-    jieba.calc(sentence,DAG,0,route=route)
+    jieba.calc(sentence, DAG, route)
 
     x = 0
     buf = ''
jieba/posseg/viterbi.py

@@ -3,8 +3,7 @@ MIN_FLOAT = -3.14e100
 MIN_INF = float("-inf")
 
 def get_top_states(t_state_v, K=4):
-    topK = sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:K]
-    return [x[0] for x in topK]
+    return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K]
 
 def viterbi(obs, states, start_p, trans_p, emit_p):
     V = [{}] #tabular
@@ -26,7 +25,7 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
         obs_states = prev_states_expect_next if prev_states_expect_next else all_states
 
         for y in obs_states:
-            prob, state = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states])
+            prob, state = max((V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states)
             V[t][y] = prob
             mem_path[t][y] = state
 
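Both edits here are micro-optimizations: get_top_states() now sorts the state dict's keys by their scores directly instead of sorting items() and projecting the first element, and the max() inside viterbi() consumes a generator expression instead of building a temporary list. A toy equivalence check (the scores are made up, not jieba model data):

    import operator

    t_state_v = {'B': -3.2, 'E': -1.5, 'M': -7.0, 'S': -0.8}

    # old form: sort (state, score) pairs, then keep the states
    old = [x[0] for x in sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:4]]

    # new form: sort the keys, looking the scores up directly
    new = sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:4]

    assert old == new        # ['S', 'E', 'B', 'M'] either way

    # max() accepts a lazy generator just as well, so no temporary list is needed
    print(max((score, state) for state, score in t_state_v.items()))   # (-0.8, 'S')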
setup.py (71 changed lines)

@@ -1,10 +1,79 @@
+# -*- coding: utf-8 -*-
 from distutils.core import setup
+LONGDOC = """
+jieba
+=====
+
+“结巴”中文分词:做最好的 Python 中文分词组件
+
+"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to
+be the best Python Chinese word segmentation module.
+
+完整文档见 ``README.md``
+
+GitHub: https://github.com/fxsjy/jieba/tree/jieba3k
+
+特点
+====
+
+- 支持三种分词模式:
+
+  - 精确模式,试图将句子最精确地切开,适合文本分析;
+  - 全模式,把句子中所有的可以成词的词语都扫描出来,
+    速度非常快,但是不能解决歧义;
+  - 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。
+
+- 支持繁体分词
+- 支持自定义词典
+
+在线演示: http://jiebademo.ap01.aws.af.cm/
+
+安装说明
+========
+
+Python 2.x
+----------
+
+见 https://pypi.python.org/pypi/jieba/
+
+Python 3.x
+----------
+
+- 目前 master 分支是只支持 Python 2.x 的
+- Python 3.x 版本的分支也已经基本可用:
+  https://github.com/fxsjy/jieba/tree/jieba3k
+
+.. code:: bash
+
+    git clone https://github.com/fxsjy/jieba.git
+    git checkout jieba3k
+    python setup.py install
+
+- 或使用pip3安装: pip3 install jieba3k
+
+"""
+
 setup(name='jieba3k',
       version='0.35.1',
       description='Chinese Words Segementation Utilities',
+      long_description=LONGDOC,
       author='Sun, Junyi',
       author_email='ccnusjy@gmail.com',
-      url='http://github.com/fxsjy',
+      url='https://github.com/fxsjy/jieba/tree/jieba3k',
+      license="MIT",
+      classifiers=[
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: OS Independent',
+        'Natural Language :: Chinese (Simplified)',
+        'Natural Language :: Chinese (Traditional)',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 3',
+        'Topic :: Text Processing',
+        'Topic :: Text Processing :: Indexing',
+        'Topic :: Text Processing :: Linguistic',
+      ],
+      keywords='NLP,tokenizing,Chinese word segementation',
       packages=['jieba'],
       package_dir={'jieba':'jieba'},
       package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}
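The setup.py change is packaging metadata: a reST long description, license, trove classifiers and keywords for the jieba3k distribution. A quick post-install sanity check, assuming the package was installed from this setup.py (for example with pip3 install jieba3k):

    import pkg_resources
    dist = pkg_resources.get_distribution("jieba3k")
    print(dist.project_name, dist.version)       # jieba3k 0.35.1, per the setup() call above

    import jieba
    print(jieba.__version__, jieba.__license__)  # '0.35' and 'MIT', per the jieba/__init__.py hunk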
test/2to3.diff (214 changed lines)

@@ -1,6 +1,6 @@
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py
- --- ./jieba/analyse/analyzer.py 2014-11-07 23:07:02.779210408 +0800
- +++ ../jieba/jieba/analyse/analyzer.py 2014-11-07 23:07:02.079210422 +0800
+ --- ./jieba/analyse/analyzer.py 2014-11-29 15:46:45.987925569 +0800
+ +++ ../jieba/jieba/analyse/analyzer.py 2014-11-29 15:34:42.859932465 +0800
 @@ -1,4 +1,4 @@
 -##encoding=utf-8
 +#encoding=utf-8
@@ -8,9 +8,9 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 from whoosh.analysis import Tokenizer,Token
 from whoosh.lang.porter import stem
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py
- --- ./jieba/analyse/__init__.py 2014-11-07 23:07:02.879210406 +0800
- +++ ../jieba/jieba/analyse/__init__.py 2014-11-07 23:16:27.171198767 +0800
- @@ -25,7 +25,7 @@
+ --- ./jieba/analyse/__init__.py 2014-11-29 15:46:46.139925567 +0800
+ +++ ../jieba/jieba/analyse/__init__.py 2014-11-29 15:36:13.147931604 +0800
+ @@ -26,7 +26,7 @@
 
 def set_new_path(self, new_idf_path):
 if self.path != new_idf_path:
@@ -19,7 +19,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 idf_freq = {}
 lines = content.rstrip('\n').split('\n')
 for line in lines:
- @@ -81,7 +81,7 @@
+ @@ -93,7 +93,7 @@
 freq[k] *= idf_freq.get(k, median_idf) / total
 
 if withWeight:
@@ -29,8 +29,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 tags = sorted(freq, key=freq.__getitem__, reverse=True)
 if topK:
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py
- --- ./jieba/analyse/textrank.py 2014-11-07 23:07:02.827210407 +0800
- +++ ../jieba/jieba/analyse/textrank.py 2014-11-07 23:18:22.059196398 +0800
+ --- ./jieba/analyse/textrank.py 2014-11-29 15:46:46.043925568 +0800
+ +++ ../jieba/jieba/analyse/textrank.py 2014-11-29 15:36:39.291931354 +0800
 @@ -1,4 +1,4 @@
 -#!/usr/bin/env python
 +#!/usr/bin/env python3
@@ -61,7 +61,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 # to unify the weights, don't *100.
 ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
 
- @@ -70,12 +70,12 @@
+ @@ -72,12 +72,12 @@
 continue
 cm[(words[i].word, words[j].word)] += 1
 
@@ -77,19 +77,28 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
 if topK:
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py
- --- ./jieba/finalseg/__init__.py 2014-11-07 23:07:03.147210400 +0800
- +++ ../jieba/jieba/finalseg/__init__.py 2014-11-07 23:18:43.495195956 +0800
+ --- ./jieba/finalseg/__init__.py 2014-11-29 15:46:46.367925565 +0800
+ +++ ../jieba/jieba/finalseg/__init__.py 2014-11-29 15:34:42.859932465 +0800
 @@ -1,4 +1,3 @@
 -
 import re
 import os
 import marshal
+ @@ -89,7 +88,7 @@
+ sentence = sentence.decode('utf-8')
+ except UnicodeDecodeError:
+ sentence = sentence.decode('gbk', 'ignore')
+ - re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)")
+ + re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
+ blocks = re_han.split(sentence)
+ for blk in blocks:
+ if re_han.match(blk):
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py
- --- ./jieba/__init__.py 2014-11-07 23:07:02.751210408 +0800
- +++ ../jieba/jieba/__init__.py 2014-11-07 23:22:34.963191182 +0800
+ --- ./jieba/__init__.py 2014-11-29 15:46:45.955925569 +0800
+ +++ ../jieba/jieba/__init__.py 2014-11-29 15:39:03.335929981 +0800
 @@ -1,4 +1,3 @@
 -
- __version__ = '0.34'
+ __version__ = '0.35'
 __license__ = 'MIT'
 
 @@ -51,7 +50,7 @@
@@ -101,17 +110,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 return pfdict, lfreq, ltotal
 
 def initialize(dictionary=None):
- @@ -78,7 +77,8 @@
- if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
- logger.debug("Loading model from cache %s" % cache_file)
- try:
- - pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
- + with open(cache_file, 'rb') as cf:
- + pfdict,FREQ,total,min_freq = marshal.load(cf)
- # prevent conflict with old version
- load_from_cache_fail = not isinstance(pfdict, set)
- except:
- @@ -228,11 +228,11 @@
+ @@ -229,11 +228,11 @@
 '''The main function that segments an entire sentence that contains
 Chinese characters into seperated words.
 Parameter:
@@ -125,7 +124,19 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 try:
 sentence = sentence.decode('utf-8')
 except UnicodeDecodeError:
- @@ -338,8 +338,6 @@
+ @@ -243,9 +242,9 @@
+ # \r\n|\s : whitespace characters. Will not be handled.
+ 
+ if cut_all:
+ - re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
+ + re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
+ else:
+ - re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U)
+ + re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
+ blocks = re_han.split(sentence)
+ if cut_all:
+ cut_block = __cut_all
+ @@ -339,8 +338,6 @@
 global pool, cut, cut_for_search
 if os.name == 'nt':
 raise Exception("jieba: parallel mode only supports posix system")
@@ -134,7 +145,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 from multiprocessing import Pool, cpu_count
 if processnum is None:
 processnum = cpu_count()
- @@ -392,12 +390,12 @@
+ @@ -393,12 +390,12 @@
 def tokenize(unicode_sentence, mode="default", HMM=True):
 """Tokenize a sentence and yields tuples of (word, start, end)
 Parameter:
@@ -150,8 +161,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 if mode == 'default':
 for w in cut(unicode_sentence, HMM=HMM):
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py
- --- ./jieba/__main__.py 2014-11-07 23:07:02.563210412 +0800
- +++ ../jieba/jieba/__main__.py 2014-11-07 23:07:02.079210422 +0800
+ --- ./jieba/__main__.py 2014-11-29 15:46:45.747925571 +0800
+ +++ ../jieba/jieba/__main__.py 2014-11-29 15:34:42.859932465 +0800
 @@ -40,7 +40,7 @@
 ln = fp.readline()
 while ln:
@@ -162,8 +173,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 
 fp.close()
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py
- --- ./jieba/posseg/__init__.py 2014-11-07 23:07:03.047210402 +0800
- +++ ../jieba/jieba/posseg/__init__.py 2014-11-07 23:19:40.883194772 +0800
+ --- ./jieba/posseg/__init__.py 2014-11-29 15:46:46.271925566 +0800
+ +++ ../jieba/jieba/posseg/__init__.py 2014-11-29 15:37:52.299930658 +0800
 @@ -1,4 +1,3 @@
 -
 import re
@@ -188,27 +199,41 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 if not isJython:
 return result
 
- @@ -46,7 +45,7 @@
- state = {}
- abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
- - with open(abs_path, 'r') as f:
- + with open(abs_path, 'rb') as f:
- state = marshal.load(f)
- f.closed
+ @@ -105,8 +104,8 @@
+ yield pair(sentence[next:], pos_list[next][1])
 
+ def __cut_detail(sentence):
+ - re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
+ - re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ + re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
+ + re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
+ blocks = re_han.split(sentence)
+ for blk in blocks:
+ if re_han.match(blk):
+ @@ -130,7 +129,7 @@
+ x = 0
+ N = len(sentence)
+ buf = ''
+ - re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
+ + re_eng = re.compile('[a-zA-Z0-9]',re.U)
+ while x < N:
+ y = route[x][1]+1
+ l_word = sentence[x:y]
+ @@ -195,8 +194,8 @@
+ sentence = sentence.decode('utf-8')
+ except UnicodeDecodeError:
+ sentence = sentence.decode('gbk', 'ignore')
+ - re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)")
+ - re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ + re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
+ + re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
+ blocks = re_han.split(sentence)
+ if HMM:
+ __cut_blk = __cut_DAG
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py
- --- ./jieba/posseg/viterbi.py 2014-11-07 23:07:03.079210402 +0800
- +++ ../jieba/jieba/posseg/viterbi.py 2014-11-07 23:07:02.095210422 +0800
- @@ -3,14 +3,13 @@
- MIN_INF = float("-inf")
- 
- def get_top_states(t_state_v, K=4):
- - items = list(t_state_v.items())
- - topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]
- + topK = sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:K]
- return [x[0] for x in topK]
+ --- ./jieba/posseg/viterbi.py 2014-11-29 15:46:46.303925566 +0800
+ +++ ../jieba/jieba/posseg/viterbi.py 2014-11-29 15:38:28.527930313 +0800
+ @@ -8,7 +8,7 @@
 
 def viterbi(obs, states, start_p, trans_p, emit_p):
 V = [{}] #tabular
 mem_path = [{}]
@@ -217,7 +242,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 for y in states.get(obs[0], all_states): #init
 V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
 mem_path[0][y] = ''
- @@ -18,9 +17,9 @@
+ @@ -16,9 +16,9 @@
 V.append({})
 mem_path.append({})
 #prev_states = get_top_states(V[t-1])
@@ -229,7 +254,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next
 
 if not obs_states:
- @@ -31,7 +30,7 @@
+ @@ -29,7 +29,7 @@
 V[t][y] = prob
 mem_path[t][y] = state
 
@@ -239,8 +264,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 #print obs
 prob, state = max(last)
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md
- --- ./README.md 2014-11-07 23:07:02.067210423 +0800
- +++ ../jieba/README.md 2014-11-07 23:24:49.263188412 +0800
+ --- ./README.md 2014-11-29 15:46:08.487925926 +0800
+ +++ ../jieba/README.md 2014-11-29 15:34:42.859932465 +0800
 @@ -4,6 +4,9 @@
 "Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
 - _Scroll down for English documentation._
@@ -348,18 +373,65 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 
 ```
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py
- --- ./setup.py 2014-11-07 23:07:02.067210423 +0800
- +++ ../jieba/setup.py 2014-11-07 23:07:02.095210422 +0800
- @@ -1,5 +1,5 @@
+ --- ./setup.py 2014-11-29 15:46:46.379925565 +0800
+ +++ ../jieba/setup.py 2014-11-29 15:42:20.263928103 +0800
+ @@ -11,7 +11,7 @@
- from distutils.core import setup
+ 完整文档见 ``README.md``
+ 
+ -GitHub: https://github.com/fxsjy/jieba
+ +GitHub: https://github.com/fxsjy/jieba/tree/jieba3k
+ 
+ 特点
+ ====
+ @@ -34,17 +34,11 @@
+ Python 2.x
+ ----------
+ 
+ -- 全自动安装: ``easy_install jieba`` 或者 ``pip install jieba``
+ -- 半自动安装:先下载 https://pypi.python.org/pypi/jieba/ ,解压后运行
+ - python setup.py install
+ -- 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录
+ -- 通过 ``import jieba`` 来引用
+ +见 https://pypi.python.org/pypi/jieba/
+ 
+ Python 3.x
+ ----------
+ 
+ -见 https://pypi.python.org/pypi/jieba3k/
+ -
+ - 目前 master 分支是只支持 Python 2.x 的
+ - Python 3.x 版本的分支也已经基本可用:
+ https://github.com/fxsjy/jieba/tree/jieba3k
+ @@ -59,13 +53,13 @@
+ 
+ """
+ 
 -setup(name='jieba',
 +setup(name='jieba3k',
- version='0.35',
+ version='0.35.1',
 description='Chinese Words Segementation Utilities',
+ long_description=LONGDOC,
 author='Sun, Junyi',
+ author_email='ccnusjy@gmail.com',
+ - url='https://github.com/fxsjy/jieba',
+ + url='https://github.com/fxsjy/jieba/tree/jieba3k',
+ license="MIT",
+ classifiers=[
+ 'Intended Audience :: Developers',
+ @@ -73,9 +67,8 @@
+ 'Operating System :: OS Independent',
+ 'Natural Language :: Chinese (Simplified)',
+ 'Natural Language :: Chinese (Traditional)',
+ 'Programming Language :: Python',
+ - 'Programming Language :: Python :: 2',
+ + 'Programming Language :: Python :: 3',
+ 'Topic :: Text Processing',
+ 'Topic :: Text Processing :: Indexing',
+ 'Topic :: Text Processing :: Linguistic',
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py
- --- ./test/extract_topic.py 2014-11-07 23:07:03.707210389 +0800
- +++ ../jieba/test/extract_topic.py 2014-11-07 23:07:02.095210422 +0800
+ --- ./test/extract_topic.py 2014-11-29 15:46:47.003925559 +0800
+ +++ ../jieba/test/extract_topic.py 2014-11-29 15:34:42.919932464 +0800
 @@ -51,13 +51,13 @@
 print("training...")
 
@@ -379,8 +451,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 + for i in topic.argsort()[:-n_top_words - 1:-1]]))
 print("")
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py
- --- ./test/jiebacmd.py 2014-11-07 23:07:03.211210399 +0800
- +++ ../jieba/test/jiebacmd.py 2014-11-07 23:07:02.099210422 +0800
+ --- ./test/jiebacmd.py 2014-11-29 15:46:46.443925564 +0800
+ +++ ../jieba/test/jiebacmd.py 2014-11-29 15:34:42.919932464 +0800
 @@ -23,6 +23,6 @@
 break
 line = line.strip()
@@ -390,9 +462,9 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 
 
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py
- --- ./test/jieba_test.py 2014-11-07 23:07:03.947210384 +0800
- +++ ../jieba/test/jieba_test.py 2014-11-07 23:07:02.099210422 +0800
- @@ -1,5 +1,6 @@
+ --- ./test/jieba_test.py 2014-11-29 15:46:47.271925556 +0800
+ +++ ../jieba/test/jieba_test.py 2014-11-29 15:34:42.919932464 +0800
+ @@ -152,7 +152,7 @@
 #-*-coding: utf-8 -*-
 import sys
 +import imp
@@ -417,7 +489,7 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
 result = list(result)
 assert isinstance(result, list), "Test Tokenize error on content: %s" % content
- @@ -180,7 +181,7 @@
+ @@ -181,7 +181,7 @@
 
 def testTokenize_NOHMM(self):
 for content in test_contents:
@@ -427,8 +499,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 result = list(result)
 assert isinstance(result, list), "Test Tokenize error on content: %s" % content
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py
- --- ./test/test_tokenize_no_hmm.py 2014-11-07 23:07:04.031210382 +0800
- +++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-07 23:07:02.099210422 +0800
+ --- ./test/test_tokenize_no_hmm.py 2014-11-29 15:46:47.355925556 +0800
+ +++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-29 15:34:42.919932464 +0800
 @@ -7,7 +7,6 @@
 
 def cuttest(test_sent):
@@ -438,8 +510,8 @@ diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.p
 for tk in result:
 print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
 diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py
- --- ./test/test_tokenize.py 2014-11-07 23:07:04.071210381 +0800
- +++ ../jieba/test/test_tokenize.py 2014-11-07 23:07:02.099210422 +0800
+ --- ./test/test_tokenize.py 2014-11-29 15:46:47.403925555 +0800
+ +++ ../jieba/test/test_tokenize.py 2014-11-29 15:34:42.919932464 +0800
 @@ -7,7 +7,6 @@
 
 def cuttest(test_sent):
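Most of the regenerated 2to3.diff records two recurring Python 2 to Python 3 edits: the raw-string prefix is dropped from the regular expressions so that \uXXXX escapes are resolved inside ordinary str literals, and files read with marshal.load() are opened in binary mode. A small self-contained illustration of both points; the sample string and the temporary file name are arbitrary:

    import marshal
    import re

    # in a Python 3 str literal the \u4E00-\u9FA5 escapes become the actual Han
    # characters, so the compiled pattern splits text into Han and non-Han runs
    re_han = re.compile("([\u4E00-\u9FA5]+)")
    print(re_han.split("jieba分词 Python中文分词组件"))

    # marshal works on bytes, hence the 'rb' / 'wb' modes in the updated hunks
    with open("demo.marshal", "wb") as f:
        marshal.dump({"total": 123456}, f)
    with open("demo.marshal", "rb") as f:
        print(marshal.load(f))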