Merge pull request #100 from ZoeyYoung/jieba3k

Jieba3k
Merged by Sun Junyi on 2013-08-21 00:50:47 -07:00
commit d16727ba89
20 changed files with 308 additions and 139 deletions

.gitignore (vendored, 4 changed lines)

@@ -164,3 +164,7 @@ pip-log.txt
 *.log
 test/tmp/*
+#jython
+*.class
+MANIFEST

Changelog

@@ -1,3 +1,20 @@
2013-07-01: version 0.31
==========================
1. Indentation reformatted to follow the PEP 8 standard
2. Support for the Jython interpreter; thanks @piaolingxue
3. Fixed a bug where mixed Chinese-English words with leading digits were not recognized
4. Refactored parts of the code; thanks @chao78787
5. Parallel segmentation mode now detects the CPU count automatically and picks a suitable number of processes; thanks @linkerlin
6. Fixed the erroneous dependency of jieba.extract_tags on the whoosh module in version 0.30
2013-07-01: version 0.30
==========================
1) Added the jieba.tokenize method, which returns the position of each word in the original text
2) Added ChineseAnalyzer to support the whoosh search engine
3) Added more mixed Chinese-English words to the dictionary
4) Changed how some .py files are loaded so the package can be bundled into an exe with py2exe or cx_Freeze
2013-06-17: version 0.29.1
==========================
1) Optimized the viterbi code; segmentation speed improved by 15%

LICENSE (new file, 20 lines)

@@ -0,0 +1,20 @@
The MIT License (MIT)
Copyright (c) 2013 Sun Junyi
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

MANIFEST.in (new file, 2 lines)

@@ -0,0 +1,2 @@
graft README.md
graft Changelog

README.md

@@ -29,19 +29,31 @@ http://jiebademo.ap01.aws.af.cm/
 (Powered by Appfog)
-Python Version
-==============
-* The master branch currently supports Python 2.x only
-* The Python 3.x branch is also mostly usable: https://github.com/fxsjy/jieba/tree/jieba3k
 Demo site source code: https://github.com/fxsjy/jiebademo
-Usage
-========
+Installation under Python 2.x
+===================
+* Fully automatic: `easy_install jieba` or `pip install jieba`
+* Semi-automatic: download from http://pypi.python.org/pypi/jieba/, extract, then run python setup.py install
+* Manual: place the jieba directory in the current directory or in site-packages
+* Load it via import jieba (the first import builds the Trie, which takes a few seconds); see the sketch after this list for triggering that up front
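A minimal sketch (not part of this diff) of paying the first-import cost eagerly; jieba.initialize() is the same call test/test_file.py uses later in this commit:

```python
import jieba

# Build the Trie and load the dictionary now, instead of on the
# first call to jieba.cut.
jieba.initialize()
print("/".join(jieba.cut("我来到北京清华大学")))
```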
+Installation under Python 3.x
+====================
+* The master branch currently supports Python 2.x only
+* The Python 3.x branch is also mostly usable: https://github.com/fxsjy/jieba/tree/jieba3k
+    git clone https://github.com/fxsjy/jieba.git
+    git checkout jieba3k
+    python setup.py install
+jieba for Java
+================
+Author: piaolingxue
+Repository: https://github.com/huaban/jieba-analysis
 Algorithm
 ========
 * Based on a Trie structure, performs efficient word-graph scanning to build a directed acyclic graph (DAG) of all possible words formed by the Chinese characters in a sentence
@@ -122,7 +134,7 @@ Output:
 >>> import jieba.posseg as pseg
 >>> words = pseg.cut("我爱北京天安门")
 >>> for w in words:
-...    print(w.word,w.flag)
+...    print w.word, w.flag
 ...
 我 r
 爱 v
@@ -142,6 +154,50 @@ Output:
* Benchmark: on a 4-core 3.4 GHz Linux machine, accurate-mode segmentation of the complete works of Jin Yong reached 1 MB/s, 3.3 times the single-process speed (a usage sketch follows below).
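A usage sketch of parallel mode; the input file name is hypothetical, parallel mode is POSIX-only, and as of 0.31 the process count defaults to the machine's CPU count:

```python
import time
import jieba

jieba.enable_parallel()  # processnum defaults to cpu_count() as of 0.31

content = open("big_corpus.txt", "rb").read()  # hypothetical input file
t1 = time.time()
words = "/ ".join(jieba.cut(content))
tm_cost = time.time() - t1
print('speed', len(content) / tm_cost, "bytes/second")
```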
Feature 6): Tokenize: return words with their positions in the original text
============================================
* Note: the input only accepts unicode
* Default mode
```python
result = jieba.tokenize('永和服装饰品有限公司')
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
```
```
word 永和 start: 0 end:2
word 服装 start: 2 end:4
word 饰品 start: 4 end:6
word 有限公司 start: 6 end:10
```
* Search mode
```python
result = jieba.tokenize('永和服装饰品有限公司', mode='search')
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
```
```
word 永和 start: 0 end:2
word 服装 start: 2 end:4
word 饰品 start: 4 end:6
word 有限 start: 6 end:8
word 公司 start: 8 end:10
word 有限公司 start: 6 end:10
```
Feature 7): ChineseAnalyzer for the Whoosh search engine
============================================
* Import: `from jieba.analyse import ChineseAnalyzer`
* Usage example: https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py (a minimal sketch follows below)
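A minimal sketch of wiring ChineseAnalyzer into a whoosh index, loosely following the linked test_whoosh.py; the schema fields and index directory are illustrative:

```python
import os
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()
# Index schema: "title" and "content" are illustrative field names.
schema = Schema(title=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))
os.makedirs("tmp_idx", exist_ok=True)  # hypothetical index directory
ix = create_in("tmp_idx", schema)

writer = ix.writer()
writer.add_document(title="1", content="我的好朋友是李明")
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("content", schema=schema).parse("朋友")
    for hit in searcher.search(query):
        print(hit.highlights("content"))
```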
Other dictionaries
========
1. A dictionary file with a smaller memory footprint
@@ -189,7 +245,7 @@ jieba uses lazy loading: "import jieba" does not immediately trigger dictionary loading; once…
Change Log
==========
-http://www.oschina.net/p/jieba/news#list
+https://github.com/fxsjy/jieba/blob/master/Changelog
jieba
========

jieba/__init__.py

@@ -1,10 +1,9 @@
 from __future__ import with_statement
 import re
+__version__ = '0.31'
+__license__ = 'MIT'
 import math
-import re
 import os
 import sys
-import pprint
 from . import finalseg
 import time
@@ -39,7 +38,7 @@ def gen_trie(f_name):
             ltotal+=freq
             p = trie
             for c in word:
-                if not c in p:
+                if c not in p:
                     p[c] ={}
                 p = p[c]
             p['']='' #ending flag
@@ -150,7 +149,7 @@ def get_DAG(sentence):
         if c in p:
             p = p[c]
             if '' in p:
-                if not i in DAG:
+                if i not in DAG:
                     DAG[i]=[]
                 DAG[i].append(j)
             j+=1
@@ -163,7 +162,7 @@ def get_DAG(sentence):
             i+=1
             j=i
     for i in range(len(sentence)):
-        if not i in DAG:
+        if i not in DAG:
             DAG[i] =[i]
     return DAG
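For intuition about the structure returned here, a toy re-implementation (not jieba's code) over a hard-coded word set: DAG[i] holds every end index j such that sentence[i:j+1] is a known word, falling back to the bare character.

```python
# Illustrative sketch with an assumed toy vocabulary.
WORDS = {"永和", "服装", "饰品", "有限", "公司", "有限公司"}
MAX_LEN = 4  # longest word in the toy vocabulary

def toy_get_DAG(sentence):
    DAG = {}
    for i in range(len(sentence)):
        ends = [j for j in range(i, min(i + MAX_LEN, len(sentence)))
                if sentence[i:j + 1] in WORDS]
        DAG[i] = ends or [i]  # fall back to the single character
    return DAG

print(toy_get_DAG("有限公司"))  # {0: [1, 3], 1: [1], 2: [3], 3: [3]}
```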
@@ -186,7 +185,7 @@ def __cut_DAG(sentence):
                 yield buf
                 buf=''
             else:
-                if not (buf in FREQ):
+                if (buf not in FREQ):
                     regognized = finalseg.cut(buf)
                     for t in regognized:
                         yield t
@@ -201,7 +200,7 @@ def __cut_DAG(sentence):
             if len(buf)==1:
                 yield buf
             else:
-                if not (buf in FREQ):
+                if (buf not in FREQ):
                     regognized = finalseg.cut(buf)
                     for t in regognized:
                         yield t
@@ -210,7 +209,7 @@ def __cut_DAG(sentence):
             yield elem

 def cut(sentence,cut_all=False):
-    if( type(sentence) is bytes):
+    if isinstance(sentence, bytes):
         try:
             sentence = sentence.decode('utf-8')
         except UnicodeDecodeError:
@@ -227,8 +226,9 @@ def cut(sentence,cut_all=False):
     if cut_all:
         cut_block = __cut_all
     for blk in blocks:
+        if len(blk)==0:
+            continue
         if re_han.match(blk):
-            #pprint.pprint(__cut_DAG(blk))
             for word in cut_block(blk):
                 yield word
         else:
@@ -284,7 +284,7 @@ def add_word(word, freq, tag=None):
         user_word_tag_tab[word] = tag.strip()
     p = trie
     for c in word:
-        if not c in p:
+        if c not in p:
             p[c] = {}
         p = p[c]
     p[''] = ''  # ending flag
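For context, a toy sketch (not from the diff) of the nested-dict trie that gen_trie and add_word populate, using the same '' ending-flag convention as above:

```python
def toy_add_word(trie, word):
    # Each trie level maps a character to the next level; the '' key
    # marks that the path from the root spells a complete word.
    p = trie
    for c in word:
        if c not in p:
            p[c] = {}
        p = p[c]
    p[''] = ''  # ending flag

trie = {}
toy_add_word(trie, "有限")
toy_add_word(trie, "有限公司")
print(trie)  # {'有': {'限': {'': '', '公': {'司': {'': ''}}}}}
```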
@@ -299,13 +299,17 @@ def __lcut_all(sentence):

 def __lcut_for_search(sentence):
     return list(__ref_cut_for_search(sentence))

 @require_initialized
-def enable_parallel(processnum):
+def enable_parallel(processnum=None):
     global pool,cut,cut_for_search
     if os.name=='nt':
-        raise Exception("parallel mode only supports posix system")
-    from multiprocessing import Pool
+        raise Exception("jieba: parallel mode only supports posix system")
+    if sys.version_info[0]==2 and sys.version_info[1]<6:
+        raise Exception("jieba: the parallel feature needs Python version>2.5 ")
+    from multiprocessing import Pool,cpu_count
+    if processnum==None:
+        processnum = cpu_count()
     pool = Pool(processnum)

     def pcut(sentence,cut_all=False):
@@ -341,7 +345,7 @@ def set_dictionary(dictionary_path):
     with DICT_LOCK:
         abs_path = os.path.normpath( os.path.join( os.getcwd(), dictionary_path ) )
         if not os.path.exists(abs_path):
-            raise Exception("path does not exists:" + abs_path)
+            raise Exception("jieba: path does not exists:" + abs_path)
         DICTIONARY = abs_path
         initialized = False
@@ -353,7 +357,7 @@ def get_abs_path_dict():

 def tokenize(unicode_sentence,mode="default"):
     #mode ("default" or "search")
     if not isinstance(unicode_sentence, str):
-        raise Exception("jieba: the input parameter should string.")
+        raise Exception("jieba: the input parameter should unicode.")
     start = 0
     if mode=='default':
         for w in cut(unicode_sentence):

jieba/finalseg/__init__.py

@@ -1,12 +1,15 @@
 import re
+import os
 from math import log
-from . import prob_start
-from . import prob_trans
-from . import prob_emit
+import marshal
+import sys

 MIN_FLOAT=-3.14e100

+PROB_START_P = "prob_start.p"
+PROB_TRANS_P = "prob_trans.p"
+PROB_EMIT_P = "prob_emit.p"

 PrevStatus = {
     'B':('E','S'),
     'M':('M','B'),
@@ -14,6 +17,35 @@ PrevStatus = {
     'E':('B','M')
 }

+def load_model():
+    _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+
+    start_p = {}
+    abs_path = os.path.join(_curpath, PROB_START_P)
+    with open(abs_path, mode='rb') as f:
+        start_p = marshal.load(f)
+    f.closed
+
+    trans_p = {}
+    abs_path = os.path.join(_curpath, PROB_TRANS_P)
+    with open(abs_path, 'rb') as f:
+        trans_p = marshal.load(f)
+    f.closed
+
+    emit_p = {}
+    abs_path = os.path.join(_curpath, PROB_EMIT_P)
+    with file(abs_path, 'rb') as f:
+        emit_p = marshal.load(f)
+    f.closed
+
+    return start_p, trans_p, emit_p
+
+if sys.platform.startswith("java"):
+    start_P, trans_P, emit_P = load_model()
+else:
+    import prob_start,prob_trans,prob_emit
+    start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P

 def viterbi(obs, states, start_p, trans_p, emit_p):
     V = [{}] #tabular
     path = {}
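The prob_*.p files referenced above are marshal-serialized dicts of log-probabilities. A minimal round-trip sketch, with toy data and a hypothetical file name, of the format load_model expects:

```python
import marshal

# Toy stand-in for one of the prob_*.p model files: a dict of
# log-probabilities, serialized with marshal just like the loader above.
start_p = {'B': -0.26, 'E': -3.14e100, 'M': -3.14e100, 'S': -1.46}

with open('toy_prob_start.p', 'wb') as f:  # hypothetical file name
    marshal.dump(start_p, f)

with open('toy_prob_start.p', 'rb') as f:
    assert marshal.load(f) == start_p
```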
@@ -36,7 +68,8 @@ def viterbi(obs, states, start_p, trans_p, emit_p):

 def __cut(sentence):
-    prob, pos_list = viterbi(sentence,('B','M','E','S'), prob_start.P, prob_trans.P, prob_emit.P)
+    global emit_P
+    prob, pos_list = viterbi(sentence,('B','M','E','S'), start_P, trans_P, emit_P)
     begin, next = 0,0
     #print pos_list, sentence
     for i,char in enumerate(sentence):
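A short usage sketch: finalseg.cut is the HMM fallback that __cut_DAG in jieba/__init__.py applies to character spans absent from the dictionary, and it can also be called directly.

```python
from jieba import finalseg

# Segment purely with the HMM model, no dictionary involved.
print("/".join(finalseg.cut("小明硕士毕业于中国科学院计算所")))
```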

jieba/finalseg/prob_emit.p (new binary file, not shown)

jieba/finalseg/prob_start.p (new binary file, not shown)

jieba/finalseg/prob_trans.p (new binary file, not shown)

jieba/posseg/__init__.py

@@ -3,29 +3,62 @@ import os
 from . import viterbi
 import jieba
 import sys
-from . import prob_start
-from . import prob_trans
-from . import prob_emit
-from . import char_state_tab
+import marshal

 default_encoding = sys.getfilesystemencoding()

-def load_model(f_name):
+PROB_START_P = "prob_start.p"
+PROB_TRANS_P = "prob_trans.p"
+PROB_EMIT_P = "prob_emit.p"
+CHAR_STATE_TAB_P = "char_state_tab.p"
+
+def load_model(f_name,isJython=True):
     _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )

     prob_p_path = os.path.join(_curpath,f_name)
     if f_name.endswith(".py"):
         return eval(open(prob_p_path,"rb").read())
     else:
         result = {}
-        with file(f_name, "rb") as f:
+        for line in open(f_name,"rb"):
             line = line.strip()
             if line=="":continue
             line = line.decode("utf-8")
             word, _, tag = line.split(" ")
             result[word]=tag
         f.closed
+        if not isJython:
+            return result

-word_tag_tab = load_model(jieba.get_abs_path_dict())
+        start_p = {}
+        abs_path = os.path.join(_curpath, PROB_START_P)
+        with open(abs_path, mode='rb') as f:
+            start_p = marshal.load(f)
+        f.closed
+
+        trans_p = {}
+        abs_path = os.path.join(_curpath, PROB_TRANS_P)
+        with open(abs_path, 'rb') as f:
+            trans_p = marshal.load(f)
+        f.closed
+
+        emit_p = {}
+        abs_path = os.path.join(_curpath, PROB_EMIT_P)
+        with file(abs_path, 'rb') as f:
+            emit_p = marshal.load(f)
+        f.closed
+
+        state = {}
+        abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
+        with file(abs_path, 'rb') as f:
+            state = marshal.load(f)
+        f.closed
+
+        return state, start_p, trans_p, emit_p, result
+
+if sys.platform.startswith("java"):
+    char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
+else:
+    import char_state_tab, prob_start, prob_trans, prob_emit
+    char_state_tab_P, start_P, trans_P, emit_P = char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P
+    word_tag_tab = load_model(jieba.get_abs_path_dict(),isJython=False)

 if jieba.user_word_tag_tab:
     word_tag_tab.update(jieba.user_word_tag_tab)
@@ -48,7 +81,7 @@ class pair(object):
         return self.__unicode__().encode(arg)

 def __cut(sentence):
-    prob, pos_list = viterbi.viterbi(sentence,char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P)
+    prob, pos_list = viterbi.viterbi(sentence,char_state_tab_P, start_P, trans_P, emit_P)
     begin, next = 0,0

     for i,char in enumerate(sentence):
@@ -105,7 +138,7 @@ def __cut_DAG(sentence):
                 yield pair(buf,word_tag_tab.get(buf,'x'))
                 buf=''
             else:
-                if not (buf in jieba.FREQ):
+                if (buf not in jieba.FREQ):
                     regognized = __cut_detail(buf)
                     for t in regognized:
                         yield t
@@ -120,7 +153,7 @@ def __cut_DAG(sentence):
             if len(buf)==1:
                 yield pair(buf,word_tag_tab.get(buf,'x'))
             else:
-                if not (buf in jieba.FREQ):
+                if (buf not in jieba.FREQ):
                     regognized = __cut_detail(buf)
                     for t in regognized:
                         yield t
@@ -129,7 +162,7 @@ def __cut_DAG(sentence):
             yield pair(elem,word_tag_tab.get(elem,'x'))

 def __cut_internal(sentence):
-    if not ( type(sentence) is str):
+    if not isinstance(sentence, str):
         try:
             sentence = sentence.decode('utf-8')
         except:

jieba/posseg/char_state_tab.p (binary file, not shown)

jieba/posseg/prob_emit.p (new binary file, not shown)

jieba/posseg/prob_start.p (new binary file, not shown)

jieba/posseg/prob_trans.p (new binary file, not shown)

setup.py

@@ -1,6 +1,6 @@
 from distutils.core import setup
 setup(name='jieba',
-      version='0.29.1',
+      version='0.31',
       description='Chinese Words Segementation Utilities',
       author='Sun, Junyi',
       author_email='ccnusjy@gmail.com',

test/parallel/test_file.py

@@ -2,18 +2,20 @@ import sys,time
 import sys
 sys.path.append("../../")
 import jieba

-jieba.enable_parallel(4)
+jieba.enable_parallel()

 url = sys.argv[1]
-content = open(url,"rb").read()
+with open(url,"rb") as content:
+    content = content.read()

 t1 = time.time()
-words = list(jieba.cut(content))
+words = "/ ".join(jieba.cut(content))

 t2 = time.time()
 tm_cost = t2-t1

-log_f = open("1.log","wb")
-for w in words:
-    log_f.write(w.encode("utf-8"))
 print('cost',tm_cost)
 print('speed' , len(content)/tm_cost, " bytes/second")

+with open("1.log","wb") as log_f:
+    log_f.write(words.encode('utf-8'))

test/test_file.py

@@ -5,17 +5,15 @@ import jieba
 jieba.initialize()

 url = sys.argv[1]
-content = open(url,"rb").read()
+with open(url,"rb") as content:
+    content = content.read()

 t1 = time.time()
-words = list(jieba.cut(content))
+words = "/ ".join(jieba.cut(content))

 t2 = time.time()
 tm_cost = t2-t1

-log_f = open("1.log","wb")
-log_f.write(bytes("/ ".join(words),'utf-8'))
 print('cost',tm_cost)
 print('speed' , len(content)/tm_cost, " bytes/second")

+with open("1.log","wb") as log_f:
+    log_f.write(words.encode('utf-8'))