Merge master and jieba3k, make the code Python 2/3 compatible

commit 22bcf8be7a
Dingyuan Wang, 2015-02-10 20:54:55 +08:00
48 changed files with 131433 additions and 131938 deletions


@ -68,16 +68,16 @@ python setup.py install
import jieba import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True) seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print "Full Mode:", "/ ".join(seg_list) # 全模式 print("Full Mode: " + "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False) seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print "Default Mode:", "/ ".join(seg_list) # 精确模式 print("Default Mode: " + "/ ".join(seg_list)) # 精确模式
seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式 seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
print ", ".join(seg_list) print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print ", ".join(seg_list) print(", ".join(seg_list))
``` ```
输出: 输出:
@ -174,7 +174,7 @@ jieba.analyse.textrank(raw_text)
>>> import jieba.posseg as pseg >>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门") >>> words = pseg.cut("我爱北京天安门")
>>> for w in words: >>> for w in words:
... print w.word, w.flag ... print('%s %s' % (w.word, w.flag))
... ...
我 r 我 r
爱 v 爱 v
@ -203,7 +203,7 @@ jieba.analyse.textrank(raw_text)
```python ```python
result = jieba.tokenize(u'永和服装饰品有限公司') result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result: for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
``` ```
``` ```
@ -219,7 +219,7 @@ word 有限公司 start: 6 end:10
```python ```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search') result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result: for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
``` ```
``` ```
@ -413,16 +413,16 @@ Main Functions
import jieba import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True) seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print "Full Mode:", "/ ".join(seg_list) # 全模式 print("Full Mode: " + "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False) seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print "Default Mode:", "/ ".join(seg_list) # 默认模式 print("Default Mode: " + "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut("他来到了网易杭研大厦") seg_list = jieba.cut("他来到了网易杭研大厦")
print ", ".join(seg_list) print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print ", ".join(seg_list) print(", ".join(seg_list))
``` ```
Output: Output:
@ -488,7 +488,7 @@ Use: `jieba.analyse.textrank(raw_text)`.
>>> import jieba.posseg as pseg >>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门") >>> words = pseg.cut("我爱北京天安门")
>>> for w in words: >>> for w in words:
... print w.word, w.flag ... print('%s %s' % (w.word, w.flag))
... ...
我 r 我 r
爱 v 爱 v
@ -517,7 +517,7 @@ Use: `jieba.analyse.textrank(raw_text)`.
```python ```python
result = jieba.tokenize(u'永和服装饰品有限公司') result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result: for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
``` ```
``` ```
@ -533,7 +533,7 @@ word 有限公司 start: 6 end:10
```python ```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search') result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result: for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
``` ```
``` ```
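Most of the README churn above is the Python 2 `print` statement becoming the `print()` function, so the same snippet runs under both interpreters. For reference, a self-contained sketch of the updated quick-start example (assuming `jieba` is importable; the exact segmentation depends on the bundled dictionary):

```python
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals

import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))       # full segmentation mode

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))    # accurate mode

seg_list = jieba.cut("他来到了网易杭研大厦")       # accurate mode is the default
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search engine mode
print(", ".join(seg_list))
```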


@ -1,20 +1,20 @@
from __future__ import with_statement from __future__ import absolute_import, unicode_literals
__version__ = '0.35' __version__ = '0.35'
__license__ = 'MIT' __license__ = 'MIT'
import re import re
import os import os
import sys import sys
import finalseg
import time import time
import tempfile import tempfile
import marshal import marshal
from math import log from math import log
import random
import threading import threading
from functools import wraps from functools import wraps
import logging import logging
from hashlib import md5 from hashlib import md5
from ._compat import *
from . import finalseg
DICTIONARY = "dict.txt" DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock() DICT_LOCK = threading.RLock()
@ -51,13 +51,13 @@ def gen_pfdict(f_name):
ltotal += freq ltotal += freq
for ch in xrange(len(word)): for ch in xrange(len(word)):
pfdict.add(word[:ch+1]) pfdict.add(word[:ch+1])
except ValueError, e: except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line)) logger.debug('%s at line %s %s' % (f_name, lineno, line))
raise ValueError, e raise e
return pfdict, lfreq, ltotal return pfdict, lfreq, ltotal
def initialize(dictionary=None): def initialize(dictionary=None):
global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK global pfdict, FREQ, total, initialized, DICTIONARY, DICT_LOCK
if not dictionary: if not dictionary:
dictionary = DICTIONARY dictionary = DICTIONARY
with DICT_LOCK: with DICT_LOCK:
@ -121,7 +121,7 @@ def require_initialized(fn):
def __cut_all(sentence): def __cut_all(sentence):
dag = get_DAG(sentence) dag = get_DAG(sentence)
old_j = -1 old_j = -1
for k,L in dag.iteritems(): for k,L in iteritems(dag):
if len(L) == 1 and k > old_j: if len(L) == 1 and k > old_j:
yield sentence[k:L[0]+1] yield sentence[k:L[0]+1]
old_j = L[0] old_j = L[0]
@ -158,13 +158,13 @@ def get_DAG(sentence):
return DAG return DAG
def __cut_DAG_NO_HMM(sentence): def __cut_DAG_NO_HMM(sentence):
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U) re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
DAG = get_DAG(sentence) DAG = get_DAG(sentence)
route = {} route = {}
calc(sentence, DAG, route) calc(sentence, DAG, route)
x = 0 x = 0
N = len(sentence) N = len(sentence)
buf = u'' buf = ''
while x < N: while x < N:
y = route[x][1] + 1 y = route[x][1] + 1
l_word = sentence[x:y] l_word = sentence[x:y]
@ -174,19 +174,19 @@ def __cut_DAG_NO_HMM(sentence):
else: else:
if buf: if buf:
yield buf yield buf
buf = u'' buf = ''
yield l_word yield l_word
x = y x = y
if buf: if buf:
yield buf yield buf
buf = u'' buf = ''
def __cut_DAG(sentence): def __cut_DAG(sentence):
DAG = get_DAG(sentence) DAG = get_DAG(sentence)
route = {} route = {}
calc(sentence, DAG, route=route) calc(sentence, DAG, route=route)
x = 0 x = 0
buf = u'' buf = ''
N = len(sentence) N = len(sentence)
while x < N: while x < N:
y = route[x][1]+1 y = route[x][1]+1
@ -197,7 +197,7 @@ def __cut_DAG(sentence):
if buf: if buf:
if len(buf) == 1: if len(buf) == 1:
yield buf yield buf
buf = u'' buf = ''
else: else:
if buf not in FREQ: if buf not in FREQ:
recognized = finalseg.cut(buf) recognized = finalseg.cut(buf)
@ -206,7 +206,7 @@ def __cut_DAG(sentence):
else: else:
for elem in buf: for elem in buf:
yield elem yield elem
buf = u'' buf = ''
yield l_word yield l_word
x = y x = y
@ -225,23 +225,19 @@ def cut(sentence, cut_all=False, HMM=True):
'''The main function that segments an entire sentence that contains '''The main function that segments an entire sentence that contains
Chinese characters into seperated words. Chinese characters into seperated words.
Parameter: Parameter:
- sentence: The str/unicode to be segmented. - sentence: The str(unicode) to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern. - cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model. - HMM: Whether to use the Hidden Markov Model.
''' '''
if not isinstance(sentence, unicode): sentence = strdecode(sentence)
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
# \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han # \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled. # \r\n|\s : whitespace characters. Will not be handled.
if cut_all: if cut_all:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U) re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
else: else:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U) re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
if cut_all: if cut_all:
cut_block = __cut_all cut_block = __cut_all
@ -292,9 +288,9 @@ def load_userdict(f):
... ...
Word type may be ignored Word type may be ignored
''' '''
if isinstance(f, (str, unicode)): if isinstance(f, string_types):
f = open(f, 'rb') f = open(f, 'rb')
content = f.read().decode('utf-8').lstrip(u'\ufeff') content = f.read().decode('utf-8').lstrip('\ufeff')
line_no = 0 line_no = 0
for line in content.split("\n"): for line in content.split("\n"):
line_no += 1 line_no += 1
@ -333,15 +329,13 @@ def enable_parallel(processnum=None):
global pool, cut, cut_for_search global pool, cut, cut_for_search
if os.name == 'nt': if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system") raise Exception("jieba: parallel mode only supports posix system")
if sys.version_info[0]==2 and sys.version_info[1]<6:
raise Exception("jieba: the parallel feature needs Python version>2.5")
from multiprocessing import Pool, cpu_count from multiprocessing import Pool, cpu_count
if processnum is None: if processnum is None:
processnum = cpu_count() processnum = cpu_count()
pool = Pool(processnum) pool = Pool(processnum)
def pcut(sentence,cut_all=False,HMM=True): def pcut(sentence,cut_all=False,HMM=True):
parts = re.compile('([\r\n]+)').split(sentence) parts = strdecode(sentence).split('\n')
if cut_all: if cut_all:
result = pool.map(__lcut_all, parts) result = pool.map(__lcut_all, parts)
elif HMM: elif HMM:
@ -353,7 +347,7 @@ def enable_parallel(processnum=None):
yield w yield w
def pcut_for_search(sentence): def pcut_for_search(sentence):
parts = re.compile('([\r\n]+)').split(sentence) parts = strdecode(sentence).split('\n')
result = pool.map(__lcut_for_search, parts) result = pool.map(__lcut_for_search, parts)
for r in result: for r in result:
for w in r: for w in r:
@ -385,11 +379,11 @@ def get_abs_path_dict():
def tokenize(unicode_sentence, mode="default", HMM=True): def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end) """Tokenize a sentence and yields tuples of (word, start, end)
Parameter: Parameter:
- sentence: the unicode to be segmented. - sentence: the str(unicode) to be segmented.
- mode: "default" or "search", "search" is for finer segmentation. - mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model. - HMM: whether to use the Hidden Markov Model.
""" """
if not isinstance(unicode_sentence, unicode): if not isinstance(unicode_sentence, text_type):
raise Exception("jieba: the input parameter should be unicode.") raise Exception("jieba: the input parameter should be unicode.")
start = 0 start = 0
if mode == 'default': if mode == 'default':
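Two 2/3-compatibility idioms recur in the hunks above: `except ValueError as e` (instead of the Python-2-only `except ValueError, e`) and dict iteration through the `iteritems` helper rather than `dict.iteritems()`. A standalone sketch of both, where `parse_freq` is a toy stand-in rather than jieba's `gen_pfdict`:

```python
# Standalone sketch of the exception and dict-iteration idioms used above.
import sys

PY2 = sys.version_info[0] == 2
# Local equivalent of the iteritems helper imported from jieba._compat:
iteritems = (lambda d: d.iteritems()) if PY2 else (lambda d: iter(d.items()))

def parse_freq(lines):
    freqs = {}
    for lineno, line in enumerate(lines, 1):
        try:
            word, freq = line.split()[:2]   # raises ValueError on malformed lines
            freqs[word] = int(freq)
        except ValueError as e:             # 2.6+/3.x syntax, unlike 'except X, e'
            print('bad entry at line %d: %r' % (lineno, line))
            raise e                         # 'raise e' is valid on both 2 and 3
    return freqs

for word, freq in iteritems(parse_freq(["AT&T 3", "jieba 12"])):
    print("%s -> %d" % (word, freq))
```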


@ -25,7 +25,7 @@ args = parser.parse_args()
if args.quiet: if args.quiet:
jieba.setLogLevel(60) jieba.setLogLevel(60)
delim = unicode(args.delimiter) delim = text_type(args.delimiter)
cutall = args.cutall cutall = args.cutall
hmm = args.hmm hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin fp = open(args.filename, 'r') if args.filename else sys.stdin
@ -40,7 +40,10 @@ if args.user_dict:
ln = fp.readline() ln = fp.readline()
while ln: while ln:
l = ln.rstrip('\r\n') l = ln.rstrip('\r\n')
print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8')) result = delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm))
if PY2:
result = result.encode(default_encoding)
print(result)
ln = fp.readline() ln = fp.readline()
fp.close() fp.close()
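The command-line entry point now builds the joined segmentation as text and encodes it only on Python 2, where `stdout` expects bytes. A minimal sketch of that pattern outside jieba (`emit` is a hypothetical helper name):

```python
# Sketch of the encode-only-on-Python-2 output pattern from the hunk above.
from __future__ import unicode_literals
import sys

PY2 = sys.version_info[0] == 2
default_encoding = sys.getfilesystemencoding()

def emit(text):
    # Python 2's stdout wants bytes; Python 3's wants text.
    if PY2:
        text = text.encode(default_encoding)
    print(text)

emit(" / ".join(["\u6211", "\u7231", "\u5317\u4eac"]))   # 我 / 爱 / 北京
```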

jieba/_compat.py (new file, 31 lines)

@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
import sys
PY2 = sys.version_info[0] == 2
default_encoding = sys.getfilesystemencoding()
if PY2:
text_type = unicode
string_types = (str, unicode)
iterkeys = lambda d: d.iterkeys()
itervalues = lambda d: d.itervalues()
iteritems = lambda d: d.iteritems()
else:
text_type = str
string_types = (str,)
xrange = range
iterkeys = lambda d: iter(d.keys())
itervalues = lambda d: iter(d.values())
iteritems = lambda d: iter(d.items())
def strdecode(sentence):
if not isinstance(sentence, text_type):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
return sentence
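The new `jieba/_compat.py` gathers every version shim in one place: `text_type`, `string_types`, `xrange`, the dict-iteration helpers, and `strdecode`, which accepts text or bytes and returns text, falling back from UTF-8 to GBK. A small usage sketch, assuming the module is on the import path:

```python
# -*- coding: utf-8 -*-
# Usage sketch for the helpers defined in jieba/_compat.py above.
from __future__ import unicode_literals
from jieba._compat import strdecode, text_type, iteritems

s1 = strdecode("我来到北京".encode("utf-8"))   # bytes in -> text out (UTF-8, then GBK fallback)
s2 = strdecode("我来到北京")                   # text in  -> passed through unchanged
assert isinstance(s1, text_type) and s1 == s2

for key, value in iteritems({"freq": 3}):      # one spelling for dict iteration on 2.x and 3.x
    print("%s=%d" % (key, value))
```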


@ -1,13 +1,14 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import absolute_import
import jieba import jieba
import jieba.posseg import jieba.posseg
import os import os
from operator import itemgetter from operator import itemgetter
try: try:
from analyzer import ChineseAnalyzer from .analyzer import ChineseAnalyzer
except ImportError: except ImportError:
pass pass
from textrank import textrank from .textrank import textrank
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
abs_path = os.path.join(_curpath, "idf.txt") abs_path = os.path.join(_curpath, "idf.txt")
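The imports here move from implicit relative imports (`from analyzer import ...`), which Python 3 rejects, to explicit relative imports, with `absolute_import` making Python 2 resolve them the same way. A sketch of the pattern in a hypothetical package named `mypkg`:

```python
# mypkg/__init__.py -- hypothetical package illustrating the import change above.
from __future__ import absolute_import   # Python 2: turn off implicit relative imports

try:
    # Explicit relative import: resolved the same way on Python 2 and 3.
    from .analyzer import ChineseAnalyzer
except ImportError:
    # The optional whoosh dependency may be absent; keep the package importable.
    pass
```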


@ -1,4 +1,5 @@
##encoding=utf-8 #encoding=utf-8
from __future__ import unicode_literals
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem from whoosh.lang.porter import stem
@ -10,9 +11,9 @@ STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may', 'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this', 'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
'to', 'us', 'we', 'when', 'will', 'with', 'yet', 'to', 'us', 'we', 'when', 'will', 'with', 'yet',
'you', 'your', u'', u'', u'')) 'you', 'your', '', '', ''))
accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+") accepted_chars = re.compile(r"[\u4E00-\u9FA5]+")
class ChineseTokenizer(Tokenizer): class ChineseTokenizer(Tokenizer):
def __call__(self, text, **kargs): def __call__(self, text, **kargs):


@ -1,6 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys import sys
import collections import collections
from operator import itemgetter from operator import itemgetter
@ -35,7 +36,7 @@ class UndirectWeightedGraph:
(min_rank, max_rank) = (sys.float_info[0], sys.float_info[3]) (min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
for w in ws.itervalues(): for w in itervalues(ws):
if w < min_rank: if w < min_rank:
min_rank = w min_rank = w
elif w > max_rank: elif w > max_rank:
@ -88,4 +89,4 @@ def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v'
if __name__ == '__main__': if __name__ == '__main__':
s = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。" s = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"
for x, w in textrank(s, withWeight=True): for x, w in textrank(s, withWeight=True):
print x, w print('%s %s' % (x, w))


@ -1,8 +1,9 @@
from __future__ import with_statement from __future__ import absolute_import, unicode_literals
import re import re
import os import os
import marshal import marshal
import sys import sys
from .._compat import *
MIN_FLOAT = -3.14e100 MIN_FLOAT = -3.14e100
@ -41,9 +42,9 @@ def load_model():
if sys.platform.startswith("java"): if sys.platform.startswith("java"):
start_P, trans_P, emit_P = load_model() start_P, trans_P, emit_P = load_model()
else: else:
from prob_start import P as start_P from .prob_start import P as start_P
from prob_trans import P as trans_P from .prob_trans import P as trans_P
from prob_emit import P as emit_P from .prob_emit import P as emit_P
def viterbi(obs, states, start_p, trans_p, emit_p): def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular V = [{}] #tabular
@ -85,12 +86,8 @@ def __cut(sentence):
yield sentence[nexti:] yield sentence[nexti:]
def cut(sentence): def cut(sentence):
if not isinstance(sentence, unicode): sentence = strdecode(sentence)
try: re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"(\d+\.\d+|[a-zA-Z0-9]+)")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
for blk in blocks: for blk in blocks:
if re_han.match(blk): if re_han.match(blk):
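`finalseg.cut` now normalizes its input with `strdecode` and builds its regexes from plain string literals; under `unicode_literals` Python 2 treats them as unicode patterns, so the `\uXXXX` ranges behave identically on both interpreters. A standalone sketch of the block-splitting step, with the patterns copied from the hunk above:

```python
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals  # unicode patterns on Python 2
import re

# Patterns copied from the finalseg hunk above (plain literals, no ur'' prefix needed).
re_han = re.compile("([\u4E00-\u9FA5]+)")           # runs of CJK characters
re_skip = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")     # decimals and latin/digit runs

sentence = "小明住在3号楼, room 42"
for blk in re_han.split(sentence):
    if not blk:
        continue
    if re_han.match(blk):
        print("CJK block:  ", blk)
    else:
        # Non-CJK text is split again so numbers and words survive as tokens.
        print("other block:", [x for x in re_skip.split(blk) if x.strip()])
```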

File diff suppressed because it is too large.


@ -1,13 +1,12 @@
from __future__ import with_statement from __future__ import absolute_import, unicode_literals
import re import re
import os import os
import viterbi
import jieba import jieba
import sys import sys
import marshal import marshal
from functools import wraps from functools import wraps
from .._compat import *
default_encoding = sys.getfilesystemencoding() from .viterbi import viterbi
PROB_START_P = "prob_start.p" PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p" PROB_TRANS_P = "prob_trans.p"
@ -18,13 +17,14 @@ def load_model(f_name, isJython=True):
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
result = {} result = {}
with open(f_name, "r") as f: with open(f_name, "rb") as f:
for line in f: for line in f:
line = line.strip() line = line.strip()
if not line: if not line:
continue continue
word, _, tag = line.split(' ') line = line.decode("utf-8")
result[word.decode('utf-8')] = tag word, _, tag = line.split(" ")
result[word] = tag
if not isJython: if not isJython:
return result return result
@ -55,10 +55,10 @@ def load_model(f_name, isJython=True):
if sys.platform.startswith("java"): if sys.platform.startswith("java"):
char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict()) char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
else: else:
from char_state_tab import P as char_state_tab_P from .char_state_tab import P as char_state_tab_P
from prob_start import P as start_P from .prob_start import P as start_P
from prob_trans import P as trans_P from .prob_trans import P as trans_P
from prob_emit import P as emit_P from .prob_emit import P as emit_P
word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False) word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
@ -79,20 +79,23 @@ class pair(object):
self.flag = flag self.flag = flag
def __unicode__(self): def __unicode__(self):
return u'%s/%s' % (self.word, self.flag) return '%s/%s' % (self.word, self.flag)
def __repr__(self): def __repr__(self):
return self.__str__() return self.__str__()
def __str__(self): def __str__(self):
if PY2:
return self.__unicode__().encode(default_encoding) return self.__unicode__().encode(default_encoding)
else:
return self.__unicode__()
def encode(self,arg): def encode(self,arg):
return self.__unicode__().encode(arg) return self.__unicode__().encode(arg)
def __cut(sentence): def __cut(sentence):
prob, pos_list = viterbi.viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P) prob, pos_list = viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
begin, next = 0, 0 begin, nexti = 0, 0
for i,char in enumerate(sentence): for i,char in enumerate(sentence):
pos = pos_list[i][0] pos = pos_list[i][0]
@ -100,16 +103,16 @@ def __cut(sentence):
begin = i begin = i
elif pos == 'E': elif pos == 'E':
yield pair(sentence[begin:i+1], pos_list[i][1]) yield pair(sentence[begin:i+1], pos_list[i][1])
next = i+1 nexti = i+1
elif pos == 'S': elif pos == 'S':
yield pair(char, pos_list[i][1]) yield pair(char, pos_list[i][1])
next = i+1 nexti = i+1
if next < len(sentence): if nexti < len(sentence):
yield pair(sentence[next:], pos_list[next][1]) yield pair(sentence[nexti:], pos_list[nexti][1])
def __cut_detail(sentence): def __cut_detail(sentence):
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)") re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+") re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
for blk in blocks: for blk in blocks:
if re_han.match(blk): if re_han.match(blk):
@ -132,8 +135,8 @@ def __cut_DAG_NO_HMM(sentence):
jieba.calc(sentence, DAG, route) jieba.calc(sentence, DAG, route)
x = 0 x = 0
N = len(sentence) N = len(sentence)
buf = u'' buf = ''
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U) re_eng = re.compile('[a-zA-Z0-9]',re.U)
while x < N: while x < N:
y = route[x][1]+1 y = route[x][1]+1
l_word = sentence[x:y] l_word = sentence[x:y]
@ -143,12 +146,12 @@ def __cut_DAG_NO_HMM(sentence):
else: else:
if buf: if buf:
yield pair(buf,'eng') yield pair(buf,'eng')
buf = u'' buf = ''
yield pair(l_word, word_tag_tab.get(l_word, 'x')) yield pair(l_word, word_tag_tab.get(l_word, 'x'))
x = y x = y
if buf: if buf:
yield pair(buf,'eng') yield pair(buf,'eng')
buf = u'' buf = ''
def __cut_DAG(sentence): def __cut_DAG(sentence):
DAG = jieba.get_DAG(sentence) DAG = jieba.get_DAG(sentence)
@ -157,7 +160,7 @@ def __cut_DAG(sentence):
jieba.calc(sentence, DAG, route) jieba.calc(sentence, DAG, route)
x = 0 x = 0
buf = u'' buf = ''
N = len(sentence) N = len(sentence)
while x < N: while x < N:
y = route[x][1]+1 y = route[x][1]+1
@ -175,7 +178,7 @@ def __cut_DAG(sentence):
else: else:
for elem in buf: for elem in buf:
yield pair(elem, word_tag_tab.get(elem, 'x')) yield pair(elem, word_tag_tab.get(elem, 'x'))
buf = u'' buf = ''
yield pair(l_word, word_tag_tab.get(l_word, 'x')) yield pair(l_word, word_tag_tab.get(l_word, 'x'))
x = y x = y
@ -191,13 +194,9 @@ def __cut_DAG(sentence):
yield pair(elem, word_tag_tab.get(elem, 'x')) yield pair(elem, word_tag_tab.get(elem, 'x'))
def __cut_internal(sentence, HMM=True): def __cut_internal(sentence, HMM=True):
if not isinstance(sentence, unicode): sentence = strdecode(sentence)
try: re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
sentence = sentence.decode('utf-8') re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)")
re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
if HMM: if HMM:
__cut_blk = __cut_DAG __cut_blk = __cut_DAG
@ -234,7 +233,7 @@ def cut(sentence, HMM=True):
for w in __cut_internal(sentence, HMM=HMM): for w in __cut_internal(sentence, HMM=HMM):
yield w yield w
else: else:
parts = re.compile('([\r\n]+)').split(sentence) parts = strdecode(sentence).split('\n')
if HMM: if HMM:
result = jieba.pool.map(__lcut_internal, parts) result = jieba.pool.map(__lcut_internal, parts)
else: else:
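posseg's `pair` objects now print sensibly on both interpreters: `__str__` returns encoded bytes on Python 2 and plain text on Python 3, both built from `__unicode__`. A condensed sketch of that pattern (class name `Pair` here, to keep it distinct from jieba's own `pair`):

```python
# -*- coding: utf-8 -*-
# Condensed sketch of the __unicode__/__str__ split used by posseg's pair above.
from __future__ import unicode_literals
import sys

PY2 = sys.version_info[0] == 2
default_encoding = sys.getfilesystemencoding()

class Pair(object):
    def __init__(self, word, flag):
        self.word, self.flag = word, flag

    def __unicode__(self):
        return '%s/%s' % (self.word, self.flag)

    def __str__(self):
        # str() must return bytes on Python 2 but text on Python 3.
        if PY2:
            return self.__unicode__().encode(default_encoding)
        return self.__unicode__()

    __repr__ = __str__

print(Pair("北京", "ns"))   # -> 北京/ns
```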

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@ -1,7 +1,11 @@
import sys
import operator import operator
MIN_FLOAT = -3.14e100 MIN_FLOAT = -3.14e100
MIN_INF = float("-inf") MIN_INF = float("-inf")
if sys.version_info[0] > 2:
xrange = range
def get_top_states(t_state_v, K=4): def get_top_states(t_state_v, K=4):
return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K] return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K]
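The Viterbi helper keeps calling `xrange`, so on Python 3 it is simply aliased to `range`. The whole shim, shown standalone:

```python
import sys

if sys.version_info[0] > 2:
    xrange = range                      # Python 3: range is already lazy

print(sum(i for i in xrange(5)))        # 10 on both 2.x and 3.x
```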


@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from distutils.core import setup from distutils.core import setup
LONGDOC = u""" LONGDOC = """
jieba jieba
===== =====
@ -75,6 +75,12 @@ setup(name='jieba',
'Natural Language :: Chinese (Traditional)', 'Natural Language :: Chinese (Traditional)',
'Programming Language :: Python', 'Programming Language :: Python',
'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.6',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.2',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Topic :: Text Processing', 'Topic :: Text Processing',
'Topic :: Text Processing :: Indexing', 'Topic :: Text Processing :: Indexing',
'Topic :: Text Processing :: Linguistic', 'Topic :: Text Processing :: Linguistic',
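With one package serving both interpreters, `setup.py` now lists every supported version in its trove classifiers. A trimmed, hypothetical `setup()` call showing only the metadata touched by this hunk:

```python
# Hypothetical, trimmed setup() call; the full file carries more metadata.
from distutils.core import setup

setup(
    name='jieba',
    version='0.35',
    classifiers=[
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.2',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
    ],
)
```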


@ -1,522 +0,0 @@
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py
--- ./jieba/analyse/analyzer.py 2014-11-29 15:46:45.987925569 +0800
+++ ../jieba/jieba/analyse/analyzer.py 2014-11-29 15:34:42.859932465 +0800
@@ -1,4 +1,4 @@
-##encoding=utf-8
+#encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py
--- ./jieba/analyse/__init__.py 2014-11-29 15:46:46.139925567 +0800
+++ ../jieba/jieba/analyse/__init__.py 2014-11-29 15:36:13.147931604 +0800
@@ -26,7 +26,7 @@
def set_new_path(self, new_idf_path):
if self.path != new_idf_path:
- content = open(new_idf_path, 'rb').read().decode('utf-8')
+ content = open(new_idf_path, 'r', encoding='utf-8').read()
idf_freq = {}
lines = content.rstrip('\n').split('\n')
for line in lines:
@@ -93,7 +93,7 @@
freq[k] *= idf_freq.get(k, median_idf) / total
if withWeight:
- tags = sorted(list(freq.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(freq, key=freq.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py
--- ./jieba/analyse/textrank.py 2014-11-29 15:46:46.043925568 +0800
+++ ../jieba/jieba/analyse/textrank.py 2014-11-29 15:36:39.291931354 +0800
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
@@ -22,12 +22,12 @@
outSum = collections.defaultdict(float)
wsdef = 1.0 / len(self.graph)
- for n, out in list(self.graph.items()):
+ for n, out in self.graph.items():
ws[n] = wsdef
outSum[n] = sum((e[2] for e in out), 0.0)
for x in range(10): # 10 iters
- for n, inedges in list(self.graph.items()):
+ for n, inedges in self.graph.items():
s = 0
for e in inedges:
s += e[2] / outSum[e[1]] * ws[e[1]]
@@ -41,7 +41,7 @@
elif w > max_rank:
max_rank = w
- for n, w in list(ws.items()):
+ for n, w in ws.items():
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
@@ -72,12 +72,12 @@
continue
cm[(words[i].word, words[j].word)] += 1
- for terms, w in list(cm.items()):
+ for terms, w in cm.items():
g.addEdge(terms[0], terms[1], w)
nodes_rank = g.rank()
if withWeight:
- tags = sorted(list(nodes_rank.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py
--- ./jieba/finalseg/__init__.py 2014-11-29 15:46:46.367925565 +0800
+++ ../jieba/jieba/finalseg/__init__.py 2014-11-29 15:34:42.859932465 +0800
@@ -1,4 +1,3 @@
-
import re
import os
import marshal
@@ -89,7 +88,7 @@
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py
--- ./jieba/__init__.py 2014-11-29 15:46:45.955925569 +0800
+++ ../jieba/jieba/__init__.py 2014-11-29 15:39:03.335929981 +0800
@@ -1,4 +1,3 @@
-
__version__ = '0.35'
__license__ = 'MIT'
@@ -51,7 +50,7 @@
pfdict.add(word[:ch+1])
except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line))
- raise ValueError(e)
+ raise e
return pfdict, lfreq, ltotal
def initialize(dictionary=None):
@@ -229,11 +228,11 @@
'''The main function that segments an entire sentence that contains
Chinese characters into seperated words.
Parameter:
- - sentence: The str/unicode to be segmented.
+ - sentence: The str to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
'''
- if not isinstance(sentence, str):
+ if isinstance(sentence, bytes):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
@@ -243,9 +242,9 @@
# \r\n|\s : whitespace characters. Will not be handled.
if cut_all:
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
else:
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U)
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
blocks = re_han.split(sentence)
if cut_all:
cut_block = __cut_all
@@ -339,8 +338,6 @@
global pool, cut, cut_for_search
if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system")
- if sys.version_info[0]==2 and sys.version_info[1]<6:
- raise Exception("jieba: the parallel feature needs Python version>2.5")
from multiprocessing import Pool, cpu_count
if processnum is None:
processnum = cpu_count()
@@ -393,12 +390,12 @@
def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end)
Parameter:
- - sentence: the unicode to be segmented.
+ - sentence: the str to be segmented.
- mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model.
"""
if not isinstance(unicode_sentence, str):
- raise Exception("jieba: the input parameter should be unicode.")
+ raise Exception("jieba: the input parameter should be str.")
start = 0
if mode == 'default':
for w in cut(unicode_sentence, HMM=HMM):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py
--- ./jieba/__main__.py 2014-11-29 15:46:45.747925571 +0800
+++ ../jieba/jieba/__main__.py 2014-11-29 15:34:42.859932465 +0800
@@ -40,7 +40,7 @@
ln = fp.readline()
while ln:
l = ln.rstrip('\r\n')
- print((delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8')))
+ print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))
ln = fp.readline()
fp.close()
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py
--- ./jieba/posseg/__init__.py 2014-11-29 15:46:46.271925566 +0800
+++ ../jieba/jieba/posseg/__init__.py 2014-11-29 15:37:52.299930658 +0800
@@ -1,4 +1,3 @@
-
import re
import os
from . import viterbi
@@ -18,14 +17,14 @@
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
result = {}
- with open(f_name, "r") as f:
+ with open(f_name, "rb") as f:
for line in f:
line = line.strip()
if not line:
continue
- word, _, tag = line.split(' ')
- result[word.decode('utf-8')] = tag
-
+ line = line.decode("utf-8")
+ word, _, tag = line.split(" ")
+ result[word] = tag
if not isJython:
return result
@@ -105,8 +104,8 @@
yield pair(sentence[next:], pos_list[next][1])
def __cut_detail(sentence):
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
+ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
@@ -130,7 +129,7 @@
x = 0
N = len(sentence)
buf = ''
- re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
+ re_eng = re.compile('[a-zA-Z0-9]',re.U)
while x < N:
y = route[x][1]+1
l_word = sentence[x:y]
@@ -195,8 +194,8 @@
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)")
- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
+ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
if HMM:
__cut_blk = __cut_DAG
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py
--- ./jieba/posseg/viterbi.py 2014-11-29 15:46:46.303925566 +0800
+++ ../jieba/jieba/posseg/viterbi.py 2014-11-29 15:38:28.527930313 +0800
@@ -8,7 +8,7 @@
def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
mem_path = [{}]
- all_states = list(trans_p.keys())
+ all_states = trans_p.keys()
for y in states.get(obs[0], all_states): #init
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
mem_path[0][y] = ''
@@ -16,9 +16,9 @@
V.append({})
mem_path.append({})
#prev_states = get_top_states(V[t-1])
- prev_states = [x for x in list(mem_path[t-1].keys()) if len(trans_p[x]) > 0]
+ prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]
- prev_states_expect_next = set((y for x in prev_states for y in list(trans_p[x].keys())))
+ prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next
if not obs_states:
@@ -29,7 +29,7 @@
V[t][y] = prob
mem_path[t][y] = state
- last = [(V[-1][y], y) for y in list(mem_path[-1].keys())]
+ last = [(V[-1][y], y) for y in mem_path[-1].keys()]
#if len(last)==0:
#print obs
prob, state = max(last)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md
--- ./README.md 2014-11-29 15:46:08.487925926 +0800
+++ ../jieba/README.md 2014-11-29 15:34:42.859932465 +0800
@@ -4,6 +4,9 @@
"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
- _Scroll down for English documentation._
+注意!
+========
+这个branch `jieba3k` 是专门用于Python3.x的版本
特点
========
@@ -68,16 +71,16 @@
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 精确模式
+print("Default Mode:", "/ ".join(seg_list)) # 精确模式
seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```
输出:
@@ -174,7 +177,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -203,7 +206,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -219,7 +222,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -408,16 +411,16 @@
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 默认模式
+print("Default Mode:", "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut("他来到了网易杭研大厦")
-print ", ".join(seg_list)
+print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```
Output:
@@ -483,7 +486,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -512,7 +515,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -528,7 +531,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py
--- ./setup.py 2014-11-29 15:46:46.379925565 +0800
+++ ../jieba/setup.py 2014-11-29 15:42:20.263928103 +0800
@@ -11,7 +11,7 @@
完整文档见 ``README.md``
-GitHub: https://github.com/fxsjy/jieba
+GitHub: https://github.com/fxsjy/jieba/tree/jieba3k
特点
====
@@ -34,17 +34,11 @@
Python 2.x
----------
-- 全自动安装: ``easy_install jieba`` 或者 ``pip install jieba``
-- 半自动安装:先下载 https://pypi.python.org/pypi/jieba/ ,解压后运行
- python setup.py install
-- 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录
-- 通过 ``import jieba`` 来引用
+见 https://pypi.python.org/pypi/jieba/
Python 3.x
----------
-见 https://pypi.python.org/pypi/jieba3k/
-
- 目前 master 分支是只支持 Python 2.x 的
- Python 3.x 版本的分支也已经基本可用:
https://github.com/fxsjy/jieba/tree/jieba3k
@@ -59,13 +53,13 @@
"""
-setup(name='jieba',
+setup(name='jieba3k',
version='0.35.1',
description='Chinese Words Segementation Utilities',
long_description=LONGDOC,
author='Sun, Junyi',
author_email='ccnusjy@gmail.com',
- url='https://github.com/fxsjy/jieba',
+ url='https://github.com/fxsjy/jieba/tree/jieba3k',
license="MIT",
classifiers=[
'Intended Audience :: Developers',
@@ -73,9 +67,8 @@
'Operating System :: OS Independent',
'Natural Language :: Chinese (Simplified)',
'Natural Language :: Chinese (Traditional)',
'Programming Language :: Python',
- 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 3',
'Topic :: Text Processing',
'Topic :: Text Processing :: Indexing',
'Topic :: Text Processing :: Linguistic',
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py
--- ./test/extract_topic.py 2014-11-29 15:46:47.003925559 +0800
+++ ../jieba/test/extract_topic.py 2014-11-29 15:34:42.919932464 +0800
@@ -51,13 +51,13 @@
print("training...")
nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
-print(("done in %0.3fs." % (time.time() - t0)))
+print("done in %0.3fs." % (time.time() - t0))
# Inverse the vectorizer vocabulary to be able
feature_names = count_vect.get_feature_names()
for topic_idx, topic in enumerate(nmf.components_):
- print(("Topic #%d:" % topic_idx))
- print((" ".join([feature_names[i]
- for i in topic.argsort()[:-n_top_words - 1:-1]])))
+ print("Topic #%d:" % topic_idx)
+ print(" ".join([feature_names[i]
+ for i in topic.argsort()[:-n_top_words - 1:-1]]))
print("")
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py
--- ./test/jiebacmd.py 2014-11-29 15:46:46.443925564 +0800
+++ ../jieba/test/jiebacmd.py 2014-11-29 15:34:42.919932464 +0800
@@ -23,6 +23,6 @@
break
line = line.strip()
for word in jieba.cut(line):
- print(word.encode(default_encoding))
+ print(word)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py
--- ./test/jieba_test.py 2014-11-29 15:46:47.271925556 +0800
+++ ../jieba/test/jieba_test.py 2014-11-29 15:34:42.919932464 +0800
@@ -152,7 +152,7 @@
#-*-coding: utf-8 -*-
import sys
+import imp
sys.path.append("../")
import unittest
import types
@@ -97,7 +98,7 @@
class JiebaTestCase(unittest.TestCase):
def setUp(self):
- reload(jieba)
+ imp.reload(jieba)
def tearDown(self):
pass
@@ -151,7 +152,7 @@
def testTokenize(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'))
+ result = jieba.tokenize(content)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
@@ -181,7 +181,7 @@
def testTokenize_NOHMM(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'),HMM=False)
+ result = jieba.tokenize(content,HMM=False)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py
--- ./test/test_tokenize_no_hmm.py 2014-11-29 15:46:47.355925556 +0800
+++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-29 15:34:42.919932464 +0800
@@ -7,7 +7,6 @@
def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py
--- ./test/test_tokenize.py 2014-11-29 15:46:47.403925555 +0800
+++ ../jieba/test/test_tokenize.py 2014-11-29 15:34:42.919932464 +0800
@@ -7,7 +7,6 @@
def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))


@ -1,34 +0,0 @@
#!/bin/bash
# Set 2to3 path.
PYTHON2TO3=2to3
# Copy the python2 version.
echo Jieba 2to3 manual conversion tool
echo
if ! git rev-parse; then
exit 1
fi
echo Copying working directory to ../jieba2
if [ -d ../jieba2 ]; then
echo Found existing ../jieba2
read -p "Replace it with new one? (y/n) " -r
if ! [[ $REPLY =~ ^[Yy]$ ]]; then
echo Cancelled.
exit
else
rm -rf ../jieba2
fi
fi
if ! git checkout jieba3k; then
exit 1
fi
cp -r . ../jieba2
cd ../jieba2
if ! git checkout master; then
exit 1
fi
# Here starts auto conversion.
echo Converting jieba2 to Python3 ...
find . -type f -name '*.py' \! -path '*/build/*' \! -name 'prob_*.py' \! -name 'char_state_tab.py' -exec $PYTHON2TO3 -w -n {} +
find . -type f \! -path '*/build/*' -a \( -name 'prob_*.py' -o -name 'char_state_tab.py' \) -exec sed -i "s/u'\\\u/'\\\u/g" {} \;
patch -p0 -s <../jieba/test/2to3.diff
echo Done. Compare jieba and jieba2 to manually port.


@ -1,17 +1,18 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import unicode_literals
import sys import sys
sys.path.append("../") sys.path.append("../")
import jieba import jieba
seg_list = jieba.cut(u"我来到北京清华大学", cut_all=True) seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print u"Full Mode:", u"/ ".join(seg_list) # 全模式 print("Full Mode: " + "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut(u"我来到北京清华大学", cut_all=False) seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print u"Default Mode:", u"/ ".join(seg_list) # 默认模式 print("Default Mode: " + "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut(u"他来到了网易杭研大厦") seg_list = jieba.cut("他来到了网易杭研大厦")
print u", ".join(seg_list) print(", ".join(seg_list))
seg_list = jieba.cut_for_search(u"小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print u", ".join(seg_list) print(", ".join(seg_list))


@ -13,7 +13,7 @@ opt, args = parser.parse_args()
if len(args) < 1: if len(args) < 1:
print USAGE print(USAGE)
sys.exit(1) sys.exit(1)
file_name = args[0] file_name = args[0]
@ -27,4 +27,4 @@ content = open(file_name, 'rb').read()
tags = jieba.analyse.extract_tags(content, topK=topK) tags = jieba.analyse.extract_tags(content, topK=topK)
print ",".join(tags) print(",".join(tags))


@ -13,7 +13,7 @@ opt, args = parser.parse_args()
if len(args) < 1: if len(args) < 1:
print USAGE print(USAGE)
sys.exit(1) sys.exit(1)
file_name = args[0] file_name = args[0]
@ -29,4 +29,4 @@ jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
tags = jieba.analyse.extract_tags(content, topK=topK) tags = jieba.analyse.extract_tags(content, topK=topK)
print ",".join(tags) print(",".join(tags))


@ -13,7 +13,7 @@ opt, args = parser.parse_args()
if len(args) < 1: if len(args) < 1:
print USAGE print(USAGE)
sys.exit(1) sys.exit(1)
file_name = args[0] file_name = args[0]
@ -30,4 +30,4 @@ jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
tags = jieba.analyse.extract_tags(content, topK=topK) tags = jieba.analyse.extract_tags(content, topK=topK)
print ",".join(tags) print(",".join(tags))


@ -14,7 +14,7 @@ opt, args = parser.parse_args()
if len(args) < 1: if len(args) < 1:
print USAGE print(USAGE)
sys.exit(1) sys.exit(1)
file_name = args[0] file_name = args[0]
@ -38,6 +38,6 @@ tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)
if withWeight is True: if withWeight is True:
for tag in tags: for tag in tags:
print "tag: %s\t\t weight: %f" % (tag[0],tag[1]) print("tag: %s\t\t weight: %f" % (tag[0],tag[1]))
else: else:
print ",".join(tags) print(",".join(tags))


@ -12,7 +12,7 @@ import os
import random import random
if len(sys.argv)<2: if len(sys.argv)<2:
print "usage: extract_topic.py directory [n_topic] [n_top_words]" print("usage: extract_topic.py directory [n_topic] [n_top_words]")
sys.exit(0) sys.exit(0)
n_topic = 10 n_topic = 10
@ -28,27 +28,27 @@ count_vect = CountVectorizer()
docs = [] docs = []
pattern = os.path.join(sys.argv[1],"*.txt") pattern = os.path.join(sys.argv[1],"*.txt")
print "read "+pattern print("read "+pattern)
for f_name in glob.glob(pattern): for f_name in glob.glob(pattern):
with open(f_name) as f: with open(f_name) as f:
print "read file:", f_name print("read file:", f_name)
for line in f: #one line as a document for line in f: #one line as a document
words = " ".join(jieba.cut(line)) words = " ".join(jieba.cut(line))
docs.append(words) docs.append(words)
random.shuffle(docs) random.shuffle(docs)
print "read done." print("read done.")
print "transform" print("transform")
counts = count_vect.fit_transform(docs) counts = count_vect.fit_transform(docs)
tfidf = TfidfTransformer().fit_transform(counts) tfidf = TfidfTransformer().fit_transform(counts)
print tfidf.shape print(tfidf.shape)
t0 = time.time() t0 = time.time()
print "training..." print("training...")
nmf = decomposition.NMF(n_components=n_topic).fit(tfidf) nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
print("done in %0.3fs." % (time.time() - t0)) print("done in %0.3fs." % (time.time() - t0))


@ -1,9 +1,13 @@
#-*-coding: utf-8 -*- #-*-coding: utf-8 -*-
from __future__ import unicode_literals, print_function
import sys import sys
sys.path.append("../") sys.path.append("../")
import unittest import unittest
import types import types
import jieba import jieba
if sys.version_info[0] > 2:
from imp import reload
jieba.initialize() jieba.initialize()
@ -108,8 +112,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error" assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test DefaultCut error on content: %s" % content assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
print >> sys.stderr, " , ".join(result) print(" , ".join(result), file=sys.stderr)
print >> sys.stderr, "testDefaultCut" print("testDefaultCut", file=sys.stderr)
def testCutAll(self): def testCutAll(self):
for content in test_contents: for content in test_contents:
@ -117,8 +121,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test CutAll Generator error" assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test CutAll error on content: %s" % content assert isinstance(result, list), "Test CutAll error on content: %s" % content
print >> sys.stderr, " , ".join(result) print(" , ".join(result), file=sys.stderr)
print >> sys.stderr, "testCutAll" print("testCutAll", file=sys.stderr)
def testSetDictionary(self): def testSetDictionary(self):
jieba.set_dictionary("foobar.txt") jieba.set_dictionary("foobar.txt")
@ -127,8 +131,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error" assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test SetDictionary error on content: %s" % content assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
print >> sys.stderr, " , ".join(result) print(" , ".join(result), file=sys.stderr)
print >> sys.stderr, "testSetDictionary" print("testSetDictionary", file=sys.stderr)
def testCutForSearch(self): def testCutForSearch(self):
for content in test_contents: for content in test_contents:
@ -136,8 +140,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error" assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test CutForSearch error on content: %s" % content assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
print >> sys.stderr, " , ".join(result) print(" , ".join(result), file=sys.stderr)
print >> sys.stderr, "testCutForSearch" print("testCutForSearch", file=sys.stderr)
def testPosseg(self): def testPosseg(self):
import jieba.posseg as pseg import jieba.posseg as pseg
@ -146,18 +150,18 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test Posseg Generator error" assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test Posseg error on content: %s" % content assert isinstance(result, list), "Test Posseg error on content: %s" % content
print >> sys.stderr, " , ".join([w.word + " / " + w.flag for w in result]) print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
print >> sys.stderr, "testPosseg" print("testPosseg", file=sys.stderr)
def testTokenize(self): def testTokenize(self):
for content in test_contents: for content in test_contents:
result = jieba.tokenize(content.decode('utf-8')) result = jieba.tokenize(content)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error" assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content assert isinstance(result, list), "Test Tokenize error on content: %s" % content
for tk in result: for tk in result:
print >>sys.stderr, "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
print >> sys.stderr, "testTokenize" print("testTokenize", file=sys.stderr)
def testDefaultCut_NOHMM(self): def testDefaultCut_NOHMM(self):
for content in test_contents: for content in test_contents:
@ -165,8 +169,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error" assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test DefaultCut error on content: %s" % content assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
print >> sys.stderr, " , ".join(result) print(" , ".join(result), file=sys.stderr)
print >> sys.stderr, "testDefaultCut_NOHMM" print("testDefaultCut_NOHMM", file=sys.stderr)
def testPosseg_NOHMM(self): def testPosseg_NOHMM(self):
import jieba.posseg as pseg import jieba.posseg as pseg
@ -175,18 +179,18 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test Posseg Generator error" assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test Posseg error on content: %s" % content assert isinstance(result, list), "Test Posseg error on content: %s" % content
print >> sys.stderr, " , ".join([w.word + " / " + w.flag for w in result]) print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
print >> sys.stderr, "testPosseg_NOHMM" print("testPosseg_NOHMM", file=sys.stderr)
def testTokenize_NOHMM(self): def testTokenize_NOHMM(self):
for content in test_contents: for content in test_contents:
result = jieba.tokenize(content.decode('utf-8'),HMM=False) result = jieba.tokenize(content,HMM=False)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error" assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content assert isinstance(result, list), "Test Tokenize error on content: %s" % content
for tk in result: for tk in result:
print >>sys.stderr, "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
print >> sys.stderr, "testTokenize_NOHMM" print("testTokenize_NOHMM", file=sys.stderr)
def testCutForSearch_NOHMM(self): def testCutForSearch_NOHMM(self):
for content in test_contents: for content in test_contents:
@ -194,8 +198,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error" assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test CutForSearch error on content: %s" % content assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
print >> sys.stderr, " , ".join(result) print(" , ".join(result), file=sys.stderr)
print >> sys.stderr, "testCutForSearch_NOHMM" print("testCutForSearch_NOHMM", file=sys.stderr)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
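The test suite replaces `print >> sys.stderr, ...` with `print(..., file=sys.stderr)`, which needs the `print_function` future import on Python 2. A minimal sketch:

```python
from __future__ import print_function   # Python 2 needs this for the file= keyword
import sys

result = ["jieba", "test"]
print(" , ".join(result), file=sys.stderr)   # diagnostics go to stderr on 2.x and 3.x
print("testDefaultCut", file=sys.stderr)
```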


@ -6,7 +6,7 @@ cat abc.txt | python jiebacmd.py | sort | uniq -c | sort -nr -k1 | head -100
''' '''
from __future__ import unicode_literals
import sys import sys
sys.path.append("../") sys.path.append("../")
@ -23,6 +23,6 @@ while True:
break break
line = line.strip() line = line.strip()
for word in jieba.cut(line): for word in jieba.cut(line):
print word.encode(default_encoding) print(word)


@ -14,7 +14,7 @@ opt, args = parser.parse_args()
if len(args) <1: if len(args) <1:
print USAGE print(USAGE)
sys.exit(1) sys.exit(1)
file_name = args[0] file_name = args[0]
@ -29,6 +29,6 @@ content = open(file_name,'rb').read()
tags = jieba.analyse.extract_tags(content,topK=topK) tags = jieba.analyse.extract_tags(content,topK=topK)
print ",".join(tags) print(",".join(tags))
View File
@ -1,4 +1,5 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import print_function
import sys import sys
sys.path.append("../../") sys.path.append("../../")
import jieba import jieba
@ -7,8 +8,8 @@ jieba.enable_parallel(4)
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut(test_sent) result = jieba.cut(test_sent)
for word in result: for word in result:
print word, "/", print(word, "/", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -1,4 +1,5 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import print_function
import sys import sys
sys.path.append("../../") sys.path.append("../../")
import jieba import jieba
@ -7,8 +8,8 @@ jieba.enable_parallel(4)
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut(test_sent,cut_all=True) result = jieba.cut(test_sent,cut_all=True)
for word in result: for word in result:
print word, "/", print(word, "/", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -1,4 +1,5 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import print_function
import sys import sys
sys.path.append("../../") sys.path.append("../../")
import jieba import jieba
@ -7,8 +8,8 @@ jieba.enable_parallel(4)
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut_for_search(test_sent) result = jieba.cut_for_search(test_sent)
for word in result: for word in result:
print word, "/", print(word, "/", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -1,6 +1,5 @@
import urllib2
import sys,time
import sys import sys
import time
sys.path.append("../../") sys.path.append("../../")
import jieba import jieba
@ -17,5 +16,5 @@ tm_cost = t2-t1
log_f = open("1.log","wb") log_f = open("1.log","wb")
log_f.write(words.encode('utf-8')) log_f.write(words.encode('utf-8'))
print 'speed' , len(content)/tm_cost, " bytes/second" print('speed %s bytes/second' % (len(content)/tm_cost))
View File
@ -1,4 +1,5 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import print_function
import sys import sys
sys.path.append("../../") sys.path.append("../../")
import jieba import jieba
@ -8,8 +9,8 @@ import jieba.posseg as pseg
def cuttest(test_sent): def cuttest(test_sent):
result = pseg.cut(test_sent) result = pseg.cut(test_sent)
for w in result: for w in result:
print w.word, "/", w.flag, ", ", print(w.word, "/", w.flag, ", ", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -1,9 +1,10 @@
import urllib2 from __future__ import print_function
import sys,time import sys,time
import sys import sys
sys.path.append("../../") sys.path.append("../../")
import jieba import jieba
import jieba.posseg as pseg import jieba.posseg as pseg
jieba.enable_parallel(4) jieba.enable_parallel(4)
url = sys.argv[1] url = sys.argv[1]
@ -14,9 +15,8 @@ words = list(pseg.cut(content))
t2 = time.time() t2 = time.time()
tm_cost = t2-t1 tm_cost = t2-t1
log_f = open("1.log","wb") log_f = open("1.log","w")
for w in words: log_f.write(' / '.join(map(str, words)))
print >> log_f, w.encode("utf-8"), "/" ,
print 'speed' , len(content)/tm_cost, " bytes/second" print('speed' , len(content)/tm_cost, " bytes/second")
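The rewrite above drops the Python 2-only `print >> log_f` redirection in favour of a plain `write()`. If the log must have a guaranteed encoding on both interpreters, `io.open` with an explicit `encoding` argument is one portable option; a minimal sketch, formatting each `pair` as `word/flag` (the sample sentence is illustrative):

```python
from __future__ import unicode_literals
import io
import jieba.posseg as pseg

words = pseg.cut("我爱北京天安门")

# io.open behaves the same on Python 2 and 3 and pins the file encoding,
# so no manual .encode('utf-8') is needed before writing.
with io.open("1.log", "w", encoding="utf-8") as log_f:
    log_f.write(" / ".join("%s/%s" % (w.word, w.flag) for w in words))
```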
View File
@ -6,7 +6,7 @@ import jieba
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut(test_sent) result = jieba.cut(test_sent)
print " / ".join(result) print(" / ".join(result))
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -5,5 +5,5 @@ import jieba
import jieba.posseg as pseg import jieba.posseg as pseg
words=pseg.cut("又跛又啞") words=pseg.cut("又跛又啞")
for w in words: for w in words:
print w.word,w.flag print(w.word,w.flag)
View File
@ -5,7 +5,7 @@ import jieba
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut(test_sent) result = jieba.cut(test_sent)
print " ".join(result) print(" ".join(result))
def testcase(): def testcase():
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。") cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。")
@ -22,6 +22,6 @@ def testcase():
if __name__ == "__main__": if __name__ == "__main__":
testcase() testcase()
jieba.set_dictionary("foobar.txt") jieba.set_dictionary("foobar.txt")
print "================================" print("================================")
testcase() testcase()
View File
@ -6,8 +6,8 @@ import jieba
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut_for_search(test_sent) result = jieba.cut_for_search(test_sent)
for word in result: for word in result:
print word, "/", print(word, "/", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -6,8 +6,8 @@ import jieba
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut(test_sent,cut_all=True) result = jieba.cut(test_sent,cut_all=True)
for word in result: for word in result:
print word, "/", print(word, "/", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -1,4 +1,3 @@
import urllib2
import sys,time import sys,time
import sys import sys
sys.path.append("../") sys.path.append("../")
@ -17,6 +16,6 @@ log_f = open("1.log","wb")
log_f.write(words.encode('utf-8')) log_f.write(words.encode('utf-8'))
log_f.close() log_f.close()
print 'cost',tm_cost print('cost %s' % tm_cost)
print 'speed' , len(content)/tm_cost, " bytes/second" print('speed %s bytes/second' % (len(content)/tm_cost))
View File
@ -8,18 +8,18 @@ import jieba
class Worker(threading.Thread): class Worker(threading.Thread):
def run(self): def run(self):
seg_list = jieba.cut("我来到北京清华大学",cut_all=True) seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
print "Full Mode:" + "/ ".join(seg_list) #全模式 print("Full Mode:" + "/ ".join(seg_list)) #全模式
seg_list = jieba.cut("我来到北京清华大学",cut_all=False) seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
print "Default Mode:" + "/ ".join(seg_list) #默认模式 print("Default Mode:" + "/ ".join(seg_list)) #默认模式
seg_list = jieba.cut("他来到了网易杭研大厦") seg_list = jieba.cut("他来到了网易杭研大厦")
print ", ".join(seg_list) print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式 seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
print ", ".join(seg_list) print(", ".join(seg_list))
workers = [] workers = []
for i in xrange(10): for i in range(10):
worker = Worker() worker = Worker()
workers.append(worker) workers.append(worker)
worker.start() worker.start()
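The hunk above ends at `worker.start()`; the rest of the file lies outside the diff context. As a self-contained reference, a minimal sketch of the thread lifecycle with `range()` replacing the Python 2-only `xrange()` (the thread count and sentence are illustrative):

```python
from __future__ import print_function
import threading
import jieba

class Worker(threading.Thread):
    def run(self):
        seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
        print("Full Mode: " + "/ ".join(seg_list))

# range() exists on both Python 2 and 3, unlike xrange().
workers = [Worker() for _ in range(10)]
for w in workers:
    w.start()
for w in workers:
    w.join()   # wait for every thread before the interpreter exits
```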
View File
@ -6,7 +6,7 @@ import jieba
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut(test_sent,HMM=False) result = jieba.cut(test_sent,HMM=False)
print " / ".join(result) print(" / ".join(result))
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -1,4 +1,5 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import print_function
import sys import sys
sys.path.append("../") sys.path.append("../")
import jieba.posseg as pseg import jieba.posseg as pseg
@ -6,8 +7,8 @@ import jieba.posseg as pseg
def cuttest(test_sent): def cuttest(test_sent):
result = pseg.cut(test_sent) result = pseg.cut(test_sent)
for w in result: for w in result:
print w.word, "/", w.flag, ", ", print(w.word, "/", w.flag, ", ", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -1,6 +1,6 @@
import urllib2 from __future__ import print_function
import sys,time
import sys import sys
import time
sys.path.append("../") sys.path.append("../")
import jieba import jieba
jieba.initialize() jieba.initialize()
@ -14,9 +14,8 @@ words = list(pseg.cut(content))
t2 = time.time() t2 = time.time()
tm_cost = t2-t1 tm_cost = t2-t1
log_f = open("1.log","wb") log_f = open("1.log","w")
for w in words: log_f.write(' / '.join(map(str, words)))
print >> log_f, w.encode("utf-8"), "/" ,
print 'speed' , len(content)/tm_cost, " bytes/second" print('speed' , len(content)/tm_cost, " bytes/second")
View File
@ -6,8 +6,8 @@ import jieba.posseg as pseg
def cuttest(test_sent): def cuttest(test_sent):
result = pseg.cut(test_sent,HMM=False) result = pseg.cut(test_sent,HMM=False)
for w in result: for w in result:
print w.word, "/", w.flag, ", ", print(w.word, "/", w.flag, ", ", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -7,10 +7,9 @@ g_mode="default"
def cuttest(test_sent): def cuttest(test_sent):
global g_mode global g_mode
test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode) result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result: for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -7,10 +7,9 @@ g_mode="default"
def cuttest(test_sent): def cuttest(test_sent):
global g_mode global g_mode
test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode,HMM=False) result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
for tk in result: for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -9,19 +9,19 @@ test_sent = "李小福是创新办主任也是云计算方面的专家; 什么
test_sent += "例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型" test_sent += "例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
words = jieba.cut(test_sent) words = jieba.cut(test_sent)
for w in words: for w in words:
print w print(w)
result = pseg.cut(test_sent) result = pseg.cut(test_sent)
for w in result: for w in result:
print w.word, "/", w.flag, ", ", print(w.word, "/", w.flag, ", ", end=' ')
print "\n========" print("\n========")
terms = jieba.cut('easy_install is great') terms = jieba.cut('easy_install is great')
for t in terms: for t in terms:
print t print(t)
print '-------------------------' print('-------------------------')
terms = jieba.cut('python 的正则表达式是好用的') terms = jieba.cut('python 的正则表达式是好用的')
for t in terms: for t in terms:
print t print(t)
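The userdict test above assumes a custom dictionary has been loaded before segmentation. A minimal sketch of loading one, assuming a UTF-8 file `userdict.txt` whose lines follow the `word frequency [POS tag]` format (the file name and entries are illustrative):

```python
from __future__ import print_function
import jieba

# userdict.txt, one entry per line, whitespace-separated, e.g.:
#   云计算 5
#   韩玉赏鉴 3 nz
jieba.load_userdict("userdict.txt")

for w in jieba.cut("李小福是创新办主任也是云计算方面的专家"):
    print(w)
```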
View File
@ -1,4 +1,5 @@
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys,os import sys,os
sys.path.append("../") sys.path.append("../")
from whoosh.index import create_in,open_dir from whoosh.index import create_in,open_dir
@ -18,46 +19,46 @@ ix = create_in("tmp", schema) # for create new index
writer = ix.writer() writer = ix.writer()
writer.add_document( writer.add_document(
title=u"document1", title="document1",
path=u"/a", path="/a",
content=u"This is the first document weve added!" content="This is the first document weve added!"
) )
writer.add_document( writer.add_document(
title=u"document2", title="document2",
path=u"/b", path="/b",
content=u"The second one 你 中文测试中文 is even more interesting! 吃水果" content="The second one 你 中文测试中文 is even more interesting! 吃水果"
) )
writer.add_document( writer.add_document(
title=u"document3", title="document3",
path=u"/c", path="/c",
content=u"买水果然后来世博园。" content="买水果然后来世博园。"
) )
writer.add_document( writer.add_document(
title=u"document4", title="document4",
path=u"/c", path="/c",
content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" content="工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
) )
writer.add_document( writer.add_document(
title=u"document4", title="document4",
path=u"/c", path="/c",
content=u"咱俩交换一下吧。" content="咱俩交换一下吧。"
) )
writer.commit() writer.commit()
searcher = ix.searcher() searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema) parser = QueryParser("content", schema=ix.schema)
for keyword in (u"水果世博园",u"",u"first",u"中文",u"交换机",u"交换"): for keyword in ("水果世博园","","first","中文","交换机","交换"):
print "result of ",keyword print("result of ",keyword)
q = parser.parse(keyword) q = parser.parse(keyword)
results = searcher.search(q) results = searcher.search(q)
for hit in results: for hit in results:
print hit.highlights("content") print(hit.highlights("content"))
print "="*10 print("="*10)
for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"): for t in analyzer("我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"):
print t.text print(t.text)
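The Whoosh example above is long mainly because it indexes several documents; the jieba integration itself is a single analyzer passed into the schema. A minimal sketch of that wiring (the index directory, document, and query are illustrative):

```python
from __future__ import print_function, unicode_literals
import os
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer

# A jieba-backed analyzer tokenizes the "content" field at index and query time.
analyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))

if not os.path.exists("tmp"):
    os.mkdir("tmp")
ix = create_in("tmp", schema)

writer = ix.writer()
writer.add_document(title="document1", path="/a", content="买水果然后来世博园。")
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("content", schema=ix.schema).parse("水果")
    for hit in searcher.search(query):
        print(hit.highlights("content"))
```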
View File
@ -1,4 +1,5 @@
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys import sys
import os import os
sys.path.append("../") sys.path.append("../")
@ -23,8 +24,8 @@ with open(file_name,"rb") as inf:
for line in inf: for line in inf:
i+=1 i+=1
writer.add_document( writer.add_document(
title=u"line"+str(i), title="line"+str(i),
path=u"/a", path="/a",
content=line.decode('gbk','ignore') content=line.decode('gbk','ignore')
) )
writer.commit() writer.commit()
@ -32,10 +33,10 @@ writer.commit()
searcher = ix.searcher() searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema) parser = QueryParser("content", schema=ix.schema)
for keyword in (u"水果小姐",u"",u"first",u"中文",u"交换机",u"交换"): for keyword in ("水果小姐","","first","中文","交换机","交换"):
print "result of ",keyword print("result of " + keyword)
q = parser.parse(keyword) q = parser.parse(keyword)
results = searcher.search(q) results = searcher.search(q)
for hit in results: for hit in results:
print hit.highlights("content") print(hit.highlights("content"))
print "="*10 print("="*10)
View File
@ -1,4 +1,5 @@
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys import sys
import os import os
sys.path.append("../") sys.path.append("../")
@ -18,10 +19,10 @@ ix = open_dir("tmp")
searcher = ix.searcher() searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema) parser = QueryParser("content", schema=ix.schema)
for keyword in (u"水果小姐",u"",u"first",u"中文",u"交换机",u"交换",u"少林",u"乔峰"): for keyword in ("水果小姐","","first","中文","交换机","交换","少林","乔峰"):
print "result of ",keyword print("result of ",keyword)
q = parser.parse(keyword) q = parser.parse(keyword)
results = searcher.search(q) results = searcher.search(q)
for hit in results: for hit in results:
print hit.highlights("content") print(hit.highlights("content"))
print "="*10 print("="*10)