Merge master and jieba3k, make the code Python 2/3 compatible

commit 22bcf8be7a
Dingyuan Wang, 2015-02-10 20:54:55 +08:00
48 changed files with 131433 additions and 131938 deletions


@ -68,16 +68,16 @@ python setup.py install
import jieba import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True) seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print "Full Mode:", "/ ".join(seg_list) # 全模式 print("Full Mode: " + "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False) seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print "Default Mode:", "/ ".join(seg_list) # 精确模式 print("Default Mode: " + "/ ".join(seg_list)) # 精确模式
seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式 seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
print ", ".join(seg_list) print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print ", ".join(seg_list) print(", ".join(seg_list))
``` ```
输出: 输出:
@ -174,7 +174,7 @@ jieba.analyse.textrank(raw_text)
>>> import jieba.posseg as pseg >>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门") >>> words = pseg.cut("我爱北京天安门")
>>> for w in words: >>> for w in words:
... print w.word, w.flag ... print('%s %s' % (w.word, w.flag))
... ...
我 r 我 r
爱 v 爱 v
@ -203,7 +203,7 @@ jieba.analyse.textrank(raw_text)
```python ```python
result = jieba.tokenize(u'永和服装饰品有限公司') result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result: for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
``` ```
``` ```
@ -219,7 +219,7 @@ word 有限公司 start: 6 end:10
```python ```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search') result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result: for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
``` ```
``` ```
@ -413,16 +413,16 @@ Main Functions
import jieba import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True) seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print "Full Mode:", "/ ".join(seg_list) # 全模式 print("Full Mode: " + "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False) seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print "Default Mode:", "/ ".join(seg_list) # 默认模式 print("Default Mode: " + "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut("他来到了网易杭研大厦") seg_list = jieba.cut("他来到了网易杭研大厦")
print ", ".join(seg_list) print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print ", ".join(seg_list) print(", ".join(seg_list))
``` ```
Output: Output:
@ -488,7 +488,7 @@ Use: `jieba.analyse.textrank(raw_text)`.
>>> import jieba.posseg as pseg >>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门") >>> words = pseg.cut("我爱北京天安门")
>>> for w in words: >>> for w in words:
... print w.word, w.flag ... print('%s %s' % (w.word, w.flag))
... ...
我 r 我 r
爱 v 爱 v
@ -517,7 +517,7 @@ Use: `jieba.analyse.textrank(raw_text)`.
```python ```python
result = jieba.tokenize(u'永和服装饰品有限公司') result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result: for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
``` ```
``` ```
@ -533,7 +533,7 @@ word 有限公司 start: 6 end:10
```python ```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search') result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result: for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
``` ```
``` ```
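Most of the README churn above is the Python 2 `print` statement becoming the `print()` function, so the same snippet runs under both interpreters. For reference, a self-contained sketch of the updated quick-start example (assuming `jieba` is importable; the exact segmentation depends on the bundled dictionary):

```python
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals

import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))       # full segmentation mode

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))    # accurate mode

seg_list = jieba.cut("他来到了网易杭研大厦")       # accurate mode is the default
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search engine mode
print(", ".join(seg_list))
```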


@ -1,20 +1,20 @@
from __future__ import with_statement from __future__ import absolute_import, unicode_literals
__version__ = '0.35' __version__ = '0.35'
__license__ = 'MIT' __license__ = 'MIT'
import re import re
import os import os
import sys import sys
import finalseg
import time import time
import tempfile import tempfile
import marshal import marshal
from math import log from math import log
import random
import threading import threading
from functools import wraps from functools import wraps
import logging import logging
from hashlib import md5 from hashlib import md5
from ._compat import *
from . import finalseg
DICTIONARY = "dict.txt" DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock() DICT_LOCK = threading.RLock()
@ -51,13 +51,13 @@ def gen_pfdict(f_name):
ltotal += freq ltotal += freq
for ch in xrange(len(word)): for ch in xrange(len(word)):
pfdict.add(word[:ch+1]) pfdict.add(word[:ch+1])
except ValueError, e: except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line)) logger.debug('%s at line %s %s' % (f_name, lineno, line))
raise ValueError, e raise e
return pfdict, lfreq, ltotal return pfdict, lfreq, ltotal
def initialize(dictionary=None): def initialize(dictionary=None):
global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK global pfdict, FREQ, total, initialized, DICTIONARY, DICT_LOCK
if not dictionary: if not dictionary:
dictionary = DICTIONARY dictionary = DICTIONARY
with DICT_LOCK: with DICT_LOCK:
@ -121,7 +121,7 @@ def require_initialized(fn):
def __cut_all(sentence): def __cut_all(sentence):
dag = get_DAG(sentence) dag = get_DAG(sentence)
old_j = -1 old_j = -1
for k,L in dag.iteritems(): for k,L in iteritems(dag):
if len(L) == 1 and k > old_j: if len(L) == 1 and k > old_j:
yield sentence[k:L[0]+1] yield sentence[k:L[0]+1]
old_j = L[0] old_j = L[0]
@ -158,13 +158,13 @@ def get_DAG(sentence):
return DAG return DAG
def __cut_DAG_NO_HMM(sentence): def __cut_DAG_NO_HMM(sentence):
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U) re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
DAG = get_DAG(sentence) DAG = get_DAG(sentence)
route = {} route = {}
calc(sentence, DAG, route) calc(sentence, DAG, route)
x = 0 x = 0
N = len(sentence) N = len(sentence)
buf = u'' buf = ''
while x < N: while x < N:
y = route[x][1] + 1 y = route[x][1] + 1
l_word = sentence[x:y] l_word = sentence[x:y]
@ -174,19 +174,19 @@ def __cut_DAG_NO_HMM(sentence):
else: else:
if buf: if buf:
yield buf yield buf
buf = u'' buf = ''
yield l_word yield l_word
x = y x = y
if buf: if buf:
yield buf yield buf
buf = u'' buf = ''
def __cut_DAG(sentence): def __cut_DAG(sentence):
DAG = get_DAG(sentence) DAG = get_DAG(sentence)
route = {} route = {}
calc(sentence, DAG, route=route) calc(sentence, DAG, route=route)
x = 0 x = 0
buf = u'' buf = ''
N = len(sentence) N = len(sentence)
while x < N: while x < N:
y = route[x][1]+1 y = route[x][1]+1
@ -197,7 +197,7 @@ def __cut_DAG(sentence):
if buf: if buf:
if len(buf) == 1: if len(buf) == 1:
yield buf yield buf
buf = u'' buf = ''
else: else:
if buf not in FREQ: if buf not in FREQ:
recognized = finalseg.cut(buf) recognized = finalseg.cut(buf)
@ -206,7 +206,7 @@ def __cut_DAG(sentence):
else: else:
for elem in buf: for elem in buf:
yield elem yield elem
buf = u'' buf = ''
yield l_word yield l_word
x = y x = y
@ -225,23 +225,19 @@ def cut(sentence, cut_all=False, HMM=True):
'''The main function that segments an entire sentence that contains '''The main function that segments an entire sentence that contains
Chinese characters into seperated words. Chinese characters into seperated words.
Parameter: Parameter:
- sentence: The str/unicode to be segmented. - sentence: The str(unicode) to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern. - cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model. - HMM: Whether to use the Hidden Markov Model.
''' '''
if not isinstance(sentence, unicode): sentence = strdecode(sentence)
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
# \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han # \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled. # \r\n|\s : whitespace characters. Will not be handled.
if cut_all: if cut_all:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U) re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
else: else:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U) re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
if cut_all: if cut_all:
cut_block = __cut_all cut_block = __cut_all
@ -292,9 +288,9 @@ def load_userdict(f):
... ...
Word type may be ignored Word type may be ignored
''' '''
if isinstance(f, (str, unicode)): if isinstance(f, string_types):
f = open(f, 'rb') f = open(f, 'rb')
content = f.read().decode('utf-8').lstrip(u'\ufeff') content = f.read().decode('utf-8').lstrip('\ufeff')
line_no = 0 line_no = 0
for line in content.split("\n"): for line in content.split("\n"):
line_no += 1 line_no += 1
@ -333,15 +329,13 @@ def enable_parallel(processnum=None):
global pool, cut, cut_for_search global pool, cut, cut_for_search
if os.name == 'nt': if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system") raise Exception("jieba: parallel mode only supports posix system")
if sys.version_info[0]==2 and sys.version_info[1]<6:
raise Exception("jieba: the parallel feature needs Python version>2.5")
from multiprocessing import Pool, cpu_count from multiprocessing import Pool, cpu_count
if processnum is None: if processnum is None:
processnum = cpu_count() processnum = cpu_count()
pool = Pool(processnum) pool = Pool(processnum)
def pcut(sentence,cut_all=False,HMM=True): def pcut(sentence,cut_all=False,HMM=True):
parts = re.compile('([\r\n]+)').split(sentence) parts = strdecode(sentence).split('\n')
if cut_all: if cut_all:
result = pool.map(__lcut_all, parts) result = pool.map(__lcut_all, parts)
elif HMM: elif HMM:
@ -353,7 +347,7 @@ def enable_parallel(processnum=None):
yield w yield w
def pcut_for_search(sentence): def pcut_for_search(sentence):
parts = re.compile('([\r\n]+)').split(sentence) parts = strdecode(sentence).split('\n')
result = pool.map(__lcut_for_search, parts) result = pool.map(__lcut_for_search, parts)
for r in result: for r in result:
for w in r: for w in r:
@ -385,11 +379,11 @@ def get_abs_path_dict():
def tokenize(unicode_sentence, mode="default", HMM=True): def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end) """Tokenize a sentence and yields tuples of (word, start, end)
Parameter: Parameter:
- sentence: the unicode to be segmented. - sentence: the str(unicode) to be segmented.
- mode: "default" or "search", "search" is for finer segmentation. - mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model. - HMM: whether to use the Hidden Markov Model.
""" """
if not isinstance(unicode_sentence, unicode): if not isinstance(unicode_sentence, text_type):
raise Exception("jieba: the input parameter should be unicode.") raise Exception("jieba: the input parameter should be unicode.")
start = 0 start = 0
if mode == 'default': if mode == 'default':
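Two 2/3-compatibility idioms recur in the hunks above: `except ValueError as e` (instead of the Python-2-only `except ValueError, e`) and dict iteration through the `iteritems` helper rather than `dict.iteritems()`. A standalone sketch of both, where `parse_freq` is a toy stand-in rather than jieba's `gen_pfdict`:

```python
# Standalone sketch of the exception and dict-iteration idioms used above.
import sys

PY2 = sys.version_info[0] == 2
# Local equivalent of the iteritems helper imported from jieba._compat:
iteritems = (lambda d: d.iteritems()) if PY2 else (lambda d: iter(d.items()))

def parse_freq(lines):
    freqs = {}
    for lineno, line in enumerate(lines, 1):
        try:
            word, freq = line.split()[:2]   # raises ValueError on malformed lines
            freqs[word] = int(freq)
        except ValueError as e:             # 2.6+/3.x syntax, unlike 'except X, e'
            print('bad entry at line %d: %r' % (lineno, line))
            raise e                         # 'raise e' is valid on both 2 and 3
    return freqs

for word, freq in iteritems(parse_freq(["AT&T 3", "jieba 12"])):
    print("%s -> %d" % (word, freq))
```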


@ -25,7 +25,7 @@ args = parser.parse_args()
if args.quiet: if args.quiet:
jieba.setLogLevel(60) jieba.setLogLevel(60)
delim = unicode(args.delimiter) delim = text_type(args.delimiter)
cutall = args.cutall cutall = args.cutall
hmm = args.hmm hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin fp = open(args.filename, 'r') if args.filename else sys.stdin
@ -40,7 +40,10 @@ if args.user_dict:
ln = fp.readline() ln = fp.readline()
while ln: while ln:
l = ln.rstrip('\r\n') l = ln.rstrip('\r\n')
print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8')) result = delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm))
if PY2:
result = result.encode(default_encoding)
print(result)
ln = fp.readline() ln = fp.readline()
fp.close() fp.close()
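The command-line entry point now builds the joined segmentation as text and encodes it only on Python 2, where `stdout` expects bytes. A minimal sketch of that pattern outside jieba (`emit` is a hypothetical helper name):

```python
# Sketch of the encode-only-on-Python-2 output pattern from the hunk above.
from __future__ import unicode_literals
import sys

PY2 = sys.version_info[0] == 2
default_encoding = sys.getfilesystemencoding()

def emit(text):
    # Python 2's stdout wants bytes; Python 3's wants text.
    if PY2:
        text = text.encode(default_encoding)
    print(text)

emit(" / ".join(["\u6211", "\u7231", "\u5317\u4eac"]))   # 我 / 爱 / 北京
```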

jieba/_compat.py (new file, 31 lines)

@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
import sys
PY2 = sys.version_info[0] == 2
default_encoding = sys.getfilesystemencoding()
if PY2:
text_type = unicode
string_types = (str, unicode)
iterkeys = lambda d: d.iterkeys()
itervalues = lambda d: d.itervalues()
iteritems = lambda d: d.iteritems()
else:
text_type = str
string_types = (str,)
xrange = range
iterkeys = lambda d: iter(d.keys())
itervalues = lambda d: iter(d.values())
iteritems = lambda d: iter(d.items())
def strdecode(sentence):
if not isinstance(sentence, text_type):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
return sentence
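The new `jieba/_compat.py` gathers every version shim in one place: `text_type`, `string_types`, `xrange`, the dict-iteration helpers, and `strdecode`, which accepts text or bytes and returns text, falling back from UTF-8 to GBK. A small usage sketch, assuming the module is on the import path:

```python
# -*- coding: utf-8 -*-
# Usage sketch for the helpers defined in jieba/_compat.py above.
from __future__ import unicode_literals
from jieba._compat import strdecode, text_type, iteritems

s1 = strdecode("我来到北京".encode("utf-8"))   # bytes in -> text out (UTF-8, then GBK fallback)
s2 = strdecode("我来到北京")                   # text in  -> passed through unchanged
assert isinstance(s1, text_type) and s1 == s2

for key, value in iteritems({"freq": 3}):      # one spelling for dict iteration on 2.x and 3.x
    print("%s=%d" % (key, value))
```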


@ -1,13 +1,14 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import absolute_import
import jieba import jieba
import jieba.posseg import jieba.posseg
import os import os
from operator import itemgetter from operator import itemgetter
try: try:
from analyzer import ChineseAnalyzer from .analyzer import ChineseAnalyzer
except ImportError: except ImportError:
pass pass
from textrank import textrank from .textrank import textrank
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
abs_path = os.path.join(_curpath, "idf.txt") abs_path = os.path.join(_curpath, "idf.txt")
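The imports here move from implicit relative imports (`from analyzer import ...`), which Python 3 rejects, to explicit relative imports, with `absolute_import` making Python 2 resolve them the same way. A sketch of the pattern in a hypothetical package named `mypkg`:

```python
# mypkg/__init__.py -- hypothetical package illustrating the import change above.
from __future__ import absolute_import   # Python 2: turn off implicit relative imports

try:
    # Explicit relative import: resolved the same way on Python 2 and 3.
    from .analyzer import ChineseAnalyzer
except ImportError:
    # The optional whoosh dependency may be absent; keep the package importable.
    pass
```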


@ -1,4 +1,5 @@
##encoding=utf-8 #encoding=utf-8
from __future__ import unicode_literals
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem from whoosh.lang.porter import stem
@ -10,9 +11,9 @@ STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may', 'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this', 'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
'to', 'us', 'we', 'when', 'will', 'with', 'yet', 'to', 'us', 'we', 'when', 'will', 'with', 'yet',
'you', 'your', u'', u'', u'')) 'you', 'your', '', '', ''))
accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+") accepted_chars = re.compile(r"[\u4E00-\u9FA5]+")
class ChineseTokenizer(Tokenizer): class ChineseTokenizer(Tokenizer):
def __call__(self, text, **kargs): def __call__(self, text, **kargs):


@ -1,6 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys import sys
import collections import collections
from operator import itemgetter from operator import itemgetter
@ -35,7 +36,7 @@ class UndirectWeightedGraph:
(min_rank, max_rank) = (sys.float_info[0], sys.float_info[3]) (min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
for w in ws.itervalues(): for w in itervalues(ws):
if w < min_rank: if w < min_rank:
min_rank = w min_rank = w
elif w > max_rank: elif w > max_rank:
@ -88,4 +89,4 @@ def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v'
if __name__ == '__main__': if __name__ == '__main__':
s = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。" s = "此外公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元增资后吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年实现营业收入0万元实现净利润-139.13万元。"
for x, w in textrank(s, withWeight=True): for x, w in textrank(s, withWeight=True):
print x, w print('%s %s' % (x, w))


@ -1,8 +1,9 @@
from __future__ import with_statement from __future__ import absolute_import, unicode_literals
import re import re
import os import os
import marshal import marshal
import sys import sys
from .._compat import *
MIN_FLOAT = -3.14e100 MIN_FLOAT = -3.14e100
@ -41,9 +42,9 @@ def load_model():
if sys.platform.startswith("java"): if sys.platform.startswith("java"):
start_P, trans_P, emit_P = load_model() start_P, trans_P, emit_P = load_model()
else: else:
from prob_start import P as start_P from .prob_start import P as start_P
from prob_trans import P as trans_P from .prob_trans import P as trans_P
from prob_emit import P as emit_P from .prob_emit import P as emit_P
def viterbi(obs, states, start_p, trans_p, emit_p): def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular V = [{}] #tabular
@ -85,12 +86,8 @@ def __cut(sentence):
yield sentence[nexti:] yield sentence[nexti:]
def cut(sentence): def cut(sentence):
if not isinstance(sentence, unicode): sentence = strdecode(sentence)
try: re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"(\d+\.\d+|[a-zA-Z0-9]+)")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
for blk in blocks: for blk in blocks:
if re_han.match(blk): if re_han.match(blk):
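`finalseg.cut` now normalizes its input with `strdecode` and builds its regexes from plain string literals; under `unicode_literals` Python 2 treats them as unicode patterns, so the `\uXXXX` ranges behave identically on both interpreters. A standalone sketch of the block-splitting step, with the patterns copied from the hunk above:

```python
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals  # unicode patterns on Python 2
import re

# Patterns copied from the finalseg hunk above (plain literals, no ur'' prefix needed).
re_han = re.compile("([\u4E00-\u9FA5]+)")           # runs of CJK characters
re_skip = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")     # decimals and latin/digit runs

sentence = "小明住在3号楼, room 42"
for blk in re_han.split(sentence):
    if not blk:
        continue
    if re_han.match(blk):
        print("CJK block:  ", blk)
    else:
        # Non-CJK text is split again so numbers and words survive as tokens.
        print("other block:", [x for x in re_skip.split(blk) if x.strip()])
```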

File diff suppressed because it is too large.


@ -1,13 +1,12 @@
from __future__ import with_statement from __future__ import absolute_import, unicode_literals
import re import re
import os import os
import viterbi
import jieba import jieba
import sys import sys
import marshal import marshal
from functools import wraps from functools import wraps
from .._compat import *
default_encoding = sys.getfilesystemencoding() from .viterbi import viterbi
PROB_START_P = "prob_start.p" PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p" PROB_TRANS_P = "prob_trans.p"
@ -18,13 +17,14 @@ def load_model(f_name, isJython=True):
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
result = {} result = {}
with open(f_name, "r") as f: with open(f_name, "rb") as f:
for line in f: for line in f:
line = line.strip() line = line.strip()
if not line: if not line:
continue continue
word, _, tag = line.split(' ') line = line.decode("utf-8")
result[word.decode('utf-8')] = tag word, _, tag = line.split(" ")
result[word] = tag
if not isJython: if not isJython:
return result return result
@ -55,10 +55,10 @@ def load_model(f_name, isJython=True):
if sys.platform.startswith("java"): if sys.platform.startswith("java"):
char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict()) char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
else: else:
from char_state_tab import P as char_state_tab_P from .char_state_tab import P as char_state_tab_P
from prob_start import P as start_P from .prob_start import P as start_P
from prob_trans import P as trans_P from .prob_trans import P as trans_P
from prob_emit import P as emit_P from .prob_emit import P as emit_P
word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False) word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
@ -79,20 +79,23 @@ class pair(object):
self.flag = flag self.flag = flag
def __unicode__(self): def __unicode__(self):
return u'%s/%s' % (self.word, self.flag) return '%s/%s' % (self.word, self.flag)
def __repr__(self): def __repr__(self):
return self.__str__() return self.__str__()
def __str__(self): def __str__(self):
if PY2:
return self.__unicode__().encode(default_encoding) return self.__unicode__().encode(default_encoding)
else:
return self.__unicode__()
def encode(self,arg): def encode(self,arg):
return self.__unicode__().encode(arg) return self.__unicode__().encode(arg)
def __cut(sentence): def __cut(sentence):
prob, pos_list = viterbi.viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P) prob, pos_list = viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
begin, next = 0, 0 begin, nexti = 0, 0
for i,char in enumerate(sentence): for i,char in enumerate(sentence):
pos = pos_list[i][0] pos = pos_list[i][0]
@ -100,16 +103,16 @@ def __cut(sentence):
begin = i begin = i
elif pos == 'E': elif pos == 'E':
yield pair(sentence[begin:i+1], pos_list[i][1]) yield pair(sentence[begin:i+1], pos_list[i][1])
next = i+1 nexti = i+1
elif pos == 'S': elif pos == 'S':
yield pair(char, pos_list[i][1]) yield pair(char, pos_list[i][1])
next = i+1 nexti = i+1
if next < len(sentence): if nexti < len(sentence):
yield pair(sentence[next:], pos_list[next][1]) yield pair(sentence[nexti:], pos_list[nexti][1])
def __cut_detail(sentence): def __cut_detail(sentence):
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)") re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+") re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
for blk in blocks: for blk in blocks:
if re_han.match(blk): if re_han.match(blk):
@ -132,8 +135,8 @@ def __cut_DAG_NO_HMM(sentence):
jieba.calc(sentence, DAG, route) jieba.calc(sentence, DAG, route)
x = 0 x = 0
N = len(sentence) N = len(sentence)
buf = u'' buf = ''
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U) re_eng = re.compile('[a-zA-Z0-9]',re.U)
while x < N: while x < N:
y = route[x][1]+1 y = route[x][1]+1
l_word = sentence[x:y] l_word = sentence[x:y]
@ -143,12 +146,12 @@ def __cut_DAG_NO_HMM(sentence):
else: else:
if buf: if buf:
yield pair(buf,'eng') yield pair(buf,'eng')
buf = u'' buf = ''
yield pair(l_word, word_tag_tab.get(l_word, 'x')) yield pair(l_word, word_tag_tab.get(l_word, 'x'))
x = y x = y
if buf: if buf:
yield pair(buf,'eng') yield pair(buf,'eng')
buf = u'' buf = ''
def __cut_DAG(sentence): def __cut_DAG(sentence):
DAG = jieba.get_DAG(sentence) DAG = jieba.get_DAG(sentence)
@ -157,7 +160,7 @@ def __cut_DAG(sentence):
jieba.calc(sentence, DAG, route) jieba.calc(sentence, DAG, route)
x = 0 x = 0
buf = u'' buf = ''
N = len(sentence) N = len(sentence)
while x < N: while x < N:
y = route[x][1]+1 y = route[x][1]+1
@ -175,7 +178,7 @@ def __cut_DAG(sentence):
else: else:
for elem in buf: for elem in buf:
yield pair(elem, word_tag_tab.get(elem, 'x')) yield pair(elem, word_tag_tab.get(elem, 'x'))
buf = u'' buf = ''
yield pair(l_word, word_tag_tab.get(l_word, 'x')) yield pair(l_word, word_tag_tab.get(l_word, 'x'))
x = y x = y
@ -191,13 +194,9 @@ def __cut_DAG(sentence):
yield pair(elem, word_tag_tab.get(elem, 'x')) yield pair(elem, word_tag_tab.get(elem, 'x'))
def __cut_internal(sentence, HMM=True): def __cut_internal(sentence, HMM=True):
if not isinstance(sentence, unicode): sentence = strdecode(sentence)
try: re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
sentence = sentence.decode('utf-8') re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)")
re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
if HMM: if HMM:
__cut_blk = __cut_DAG __cut_blk = __cut_DAG
@ -234,7 +233,7 @@ def cut(sentence, HMM=True):
for w in __cut_internal(sentence, HMM=HMM): for w in __cut_internal(sentence, HMM=HMM):
yield w yield w
else: else:
parts = re.compile('([\r\n]+)').split(sentence) parts = strdecode(sentence).split('\n')
if HMM: if HMM:
result = jieba.pool.map(__lcut_internal, parts) result = jieba.pool.map(__lcut_internal, parts)
else: else:
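posseg's `pair` objects now print sensibly on both interpreters: `__str__` returns encoded bytes on Python 2 and plain text on Python 3, both built from `__unicode__`. A condensed sketch of that pattern (class name `Pair` here, to keep it distinct from jieba's own `pair`):

```python
# -*- coding: utf-8 -*-
# Condensed sketch of the __unicode__/__str__ split used by posseg's pair above.
from __future__ import unicode_literals
import sys

PY2 = sys.version_info[0] == 2
default_encoding = sys.getfilesystemencoding()

class Pair(object):
    def __init__(self, word, flag):
        self.word, self.flag = word, flag

    def __unicode__(self):
        return '%s/%s' % (self.word, self.flag)

    def __str__(self):
        # str() must return bytes on Python 2 but text on Python 3.
        if PY2:
            return self.__unicode__().encode(default_encoding)
        return self.__unicode__()

    __repr__ = __str__

print(Pair("北京", "ns"))   # -> 北京/ns
```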

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@ -1,7 +1,11 @@
import sys
import operator import operator
MIN_FLOAT = -3.14e100 MIN_FLOAT = -3.14e100
MIN_INF = float("-inf") MIN_INF = float("-inf")
if sys.version_info[0] > 2:
xrange = range
def get_top_states(t_state_v, K=4): def get_top_states(t_state_v, K=4):
return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K] return sorted(t_state_v, key=t_state_v.__getitem__, reverse=True)[:K]
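The Viterbi helper keeps calling `xrange`, so on Python 3 it is simply aliased to `range`. The whole shim, shown standalone:

```python
import sys

if sys.version_info[0] > 2:
    xrange = range                      # Python 3: range is already lazy

print(sum(i for i in xrange(5)))        # 10 on both 2.x and 3.x
```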


@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from distutils.core import setup from distutils.core import setup
LONGDOC = u""" LONGDOC = """
jieba jieba
===== =====
@ -75,6 +75,12 @@ setup(name='jieba',
'Natural Language :: Chinese (Traditional)', 'Natural Language :: Chinese (Traditional)',
'Programming Language :: Python', 'Programming Language :: Python',
'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.6',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.2',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Topic :: Text Processing', 'Topic :: Text Processing',
'Topic :: Text Processing :: Indexing', 'Topic :: Text Processing :: Indexing',
'Topic :: Text Processing :: Linguistic', 'Topic :: Text Processing :: Linguistic',
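With one package serving both interpreters, `setup.py` now lists every supported version in its trove classifiers. A trimmed, hypothetical `setup()` call showing only the metadata touched by this hunk:

```python
# Hypothetical, trimmed setup() call; the full file carries more metadata.
from distutils.core import setup

setup(
    name='jieba',
    version='0.35',
    classifiers=[
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.2',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
    ],
)
```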


@ -1,522 +0,0 @@
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py
--- ./jieba/analyse/analyzer.py 2014-11-29 15:46:45.987925569 +0800
+++ ../jieba/jieba/analyse/analyzer.py 2014-11-29 15:34:42.859932465 +0800
@@ -1,4 +1,4 @@
-##encoding=utf-8
+#encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py
--- ./jieba/analyse/__init__.py 2014-11-29 15:46:46.139925567 +0800
+++ ../jieba/jieba/analyse/__init__.py 2014-11-29 15:36:13.147931604 +0800
@@ -26,7 +26,7 @@
def set_new_path(self, new_idf_path):
if self.path != new_idf_path:
- content = open(new_idf_path, 'rb').read().decode('utf-8')
+ content = open(new_idf_path, 'r', encoding='utf-8').read()
idf_freq = {}
lines = content.rstrip('\n').split('\n')
for line in lines:
@@ -93,7 +93,7 @@
freq[k] *= idf_freq.get(k, median_idf) / total
if withWeight:
- tags = sorted(list(freq.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(freq, key=freq.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py
--- ./jieba/analyse/textrank.py 2014-11-29 15:46:46.043925568 +0800
+++ ../jieba/jieba/analyse/textrank.py 2014-11-29 15:36:39.291931354 +0800
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
@@ -22,12 +22,12 @@
outSum = collections.defaultdict(float)
wsdef = 1.0 / len(self.graph)
- for n, out in list(self.graph.items()):
+ for n, out in self.graph.items():
ws[n] = wsdef
outSum[n] = sum((e[2] for e in out), 0.0)
for x in range(10): # 10 iters
- for n, inedges in list(self.graph.items()):
+ for n, inedges in self.graph.items():
s = 0
for e in inedges:
s += e[2] / outSum[e[1]] * ws[e[1]]
@@ -41,7 +41,7 @@
elif w > max_rank:
max_rank = w
- for n, w in list(ws.items()):
+ for n, w in ws.items():
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
@@ -72,12 +72,12 @@
continue
cm[(words[i].word, words[j].word)] += 1
- for terms, w in list(cm.items()):
+ for terms, w in cm.items():
g.addEdge(terms[0], terms[1], w)
nodes_rank = g.rank()
if withWeight:
- tags = sorted(list(nodes_rank.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py
--- ./jieba/finalseg/__init__.py 2014-11-29 15:46:46.367925565 +0800
+++ ../jieba/jieba/finalseg/__init__.py 2014-11-29 15:34:42.859932465 +0800
@@ -1,4 +1,3 @@
-
import re
import os
import marshal
@@ -89,7 +88,7 @@
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py
--- ./jieba/__init__.py 2014-11-29 15:46:45.955925569 +0800
+++ ../jieba/jieba/__init__.py 2014-11-29 15:39:03.335929981 +0800
@@ -1,4 +1,3 @@
-
__version__ = '0.35'
__license__ = 'MIT'
@@ -51,7 +50,7 @@
pfdict.add(word[:ch+1])
except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line))
- raise ValueError(e)
+ raise e
return pfdict, lfreq, ltotal
def initialize(dictionary=None):
@@ -229,11 +228,11 @@
'''The main function that segments an entire sentence that contains
Chinese characters into seperated words.
Parameter:
- - sentence: The str/unicode to be segmented.
+ - sentence: The str to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
'''
- if not isinstance(sentence, str):
+ if isinstance(sentence, bytes):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
@@ -243,9 +242,9 @@
# \r\n|\s : whitespace characters. Will not be handled.
if cut_all:
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)", re.U), re.compile("[^a-zA-Z0-9+#\n]", re.U)
else:
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U)
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile("(\r\n|\s)", re.U)
blocks = re_han.split(sentence)
if cut_all:
cut_block = __cut_all
@@ -339,8 +338,6 @@
global pool, cut, cut_for_search
if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system")
- if sys.version_info[0]==2 and sys.version_info[1]<6:
- raise Exception("jieba: the parallel feature needs Python version>2.5")
from multiprocessing import Pool, cpu_count
if processnum is None:
processnum = cpu_count()
@@ -393,12 +390,12 @@
def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end)
Parameter:
- - sentence: the unicode to be segmented.
+ - sentence: the str to be segmented.
- mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model.
"""
if not isinstance(unicode_sentence, str):
- raise Exception("jieba: the input parameter should be unicode.")
+ raise Exception("jieba: the input parameter should be str.")
start = 0
if mode == 'default':
for w in cut(unicode_sentence, HMM=HMM):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py
--- ./jieba/__main__.py 2014-11-29 15:46:45.747925571 +0800
+++ ../jieba/jieba/__main__.py 2014-11-29 15:34:42.859932465 +0800
@@ -40,7 +40,7 @@
ln = fp.readline()
while ln:
l = ln.rstrip('\r\n')
- print((delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8')))
+ print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))
ln = fp.readline()
fp.close()
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py
--- ./jieba/posseg/__init__.py 2014-11-29 15:46:46.271925566 +0800
+++ ../jieba/jieba/posseg/__init__.py 2014-11-29 15:37:52.299930658 +0800
@@ -1,4 +1,3 @@
-
import re
import os
from . import viterbi
@@ -18,14 +17,14 @@
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
result = {}
- with open(f_name, "r") as f:
+ with open(f_name, "rb") as f:
for line in f:
line = line.strip()
if not line:
continue
- word, _, tag = line.split(' ')
- result[word.decode('utf-8')] = tag
-
+ line = line.decode("utf-8")
+ word, _, tag = line.split(" ")
+ result[word] = tag
if not isJython:
return result
@@ -105,8 +104,8 @@
yield pair(sentence[next:], pos_list[next][1])
def __cut_detail(sentence):
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
+ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
@@ -130,7 +129,7 @@
x = 0
N = len(sentence)
buf = ''
- re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
+ re_eng = re.compile('[a-zA-Z0-9]',re.U)
while x < N:
y = route[x][1]+1
l_word = sentence[x:y]
@@ -195,8 +194,8 @@
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
- re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)")
- re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+ re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\r\n|\s)")
+ re_eng, re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
blocks = re_han.split(sentence)
if HMM:
__cut_blk = __cut_DAG
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py
--- ./jieba/posseg/viterbi.py 2014-11-29 15:46:46.303925566 +0800
+++ ../jieba/jieba/posseg/viterbi.py 2014-11-29 15:38:28.527930313 +0800
@@ -8,7 +8,7 @@
def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
mem_path = [{}]
- all_states = list(trans_p.keys())
+ all_states = trans_p.keys()
for y in states.get(obs[0], all_states): #init
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
mem_path[0][y] = ''
@@ -16,9 +16,9 @@
V.append({})
mem_path.append({})
#prev_states = get_top_states(V[t-1])
- prev_states = [x for x in list(mem_path[t-1].keys()) if len(trans_p[x]) > 0]
+ prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]
- prev_states_expect_next = set((y for x in prev_states for y in list(trans_p[x].keys())))
+ prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next
if not obs_states:
@@ -29,7 +29,7 @@
V[t][y] = prob
mem_path[t][y] = state
- last = [(V[-1][y], y) for y in list(mem_path[-1].keys())]
+ last = [(V[-1][y], y) for y in mem_path[-1].keys()]
#if len(last)==0:
#print obs
prob, state = max(last)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md
--- ./README.md 2014-11-29 15:46:08.487925926 +0800
+++ ../jieba/README.md 2014-11-29 15:34:42.859932465 +0800
@@ -4,6 +4,9 @@
"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
- _Scroll down for English documentation._
+注意!
+========
+这个branch `jieba3k` 是专门用于Python3.x的版本
特点
========
@@ -68,16 +71,16 @@
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 精确模式
+print("Default Mode:", "/ ".join(seg_list)) # 精确模式
seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```
输出:
@@ -174,7 +177,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -203,7 +206,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -219,7 +222,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -408,16 +411,16 @@
import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 默认模式
+print("Default Mode:", "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut("他来到了网易杭研大厦")
-print ", ".join(seg_list)
+print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```
Output:
@@ -483,7 +486,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -512,7 +515,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
@@ -528,7 +531,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```
```
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py
--- ./setup.py 2014-11-29 15:46:46.379925565 +0800
+++ ../jieba/setup.py 2014-11-29 15:42:20.263928103 +0800
@@ -11,7 +11,7 @@
完整文档见 ``README.md``
-GitHub: https://github.com/fxsjy/jieba
+GitHub: https://github.com/fxsjy/jieba/tree/jieba3k
特点
====
@@ -34,17 +34,11 @@
Python 2.x
----------
-- 全自动安装: ``easy_install jieba`` 或者 ``pip install jieba``
-- 半自动安装:先下载 https://pypi.python.org/pypi/jieba/ ,解压后运行
- python setup.py install
-- 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录
-- 通过 ``import jieba`` 来引用
+见 https://pypi.python.org/pypi/jieba/
Python 3.x
----------
-见 https://pypi.python.org/pypi/jieba3k/
-
- 目前 master 分支是只支持 Python 2.x 的
- Python 3.x 版本的分支也已经基本可用:
https://github.com/fxsjy/jieba/tree/jieba3k
@@ -59,13 +53,13 @@
"""
-setup(name='jieba',
+setup(name='jieba3k',
version='0.35.1',
description='Chinese Words Segementation Utilities',
long_description=LONGDOC,
author='Sun, Junyi',
author_email='ccnusjy@gmail.com',
- url='https://github.com/fxsjy/jieba',
+ url='https://github.com/fxsjy/jieba/tree/jieba3k',
license="MIT",
classifiers=[
'Intended Audience :: Developers',
@@ -73,9 +67,8 @@
'Operating System :: OS Independent',
'Natural Language :: Chinese (Simplified)',
'Natural Language :: Chinese (Traditional)',
'Programming Language :: Python',
- 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 3',
'Topic :: Text Processing',
'Topic :: Text Processing :: Indexing',
'Topic :: Text Processing :: Linguistic',
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py
--- ./test/extract_topic.py 2014-11-29 15:46:47.003925559 +0800
+++ ../jieba/test/extract_topic.py 2014-11-29 15:34:42.919932464 +0800
@@ -51,13 +51,13 @@
print("training...")
nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
-print(("done in %0.3fs." % (time.time() - t0)))
+print("done in %0.3fs." % (time.time() - t0))
# Inverse the vectorizer vocabulary to be able
feature_names = count_vect.get_feature_names()
for topic_idx, topic in enumerate(nmf.components_):
- print(("Topic #%d:" % topic_idx))
- print((" ".join([feature_names[i]
- for i in topic.argsort()[:-n_top_words - 1:-1]])))
+ print("Topic #%d:" % topic_idx)
+ print(" ".join([feature_names[i]
+ for i in topic.argsort()[:-n_top_words - 1:-1]]))
print("")
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py
--- ./test/jiebacmd.py 2014-11-29 15:46:46.443925564 +0800
+++ ../jieba/test/jiebacmd.py 2014-11-29 15:34:42.919932464 +0800
@@ -23,6 +23,6 @@
break
line = line.strip()
for word in jieba.cut(line):
- print(word.encode(default_encoding))
+ print(word)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py
--- ./test/jieba_test.py 2014-11-29 15:46:47.271925556 +0800
+++ ../jieba/test/jieba_test.py 2014-11-29 15:34:42.919932464 +0800
@@ -152,7 +152,7 @@
#-*-coding: utf-8 -*-
import sys
+import imp
sys.path.append("../")
import unittest
import types
@@ -97,7 +98,7 @@
class JiebaTestCase(unittest.TestCase):
def setUp(self):
- reload(jieba)
+ imp.reload(jieba)
def tearDown(self):
pass
@@ -151,7 +152,7 @@
def testTokenize(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'))
+ result = jieba.tokenize(content)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
@@ -181,7 +181,7 @@
def testTokenize_NOHMM(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'),HMM=False)
+ result = jieba.tokenize(content,HMM=False)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py
--- ./test/test_tokenize_no_hmm.py 2014-11-29 15:46:47.355925556 +0800
+++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-29 15:34:42.919932464 +0800
@@ -7,7 +7,6 @@
def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py
--- ./test/test_tokenize.py 2014-11-29 15:46:47.403925555 +0800
+++ ../jieba/test/test_tokenize.py 2014-11-29 15:34:42.919932464 +0800
@@ -7,7 +7,6 @@
def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))


@ -1,34 +0,0 @@
#!/bin/bash
# Set 2to3 path.
PYTHON2TO3=2to3
# Copy the python2 version.
echo Jieba 2to3 manual conversion tool
echo
if ! git rev-parse; then
exit 1
fi
echo Copying working directory to ../jieba2
if [ -d ../jieba2 ]; then
echo Found existing ../jieba2
read -p "Replace it with new one? (y/n) " -r
if ! [[ $REPLY =~ ^[Yy]$ ]]; then
echo Cancelled.
exit
else
rm -rf ../jieba2
fi
fi
if ! git checkout jieba3k; then
exit 1
fi
cp -r . ../jieba2
cd ../jieba2
if ! git checkout master; then
exit 1
fi
# Here starts auto conversion.
echo Converting jieba2 to Python3 ...
find . -type f -name '*.py' \! -path '*/build/*' \! -name 'prob_*.py' \! -name 'char_state_tab.py' -exec $PYTHON2TO3 -w -n {} +
find . -type f \! -path '*/build/*' -a \( -name 'prob_*.py' -o -name 'char_state_tab.py' \) -exec sed -i "s/u'\\\u/'\\\u/g" {} \;
patch -p0 -s <../jieba/test/2to3.diff
echo Done. Compare jieba and jieba2 to manually port.


@ -1,17 +1,18 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import unicode_literals
import sys import sys
sys.path.append("../") sys.path.append("../")
import jieba import jieba
seg_list = jieba.cut(u"我来到北京清华大学", cut_all=True) seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print u"Full Mode:", u"/ ".join(seg_list) # 全模式 print("Full Mode: " + "/ ".join(seg_list)) # 全模式
seg_list = jieba.cut(u"我来到北京清华大学", cut_all=False) seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print u"Default Mode:", u"/ ".join(seg_list) # 默认模式 print("Default Mode: " + "/ ".join(seg_list)) # 默认模式
seg_list = jieba.cut(u"他来到了网易杭研大厦") seg_list = jieba.cut("他来到了网易杭研大厦")
print u", ".join(seg_list) print(", ".join(seg_list))
seg_list = jieba.cut_for_search(u"小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print u", ".join(seg_list) print(", ".join(seg_list))


@ -13,7 +13,7 @@ opt, args = parser.parse_args()
if len(args) < 1: if len(args) < 1:
print USAGE print(USAGE)
sys.exit(1) sys.exit(1)
file_name = args[0] file_name = args[0]
@ -27,4 +27,4 @@ content = open(file_name, 'rb').read()
tags = jieba.analyse.extract_tags(content, topK=topK) tags = jieba.analyse.extract_tags(content, topK=topK)
print ",".join(tags) print(",".join(tags))


@ -13,7 +13,7 @@ opt, args = parser.parse_args()
if len(args) < 1: if len(args) < 1:
print USAGE print(USAGE)
sys.exit(1) sys.exit(1)
file_name = args[0] file_name = args[0]
@ -29,4 +29,4 @@ jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
tags = jieba.analyse.extract_tags(content, topK=topK) tags = jieba.analyse.extract_tags(content, topK=topK)
print ",".join(tags) print(",".join(tags))


@ -13,7 +13,7 @@ opt, args = parser.parse_args()
if len(args) < 1: if len(args) < 1:
print USAGE print(USAGE)
sys.exit(1) sys.exit(1)
file_name = args[0] file_name = args[0]
@ -30,4 +30,4 @@ jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
tags = jieba.analyse.extract_tags(content, topK=topK) tags = jieba.analyse.extract_tags(content, topK=topK)
print ",".join(tags) print(",".join(tags))


@ -14,7 +14,7 @@ opt, args = parser.parse_args()
if len(args) < 1: if len(args) < 1:
print USAGE print(USAGE)
sys.exit(1) sys.exit(1)
file_name = args[0] file_name = args[0]
@ -38,6 +38,6 @@ tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)
if withWeight is True: if withWeight is True:
for tag in tags: for tag in tags:
print "tag: %s\t\t weight: %f" % (tag[0],tag[1]) print("tag: %s\t\t weight: %f" % (tag[0],tag[1]))
else: else:
print ",".join(tags) print(",".join(tags))


@ -12,7 +12,7 @@ import os
import random import random
if len(sys.argv)<2: if len(sys.argv)<2:
print "usage: extract_topic.py directory [n_topic] [n_top_words]" print("usage: extract_topic.py directory [n_topic] [n_top_words]")
sys.exit(0) sys.exit(0)
n_topic = 10 n_topic = 10
@ -28,27 +28,27 @@ count_vect = CountVectorizer()
docs = [] docs = []
pattern = os.path.join(sys.argv[1],"*.txt") pattern = os.path.join(sys.argv[1],"*.txt")
print "read "+pattern print("read "+pattern)
for f_name in glob.glob(pattern): for f_name in glob.glob(pattern):
with open(f_name) as f: with open(f_name) as f:
print "read file:", f_name print("read file:", f_name)
for line in f: #one line as a document for line in f: #one line as a document
words = " ".join(jieba.cut(line)) words = " ".join(jieba.cut(line))
docs.append(words) docs.append(words)
random.shuffle(docs) random.shuffle(docs)
print "read done." print("read done.")
print "transform" print("transform")
counts = count_vect.fit_transform(docs) counts = count_vect.fit_transform(docs)
tfidf = TfidfTransformer().fit_transform(counts) tfidf = TfidfTransformer().fit_transform(counts)
print tfidf.shape print(tfidf.shape)
t0 = time.time() t0 = time.time()
print "training..." print("training...")
nmf = decomposition.NMF(n_components=n_topic).fit(tfidf) nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
print("done in %0.3fs." % (time.time() - t0)) print("done in %0.3fs." % (time.time() - t0))


@ -1,9 +1,13 @@
#-*-coding: utf-8 -*- #-*-coding: utf-8 -*-
from __future__ import unicode_literals, print_function
import sys import sys
sys.path.append("../") sys.path.append("../")
import unittest import unittest
import types import types
import jieba import jieba
if sys.version_info[0] > 2:
from imp import reload
jieba.initialize() jieba.initialize()
@ -108,8 +112,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error" assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test DefaultCut error on content: %s" % content assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
print >> sys.stderr, " , ".join(result) print(" , ".join(result), file=sys.stderr)
print >> sys.stderr, "testDefaultCut" print("testDefaultCut", file=sys.stderr)
def testCutAll(self): def testCutAll(self):
for content in test_contents: for content in test_contents:
@ -117,8 +121,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test CutAll Generator error" assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test CutAll error on content: %s" % content assert isinstance(result, list), "Test CutAll error on content: %s" % content
print >> sys.stderr, " , ".join(result) print(" , ".join(result), file=sys.stderr)
print >> sys.stderr, "testCutAll" print("testCutAll", file=sys.stderr)
def testSetDictionary(self): def testSetDictionary(self):
jieba.set_dictionary("foobar.txt") jieba.set_dictionary("foobar.txt")
@ -127,8 +131,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error" assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test SetDictionary error on content: %s" % content assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
print >> sys.stderr, " , ".join(result) print(" , ".join(result), file=sys.stderr)
print >> sys.stderr, "testSetDictionary" print("testSetDictionary", file=sys.stderr)
def testCutForSearch(self): def testCutForSearch(self):
for content in test_contents: for content in test_contents:
@ -136,8 +140,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error" assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test CutForSearch error on content: %s" % content assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
print >> sys.stderr, " , ".join(result) print(" , ".join(result), file=sys.stderr)
print >> sys.stderr, "testCutForSearch" print("testCutForSearch", file=sys.stderr)
def testPosseg(self): def testPosseg(self):
import jieba.posseg as pseg import jieba.posseg as pseg
@ -146,18 +150,18 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test Posseg Generator error" assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test Posseg error on content: %s" % content assert isinstance(result, list), "Test Posseg error on content: %s" % content
print >> sys.stderr, " , ".join([w.word + " / " + w.flag for w in result]) print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
print >> sys.stderr, "testPosseg" print("testPosseg", file=sys.stderr)
def testTokenize(self): def testTokenize(self):
for content in test_contents: for content in test_contents:
result = jieba.tokenize(content.decode('utf-8')) result = jieba.tokenize(content)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error" assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content assert isinstance(result, list), "Test Tokenize error on content: %s" % content
for tk in result: for tk in result:
print >>sys.stderr, "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
print >> sys.stderr, "testTokenize" print("testTokenize", file=sys.stderr)
def testDefaultCut_NOHMM(self): def testDefaultCut_NOHMM(self):
for content in test_contents: for content in test_contents:
@ -165,8 +169,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error" assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test DefaultCut error on content: %s" % content assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
print >> sys.stderr, " , ".join(result) print(" , ".join(result), file=sys.stderr)
print >> sys.stderr, "testDefaultCut_NOHMM" print("testDefaultCut_NOHMM", file=sys.stderr)
def testPosseg_NOHMM(self): def testPosseg_NOHMM(self):
import jieba.posseg as pseg import jieba.posseg as pseg
@ -175,18 +179,18 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test Posseg Generator error" assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test Posseg error on content: %s" % content assert isinstance(result, list), "Test Posseg error on content: %s" % content
print >> sys.stderr, " , ".join([w.word + " / " + w.flag for w in result]) print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
print >> sys.stderr, "testPosseg_NOHMM" print("testPosseg_NOHMM", file=sys.stderr)
def testTokenize_NOHMM(self): def testTokenize_NOHMM(self):
for content in test_contents: for content in test_contents:
result = jieba.tokenize(content.decode('utf-8'),HMM=False) result = jieba.tokenize(content,HMM=False)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error" assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content assert isinstance(result, list), "Test Tokenize error on content: %s" % content
for tk in result: for tk in result:
print >>sys.stderr, "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
print >> sys.stderr, "testTokenize_NOHMM" print("testTokenize_NOHMM", file=sys.stderr)
def testCutForSearch_NOHMM(self): def testCutForSearch_NOHMM(self):
for content in test_contents: for content in test_contents:
@ -194,8 +198,8 @@ class JiebaTestCase(unittest.TestCase):
assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error" assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
result = list(result) result = list(result)
assert isinstance(result, list), "Test CutForSearch error on content: %s" % content assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
print >> sys.stderr, " , ".join(result) print(" , ".join(result), file=sys.stderr)
print >> sys.stderr, "testCutForSearch_NOHMM" print("testCutForSearch_NOHMM", file=sys.stderr)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
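The test suite replaces `print >> sys.stderr, ...` with `print(..., file=sys.stderr)`, which needs the `print_function` future import on Python 2. A minimal sketch:

```python
from __future__ import print_function   # Python 2 needs this for the file= keyword
import sys

result = ["jieba", "test"]
print(" , ".join(result), file=sys.stderr)   # diagnostics go to stderr on 2.x and 3.x
print("testDefaultCut", file=sys.stderr)
```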


@ -6,7 +6,7 @@ cat abc.txt | python jiebacmd.py | sort | uniq -c | sort -nr -k1 | head -100
''' '''
from __future__ import unicode_literals
import sys import sys
sys.path.append("../") sys.path.append("../")
@ -23,6 +23,6 @@ while True:
break break
line = line.strip() line = line.strip()
for word in jieba.cut(line): for word in jieba.cut(line):
print word.encode(default_encoding) print(word)


@ -14,7 +14,7 @@ opt, args = parser.parse_args()
if len(args) <1: if len(args) <1:
print USAGE print(USAGE)
sys.exit(1) sys.exit(1)
file_name = args[0] file_name = args[0]
@ -29,6 +29,6 @@ content = open(file_name,'rb').read()
tags = jieba.analyse.extract_tags(content,topK=topK) tags = jieba.analyse.extract_tags(content,topK=topK)
print ",".join(tags) print(",".join(tags))
View File
@ -1,4 +1,5 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import print_function
import sys import sys
sys.path.append("../../") sys.path.append("../../")
import jieba import jieba
@ -7,8 +8,8 @@ jieba.enable_parallel(4)
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut(test_sent) result = jieba.cut(test_sent)
for word in result: for word in result:
print word, "/", print(word, "/", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -1,4 +1,5 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import print_function
import sys import sys
sys.path.append("../../") sys.path.append("../../")
import jieba import jieba
@ -7,8 +8,8 @@ jieba.enable_parallel(4)
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut(test_sent,cut_all=True) result = jieba.cut(test_sent,cut_all=True)
for word in result: for word in result:
print word, "/", print(word, "/", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -1,4 +1,5 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import print_function
import sys import sys
sys.path.append("../../") sys.path.append("../../")
import jieba import jieba
@ -7,8 +8,8 @@ jieba.enable_parallel(4)
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut_for_search(test_sent) result = jieba.cut_for_search(test_sent)
for word in result: for word in result:
print word, "/", print(word, "/", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -1,6 +1,5 @@
import urllib2
import sys,time
import sys import sys
import time
sys.path.append("../../") sys.path.append("../../")
import jieba import jieba
@ -17,5 +16,5 @@ tm_cost = t2-t1
log_f = open("1.log","wb") log_f = open("1.log","wb")
log_f.write(words.encode('utf-8')) log_f.write(words.encode('utf-8'))
print 'speed' , len(content)/tm_cost, " bytes/second" print('speed %s bytes/second' % (len(content)/tm_cost))
View File
@ -1,4 +1,5 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import print_function
import sys import sys
sys.path.append("../../") sys.path.append("../../")
import jieba import jieba
@ -8,8 +9,8 @@ import jieba.posseg as pseg
def cuttest(test_sent): def cuttest(test_sent):
result = pseg.cut(test_sent) result = pseg.cut(test_sent)
for w in result: for w in result:
print w.word, "/", w.flag, ", ", print(w.word, "/", w.flag, ", ", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -1,9 +1,10 @@
import urllib2 from __future__ import print_function
import sys,time import sys,time
import sys import sys
sys.path.append("../../") sys.path.append("../../")
import jieba import jieba
import jieba.posseg as pseg import jieba.posseg as pseg
jieba.enable_parallel(4) jieba.enable_parallel(4)
url = sys.argv[1] url = sys.argv[1]
@ -14,9 +15,8 @@ words = list(pseg.cut(content))
t2 = time.time() t2 = time.time()
tm_cost = t2-t1 tm_cost = t2-t1
log_f = open("1.log","wb") log_f = open("1.log","w")
for w in words: log_f.write(' / '.join(map(str, words)))
print >> log_f, w.encode("utf-8"), "/" ,
print 'speed' , len(content)/tm_cost, " bytes/second" print('speed' , len(content)/tm_cost, " bytes/second")
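The rewrite above drops the Python 2-only `print >> log_f` redirection in favour of a plain `write()`. If the log must have a guaranteed encoding on both interpreters, `io.open` with an explicit `encoding` argument is one portable option; a minimal sketch, formatting each `pair` as `word/flag` (the sample sentence is illustrative):

```python
from __future__ import unicode_literals
import io
import jieba.posseg as pseg

words = pseg.cut("我爱北京天安门")

# io.open behaves the same on Python 2 and 3 and pins the file encoding,
# so no manual .encode('utf-8') is needed before writing.
with io.open("1.log", "w", encoding="utf-8") as log_f:
    log_f.write(" / ".join("%s/%s" % (w.word, w.flag) for w in words))
```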
View File
@ -6,7 +6,7 @@ import jieba
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut(test_sent) result = jieba.cut(test_sent)
print " / ".join(result) print(" / ".join(result))
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -5,5 +5,5 @@ import jieba
import jieba.posseg as pseg import jieba.posseg as pseg
words=pseg.cut("又跛又啞") words=pseg.cut("又跛又啞")
for w in words: for w in words:
print w.word,w.flag print(w.word,w.flag)
View File
@ -5,7 +5,7 @@ import jieba
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut(test_sent) result = jieba.cut(test_sent)
print " ".join(result) print(" ".join(result))
def testcase(): def testcase():
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。") cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空我爱北京我爱Python和C++。")
@ -22,6 +22,6 @@ def testcase():
if __name__ == "__main__": if __name__ == "__main__":
testcase() testcase()
jieba.set_dictionary("foobar.txt") jieba.set_dictionary("foobar.txt")
print "================================" print("================================")
testcase() testcase()
View File
@ -6,8 +6,8 @@ import jieba
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut_for_search(test_sent) result = jieba.cut_for_search(test_sent)
for word in result: for word in result:
print word, "/", print(word, "/", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -6,8 +6,8 @@ import jieba
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut(test_sent,cut_all=True) result = jieba.cut(test_sent,cut_all=True)
for word in result: for word in result:
print word, "/", print(word, "/", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -1,4 +1,3 @@
import urllib2
import sys,time import sys,time
import sys import sys
sys.path.append("../") sys.path.append("../")
@ -17,6 +16,6 @@ log_f = open("1.log","wb")
log_f.write(words.encode('utf-8')) log_f.write(words.encode('utf-8'))
log_f.close() log_f.close()
print 'cost',tm_cost print('cost %s' % tm_cost)
print 'speed' , len(content)/tm_cost, " bytes/second" print('speed %s bytes/second' % (len(content)/tm_cost))
View File
@ -8,18 +8,18 @@ import jieba
class Worker(threading.Thread): class Worker(threading.Thread):
def run(self): def run(self):
seg_list = jieba.cut("我来到北京清华大学",cut_all=True) seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
print "Full Mode:" + "/ ".join(seg_list) #全模式 print("Full Mode:" + "/ ".join(seg_list)) #全模式
seg_list = jieba.cut("我来到北京清华大学",cut_all=False) seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
print "Default Mode:" + "/ ".join(seg_list) #默认模式 print("Default Mode:" + "/ ".join(seg_list)) #默认模式
seg_list = jieba.cut("他来到了网易杭研大厦") seg_list = jieba.cut("他来到了网易杭研大厦")
print ", ".join(seg_list) print(", ".join(seg_list))
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式 seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
print ", ".join(seg_list) print(", ".join(seg_list))
workers = [] workers = []
for i in xrange(10): for i in range(10):
worker = Worker() worker = Worker()
workers.append(worker) workers.append(worker)
worker.start() worker.start()
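The hunk above ends at `worker.start()`; the rest of the file lies outside the diff context. As a self-contained reference, a minimal sketch of the thread lifecycle with `range()` replacing the Python 2-only `xrange()` (the thread count and sentence are illustrative):

```python
from __future__ import print_function
import threading
import jieba

class Worker(threading.Thread):
    def run(self):
        seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
        print("Full Mode: " + "/ ".join(seg_list))

# range() exists on both Python 2 and 3, unlike xrange().
workers = [Worker() for _ in range(10)]
for w in workers:
    w.start()
for w in workers:
    w.join()   # wait for every thread before the interpreter exits
```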
View File
@ -6,7 +6,7 @@ import jieba
def cuttest(test_sent): def cuttest(test_sent):
result = jieba.cut(test_sent,HMM=False) result = jieba.cut(test_sent,HMM=False)
print " / ".join(result) print(" / ".join(result))
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -1,4 +1,5 @@
#encoding=utf-8 #encoding=utf-8
from __future__ import print_function
import sys import sys
sys.path.append("../") sys.path.append("../")
import jieba.posseg as pseg import jieba.posseg as pseg
@ -6,8 +7,8 @@ import jieba.posseg as pseg
def cuttest(test_sent): def cuttest(test_sent):
result = pseg.cut(test_sent) result = pseg.cut(test_sent)
for w in result: for w in result:
print w.word, "/", w.flag, ", ", print(w.word, "/", w.flag, ", ", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -1,6 +1,6 @@
import urllib2 from __future__ import print_function
import sys,time
import sys import sys
import time
sys.path.append("../") sys.path.append("../")
import jieba import jieba
jieba.initialize() jieba.initialize()
@ -14,9 +14,8 @@ words = list(pseg.cut(content))
t2 = time.time() t2 = time.time()
tm_cost = t2-t1 tm_cost = t2-t1
log_f = open("1.log","wb") log_f = open("1.log","w")
for w in words: log_f.write(' / '.join(map(str, words)))
print >> log_f, w.encode("utf-8"), "/" ,
print 'speed' , len(content)/tm_cost, " bytes/second" print('speed' , len(content)/tm_cost, " bytes/second")
View File
@ -6,8 +6,8 @@ import jieba.posseg as pseg
def cuttest(test_sent): def cuttest(test_sent):
result = pseg.cut(test_sent,HMM=False) result = pseg.cut(test_sent,HMM=False)
for w in result: for w in result:
print w.word, "/", w.flag, ", ", print(w.word, "/", w.flag, ", ", end=' ')
print "" print("")
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -7,10 +7,9 @@ g_mode="default"
def cuttest(test_sent): def cuttest(test_sent):
global g_mode global g_mode
test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode) result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result: for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -7,10 +7,9 @@ g_mode="default"
def cuttest(test_sent): def cuttest(test_sent):
global g_mode global g_mode
test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode,HMM=False) result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
for tk in result: for tk in result:
print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]) print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -9,19 +9,19 @@ test_sent = "李小福是创新办主任也是云计算方面的专家; 什么
test_sent += "例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型" test_sent += "例如我输入一个带“韩玉赏鉴”的标题在自定义词库中也增加了此词为N类型"
words = jieba.cut(test_sent) words = jieba.cut(test_sent)
for w in words: for w in words:
print w print(w)
result = pseg.cut(test_sent) result = pseg.cut(test_sent)
for w in result: for w in result:
print w.word, "/", w.flag, ", ", print(w.word, "/", w.flag, ", ", end=' ')
print "\n========" print("\n========")
terms = jieba.cut('easy_install is great') terms = jieba.cut('easy_install is great')
for t in terms: for t in terms:
print t print(t)
print '-------------------------' print('-------------------------')
terms = jieba.cut('python 的正则表达式是好用的') terms = jieba.cut('python 的正则表达式是好用的')
for t in terms: for t in terms:
print t print(t)
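The userdict test above assumes a custom dictionary has been loaded before segmentation. A minimal sketch of loading one, assuming a UTF-8 file `userdict.txt` whose lines follow the `word frequency [POS tag]` format (the file name and entries are illustrative):

```python
from __future__ import print_function
import jieba

# userdict.txt, one entry per line, whitespace-separated, e.g.:
#   云计算 5
#   韩玉赏鉴 3 nz
jieba.load_userdict("userdict.txt")

for w in jieba.cut("李小福是创新办主任也是云计算方面的专家"):
    print(w)
```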
View File
@ -1,4 +1,5 @@
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys,os import sys,os
sys.path.append("../") sys.path.append("../")
from whoosh.index import create_in,open_dir from whoosh.index import create_in,open_dir
@ -18,46 +19,46 @@ ix = create_in("tmp", schema) # for create new index
writer = ix.writer() writer = ix.writer()
writer.add_document( writer.add_document(
title=u"document1", title="document1",
path=u"/a", path="/a",
content=u"This is the first document weve added!" content="This is the first document weve added!"
) )
writer.add_document( writer.add_document(
title=u"document2", title="document2",
path=u"/b", path="/b",
content=u"The second one 你 中文测试中文 is even more interesting! 吃水果" content="The second one 你 中文测试中文 is even more interesting! 吃水果"
) )
writer.add_document( writer.add_document(
title=u"document3", title="document3",
path=u"/c", path="/c",
content=u"买水果然后来世博园。" content="买水果然后来世博园。"
) )
writer.add_document( writer.add_document(
title=u"document4", title="document4",
path=u"/c", path="/c",
content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" content="工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
) )
writer.add_document( writer.add_document(
title=u"document4", title="document4",
path=u"/c", path="/c",
content=u"咱俩交换一下吧。" content="咱俩交换一下吧。"
) )
writer.commit() writer.commit()
searcher = ix.searcher() searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema) parser = QueryParser("content", schema=ix.schema)
for keyword in (u"水果世博园",u"",u"first",u"中文",u"交换机",u"交换"): for keyword in ("水果世博园","","first","中文","交换机","交换"):
print "result of ",keyword print("result of ",keyword)
q = parser.parse(keyword) q = parser.parse(keyword)
results = searcher.search(q) results = searcher.search(q)
for hit in results: for hit in results:
print hit.highlights("content") print(hit.highlights("content"))
print "="*10 print("="*10)
for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"): for t in analyzer("我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"):
print t.text print(t.text)
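The Whoosh example above is long mainly because it indexes several documents; the jieba integration itself is a single analyzer passed into the schema. A minimal sketch of that wiring (the index directory, document, and query are illustrative):

```python
from __future__ import print_function, unicode_literals
import os
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer

# A jieba-backed analyzer tokenizes the "content" field at index and query time.
analyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))

if not os.path.exists("tmp"):
    os.mkdir("tmp")
ix = create_in("tmp", schema)

writer = ix.writer()
writer.add_document(title="document1", path="/a", content="买水果然后来世博园。")
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("content", schema=ix.schema).parse("水果")
    for hit in searcher.search(query):
        print(hit.highlights("content"))
```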
View File
@ -1,4 +1,5 @@
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys import sys
import os import os
sys.path.append("../") sys.path.append("../")
@ -23,8 +24,8 @@ with open(file_name,"rb") as inf:
for line in inf: for line in inf:
i+=1 i+=1
writer.add_document( writer.add_document(
title=u"line"+str(i), title="line"+str(i),
path=u"/a", path="/a",
content=line.decode('gbk','ignore') content=line.decode('gbk','ignore')
) )
writer.commit() writer.commit()
@ -32,10 +33,10 @@ writer.commit()
searcher = ix.searcher() searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema) parser = QueryParser("content", schema=ix.schema)
for keyword in (u"水果小姐",u"",u"first",u"中文",u"交换机",u"交换"): for keyword in ("水果小姐","","first","中文","交换机","交换"):
print "result of ",keyword print("result of " + keyword)
q = parser.parse(keyword) q = parser.parse(keyword)
results = searcher.search(q) results = searcher.search(q)
for hit in results: for hit in results:
print hit.highlights("content") print(hit.highlights("content"))
print "="*10 print("="*10)
View File
@ -1,4 +1,5 @@
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import sys import sys
import os import os
sys.path.append("../") sys.path.append("../")
@ -18,10 +19,10 @@ ix = open_dir("tmp")
searcher = ix.searcher() searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema) parser = QueryParser("content", schema=ix.schema)
for keyword in (u"水果小姐",u"",u"first",u"中文",u"交换机",u"交换",u"少林",u"乔峰"): for keyword in ("水果小姐","","first","中文","交换机","交换","少林","乔峰"):
print "result of ",keyword print("result of ",keyword)
q = parser.parse(keyword) q = parser.parse(keyword)
results = searcher.search(q) results = searcher.search(q)
for hit in results: for hit in results:
print hit.highlights("content") print(hit.highlights("content"))
print "="*10 print("="*10)