https://github.com/fxsjy/jieba.git

commit 7a6caa0c3c (parent fd9f1f2c0e)
port extract_tags, etc. to jieba3k; add auto2to3 script
README.md (29 lines changed)
@@ -156,17 +156,16 @@ jieba.analyse.textrank(raw_text)
来自`__main__`的示例结果:

```
吉林 100.0
欧亚 86.4592606421
置业 55.3262889963
实现 52.0353476663
收入 37.9475518129
增资 35.5042189944
子公司 34.9286032861
全资 30.8154823412
城市 30.6031961172
商业 30.4779050167

吉林 1.0
欧亚 0.864834432786
置业 0.553465925497
实现 0.520660869531
收入 0.379699688954
增资 0.355086023683
子公司 0.349758490263
全资 0.308537396283
城市 0.306103738053
商业 0.304837414946
```
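The first block is the old output, where ranks were scaled by 100; the second is the new output, normalized so the top keyword scores 1.0. A minimal sketch of reproducing the new numbers on the jieba3k branch (illustrative, not part of this diff; it simply mirrors the `__main__` block of jieba/analyse/textrank.py further down):

```python
# Illustrative only - reproduces the normalized output above with the updated API.
import jieba.analyse

s = ("此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。"
     "吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。"
     "2013年,实现营业收入0万元,实现净利润-139.13万元。")

# withWeight=True yields (word, weight) pairs; the top-ranked word now gets 1.0
# instead of 100.0 because the ranks are no longer multiplied by 100.
for word, weight in jieba.analyse.textrank(s, topK=10, withWeight=True):
    print(word, weight)
```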

4) : 词性标注
@@ -344,6 +343,10 @@ https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
作者:falood
地址:https://github.com/falood/exjieba

结巴分词 R 版本
----------------
作者:qinwf
地址:https://github.com/qinwf/jiebaR

系统集成
========
@@ -411,9 +414,9 @@ seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode:", "/ ".join(seg_list)) # 全模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode:", "/ ".join(seg_list)) # 精确模式
print("Default Mode:", "/ ".join(seg_list)) # 默认模式

seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
seg_list = jieba.cut("他来到了网易杭研大厦")
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
jieba/__init__.py
@@ -13,6 +13,7 @@ import random
import threading
from functools import wraps
import logging
from hashlib import md5

DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
@@ -52,12 +53,10 @@ def gen_pfdict(f_name):
raise e
return pfdict, lfreq, ltotal

def initialize(*args):
global pfdict, FREQ, total, min_freq, initialized
if not args:
def initialize(dictionary=None):
global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK
if not dictionary:
dictionary = DICTIONARY
else:
dictionary = args[0]
with DICT_LOCK:
if initialized:
return
@@ -66,13 +65,13 @@ def initialize(*args):
pfdict = None
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

abs_path = os.path.join(_curpath,dictionary)
abs_path = os.path.join(_curpath, dictionary)
logger.debug("Building prefix dict from %s ..." % abs_path)
t1 = time.time()
if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
else: #custom dictionary
cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))
cache_file = os.path.join(tempfile.gettempdir(), "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())

load_from_cache_fail = True
if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
@@ -87,18 +86,18 @@ def initialize(*args):

if load_from_cache_fail:
pfdict,FREQ,total = gen_pfdict(abs_path)
FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.items()]) #normalize
FREQ = dict((k,log(float(v)/total)) for k,v in FREQ.items()) #normalize
min_freq = min(FREQ.values())
logger.debug("Dumping model to file cache %s" % cache_file)
try:
tmp_suffix = "."+str(random.random())
with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
fd, fpath = tempfile.mkstemp()
with os.fdopen(fd, 'wb') as temp_cache_file:
marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
if os.name == 'nt':
from shutil import move as replace_file
else:
replace_file = os.rename
replace_file(cache_file + tmp_suffix, cache_file)
replace_file(fpath, cache_file)
except:
logger.exception("Dump cache file failed.")
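The cache dump above swaps a hand-rolled random temp suffix for `tempfile.mkstemp`, following the usual write-to-temp-then-rename atomic update. A standalone sketch of that pattern (hypothetical payload and file name, not code from this commit):

```python
# Minimal sketch of the write-temp-then-rename pattern used for the cache dump.
# The payload and target path here are made up for illustration.
import marshal
import os
import tempfile

def dump_atomically(payload, target_path):
    fd, tmp_path = tempfile.mkstemp()            # unique temp file, no name races
    with os.fdopen(fd, 'wb') as tmp:
        marshal.dump(payload, tmp)               # write the complete blob first
    if os.name == 'nt':
        from shutil import move as replace_file  # plain rename cannot overwrite on Windows
    else:
        replace_file = os.rename
    replace_file(tmp_path, target_path)          # swap into place only when fully written

dump_atomically({"example": 1}, os.path.join(tempfile.gettempdir(), "jieba.cache.example"))
```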

@@ -136,12 +135,11 @@ def __cut_all(sentence):
old_j = j


def calc(sentence,DAG,idx,route):
def calc(sentence, DAG, idx, route):
N = len(sentence)
route[N] = (0.0, '')
for idx in range(N-1, -1, -1):
candidates = [(FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx]]
route[idx] = max(candidates)
route[idx] = max((FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx])

@require_initialized
def get_DAG(sentence):
@@ -166,7 +164,7 @@ def __cut_DAG_NO_HMM(sentence):
re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
DAG = get_DAG(sentence)
route = {}
calc(sentence, DAG, 0, route=route)
calc(sentence, DAG, 0, route)
x = 0
N = len(sentence)
buf = ''
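The one-line `calc` above is a right-to-left dynamic program: for each position it keeps the best (log-probability, word-end) pair reachable through the sentence DAG. A toy, self-contained sketch of the same recurrence (the frequencies and DAG below are invented, not jieba's real tables):

```python
# Toy illustration of the recurrence in calc(); all numbers here are invented.
sentence = "abcd"
FREQ = {"a": -3.0, "ab": -2.0, "abc": -4.0, "b": -3.5, "c": -3.0, "cd": -2.5, "d": -3.2}
min_freq = -10.0                                   # fallback log-probability
DAG = {0: [0, 1, 2], 1: [1], 2: [2, 3], 3: [3]}    # DAG[i] = candidate word end positions

route = {len(sentence): (0.0, '')}
for idx in range(len(sentence) - 1, -1, -1):
    route[idx] = max((FREQ.get(sentence[idx:x + 1], min_freq) + route[x + 1][0], x)
                     for x in DAG[idx])

print(route[0])   # (-4.5, 1): the best path starts with the two-character word "ab"
```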
jieba/analyse/__init__.py
@@ -1,6 +1,7 @@
#encoding=utf-8
import jieba
import os
from operator import itemgetter
try:
from .analyzer import ChineseAnalyzer
except ImportError:
@@ -26,9 +27,7 @@ class IDFLoader:
if self.path != new_idf_path:
content = open(new_idf_path, 'r', encoding='utf-8').read()
idf_freq = {}
lines = content.split('\n')
if lines and not lines[-1]:
lines.pop(-1)
lines = content.rstrip('\n').split('\n')
for line in lines:
word, freq = line.split(' ')
idf_freq[word] = float(freq)
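`set_new_path` above assumes the IDF file holds one `word IDF` pair per line, separated by a single space; a small sketch of parsing that format (words and values invented for illustration):

```python
# Sketch of the idf.txt line format expected by IDFLoader.set_new_path:
# "<word><space><idf value>" per line; the entries below are made up.
sample = "自然语言 11.85\n分词 9.32\n"

idf_freq = {}
for line in sample.rstrip('\n').split('\n'):
    word, freq = line.split(' ')
    idf_freq[word] = float(freq)

print(idf_freq)   # {'自然语言': 11.85, '分词': 9.32}
```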
@@ -60,27 +59,32 @@ def set_stop_words(stop_words_path):
STOP_WORDS.add(line)

def extract_tags(sentence, topK=20, withWeight=False):
global STOP_WORDS
"""
Extract keywords from sentence using TF-IDF algorithm.
Parameter:
- topK: return how many top keywords. `None` for all possible words.
- withWeight: if True, return a list of (word, weight);
if False, return a list of words.
"""
global STOP_WORDS, idf_loader

idf_freq, median_idf = idf_loader.get_idf()

words = jieba.cut(sentence)
freq = {}
for w in words:
if len(w.strip()) < 2:
continue
if w.lower() in STOP_WORDS:
if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
continue
freq[w] = freq.get(w, 0.0) + 1.0
total = sum(freq.values())
freq = [(k,v/total) for k,v in freq.items()]

tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
st_list = sorted(tf_idf_list, reverse=True)
for k in freq:
freq[k] *= idf_freq.get(k, median_idf) / total

if withWeight:
tags = st_list[:topK]
tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
else:
top_tuples = st_list[:topK]
tags = [a[1] for a in top_tuples]
return tags
tags = sorted(freq, key=freq.__getitem__, reverse=True)
if topK:
return tags[:topK]
else:
return tags
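Going by the new docstring, the reworked `extract_tags` can be exercised as follows (illustrative call, not part of this diff; actual results depend on the loaded dictionary and IDF file):

```python
# Illustrative use of the reworked TF-IDF keyword extraction on jieba3k.
import jieba.analyse

sentence = "我来到北京清华大学"

# Default: a plain list of up to topK keywords, strongest first.
print(jieba.analyse.extract_tags(sentence, topK=5))

# withWeight=True: a list of (word, weight) tuples instead.
for word, weight in jieba.analyse.extract_tags(sentence, topK=5, withWeight=True):
    print(word, weight)
```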
jieba/analyse/textrank.py
@@ -1,9 +1,10 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import jieba.posseg as pseg
import collections
import sys
import collections
from operator import itemgetter
import jieba.posseg as pseg

class UndirectWeightedGraph:
d = 0.85
@@ -41,17 +42,25 @@ class UndirectWeightedGraph:
max_rank = w

for n, w in ws.items():
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0) * 100
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

return ws


def textrank(raw, topk=10):
def textrank(sentence, topK=10, withWeight=False):
"""
Extract keywords from sentence using TextRank algorithm.
Parameter:
- topK: return how many top keywords. `None` for all possible words.
- withWeight: if True, return a list of (word, weight);
if False, return a list of words.
"""
pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
g = UndirectWeightedGraph()
cm = collections.defaultdict(int)
span = 5
words = [x for x in pseg.cut(raw)]
words = list(pseg.cut(sentence))
for i in range(len(words)):
if words[i].flag in pos_filt:
for j in range(i + 1, i + span):
@@ -65,10 +74,16 @@ def textrank(raw, topk=10):
g.addEdge(terms[0], terms[1], w)

nodes_rank = g.rank()
nrs = sorted(nodes_rank.items(), key=lambda x: x[1], reverse=True)
return nrs[:topk]
if withWeight:
tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
return tags[:topK]
else:
return tags

if __name__ == '__main__':
s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
for x, w in textrank(s):
for x, w in textrank(s, withWeight=True):
print(x, w)
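The `__main__` block demonstrates the weighted form; per the new docstring, the default call returns only the words. A short illustrative sketch (abridged sample text, not part of this diff):

```python
# Default textrank call: a plain list of the topK words, no weights attached.
import jieba.analyse

text = "吉林欧亚置业主要经营范围为房地产开发及百货零售等业务"   # shortened sample, illustration only
print(jieba.analyse.textrank(text, topK=5))                   # plain list of words, strongest first
print(jieba.analyse.textrank(text, topK=5, withWeight=True))  # list of (word, weight) pairs
```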
jieba/finalseg/__init__.py
@@ -18,25 +18,22 @@ PrevStatus = {
}

def load_model():
_curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

start_p = {}
abs_path = os.path.join(_curpath, PROB_START_P)
with open(abs_path, mode='rb') as f:
with open(abs_path, 'rb') as f:
start_p = marshal.load(f)
f.closed

trans_p = {}
abs_path = os.path.join(_curpath, PROB_TRANS_P)
with open(abs_path, 'rb') as f:
trans_p = marshal.load(f)
f.closed

emit_p = {}
abs_path = os.path.join(_curpath, PROB_EMIT_P)
with open(abs_path, 'rb') as f:
emit_p = marshal.load(f)
f.closed

return start_p, trans_p, emit_p
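The three probability tables are plain `marshal` dumps, so they round-trip with `marshal.dump`/`marshal.load`; a tiny sketch (file name and values invented for illustration):

```python
# Round-trip sketch of the marshal-serialized probability tables loaded above.
import marshal

start_p = {'B': -0.26, 'M': -3.14e+100, 'E': -3.14e+100, 'S': -1.46}   # made-up values

with open('prob_start_demo.p', 'wb') as f:
    marshal.dump(start_p, f)

with open('prob_start_demo.p', 'rb') as f:
    loaded = marshal.load(f)

assert loaded == start_p
```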
jieba/posseg/__init__.py
@@ -25,27 +25,23 @@ def load_model(f_name, isJython=True):
line = line.decode("utf-8")
word, _, tag = line.split(" ")
result[word] = tag
f.closed
if not isJython:
return result

start_p = {}
abs_path = os.path.join(_curpath, PROB_START_P)
with open(abs_path, mode='rb') as f:
with open(abs_path, 'rb') as f:
start_p = marshal.load(f)
f.closed

trans_p = {}
abs_path = os.path.join(_curpath, PROB_TRANS_P)
with open(abs_path, 'rb') as f:
trans_p = marshal.load(f)
f.closed

emit_p = {}
abs_path = os.path.join(_curpath, PROB_EMIT_P)
with open(abs_path, 'rb') as f:
emit_p = marshal.load(f)
f.closed

state = {}
abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)

jieba/posseg/prob_emit.py (178554 lines changed; diff suppressed because it is too large)

test/2to3.diff (new file, 450 lines)
@@ -0,0 +1,450 @@
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/analyzer.py ../jieba/jieba/analyse/analyzer.py
--- ./jieba/analyse/analyzer.py 2014-11-07 23:07:02.779210408 +0800
+++ ../jieba/jieba/analyse/analyzer.py 2014-11-07 23:07:02.079210422 +0800
@@ -1,4 +1,4 @@
-##encoding=utf-8
+#encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/__init__.py ../jieba/jieba/analyse/__init__.py
--- ./jieba/analyse/__init__.py 2014-11-07 23:07:02.879210406 +0800
+++ ../jieba/jieba/analyse/__init__.py 2014-11-07 23:16:27.171198767 +0800
@@ -25,7 +25,7 @@

def set_new_path(self, new_idf_path):
if self.path != new_idf_path:
- content = open(new_idf_path, 'rb').read().decode('utf-8')
+ content = open(new_idf_path, 'r', encoding='utf-8').read()
idf_freq = {}
lines = content.rstrip('\n').split('\n')
for line in lines:
@@ -81,7 +81,7 @@
freq[k] *= idf_freq.get(k, median_idf) / total

if withWeight:
- tags = sorted(list(freq.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(freq, key=freq.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/analyse/textrank.py ../jieba/jieba/analyse/textrank.py
--- ./jieba/analyse/textrank.py 2014-11-07 23:07:02.827210407 +0800
+++ ../jieba/jieba/analyse/textrank.py 2014-11-07 23:18:22.059196398 +0800
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
@@ -22,12 +22,12 @@
outSum = collections.defaultdict(float)

wsdef = 1.0 / len(self.graph)
- for n, out in list(self.graph.items()):
+ for n, out in self.graph.items():
ws[n] = wsdef
outSum[n] = sum((e[2] for e in out), 0.0)

for x in range(10): # 10 iters
- for n, inedges in list(self.graph.items()):
+ for n, inedges in self.graph.items():
s = 0
for e in inedges:
s += e[2] / outSum[e[1]] * ws[e[1]]
@@ -41,7 +41,7 @@
elif w > max_rank:
max_rank = w

- for n, w in list(ws.items()):
+ for n, w in ws.items():
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)

@@ -70,12 +70,12 @@
continue
cm[(words[i].word, words[j].word)] += 1

- for terms, w in list(cm.items()):
+ for terms, w in cm.items():
g.addEdge(terms[0], terms[1], w)

nodes_rank = g.rank()
if withWeight:
- tags = sorted(list(nodes_rank.items()), key=itemgetter(1), reverse=True)
+ tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
else:
tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
if topK:
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/finalseg/__init__.py ../jieba/jieba/finalseg/__init__.py
--- ./jieba/finalseg/__init__.py 2014-11-07 23:07:03.147210400 +0800
+++ ../jieba/jieba/finalseg/__init__.py 2014-11-07 23:18:43.495195956 +0800
@@ -1,4 +1,3 @@
-
import re
import os
import marshal
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__init__.py ../jieba/jieba/__init__.py
--- ./jieba/__init__.py 2014-11-07 23:07:02.751210408 +0800
+++ ../jieba/jieba/__init__.py 2014-11-07 23:22:34.963191182 +0800
@@ -1,4 +1,3 @@
-
__version__ = '0.34'
__license__ = 'MIT'

@@ -51,7 +50,7 @@
pfdict.add(word[:ch+1])
except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line))
- raise ValueError(e)
+ raise e
return pfdict, lfreq, ltotal

def initialize(dictionary=None):
@@ -78,7 +77,8 @@
if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
logger.debug("Loading model from cache %s" % cache_file)
try:
- pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
+ with open(cache_file, 'rb') as cf:
+ pfdict,FREQ,total,min_freq = marshal.load(cf)
# prevent conflict with old version
load_from_cache_fail = not isinstance(pfdict, set)
except:
@@ -228,11 +228,11 @@
'''The main function that segments an entire sentence that contains
Chinese characters into seperated words.
Parameter:
- - sentence: The str/unicode to be segmented.
+ - sentence: The str to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
'''
- if not isinstance(sentence, str):
+ if isinstance(sentence, bytes):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
@@ -338,8 +338,6 @@
global pool, cut, cut_for_search
if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system")
- if sys.version_info[0]==2 and sys.version_info[1]<6:
- raise Exception("jieba: the parallel feature needs Python version>2.5")
from multiprocessing import Pool, cpu_count
if processnum is None:
processnum = cpu_count()
@@ -392,12 +390,12 @@
def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end)
Parameter:
- - sentence: the unicode to be segmented.
+ - sentence: the str to be segmented.
- mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model.
"""
if not isinstance(unicode_sentence, str):
- raise Exception("jieba: the input parameter should be unicode.")
+ raise Exception("jieba: the input parameter should be str.")
start = 0
if mode == 'default':
for w in cut(unicode_sentence, HMM=HMM):
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/__main__.py ../jieba/jieba/__main__.py
--- ./jieba/__main__.py 2014-11-07 23:07:02.563210412 +0800
+++ ../jieba/jieba/__main__.py 2014-11-07 23:07:02.079210422 +0800
@@ -40,7 +40,7 @@
ln = fp.readline()
while ln:
l = ln.rstrip('\r\n')
- print((delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8')))
+ print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))
ln = fp.readline()

fp.close()
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/__init__.py ../jieba/jieba/posseg/__init__.py
--- ./jieba/posseg/__init__.py 2014-11-07 23:07:03.047210402 +0800
+++ ../jieba/jieba/posseg/__init__.py 2014-11-07 23:19:40.883194772 +0800
@@ -1,4 +1,3 @@
-
import re
import os
from . import viterbi
@@ -18,14 +17,14 @@
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

result = {}
- with open(f_name, "r") as f:
+ with open(f_name, "rb") as f:
for line in f:
line = line.strip()
if not line:
continue
- word, _, tag = line.split(' ')
- result[word.decode('utf-8')] = tag
-
+ line = line.decode("utf-8")
+ word, _, tag = line.split(" ")
+ result[word] = tag
if not isJython:
return result

@@ -46,7 +45,7 @@

state = {}
abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
- with open(abs_path, 'r') as f:
+ with open(abs_path, 'rb') as f:
state = marshal.load(f)
f.closed

diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./jieba/posseg/viterbi.py ../jieba/jieba/posseg/viterbi.py
--- ./jieba/posseg/viterbi.py 2014-11-07 23:07:03.079210402 +0800
+++ ../jieba/jieba/posseg/viterbi.py 2014-11-07 23:07:02.095210422 +0800
@@ -3,14 +3,13 @@
MIN_INF = float("-inf")

def get_top_states(t_state_v, K=4):
- items = list(t_state_v.items())
- topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]
+ topK = sorted(t_state_v.items(), key=operator.itemgetter(1), reverse=True)[:K]
return [x[0] for x in topK]

def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular
mem_path = [{}]
- all_states = list(trans_p.keys())
+ all_states = trans_p.keys()
for y in states.get(obs[0], all_states): #init
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
mem_path[0][y] = ''
@@ -18,9 +17,9 @@
V.append({})
mem_path.append({})
#prev_states = get_top_states(V[t-1])
- prev_states = [x for x in list(mem_path[t-1].keys()) if len(trans_p[x]) > 0]
+ prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]

- prev_states_expect_next = set((y for x in prev_states for y in list(trans_p[x].keys())))
+ prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next

if not obs_states:
@@ -31,7 +30,7 @@
V[t][y] = prob
mem_path[t][y] = state

- last = [(V[-1][y], y) for y in list(mem_path[-1].keys())]
+ last = [(V[-1][y], y) for y in mem_path[-1].keys()]
#if len(last)==0:
#print obs
prob, state = max(last)
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./README.md ../jieba/README.md
--- ./README.md 2014-11-07 23:07:02.067210423 +0800
+++ ../jieba/README.md 2014-11-07 23:24:49.263188412 +0800
@@ -4,6 +4,9 @@
"Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
- _Scroll down for English documentation._

+注意!
+========
+这个branch `jieba3k` 是专门用于Python3.x的版本

特点
========
@@ -68,16 +71,16 @@
import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 精确模式
+print("Default Mode:", "/ ".join(seg_list)) # 精确模式

seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```

输出:
@@ -174,7 +177,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -203,7 +206,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```

```
@@ -219,7 +222,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```

```
@@ -408,16 +411,16 @@
import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+print("Full Mode:", "/ ".join(seg_list)) # 全模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 默认模式
+print("Default Mode:", "/ ".join(seg_list)) # 默认模式

seg_list = jieba.cut("他来到了网易杭研大厦")
-print ", ".join(seg_list)
+print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+print(", ".join(seg_list))
```

Output:
@@ -483,7 +486,7 @@
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
-... print w.word, w.flag
+... print(w.word, w.flag)
...
我 r
爱 v
@@ -512,7 +515,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```

```
@@ -528,7 +531,7 @@
```python
result = jieba.tokenize(u'永和服装饰品有限公司',mode='search')
for tk in result:
- print "word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])
+ print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
```

```
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./setup.py ../jieba/setup.py
--- ./setup.py 2014-11-07 23:07:02.067210423 +0800
+++ ../jieba/setup.py 2014-11-07 23:07:02.095210422 +0800
@@ -1,5 +1,5 @@
from distutils.core import setup
-setup(name='jieba',
+setup(name='jieba3k',
version='0.34',
description='Chinese Words Segementation Utilities',
author='Sun, Junyi',
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/extract_topic.py ../jieba/test/extract_topic.py
--- ./test/extract_topic.py 2014-11-07 23:07:03.707210389 +0800
+++ ../jieba/test/extract_topic.py 2014-11-07 23:07:02.095210422 +0800
@@ -51,13 +51,13 @@
print("training...")

nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
-print(("done in %0.3fs." % (time.time() - t0)))
+print("done in %0.3fs." % (time.time() - t0))

# Inverse the vectorizer vocabulary to be able
feature_names = count_vect.get_feature_names()

for topic_idx, topic in enumerate(nmf.components_):
- print(("Topic #%d:" % topic_idx))
- print((" ".join([feature_names[i]
- for i in topic.argsort()[:-n_top_words - 1:-1]])))
+ print("Topic #%d:" % topic_idx)
+ print(" ".join([feature_names[i]
+ for i in topic.argsort()[:-n_top_words - 1:-1]]))
print("")
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jiebacmd.py ../jieba/test/jiebacmd.py
--- ./test/jiebacmd.py 2014-11-07 23:07:03.211210399 +0800
+++ ../jieba/test/jiebacmd.py 2014-11-07 23:07:02.099210422 +0800
@@ -23,6 +23,6 @@
break
line = line.strip()
for word in jieba.cut(line):
- print(word.encode(default_encoding))
+ print(word)


diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/jieba_test.py ../jieba/test/jieba_test.py
--- ./test/jieba_test.py 2014-11-07 23:07:03.947210384 +0800
+++ ../jieba/test/jieba_test.py 2014-11-07 23:07:02.099210422 +0800
@@ -1,5 +1,6 @@
#-*-coding: utf-8 -*-
import sys
+import imp
sys.path.append("../")
import unittest
import types
@@ -97,7 +98,7 @@

class JiebaTestCase(unittest.TestCase):
def setUp(self):
- reload(jieba)
+ imp.reload(jieba)

def tearDown(self):
pass
@@ -151,7 +152,7 @@

def testTokenize(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'))
+ result = jieba.tokenize(content)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
@@ -180,7 +181,7 @@

def testTokenize_NOHMM(self):
for content in test_contents:
- result = jieba.tokenize(content.decode('utf-8'),HMM=False)
+ result = jieba.tokenize(content,HMM=False)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize_no_hmm.py ../jieba/test/test_tokenize_no_hmm.py
--- ./test/test_tokenize_no_hmm.py 2014-11-07 23:07:04.031210382 +0800
+++ ../jieba/test/test_tokenize_no_hmm.py 2014-11-07 23:07:02.099210422 +0800
@@ -7,7 +7,6 @@

def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
diff -d -r -u '--exclude=.git' '--exclude=prob_*.py' '--exclude=char_state_tab.py' ./test/test_tokenize.py ../jieba/test/test_tokenize.py
--- ./test/test_tokenize.py 2014-11-07 23:07:04.071210381 +0800
+++ ../jieba/test/test_tokenize.py 2014-11-07 23:07:02.099210422 +0800
@@ -7,7 +7,6 @@

def cuttest(test_sent):
global g_mode
- test_sent = test_sent.decode('utf-8')
result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
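Taken together, the patch pins down two Python 3 conventions that recur above: `jieba.cut`/`jieba.tokenize` take `str` (bytes are decoded as UTF-8), and dictionary views are passed straight to `sorted()` or loops without a `list()` wrapper. A quick illustrative check (not part of the patch):

```python
# Quick check of the Python 3 conventions the patch settles on (illustrative only).
import jieba

# tokenize() takes a str and yields (word, start, end) tuples.
for word, start, end in jieba.tokenize("永和服装饰品有限公司"):
    print("word %s\t\t start: %d \t\t end:%d" % (word, start, end))

# Dict views go straight into sorted() - no list() wrapper needed on Python 3.
freq = {"永和": 2, "服装": 1, "饰品": 1}
print(sorted(freq.items(), key=lambda kv: kv[1], reverse=True))
```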
test/auto2to3 (new executable file, 30 lines)
@@ -0,0 +1,30 @@
#!/bin/bash
# Set 2to3 path.
PYTHON2TO3=2to3
# Copy the python2 version.
echo Jieba 2to3 manual conversion tool
echo
if ! git rev-parse; then
exit 1
fi
echo Copying working directory to ../jieba2
if [ -d ../jieba2 ]; then
echo Found existing ../jieba2
read -p "Replace it with new one? (y/n) " -r
if ! [[ $REPLY =~ ^[Yy]$ ]]; then
echo Cancelled.
exit
else
rm -rf ../jieba2
fi
fi
git checkout master
cp -r . ../jieba2
git checkout jieba3k
cd ../jieba2
# Here starts auto conversion.
echo Converting jieba2 to Python3 ...
find . -type f -name '*.py' \! -path '*/build/*' \! -name 'prob_*.py' \! -name 'char_state_tab.py' -exec $PYTHON2TO3 -w -n {} +
find . -type f \! -path '*/build/*' -a \( -name 'prob_*.py' -o -name 'char_state_tab.py' \) -exec sed -i "s/u'\\\u/'\\\u/g" {} \;
patch -p0 <2to3.diff
echo Done. Compare jieba and jieba2 to manually port.
@@ -38,6 +38,6 @@ tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)

if withWeight is True:
for tag in tags:
print("tag: %s\t\t weight: %f" % (tag[1],tag[0]))
print("tag: %s\t\t weight: %f" % (tag[0],tag[1]))
else:
print(",".join(tags))