merge change of @fukuball

2025-07-10 00:01:33 +08:00 · 2014-10-23 15:59:08 +08:00 · 2014-10-23 15:59:08 +08:00 · f5ca87e088
commit f5ca87e088
parent 10b86e90fb
4 changed files with 99 additions and 4 deletions
--- a/README.md
+++ b/README.md
@ -128,9 +128,10 @@ Output:
 功能 3) ：关键词提取
 ================
-* jieba.analyse.extract_tags(sentence,topK) #需要先 import jieba.analyse
+* jieba.analyse.extract_tags(sentence,topK,withWeight) #需要先 import jieba.analyse
 * sentence 为待提取的文本
 * topK 为返回几个 TF/IDF 权重最大的关键词，默认值为 20
 * withWeight 为是否一并返回关键词权重值，默认值为 False
 代码示例 （关键词提取）
@ -148,6 +149,10 @@ Output:
 * 自定义语料库示例：https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt
 * 用法示例：https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py
 关键词一并返回关键词权重值示例
 * 用法示例：https://github.com/fxsjy/jieba/blob/master/test/extract_tags_with_weight.py
 功能 4) : 词性标注
 ================
 * 标注句子分词后每个词的词性，采用和 ictclas 兼容的标记法
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@ -58,7 +58,7 @@ def set_stop_words(stop_words_path):
    for line in lines:
        STOP_WORDS.add(line)
-def extract_tags(sentence, topK=20):
+def extract_tags(sentence,topK=20,withWeight=False):
    global STOP_WORDS
    idf_freq, median_idf = idf_loader.get_idf()
@ -77,6 +77,9 @@ def extract_tags(sentence, topK=20):
    tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
    st_list = sorted(tf_idf_list, reverse=True)
-    top_tuples = st_list[:topK]
+    if withWeight:
-    tags = [a[1] for a in top_tuples]
+        tags = st_list[:topK]
    else:
        top_tuples = st_list[:topK]
        tags = [a[1] for a in top_tuples]
    return tags
--- a/test/extract_tags_with_weight.py
+++ b/test/extract_tags_with_weight.py
@ -0,0 +1,43 @@
 # Demo script: print the top-K keywords of a text file using
 # jieba.analyse.extract_tags, optionally together with each keyword's weight.
 #
 #   python extract_tags_with_weight.py [file name] -k [top k] -w [with weight=1 or 0]
 import sys
 # Look for the jieba package in the parent directory.
 sys.path.append('../')
 import jieba
 import jieba.analyse
 from optparse import OptionParser
 USAGE = "usage:    python extract_tags_with_weight.py [file name] -k [top k] -w [with weight=1 or 0]"
 parser = OptionParser(USAGE)
 parser.add_option("-k", dest="topK")
 parser.add_option("-w", dest="withWeight")
 opt, args = parser.parse_args()
 # The input file name is the only required positional argument.
 if len(args) < 1:
     print(USAGE)
     sys.exit(1)
 file_name = args[0]
 # -k falls back to 10 keywords when it is not given.
 topK = 10 if opt.topK is None else int(opt.topK)
 # -w turns weights on only when it parses to exactly 1. Compare by value
 # (==), not identity (is): identity of ints is an interpreter detail.
 withWeight = opt.withWeight is not None and int(opt.withWeight) == 1
 # Read the whole file as raw bytes; the with-block guarantees the handle is
 # closed even if reading fails.
 with open(file_name, 'rb') as input_file:
     content = input_file.read()
 tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)
 if withWeight:
     # With weights enabled each entry is indexed as (weight, tag).
     for tag in tags:
         print("tag: %s\t\t weight: %f" % (tag[1], tag[0]))
 else:
     print(",".join(tags))
--- a/test/lyric.txt
+++ b/test/lyric.txt
@ -0,0 +1,44 @@
 我沒有心
 我沒有真實的自我
 我只有消瘦的臉孔
 所謂軟弱
 所謂的順從一向是我
 的座右銘
 而我
 沒有那海洋的寬闊
 我只要熱情的撫摸
 所謂空洞
 所謂不安全感是我
 的墓誌銘
 而你
 是否和我一般怯懦
 是否和我一般矯作
 和我一般囉唆
 而你
 是否和我一般退縮
 是否和我一般肌迫
 一般地困惑
 我沒有力
 我沒有滿腔的熱火
 我只有滿肚的如果
 所謂勇氣
 所謂的認同感是我
 隨便說說
 而你
 是否和我一般怯懦
 是否和我一般矯作
 是否對你來說
 只是一場遊戲
 雖然沒有把握
 而你
 是否和我一般退縮
 是否和我一般肌迫
 是否對你來說
 只是逼不得已
 雖然沒有藉口