merge change of @fukuball

2025-07-10 00:01:33 +08:00 · 2014-10-23 15:59:08 +08:00 · 2014-10-23 15:59:08 +08:00 · f5ca87e088
commit f5ca87e088
parent 10b86e90fb
4 changed files with 99 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -128,9 +128,10 @@ Output:

 功能 3) ：关键词提取
 ================
-* jieba.analyse.extract_tags(sentence,topK) #需要先 import jieba.analyse
+* jieba.analyse.extract_tags(sentence,topK,withWeight) #需要先 import jieba.analyse
 * setence 为待提取的文本
 * topK 为返回几个 TF/IDF 权重最大的关键词，默认值为 20
+* withWeight 为是否一并返回关键词权重值，默认值为 False

 代码示例 （关键词提取）

@@ -148,6 +149,10 @@ Output:
 * 自定义语料库示例：https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt
 * 用法示例：https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py

+关键词一并返回关键词权重值示例
+
+* 用法示例：https://github.com/fxsjy/jieba/blob/master/test/extract_tags_with_weight.py
+
 功能 4) : 词性标注
 ================
 * 标注句子分词后每个词的词性，采用和 ictclas 兼容的标记法
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -58,7 +58,7 @@ def set_stop_words(stop_words_path):
    for line in lines:
        STOP_WORDS.add(line)

-def extract_tags(sentence, topK=20):
+def extract_tags(sentence,topK=20,withWeight=False):
    global STOP_WORDS

    idf_freq, median_idf = idf_loader.get_idf()
@@ -77,6 +77,9 @@ def extract_tags(sentence, topK=20):
    tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
    st_list = sorted(tf_idf_list, reverse=True)

+    if withWeight:
+        tags = st_list[:topK]
+    else:
        top_tuples = st_list[:topK]
        tags = [a[1] for a in top_tuples]
    return tags
--- a/test/extract_tags_with_weight.py
+++ b/test/extract_tags_with_weight.py
@@ -0,0 +1,43 @@
+import sys
+sys.path.append('../')
+
+import jieba
+import jieba.analyse
+from optparse import OptionParser
+
+USAGE = "usage:    python extract_tags_with_weight.py [file name] -k [top k] -w [with weight=1 or 0]"
+
+parser = OptionParser(USAGE)
+parser.add_option("-k", dest="topK")
+parser.add_option("-w", dest="withWeight")
+opt, args = parser.parse_args()
+
+
+if len(args) < 1:
+    print USAGE
+    sys.exit(1)
+
+file_name = args[0]
+
+if opt.topK is None:
+    topK = 10
+else:
+    topK = int(opt.topK)
+
+if opt.withWeight is None:
+    withWeight = False
+else:
+    if int(opt.withWeight) is 1:
+        withWeight = True
+    else:
+        withWeight = False
+
+content = open(file_name, 'rb').read()
+
+tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)
+
+if withWeight is True:
+    for tag in tags:
+        print "tag: %s\t\t weight: %f" % (tag[1],tag[0])
+else:
+    print ",".join(tags)
--- a/test/lyric.txt
+++ b/test/lyric.txt
@@ -0,0 +1,44 @@
+我沒有心
+我沒有真實的自我
+我只有消瘦的臉孔
+所謂軟弱
+所謂的順從一向是我
+的座右銘
+
+而我
+沒有那海洋的寬闊
+我只要熱情的撫摸
+所謂空洞
+所謂不安全感是我
+的墓誌銘
+
+而你
+是否和我一般怯懦
+是否和我一般矯作
+和我一般囉唆
+
+而你
+是否和我一般退縮
+是否和我一般肌迫
+一般地困惑
+
+我沒有力
+我沒有滿腔的熱火
+我只有滿肚的如果
+所謂勇氣
+所謂的認同感是我
+隨便說說
+
+而你
+是否和我一般怯懦
+是否和我一般矯作
+是否對你來說
+只是一場遊戲
+雖然沒有把握
+
+而你
+是否和我一般退縮
+是否和我一般肌迫
+是否對你來說
+只是逼不得已
+雖然沒有藉口