From f5ca87e0884ec6f013b4f9c4119aa47d44e090fe Mon Sep 17 00:00:00 2001
From: fxsjy <ccnusjy@gmail.com>
Date: Thu, 23 Oct 2014 15:59:08 +0800
Subject: [PATCH] merge change of @fukuball

---
 README.md                        |  7 ++++-
 jieba/analyse/__init__.py        |  9 ++++---
 test/extract_tags_with_weight.py | 43 +++++++++++++++++++++++++++++++
 test/lyric.txt                   | 44 ++++++++++++++++++++++++++++++++
 4 files changed, 99 insertions(+), 4 deletions(-)
 create mode 100644 test/extract_tags_with_weight.py
 create mode 100644 test/lyric.txt

diff --git a/README.md b/README.md
index 5fbe069..2295f3c 100644
--- a/README.md
+++ b/README.md
@@ -128,9 +128,10 @@ Output:
 
 功能 3) ：关键词提取
 ================
-* jieba.analyse.extract_tags(sentence,topK) #需要先 import jieba.analyse
+* jieba.analyse.extract_tags(sentence,topK,withWeight) #需要先 import jieba.analyse
 * setence 为待提取的文本
 * topK 为返回几个 TF/IDF 权重最大的关键词，默认值为 20
+* withWeight 为是否一并返回关键词权重值，默认值为 False
 
 代码示例 （关键词提取）
 
@@ -148,6 +149,10 @@ Output:
 * 自定义语料库示例：https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt
 * 用法示例：https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py
 
+关键词提取时一并返回权重值的示例
+
+* 用法示例：https://github.com/fxsjy/jieba/blob/master/test/extract_tags_with_weight.py
+
 功能 4) : 词性标注
 ================
 * 标注句子分词后每个词的词性，采用和 ictclas 兼容的标记法
diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py
index 4a80d93..61772d5 100755
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -58,7 +58,7 @@ def set_stop_words(stop_words_path):
     for line in lines:
         STOP_WORDS.add(line)
 
-def extract_tags(sentence, topK=20):
+def extract_tags(sentence,topK=20,withWeight=False):
     global STOP_WORDS
 
     idf_freq, median_idf = idf_loader.get_idf()
@@ -77,6 +77,9 @@ def extract_tags(sentence, topK=20):
     tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
     st_list = sorted(tf_idf_list, reverse=True)
 
-    top_tuples = st_list[:topK]
-    tags = [a[1] for a in top_tuples]
+    if withWeight:
+        tags = st_list[:topK]
+    else:
+        top_tuples = st_list[:topK]
+        tags = [a[1] for a in top_tuples]
     return tags
diff --git a/test/extract_tags_with_weight.py b/test/extract_tags_with_weight.py
new file mode 100644
index 0000000..394434f
--- /dev/null
+++ b/test/extract_tags_with_weight.py
@@ -0,0 +1,43 @@
+"""Print the top TF-IDF keywords of a text file, optionally with weights."""
+import sys
+sys.path.append('../')
+
+import jieba
+import jieba.analyse
+from optparse import OptionParser
+
+USAGE = "usage:    python extract_tags_with_weight.py [file name] -k [top k] -w [with weight=1 or 0]"
+
+# Option values arrive as strings (or None when omitted); converted below.
+parser = OptionParser(USAGE)
+parser.add_option("-k", dest="topK")
+parser.add_option("-w", dest="withWeight")
+opt, args = parser.parse_args()
+
+# The input file name is the only required positional argument.
+if len(args) < 1:
+    print USAGE
+    sys.exit(1)
+
+file_name = args[0]
+
+# Default to the top 10 keywords when -k is not given.
+topK = 10 if opt.topK is None else int(opt.topK)
+
+# Weights are printed only for "-w 1". Compare by value (==), not identity:
+# "is 1" only works because CPython happens to cache small integers.
+withWeight = opt.withWeight is not None and int(opt.withWeight) == 1
+
+# Read raw bytes; the with-block closes the file handle deterministically.
+with open(file_name, 'rb') as f:
+    content = f.read()
+
+tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)
+
+if withWeight:
+    # With weights, each entry is indexed as (weight, word).
+    for tag in tags:
+        print "tag: %s\t\t weight: %f" % (tag[1],tag[0])
+else:
+    # Without weights, tags is a list of words.
+    print ",".join(tags)
diff --git a/test/lyric.txt b/test/lyric.txt
new file mode 100644
index 0000000..db2d23b
--- /dev/null
+++ b/test/lyric.txt
@@ -0,0 +1,44 @@
+我沒有心
+我沒有真實的自我
+我只有消瘦的臉孔
+所謂軟弱
+所謂的順從一向是我
+的座右銘
+
+而我
+沒有那海洋的寬闊
+我只要熱情的撫摸
+所謂空洞
+所謂不安全感是我
+的墓誌銘
+
+而你
+是否和我一般怯懦
+是否和我一般矯作
+和我一般囉唆
+
+而你
+是否和我一般退縮
+是否和我一般肌迫
+一般地困惑
+
+我沒有力
+我沒有滿腔的熱火
+我只有滿肚的如果
+所謂勇氣
+所謂的認同感是我
+隨便說說
+
+而你
+是否和我一般怯懦
+是否和我一般矯作
+是否對你來說
+只是一場遊戲
+雖然沒有把握
+
+而你
+是否和我一般退縮
+是否和我一般肌迫
+是否對你來說
+只是逼不得已
+雖然沒有藉口
\ No newline at end of file