From f5ca87e0884ec6f013b4f9c4119aa47d44e090fe Mon Sep 17 00:00:00 2001 From: fxsjy Date: Thu, 23 Oct 2014 15:59:08 +0800 Subject: [PATCH] merge change of @fukuball --- README.md | 7 ++++- jieba/analyse/__init__.py | 9 ++++--- test/extract_tags_with_weight.py | 43 +++++++++++++++++++++++++++++++ test/lyric.txt | 44 ++++++++++++++++++++++++++++++++ 4 files changed, 99 insertions(+), 4 deletions(-) create mode 100644 test/extract_tags_with_weight.py create mode 100644 test/lyric.txt diff --git a/README.md b/README.md index 5fbe069..2295f3c 100644 --- a/README.md +++ b/README.md @@ -128,9 +128,10 @@ Output: 功能 3) :关键词提取 ================ -* jieba.analyse.extract_tags(sentence,topK) #需要先 import jieba.analyse +* jieba.analyse.extract_tags(sentence,topK,withWeight) #需要先 import jieba.analyse * setence 为待提取的文本 * topK 为返回几个 TF/IDF 权重最大的关键词,默认值为 20 +* withWeight 为是否一并返回关键词权重值,默认值为 False 代码示例 (关键词提取) @@ -148,6 +149,10 @@ Output: * 自定义语料库示例:https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt * 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py +关键词一并返回关键词权重值示例 + +* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_with_weight.py + 功能 4) : 词性标注 ================ * 标注句子分词后每个词的词性,采用和 ictclas 兼容的标记法 diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py index 4a80d93..61772d5 100755 --- a/jieba/analyse/__init__.py +++ b/jieba/analyse/__init__.py @@ -58,7 +58,7 @@ def set_stop_words(stop_words_path): for line in lines: STOP_WORDS.add(line) -def extract_tags(sentence, topK=20): +def extract_tags(sentence,topK=20,withWeight=False): global STOP_WORDS idf_freq, median_idf = idf_loader.get_idf() @@ -77,6 +77,9 @@ def extract_tags(sentence, topK=20): tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq] st_list = sorted(tf_idf_list, reverse=True) - top_tuples = st_list[:topK] - tags = [a[1] for a in top_tuples] + if withWeight: + tags = st_list[:topK] + else: + top_tuples = st_list[:topK] + tags = [a[1] for a in top_tuples] return tags diff --git a/test/extract_tags_with_weight.py b/test/extract_tags_with_weight.py new file mode 100644 index 0000000..394434f --- /dev/null +++ b/test/extract_tags_with_weight.py @@ -0,0 +1,43 @@ +import sys +sys.path.append('../') + +import jieba +import jieba.analyse +from optparse import OptionParser + +USAGE = "usage: python extract_tags_with_weight.py [file name] -k [top k] -w [with weight=1 or 0]" + +parser = OptionParser(USAGE) +parser.add_option("-k", dest="topK") +parser.add_option("-w", dest="withWeight") +opt, args = parser.parse_args() + + +if len(args) < 1: + print USAGE + sys.exit(1) + +file_name = args[0] + +if opt.topK is None: + topK = 10 +else: + topK = int(opt.topK) + +if opt.withWeight is None: + withWeight = False +else: + if int(opt.withWeight) is 1: + withWeight = True + else: + withWeight = False + +content = open(file_name, 'rb').read() + +tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight) + +if withWeight is True: + for tag in tags: + print "tag: %s\t\t weight: %f" % (tag[1],tag[0]) +else: + print ",".join(tags) diff --git a/test/lyric.txt b/test/lyric.txt new file mode 100644 index 0000000..db2d23b --- /dev/null +++ b/test/lyric.txt @@ -0,0 +1,44 @@ +我沒有心 +我沒有真實的自我 +我只有消瘦的臉孔 +所謂軟弱 +所謂的順從一向是我 +的座右銘 + +而我 +沒有那海洋的寬闊 +我只要熱情的撫摸 +所謂空洞 +所謂不安全感是我 +的墓誌銘 + +而你 +是否和我一般怯懦 +是否和我一般矯作 +和我一般囉唆 + +而你 +是否和我一般退縮 +是否和我一般肌迫 +一般地困惑 + +我沒有力 +我沒有滿腔的熱火 +我只有滿肚的如果 +所謂勇氣 +所謂的認同感是我 +隨便說說 + +而你 +是否和我一般怯懦 +是否和我一般矯作 +是否對你來說 +只是一場遊戲 +雖然沒有把握 + +而你 +是否和我一般退縮 +是否和我一般肌迫 +是否對你來說 +只是逼不得已 +雖然沒有藉口 \ No newline at end of file