merge change of @fukuball

This commit is contained in:
fxsjy 2014-10-23 15:59:08 +08:00
parent 10b86e90fb
commit f5ca87e088
4 changed files with 99 additions and 4 deletions

View File

@ -128,9 +128,10 @@ Output:
功能 3) :关键词提取 功能 3) :关键词提取
================ ================
* jieba.analyse.extract_tags(sentence,topK) #需要先 import jieba.analyse * jieba.analyse.extract_tags(sentence,topK,withWeight) #需要先 import jieba.analyse
* sentence 为待提取的文本 * sentence 为待提取的文本
* topK 为返回几个 TF/IDF 权重最大的关键词,默认值为 20 * topK 为返回几个 TF/IDF 权重最大的关键词,默认值为 20
* withWeight 为是否一并返回关键词权重值,默认值为 False
代码示例 (关键词提取) 代码示例 (关键词提取)
@ -148,6 +149,10 @@ Output:
* 自定义语料库示例:https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt * 自定义语料库示例:https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt
* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py * 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py
关键词提取时一并返回关键词权重值的示例
* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_with_weight.py
功能 4) : 词性标注 功能 4) : 词性标注
================ ================
* 标注句子分词后每个词的词性,采用和 ictclas 兼容的标记法 * 标注句子分词后每个词的词性,采用和 ictclas 兼容的标记法

View File

@ -58,7 +58,7 @@ def set_stop_words(stop_words_path):
for line in lines: for line in lines:
STOP_WORDS.add(line) STOP_WORDS.add(line)
def extract_tags(sentence, topK=20): def extract_tags(sentence,topK=20,withWeight=False):
global STOP_WORDS global STOP_WORDS
idf_freq, median_idf = idf_loader.get_idf() idf_freq, median_idf = idf_loader.get_idf()
@ -77,6 +77,9 @@ def extract_tags(sentence, topK=20):
tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq] tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
st_list = sorted(tf_idf_list, reverse=True) st_list = sorted(tf_idf_list, reverse=True)
top_tuples = st_list[:topK] if withWeight:
tags = [a[1] for a in top_tuples] tags = st_list[:topK]
else:
top_tuples = st_list[:topK]
tags = [a[1] for a in top_tuples]
return tags return tags

View File

@ -0,0 +1,43 @@
import sys
sys.path.append('../')
import jieba
import jieba.analyse
from optparse import OptionParser
# Command-line demo: print the top-K keywords that
# jieba.analyse.extract_tags() finds in a text file, optionally together
# with the weight of each keyword.
USAGE = "usage: python extract_tags_with_weight.py [file name] -k [top k] -w [with weight=1 or 0]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
parser.add_option("-w", dest="withWeight")
opt, args = parser.parse_args()

# The input file name is the only required positional argument.
if len(args) < 1:
    print(USAGE)
    sys.exit(1)

file_name = args[0]

# -k falls back to 10 keywords when the option is omitted.
topK = 10 if opt.topK is None else int(opt.topK)

# Weights are shown only for "-w 1". The value is compared with ==, not
# `is`: identity of equal integers is an interpreter implementation detail.
withWeight = opt.withWeight is not None and int(opt.withWeight) == 1

# Read the raw bytes and close the file handle deterministically.
with open(file_name, 'rb') as input_file:
    content = input_file.read()

tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)

if withWeight:
    # With weights requested, each entry is indexed as a pair:
    # tag[0] is printed as the weight and tag[1] as the keyword.
    for tag in tags:
        print("tag: %s\t\t weight: %f" % (tag[1], tag[0]))
else:
    print(",".join(tags))

44
test/lyric.txt Normal file
View File

@ -0,0 +1,44 @@
我沒有心
我沒有真實的自我
我只有消瘦的臉孔
所謂軟弱
所謂的順從一向是我
的座右銘
而我
沒有那海洋的寬闊
我只要熱情的撫摸
所謂空洞
所謂不安全感是我
的墓誌銘
而你
是否和我一般怯懦
是否和我一般矯作
和我一般囉唆
而你
是否和我一般退縮
是否和我一般肌迫
一般地困惑
我沒有力
我沒有滿腔的熱火
我只有滿肚的如果
所謂勇氣
所謂的認同感是我
隨便說說
而你
是否和我一般怯懦
是否和我一般矯作
是否對你來說
只是一場遊戲
雖然沒有把握
而你
是否和我一般退縮
是否和我一般肌迫
是否對你來說
只是逼不得已
雖然沒有藉口