mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
merge change of @fukuball
This commit is contained in:
parent
10b86e90fb
commit
f5ca87e088
@ -128,9 +128,10 @@ Output:
|
||||
|
||||
功能 3) :关键词提取
|
||||
================
|
||||
* jieba.analyse.extract_tags(sentence,topK) #需要先 import jieba.analyse
|
||||
* jieba.analyse.extract_tags(sentence,topK,withWeight) #需要先 import jieba.analyse
|
||||
* sentence 为待提取的文本
|
||||
* topK 为返回几个 TF/IDF 权重最大的关键词,默认值为 20
|
||||
* withWeight 为是否一并返回关键词权重值,默认值为 False
|
||||
|
||||
代码示例 (关键词提取)
|
||||
|
||||
@ -148,6 +149,10 @@ Output:
|
||||
* 自定义语料库示例:https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt
|
||||
* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py
|
||||
|
||||
关键词提取并一并返回权重值示例
|
||||
|
||||
* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_with_weight.py
|
||||
|
||||
功能 4) : 词性标注
|
||||
================
|
||||
* 标注句子分词后每个词的词性,采用和 ictclas 兼容的标记法
|
||||
|
@ -58,7 +58,7 @@ def set_stop_words(stop_words_path):
|
||||
for line in lines:
|
||||
STOP_WORDS.add(line)
|
||||
|
||||
def extract_tags(sentence, topK=20):
|
||||
def extract_tags(sentence,topK=20,withWeight=False):
|
||||
global STOP_WORDS
|
||||
|
||||
idf_freq, median_idf = idf_loader.get_idf()
|
||||
@ -77,6 +77,9 @@ def extract_tags(sentence, topK=20):
|
||||
tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
|
||||
st_list = sorted(tf_idf_list, reverse=True)
|
||||
|
||||
if withWeight:
|
||||
tags = st_list[:topK]
|
||||
else:
|
||||
top_tuples = st_list[:topK]
|
||||
tags = [a[1] for a in top_tuples]
|
||||
return tags
|
||||
|
43
test/extract_tags_with_weight.py
Normal file
43
test/extract_tags_with_weight.py
Normal file
@ -0,0 +1,43 @@
|
||||
import sys
|
||||
sys.path.append('../')
|
||||
|
||||
import jieba
|
||||
import jieba.analyse
|
||||
from optparse import OptionParser
|
||||
|
||||
USAGE = "usage: python extract_tags_with_weight.py [file name] -k [top k] -w [with weight=1 or 0]"

# Command-line interface: one positional file name plus optional -k and -w.
parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
parser.add_option("-w", dest="withWeight")
opt, args = parser.parse_args()

# The input file name is mandatory; show the usage line and fail otherwise.
if len(args) < 1:
    print(USAGE)
    sys.exit(1)

file_name = args[0]

# Number of keywords to request; falls back to 10 when -k is not given.
topK = 10 if opt.topK is None else int(opt.topK)

# Weights are enabled only by "-w 1"; omitting -w or passing any other
# integer disables them. Compare by value (==), not identity: "is 1" only
# happens to work because CPython caches small integers.
withWeight = opt.withWeight is not None and int(opt.withWeight) == 1

# Read the whole file as raw bytes and make sure the handle is closed.
with open(file_name, 'rb') as input_file:
    content = input_file.read()

tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)

if withWeight:
    # With weights, each entry is a (weight, word) tuple.
    for tag in tags:
        print("tag: %s\t\t weight: %f" % (tag[1], tag[0]))
else:
    # Without weights, entries are the keywords themselves.
    print(",".join(tags))
|
44
test/lyric.txt
Normal file
44
test/lyric.txt
Normal file
@ -0,0 +1,44 @@
|
||||
我沒有心
|
||||
我沒有真實的自我
|
||||
我只有消瘦的臉孔
|
||||
所謂軟弱
|
||||
所謂的順從一向是我
|
||||
的座右銘
|
||||
|
||||
而我
|
||||
沒有那海洋的寬闊
|
||||
我只要熱情的撫摸
|
||||
所謂空洞
|
||||
所謂不安全感是我
|
||||
的墓誌銘
|
||||
|
||||
而你
|
||||
是否和我一般怯懦
|
||||
是否和我一般矯作
|
||||
和我一般囉唆
|
||||
|
||||
而你
|
||||
是否和我一般退縮
|
||||
是否和我一般肌迫
|
||||
一般地困惑
|
||||
|
||||
我沒有力
|
||||
我沒有滿腔的熱火
|
||||
我只有滿肚的如果
|
||||
所謂勇氣
|
||||
所謂的認同感是我
|
||||
隨便說說
|
||||
|
||||
而你
|
||||
是否和我一般怯懦
|
||||
是否和我一般矯作
|
||||
是否對你來說
|
||||
只是一場遊戲
|
||||
雖然沒有把握
|
||||
|
||||
而你
|
||||
是否和我一般退縮
|
||||
是否和我一般肌迫
|
||||
是否對你來說
|
||||
只是逼不得已
|
||||
雖然沒有藉口
|
Loading…
x
Reference in New Issue
Block a user