mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
merge change of @fukuball
This commit is contained in:
parent
10b86e90fb
commit
f5ca87e088
@ -128,9 +128,10 @@ Output:
|
|||||||
|
|
||||||
功能 3) :关键词提取
|
功能 3) :关键词提取
|
||||||
================
|
================
|
||||||
* jieba.analyse.extract_tags(sentence,topK) #需要先 import jieba.analyse
|
* jieba.analyse.extract_tags(sentence,topK,withWeight) #需要先 import jieba.analyse
|
||||||
* sentence 为待提取的文本
|
* sentence 为待提取的文本
|
||||||
* topK 为返回几个 TF/IDF 权重最大的关键词,默认值为 20
|
* topK 为返回几个 TF/IDF 权重最大的关键词,默认值为 20
|
||||||
|
* withWeight 为是否一并返回关键词权重值,默认值为 False
|
||||||
|
|
||||||
代码示例 (关键词提取)
|
代码示例 (关键词提取)
|
||||||
|
|
||||||
@ -148,6 +149,10 @@ Output:
|
|||||||
* 自定义语料库示例:https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt
|
* 自定义语料库示例:https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt
|
||||||
* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py
|
* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py
|
||||||
|
|
||||||
|
关键词提取时一并返回关键词权重值的示例
|
||||||
|
|
||||||
|
* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_with_weight.py
|
||||||
|
|
||||||
功能 4) : 词性标注
|
功能 4) : 词性标注
|
||||||
================
|
================
|
||||||
* 标注句子分词后每个词的词性,采用和 ictclas 兼容的标记法
|
* 标注句子分词后每个词的词性,采用和 ictclas 兼容的标记法
|
||||||
|
@ -58,7 +58,7 @@ def set_stop_words(stop_words_path):
|
|||||||
for line in lines:
|
for line in lines:
|
||||||
STOP_WORDS.add(line)
|
STOP_WORDS.add(line)
|
||||||
|
|
||||||
def extract_tags(sentence, topK=20):
|
def extract_tags(sentence,topK=20,withWeight=False):
|
||||||
global STOP_WORDS
|
global STOP_WORDS
|
||||||
|
|
||||||
idf_freq, median_idf = idf_loader.get_idf()
|
idf_freq, median_idf = idf_loader.get_idf()
|
||||||
@ -77,6 +77,9 @@ def extract_tags(sentence, topK=20):
|
|||||||
tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
|
tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
|
||||||
st_list = sorted(tf_idf_list, reverse=True)
|
st_list = sorted(tf_idf_list, reverse=True)
|
||||||
|
|
||||||
top_tuples = st_list[:topK]
|
if withWeight:
|
||||||
tags = [a[1] for a in top_tuples]
|
tags = st_list[:topK]
|
||||||
|
else:
|
||||||
|
top_tuples = st_list[:topK]
|
||||||
|
tags = [a[1] for a in top_tuples]
|
||||||
return tags
|
return tags
|
||||||
|
43
test/extract_tags_with_weight.py
Normal file
43
test/extract_tags_with_weight.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
"""Print the top-K TF-IDF keywords of a text file, optionally with weights.

Usage:
    python extract_tags_with_weight.py [file name] -k [top k] -w [with weight=1 or 0]

Options:
    -k  number of keywords to print (10 when omitted)
    -w  pass 1 to print each keyword together with its weight; any other
        integer, or omitting the flag, prints the keywords only
"""
import sys
# Make the in-repo jieba package importable when run from the test directory.
sys.path.append('../')

import jieba
import jieba.analyse
from optparse import OptionParser

USAGE = "usage: python extract_tags_with_weight.py [file name] -k [top k] -w [with weight=1 or 0]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
parser.add_option("-w", dest="withWeight")
opt, args = parser.parse_args()

# The input file is the only required positional argument.
if len(args) < 1:
    print(USAGE)
    sys.exit(1)

file_name = args[0]

topK = 10 if opt.topK is None else int(opt.topK)

# Compare by value: `is 1` only worked through CPython's small-int caching.
withWeight = opt.withWeight is not None and int(opt.withWeight) == 1

# Read raw bytes and close the handle deterministically.
with open(file_name, 'rb') as input_file:
    content = input_file.read()

tags = jieba.analyse.extract_tags(content, topK=topK, withWeight=withWeight)

if withWeight:
    # With weights, each item is indexed as (weight, word).
    for tag in tags:
        print("tag: %s\t\t weight: %f" % (tag[1], tag[0]))
else:
    print(",".join(tags))
|
44
test/lyric.txt
Normal file
44
test/lyric.txt
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
我沒有心
|
||||||
|
我沒有真實的自我
|
||||||
|
我只有消瘦的臉孔
|
||||||
|
所謂軟弱
|
||||||
|
所謂的順從一向是我
|
||||||
|
的座右銘
|
||||||
|
|
||||||
|
而我
|
||||||
|
沒有那海洋的寬闊
|
||||||
|
我只要熱情的撫摸
|
||||||
|
所謂空洞
|
||||||
|
所謂不安全感是我
|
||||||
|
的墓誌銘
|
||||||
|
|
||||||
|
而你
|
||||||
|
是否和我一般怯懦
|
||||||
|
是否和我一般矯作
|
||||||
|
和我一般囉唆
|
||||||
|
|
||||||
|
而你
|
||||||
|
是否和我一般退縮
|
||||||
|
是否和我一般肌迫
|
||||||
|
一般地困惑
|
||||||
|
|
||||||
|
我沒有力
|
||||||
|
我沒有滿腔的熱火
|
||||||
|
我只有滿肚的如果
|
||||||
|
所謂勇氣
|
||||||
|
所謂的認同感是我
|
||||||
|
隨便說說
|
||||||
|
|
||||||
|
而你
|
||||||
|
是否和我一般怯懦
|
||||||
|
是否和我一般矯作
|
||||||
|
是否對你來說
|
||||||
|
只是一場遊戲
|
||||||
|
雖然沒有把握
|
||||||
|
|
||||||
|
而你
|
||||||
|
是否和我一般退縮
|
||||||
|
是否和我一般肌迫
|
||||||
|
是否對你來說
|
||||||
|
只是逼不得已
|
||||||
|
雖然沒有藉口
|
Loading…
x
Reference in New Issue
Block a user