mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
new feature: tag extraction
This commit is contained in:
parent
3c9ddd4da6
commit
3fe92f8520
30
jieba/analyse/__init__.py
Normal file
30
jieba/analyse/__init__.py
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
import jieba
|
||||||
|
import os
|
||||||
|
|
||||||
|
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||||
|
f_name = os.path.join(_curpath,"idf.txt")
|
||||||
|
content = open(f_name,'rb').read().decode('utf-8')
|
||||||
|
|
||||||
|
idf_freq = {}
|
||||||
|
lines = content.split('\n')
|
||||||
|
for line in lines:
|
||||||
|
word,freq = line.split(' ')
|
||||||
|
idf_freq[word] = float(freq)
|
||||||
|
max_idf = max(idf_freq.values())
|
||||||
|
|
||||||
|
def extract_tags(sentence,topK=20):
|
||||||
|
words = jieba.cut(sentence)
|
||||||
|
freq = {}
|
||||||
|
for w in words:
|
||||||
|
if len(w.strip())<2: continue
|
||||||
|
freq[w]=freq.get(w,0.0)+1.0
|
||||||
|
total = sum(freq.values())
|
||||||
|
freq = [(k,v/total) for k,v in freq.iteritems()]
|
||||||
|
|
||||||
|
tf_idf_list = [(v * idf_freq.get(k,max_idf),k) for k,v in freq]
|
||||||
|
st_list = sorted(tf_idf_list,reverse=True)
|
||||||
|
|
||||||
|
top_tuples= st_list[:topK]
|
||||||
|
tags = [a[1] for a in top_tuples]
|
||||||
|
return tags
|
||||||
|
|
270132
jieba/analyse/idf.txt
Normal file
270132
jieba/analyse/idf.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user