new feature: tag extraction

2025-07-10 00:01:33 +08:00 · 2012-10-16 12:54:48 +08:00 · 2012-10-16 12:54:48 +08:00 · 3fe92f8520
commit 3fe92f8520
parent 3c9ddd4da6
2 changed files with 270162 additions and 0 deletions
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -0,0 +1,30 @@
+import jieba
+import os
+
+_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) )  )
+f_name = os.path.join(_curpath,"idf.txt")
+content = open(f_name,'rb').read().decode('utf-8')
+
+idf_freq = {}
+lines = content.split('\n')
+for line in lines:
+	word,freq = line.split(' ')
+	idf_freq[word] = float(freq)
+max_idf = max(idf_freq.values())
+
+def extract_tags(sentence,topK=20):
+	words = jieba.cut(sentence)
+	freq = {}
+	for w in words:
+		if len(w.strip())<2: continue
+		freq[w]=freq.get(w,0.0)+1.0
+	total = sum(freq.values())
+	freq = [(k,v/total) for k,v in freq.iteritems()]
+
+	tf_idf_list = [(v * idf_freq.get(k,max_idf),k) for k,v in freq]
+	st_list = sorted(tf_idf_list,reverse=True)
+
+	top_tuples= st_list[:topK]
+	tags = [a[1] for a in top_tuples]
+	return tags
+
--- a/jieba/analyse/idf.txt
+++ b/jieba/analyse/idf.txt