Merge pull request #201 from skyerown/master

为关键字提取函数增加词性过滤功能
This commit is contained in:
Sun Junyi 2014-11-18 10:27:52 +08:00
commit 99748bfc17
3 changed files with 22 additions and 6 deletions

6
.gitignore vendored
View File

@ -113,8 +113,10 @@ Generated_Code #added for RIA/Silverlight projects
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
############
## pycharm
############
.idea
############
## Windows

View File

@ -1,5 +1,6 @@
#encoding=utf-8
import jieba
import jieba.posseg
import os
from operator import itemgetter
try:
@ -58,21 +59,32 @@ def set_stop_words(stop_words_path):
for line in lines:
STOP_WORDS.add(line)
def extract_tags(sentence, topK=20, withWeight=False):
def extract_tags(sentence, topK=20, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
"""
Extract keywords from sentence using TF-IDF algorithm.
Parameter:
- topK: return how many top keywords. `None` for all possible words.
- withWeight: if True, return a list of (word, weight);
if False, return a list of words.
- allowPOS: the allowed POS list, e.g. ['ns', 'n', 'vn', 'v'].
If the POS of a word is not in this list, it will be filtered out.
"""
global STOP_WORDS, idf_loader
idf_freq, median_idf = idf_loader.get_idf()
words = jieba.cut(sentence)
if allowPOS:
allowPOS = frozenset(allowPOS)
words = jieba.posseg.cut(sentence)
else:
words = jieba.cut(sentence)
freq = {}
for w in words:
if allowPOS:
if w.flag not in allowPOS:
continue
else:
w = w.word
if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
continue
freq[w] = freq.get(w, 0.0) + 1.0

View File

@ -48,15 +48,17 @@ class UndirectWeightedGraph:
return ws
def textrank(sentence, topK=10, withWeight=False):
def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
"""
Extract keywords from sentence using TextRank algorithm.
Parameter:
- topK: return how many top keywords. `None` for all possible words.
- withWeight: if True, return a list of (word, weight);
if False, return a list of words.
- allowPOS: the allowed POS list, e.g. ['ns', 'n', 'vn', 'v'].
If the POS of a word is not in this list, it will be filtered out.
"""
pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
pos_filt = frozenset(allowPOS)
g = UndirectWeightedGraph()
cm = collections.defaultdict(int)
span = 5