为函数textrank增加参数allowPOS,并修改extract_tags的参数allowPOS与textrank保持一致。

This commit is contained in:
walkskyer 2014-11-15 18:36:09 +08:00
parent bab5f362ba
commit a336e26403
2 changed files with 6 additions and 4 deletions

View File

@@ -59,14 +59,14 @@ def set_stop_words(stop_words_path):
for line in lines: for line in lines:
STOP_WORDS.add(line) STOP_WORDS.add(line)
def extract_tags(sentence, topK=20, withWeight=False, allowPOS=[]): def extract_tags(sentence, topK=20, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
""" """
Extract keywords from sentence using TF-IDF algorithm. Extract keywords from sentence using TF-IDF algorithm.
Parameter: Parameter:
- topK: return how many top keywords. `None` for all possible words. - topK: return how many top keywords. `None` for all possible words.
- withWeight: if True, return a list of (word, weight); - withWeight: if True, return a list of (word, weight);
if False, return a list of words. if False, return a list of words.
- allowPOS: the allowed POS list eg. ['n']. - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
if the POS of w is not in this list,it will be filtered. if the POS of w is not in this list,it will be filtered.
""" """
global STOP_WORDS, idf_loader global STOP_WORDS, idf_loader

View File

@@ -48,15 +48,17 @@ class UndirectWeightedGraph:
return ws return ws
def textrank(sentence, topK=10, withWeight=False): def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
""" """
Extract keywords from sentence using TextRank algorithm. Extract keywords from sentence using TextRank algorithm.
Parameter: Parameter:
- topK: return how many top keywords. `None` for all possible words. - topK: return how many top keywords. `None` for all possible words.
- withWeight: if True, return a list of (word, weight); - withWeight: if True, return a list of (word, weight);
if False, return a list of words. if False, return a list of words.
- allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
if the POS of w is not in this list,it will be filtered.
""" """
pos_filt = frozenset(('ns', 'n', 'vn', 'v')) pos_filt = frozenset(allowPOS)
g = UndirectWeightedGraph() g = UndirectWeightedGraph()
cm = collections.defaultdict(int) cm = collections.defaultdict(int)
span = 5 span = 5