Merge pull request #201 from skyerown/master

为关键字提取函数增加词性过滤功能
2025-07-10 00:01:33 +08:00 · 2014-11-18 10:27:52 +08:00 · 2014-11-18 10:27:52 +08:00 · 99748bfc17
commit 99748bfc17
parent ec68c21ea0 a336e26403
3 changed files with 22 additions and 6 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -113,8 +113,10 @@ Generated_Code #added for RIA/Silverlight projects
 _UpgradeReport_Files/
 Backup*/
 UpgradeLog*.XML
-
-
+############
+## pycharm
+############
+.idea

 ############
 ## Windows
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -1,5 +1,6 @@
 #encoding=utf-8
 import jieba
+import jieba.posseg
 import os
 from operator import itemgetter
 try:
@@ -58,21 +59,32 @@ def set_stop_words(stop_words_path):
    for line in lines:
        STOP_WORDS.add(line)

-def extract_tags(sentence, topK=20, withWeight=False):
+def extract_tags(sentence, topK=20, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
    """
    Extract keywords from sentence using TF-IDF algorithm.
    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
                      if False, return a list of words.
+        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
+                    if the POS of w is not in this list,it will be filtered.
    """
    global STOP_WORDS, idf_loader

    idf_freq, median_idf = idf_loader.get_idf()

-    words = jieba.cut(sentence)
+    if allowPOS:
+        allowPOS = frozenset(allowPOS)
+        words = jieba.posseg.cut(sentence)
+    else:
+        words = jieba.cut(sentence)
    freq = {}
    for w in words:
+        if allowPOS:
+            if w.flag not in allowPOS:
+                continue
+            else:
+                w = w.word
        if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
--- a/jieba/analyse/textrank.py
+++ b/jieba/analyse/textrank.py
@@ -48,15 +48,17 @@ class UndirectWeightedGraph:
        return ws


-def textrank(sentence, topK=10, withWeight=False):
+def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
    """
    Extract keywords from sentence using TextRank algorithm.
    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
                      if False, return a list of words.
+        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
+                    if the POS of w is not in this list,it will be filtered.
    """
-    pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
+    pos_filt = frozenset(allowPOS)
    g = UndirectWeightedGraph()
    cm = collections.defaultdict(int)
    span = 5