From d82d2c18df20342c88291d2a94093bc51bc5eee0 Mon Sep 17 00:00:00 2001
From: walkskyer <walkskyer@qq.com>
Date: Thu, 13 Nov 2014 22:26:22 +0800
Subject: [PATCH 1/4] =?UTF-8?q?=E4=B8=BA=E5=85=B3=E9=94=AE=E5=AD=97?=
 =?UTF-8?q?=E6=8F=90=E5=8F=96=E5=87=BD=E6=95=B0=E5=A2=9E=E5=8A=A0=E8=AF=8D?=
 =?UTF-8?q?=E6=80=A7=E8=BF=87=E6=BB=A4=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 jieba/analyse/__init__.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py
index af36149..c8a996f 100755
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -1,5 +1,6 @@
 #encoding=utf-8
 import jieba
+import jieba.posseg
 import os
 from operator import itemgetter
 try:
@@ -58,21 +59,31 @@ def set_stop_words(stop_words_path):
     for line in lines:
         STOP_WORDS.add(line)
 
-def extract_tags(sentence, topK=20, withWeight=False):
+def extract_tags(sentence, topK=20, withWeight=False, allowPOS=[]):
     """
     Extract keywords from sentence using TF-IDF algorithm.
     Parameter:
         - topK: return how many top keywords. `None` for all possible words.
         - withWeight: if True, return a list of (word, weight);
                       if False, return a list of words.
+        - allowPOS: the allowed POS list eg. ['n'].
+                    if the POS of w is not in this list,it will be filtered.
     """
     global STOP_WORDS, idf_loader
 
     idf_freq, median_idf = idf_loader.get_idf()
 
-    words = jieba.cut(sentence)
+    if allowPOS:
+        words = jieba.posseg.cut(sentence)
+    else:
+        words = jieba.cut(sentence)
     freq = {}
     for w in words:
+        if allowPOS:
+            if w.flag not in allowPOS:
+                continue
+            else:
+                w = w.word
         if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
             continue
         freq[w] = freq.get(w, 0.0) + 1.0

From dd624776052c70db26ad564f2243049360d1c27c Mon Sep 17 00:00:00 2001
From: walkskyer <walkskyer@qq.com>
Date: Sat, 15 Nov 2014 13:33:13 +0800
Subject: [PATCH 2/4] =?UTF-8?q?.gitignore=E4=B8=AD=E5=BF=BD=E7=95=A5pychar?=
 =?UTF-8?q?m=E9=A1=B9=E7=9B=AE=E6=96=87=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8c2c5f4..e36fabc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -113,8 +113,10 @@ Generated_Code #added for RIA/Silverlight projects
 _UpgradeReport_Files/
 Backup*/
 UpgradeLog*.XML
-
-
+############
+## pycharm
+############
+.idea
 
 ############
 ## Windows

From bab5f362bae7a7b4795b753a1587592b2b097c6c Mon Sep 17 00:00:00 2001
From: walkskyer <walkskyer@qq.com>
Date: Sat, 15 Nov 2014 18:14:47 +0800
Subject: [PATCH 3/4] =?UTF-8?q?=E5=B0=86extract=5Ftags=E5=8F=82=E6=95=B0a?=
 =?UTF-8?q?llowPOS=E8=BD=AC=E6=8D=A2=E4=B8=BAfrozenset=E4=BB=A5=E5=87=8F?=
 =?UTF-8?q?=E5=B0=91=E6=9F=A5=E6=89=BE=E6=97=B6=E9=97=B4=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 jieba/analyse/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py
index c8a996f..ea7e66c 100755
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -74,6 +74,7 @@ def extract_tags(sentence, topK=20, withWeight=False, allowPOS=[]):
     idf_freq, median_idf = idf_loader.get_idf()
 
     if allowPOS:
+        allowPOS = frozenset(allowPOS)
         words = jieba.posseg.cut(sentence)
     else:
         words = jieba.cut(sentence)

From a336e2640310aa7d0f3095ba395c788b228c05cf Mon Sep 17 00:00:00 2001
From: walkskyer <walkskyer@qq.com>
Date: Sat, 15 Nov 2014 18:36:09 +0800
Subject: [PATCH 4/4] =?UTF-8?q?=E4=B8=BA=E5=87=BD=E6=95=B0textrank?=
 =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=8F=82=E6=95=B0allowPOS=EF=BC=8C=E5=B9=B6?=
 =?UTF-8?q?=E4=BF=AE=E6=94=B9extract=5Ftags=E7=9A=84=E5=8F=82=E6=95=B0allo?=
 =?UTF-8?q?wPOS=E4=B8=8Etextrank=E4=BF=9D=E6=8C=81=E4=B8=80=E8=87=B4?=
 =?UTF-8?q?=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 jieba/analyse/__init__.py | 4 ++--
 jieba/analyse/textrank.py | 6 ++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py
index ea7e66c..c56b9a4 100755
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -59,14 +59,14 @@ def set_stop_words(stop_words_path):
     for line in lines:
         STOP_WORDS.add(line)
 
-def extract_tags(sentence, topK=20, withWeight=False, allowPOS=[]):
+def extract_tags(sentence, topK=20, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
     """
     Extract keywords from sentence using TF-IDF algorithm.
     Parameter:
         - topK: return how many top keywords. `None` for all possible words.
         - withWeight: if True, return a list of (word, weight);
                       if False, return a list of words.
-        - allowPOS: the allowed POS list eg. ['n'].
+        - allowPOS: the allowed POS list, e.g. ['ns', 'n', 'vn', 'v'].
                     if the POS of w is not in this list,it will be filtered.
     """
     global STOP_WORDS, idf_loader
diff --git a/jieba/analyse/textrank.py b/jieba/analyse/textrank.py
index 9ac9ece..739c60e 100644
--- a/jieba/analyse/textrank.py
+++ b/jieba/analyse/textrank.py
@@ -48,15 +48,17 @@ class UndirectWeightedGraph:
         return ws
 
 
-def textrank(sentence, topK=10, withWeight=False):
+def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
     """
     Extract keywords from sentence using TextRank algorithm.
     Parameter:
         - topK: return how many top keywords. `None` for all possible words.
         - withWeight: if True, return a list of (word, weight);
                       if False, return a list of words.
+        - allowPOS: the allowed POS list, e.g. ['ns', 'n', 'vn', 'v'].
+                    Words whose POS tag is not in this list are filtered out.
     """
-    pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
+    pos_filt = frozenset(allowPOS)
     g = UndirectWeightedGraph()
     cm = collections.defaultdict(int)
     span = 5