From d82d2c18df20342c88291d2a94093bc51bc5eee0 Mon Sep 17 00:00:00 2001 From: walkskyer Date: Thu, 13 Nov 2014 22:26:22 +0800 Subject: [PATCH 1/4] =?UTF-8?q?=E4=B8=BA=E5=85=B3=E9=94=AE=E5=AD=97?= =?UTF-8?q?=E6=8F=90=E5=8F=96=E5=87=BD=E6=95=B0=E5=A2=9E=E5=8A=A0=E8=AF=8D?= =?UTF-8?q?=E6=80=A7=E8=BF=87=E6=BB=A4=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- jieba/analyse/__init__.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py index af36149..c8a996f 100755 --- a/jieba/analyse/__init__.py +++ b/jieba/analyse/__init__.py @@ -1,5 +1,6 @@ #encoding=utf-8 import jieba +import jieba.posseg import os from operator import itemgetter try: @@ -58,21 +59,31 @@ def set_stop_words(stop_words_path): for line in lines: STOP_WORDS.add(line) -def extract_tags(sentence, topK=20, withWeight=False): +def extract_tags(sentence, topK=20, withWeight=False, allowPOS=[]): """ Extract keywords from sentence using TF-IDF algorithm. Parameter: - topK: return how many top keywords. `None` for all possible words. - withWeight: if True, return a list of (word, weight); if False, return a list of words. + - allowPOS: the allowed POS list eg. ['n']. + if the POS of w is not in this list,it will be filtered. """ global STOP_WORDS, idf_loader idf_freq, median_idf = idf_loader.get_idf() - words = jieba.cut(sentence) + if allowPOS: + words = jieba.posseg.cut(sentence) + else: + words = jieba.cut(sentence) freq = {} for w in words: + if allowPOS: + if w.flag not in allowPOS: + continue + else: + w = w.word if len(w.strip()) < 2 or w.lower() in STOP_WORDS: continue freq[w] = freq.get(w, 0.0) + 1.0 From dd624776052c70db26ad564f2243049360d1c27c Mon Sep 17 00:00:00 2001 From: walkskyer Date: Sat, 15 Nov 2014 13:33:13 +0800 Subject: [PATCH 2/4] =?UTF-8?q?.gitignore=E4=B8=AD=E5=BF=BD=E7=95=A5pychar?= =?UTF-8?q?m=E9=A1=B9=E7=9B=AE=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 8c2c5f4..e36fabc 100644 --- a/.gitignore +++ b/.gitignore @@ -113,8 +113,10 @@ Generated_Code #added for RIA/Silverlight projects _UpgradeReport_Files/ Backup*/ UpgradeLog*.XML - - +############ +## pycharm +############ +.idea ############ ## Windows From bab5f362bae7a7b4795b753a1587592b2b097c6c Mon Sep 17 00:00:00 2001 From: walkskyer Date: Sat, 15 Nov 2014 18:14:47 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E5=B0=86exstract=5Ftags=E5=8F=82=E6=95=B0a?= =?UTF-8?q?llowPOS=E8=BD=AC=E6=8D=A2=E4=B8=BAfrozenset=E4=BB=A5=E5=87=8F?= =?UTF-8?q?=E5=B0=91=E6=9F=A5=E6=89=BE=E6=97=B6=E9=97=B4=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- jieba/analyse/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py index c8a996f..ea7e66c 100755 --- a/jieba/analyse/__init__.py +++ b/jieba/analyse/__init__.py @@ -74,6 +74,7 @@ def extract_tags(sentence, topK=20, withWeight=False, allowPOS=[]): idf_freq, median_idf = idf_loader.get_idf() if allowPOS: + allowPOS = frozenset(allowPOS) words = jieba.posseg.cut(sentence) else: words = jieba.cut(sentence) From a336e2640310aa7d0f3095ba395c788b228c05cf Mon Sep 17 00:00:00 2001 From: walkskyer Date: Sat, 15 Nov 2014 18:36:09 +0800 Subject: [PATCH 4/4] =?UTF-8?q?=E4=B8=BA=E5=87=BD=E6=95=B0textrank?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=8F=82=E6=95=B0allowPOS=EF=BC=8C=E5=B9=B6?= =?UTF-8?q?=E4=BF=AE=E6=94=B9extract=5Ftags=E7=9A=84=E5=8F=82=E6=95=B0allo?= =?UTF-8?q?wPOS=E4=B8=8Etextrank=E4=BF=9D=E6=8C=81=E4=B8=80=E8=87=B4?= =?UTF-8?q?=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- jieba/analyse/__init__.py | 4 ++-- jieba/analyse/textrank.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py index ea7e66c..c56b9a4 100755 --- a/jieba/analyse/__init__.py +++ b/jieba/analyse/__init__.py @@ -59,14 +59,14 @@ def set_stop_words(stop_words_path): for line in lines: STOP_WORDS.add(line) -def extract_tags(sentence, topK=20, withWeight=False, allowPOS=[]): +def extract_tags(sentence, topK=20, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']): """ Extract keywords from sentence using TF-IDF algorithm. Parameter: - topK: return how many top keywords. `None` for all possible words. - withWeight: if True, return a list of (word, weight); if False, return a list of words. - - allowPOS: the allowed POS list eg. ['n']. + - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v']. if the POS of w is not in this list,it will be filtered. """ global STOP_WORDS, idf_loader diff --git a/jieba/analyse/textrank.py b/jieba/analyse/textrank.py index 9ac9ece..739c60e 100644 --- a/jieba/analyse/textrank.py +++ b/jieba/analyse/textrank.py @@ -48,15 +48,17 @@ class UndirectWeightedGraph: return ws -def textrank(sentence, topK=10, withWeight=False): +def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']): """ Extract keywords from sentence using TextRank algorithm. Parameter: - topK: return how many top keywords. `None` for all possible words. - withWeight: if True, return a list of (word, weight); if False, return a list of words. + - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v']. + if the POS of w is not in this list,it will be filtered. """ - pos_filt = frozenset(('ns', 'n', 'vn', 'v')) + pos_filt = frozenset(allowPOS) g = UndirectWeightedGraph() cm = collections.defaultdict(int) span = 5