diff --git a/jieba/analyse/textrank.py b/jieba/analyse/textrank.py
index 019a1cb..230e2a9 100644
--- a/jieba/analyse/textrank.py
+++ b/jieba/analyse/textrank.py
@@ -44,7 +44,7 @@ class UndirectWeightedGraph:
         for w in itervalues(ws):
             if w < min_rank:
                 min_rank = w
-            elif w > max_rank:
+            if w > max_rank:
                 max_rank = w
 
         for n, w in ws.items():
@@ -66,7 +66,7 @@ class TextRank(KeywordExtractor):
         return (wp.flag in self.pos_filt and len(wp.word.strip()) >= 2
                 and wp.word.lower() not in self.stop_words)
 
-    def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')):
+    def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
         """
         Extract keywords from sentence using TextRank algorithm.
         Parameter:
@@ -75,6 +75,8 @@ class TextRank(KeywordExtractor):
                           if False, return a list of words.
             - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                         if the POS of w is not in this list, it will be filtered.
+            - withFlag: if True, return a list of pair(word, flag) like posseg.cut;
+                        if False, return a list of words.
         """
         self.pos_filt = frozenset(allowPOS)
         g = UndirectWeightedGraph()
@@ -87,7 +89,10 @@ class TextRank(KeywordExtractor):
                         break
                     if not self.pairfilter(words[j]):
                         continue
-                    cm[(wp.word, words[j].word)] += 1
+                    if allowPOS and withFlag:
+                        cm[(wp, words[j])] += 1
+                    else:
+                        cm[(wp.word, words[j].word)] += 1
 
         for terms, w in cm.items():
             g.addEdge(terms[0], terms[1], w)
@@ -96,6 +101,7 @@ class TextRank(KeywordExtractor):
             tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
         else:
             tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
+
         if topK:
             return tags[:topK]
         else:
diff --git a/jieba/analyse/tfidf.py b/jieba/analyse/tfidf.py
index 14abfb0..680b889 100755
--- a/jieba/analyse/tfidf.py
+++ b/jieba/analyse/tfidf.py
@@ -72,7 +72,7 @@ class TFIDF(KeywordExtractor):
         self.idf_loader.set_new_path(new_abs_path)
         self.idf_freq, self.median_idf = self.idf_loader.get_idf()
 
-    def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=()):
+    def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
         """
         Extract keywords from sentence using TF-IDF algorithm.
         Parameter:
@@ -81,6 +81,9 @@ class TFIDF(KeywordExtractor):
                           if False, return a list of words.
             - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v','nr'].
                         if the POS of w is not in this list,it will be filtered.
+            - withFlag: only works when allowPOS is not empty.
+                        if True, return a list of pair(word, flag) like posseg.cut;
+                        if False, return a list of words.
         """
         if allowPOS:
             allowPOS = frozenset(allowPOS)
@@ -92,14 +95,16 @@ class TFIDF(KeywordExtractor):
             if allowPOS:
                 if w.flag not in allowPOS:
                     continue
-                else:
+                elif not withFlag:
                     w = w.word
-            if len(w.strip()) < 2 or w.lower() in self.stop_words:
+            wc = w.word if allowPOS and withFlag else w
+            if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
                 continue
             freq[w] = freq.get(w, 0.0) + 1.0
         total = sum(freq.values())
         for k in freq:
-            freq[k] *= self.idf_freq.get(k, self.median_idf) / total
+            kw = k.word if allowPOS and withFlag else k
+            freq[k] *= self.idf_freq.get(kw, self.median_idf) / total
 
         if withWeight:
             tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py
index 429ba69..0c9c470 100644
--- a/jieba/posseg/__init__.py
+++ b/jieba/posseg/__init__.py
@@ -62,6 +62,15 @@ class pair(object):
     def __iter__(self):
         return iter((self.word, self.flag))
 
+    def __lt__(self, other):
+        return self.word < other.word
+
+    def __eq__(self, other):
+        return isinstance(other, pair) and self.word == other.word and self.flag == other.flag
+
+    def __hash__(self):
+        return hash(self.word)
+
     def encode(self, arg):
         return self.__unicode__().encode(arg)
 
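A minimal usage sketch of the new withFlag behaviour, not part of the patch: it assumes the module-level jieba.analyse.extract_tags and jieba.analyse.textrank wrappers forward the new keyword argument to the methods patched above, and the sample sentence and topK value are arbitrary.

# Usage sketch only; the sentence and topK below are arbitrary examples.
from __future__ import print_function
import jieba.analyse

s = "此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元"
pos = ('ns', 'n', 'vn', 'v')

# Default behaviour is unchanged: a list of plain word strings.
print(jieba.analyse.extract_tags(s, topK=5, allowPOS=pos))

# With a non-empty allowPOS and withFlag=True, each item is a posseg
# pair object, so both the word and its POS tag are available.
for wp in jieba.analyse.extract_tags(s, topK=5, allowPOS=pos, withFlag=True):
    print(wp.word, wp.flag)

# The same keyword argument is available for the TextRank extractor.
for wp in jieba.analyse.textrank(s, topK=5, allowPOS=pos, withFlag=True):
    print(wp.word, wp.flag)

The __lt__, __eq__ and __hash__ methods added to posseg.pair support this mode: they let pair objects serve as dictionary keys and be sorted inside the extractors when withFlag is enabled.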