Mirror of https://github.com/fxsjy/jieba.git

Merge pull request #303 from jerryday/master

add a withFlag param to extract_tags

commit 093980647b
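For orientation, a minimal usage sketch of the parameter this pull request adds. The sample sentence is made up, and the calls assume a jieba build that contains this commit:

import jieba.analyse

text = "结巴分词支持关键词提取"   # illustrative sample sentence

# Previously: a list of plain strings.
print(jieba.analyse.extract_tags(text, topK=5, allowPOS=('n', 'vn')))

# New: with withFlag=True (and a non-empty allowPOS), each item is a
# jieba.posseg pair exposing .word and .flag.
for wp in jieba.analyse.extract_tags(text, topK=5, allowPOS=('n', 'vn'), withFlag=True):
    print(wp.word, wp.flag)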
jieba/analyse/textrank.py

@@ -44,7 +44,7 @@ class UndirectWeightedGraph:
         for w in itervalues(ws):
             if w < min_rank:
                 min_rank = w
-            elif w > max_rank:
+            if w > max_rank:
                 max_rank = w
 
         for n, w in ws.items():
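The change from elif to if above is a small correctness fix in the score normalization: with elif, an iteration that lowers min_rank can never also raise max_rank, so a strictly descending score sequence leaves max_rank at its initializer. A standalone sketch (not jieba code, using the same initializers as rank()) comparing the two variants:

import sys

def bounds_buggy(values):
    min_rank, max_rank = sys.float_info.max, sys.float_info.min
    for w in values:
        if w < min_rank:
            min_rank = w
        elif w > max_rank:      # never reached when w also lowered min_rank
            max_rank = w
    return min_rank, max_rank

def bounds_fixed(values):
    min_rank, max_rank = sys.float_info.max, sys.float_info.min
    for w in values:
        if w < min_rank:
            min_rank = w
        if w > max_rank:        # checked independently, as in the patch
            max_rank = w
    return min_rank, max_rank

# A descending score list exposes the bug: every value lowers min_rank,
# so the elif branch never fires and max_rank keeps its initializer.
print(bounds_buggy([0.9, 0.5, 0.1]))   # (0.1, 2.2250738585072014e-308)
print(bounds_fixed([0.9, 0.5, 0.1]))   # (0.1, 0.9)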
@@ -66,7 +66,7 @@ class TextRank(KeywordExtractor):
         return (wp.flag in self.pos_filt and len(wp.word.strip()) >= 2
                 and wp.word.lower() not in self.stop_words)
 
-    def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')):
+    def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
         """
         Extract keywords from sentence using TextRank algorithm.
         Parameter:
@@ -75,6 +75,8 @@ class TextRank(KeywordExtractor):
                           if False, return a list of words.
             - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                         if the POS of w is not in this list, it will be filtered.
+            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
+                        if False, return a list of words
         """
         self.pos_filt = frozenset(allowPOS)
         g = UndirectWeightedGraph()
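A hedged usage sketch of the extended textrank signature; the sentence below is an illustrative sample, not from the commit:

import jieba.analyse

s = "线程是程序执行时的最小单位,它是进程的一个执行流"

# Plain words, as before.
print(jieba.analyse.textrank(s, topK=5, allowPOS=('ns', 'n', 'vn', 'v')))

# With withFlag=True, pair objects carrying the POS flag come back instead.
for wp in jieba.analyse.textrank(s, topK=5, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=True):
    print(wp.word, wp.flag)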
@@ -87,7 +89,10 @@ class TextRank(KeywordExtractor):
                         break
                     if not self.pairfilter(words[j]):
                         continue
-                    cm[(wp.word, words[j].word)] += 1
+                    if allowPOS and withFlag:
+                        cm[(wp, words[j])] += 1
+                    else:
+                        cm[(wp.word, words[j].word)] += 1
 
         for terms, w in cm.items():
             g.addEdge(terms[0], terms[1], w)
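When allowPOS and withFlag are both set, the co-occurrence counter above is keyed by posseg pair objects instead of plain strings, which is what the __hash__ / __eq__ additions further down in this diff make possible. An illustrative sketch of that counting pattern, assuming a jieba version that already includes this commit:

from collections import defaultdict
import jieba.posseg as pseg

words = tuple(pseg.cut("自然语言处理很有趣"))   # made-up sample sentence
span = 5

cm = defaultdict(int)
for i, wp in enumerate(words):
    for j in range(i + 1, min(i + span, len(words))):
        cm[(wp, words[j])] += 1     # works only because pair is hashable (see the posseg hunk below)

for (a, b), w in cm.items():
    print(a.word, a.flag, "--", b.word, b.flag, ":", w)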
@@ -96,6 +101,7 @@ class TextRank(KeywordExtractor):
             tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
         else:
             tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
+
         if topK:
             return tags[:topK]
         else:
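For reference, the two sort calls in the context above are the usual idioms for ranking a dict: sorting .items() with itemgetter(1) keeps (key, score) tuples, while sorting the dict itself with key=dict.__getitem__ yields only the keys ordered by score. A small standalone illustration with made-up scores:

from operator import itemgetter

nodes_rank = {"语言": 0.8, "处理": 0.5, "分词": 0.9}

# withWeight=True style: (word, weight) tuples, highest score first.
print(sorted(nodes_rank.items(), key=itemgetter(1), reverse=True))
# [('分词', 0.9), ('语言', 0.8), ('处理', 0.5)]

# withWeight=False style: just the words, ordered by their score.
print(sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True))
# ['分词', '语言', '处理']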
jieba/analyse/tfidf.py

@@ -72,7 +72,7 @@ class TFIDF(KeywordExtractor):
         self.idf_loader.set_new_path(new_abs_path)
         self.idf_freq, self.median_idf = self.idf_loader.get_idf()
 
-    def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=()):
+    def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
         """
         Extract keywords from sentence using TF-IDF algorithm.
         Parameter:
@@ -81,6 +81,9 @@ class TFIDF(KeywordExtractor):
                           if False, return a list of words.
             - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v','nr'].
                         if the POS of w is not in this list,it will be filtered.
+            - withFlag: only work with allowPOS is not empty.
+                        if True, return a list of pair(word, weight) like posseg.cut
+                        if False, return a list of words
         """
         if allowPOS:
             allowPOS = frozenset(allowPOS)
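The new docstring line spells out a caveat: withFlag only changes the return type when allowPOS is non-empty, because only the allowPOS path tokenizes with POS tags; with the default empty allowPOS, plain strings come back either way. An illustrative sketch (sample text made up, assuming a build with this commit):

import jieba.analyse

text = "中文分词是文本处理的基础"

# allowPOS is empty, so withFlag has no effect: plain strings come back.
print(jieba.analyse.extract_tags(text, topK=5, withFlag=True))

# With a POS filter, withFlag=True yields posseg pair objects instead.
for wp in jieba.analyse.extract_tags(text, topK=5, allowPOS=('n', 'vn'), withFlag=True):
    print(wp.word, wp.flag)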
@@ -92,14 +95,16 @@ class TFIDF(KeywordExtractor):
             if allowPOS:
                 if w.flag not in allowPOS:
                     continue
-                else:
+                elif not withFlag:
                     w = w.word
-            if len(w.strip()) < 2 or w.lower() in self.stop_words:
+            wc = w.word if allowPOS and withFlag else w
+            if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
                 continue
             freq[w] = freq.get(w, 0.0) + 1.0
         total = sum(freq.values())
         for k in freq:
-            freq[k] *= self.idf_freq.get(k, self.median_idf) / total
+            kw = k.word if allowPOS and withFlag else k
+            freq[k] *= self.idf_freq.get(kw, self.median_idf) / total
 
         if withWeight:
             tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
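The new wc / kw lines normalize a value that may be either a plain string or a posseg pair, so stop-word filtering and the IDF lookup always see the bare word while the frequency dict keeps the original object as its key. A reduced, self-contained sketch of that pattern; Pair below is a stand-in for jieba.posseg.pair, not jieba code:

class Pair(object):                       # stand-in for jieba.posseg.pair
    def __init__(self, word, flag):
        self.word, self.flag = word, flag
    def __eq__(self, other):
        return isinstance(other, Pair) and (self.word, self.flag) == (other.word, other.flag)
    def __hash__(self):
        return hash(self.word)            # same choice as the patch: hash on word only

with_flag = True                          # mimics allowPOS and withFlag both set
stop_words = {"我们"}
idf_freq = {"分词": 9.0}                  # toy IDF table
median_idf = 5.0

words = [Pair("分词", "n"), Pair("我们", "r"), Pair("算法", "n"), Pair("分词", "n")]

freq = {}
for w in words:
    wc = w.word if with_flag else w       # bare word for filtering
    if len(wc.strip()) < 2 or wc in stop_words:
        continue
    freq[w] = freq.get(w, 0.0) + 1.0      # the pair itself stays the dict key

total = sum(freq.values())
for k in freq:
    kw = k.word if with_flag else k       # bare word for the IDF lookup
    freq[k] *= idf_freq.get(kw, median_idf) / total

for k, v in freq.items():
    print(k.word, k.flag, v)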
jieba/posseg/__init__.py

@@ -62,6 +62,15 @@ class pair(object):
     def __iter__(self):
         return iter((self.word, self.flag))
 
+    def __lt__(self, other):
+        return self.word < other.word
+
+    def __eq__(self, other):
+        return isinstance(other, pair) and self.word == other.word and self.flag == other.flag
+
+    def __hash__(self):
+        return hash(self.word)
+
     def encode(self, arg):
         return self.__unicode__().encode(arg)
 
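These three methods make pair usable as a dict/set key and sortable, which the changes above rely on: object keys in the TextRank co-occurrence dict and the TF-IDF frequency dict, and the sorted key iteration in UndirectWeightedGraph.rank. Note that __hash__ uses only word while __eq__ also compares flag; this still satisfies the hash/eq contract (equal pairs share a word, hence a hash), it just means pairs differing only in flag land in the same bucket. A small usage sketch, assuming a jieba checkout that includes this commit:

from jieba.posseg import pair

a = pair("分词", "n")
b = pair("分词", "n")
c = pair("分词", "v")

assert a == b                 # __eq__: same word and flag
assert a != c                 # different flag -> not equal
assert hash(a) == hash(c)     # __hash__ uses only .word, so they share a bucket

counts = {a: 1}
counts[b] = counts.get(b, 0) + 1   # b is the same key as a
print(counts)                      # one entry with value 2

print(sorted([c, a]))              # __lt__ orders pairs by .word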