Mirror of https://github.com/fxsjy/jieba.git
Commit 99748bfc17

.gitignore
@@ -113,8 +113,10 @@ Generated_Code #added for RIA/Silverlight projects
 _UpgradeReport_Files/
 Backup*/
 UpgradeLog*.XML
+############
+## pycharm
+############
+.idea
+
 ############
 ## Windows
@@ -1,5 +1,6 @@
 #encoding=utf-8
 import jieba
+import jieba.posseg
 import os
 from operator import itemgetter
 try:
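
The new import brings in jieba's part-of-speech tokenizer, which the allowPOS filtering added below relies on: jieba.posseg.cut yields pair objects carrying both the token and its POS tag. A minimal sketch of that behaviour (the exact tags depend on the dictionary shipped with your jieba version):

# Sketch of what jieba.posseg.cut yields: pair objects with a .word (the token)
# and a .flag (its POS tag, e.g. 'ns' for place names, 'v' for verbs).
import jieba.posseg

for pair in jieba.posseg.cut("我爱北京天安门"):
    print(pair.word, pair.flag)
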
@@ -58,21 +59,32 @@ def set_stop_words(stop_words_path):
     for line in lines:
         STOP_WORDS.add(line)

-def extract_tags(sentence, topK=20, withWeight=False):
+def extract_tags(sentence, topK=20, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
     """
     Extract keywords from sentence using TF-IDF algorithm.
     Parameter:
         - topK: return how many top keywords. `None` for all possible words.
         - withWeight: if True, return a list of (word, weight);
                       if False, return a list of words.
+        - allowPOS: the allowed POS list, e.g. ['ns', 'n', 'vn', 'v'].
+                    If the POS of w is not in this list, it will be filtered.
     """
     global STOP_WORDS, idf_loader

     idf_freq, median_idf = idf_loader.get_idf()

-    words = jieba.cut(sentence)
+    if allowPOS:
+        allowPOS = frozenset(allowPOS)
+        words = jieba.posseg.cut(sentence)
+    else:
+        words = jieba.cut(sentence)
     freq = {}
     for w in words:
+        if allowPOS:
+            if w.flag not in allowPOS:
+                continue
+            else:
+                w = w.word
         if len(w.strip()) < 2 or w.lower() in STOP_WORDS:
             continue
         freq[w] = freq.get(w, 0.0) + 1.0
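
The net effect: when allowPOS is non-empty, extract_tags switches to the POS-tagging tokenizer and drops any token whose tag is not whitelisted, before the usual length and stop-word filtering. A usage sketch of the new parameter, assuming this function is the one exposed as jieba.analyse.extract_tags in released jieba versions (the sample text is arbitrary and the returned keywords depend on the dictionary and IDF file in use):

# Usage sketch for the new allowPOS parameter.
import jieba.analyse

text = "此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元"

# Keep only place names ('ns') and general nouns ('n').
print(jieba.analyse.extract_tags(text, topK=5, allowPOS=['ns', 'n']))

# withWeight=True returns (word, weight) pairs instead of bare words.
for word, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
    print(word, weight)

One side note on the signature: a mutable list as a default argument is harmless here because it is never mutated (the code re-wraps it in a frozenset before use), but a tuple or frozenset would be the more conventional Python default.
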
@@ -48,15 +48,17 @@ class UndirectWeightedGraph:
         return ws


-def textrank(sentence, topK=10, withWeight=False):
+def textrank(sentence, topK=10, withWeight=False, allowPOS=['ns', 'n', 'vn', 'v']):
    """
    Extract keywords from sentence using TextRank algorithm.
    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
                      if False, return a list of words.
+       - allowPOS: the allowed POS list, e.g. ['ns', 'n', 'vn', 'v'].
+                   If the POS of w is not in this list, it will be filtered.
    """
-    pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
+    pos_filt = frozenset(allowPOS)
    g = UndirectWeightedGraph()
    cm = collections.defaultdict(int)
    span = 5
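
The same allowPOS knob now drives textrank's POS filter, replacing the previously hard-coded frozenset. A usage sketch, under the assumption that this function is reachable as jieba.analyse.textrank as in released jieba versions (results vary with the dictionary in use):

# Usage sketch: TextRank keyword extraction with a custom POS whitelist.
import jieba.analyse

text = "此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本由7000万元增加到5亿元"

for word, weight in jieba.analyse.textrank(text, topK=5, withWeight=True,
                                           allowPOS=['ns', 'n', 'vn', 'v']):
    print(word, weight)
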