讓 jieba 可以自行增加 stop words 語料庫

1. 增加範例 stop words 語料庫
2. 為了讓 jieba 可以切換 stop words 語料庫,新增 set_stop_words 方法,並改寫 extract_tags
3. test 增加 extract_tags_stop_words.py 測試範例
This commit is contained in:
Fukuball Lin 2014-08-06 03:35:16 +08:00
parent 7198d562f1
commit b658ee69cb
3 changed files with 103 additions and 4 deletions

51
extra_dict/stop_words.txt Normal file
View File

@ -0,0 +1,51 @@
the
of
is
and
to
in
that
we
for
an
are
by
be
as
on
with
can
if
from
which
you
it
this
then
at
have
all
not
one
has
or
that
一個
沒有
我們
你們
妳們
他們
她們
是否

View File

@ -8,7 +8,11 @@ except ImportError:
# Directory of this module, resolved against the current working directory.
_curpath = os.path.normpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__))
)

# Default IDF table: the "idf.txt" file located in the module directory.
abs_path = os.path.join(_curpath, "idf.txt")
IDF_DICTIONARY = abs_path

# Built-in English stop words. extract_tags() skips every word whose
# lowercase form is in this set; set_stop_words() adds further entries.
STOP_WORDS = set([
    "the", "of", "is", "and", "to", "in", "that", "we",
    "for", "an", "are", "by", "be", "as", "on", "with",
    "can", "if", "from", "which", "you", "it", "this", "then",
    "at", "have", "all", "not", "one", "has", "or",
])
def set_idf_path(idf_path):
global IDF_DICTIONARY
@ -28,17 +32,28 @@ def get_idf(abs_path):
median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
return idf_freq, median_idf
def set_stop_words(stop_words_path):
    """Add the words listed in a stop-words file to the STOP_WORDS set.

    The file is read as UTF-8 with one stop word per line. Surrounding
    whitespace is removed from every line and blank lines are ignored.
    The words are added to the existing set; nothing is removed from it.

    Args:
        stop_words_path: Path of the stop-words file. A relative path is
            resolved against the current working directory.

    Raises:
        Exception: If the resolved path does not exist.
    """
    global STOP_WORDS
    abs_path = os.path.normpath(os.path.join(os.getcwd(), stop_words_path))
    if not os.path.exists(abs_path):
        raise Exception("jieba: path does not exist:" + abs_path)
    # Context manager so the handle is closed even when decoding fails.
    with open(abs_path, 'rb') as stop_words_file:
        content = stop_words_file.read().decode('utf-8')
    for line in content.splitlines():
        # A trailing newline or CRLF line endings would otherwise put ''
        # or 'word\r' into the set, and such entries can never match a token.
        word = line.strip()
        if word:
            STOP_WORDS.add(word)
def extract_tags(sentence,topK=20):
global IDF_DICTIONARY
global STOP_WORDS
idf_freq, median_idf = get_idf(IDF_DICTIONARY)
stop_words= set([
"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
])
words = jieba.cut(sentence)
freq = {}
for w in words:
if len(w.strip())<2: continue
if w.lower() in stop_words: continue
if w.lower() in STOP_WORDS: continue
freq[w]=freq.get(w,0.0)+1.0
total = sum(freq.values())
freq = [(k,v/total) for k,v in freq.iteritems()]

View File

@ -0,0 +1,33 @@
# Example script: extract the top-K keywords of a text file with
# jieba.analyse, after loading a custom stop-words list and IDF table.
#
# The "../" paths below are relative to the current working directory.
import sys
sys.path.append('../')  # make the jieba package in the parent directory importable

import jieba
import jieba.analyse
from optparse import OptionParser

USAGE = "usage: python extract_tags_stop_words.py [file name] -k [top k]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()

# The input file name is the only required positional argument.
if len(args) < 1:
    print(USAGE)
    sys.exit(1)

file_name = args[0]

# Fall back to 10 keywords when -k is not given.
if opt.topK is None:
    topK = 10
else:
    topK = int(opt.topK)

# Read the raw bytes and close the handle right away.
with open(file_name, 'rb') as input_file:
    content = input_file.read()

# Extend the stop-words set and switch the IDF table before extracting.
jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
jieba.analyse.set_idf_path("../extra_dict/idf.txt.big")

tags = jieba.analyse.extract_tags(content, topK=topK)

# Single-argument parenthesised print behaves the same on Python 2 and 3.
print(",".join(tags))