mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
讓 jieba 可以自行增加 stop words 語料庫
1. 增加範例 stop words 語料庫 2. 為了讓 jieba 可以切換 stop words 語料庫,新增 set_stop_words 方法,並改寫 extract_tags 3. test 增加 extract_tags_stop_words.py 測試範例
This commit is contained in:
parent
7198d562f1
commit
b658ee69cb
51
extra_dict/stop_words.txt
Normal file
51
extra_dict/stop_words.txt
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
the
|
||||||
|
of
|
||||||
|
is
|
||||||
|
and
|
||||||
|
to
|
||||||
|
in
|
||||||
|
that
|
||||||
|
we
|
||||||
|
for
|
||||||
|
an
|
||||||
|
are
|
||||||
|
by
|
||||||
|
be
|
||||||
|
as
|
||||||
|
on
|
||||||
|
with
|
||||||
|
can
|
||||||
|
if
|
||||||
|
from
|
||||||
|
which
|
||||||
|
you
|
||||||
|
it
|
||||||
|
this
|
||||||
|
then
|
||||||
|
at
|
||||||
|
have
|
||||||
|
all
|
||||||
|
not
|
||||||
|
one
|
||||||
|
has
|
||||||
|
or
|
||||||
|
that
|
||||||
|
的
|
||||||
|
了
|
||||||
|
和
|
||||||
|
是
|
||||||
|
就
|
||||||
|
都
|
||||||
|
而
|
||||||
|
及
|
||||||
|
與
|
||||||
|
著
|
||||||
|
或
|
||||||
|
一個
|
||||||
|
沒有
|
||||||
|
我們
|
||||||
|
你們
|
||||||
|
妳們
|
||||||
|
他們
|
||||||
|
她們
|
||||||
|
是否
|
@ -8,7 +8,11 @@ except ImportError:
|
|||||||
|
|
||||||
# Absolute directory of this module, resolved against the current working
# directory so the bundled data files load no matter where the caller runs from.
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

# Default IDF dictionary shipped next to this module.
abs_path = os.path.join(_curpath, "idf.txt")

# Path of the IDF dictionary currently in use; overridable via set_idf_path().
IDF_DICTIONARY = abs_path

# Default English stop words filtered out of extract_tags() results.
# The original literal listed "that" twice; a set literal keeps the contents
# identical (sets deduplicate anyway) while dropping the redundant entry.
STOP_WORDS = {
    "the", "of", "is", "and", "to", "in", "that", "we", "for", "an",
    "are", "by", "be", "as", "on", "with", "can", "if", "from", "which",
    "you", "it", "this", "then", "at", "have", "all", "not", "one",
    "has", "or",
}
|
||||||
|
|
||||||
def set_idf_path(idf_path):
|
def set_idf_path(idf_path):
|
||||||
global IDF_DICTIONARY
|
global IDF_DICTIONARY
|
||||||
@ -28,17 +32,28 @@ def get_idf(abs_path):
|
|||||||
median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
|
median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
|
||||||
return idf_freq, median_idf
|
return idf_freq, median_idf
|
||||||
|
|
||||||
|
def set_stop_words(stop_words_path):
    """Load extra stop words from a UTF-8 text file, one word per line.

    The words are added to the module-level STOP_WORDS set consulted by
    extract_tags(); existing entries are kept.

    Raises:
        Exception: if stop_words_path does not resolve to an existing file.
    """
    global STOP_WORDS
    abs_path = os.path.normpath(os.path.join(os.getcwd(), stop_words_path))
    if not os.path.exists(abs_path):
        raise Exception("jieba: path does not exist:" + abs_path)
    # Close the handle deterministically (the original leaked it).
    with open(abs_path, 'rb') as f:
        content = f.read().decode('utf-8')
    for line in content.split('\n'):
        # Strip so CRLF files don't yield '\r'-suffixed words that never
        # match, and skip blanks (the original added '' to the set).
        word = line.strip()
        if word:
            STOP_WORDS.add(word)
|
||||||
|
|
||||||
def extract_tags(sentence,topK=20):
|
def extract_tags(sentence,topK=20):
|
||||||
global IDF_DICTIONARY
|
global IDF_DICTIONARY
|
||||||
|
global STOP_WORDS
|
||||||
|
|
||||||
idf_freq, median_idf = get_idf(IDF_DICTIONARY)
|
idf_freq, median_idf = get_idf(IDF_DICTIONARY)
|
||||||
stop_words= set([
|
|
||||||
"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
|
|
||||||
])
|
|
||||||
words = jieba.cut(sentence)
|
words = jieba.cut(sentence)
|
||||||
freq = {}
|
freq = {}
|
||||||
for w in words:
|
for w in words:
|
||||||
if len(w.strip())<2: continue
|
if len(w.strip())<2: continue
|
||||||
if w.lower() in stop_words: continue
|
if w.lower() in STOP_WORDS: continue
|
||||||
freq[w]=freq.get(w,0.0)+1.0
|
freq[w]=freq.get(w,0.0)+1.0
|
||||||
total = sum(freq.values())
|
total = sum(freq.values())
|
||||||
freq = [(k,v/total) for k,v in freq.iteritems()]
|
freq = [(k,v/total) for k,v in freq.iteritems()]
|
||||||
|
33
test/extract_tags_stop_words.py
Normal file
33
test/extract_tags_stop_words.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
import sys
|
||||||
|
sys.path.append('../')
|
||||||
|
|
||||||
|
import jieba
|
||||||
|
import jieba.analyse
|
||||||
|
from optparse import OptionParser
|
||||||
|
|
||||||
|
USAGE = "usage: python extract_tags_stop_words.py [file name] -k [top k]"
|
||||||
|
|
||||||
|
parser = OptionParser(USAGE)
|
||||||
|
parser.add_option("-k", dest="topK")
|
||||||
|
opt, args = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if len(args) < 1:
|
||||||
|
print USAGE
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
file_name = args[0]
|
||||||
|
|
||||||
|
if opt.topK is None:
|
||||||
|
topK = 10
|
||||||
|
else:
|
||||||
|
topK = int(opt.topK)
|
||||||
|
|
||||||
|
content = open(file_name, 'rb').read()
|
||||||
|
|
||||||
|
jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
|
||||||
|
jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
|
||||||
|
|
||||||
|
tags = jieba.analyse.extract_tags(content, topK=topK)
|
||||||
|
|
||||||
|
print ",".join(tags)
|
Loading…
x
Reference in New Issue
Block a user