Mirror of https://github.com/fxsjy/jieba.git (synced 2025-07-10 00:01:33 +08:00)
Merge pull request #174 from fukuball/master
Allow jieba to switch the IDF corpus and the stop words corpus
This commit is contained in:
commit 16d626d347
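In short, this PR adds jieba.analyse.set_idf_path() and jieba.analyse.set_stop_words() so that extract_tags() can run against a custom IDF corpus and a custom stop words list. A minimal usage sketch, assuming the relative paths used by the new test scripts in this PR; the input sentence is illustrative only:

#encoding=utf-8
# Sketch only: paths follow the test scripts added in this PR; the sentence is illustrative.
import jieba
import jieba.analyse

jieba.analyse.set_idf_path("../extra_dict/idf.txt.big")       # switch the IDF corpus
jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")  # switch the stop words list

tags = jieba.analyse.extract_tags(u"我们在野生动物园玩", topK=10)
print ",".join(tags)  # Python 2 print statement, matching the code base of this era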
176239 extra_dict/idf.txt.big Normal file
File diff suppressed because it is too large
51 extra_dict/stop_words.txt Normal file
@@ -0,0 +1,51 @@
the
of
is
and
to
in
that
we
for
an
are
by
be
as
on
with
can
if
from
which
you
it
this
then
at
have
all
not
one
has
or
that
的
了
和
是
就
都
而
及
與
著
或
一個
沒有
我們
你們
妳們
他們
她們
是否
jieba/__init__.py
@@ -39,7 +39,7 @@ def gen_trie(f_name):
     trie = {}
     ltotal = 0.0
     with open(f_name, 'rb') as f:
-        lineno = 0
+        lineno = 0
         for line in f.read().rstrip().decode('utf-8').split('\n'):
             lineno += 1
             try:
@@ -134,7 +134,7 @@ def __cut_all(sentence):
     for k,L in dag.iteritems():
         if len(L)==1 and k>old_j:
             yield sentence[k:L[0]+1]
-            old_j = L[0]
+            old_j = L[0]
         else:
             for j in L:
                 if j>k:
@@ -195,7 +195,7 @@ def __cut_DAG_NO_HMM(sentence):
             if len(buf)>0:
                 yield buf
                 buf = u''
-            yield l_word
+            yield l_word
             x =y
     if len(buf)>0:
         yield buf
@@ -227,7 +227,7 @@ def __cut_DAG(sentence):
                         for elem in buf:
                             yield elem
                     buf=u''
-            yield l_word
+            yield l_word
             x =y

     if len(buf)>0:
@@ -243,8 +243,8 @@ def __cut_DAG(sentence):
                 yield elem

 def cut(sentence,cut_all=False,HMM=True):
-    '''The main function that segments an entire sentence that contains
-    Chinese characters into seperated words.
+    '''The main function that segments an entire sentence that contains
+    Chinese characters into seperated words.
     Parameter:
         - sentence: The String to be segmented
         - cut_all: Model. True means full pattern, false means accurate pattern.
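As an aside, a minimal sketch of the two modes this docstring describes; the sample sentence is illustrative and the syntax is Python 2 to match the code above:

#encoding=utf-8
import jieba

sent = u"我来到北京清华大学"
print "/".join(jieba.cut(sent))                 # accurate pattern (default), HMM enabled
print "/".join(jieba.cut(sent, cut_all=True))   # full pattern
print "/".join(jieba.cut(sent, HMM=False))      # accurate pattern without HMM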
@@ -257,8 +257,8 @@ def cut(sentence,cut_all=False,HMM=True):
             sentence = sentence.decode('gbk','ignore')
     '''
     \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
-    \r\n|\s : whitespace characters. Will not be Handled.
-    '''
+    \r\n|\s : whitespace characters. Will not be Handled.
+    '''
     re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
     if cut_all:
         re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
@@ -306,7 +306,7 @@ def load_userdict(f):
     ''' Load personalized dict to improve detect rate.
     Parameter:
         - f : A plain text file contains words and their ocurrences.
-    Structure of dict file:
+    Structure of dict file:
     word1 freq1 word_type1
     word2 freq2 word_type2
     ...
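A small sketch of load_userdict() with the file layout described in this docstring; the file name userdict.txt and its entries are hypothetical:

#encoding=utf-8
import jieba

# userdict.txt, one "word freq word_type" entry per line, for example:
#   云计算 5 n
#   创新办 3 i
jieba.load_userdict("userdict.txt")
print "/".join(jieba.cut(u"创新办主任也是云计算方面的专家"))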
@@ -372,7 +372,7 @@ def enable_parallel(processnum=None):
     def pcut(sentence,cut_all=False,HMM=True):
         parts = re.compile('([\r\n]+)').split(sentence)
         if cut_all:
-            result = pool.map(__lcut_all,parts)
+            result = pool.map(__lcut_all,parts)
         else:
             if HMM:
                 result = pool.map(__lcut,parts)
@@ -418,7 +418,7 @@ def tokenize(unicode_sentence,mode="default",HMM=True):
     #mode ("default" or "search")
     if not isinstance(unicode_sentence, unicode):
         raise Exception("jieba: the input parameter should unicode.")
-    start = 0
+    start = 0
     if mode=='default':
         for w in cut(unicode_sentence,HMM=HMM):
             width = len(w)
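For reference, a minimal sketch of tokenize(), which takes a unicode string and yields (word, start, end) tuples; the sentence is illustrative:

#encoding=utf-8
import jieba

for tk in jieba.tokenize(u"永和服装饰品有限公司"):
    print "word %s\t\t start: %d \t\t end: %d" % (tk[0], tk[1], tk[2])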
jieba/analyse/__init__.py
@@ -1,3 +1,4 @@
+#encoding=utf-8
 import jieba
 import os
 try:
@@ -5,27 +6,54 @@ try:
 except ImportError:
     pass

-_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
-f_name = os.path.join(_curpath,"idf.txt")
-content = open(f_name,'rb').read().decode('utf-8')
+_curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+abs_path = os.path.join(_curpath, "idf.txt")

-idf_freq = {}
-lines = content.split('\n')
-for line in lines:
-    word,freq = line.split(' ')
-    idf_freq[word] = float(freq)
-
-median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
-stop_words= set([
-"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
+IDF_DICTIONARY = abs_path
+STOP_WORDS = set([
+"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
 ])

+def set_idf_path(idf_path):
+    global IDF_DICTIONARY
+    abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path ) )
+    if not os.path.exists(abs_path):
+        raise Exception("jieba: path does not exist:" + abs_path)
+    IDF_DICTIONARY = abs_path
+    return
+
+def get_idf(abs_path):
+    content = open(abs_path,'rb').read().decode('utf-8')
+    idf_freq = {}
+    lines = content.split('\n')
+    for line in lines:
+        word,freq = line.split(' ')
+        idf_freq[word] = float(freq)
+    median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
+    return idf_freq, median_idf
+
+def set_stop_words(stop_words_path):
+    global STOP_WORDS
+    abs_path = os.path.normpath( os.path.join( os.getcwd(), stop_words_path ) )
+    if not os.path.exists(abs_path):
+        raise Exception("jieba: path does not exist:" + abs_path)
+    content = open(abs_path,'rb').read().decode('utf-8')
+    lines = content.split('\n')
+    for line in lines:
+        STOP_WORDS.add(line)
+    return
+
 def extract_tags(sentence,topK=20):
+    global IDF_DICTIONARY
+    global STOP_WORDS
+
+    idf_freq, median_idf = get_idf(IDF_DICTIONARY)
+
     words = jieba.cut(sentence)
     freq = {}
     for w in words:
         if len(w.strip())<2: continue
-        if w.lower() in stop_words: continue
+        if w.lower() in STOP_WORDS: continue
         freq[w]=freq.get(w,0.0)+1.0
     total = sum(freq.values())
     freq = [(k,v/total) for k,v in freq.iteritems()]
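As get_idf() above shows, the corpus handed to set_idf_path() is expected to contain one word and its IDF value per line, separated by a single space. A hypothetical snippet with made-up values, parsed the same way:

#encoding=utf-8
# Made-up entries, only to illustrate the "word idf_value" layout that get_idf() parses.
sample = u"劳动防护 13.9007\n生化学 13.9007\n奥萨贝尔 13.9007"
idf_freq = {}
for line in sample.split('\n'):
    word, freq = line.split(' ')
    idf_freq[word] = float(freq)
median_idf = sorted(idf_freq.values())[len(idf_freq)/2]  # integer index under Python 2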
32 test/extract_tags_idfpath.py Normal file
@@ -0,0 +1,32 @@
import sys
sys.path.append('../')

import jieba
import jieba.analyse
from optparse import OptionParser

USAGE = "usage: python extract_tags_idfpath.py [file name] -k [top k]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()


if len(args) < 1:
    print USAGE
    sys.exit(1)

file_name = args[0]

if opt.topK is None:
    topK = 10
else:
    topK = int(opt.topK)

content = open(file_name, 'rb').read()

jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");

tags = jieba.analyse.extract_tags(content, topK=topK)

print ",".join(tags)
33 test/extract_tags_stop_words.py Normal file
@@ -0,0 +1,33 @@
import sys
sys.path.append('../')

import jieba
import jieba.analyse
from optparse import OptionParser

USAGE = "usage: python extract_tags_stop_words.py [file name] -k [top k]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()


if len(args) < 1:
    print USAGE
    sys.exit(1)

file_name = args[0]

if opt.topK is None:
    topK = 10
else:
    topK = int(opt.topK)

content = open(file_name, 'rb').read()

jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");

tags = jieba.analyse.extract_tags(content, topK=topK)

print ",".join(tags)