mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
fix performance problem of extract_tags
This commit is contained in:
parent
7f965e0aa3
commit
eb98eb9248
@ -9,30 +9,43 @@ except ImportError:
|
|||||||
_curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
_curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||||
abs_path = os.path.join(_curpath, "idf.txt")
|
abs_path = os.path.join(_curpath, "idf.txt")
|
||||||
|
|
||||||
IDF_DICTIONARY = abs_path
|
|
||||||
STOP_WORDS = set([
|
STOP_WORDS = set([
|
||||||
"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
|
"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
|
||||||
])
|
])
|
||||||
|
|
||||||
def set_idf_path(idf_path):
|
class IDFLoader:
|
||||||
global IDF_DICTIONARY
|
def __init__(self):
|
||||||
abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path ) )
|
self.path = ""
|
||||||
if not os.path.exists(abs_path):
|
self.idf_freq = {}
|
||||||
raise Exception("jieba: path does not exist:" + abs_path)
|
self.median_idf = 0.0
|
||||||
IDF_DICTIONARY = abs_path
|
|
||||||
return
|
|
||||||
|
|
||||||
def get_idf(abs_path):
|
def set_new_path(self, new_idf_path):
|
||||||
content = open(abs_path,'rb').read().decode('utf-8')
|
if self.path != new_idf_path:
|
||||||
idf_freq = {}
|
content = open(new_idf_path,'rb').read().decode('utf-8')
|
||||||
lines = content.split('\n')
|
idf_freq = {}
|
||||||
if lines and not lines[-1]:
|
lines = content.split('\n')
|
||||||
lines.pop(-1)
|
if lines and not lines[-1]:
|
||||||
for line in lines:
|
lines.pop(-1)
|
||||||
word,freq = line.split(' ')
|
for line in lines:
|
||||||
idf_freq[word] = float(freq)
|
word,freq = line.split(' ')
|
||||||
median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
|
idf_freq[word] = float(freq)
|
||||||
return idf_freq, median_idf
|
median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
|
||||||
|
self.idf_freq = idf_freq
|
||||||
|
self.median_idf = median_idf
|
||||||
|
self.path = new_idf_path
|
||||||
|
|
||||||
|
def get_idf(self):
|
||||||
|
return self.idf_freq, self.median_idf
|
||||||
|
|
||||||
|
idf_loader = IDFLoader()
|
||||||
|
idf_loader.set_new_path(abs_path)
|
||||||
|
|
||||||
|
def set_idf_path(idf_path):
|
||||||
|
new_abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path ) )
|
||||||
|
if not os.path.exists(new_abs_path):
|
||||||
|
raise Exception("jieba: path does not exist:" + new_abs_path)
|
||||||
|
idf_loader.set_new_path(new_abs_path)
|
||||||
|
return
|
||||||
|
|
||||||
def set_stop_words(stop_words_path):
|
def set_stop_words(stop_words_path):
|
||||||
global STOP_WORDS
|
global STOP_WORDS
|
||||||
@ -46,10 +59,9 @@ def set_stop_words(stop_words_path):
|
|||||||
return
|
return
|
||||||
|
|
||||||
def extract_tags(sentence,topK=20):
|
def extract_tags(sentence,topK=20):
|
||||||
global IDF_DICTIONARY
|
|
||||||
global STOP_WORDS
|
global STOP_WORDS
|
||||||
|
|
||||||
idf_freq, median_idf = get_idf(IDF_DICTIONARY)
|
idf_freq, median_idf = idf_loader.get_idf()
|
||||||
|
|
||||||
words = jieba.cut(sentence)
|
words = jieba.cut(sentence)
|
||||||
freq = {}
|
freq = {}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user