fix performance problem of extract_tags

This commit is contained in:
fxsjy 2014-10-10 15:41:28 +08:00
parent 7f965e0aa3
commit eb98eb9248

View File

@ -9,30 +9,43 @@ except ImportError:
_curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) ) _curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
abs_path = os.path.join(_curpath, "idf.txt") abs_path = os.path.join(_curpath, "idf.txt")
IDF_DICTIONARY = abs_path
STOP_WORDS = set([ STOP_WORDS = set([
"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that" "the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
]) ])
def set_idf_path(idf_path): class IDFLoader:
global IDF_DICTIONARY def __init__(self):
abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path ) ) self.path = ""
if not os.path.exists(abs_path): self.idf_freq = {}
raise Exception("jieba: path does not exist:" + abs_path) self.median_idf = 0.0
IDF_DICTIONARY = abs_path
return
def get_idf(abs_path): def set_new_path(self, new_idf_path):
content = open(abs_path,'rb').read().decode('utf-8') if self.path != new_idf_path:
idf_freq = {} content = open(new_idf_path,'rb').read().decode('utf-8')
lines = content.split('\n') idf_freq = {}
if lines and not lines[-1]: lines = content.split('\n')
lines.pop(-1) if lines and not lines[-1]:
for line in lines: lines.pop(-1)
word,freq = line.split(' ') for line in lines:
idf_freq[word] = float(freq) word,freq = line.split(' ')
median_idf = sorted(idf_freq.values())[len(idf_freq)/2] idf_freq[word] = float(freq)
return idf_freq, median_idf median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
self.idf_freq = idf_freq
self.median_idf = median_idf
self.path = new_idf_path
def get_idf(self):
return self.idf_freq, self.median_idf
idf_loader = IDFLoader()
idf_loader.set_new_path(abs_path)
def set_idf_path(idf_path):
new_abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path ) )
if not os.path.exists(new_abs_path):
raise Exception("jieba: path does not exist:" + new_abs_path)
idf_loader.set_new_path(new_abs_path)
return
def set_stop_words(stop_words_path): def set_stop_words(stop_words_path):
global STOP_WORDS global STOP_WORDS
@ -46,10 +59,9 @@ def set_stop_words(stop_words_path):
return return
def extract_tags(sentence,topK=20): def extract_tags(sentence,topK=20):
global IDF_DICTIONARY
global STOP_WORDS global STOP_WORDS
idf_freq, median_idf = get_idf(IDF_DICTIONARY) idf_freq, median_idf = idf_loader.get_idf()
words = jieba.cut(sentence) words = jieba.cut(sentence)
freq = {} freq = {}