Make lazy loading thread-safe

This commit is contained in:
fxsjy 2013-04-26 12:54:05 +08:00
parent d2460029d5
commit bc049090a5
3 changed files with 70 additions and 36 deletions

View File

@ -9,9 +9,10 @@ import tempfile
import marshal
from math import log
import random
import threading
DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
trie = None # to be initialized
FREQ = {}
min_freq = 0.0
@ -45,13 +46,19 @@ def gen_trie(f_name):
def initialize(dictionary=DICTIONARY):
global trie, FREQ, total, min_freq, initialized
with DICT_LOCK:
if initialized:
return
if trie:
del trie
trie = None
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
print >> sys.stderr, "Building Trie..."
t1 = time.time()
cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache")
load_from_cache_fail = True
if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(os.path.join(_curpath,"dict.txt")):
if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(os.path.join(_curpath,dictionary)):
print >> sys.stderr, "loading model from cache"
try:
trie,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
@ -111,7 +118,6 @@ def calc(sentence,DAG,idx,route):
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0],x ) for x in DAG[idx] ]
route[idx] = max(candidates)
@require_initialized
def get_DAG(sentence):
N = len(sentence)
@ -173,7 +179,6 @@ def __cut_DAG(sentence):
regognized = finalseg.cut(buf)
for t in regognized:
yield t
def cut(sentence,cut_all=False):
if not ( type(sentence) is unicode):
try:
@ -201,7 +206,6 @@ def cut(sentence,cut_all=False):
else:
for xx in x:
yield xx
def cut_for_search(sentence):
words = cut(sentence)
for w in words:
@ -252,6 +256,7 @@ def __lcut_all(sentence):
def __lcut_for_search(sentence):
    # Eager variant: drain the search-mode segmentation generator
    # into a list before returning.
    words = __ref_cut_for_search(sentence)
    return list(words)
@require_initialized
def enable_parallel(processnum):
global pool,cut,cut_for_search
if os.name=='nt':
@ -290,6 +295,6 @@ def disable_parallel():
def set_dictionary(dictionary_path):
global initialized, DICTIONARY
with DICT_LOCK:
DICTIONARY = dictionary_path
if initialized:
initialize()
initialized = False

View File

@ -15,6 +15,6 @@ tm_cost = t2-t1
log_f = open("1.log","wb")
for w in words:
print >> log_f, w.encode("gbk"), "/" ,
print 'cost',tm_cost
print 'speed' , len(content)/tm_cost, " bytes/second"

29
test/test_multithread.py Normal file
View File

@ -0,0 +1,29 @@
#encoding=utf-8
import sys
import threading
sys.path.append("../")
import jieba
class Worker(threading.Thread):
def run(self):
seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
print "Full Mode:" + "/ ".join(seg_list) #全模式
seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
print "Default Mode:" + "/ ".join(seg_list) #默认模式
seg_list = jieba.cut("他来到了网易杭研大厦")
print ", ".join(seg_list)
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
print ", ".join(seg_list)
# Spawn ten concurrent workers to hammer jieba's lazy initialization,
# then wait for every one of them to finish.
workers = []
for _ in xrange(10):
    w = Worker()
    w.start()
    workers.append(w)
for w in workers:
    w.join()