mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
improve the loading and caching process
This commit is contained in:
parent
d6ef07a472
commit
e3f3dcccba
@ -14,6 +14,7 @@ import random
|
|||||||
import threading
|
import threading
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
import logging
|
import logging
|
||||||
|
from hashlib import md5
|
||||||
|
|
||||||
DICTIONARY = "dict.txt"
|
DICTIONARY = "dict.txt"
|
||||||
DICT_LOCK = threading.RLock()
|
DICT_LOCK = threading.RLock()
|
||||||
@ -53,12 +54,10 @@ def gen_pfdict(f_name):
|
|||||||
raise ValueError, e
|
raise ValueError, e
|
||||||
return pfdict, lfreq, ltotal
|
return pfdict, lfreq, ltotal
|
||||||
|
|
||||||
def initialize(*args):
|
def initialize(dictionary=None):
|
||||||
global pfdict, FREQ, total, min_freq, initialized
|
global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK
|
||||||
if not args:
|
if not dictionary:
|
||||||
dictionary = DICTIONARY
|
dictionary = DICTIONARY
|
||||||
else:
|
|
||||||
dictionary = args[0]
|
|
||||||
with DICT_LOCK:
|
with DICT_LOCK:
|
||||||
if initialized:
|
if initialized:
|
||||||
return
|
return
|
||||||
@ -67,13 +66,13 @@ def initialize(*args):
|
|||||||
pfdict = None
|
pfdict = None
|
||||||
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||||
|
|
||||||
abs_path = os.path.join(_curpath,dictionary)
|
abs_path = os.path.join(_curpath, dictionary)
|
||||||
logger.debug("Building prefix dict from %s ..." % abs_path)
|
logger.debug("Building prefix dict from %s ..." % abs_path)
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
|
if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
|
||||||
cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
|
cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
|
||||||
else: #custom dictionary
|
else: #custom dictionary
|
||||||
cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))
|
cache_file = os.path.join(tempfile.gettempdir(), "jieba.u%s.cache" % md5(abs_path.encode('utf-8', 'replace')).hexdigest())
|
||||||
|
|
||||||
load_from_cache_fail = True
|
load_from_cache_fail = True
|
||||||
if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
|
if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
|
||||||
@ -87,18 +86,18 @@ def initialize(*args):
|
|||||||
|
|
||||||
if load_from_cache_fail:
|
if load_from_cache_fail:
|
||||||
pfdict,FREQ,total = gen_pfdict(abs_path)
|
pfdict,FREQ,total = gen_pfdict(abs_path)
|
||||||
FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize
|
FREQ = dict((k,log(float(v)/total)) for k,v in FREQ.iteritems()) #normalize
|
||||||
min_freq = min(FREQ.itervalues())
|
min_freq = min(FREQ.itervalues())
|
||||||
logger.debug("Dumping model to file cache %s" % cache_file)
|
logger.debug("Dumping model to file cache %s" % cache_file)
|
||||||
try:
|
try:
|
||||||
tmp_suffix = "."+str(random.random())
|
fd, fpath = tempfile.mkstemp()
|
||||||
with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
|
with os.fdopen(fd, 'wb') as temp_cache_file:
|
||||||
marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
|
marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
|
||||||
if os.name == 'nt':
|
if os.name == 'nt':
|
||||||
from shutil import move as replace_file
|
from shutil import move as replace_file
|
||||||
else:
|
else:
|
||||||
replace_file = os.rename
|
replace_file = os.rename
|
||||||
replace_file(cache_file + tmp_suffix, cache_file)
|
replace_file(fpath, cache_file)
|
||||||
except:
|
except:
|
||||||
logger.exception("Dump cache file failed.")
|
logger.exception("Dump cache file failed.")
|
||||||
|
|
||||||
@ -136,12 +135,11 @@ def __cut_all(sentence):
|
|||||||
old_j = j
|
old_j = j
|
||||||
|
|
||||||
|
|
||||||
def calc(sentence,DAG,idx,route):
|
def calc(sentence, DAG, idx, route):
|
||||||
N = len(sentence)
|
N = len(sentence)
|
||||||
route[N] = (0.0, '')
|
route[N] = (0.0, '')
|
||||||
for idx in xrange(N-1, -1, -1):
|
for idx in xrange(N-1, -1, -1):
|
||||||
candidates = [(FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx]]
|
route[idx] = max((FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx])
|
||||||
route[idx] = max(candidates)
|
|
||||||
|
|
||||||
@require_initialized
|
@require_initialized
|
||||||
def get_DAG(sentence):
|
def get_DAG(sentence):
|
||||||
@ -166,7 +164,7 @@ def __cut_DAG_NO_HMM(sentence):
|
|||||||
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
|
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
|
||||||
DAG = get_DAG(sentence)
|
DAG = get_DAG(sentence)
|
||||||
route = {}
|
route = {}
|
||||||
calc(sentence, DAG, 0, route=route)
|
calc(sentence, DAG, 0, route)
|
||||||
x = 0
|
x = 0
|
||||||
N = len(sentence)
|
N = len(sentence)
|
||||||
buf = u''
|
buf = u''
|
||||||
|
Loading…
x
Reference in New Issue
Block a user