Add initialize function and lazy initialization

2025-07-24 00:00:05 +08:00 · 2013-04-25 21:04:56 +09:00 · 2013-04-25 21:04:56 +09:00 · c6098a8657
commit c6098a8657
parent 87c2799692
4 changed files with 718157 additions and 718116 deletions
--- a/README.md
+++ b/README.md
@ -229,6 +229,17 @@ Code sample (keyword extraction)
 	https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
 Using Other Dictionaries
 ========
 It is possible to supply Jieba with your own custom dictionary, and there are also two dictionaries readily available for download:
 1. You can employ a smaller dictionary to use less memory:
 https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small
 2. There is also a bigger file that has better support for traditional characters (繁體):
 https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
 In either case, download the file you want first, and then call `jieba.load_userdict('dict.txt.small')` or just replace the existing `dict.txt`.
 Segmentation speed
 =========
--- a/jieba/init.py
+++ b/jieba/init.py
@ -9,9 +9,13 @@ import marshal
 from math import log
 import random
-FREQ = {}
+DICTIONARY = "dict.txt"
 total =0.0
 trie = None # to be initialized
 FREQ = {}
 min_freq = 0.0
 total =0.0
 initialized = False
 def gen_trie(f_name):
 	lfreq = {}
@ -31,7 +35,8 @@ def gen_trie(f_name):
 		p['']='' #ending flag
 	return trie, lfreq,ltotal
-
+def initialize(dictionary=DICTIONARY):
 	global trie, FREQ, total, min_freq, initialized
 	_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) )  )
 	print >> sys.stderr, "Building Trie..."
@ -47,7 +52,7 @@ if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(
 			load_from_cache_fail = True
 	if load_from_cache_fail:
-	trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt"))
+		trie,FREQ,total = gen_trie(os.path.join(_curpath, dictionary))
 		FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize
 		min_freq = min(FREQ.itervalues())
 		print >> sys.stderr, "dumping model to file cache"
@ -60,10 +65,24 @@ if load_from_cache_fail:
 			replace_file = os.rename
 		replace_file(cache_file+tmp_suffix,cache_file)
 	initialized = True
 	print >> sys.stderr, "loading model cost ", time.time() - t1, "seconds."
 	print >> sys.stderr, "Trie has been built succesfully."
 def require_initialized(fn):
 	"""Decorator that makes sure the dictionary is loaded before `fn` runs.
 	While the module-level `initialized` flag is false, calling the wrapper
 	first runs `initialize()` and then `fn`; once the flag is true the call
 	goes straight to `fn`.
 	Returns a wrapper that forwards all positional and keyword arguments to
 	`fn` and returns its result.
 	"""
 	from functools import wraps  # local import: keeps the block self-contained
 	@wraps(fn)  # keep fn's __name__ / __doc__ visible on the wrapper
 	def wrapped(*args, **kwargs):
 		if not initialized:
 			# Pass DICTIONARY explicitly: the default argument of initialize()
 			# was evaluated once, when that function was defined, so a bare
 			# initialize() would ignore a path stored later by set_dictionary().
 			initialize(DICTIONARY)
 		return fn(*args, **kwargs)
 	return wrapped
 def __cut_all(sentence):
 	dag = get_DAG(sentence)
 	old_j = -1
@ -77,6 +96,7 @@ def __cut_all(sentence):
 					yield sentence[k:j+1]
 					old_j = j
 def calc(sentence,DAG,idx,route):
 	N = len(sentence)
 	route[N] = (1.0,'')
@ -84,6 +104,8 @@ def calc(sentence,DAG,idx,route):
 		candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0],x ) for x in DAG[idx] ]
 		route[idx] = max(candidates)
@require_initialized
 def get_DAG(sentence):
 	N = len(sentence)
 	i,j=0,0
@ -111,6 +133,7 @@ def get_DAG(sentence):
 			DAG[i] =[i]
 	return DAG
 def __cut_DAG(sentence):
 	DAG = get_DAG(sentence)
 	route ={}
@ -144,7 +167,6 @@ def __cut_DAG(sentence):
 			for t in regognized:
 				yield t
 def cut(sentence,cut_all=False):
 	if not ( type(sentence) is unicode):
 		try:
@ -184,6 +206,7 @@ def cut_for_search(sentence):
 					yield gram3
 		yield w
@require_initialized
 def load_userdict(f):
 	global trie,total,FREQ
 	if isinstance(f, (str, unicode)):
@ -200,3 +223,10 @@ def load_userdict(f):
 				p[c] ={}
 			p = p[c]
 		p['']='' #ending flag
 def set_dictionary(dictionary_path):
 	"""Select the main dictionary file used to build the trie.
 	Stores `dictionary_path` in the module-level DICTIONARY. If the model
 	has already been loaded (`initialized` is true) it is rebuilt right away
 	from the new path; otherwise only the global is updated.
 	dictionary_path -- passed to initialize(), which joins it with the
 	                   package directory before reading it.
 	"""
 	global DICTIONARY
 	DICTIONARY = dictionary_path
 	if initialized:
 		# The path must be passed explicitly: initialize()'s default argument
 		# was bound to the old DICTIONARY value at definition time, so calling
 		# initialize() with no argument would just reload the previous file.
 		# NOTE(review): initialize() may still prefer an on-disk cache file
 		# over the dictionary -- confirm the cache is invalidated or keyed
 		# per dictionary.
 		initialize(dictionary_path)