mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
Add initialize function and lazy initialization
This commit is contained in:
parent
87c2799692
commit
c6098a8657
11
README.md
11
README.md
@ -229,6 +229,17 @@ Code sample (keyword extraction)
|
|||||||
|
|
||||||
https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
|
https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
|
||||||
|
|
||||||
|
Using Other Dictionaries
|
||||||
|
========
|
||||||
|
It is possible to supply Jieba with your own custom dictionary, and there are also two dictionaries readily available for download:
|
||||||
|
|
||||||
|
1. You can employ a smaller dictionary to use less memory:
|
||||||
|
https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small
|
||||||
|
|
||||||
|
2. There is also a bigger file that has better support for traditional characters (繁體):
|
||||||
|
https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
|
||||||
|
|
||||||
|
In either case, download the file you want first, and then call `jieba.load_userdict('dict.txt.small')` or just replace the existing `dict.txt`.
|
||||||
|
|
||||||
Segmentation speed
|
Segmentation speed
|
||||||
=========
|
=========
|
||||||
|
@ -9,9 +9,13 @@ import marshal
|
|||||||
from math import log
|
from math import log
|
||||||
import random
|
import random
|
||||||
|
|
||||||
FREQ = {}
|
DICTIONARY = "dict.txt"
|
||||||
total =0.0
|
|
||||||
|
|
||||||
|
trie = None # to be initialized
|
||||||
|
FREQ = {}
|
||||||
|
min_freq = 0.0
|
||||||
|
total =0.0
|
||||||
|
initialized = False
|
||||||
|
|
||||||
def gen_trie(f_name):
|
def gen_trie(f_name):
|
||||||
lfreq = {}
|
lfreq = {}
|
||||||
@ -31,7 +35,8 @@ def gen_trie(f_name):
|
|||||||
p['']='' #ending flag
|
p['']='' #ending flag
|
||||||
return trie, lfreq,ltotal
|
return trie, lfreq,ltotal
|
||||||
|
|
||||||
|
def initialize(dictionary=DICTIONARY):
|
||||||
|
global trie, FREQ, total, min_freq, initialized
|
||||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||||
|
|
||||||
print >> sys.stderr, "Building Trie..."
|
print >> sys.stderr, "Building Trie..."
|
||||||
@ -47,7 +52,7 @@ if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(
|
|||||||
load_from_cache_fail = True
|
load_from_cache_fail = True
|
||||||
|
|
||||||
if load_from_cache_fail:
|
if load_from_cache_fail:
|
||||||
trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt"))
|
trie,FREQ,total = gen_trie(os.path.join(_curpath, dictionary))
|
||||||
FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize
|
FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize
|
||||||
min_freq = min(FREQ.itervalues())
|
min_freq = min(FREQ.itervalues())
|
||||||
print >> sys.stderr, "dumping model to file cache"
|
print >> sys.stderr, "dumping model to file cache"
|
||||||
@ -60,10 +65,24 @@ if load_from_cache_fail:
|
|||||||
replace_file = os.rename
|
replace_file = os.rename
|
||||||
replace_file(cache_file+tmp_suffix,cache_file)
|
replace_file(cache_file+tmp_suffix,cache_file)
|
||||||
|
|
||||||
|
initialized = True
|
||||||
|
|
||||||
print >> sys.stderr, "loading model cost ", time.time() - t1, "seconds."
|
print >> sys.stderr, "loading model cost ", time.time() - t1, "seconds."
|
||||||
print >> sys.stderr, "Trie has been built succesfully."
|
print >> sys.stderr, "Trie has been built succesfully."
|
||||||
|
|
||||||
|
|
||||||
|
def require_initialized(fn):
    """Decorate ``fn`` so the dictionary is loaded lazily before it runs.

    The returned wrapper checks the module-level ``initialized`` flag on every
    call. If the flag is false it calls ``initialize`` first, then forwards all
    positional and keyword arguments to ``fn`` and returns its result.

    The wrapper keeps ``fn``'s name and docstring (via ``functools.wraps``).
    """
    import functools  # imported locally to keep this block self-contained

    @functools.wraps(fn)
    def wrapped(*args, **kwargs):
        if not initialized:
            # Pass the current DICTIONARY explicitly: the default argument of
            # initialize() was bound when that function was defined, so a
            # bare initialize() would ignore a path set by set_dictionary().
            initialize(DICTIONARY)
        return fn(*args, **kwargs)

    return wrapped
|
||||||
|
|
||||||
|
|
||||||
def __cut_all(sentence):
|
def __cut_all(sentence):
|
||||||
dag = get_DAG(sentence)
|
dag = get_DAG(sentence)
|
||||||
old_j = -1
|
old_j = -1
|
||||||
@ -77,6 +96,7 @@ def __cut_all(sentence):
|
|||||||
yield sentence[k:j+1]
|
yield sentence[k:j+1]
|
||||||
old_j = j
|
old_j = j
|
||||||
|
|
||||||
|
|
||||||
def calc(sentence,DAG,idx,route):
|
def calc(sentence,DAG,idx,route):
|
||||||
N = len(sentence)
|
N = len(sentence)
|
||||||
route[N] = (1.0,'')
|
route[N] = (1.0,'')
|
||||||
@ -84,6 +104,8 @@ def calc(sentence,DAG,idx,route):
|
|||||||
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0],x ) for x in DAG[idx] ]
|
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0],x ) for x in DAG[idx] ]
|
||||||
route[idx] = max(candidates)
|
route[idx] = max(candidates)
|
||||||
|
|
||||||
|
|
||||||
|
@require_initialized
|
||||||
def get_DAG(sentence):
|
def get_DAG(sentence):
|
||||||
N = len(sentence)
|
N = len(sentence)
|
||||||
i,j=0,0
|
i,j=0,0
|
||||||
@ -111,6 +133,7 @@ def get_DAG(sentence):
|
|||||||
DAG[i] =[i]
|
DAG[i] =[i]
|
||||||
return DAG
|
return DAG
|
||||||
|
|
||||||
|
|
||||||
def __cut_DAG(sentence):
|
def __cut_DAG(sentence):
|
||||||
DAG = get_DAG(sentence)
|
DAG = get_DAG(sentence)
|
||||||
route ={}
|
route ={}
|
||||||
@ -144,7 +167,6 @@ def __cut_DAG(sentence):
|
|||||||
for t in regognized:
|
for t in regognized:
|
||||||
yield t
|
yield t
|
||||||
|
|
||||||
|
|
||||||
def cut(sentence,cut_all=False):
|
def cut(sentence,cut_all=False):
|
||||||
if not ( type(sentence) is unicode):
|
if not ( type(sentence) is unicode):
|
||||||
try:
|
try:
|
||||||
@ -184,6 +206,7 @@ def cut_for_search(sentence):
|
|||||||
yield gram3
|
yield gram3
|
||||||
yield w
|
yield w
|
||||||
|
|
||||||
|
@require_initialized
|
||||||
def load_userdict(f):
|
def load_userdict(f):
|
||||||
global trie,total,FREQ
|
global trie,total,FREQ
|
||||||
if isinstance(f, (str, unicode)):
|
if isinstance(f, (str, unicode)):
|
||||||
@ -200,3 +223,10 @@ def load_userdict(f):
|
|||||||
p[c] ={}
|
p[c] ={}
|
||||||
p = p[c]
|
p = p[c]
|
||||||
p['']='' #ending flag
|
p['']='' #ending flag
|
||||||
|
|
||||||
|
|
||||||
|
def set_dictionary(dictionary_path):
    """Select the dictionary file used to build the model.

    Stores ``dictionary_path`` in the module-level ``DICTIONARY``. If the
    module has already been initialized, the model is rebuilt immediately
    from the new path; otherwise nothing else happens here.

    dictionary_path -- path of the dictionary file to use.
    """
    global DICTIONARY
    DICTIONARY = dictionary_path
    if initialized:
        # Pass the path explicitly: the default argument of initialize() was
        # bound at definition time, so a bare initialize() would reload the
        # old default dictionary instead of the one just selected.
        initialize(dictionary_path)
|
Loading…
x
Reference in New Issue
Block a user