Mirror of https://github.com/fxsjy/jieba.git
Synced 2025-07-10 00:01:33 +08:00

Commit f73a2183a5
jieba/__init__.py

@@ -20,11 +20,10 @@ if os.name == 'nt':
 else:
     _replace_file = os.rename
 
-_get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(),
-                                                 os.path.dirname(__file__), path))
 _get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))
 
-DEFAULT_DICT = _get_module_path("dict.txt")
+DEFAULT_DICT = None
+DEFAULT_DICT_NAME = "dict.txt"
 
 log_console = logging.StreamHandler(sys.stderr)
 default_logger = logging.getLogger(__name__)
@@ -54,7 +53,10 @@ class Tokenizer(object):
 
     def __init__(self, dictionary=DEFAULT_DICT):
         self.lock = threading.RLock()
-        self.dictionary = _get_abs_path(dictionary)
+        if dictionary == DEFAULT_DICT:
+            self.dictionary = dictionary
+        else:
+            self.dictionary = _get_abs_path(dictionary)
         self.FREQ = {}
         self.total = 0
         self.user_word_tag_tab = {}
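With `DEFAULT_DICT` now `None`, the constructor keeps the sentinel instead of eagerly resolving a path; only user-supplied dictionary paths are made absolute. A minimal sketch of the resulting behavior (the custom file name is hypothetical):

```python
import jieba

t = jieba.Tokenizer()                 # self.dictionary stays None (DEFAULT_DICT)
u = jieba.Tokenizer('user.dict.txt')  # hypothetical path, resolved by _get_abs_path
```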
@@ -65,10 +67,11 @@ class Tokenizer(object):
     def __repr__(self):
         return '<Tokenizer dictionary=%r>' % self.dictionary
 
-    def gen_pfdict(self, f_name):
+    def gen_pfdict(self, f):
         lfreq = {}
         ltotal = 0
-        with open(f_name, 'rb') as f:
+        f_name = resolve_filename(f)
+        with f:
             for lineno, line in enumerate(f, 1):
                 try:
                     line = line.strip().decode('utf-8')
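`gen_pfdict` now consumes an already-open binary file object rather than a path, so any file-like source works; `resolve_filename` only supplies a name for error messages. A sketch with an in-memory stream (the single dictionary entry is made up, in jieba's `word freq [tag]` line format):

```python
import io
import jieba

buf = io.BytesIO('中文 5 n\n'.encode('utf-8'))   # hypothetical one-line dictionary
freq, total = jieba.Tokenizer().gen_pfdict(buf)  # -> ({'中': 0, '中文': 5}, 5)
```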
@@ -105,7 +108,7 @@ class Tokenizer(object):
             if self.initialized:
                 return
 
-            default_logger.debug("Building prefix dict from %s ..." % abs_path)
+            default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
             t1 = time.time()
             if self.cache_file:
                 cache_file = self.cache_file
@@ -122,7 +125,8 @@ class Tokenizer(object):
             tmpdir = os.path.dirname(cache_file)
 
             load_from_cache_fail = True
-            if os.path.isfile(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
+            if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
+                os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                 default_logger.debug(
                     "Loading model from cache %s" % cache_file)
                 try:
@@ -136,7 +140,7 @@ class Tokenizer(object):
                 wlock = DICT_WRITING.get(abs_path, threading.RLock())
                 DICT_WRITING[abs_path] = wlock
                 with wlock:
-                    self.FREQ, self.total = self.gen_pfdict(abs_path)
+                    self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
                     default_logger.debug(
                         "Dumping model to file cache %s" % cache_file)
                     try:
@@ -343,8 +347,11 @@ class Tokenizer(object):
     def _lcut_for_search_no_hmm(self, sentence):
         return self.lcut_for_search(sentence, False)
 
-    def get_abs_path_dict(self):
-        return _get_abs_path(self.dictionary)
+    def get_dict_file(self):
+        if self.dictionary == DEFAULT_DICT:
+            return get_module_res(DEFAULT_DICT_NAME)
+        else:
+            return open(self.dictionary, 'rb')
 
     def load_userdict(self, f):
         '''
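The renamed `get_dict_file` gives both branches the same shape, a binary stream, so callers such as `gen_pfdict` and `POSTokenizer.load_word_tag` no longer care where the dictionary lives. A sketch (the custom path is hypothetical):

```python
jieba.Tokenizer().get_dict_file()                 # bundled dict.txt via get_module_res
jieba.Tokenizer('user.dict.txt').get_dict_file()  # plain open() on the absolute path
```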
@@ -363,14 +370,17 @@ class Tokenizer(object):
         '''
         self.check_initialized()
         if isinstance(f, string_types):
+            f_name = f
             f = open(f, 'rb')
+        else:
+            f_name = resolve_filename(f)
         for lineno, ln in enumerate(f, 1):
             line = ln.strip()
             if not isinstance(line, text_type):
                 try:
                     line = line.decode('utf-8').lstrip('\ufeff')
                 except UnicodeDecodeError:
-                    raise ValueError('dictionary file %s must be utf-8' % f.name)
+                    raise ValueError('dictionary file %s must be utf-8' % f_name)
             if not line:
                 continue
             # match won't be None because there's at least one character
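Tracking `f_name` explicitly means `load_userdict` now also accepts file-like objects that lack a `.name` attribute (the old `f.name` would raise `AttributeError` there); `resolve_filename` falls back to `repr(f)` when building the error message. A sketch with an in-memory user dictionary (the entry is made up):

```python
import io
import jieba

jieba.load_userdict(io.BytesIO('云计算 5 n\n'.encode('utf-8')))
```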
@@ -494,7 +504,7 @@ cut_for_search = dt.cut_for_search
 lcut_for_search = dt.lcut_for_search
 del_word = dt.del_word
 get_DAG = dt.get_DAG
-get_abs_path_dict = dt.get_abs_path_dict
+get_dict_file = dt.get_dict_file
 initialize = dt.initialize
 load_userdict = dt.load_userdict
 set_dictionary = dt.set_dictionary
jieba/_compat.py

@@ -1,6 +1,15 @@
 # -*- coding: utf-8 -*-
+import os
 import sys
 
+try:
+    import pkg_resources
+    get_module_res = lambda *res: pkg_resources.resource_stream(__name__,
+                                                                os.path.join(*res))
+except ImportError:
+    get_module_res = lambda *res: open(os.path.normpath(os.path.join(
+        os.getcwd(), os.path.dirname(__file__), *res)), 'rb')
+
 PY2 = sys.version_info[0] == 2
 
 default_encoding = sys.getfilesystemencoding()
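`pkg_resources.resource_stream` can read package data even when jieba is installed as a zipped egg, where building a filesystem path from `__file__` and calling `open()` would fail; the `ImportError` branch preserves the old filesystem behavior when setuptools is unavailable. A usage sketch:

```python
f = get_module_res('dict.txt')  # binary stream over the bundled dictionary
header = f.readline()
f.close()
```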
@@ -29,3 +38,9 @@ def strdecode(sentence):
         except UnicodeDecodeError:
             sentence = sentence.decode('gbk', 'ignore')
     return sentence
+
+def resolve_filename(f):
+    try:
+        return f.name
+    except AttributeError:
+        return repr(f)
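`resolve_filename` is the small helper this commit uses everywhere to keep error messages informative regardless of the input type:

```python
import io

resolve_filename(open('dict.txt', 'rb'))  # -> 'dict.txt' (real files expose .name)
resolve_filename(io.BytesIO(b''))         # -> '<_io.BytesIO object at 0x...>' fallback
```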
jieba/finalseg/__init__.py

@@ -1,8 +1,8 @@
 from __future__ import absolute_import, unicode_literals
 import re
 import os
-import marshal
 import sys
+import pickle
 from .._compat import *
 
 MIN_FLOAT = -3.14e100
@@ -21,24 +21,9 @@ PrevStatus = {
 
 
 def load_model():
-    _curpath = os.path.normpath(
-        os.path.join(os.getcwd(), os.path.dirname(__file__)))
-
-    start_p = {}
-    abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, 'rb') as f:
-        start_p = marshal.load(f)
-
-    trans_p = {}
-    abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'rb') as f:
-        trans_p = marshal.load(f)
-
-    emit_p = {}
-    abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with open(abs_path, 'rb') as f:
-        emit_p = marshal.load(f)
-
+    start_p = pickle.load(get_module_res("finalseg", PROB_START_P))
+    trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P))
+    emit_p = pickle.load(get_module_res("finalseg", PROB_EMIT_P))
     return start_p, trans_p, emit_p
 
 if sys.platform.startswith("java"):
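`marshal`'s serialization format is CPython-internal and version-specific, which made sharing the bundled HMM tables across interpreters (this branch only runs on Jython) fragile; `pickle` is portable, and `get_module_res` additionally lets the load work from zipped installs. The binary `.p` files shipped with the package therefore had to be re-dumped, roughly like this hypothetical one-off conversion (file names assumed from the `PROB_*` constants, not shown in this diff):

```python
import marshal
import pickle

# Hypothetical migration step, not part of this commit's code.
for name in ('prob_start.p', 'prob_trans.p', 'prob_emit.p'):
    with open(name, 'rb') as f:
        table = marshal.load(f)       # old CPython-specific format
    with open(name, 'wb') as f:
        pickle.dump(table, f, protocol=2)  # protocol 2 loads on Python 2 and 3
```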
3 binary model files changed (contents not shown).
jieba/posseg/__init__.py

@@ -3,7 +3,7 @@ import os
 import re
 import sys
 import jieba
-import marshal
+import pickle
 from .._compat import *
 from .viterbi import viterbi
 
@@ -23,36 +23,17 @@ re_num = re.compile("[\.0-9]+")
 re_eng1 = re.compile('^[a-zA-Z0-9]$', re.U)
 
 
-def load_model(f_name):
-    _curpath = os.path.normpath(
-        os.path.join(os.getcwd(), os.path.dirname(__file__)))
+def load_model():
     # For Jython
-    start_p = {}
-    abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, 'rb') as f:
-        start_p = marshal.load(f)
-
-    trans_p = {}
-    abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'rb') as f:
-        trans_p = marshal.load(f)
-
-    emit_p = {}
-    abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with open(abs_path, 'rb') as f:
-        emit_p = marshal.load(f)
-
-    state = {}
-    abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
-    with open(abs_path, 'rb') as f:
-        state = marshal.load(f)
-    f.closed
-
-    return state, start_p, trans_p, emit_p, result
+    start_p = pickle.load(get_module_res("posseg", PROB_START_P))
+    trans_p = pickle.load(get_module_res("posseg", PROB_TRANS_P))
+    emit_p = pickle.load(get_module_res("posseg", PROB_EMIT_P))
+    state = pickle.load(get_module_res("posseg", CHAR_STATE_TAB_P))
+    return state, start_p, trans_p, emit_p
 
 
 if sys.platform.startswith("java"):
-    char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model()
+    char_state_tab_P, start_P, trans_P, emit_P = load_model()
 else:
     from .char_state_tab import P as char_state_tab_P
     from .prob_start import P as start_P
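Besides switching formats, the rewrite drops two oddities of the old body: `f.closed` only read an attribute rather than closing anything, and the old `return` included `result`, which is never assigned anywhere in the removed code; the `if sys.platform` call site in the same hunk now unpacks four values accordingly.

```python
f.closed   # no-op: evaluates the attribute and discards the result
f.close()  # what was presumably intended (moot once everything uses with-blocks)
```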
@@ -89,7 +70,7 @@ class POSTokenizer(object):
 
     def __init__(self, tokenizer=None):
        self.tokenizer = tokenizer or jieba.Tokenizer()
-        self.load_word_tag(self.tokenizer.get_abs_path_dict())
+        self.load_word_tag(self.tokenizer.get_dict_file())
 
     def __repr__(self):
         return '<POSTokenizer tokenizer=%r>' % self.tokenizer
@@ -102,11 +83,12 @@ class POSTokenizer(object):
 
     def initialize(self, dictionary=None):
         self.tokenizer.initialize(dictionary)
-        self.load_word_tag(self.tokenizer.get_abs_path_dict())
+        self.load_word_tag(self.tokenizer.get_dict_file())
 
-    def load_word_tag(self, f_name):
+    def load_word_tag(self, f):
         self.word_tag_tab = {}
-        with open(f_name, "rb") as f:
+        f_name = resolve_filename(f)
+        with f:
             for lineno, line in enumerate(f, 1):
                 try:
                     line = line.strip().decode("utf-8")
4 binary model files changed (contents not shown).