# jieba/__init__.py (mirror of https://github.com/fxsjy/jieba.git, synced 2025-07-10 00:01:33 +08:00; 594 lines, 18 KiB, Python)
from __future__ import absolute_import, unicode_literals
__version__ = '0.38'
__license__ = 'MIT'

import re
import os
import sys
import time
import logging
import marshal
import tempfile
import threading
from math import log
from hashlib import md5
from ._compat import *
from . import finalseg

if os.name == 'nt':
    from shutil import move as _replace_file
else:
    _replace_file = os.rename

_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))
DEFAULT_DICT = None
DEFAULT_DICT_NAME = "dict.txt"

log_console = logging.StreamHandler(sys.stderr)
default_logger = logging.getLogger(__name__)
default_logger.setLevel(logging.DEBUG)
default_logger.addHandler(log_console)

DICT_WRITING = {}

pool = None

re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)

re_eng = re.compile('[a-zA-Z0-9]', re.U)

# \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled.
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", re.U)
re_skip_default = re.compile("(\r\n|\s)", re.U)
re_han_cut_all = re.compile("([\u4E00-\u9FD5]+)", re.U)
re_skip_cut_all = re.compile("[^a-zA-Z0-9+#\n]", re.U)
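# Illustrative sketch (editorial note, not in the upstream source): re_han_default
# keeps runs of Han characters and ASCII word characters together, and everything
# else falls through to re_skip_default. Assuming this module is importable as
# `jieba`, something like:
#
#     >>> import jieba
#     >>> jieba.re_han_default.split('我爱Python3.5, 你呢?')
#     ['', '我爱Python3.5', ', ', '你呢', '?']
#
# Each matched block is then segmented further; the remaining blocks are handled
# separately by Tokenizer.cut below.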
def setLogLevel(log_level):
    default_logger.setLevel(log_level)
class Tokenizer(object):

    def __init__(self, dictionary=DEFAULT_DICT):
        self.lock = threading.RLock()
        if dictionary == DEFAULT_DICT:
            self.dictionary = dictionary
        else:
            self.dictionary = _get_abs_path(dictionary)
        self.FREQ = {}
        self.total = 0
        self.user_word_tag_tab = {}
        self.initialized = False
        self.tmp_dir = None
        self.cache_file = None

    def __repr__(self):
        return '<Tokenizer dictionary=%r>' % self.dictionary
    def gen_pfdict(self, f):
        lfreq = {}
        ltotal = 0
        f_name = resolve_filename(f)
        for lineno, line in enumerate(f, 1):
            try:
                line = line.strip().decode('utf-8')
                word, freq = line.split(' ')[:2]
                freq = int(freq)
                lfreq[word] = freq
                ltotal += freq
                for ch in xrange(len(word)):
                    wfrag = word[:ch + 1]
                    if wfrag not in lfreq:
                        lfreq[wfrag] = 0
            except ValueError:
                raise ValueError(
                    'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
        f.close()
        return lfreq, ltotal
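    # Editorial note (not in the upstream source): the returned lfreq is a prefix
    # dictionary -- every prefix of every entry is present as a key, with
    # frequency 0 when the prefix is not itself a word -- so get_DAG below can
    # stop extending a fragment as soon as it is absent from FREQ. A dictionary
    # line is expected to look roughly like "word freq [tag]", e.g.
    # "北京大学 2053 nt" (the frequency and tag shown here are only illustrative).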
    def initialize(self, dictionary=None):
        if dictionary:
            abs_path = _get_abs_path(dictionary)
            if self.dictionary == abs_path and self.initialized:
                return
            else:
                self.dictionary = abs_path
                self.initialized = False
        else:
            abs_path = self.dictionary

        with self.lock:
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            if self.initialized:
                return

            default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
            t1 = time.time()
            if self.cache_file:
                cache_file = self.cache_file
            # default dictionary
            elif abs_path == DEFAULT_DICT:
                cache_file = "jieba.cache"
            # custom dictionary
            else:
                cache_file = "jieba.u%s.cache" % md5(
                    abs_path.encode('utf-8', 'replace')).hexdigest()
            cache_file = os.path.join(
                self.tmp_dir or tempfile.gettempdir(), cache_file)
            # prevent absolute path in self.cache_file
            tmpdir = os.path.dirname(cache_file)

            load_from_cache_fail = True
            if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
                    os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                default_logger.debug(
                    "Loading model from cache %s" % cache_file)
                try:
                    with open(cache_file, 'rb') as cf:
                        self.FREQ, self.total = marshal.load(cf)
                    load_from_cache_fail = False
                except Exception:
                    load_from_cache_fail = True

            if load_from_cache_fail:
                wlock = DICT_WRITING.get(abs_path, threading.RLock())
                DICT_WRITING[abs_path] = wlock
                with wlock:
                    self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
                    default_logger.debug(
                        "Dumping model to file cache %s" % cache_file)
                    try:
                        # prevent moving across different filesystems
                        fd, fpath = tempfile.mkstemp(dir=tmpdir)
                        with os.fdopen(fd, 'wb') as temp_cache_file:
                            marshal.dump(
                                (self.FREQ, self.total), temp_cache_file)
                        _replace_file(fpath, cache_file)
                    except Exception:
                        default_logger.exception("Dump cache file failed.")

                try:
                    del DICT_WRITING[abs_path]
                except KeyError:
                    pass

            self.initialized = True
            default_logger.debug(
                "Loading model cost %.3f seconds." % (time.time() - t1))
default_logger.debug("Prefix dict has been built succesfully.")
|
|
|
|
    def check_initialized(self):
        if not self.initialized:
            self.initialize()
    def calc(self, sentence, DAG, route):
        N = len(sentence)
        route[N] = (0, 0)
        logtotal = log(self.total)
        for idx in xrange(N - 1, -1, -1):
            route[idx] = max((log(self.FREQ.get(sentence[idx:x + 1]) or 1) -
                              logtotal + route[x + 1][0], x) for x in DAG[idx])
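    # Editorial note (not in the upstream source): calc fills `route` right to
    # left with a dynamic program over the DAG. route[idx] is a tuple
    # (best log-probability of segmenting sentence[idx:], end index of the word
    # chosen at idx); each candidate word scores log(freq or 1) - log(total), so
    # unseen fragments get a small floor probability instead of breaking max().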
    def get_DAG(self, sentence):
        self.check_initialized()
        DAG = {}
        N = len(sentence)
        for k in xrange(N):
            tmplist = []
            i = k
            frag = sentence[k]
            while i < N and frag in self.FREQ:
                if self.FREQ[frag]:
                    tmplist.append(i)
                i += 1
                frag = sentence[k:i + 1]
            if not tmplist:
                tmplist.append(k)
            DAG[k] = tmplist
        return DAG
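    # Illustrative sketch (editorial note, not in the upstream source): DAG maps
    # each start index k to the end indices i for which sentence[k:i + 1] is a
    # dictionary word (k itself is kept as a fallback). Assuming '北京',
    # '北京大学' and '大学' are all entries in the default dict.txt, roughly:
    #
    #     dt.get_DAG('北京大学')  ->  {0: [0, 1, 3], 1: [1], 2: [2, 3], 3: [3]}
    #
    # i.e. position 0 can end a word at index 1 ('北京') or 3 ('北京大学').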
    def __cut_all(self, sentence):
        dag = self.get_DAG(sentence)
        old_j = -1
        for k, L in iteritems(dag):
            if len(L) == 1 and k > old_j:
                yield sentence[k:L[0] + 1]
                old_j = L[0]
            else:
                for j in L:
                    if j > k:
                        yield sentence[k:j + 1]
                        old_j = j
    def __cut_DAG_NO_HMM(self, sentence):
        DAG = self.get_DAG(sentence)
        route = {}
        self.calc(sentence, DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if re_eng.match(l_word) and len(l_word) == 1:
                buf += l_word
                x = y
            else:
                if buf:
                    yield buf
                    buf = ''
                yield l_word
                x = y
        if buf:
            yield buf
            buf = ''
    def __cut_DAG(self, sentence):
        DAG = self.get_DAG(sentence)
        route = {}
        self.calc(sentence, DAG, route)
        x = 0
        buf = ''
        N = len(sentence)
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if y - x == 1:
                buf += l_word
            else:
                if buf:
                    if len(buf) == 1:
                        yield buf
                        buf = ''
                    else:
                        if not self.FREQ.get(buf):
                            recognized = finalseg.cut(buf)
                            for t in recognized:
                                yield t
                        else:
                            for elem in buf:
                                yield elem
                        buf = ''
                yield l_word
                x = y

        if buf:
            if len(buf) == 1:
                yield buf
            elif not self.FREQ.get(buf):
                recognized = finalseg.cut(buf)
                for t in recognized:
                    yield t
            else:
                for elem in buf:
                    yield elem
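    # Editorial note (not in the upstream source): in __cut_DAG, `buf` collects
    # consecutive single characters that the dictionary route could not merge.
    # When such a run ends, a multi-character buf that is not itself a dictionary
    # word is handed to finalseg.cut, the HMM-based recognizer, so unseen words
    # such as personal names can still come out as one token.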
    def cut(self, sentence, cut_all=False, HMM=True):
        '''
        The main function that segments an entire sentence that contains
        Chinese characters into separated words.

        Parameter:
            - sentence: The str(unicode) to be segmented.
            - cut_all: Model type. True for full pattern, False for accurate pattern.
            - HMM: Whether to use the Hidden Markov Model.
        '''
        sentence = strdecode(sentence)

        if cut_all:
            re_han = re_han_cut_all
            re_skip = re_skip_cut_all
        else:
            re_han = re_han_default
            re_skip = re_skip_default
        if cut_all:
            cut_block = self.__cut_all
        elif HMM:
            cut_block = self.__cut_DAG
        else:
            cut_block = self.__cut_DAG_NO_HMM
        blocks = re_han.split(sentence)
        for blk in blocks:
            if not blk:
                continue
            if re_han.match(blk):
                for word in cut_block(blk):
                    yield word
            else:
                tmp = re_skip.split(blk)
                for x in tmp:
                    if re_skip.match(x):
                        yield x
                    elif not cut_all:
                        for xx in x:
                            yield xx
                    else:
                        yield x
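    # Illustrative usage (editorial sketch, not in the upstream source; output
    # shown roughly, assuming the default dictionary):
    #
    #     >>> import jieba
    #     >>> '/'.join(jieba.cut('我来到北京清华大学', cut_all=False))
    #     '我/来到/北京/清华大学'                       # accurate mode
    #     >>> '/'.join(jieba.cut('我来到北京清华大学', cut_all=True))
    #     '我/来到/北京/清华/清华大学/华大/大学'        # full mode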
    def cut_for_search(self, sentence, HMM=True):
        """
        Finer segmentation for search engines.
        """
        words = self.cut(sentence, HMM=HMM)
        for w in words:
            if len(w) > 2:
                for i in xrange(len(w) - 1):
                    gram2 = w[i:i + 2]
                    if self.FREQ.get(gram2):
                        yield gram2
            if len(w) > 3:
                for i in xrange(len(w) - 2):
                    gram3 = w[i:i + 3]
                    if self.FREQ.get(gram3):
                        yield gram3
            yield w
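    # Editorial note (not in the upstream source): cut_for_search first runs the
    # normal cut, then additionally yields any 2-gram or 3-gram inside a long
    # word that is itself a dictionary entry, before the word itself. For
    # example, '中国科学院' would roughly come out as '中国', '科学', '学院',
    # '科学院', '中国科学院' (the exact output depends on the dictionary).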
    def lcut(self, *args, **kwargs):
        return list(self.cut(*args, **kwargs))

    def lcut_for_search(self, *args, **kwargs):
        return list(self.cut_for_search(*args, **kwargs))

    _lcut = lcut
    _lcut_for_search = lcut_for_search

    def _lcut_no_hmm(self, sentence):
        return self.lcut(sentence, False, False)

    def _lcut_all(self, sentence):
        return self.lcut(sentence, True)

    def _lcut_for_search_no_hmm(self, sentence):
        return self.lcut_for_search(sentence, False)

    def get_dict_file(self):
        if self.dictionary == DEFAULT_DICT:
            return get_module_res(DEFAULT_DICT_NAME)
        else:
            return open(self.dictionary, 'rb')
    def load_userdict(self, f):
        '''
        Load personalized dict to improve detection rate.

        Parameter:
            - f : A plain text file containing words and their occurrences.
                  Can be a file-like object, or the path of the dictionary file,
                  whose encoding must be utf-8.

        Structure of dict file:
        word1 freq1 word_type1
        word2 freq2 word_type2
        ...
        Word type may be ignored
        '''
        self.check_initialized()
        if isinstance(f, string_types):
            f_name = f
            f = open(f, 'rb')
        else:
            f_name = resolve_filename(f)
        for lineno, ln in enumerate(f, 1):
            line = ln.strip()
            if not isinstance(line, text_type):
                try:
                    line = line.decode('utf-8').lstrip('\ufeff')
                except UnicodeDecodeError:
                    raise ValueError('dictionary file %s must be utf-8' % f_name)
            if not line:
                continue
            # match won't be None because there's at least one character
            word, freq, tag = re_userdict.match(line).groups()
            if freq is not None:
                freq = freq.strip()
            if tag is not None:
                tag = tag.strip()
            self.add_word(word, freq, tag)
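    # Illustrative userdict file (editorial sketch, not in the upstream source;
    # the frequency and the POS tag are both optional on each line):
    #
    #     创新办 3 i
    #     云计算 5
    #     凱特琳 nz
    #     台中
    #
    # loaded with e.g. jieba.load_userdict('userdict.txt').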
    def add_word(self, word, freq=None, tag=None):
        """
        Add a word to dictionary.

        freq and tag can be omitted, freq defaults to be a calculated value
        that ensures the word can be cut out.
        """
        self.check_initialized()
        word = strdecode(word)
        freq = int(freq) if freq is not None else self.suggest_freq(word, False)
        self.FREQ[word] = freq
        self.total += freq
        if tag:
            self.user_word_tag_tab[word] = tag
        for ch in xrange(len(word)):
            wfrag = word[:ch + 1]
            if wfrag not in self.FREQ:
                self.FREQ[wfrag] = 0
    def del_word(self, word):
        """
        Convenient function for deleting a word.
        """
        self.add_word(word, 0)
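    # Illustrative usage (editorial sketch, not in the upstream source):
    #
    #     >>> jieba.add_word('石墨烯')                 # keep a new word whole
    #     >>> jieba.add_word('自定义词', freq=10, tag='n')
    #     >>> jieba.del_word('自定义词')               # frequency 0: no longer a dictionary word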
    def suggest_freq(self, segment, tune=False):
        """
        Suggest word frequency to force the characters in a word to be
        joined or split.

        Parameter:
            - segment : The segments that the word is expected to be cut into.
                        If the word should be treated as a whole, use a str.
            - tune : If True, tune the word frequency.

        Note that HMM may affect the final result. If the result doesn't change,
        set HMM=False.
        """
        self.check_initialized()
        ftotal = float(self.total)
        freq = 1
        if isinstance(segment, string_types):
            word = segment
            for seg in self.cut(word, HMM=False):
                freq *= self.FREQ.get(seg, 1) / ftotal
            freq = max(int(freq * self.total) + 1, self.FREQ.get(word, 1))
        else:
            segment = tuple(map(strdecode, segment))
            word = ''.join(segment)
            for seg in segment:
                freq *= self.FREQ.get(seg, 1) / ftotal
            freq = min(int(freq * self.total), self.FREQ.get(word, 0))
        if tune:
            self.add_word(word, freq)
        return freq
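    # Illustrative usage (editorial sketch, not in the upstream source):
    #
    #     >>> import jieba
    #     >>> jieba.suggest_freq(('中', '将'), tune=True)   # encourage a split
    #     >>> jieba.suggest_freq('台中', tune=True)         # encourage a join
    #
    # Passing a tuple lowers the joined word's frequency so it is cut apart;
    # passing a str raises the word's frequency so it is kept together.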
def tokenize(self, unicode_sentence, mode="default", HMM=True):
|
|
"""
|
|
Tokenize a sentence and yields tuples of (word, start, end)
|
|
|
|
Parameter:
|
|
- sentence: the str(unicode) to be segmented.
|
|
- mode: "default" or "search", "search" is for finer segmentation.
|
|
- HMM: whether to use the Hidden Markov Model.
|
|
"""
|
|
if not isinstance(unicode_sentence, text_type):
|
|
raise ValueError("jieba: the input parameter should be unicode.")
|
|
start = 0
|
|
if mode == 'default':
|
|
for w in self.cut(unicode_sentence, HMM=HMM):
|
|
width = len(w)
|
|
yield (w, start, start + width)
|
|
start += width
|
|
else:
|
|
for w in self.cut(unicode_sentence, HMM=HMM):
|
|
width = len(w)
|
|
if len(w) > 2:
|
|
for i in xrange(len(w) - 1):
|
|
gram2 = w[i:i + 2]
|
|
if self.FREQ.get(gram2):
|
|
yield (gram2, start + i, start + i + 2)
|
|
if len(w) > 3:
|
|
for i in xrange(len(w) - 2):
|
|
gram3 = w[i:i + 3]
|
|
if self.FREQ.get(gram3):
|
|
yield (gram3, start + i, start + i + 3)
|
|
yield (w, start, start + width)
|
|
start += width
|
|
|
|
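    # Illustrative usage (editorial sketch, not in the upstream source; the
    # exact boundaries depend on the dictionary):
    #
    #     >>> for tk in jieba.tokenize('永和服装饰品有限公司'):
    #     ...     print(tk)
    #     ('永和', 0, 2)
    #     ('服装', 2, 4)
    #     ('饰品', 4, 6)
    #     ('有限公司', 6, 10)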
    def set_dictionary(self, dictionary_path):
        with self.lock:
            abs_path = _get_abs_path(dictionary_path)
            if not os.path.isfile(abs_path):
                raise Exception("jieba: file does not exist: " + abs_path)
            self.dictionary = abs_path
            self.initialized = False
# default Tokenizer instance

dt = Tokenizer()

# global functions

get_FREQ = lambda k, d=None: dt.FREQ.get(k, d)
add_word = dt.add_word
calc = dt.calc
cut = dt.cut
lcut = dt.lcut
cut_for_search = dt.cut_for_search
lcut_for_search = dt.lcut_for_search
del_word = dt.del_word
get_DAG = dt.get_DAG
get_dict_file = dt.get_dict_file
initialize = dt.initialize
load_userdict = dt.load_userdict
set_dictionary = dt.set_dictionary
suggest_freq = dt.suggest_freq
tokenize = dt.tokenize
user_word_tag_tab = dt.user_word_tag_tab
def _lcut_all(s):
    return dt._lcut_all(s)


def _lcut(s):
    return dt._lcut(s)


def _lcut_no_hmm(s):
    return dt._lcut_no_hmm(s)


def _lcut_for_search(s):
    return dt._lcut_for_search(s)


def _lcut_for_search_no_hmm(s):
    return dt._lcut_for_search_no_hmm(s)
def _pcut(sentence, cut_all=False, HMM=True):
    parts = strdecode(sentence).splitlines(True)
    if cut_all:
        result = pool.map(_lcut_all, parts)
    elif HMM:
        result = pool.map(_lcut, parts)
    else:
        result = pool.map(_lcut_no_hmm, parts)
    for r in result:
        for w in r:
            yield w


def _pcut_for_search(sentence, HMM=True):
    parts = strdecode(sentence).splitlines(True)
    if HMM:
        result = pool.map(_lcut_for_search, parts)
    else:
        result = pool.map(_lcut_for_search_no_hmm, parts)
    for r in result:
        for w in r:
            yield w
def enable_parallel(processnum=None):
    """
    Change the module's `cut` and `cut_for_search` functions to the
    parallel version.

    Note that this only works using dt; custom Tokenizer
    instances are not supported.
    """
    global pool, dt, cut, cut_for_search
    from multiprocessing import cpu_count
    if os.name == 'nt':
        raise NotImplementedError(
            "jieba: parallel mode only supports posix system")
    else:
        from multiprocessing import Pool
    dt.check_initialized()
    if processnum is None:
        processnum = cpu_count()
    pool = Pool(processnum)
    cut = _pcut
    cut_for_search = _pcut_for_search
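# Illustrative usage (editorial sketch, not in the upstream source; the file
# name is hypothetical): parallel mode splits the input on line boundaries and
# maps the pieces over a multiprocessing.Pool, so it mainly helps for large,
# multi-line texts:
#
#     >>> import jieba
#     >>> jieba.enable_parallel(4)      # 4 worker processes (POSIX only)
#     >>> words = '/'.join(jieba.cut(open('big_file.txt').read()))
#     >>> jieba.disable_parallel()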
def disable_parallel():
    global pool, dt, cut, cut_for_search
    if pool:
        pool.close()
        pool = None
    cut = dt.cut
    cut_for_search = dt.cut_for_search