jieba/jieba/__init__.py

422 lines
12 KiB
Python

__version__ = '0.32'
__license__ = 'MIT'
import re
import os
import sys
from . import finalseg
import time
import tempfile
import marshal
from math import log
import random
import threading
from functools import wraps
import logging
DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
pfdict = None # to be initialized
FREQ = {}
min_freq = 0.0
total = 0.0
user_word_tag_tab = {}
initialized = False
log_console = logging.StreamHandler(sys.stderr)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(log_console)
def setLogLevel(log_level):
global logger
logger.setLevel(log_level)
def gen_pfdict(f_name):
lfreq = {}
pfdict = set()
ltotal = 0.0
with open(f_name, 'rb') as f:
lineno = 0
for line in f.read().rstrip().decode('utf-8').split('\n'):
lineno += 1
try:
word,freq = line.split(' ')[:2]
freq = float(freq)
lfreq[word] = freq
ltotal += freq
for ch in range(len(word)):
pfdict.add(word[:ch+1])
except ValueError as e:
logger.debug('%s at line %s %s' % (f_name, lineno, line))
raise e
return pfdict, lfreq, ltotal
def initialize(*args):
global pfdict, FREQ, total, min_freq, initialized
if not args:
dictionary = DICTIONARY
else:
dictionary = args[0]
with DICT_LOCK:
if initialized:
return
if pfdict:
del pfdict
pfdict = None
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
abs_path = os.path.join(_curpath,dictionary)
logger.debug("Building prefix dict from %s ..." % abs_path)
t1 = time.time()
if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
else: #custom dictionary
cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))
load_from_cache_fail = True
if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
logger.debug("Loading model from cache %s" % cache_file)
try:
with open(cache_file, 'rb') as cf:
pfdict,FREQ,total,min_freq = marshal.load(cf)
# prevent conflict with old version
load_from_cache_fail = not isinstance(pfdict, set)
except:
load_from_cache_fail = True
if load_from_cache_fail:
pfdict,FREQ,total = gen_pfdict(abs_path)
FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.items()]) #normalize
min_freq = min(FREQ.values())
logger.debug("Dumping model to file cache %s" % cache_file)
try:
tmp_suffix = "."+str(random.random())
with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
if os.name == 'nt':
from shutil import move as replace_file
else:
replace_file = os.rename
replace_file(cache_file + tmp_suffix, cache_file)
except:
logger.exception("Dump cache file failed.")
initialized = True
logger.debug("Loading model cost %s seconds." % (time.time() - t1))
logger.debug("Prefix dict has been built succesfully.")
def require_initialized(fn):
@wraps(fn)
def wrapped(*args, **kwargs):
global initialized
if initialized:
return fn(*args, **kwargs)
else:
initialize(DICTIONARY)
return fn(*args, **kwargs)
return wrapped
def __cut_all(sentence):
dag = get_DAG(sentence)
old_j = -1
for k,L in dag.items():
if len(L) == 1 and k > old_j:
yield sentence[k:L[0]+1]
old_j = L[0]
else:
for j in L:
if j > k:
yield sentence[k:j+1]
old_j = j
def calc(sentence,DAG,idx,route):
N = len(sentence)
route[N] = (0.0, '')
for idx in range(N-1, -1, -1):
candidates = [(FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx]]
route[idx] = max(candidates)
@require_initialized
def get_DAG(sentence):
global pfdict, FREQ
DAG = {}
N = len(sentence)
for k in range(N):
tmplist = []
i = k
frag = sentence[k]
while i < N and frag in pfdict:
if frag in FREQ:
tmplist.append(i)
i += 1
frag = sentence[k:i+1]
if not tmplist:
tmplist.append(k)
DAG[k] = tmplist
return DAG
def __cut_DAG_NO_HMM(sentence):
re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
DAG = get_DAG(sentence)
route = {}
calc(sentence, DAG, 0, route=route)
x = 0
N = len(sentence)
buf = ''
while x < N:
y = route[x][1] + 1
l_word = sentence[x:y]
if re_eng.match(l_word) and len(l_word) == 1:
buf += l_word
x = y
else:
if buf:
yield buf
buf = ''
yield l_word
x = y
if buf:
yield buf
buf = ''
def __cut_DAG(sentence):
DAG = get_DAG(sentence)
route = {}
calc(sentence, DAG, 0, route=route)
x = 0
buf = ''
N = len(sentence)
while x < N:
y = route[x][1]+1
l_word = sentence[x:y]
if y-x == 1:
buf += l_word
else:
if buf:
if len(buf) == 1:
yield buf
buf = ''
else:
if (buf not in FREQ):
recognized = finalseg.cut(buf)
for t in recognized:
yield t
else:
for elem in buf:
yield elem
buf = ''
yield l_word
x = y
if buf:
if len(buf) == 1:
yield buf
elif (buf not in FREQ):
recognized = finalseg.cut(buf)
for t in recognized:
yield t
else:
for elem in buf:
yield elem
def cut(sentence, cut_all=False, HMM=True):
'''The main function that segments an entire sentence that contains
Chinese characters into seperated words.
Parameter:
- sentence: The str to be segmented.
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
'''
if isinstance(sentence, bytes):
try:
sentence = sentence.decode('utf-8')
except UnicodeDecodeError:
sentence = sentence.decode('gbk', 'ignore')
# \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled.
if cut_all:
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
else:
re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U)
blocks = re_han.split(sentence)
if cut_all:
cut_block = __cut_all
elif HMM:
cut_block = __cut_DAG
else:
cut_block = __cut_DAG_NO_HMM
for blk in blocks:
if not blk:
continue
if re_han.match(blk):
for word in cut_block(blk):
yield word
else:
tmp = re_skip.split(blk)
for x in tmp:
if re_skip.match(x):
yield x
elif not cut_all:
for xx in x:
yield xx
else:
yield x
def cut_for_search(sentence, HMM=True):
words = cut(sentence, HMM=HMM)
for w in words:
if len(w) > 2:
for i in range(len(w)-1):
gram2 = w[i:i+2]
if gram2 in FREQ:
yield gram2
if len(w) > 3:
for i in range(len(w)-2):
gram3 = w[i:i+3]
if gram3 in FREQ:
yield gram3
yield w
@require_initialized
def load_userdict(f):
''' Load personalized dict to improve detect rate.
Parameter:
- f : A plain text file contains words and their ocurrences.
Structure of dict file:
word1 freq1 word_type1
word2 freq2 word_type2
...
Word type may be ignored
'''
if isinstance(f, str):
f = open(f, 'rb')
content = f.read().decode('utf-8')
line_no = 0
for line in content.split("\n"):
line_no += 1
if not line.rstrip():
continue
tup = line.split(" ")
word, freq = tup[0], tup[1]
if freq.isdigit() is False:
continue
if line_no == 1:
word = word.replace('\ufeff',"") #remove bom flag if it exists
add_word(*tup)
@require_initialized
def add_word(word, freq, tag=None):
global FREQ, pfdict, total, user_word_tag_tab
FREQ[word] = log(float(freq) / total)
if tag is not None:
user_word_tag_tab[word] = tag.strip()
for ch in range(len(word)):
pfdict.add(word[:ch+1])
__ref_cut = cut
__ref_cut_for_search = cut_for_search
def __lcut(sentence):
return list(__ref_cut(sentence, False))
def __lcut_no_hmm(sentence):
return list(__ref_cut(sentence, False, False))
def __lcut_all(sentence):
return list(__ref_cut(sentence, True))
def __lcut_for_search(sentence):
return list(__ref_cut_for_search(sentence))
@require_initialized
def enable_parallel(processnum=None):
global pool, cut, cut_for_search
if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system")
from multiprocessing import Pool, cpu_count
if processnum is None:
processnum = cpu_count()
pool = Pool(processnum)
def pcut(sentence,cut_all=False,HMM=True):
parts = re.compile('([\r\n]+)').split(sentence)
if cut_all:
result = pool.map(__lcut_all, parts)
elif HMM:
result = pool.map(__lcut, parts)
else:
result = pool.map(__lcut_no_hmm, parts)
for r in result:
for w in r:
yield w
def pcut_for_search(sentence):
parts = re.compile('([\r\n]+)').split(sentence)
result = pool.map(__lcut_for_search, parts)
for r in result:
for w in r:
yield w
cut = pcut
cut_for_search = pcut_for_search
def disable_parallel():
global pool,cut,cut_for_search
if 'pool' in globals():
pool.close()
pool = None
cut = __ref_cut
cut_for_search = __ref_cut_for_search
def set_dictionary(dictionary_path):
global initialized, DICTIONARY
with DICT_LOCK:
abs_path = os.path.normpath(os.path.join(os.getcwd(), dictionary_path))
if not os.path.exists(abs_path):
raise Exception("jieba: path does not exist: " + abs_path)
DICTIONARY = abs_path
initialized = False
def get_abs_path_dict():
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
abs_path = os.path.join(_curpath,DICTIONARY)
return abs_path
def tokenize(unicode_sentence, mode="default", HMM=True):
"""Tokenize a sentence and yields tuples of (word, start, end)
Parameter:
- sentence: the str to be segmented.
- mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model.
"""
if not isinstance(unicode_sentence, str):
raise Exception("jieba: the input parameter should be str.")
start = 0
if mode == 'default':
for w in cut(unicode_sentence, HMM=HMM):
width = len(w)
yield (w, start, start+width)
start += width
else:
for w in cut(unicode_sentence, HMM=HMM):
width = len(w)
if len(w) > 2:
for i in range(len(w)-1):
gram2 = w[i:i+2]
if gram2 in FREQ:
yield (gram2, start+i, start+i+2)
if len(w) > 3:
for i in range(len(w)-2):
gram3 = w[i:i+3]
if gram3 in FREQ:
yield (gram3, start+i, start+i+3)
yield (w, start, start+width)
start += width