Use a prefix dict instead of a trie, add a command-line interface, and a few small improvements

Dingyuan Wang 2014-10-18 22:22:14 +08:00
parent eb98eb9248
commit 51df77831b
8 changed files with 331 additions and 317 deletions
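
The core of this commit: the nested-dict character trie is replaced by a flat set containing every prefix of every dictionary word, so the DAG of candidate segmentations can be built with plain membership tests. A minimal standalone sketch of the idea (not the library code itself; the inputs are hypothetical, and the helper names simply mirror gen_pfdict/get_DAG from the diff below):

    def gen_pfdict(words):
        # one flat set with every prefix of every dictionary word
        pfdict = set()
        for word in words:
            for i in xrange(len(word)):
                pfdict.add(word[:i+1])
        return pfdict

    def get_DAG(sentence, pfdict, freq):
        # DAG[k] lists every end index j such that sentence[k:j+1] is a word
        DAG = {}
        N = len(sentence)
        for k in xrange(N):
            tmplist = []
            i = k
            frag = sentence[k]
            while i < N and frag in pfdict:   # stop once no word starts with frag
                if frag in freq:              # frag is a complete dictionary word
                    tmplist.append(i)
                i += 1
                frag = sentence[k:i+1]
            if not tmplist:
                tmplist.append(k)             # fall back to the single character
            DAG[k] = tmplist
        return DAG

Compared with walking a trie node by node, each step is a single set lookup, and add_word (further down in the diff) only has to insert the new word's prefixes.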

jieba/__init__.py
@@ -17,14 +17,13 @@ import logging
DICTIONARY = "dict.txt" DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock() DICT_LOCK = threading.RLock()
trie = None # to be initialized pfdict = None # to be initialized
FREQ = {} FREQ = {}
min_freq = 0.0 min_freq = 0.0
total =0.0 total = 0.0
user_word_tag_tab={} user_word_tag_tab = {}
initialized = False initialized = False
log_console = logging.StreamHandler(sys.stderr) log_console = logging.StreamHandler(sys.stderr)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
@@ -34,84 +33,79 @@ def setLogLevel(log_level):
global logger global logger
logger.setLevel(log_level) logger.setLevel(log_level)
def gen_trie(f_name): def gen_pfdict(f_name):
lfreq = {} lfreq = {}
trie = {} pfdict = set()
ltotal = 0.0 ltotal = 0.0
with open(f_name, 'rb') as f: with open(f_name, 'rb') as f:
lineno = 0 lineno = 0
for line in f.read().rstrip().decode('utf-8').split('\n'): for line in f.read().rstrip().decode('utf-8').split('\n'):
lineno += 1 lineno += 1
try: try:
word,freq,_ = line.split(' ') word,freq = line.split(' ')[:2]
freq = float(freq) freq = float(freq)
lfreq[word] = freq lfreq[word] = freq
ltotal+=freq ltotal += freq
p = trie for ch in xrange(len(word)):
for c in word: pfdict.add(word[:ch+1])
if c not in p:
p[c] ={}
p = p[c]
p['']='' #ending flag
except ValueError, e: except ValueError, e:
logger.debug('%s at line %s %s' % (f_name, lineno, line)) logger.debug('%s at line %s %s' % (f_name, lineno, line))
raise ValueError, e raise ValueError, e
return trie, lfreq,ltotal return pfdict, lfreq, ltotal
def initialize(*args): def initialize(*args):
global trie, FREQ, total, min_freq, initialized global pfdict, FREQ, total, min_freq, initialized
if len(args)==0: if not args:
dictionary = DICTIONARY dictionary = DICTIONARY
else: else:
dictionary = args[0] dictionary = args[0]
with DICT_LOCK: with DICT_LOCK:
if initialized: if initialized:
return return
if trie: if pfdict:
del trie del pfdict
trie = None pfdict = None
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) ) _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
abs_path = os.path.join(_curpath,dictionary) abs_path = os.path.join(_curpath,dictionary)
logger.debug("Building Trie..., from %s" % abs_path) logger.debug("Building prefix dict from %s ..." % abs_path)
t1 = time.time() t1 = time.time()
if abs_path == os.path.join(_curpath,"dict.txt"): #defautl dictionary if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache") cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
else: #customer dictionary else: #custom dictionary
cache_file = os.path.join(tempfile.gettempdir(),"jieba.user."+str(hash(abs_path))+".cache") cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))
load_from_cache_fail = True load_from_cache_fail = True
if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(abs_path): if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
logger.debug("loading model from cache %s" % cache_file) logger.debug("Loading model from cache %s" % cache_file)
try: try:
trie,FREQ,total,min_freq = marshal.load(open(cache_file,'rb')) pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
load_from_cache_fail = False # prevent conflict with old version
load_from_cache_fail = not isinstance(pfdict, set)
except: except:
load_from_cache_fail = True load_from_cache_fail = True
if load_from_cache_fail: if load_from_cache_fail:
trie,FREQ,total = gen_trie(abs_path) pfdict,FREQ,total = gen_pfdict(abs_path)
FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize
min_freq = min(FREQ.itervalues()) min_freq = min(FREQ.itervalues())
logger.debug("dumping model to file cache %s" % cache_file) logger.debug("Dumping model to file cache %s" % cache_file)
try: try:
tmp_suffix = "."+str(random.random()) tmp_suffix = "."+str(random.random())
with open(cache_file+tmp_suffix,'wb') as temp_cache_file: with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
marshal.dump((trie,FREQ,total,min_freq),temp_cache_file) marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
if os.name=='nt': if os.name == 'nt':
import shutil from shutil import move as replace_file
replace_file = shutil.move
else: else:
replace_file = os.rename replace_file = os.rename
replace_file(cache_file+tmp_suffix,cache_file) replace_file(cache_file + tmp_suffix, cache_file)
except: except:
logger.error("dump cache file failed.") logger.exception("Dump cache file failed.")
logger.exception("")
initialized = True initialized = True
logger.debug("loading model cost %s seconds." % (time.time() - t1)) logger.debug("Loading model cost %s seconds." % (time.time() - t1))
logger.debug("Trie has been built succesfully.") logger.debug("Prefix dict has been built succesfully.")
def require_initialized(fn): def require_initialized(fn):
@@ -132,145 +126,136 @@ def __cut_all(sentence):
dag = get_DAG(sentence) dag = get_DAG(sentence)
old_j = -1 old_j = -1
for k,L in dag.iteritems(): for k,L in dag.iteritems():
if len(L)==1 and k>old_j: if len(L) == 1 and k > old_j:
yield sentence[k:L[0]+1] yield sentence[k:L[0]+1]
old_j = L[0] old_j = L[0]
else: else:
for j in L: for j in L:
if j>k: if j > k:
yield sentence[k:j+1] yield sentence[k:j+1]
old_j = j old_j = j
def calc(sentence,DAG,idx,route): def calc(sentence,DAG,idx,route):
N = len(sentence) N = len(sentence)
route[N] = (0.0,'') route[N] = (0.0, '')
for idx in xrange(N-1,-1,-1): for idx in xrange(N-1, -1, -1):
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0],x ) for x in DAG[idx] ] candidates = [(FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx]]
route[idx] = max(candidates) route[idx] = max(candidates)
@require_initialized @require_initialized
def get_DAG(sentence): def get_DAG(sentence):
N = len(sentence) global pfdict, FREQ
i,j=0,0
p = trie
DAG = {} DAG = {}
while i<N: N = len(sentence)
c = sentence[j] for k in xrange(N):
if c in p: tmplist = []
p = p[c] i = k
if '' in p: frag = sentence[k]
if i not in DAG: while i < N and frag in pfdict:
DAG[i]=[] if frag in FREQ:
DAG[i].append(j) tmplist.append(i)
j+=1 i += 1
if j>=N: frag = sentence[k:i+1]
i+=1 if not tmplist:
j=i tmplist.append(k)
p=trie DAG[k] = tmplist
else:
p = trie
i+=1
j=i
for i in xrange(len(sentence)):
if i not in DAG:
DAG[i] =[i]
return DAG return DAG
def __cut_DAG_NO_HMM(sentence): def __cut_DAG_NO_HMM(sentence):
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U) re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
DAG = get_DAG(sentence) DAG = get_DAG(sentence)
route ={} route = {}
calc(sentence,DAG,0,route=route) calc(sentence, DAG, 0, route=route)
x = 0 x = 0
N = len(sentence) N = len(sentence)
buf = u'' buf = u''
while x<N: while x < N:
y = route[x][1]+1 y = route[x][1] + 1
l_word = sentence[x:y] l_word = sentence[x:y]
if re_eng.match(l_word) and len(l_word)==1: if re_eng.match(l_word) and len(l_word) == 1:
buf += l_word buf += l_word
x =y x = y
else: else:
if len(buf)>0: if buf:
yield buf yield buf
buf = u'' buf = u''
yield l_word yield l_word
x =y x = y
if len(buf)>0: if buf:
yield buf yield buf
buf = u'' buf = u''
def __cut_DAG(sentence): def __cut_DAG(sentence):
DAG = get_DAG(sentence) DAG = get_DAG(sentence)
route ={} route = {}
calc(sentence,DAG,0,route=route) calc(sentence, DAG, 0, route=route)
x = 0 x = 0
buf =u'' buf = u''
N = len(sentence) N = len(sentence)
while x<N: while x < N:
y = route[x][1]+1 y = route[x][1]+1
l_word = sentence[x:y] l_word = sentence[x:y]
if y-x==1: if y-x == 1:
buf+= l_word buf += l_word
else: else:
if len(buf)>0: if buf:
if len(buf)==1: if len(buf) == 1:
yield buf yield buf
buf=u'' buf = u''
else: else:
if (buf not in FREQ): if (buf not in FREQ):
regognized = finalseg.cut(buf) recognized = finalseg.cut(buf)
for t in regognized: for t in recognized:
yield t yield t
else: else:
for elem in buf: for elem in buf:
yield elem yield elem
buf=u'' buf = u''
yield l_word yield l_word
x =y x = y
if len(buf)>0: if buf:
if len(buf)==1: if len(buf) == 1:
yield buf yield buf
elif (buf not in FREQ):
recognized = finalseg.cut(buf)
for t in recognized:
yield t
else: else:
if (buf not in FREQ): for elem in buf:
regognized = finalseg.cut(buf) yield elem
for t in regognized:
yield t
else:
for elem in buf:
yield elem
def cut(sentence,cut_all=False,HMM=True): def cut(sentence, cut_all=False, HMM=True):
'''The main function that segments an entire sentence that contains '''The main function that segments an entire sentence that contains
Chinese characters into separated words. Chinese characters into separated words.
Parameter: Parameter:
- sentence: The String to be segmented - sentence: The str/unicode to be segmented.
- cut_all: Model. True means full pattern, false means accurate pattern. - cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether use Hidden Markov Model. - HMM: Whether to use the Hidden Markov Model.
''' '''
if not isinstance(sentence, unicode): if not isinstance(sentence, unicode):
try: try:
sentence = sentence.decode('utf-8') sentence = sentence.decode('utf-8')
except UnicodeDecodeError: except UnicodeDecodeError:
sentence = sentence.decode('gbk','ignore') sentence = sentence.decode('gbk', 'ignore')
'''
\u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han # \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
\r\n|\s : whitespace characters. Will not be Handled. # \r\n|\s : whitespace characters. Will not be handled.
'''
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
if cut_all: if cut_all:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U) re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
else:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
if HMM: if cut_all:
cut_block = __cut_all
elif HMM:
cut_block = __cut_DAG cut_block = __cut_DAG
else: else:
cut_block = __cut_DAG_NO_HMM cut_block = __cut_DAG_NO_HMM
if cut_all:
cut_block = __cut_all
for blk in blocks: for blk in blocks:
if len(blk)==0: if not blk:
continue continue
if re_han.match(blk): if re_han.match(blk):
for word in cut_block(blk): for word in cut_block(blk):
@@ -286,15 +271,15 @@ def cut(sentence,cut_all=False,HMM=True):
else: else:
yield x yield x
def cut_for_search(sentence,HMM=True): def cut_for_search(sentence, HMM=True):
words = cut(sentence,HMM=HMM) words = cut(sentence, HMM=HMM)
for w in words: for w in words:
if len(w)>2: if len(w) > 2:
for i in xrange(len(w)-1): for i in xrange(len(w)-1):
gram2 = w[i:i+2] gram2 = w[i:i+2]
if gram2 in FREQ: if gram2 in FREQ:
yield gram2 yield gram2
if len(w)>3: if len(w) > 3:
for i in xrange(len(w)-2): for i in xrange(len(w)-2):
gram3 = w[i:i+3] gram3 = w[i:i+3]
if gram3 in FREQ: if gram3 in FREQ:
@@ -312,79 +297,71 @@ def load_userdict(f):
... ...
Word type may be ignored Word type may be ignored
''' '''
global trie,total,FREQ
if isinstance(f, (str, unicode)): if isinstance(f, (str, unicode)):
f = open(f, 'rb') f = open(f, 'rb')
content = f.read().decode('utf-8') content = f.read().decode('utf-8')
line_no = 0 line_no = 0
for line in content.split("\n"): for line in content.split("\n"):
line_no+=1 line_no += 1
if line.rstrip()=='': continue if not line.rstrip():
tup =line.split(" ") continue
word,freq = tup[0],tup[1] tup = line.split(" ")
if freq.isdigit() is False: continue word, freq = tup[0], tup[1]
if line_no==1: if freq.isdigit() is False:
continue
if line_no == 1:
word = word.replace(u'\ufeff',u"") #remove bom flag if it exists word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
if len(tup)==3: add_word(*tup)
add_word(word, freq, tup[2])
else:
add_word(word, freq)
@require_initialized @require_initialized
def add_word(word, freq, tag=None): def add_word(word, freq, tag=None):
global FREQ, trie, total, user_word_tag_tab global FREQ, pfdict, total, user_word_tag_tab
freq = float(freq) FREQ[word] = log(float(freq) / total)
FREQ[word] = log(freq / total)
if tag is not None: if tag is not None:
user_word_tag_tab[word] = tag.strip() user_word_tag_tab[word] = tag.strip()
p = trie for ch in xrange(len(word)):
for c in word: pfdict.add(word[:ch+1])
if c not in p:
p[c] = {}
p = p[c]
p[''] = '' # ending flag
__ref_cut = cut __ref_cut = cut
__ref_cut_for_search = cut_for_search __ref_cut_for_search = cut_for_search
def __lcut(sentence): def __lcut(sentence):
return list(__ref_cut(sentence,False)) return list(__ref_cut(sentence, False))
def __lcut_no_hmm(sentence): def __lcut_no_hmm(sentence):
return list(__ref_cut(sentence,False,False)) return list(__ref_cut(sentence, False, False))
def __lcut_all(sentence): def __lcut_all(sentence):
return list(__ref_cut(sentence,True)) return list(__ref_cut(sentence, True))
def __lcut_for_search(sentence): def __lcut_for_search(sentence):
return list(__ref_cut_for_search(sentence)) return list(__ref_cut_for_search(sentence))
@require_initialized @require_initialized
def enable_parallel(processnum=None): def enable_parallel(processnum=None):
global pool,cut,cut_for_search global pool, cut, cut_for_search
if os.name=='nt': if os.name == 'nt':
raise Exception("jieba: parallel mode only supports posix system") raise Exception("jieba: parallel mode only supports posix system")
if sys.version_info[0]==2 and sys.version_info[1]<6: if sys.version_info[0]==2 and sys.version_info[1]<6:
raise Exception("jieba: the parallel feature needs Python version>2.5 ") raise Exception("jieba: the parallel feature needs Python version>2.5")
from multiprocessing import Pool,cpu_count from multiprocessing import Pool, cpu_count
if processnum==None: if processnum is None:
processnum = cpu_count() processnum = cpu_count()
pool = Pool(processnum) pool = Pool(processnum)
def pcut(sentence,cut_all=False,HMM=True): def pcut(sentence,cut_all=False,HMM=True):
parts = re.compile('([\r\n]+)').split(sentence) parts = re.compile('([\r\n]+)').split(sentence)
if cut_all: if cut_all:
result = pool.map(__lcut_all,parts) result = pool.map(__lcut_all, parts)
elif HMM:
result = pool.map(__lcut, parts)
else: else:
if HMM: result = pool.map(__lcut_no_hmm, parts)
result = pool.map(__lcut,parts)
else:
result = pool.map(__lcut_no_hmm,parts)
for r in result: for r in result:
for w in r: for w in r:
yield w yield w
def pcut_for_search(sentence): def pcut_for_search(sentence):
parts = re.compile('([\r\n]+)').split(sentence) parts = re.compile('([\r\n]+)').split(sentence)
result = pool.map(__lcut_for_search,parts) result = pool.map(__lcut_for_search, parts)
for r in result: for r in result:
for w in r: for w in r:
yield w yield w
@@ -403,40 +380,44 @@ def disable_parallel():
def set_dictionary(dictionary_path): def set_dictionary(dictionary_path):
global initialized, DICTIONARY global initialized, DICTIONARY
with DICT_LOCK: with DICT_LOCK:
abs_path = os.path.normpath( os.path.join( os.getcwd(), dictionary_path ) ) abs_path = os.path.normpath(os.path.join(os.getcwd(), dictionary_path))
if not os.path.exists(abs_path): if not os.path.exists(abs_path):
raise Exception("jieba: path does not exist:" + abs_path) raise Exception("jieba: path does not exist: " + abs_path)
DICTIONARY = abs_path DICTIONARY = abs_path
initialized = False initialized = False
def get_abs_path_dict(): def get_abs_path_dict():
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) ) _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
abs_path = os.path.join(_curpath,DICTIONARY) abs_path = os.path.join(_curpath,DICTIONARY)
return abs_path return abs_path
def tokenize(unicode_sentence,mode="default",HMM=True): def tokenize(unicode_sentence, mode="default", HMM=True):
#mode ("default" or "search") """Tokenize a sentence and yields tuples of (word, start, end)
Parameter:
- sentence: the unicode to be segmented.
- mode: "default" or "search", "search" is for finer segmentation.
- HMM: whether to use the Hidden Markov Model.
"""
if not isinstance(unicode_sentence, unicode): if not isinstance(unicode_sentence, unicode):
raise Exception("jieba: the input parameter should unicode.") raise Exception("jieba: the input parameter should be unicode.")
start = 0 start = 0
if mode=='default': if mode == 'default':
for w in cut(unicode_sentence,HMM=HMM): for w in cut(unicode_sentence, HMM=HMM):
width = len(w) width = len(w)
yield (w,start,start+width) yield (w, start, start+width)
start+=width start += width
else: else:
for w in cut(unicode_sentence,HMM=HMM): for w in cut(unicode_sentence, HMM=HMM):
width = len(w) width = len(w)
if len(w)>2: if len(w) > 2:
for i in xrange(len(w)-1): for i in xrange(len(w)-1):
gram2 = w[i:i+2] gram2 = w[i:i+2]
if gram2 in FREQ: if gram2 in FREQ:
yield (gram2,start+i,start+i+2) yield (gram2, start+i, start+i+2)
if len(w)>3: if len(w) > 3:
for i in xrange(len(w)-2): for i in xrange(len(w)-2):
gram3 = w[i:i+3] gram3 = w[i:i+3]
if gram3 in FREQ: if gram3 in FREQ:
yield (gram3,start+i,start+i+3) yield (gram3, start+i, start+i+3)
yield (w,start,start+width) yield (w, start, start+width)
start+=width start += width
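
A short usage sketch of the public API touched above, in the repo's Python 2 style (the sample sentences are arbitrary):

    # -*- coding: utf-8 -*-
    import jieba

    print "/ ".join(jieba.cut(u"我来到北京清华大学", cut_all=False, HMM=True))

    # tokenize() yields (word, start, end) offsets into the unicode input
    for word, start, end in jieba.tokenize(u"永和服装饰品有限公司"):
        print "%s\t%d\t%d" % (word, start, end)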

jieba/__main__.py (new file, +35 lines)
@@ -0,0 +1,35 @@
"""Jieba command line interface."""
import sys
import jieba
from argparse import ArgumentParser
parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", version="Jieba " + jieba.__version__, epilog="If no filename specified, use STDIN instead.")
parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
nargs='?', const=' ',
help="use DELIM instead of ' / ' for word delimiter; use a space if it is without DELIM")
parser.add_argument("-a", "--cut-all",
action="store_true", dest="cutall", default=False,
help="full pattern cutting")
parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
default=True, help="don't use the Hidden Markov Model")
parser.add_argument("-q", "--quiet", action="store_true", default=False,
help="don't print loading messages to stderr")
parser.add_argument("filename", nargs='?', help="input file")
args = parser.parse_args()
if args.quiet:
jieba.setLogLevel(60)
delim = unicode(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin
jieba.initialize()
ln = fp.readline()
while ln:
l = ln.rstrip('\r\n')
print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8'))
ln = fp.readline()
fp.close()
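
Example invocations of the new command line interface (the input file name is hypothetical; with no filename it reads STDIN, as the epilog says):

    python -m jieba -q -d ' | ' input.txt > cut_result.txt
    python -m jieba --cut-all --no-hmm input.txt
    cat input.txt | python -m jieba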

jieba/analyse/__init__.py
@@ -6,12 +6,14 @@ try:
except ImportError: except ImportError:
pass pass
_curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) ) _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
abs_path = os.path.join(_curpath, "idf.txt") abs_path = os.path.join(_curpath, "idf.txt")
STOP_WORDS = set([ STOP_WORDS = set((
"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that" "the","of","is","and","to","in","that","we","for","an","are",
]) "by","be","as","on","with","can","if","from","which","you","it",
"this","then","at","have","all","not","one","has","or","that"
))
class IDFLoader: class IDFLoader:
def __init__(self): def __init__(self):
@@ -21,13 +23,13 @@ class IDFLoader:
def set_new_path(self, new_idf_path): def set_new_path(self, new_idf_path):
if self.path != new_idf_path: if self.path != new_idf_path:
content = open(new_idf_path,'rb').read().decode('utf-8') content = open(new_idf_path, 'rb').read().decode('utf-8')
idf_freq = {} idf_freq = {}
lines = content.split('\n') lines = content.split('\n')
if lines and not lines[-1]: if lines and not lines[-1]:
lines.pop(-1) lines.pop(-1)
for line in lines: for line in lines:
word,freq = line.split(' ') word, freq = line.split(' ')
idf_freq[word] = float(freq) idf_freq[word] = float(freq)
median_idf = sorted(idf_freq.values())[len(idf_freq)/2] median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
self.idf_freq = idf_freq self.idf_freq = idf_freq
@@ -41,24 +43,22 @@ idf_loader = IDFLoader()
idf_loader.set_new_path(abs_path) idf_loader.set_new_path(abs_path)
def set_idf_path(idf_path): def set_idf_path(idf_path):
new_abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path ) ) new_abs_path = os.path.normpath(os.path.join(os.getcwd(), idf_path))
if not os.path.exists(new_abs_path): if not os.path.exists(new_abs_path):
raise Exception("jieba: path does not exist:" + new_abs_path) raise Exception("jieba: path does not exist: " + new_abs_path)
idf_loader.set_new_path(new_abs_path) idf_loader.set_new_path(new_abs_path)
return
def set_stop_words(stop_words_path): def set_stop_words(stop_words_path):
global STOP_WORDS global STOP_WORDS
abs_path = os.path.normpath( os.path.join( os.getcwd(), stop_words_path ) ) abs_path = os.path.normpath(os.path.join(os.getcwd(), stop_words_path))
if not os.path.exists(abs_path): if not os.path.exists(abs_path):
raise Exception("jieba: path does not exist:" + abs_path) raise Exception("jieba: path does not exist: " + abs_path)
content = open(abs_path,'rb').read().decode('utf-8') content = open(abs_path,'rb').read().decode('utf-8')
lines = content.split('\n') lines = content.split('\n')
for line in lines: for line in lines:
STOP_WORDS.add(line) STOP_WORDS.add(line)
return
def extract_tags(sentence,topK=20): def extract_tags(sentence, topK=20):
global STOP_WORDS global STOP_WORDS
idf_freq, median_idf = idf_loader.get_idf() idf_freq, median_idf = idf_loader.get_idf()
@@ -66,15 +66,17 @@ def extract_tags(sentence,topK=20):
words = jieba.cut(sentence) words = jieba.cut(sentence)
freq = {} freq = {}
for w in words: for w in words:
if len(w.strip())<2: continue if len(w.strip()) < 2:
if w.lower() in STOP_WORDS: continue continue
freq[w]=freq.get(w,0.0)+1.0 if w.lower() in STOP_WORDS:
continue
freq[w] = freq.get(w, 0.0) + 1.0
total = sum(freq.values()) total = sum(freq.values())
freq = [(k,v/total) for k,v in freq.iteritems()] freq = [(k,v/total) for k,v in freq.iteritems()]
tf_idf_list = [(v * idf_freq.get(k,median_idf),k) for k,v in freq] tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
st_list = sorted(tf_idf_list,reverse=True) st_list = sorted(tf_idf_list, reverse=True)
top_tuples= st_list[:topK] top_tuples = st_list[:topK]
tags = [a[1] for a in top_tuples] tags = [a[1] for a in top_tuples]
return tags return tags
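
A hedged usage sketch of the keyword extractor above (the input file name is hypothetical; the module path assumes this file is jieba/analyse/__init__.py):

    import jieba.analyse

    content = open('article.txt', 'rb').read()
    for tag in jieba.analyse.extract_tags(content, topK=10):
        print tag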

jieba/analyse/analyzer.py
@@ -1,4 +1,4 @@
#encoding=utf-8 ##encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem from whoosh.lang.porter import stem
@@ -10,26 +10,24 @@ STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may', 'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this', 'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
'to', 'us', 'we', 'when', 'will', 'with', 'yet', 'to', 'us', 'we', 'when', 'will', 'with', 'yet',
'you', 'your',u'',u'',u'')) 'you', 'your', u'', u'', u''))
accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+") accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")
class ChineseTokenizer(Tokenizer): class ChineseTokenizer(Tokenizer):
def __call__(self,text,**kargs): def __call__(self, text, **kargs):
words = jieba.tokenize(text,mode="search") words = jieba.tokenize(text, mode="search")
token = Token() token = Token()
for (w,start_pos,stop_pos) in words: for (w,start_pos,stop_pos) in words:
if not accepted_chars.match(w): if not accepted_chars.match(w) and len(w)<=1:
if len(w)>1: continue
pass
else:
continue
token.original = token.text = w token.original = token.text = w
token.pos = start_pos token.pos = start_pos
token.startchar = start_pos token.startchar = start_pos
token.endchar = stop_pos token.endchar = stop_pos
yield token yield token
def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1,stemfn=stem,cachesize=50000): def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)\ return (ChineseTokenizer() | LowercaseFilter() |
|StemFilter(stemfn=stemfn, ignore=None,cachesize=cachesize) StopFilter(stoplist=stoplist,minsize=minsize) |
StemFilter(stemfn=stemfn, ignore=None,cachesize=cachesize))
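
For context, a sketch of how the analyzer above is typically plugged into a Whoosh schema (field names are made up; the import assumes this file lives at jieba/analyse/analyzer.py):

    from whoosh.fields import Schema, TEXT
    from jieba.analyse.analyzer import ChineseAnalyzer

    analyzer = ChineseAnalyzer()
    schema = Schema(title=TEXT(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))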

jieba/finalseg/__init__.py
@@ -4,7 +4,7 @@ import os
import marshal import marshal
import sys import sys
MIN_FLOAT=-3.14e100 MIN_FLOAT = -3.14e100
PROB_START_P = "prob_start.p" PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p" PROB_TRANS_P = "prob_trans.p"
@@ -19,20 +19,20 @@ PrevStatus = {
} }
def load_model(): def load_model():
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) ) _curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
start_p = {} start_p = {}
abs_path = os.path.join(_curpath, PROB_START_P) abs_path = os.path.join(_curpath, PROB_START_P)
with open(abs_path, mode='rb') as f: with open(abs_path, mode='rb') as f:
start_p = marshal.load(f) start_p = marshal.load(f)
f.closed f.closed
trans_p = {} trans_p = {}
abs_path = os.path.join(_curpath, PROB_TRANS_P) abs_path = os.path.join(_curpath, PROB_TRANS_P)
with open(abs_path, 'rb') as f: with open(abs_path, 'rb') as f:
trans_p = marshal.load(f) trans_p = marshal.load(f)
f.closed f.closed
emit_p = {} emit_p = {}
abs_path = os.path.join(_curpath, PROB_EMIT_P) abs_path = os.path.join(_curpath, PROB_EMIT_P)
with file(abs_path, 'rb') as f: with file(abs_path, 'rb') as f:
@@ -53,45 +53,45 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
for y in states: #init for y in states: #init
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT) V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
path[y] = [y] path[y] = [y]
for t in range(1,len(obs)): for t in xrange(1,len(obs)):
V.append({}) V.append({})
newpath = {} newpath = {}
for y in states: for y in states:
em_p = emit_p[y].get(obs[t],MIN_FLOAT) em_p = emit_p[y].get(obs[t],MIN_FLOAT)
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + em_p ,y0) for y0 in PrevStatus[y] ]) (prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])
V[t][y] =prob V[t][y] = prob
newpath[y] = path[state] + [y] newpath[y] = path[state] + [y]
path = newpath path = newpath
(prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E','S')]) (prob, state) = max([(V[len(obs)-1][y], y) for y in ('E','S')])
return (prob, path[state]) return (prob, path[state])
def __cut(sentence): def __cut(sentence):
global emit_P global emit_P
prob, pos_list = viterbi(sentence,('B','M','E','S'), start_P, trans_P, emit_P) prob, pos_list = viterbi(sentence, ('B','M','E','S'), start_P, trans_P, emit_P)
begin, next = 0,0 begin, next = 0,0
#print pos_list, sentence #print pos_list, sentence
for i,char in enumerate(sentence): for i,char in enumerate(sentence):
pos = pos_list[i] pos = pos_list[i]
if pos=='B': if pos == 'B':
begin = i begin = i
elif pos=='E': elif pos == 'E':
yield sentence[begin:i+1] yield sentence[begin:i+1]
next = i+1 next = i+1
elif pos=='S': elif pos == 'S':
yield char yield char
next = i+1 next = i+1
if next<len(sentence): if next < len(sentence):
yield sentence[next:] yield sentence[next:]
def cut(sentence): def cut(sentence):
if not ( type(sentence) is unicode): if not isinstance(sentence, unicode):
try: try:
sentence = sentence.decode('utf-8') sentence = sentence.decode('utf-8')
except: except UnicodeDecodeError:
sentence = sentence.decode('gbk','ignore') sentence = sentence.decode('gbk', 'ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"(\d+\.\d+|[a-zA-Z0-9]+)") re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"(\d+\.\d+|[a-zA-Z0-9]+)")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
for blk in blocks: for blk in blocks:
@@ -101,5 +101,5 @@ def cut(sentence):
else: else:
tmp = re_skip.split(blk) tmp = re_skip.split(blk)
for x in tmp: for x in tmp:
if x!="": if x:
yield x yield x
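
The finalseg module above is the HMM-based recognizer that __cut_DAG falls back to for words missing from the dictionary; it can also be called on its own. A minimal sketch (arbitrary sample sentence):

    from jieba import finalseg

    print "/ ".join(finalseg.cut(u"他来到了网易杭研大厦"))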

jieba/posseg/__init__.py
@@ -14,8 +14,8 @@ PROB_TRANS_P = "prob_trans.p"
PROB_EMIT_P = "prob_emit.p" PROB_EMIT_P = "prob_emit.p"
CHAR_STATE_TAB_P = "char_state_tab.p" CHAR_STATE_TAB_P = "char_state_tab.p"
def load_model(f_name,isJython=True): def load_model(f_name, isJython=True):
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) ) _curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
result = {} result = {}
with file(f_name, "rb") as f: with file(f_name, "rb") as f:
@@ -23,7 +23,7 @@ def load_model(f_name,isJython=True):
line = line.strip() line = line.strip()
if line=="":continue if line=="":continue
word, _, tag = line.split(' ') word, _, tag = line.split(' ')
result[word.decode('utf-8')]=tag result[word.decode('utf-8')] = tag
f.closed f.closed
if not isJython: if not isJython:
return result return result
@@ -59,7 +59,7 @@ if sys.platform.startswith("java"):
else: else:
import char_state_tab, prob_start, prob_trans, prob_emit import char_state_tab, prob_start, prob_trans, prob_emit
char_state_tab_P, start_P, trans_P, emit_P = char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P char_state_tab_P, start_P, trans_P, emit_P = char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P
word_tag_tab = load_model(jieba.get_abs_path_dict(),isJython=False) word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
def makesure_userdict_loaded(fn): def makesure_userdict_loaded(fn):
@@ -78,7 +78,7 @@ class pair(object):
self.flag = flag self.flag = flag
def __unicode__(self): def __unicode__(self):
return self.word+u"/"+self.flag return u'%s/%s' % (self.word, self.flag)
def __repr__(self): def __repr__(self):
return self.__str__() return self.__str__()
@@ -90,25 +90,25 @@ class pair(object):
return self.__unicode__().encode(arg) return self.__unicode__().encode(arg)
def __cut(sentence): def __cut(sentence):
prob, pos_list = viterbi.viterbi(sentence,char_state_tab_P, start_P, trans_P, emit_P) prob, pos_list = viterbi.viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
begin, next = 0,0 begin, next = 0, 0
for i,char in enumerate(sentence): for i,char in enumerate(sentence):
pos = pos_list[i][0] pos = pos_list[i][0]
if pos=='B': if pos == 'B':
begin = i begin = i
elif pos=='E': elif pos == 'E':
yield pair(sentence[begin:i+1], pos_list[i][1]) yield pair(sentence[begin:i+1], pos_list[i][1])
next = i+1 next = i+1
elif pos=='S': elif pos == 'S':
yield pair(char,pos_list[i][1]) yield pair(char, pos_list[i][1])
next = i+1 next = i+1
if next<len(sentence): if next < len(sentence):
yield pair(sentence[next:], pos_list[next][1] ) yield pair(sentence[next:], pos_list[next][1])
def __cut_detail(sentence): def __cut_detail(sentence):
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)") re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)")
re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+") re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
for blk in blocks: for blk in blocks:
if re_han.match(blk): if re_han.match(blk):
@@ -117,89 +117,88 @@ def __cut_detail(sentence):
else: else:
tmp = re_skip.split(blk) tmp = re_skip.split(blk)
for x in tmp: for x in tmp:
if x!="": if x:
if re_num.match(x): if re_num.match(x):
yield pair(x,'m') yield pair(x, 'm')
elif re_eng.match(x): elif re_eng.match(x):
yield pair(x,'eng') yield pair(x, 'eng')
else: else:
yield pair(x,'x') yield pair(x, 'x')
def __cut_DAG_NO_HMM(sentence): def __cut_DAG_NO_HMM(sentence):
DAG = jieba.get_DAG(sentence) DAG = jieba.get_DAG(sentence)
route ={} route = {}
jieba.calc(sentence,DAG,0,route=route) jieba.calc(sentence, DAG, 0, route=route)
x = 0 x = 0
N = len(sentence) N = len(sentence)
buf =u'' buf = u''
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U) re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
while x<N: while x < N:
y = route[x][1]+1 y = route[x][1]+1
l_word = sentence[x:y] l_word = sentence[x:y]
if re_eng.match(l_word) and len(l_word)==1: if re_eng.match(l_word) and len(l_word) == 1:
buf += l_word buf += l_word
x = y x = y
else: else:
if len(buf)>0: if buf:
yield pair(buf,'eng') yield pair(buf,'eng')
buf = u'' buf = u''
yield pair(l_word,word_tag_tab.get(l_word,'x')) yield pair(l_word, word_tag_tab.get(l_word, 'x'))
x =y x = y
if len(buf)>0: if buf:
yield pair(buf,'eng') yield pair(buf,'eng')
buf = u'' buf = u''
def __cut_DAG(sentence): def __cut_DAG(sentence):
DAG = jieba.get_DAG(sentence) DAG = jieba.get_DAG(sentence)
route ={} route = {}
jieba.calc(sentence,DAG,0,route=route) jieba.calc(sentence,DAG,0,route=route)
x = 0 x = 0
buf =u'' buf = u''
N = len(sentence) N = len(sentence)
while x<N: while x < N:
y = route[x][1]+1 y = route[x][1]+1
l_word = sentence[x:y] l_word = sentence[x:y]
if y-x==1: if y-x == 1:
buf+= l_word buf += l_word
else: else:
if len(buf)>0: if buf:
if len(buf)==1: if len(buf) == 1:
yield pair(buf,word_tag_tab.get(buf,'x')) yield pair(buf, word_tag_tab.get(buf, 'x'))
buf=u'' buf = u''
else: else:
if (buf not in jieba.FREQ): if (buf not in jieba.FREQ):
regognized = __cut_detail(buf) recognized = __cut_detail(buf)
for t in regognized: for t in recognized:
yield t yield t
else: else:
for elem in buf: for elem in buf:
yield pair(elem,word_tag_tab.get(elem,'x')) yield pair(elem, word_tag_tab.get(elem, 'x'))
buf=u'' buf = u''
yield pair(l_word,word_tag_tab.get(l_word,'x')) yield pair(l_word, word_tag_tab.get(l_word, 'x'))
x =y x = y
if len(buf)>0: if buf:
if len(buf)==1: if len(buf) == 1:
yield pair(buf,word_tag_tab.get(buf,'x')) yield pair(buf, word_tag_tab.get(buf, 'x'))
elif (buf not in jieba.FREQ):
recognized = __cut_detail(buf)
for t in recognized:
yield t
else: else:
if (buf not in jieba.FREQ): for elem in buf:
regognized = __cut_detail(buf) yield pair(elem, word_tag_tab.get(elem, 'x'))
for t in regognized:
yield t
else:
for elem in buf:
yield pair(elem,word_tag_tab.get(elem,'x'))
def __cut_internal(sentence,HMM=True): def __cut_internal(sentence, HMM=True):
if not ( type(sentence) is unicode): if not isinstance(sentence, unicode):
try: try:
sentence = sentence.decode('utf-8') sentence = sentence.decode('utf-8')
except: except UnicodeDecodeError:
sentence = sentence.decode('gbk','ignore') sentence = sentence.decode('gbk', 'ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)") re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)")
re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+") re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
blocks = re_han.split(sentence) blocks = re_han.split(sentence)
if HMM: if HMM:
__cut_blk = __cut_DAG __cut_blk = __cut_DAG
@@ -214,15 +213,15 @@ def __cut_internal(sentence,HMM=True):
tmp = re_skip.split(blk) tmp = re_skip.split(blk)
for x in tmp: for x in tmp:
if re_skip.match(x): if re_skip.match(x):
yield pair(x,'x') yield pair(x, 'x')
else: else:
for xx in x: for xx in x:
if re_num.match(xx): if re_num.match(xx):
yield pair(xx,'m') yield pair(xx, 'm')
elif re_eng.match(x): elif re_eng.match(x):
yield pair(xx,'eng') yield pair(xx, 'eng')
else: else:
yield pair(xx,'x') yield pair(xx, 'x')
def __lcut_internal(sentence): def __lcut_internal(sentence):
return list(__cut_internal(sentence)) return list(__cut_internal(sentence))
@@ -231,16 +230,16 @@ def __lcut_internal_no_hmm(sentence):
@makesure_userdict_loaded @makesure_userdict_loaded
def cut(sentence,HMM=True): def cut(sentence, HMM=True):
if (not hasattr(jieba,'pool')) or (jieba.pool==None): if (not hasattr(jieba, 'pool')) or (jieba.pool is None):
for w in __cut_internal(sentence,HMM=HMM): for w in __cut_internal(sentence, HMM=HMM):
yield w yield w
else: else:
parts = re.compile('([\r\n]+)').split(sentence) parts = re.compile('([\r\n]+)').split(sentence)
if HMM: if HMM:
result = jieba.pool.map(__lcut_internal,parts) result = jieba.pool.map(__lcut_internal, parts)
else: else:
result = jieba.pool.map(__lcut_internal_no_hmm,parts) result = jieba.pool.map(__lcut_internal_no_hmm, parts)
for r in result: for r in result:
for w in r: for w in r:
yield w yield w
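
A short usage sketch of the POS-tagging interface above (the sample sentence is arbitrary; pair objects expose .word and .flag as defined in this file):

    import jieba.posseg as pseg

    for w in pseg.cut(u"我爱北京天安门"):
        print "%s %s" % (w.word, w.flag)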

jieba/posseg/viterbi.py
@@ -1,46 +1,45 @@
import operator import operator
MIN_FLOAT=-3.14e100 MIN_FLOAT = -3.14e100
MIN_INF=float("-inf") MIN_INF = float("-inf")
def get_top_states(t_state_v,K=4): def get_top_states(t_state_v, K=4):
items = t_state_v.items() items = t_state_v.items()
topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K] topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]
return [x[0] for x in topK] return [x[0] for x in topK]
def viterbi(obs, states, start_p, trans_p, emit_p): def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] #tabular V = [{}] #tabular
mem_path = [{}] mem_path = [{}]
all_states = trans_p.keys() all_states = trans_p.keys()
for y in states.get(obs[0],all_states): #init for y in states.get(obs[0], all_states): #init
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT) V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
mem_path[0][y] = '' mem_path[0][y] = ''
for t in range(1,len(obs)): for t in xrange(1, len(obs)):
V.append({}) V.append({})
mem_path.append({}) mem_path.append({})
#prev_states = get_top_states(V[t-1]) #prev_states = get_top_states(V[t-1])
prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ] prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]
prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) ) prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
obs_states = states.get(obs[t],all_states) obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next
obs_states = set(obs_states) & set(prev_states_expect_next)
if len(obs_states)==0: obs_states = prev_states_expect_next if not obs_states:
if len(obs_states)==0: obs_states = all_states obs_states = prev_states_expect_next if prev_states_expect_next else all_states
for y in obs_states: for y in obs_states:
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in prev_states]) prob, state = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states])
V[t][y] =prob V[t][y] = prob
mem_path[t][y] = state mem_path[t][y] = state
last = [(V[-1][y], y) for y in mem_path[-1].keys() ] last = [(V[-1][y], y) for y in mem_path[-1].keys()]
#if len(last)==0: #if len(last)==0:
#print obs #print obs
(prob, state) = max(last) prob, state = max(last)
route = [None] * len(obs) route = [None] * len(obs)
i = len(obs)-1 i = len(obs) - 1
while i>=0: while i >= 0:
route[i] = state route[i] = state
state = mem_path[i][state] state = mem_path[i][state]
i-=1 i -= 1
return (prob, route) return (prob, route)

test script (segmentation demo)
@@ -4,14 +4,14 @@ sys.path.append("../")
import jieba import jieba
seg_list = jieba.cut("我来到北京清华大学", cut_all=True) seg_list = jieba.cut(u"我来到北京清华大学", cut_all=True)
print "Full Mode:", "/ ".join(seg_list) # 全模式 print u"Full Mode:", u"/ ".join(seg_list) # 全模式
seg_list = jieba.cut("我来到北京清华大学", cut_all=False) seg_list = jieba.cut(u"我来到北京清华大学", cut_all=False)
print "Default Mode:", "/ ".join(seg_list) # 默认模式 print u"Default Mode:", u"/ ".join(seg_list) # 默认模式
seg_list = jieba.cut("他来到了网易杭研大厦") seg_list = jieba.cut(u"他来到了网易杭研大厦")
print ", ".join(seg_list) print u", ".join(seg_list)
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 seg_list = jieba.cut_for_search(u"小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print ", ".join(seg_list) print u", ".join(seg_list)