Merge pull request #188 from gumblex/jieba3k

Stop using a Trie; same as #187.
Sun Junyi 2014-10-19 19:43:48 +08:00
commit 56e8336af1
7 changed files with 348 additions and 321 deletions
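The core of this change is replacing the nested-dict character Trie with a flat set containing every prefix of every dictionary word; get_DAG() then grows a text fragment character by character for as long as it is still a known prefix. Below is a minimal, self-contained sketch of that idea (illustrative names only, not jieba's actual API):

    # Sketch of the prefix-dict approach this commit switches to (illustrative names).
    def build_pfdict(words):
        """Collect every prefix of every dictionary word into one flat set."""
        pfdict = set()
        for w in words:
            for i in range(len(w)):
                pfdict.add(w[:i+1])
        return pfdict

    def get_dag(sentence, pfdict, freq):
        """For each start index k, list the end indices of dictionary words."""
        dag = {}
        n = len(sentence)
        for k in range(n):
            ends = []
            i = k
            frag = sentence[k]
            # extend the fragment while it is still a prefix of some word
            while i < n and frag in pfdict:
                if frag in freq:          # frag is itself a complete dictionary word
                    ends.append(i)
                i += 1
                frag = sentence[k:i+1]
            dag[k] = ends or [k]
        return dag

    words = {"ab", "abc", "bc"}
    print(get_dag("abc", build_pfdict(words), {w: 1 for w in words}))
    # -> {0: [1, 2], 1: [2], 2: [2]}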

jieba/__init__.py
@@ -16,14 +16,13 @@ import logging
 DICTIONARY = "dict.txt"
 DICT_LOCK = threading.RLock()
-trie = None # to be initialized
+pfdict = None # to be initialized
 FREQ = {}
 min_freq = 0.0
-total =0.0
-user_word_tag_tab={}
+total = 0.0
+user_word_tag_tab = {}
 initialized = False
 log_console = logging.StreamHandler(sys.stderr)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -33,85 +32,80 @@ def setLogLevel(log_level):
     global logger
     logger.setLevel(log_level)
-def gen_trie(f_name):
+def gen_pfdict(f_name):
     lfreq = {}
-    trie = {}
+    pfdict = set()
     ltotal = 0.0
     with open(f_name, 'rb') as f:
         lineno = 0
         for line in f.read().rstrip().decode('utf-8').split('\n'):
             lineno += 1
             try:
-                word,freq,_ = line.split(' ')
+                word,freq = line.split(' ')[:2]
                 freq = float(freq)
                 lfreq[word] = freq
-                ltotal+=freq
-                p = trie
-                for c in word:
-                    if c not in p:
-                        p[c] ={}
-                    p = p[c]
-                p['']='' #ending flag
+                ltotal += freq
+                for ch in range(len(word)):
+                    pfdict.add(word[:ch+1])
             except ValueError as e:
                 logger.debug('%s at line %s %s' % (f_name, lineno, line))
                 raise e
-    return trie, lfreq,ltotal
+    return pfdict, lfreq, ltotal
 def initialize(*args):
-    global trie, FREQ, total, min_freq, initialized
-    if len(args)==0:
+    global pfdict, FREQ, total, min_freq, initialized
+    if not args:
         dictionary = DICTIONARY
     else:
         dictionary = args[0]
     with DICT_LOCK:
         if initialized:
             return
-        if trie:
-            del trie
-            trie = None
-        _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+        if pfdict:
+            del pfdict
+            pfdict = None
+        _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
         abs_path = os.path.join(_curpath,dictionary)
-        logger.debug("Building Trie..., from %s" % abs_path)
+        logger.debug("Building prefix dict from %s ..." % abs_path)
         t1 = time.time()
-        if abs_path == os.path.join(_curpath,"dict.txt"): #defautl dictionary
-            cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache")
-        else: #customer dictionary
-            cache_file = os.path.join(tempfile.gettempdir(),"jieba.user."+str(hash(abs_path))+".cache")
+        if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
+            cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
+        else: #custom dictionary
+            cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))
        load_from_cache_fail = True
-        if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(abs_path):
-            logger.debug("loading model from cache %s" % cache_file)
+        if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
+            logger.debug("Loading model from cache %s" % cache_file)
             try:
                 with open(cache_file, 'rb') as cf:
-                    trie,FREQ,total,min_freq = marshal.load(cf)
-                load_from_cache_fail = False
+                    pfdict,FREQ,total,min_freq = marshal.load(cf)
+                # prevent conflict with old version
+                load_from_cache_fail = not isinstance(pfdict, set)
             except:
                 load_from_cache_fail = True
         if load_from_cache_fail:
-            trie,FREQ,total = gen_trie(abs_path)
+            pfdict,FREQ,total = gen_pfdict(abs_path)
             FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.items()]) #normalize
             min_freq = min(FREQ.values())
-            logger.debug("dumping model to file cache %s" % cache_file)
+            logger.debug("Dumping model to file cache %s" % cache_file)
             try:
                 tmp_suffix = "."+str(random.random())
                 with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
-                    marshal.dump((trie,FREQ,total,min_freq),temp_cache_file)
-                if os.name=='nt':
-                    import shutil
-                    replace_file = shutil.move
+                    marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
+                if os.name == 'nt':
+                    from shutil import move as replace_file
                 else:
                     replace_file = os.rename
-                replace_file(cache_file+tmp_suffix,cache_file)
+                replace_file(cache_file + tmp_suffix, cache_file)
             except:
-                logger.error("dump cache file failed.")
-                logger.exception("")
+                logger.exception("Dump cache file failed.")
         initialized = True
-        logger.debug("loading model cost %s seconds." % (time.time() - t1))
-        logger.debug("Trie has been built succesfully.")
+        logger.debug("Loading model cost %s seconds." % (time.time() - t1))
+        logger.debug("Prefix dict has been built succesfully.")
 def require_initialized(fn):
@@ -132,145 +126,136 @@ def __cut_all(sentence):
     dag = get_DAG(sentence)
     old_j = -1
     for k,L in dag.items():
-        if len(L)==1 and k>old_j:
+        if len(L) == 1 and k > old_j:
             yield sentence[k:L[0]+1]
             old_j = L[0]
         else:
             for j in L:
-                if j>k:
+                if j > k:
                     yield sentence[k:j+1]
                     old_j = j
 def calc(sentence,DAG,idx,route):
     N = len(sentence)
-    route[N] = (0.0,'')
-    for idx in range(N-1,-1,-1):
-        candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0],x ) for x in DAG[idx] ]
+    route[N] = (0.0, '')
+    for idx in range(N-1, -1, -1):
+        candidates = [(FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx]]
         route[idx] = max(candidates)
 @require_initialized
 def get_DAG(sentence):
-    N = len(sentence)
-    i,j=0,0
-    p = trie
+    global pfdict, FREQ
     DAG = {}
-    while i<N:
-        c = sentence[j]
-        if c in p:
-            p = p[c]
-            if '' in p:
-                if i not in DAG:
-                    DAG[i]=[]
-                DAG[i].append(j)
-            j+=1
-            if j>=N:
-                i+=1
-                j=i
-                p=trie
-        else:
-            p = trie
-            i+=1
-            j=i
-    for i in range(len(sentence)):
-        if i not in DAG:
-            DAG[i] =[i]
+    N = len(sentence)
+    for k in range(N):
+        tmplist = []
+        i = k
+        frag = sentence[k]
+        while i < N and frag in pfdict:
+            if frag in FREQ:
+                tmplist.append(i)
+            i += 1
+            frag = sentence[k:i+1]
+        if not tmplist:
+            tmplist.append(k)
+        DAG[k] = tmplist
     return DAG
 def __cut_DAG_NO_HMM(sentence):
     re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
     DAG = get_DAG(sentence)
-    route ={}
-    calc(sentence,DAG,0,route=route)
+    route = {}
+    calc(sentence, DAG, 0, route=route)
     x = 0
     N = len(sentence)
     buf = ''
-    while x<N:
-        y = route[x][1]+1
+    while x < N:
+        y = route[x][1] + 1
         l_word = sentence[x:y]
-        if re_eng.match(l_word) and len(l_word)==1:
+        if re_eng.match(l_word) and len(l_word) == 1:
             buf += l_word
-            x =y
+            x = y
         else:
-            if len(buf)>0:
+            if buf:
                 yield buf
                 buf = ''
             yield l_word
-            x =y
-    if len(buf)>0:
+            x = y
+    if buf:
         yield buf
         buf = ''
 def __cut_DAG(sentence):
     DAG = get_DAG(sentence)
-    route ={}
-    calc(sentence,DAG,0,route=route)
+    route = {}
+    calc(sentence, DAG, 0, route=route)
     x = 0
-    buf =''
+    buf = ''
     N = len(sentence)
-    while x<N:
+    while x < N:
         y = route[x][1]+1
         l_word = sentence[x:y]
-        if y-x==1:
-            buf+= l_word
+        if y-x == 1:
+            buf += l_word
         else:
-            if len(buf)>0:
-                if len(buf)==1:
+            if buf:
+                if len(buf) == 1:
                     yield buf
-                    buf=''
+                    buf = ''
                 else:
                     if (buf not in FREQ):
-                        regognized = finalseg.cut(buf)
-                        for t in regognized:
+                        recognized = finalseg.cut(buf)
+                        for t in recognized:
                             yield t
                     else:
                         for elem in buf:
                             yield elem
-                    buf=''
+                    buf = ''
             yield l_word
-        x =y
-    if len(buf)>0:
-        if len(buf)==1:
+        x = y
+    if buf:
+        if len(buf) == 1:
             yield buf
+        elif (buf not in FREQ):
+            recognized = finalseg.cut(buf)
+            for t in recognized:
+                yield t
         else:
-            if (buf not in FREQ):
-                regognized = finalseg.cut(buf)
-                for t in regognized:
-                    yield t
-            else:
-                for elem in buf:
-                    yield elem
-def cut(sentence,cut_all=False,HMM=True):
+            for elem in buf:
+                yield elem
+def cut(sentence, cut_all=False, HMM=True):
     '''The main function that segments an entire sentence that contains
     Chinese characters into seperated words.
     Parameter:
-        - sentence: The String to be segmented
-        - cut_all: Model. True means full pattern, false means accurate pattern.
-        - HMM: Whether use Hidden Markov Model.
+        - sentence: The str to be segmented.
+        - cut_all: Model type. True for full pattern, False for accurate pattern.
+        - HMM: Whether to use the Hidden Markov Model.
     '''
     if isinstance(sentence, bytes):
         try:
             sentence = sentence.decode('utf-8')
         except UnicodeDecodeError:
-            sentence = sentence.decode('gbk','ignore')
-    '''
-    \\u4E00-\\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
-    \r\n|\s : whitespace characters. Will not be Handled.
-    '''
-    re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)")
+            sentence = sentence.decode('gbk', 'ignore')
+    # \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
+    # \r\n|\s : whitespace characters. Will not be handled.
     if cut_all:
-        re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]")
+        re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
+    else:
+        re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U)
     blocks = re_han.split(sentence)
-    if HMM:
+    if cut_all:
+        cut_block = __cut_all
+    elif HMM:
         cut_block = __cut_DAG
     else:
         cut_block = __cut_DAG_NO_HMM
-    if cut_all:
-        cut_block = __cut_all
     for blk in blocks:
-        if len(blk)==0:
+        if not blk:
             continue
         if re_han.match(blk):
             for word in cut_block(blk):
@@ -286,15 +271,15 @@ def cut(sentence,cut_all=False,HMM=True):
         else:
             yield x
-def cut_for_search(sentence,HMM=True):
-    words = cut(sentence,HMM=HMM)
+def cut_for_search(sentence, HMM=True):
+    words = cut(sentence, HMM=HMM)
     for w in words:
-        if len(w)>2:
+        if len(w) > 2:
             for i in range(len(w)-1):
                 gram2 = w[i:i+2]
                 if gram2 in FREQ:
                     yield gram2
-        if len(w)>3:
+        if len(w) > 3:
             for i in range(len(w)-2):
                 gram3 = w[i:i+3]
                 if gram3 in FREQ:
@@ -312,79 +297,69 @@ def load_userdict(f):
     ...
     Word type may be ignored
     '''
-    global trie,total,FREQ
     if isinstance(f, str):
         f = open(f, 'rb')
     content = f.read().decode('utf-8')
     line_no = 0
     for line in content.split("\n"):
-        line_no+=1
-        if line.rstrip()=='': continue
-        tup =line.split(" ")
-        word,freq = tup[0],tup[1]
-        if freq.isdigit() is False: continue
-        if line_no==1:
+        line_no += 1
+        if not line.rstrip():
+            continue
+        tup = line.split(" ")
+        word, freq = tup[0], tup[1]
+        if freq.isdigit() is False:
+            continue
+        if line_no == 1:
             word = word.replace('\ufeff',"") #remove bom flag if it exists
-        if len(tup)==3:
-            add_word(word, freq, tup[2])
-        else:
-            add_word(word, freq)
+        add_word(*tup)
 @require_initialized
 def add_word(word, freq, tag=None):
-    global FREQ, trie, total, user_word_tag_tab
-    freq = float(freq)
-    FREQ[word] = log(freq / total)
+    global FREQ, pfdict, total, user_word_tag_tab
+    FREQ[word] = log(float(freq) / total)
     if tag is not None:
         user_word_tag_tab[word] = tag.strip()
-    p = trie
-    for c in word:
-        if c not in p:
-            p[c] = {}
-        p = p[c]
-    p[''] = '' # ending flag
+    for ch in range(len(word)):
+        pfdict.add(word[:ch+1])
 __ref_cut = cut
 __ref_cut_for_search = cut_for_search
 def __lcut(sentence):
-    return list(__ref_cut(sentence,False))
+    return list(__ref_cut(sentence, False))
 def __lcut_no_hmm(sentence):
-    return list(__ref_cut(sentence,False,False))
+    return list(__ref_cut(sentence, False, False))
 def __lcut_all(sentence):
-    return list(__ref_cut(sentence,True))
+    return list(__ref_cut(sentence, True))
 def __lcut_for_search(sentence):
     return list(__ref_cut_for_search(sentence))
 @require_initialized
 def enable_parallel(processnum=None):
-    global pool,cut,cut_for_search
-    if os.name=='nt':
+    global pool, cut, cut_for_search
+    if os.name == 'nt':
         raise Exception("jieba: parallel mode only supports posix system")
-    if sys.version_info[0]==2 and sys.version_info[1]<6:
-        raise Exception("jieba: the parallel feature needs Python version>2.5 ")
-    from multiprocessing import Pool,cpu_count
-    if processnum==None:
+    from multiprocessing import Pool, cpu_count
+    if processnum is None:
         processnum = cpu_count()
     pool = Pool(processnum)
     def pcut(sentence,cut_all=False,HMM=True):
         parts = re.compile('([\r\n]+)').split(sentence)
         if cut_all:
-            result = pool.map(__lcut_all,parts)
+            result = pool.map(__lcut_all, parts)
+        elif HMM:
+            result = pool.map(__lcut, parts)
         else:
-            if HMM:
-                result = pool.map(__lcut,parts)
-            else:
-                result = pool.map(__lcut_no_hmm,parts)
+            result = pool.map(__lcut_no_hmm, parts)
         for r in result:
             for w in r:
                 yield w
     def pcut_for_search(sentence):
         parts = re.compile('([\r\n]+)').split(sentence)
-        result = pool.map(__lcut_for_search,parts)
+        result = pool.map(__lcut_for_search, parts)
         for r in result:
             for w in r:
                 yield w
@@ -403,40 +378,44 @@ def disable_parallel():
 def set_dictionary(dictionary_path):
     global initialized, DICTIONARY
     with DICT_LOCK:
-        abs_path = os.path.normpath( os.path.join( os.getcwd(), dictionary_path ) )
+        abs_path = os.path.normpath(os.path.join(os.getcwd(), dictionary_path))
         if not os.path.exists(abs_path):
-            raise Exception("jieba: path does not exist:" + abs_path)
+            raise Exception("jieba: path does not exist: " + abs_path)
         DICTIONARY = abs_path
         initialized = False
 def get_abs_path_dict():
-    _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
     abs_path = os.path.join(_curpath,DICTIONARY)
     return abs_path
-def tokenize(unicode_sentence,mode="default",HMM=True):
-    #mode ("default" or "search")
+def tokenize(unicode_sentence, mode="default", HMM=True):
+    """Tokenize a sentence and yields tuples of (word, start, end)
+    Parameter:
+        - sentence: the str to be segmented.
+        - mode: "default" or "search", "search" is for finer segmentation.
+        - HMM: whether to use the Hidden Markov Model.
+    """
     if not isinstance(unicode_sentence, str):
         raise Exception("jieba: the input parameter should be str.")
     start = 0
-    if mode=='default':
-        for w in cut(unicode_sentence,HMM=HMM):
+    if mode == 'default':
+        for w in cut(unicode_sentence, HMM=HMM):
             width = len(w)
-            yield (w,start,start+width)
-            start+=width
+            yield (w, start, start+width)
+            start += width
     else:
-        for w in cut(unicode_sentence,HMM=HMM):
+        for w in cut(unicode_sentence, HMM=HMM):
             width = len(w)
-            if len(w)>2:
+            if len(w) > 2:
                 for i in range(len(w)-1):
                     gram2 = w[i:i+2]
                     if gram2 in FREQ:
-                        yield (gram2,start+i,start+i+2)
-            if len(w)>3:
+                        yield (gram2, start+i, start+i+2)
+            if len(w) > 3:
                 for i in range(len(w)-2):
                     gram3 = w[i:i+3]
                     if gram3 in FREQ:
-                        yield (gram3,start+i,start+i+3)
-            yield (w,start,start+width)
-            start+=width
+                        yield (gram3, start+i, start+i+3)
+            yield (w, start, start+width)
+            start += width
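For orientation, a small usage sketch of the public functions touched above; the sample sentence is arbitrary and the resulting segmentation depends on the bundled dict.txt:

    import jieba

    text = "我来到北京清华大学"
    print("/".join(jieba.cut(text)))                # accurate mode, HMM on
    print("/".join(jieba.cut(text, cut_all=True)))  # full mode
    print("/".join(jieba.cut(text, HMM=False)))     # accurate mode, HMM off
    print("/".join(jieba.cut_for_search(text)))     # finer cut for search engines
    for word, start, end in jieba.tokenize(text):   # (word, start, end) tuples
        print(word, start, end)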

jieba/__main__.py (new file, 36 lines)
@@ -0,0 +1,36 @@
"""Jieba command line interface."""
import sys
import jieba
from argparse import ArgumentParser

parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", epilog="If no filename specified, use STDIN instead.")
parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
                    nargs='?', const=' ',
                    help="use DELIM instead of ' / ' for word delimiter; use a space if it is without DELIM")
parser.add_argument("-a", "--cut-all",
                    action="store_true", dest="cutall", default=False,
                    help="full pattern cutting")
parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
                    default=True, help="don't use the Hidden Markov Model")
parser.add_argument("-q", "--quiet", action="store_true", default=False,
                    help="don't print loading messages to stderr")
parser.add_argument("-V", '--version', action='version', version="Jieba " + jieba.__version__)
parser.add_argument("filename", nargs='?', help="input file")
args = parser.parse_args()

if args.quiet:
    jieba.setLogLevel(60)
delim = str(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin

jieba.initialize()
ln = fp.readline()
while ln:
    l = ln.rstrip('\r\n')
    print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))
    ln = fp.readline()

fp.close()
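With this entry point the segmenter can be run straight from the shell, for example "python -m jieba -d ' ' -a somefile.txt" (full mode, space-delimited) or "python -m jieba -n -q < somefile.txt" (no HMM, loading messages suppressed); "somefile.txt" here is only a placeholder.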

jieba/analyse/__init__.py
@@ -6,61 +6,77 @@ try:
 except ImportError:
     pass
-_curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 abs_path = os.path.join(_curpath, "idf.txt")
-IDF_DICTIONARY = abs_path
-STOP_WORDS = set([
-    "the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
-])
+STOP_WORDS = set((
+    "the","of","is","and","to","in","that","we","for","an","are",
+    "by","be","as","on","with","can","if","from","which","you","it",
+    "this","then","at","have","all","not","one","has","or","that"
+))
+class IDFLoader:
+    def __init__(self):
+        self.path = ""
+        self.idf_freq = {}
+        self.median_idf = 0.0
+    def set_new_path(self, new_idf_path):
+        if self.path != new_idf_path:
+            content = open(new_idf_path, 'r', encoding='utf-8').read()
+            idf_freq = {}
+            lines = content.split('\n')
+            if lines and not lines[-1]:
+                lines.pop(-1)
+            for line in lines:
+                word, freq = line.split(' ')
+                idf_freq[word] = float(freq)
+            median_idf = sorted(idf_freq.values())[len(idf_freq)//2]
+            self.idf_freq = idf_freq
+            self.median_idf = median_idf
+            self.path = new_idf_path
+    def get_idf(self):
+        return self.idf_freq, self.median_idf
+idf_loader = IDFLoader()
+idf_loader.set_new_path(abs_path)
 def set_idf_path(idf_path):
-    global IDF_DICTIONARY
-    abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path ) )
-    if not os.path.exists(abs_path):
-        raise Exception("jieba: path does not exist:" + abs_path)
-    IDF_DICTIONARY = abs_path
-    return
-def get_idf(abs_path):
-    content = open(abs_path,'rb').read().decode('utf-8')
-    idf_freq = {}
-    lines = content.split('\n')
-    for line in lines:
-        word,freq = line.split(' ')
-        idf_freq[word] = float(freq)
-    median_idf = sorted(idf_freq.values())[len(idf_freq)//2]
-    return idf_freq, median_idf
+    new_abs_path = os.path.normpath(os.path.join(os.getcwd(), idf_path))
+    if not os.path.exists(new_abs_path):
+        raise Exception("jieba: path does not exist: " + new_abs_path)
+    idf_loader.set_new_path(new_abs_path)
 def set_stop_words(stop_words_path):
     global STOP_WORDS
-    abs_path = os.path.normpath( os.path.join( os.getcwd(), stop_words_path ) )
+    abs_path = os.path.normpath(os.path.join(os.getcwd(), stop_words_path))
     if not os.path.exists(abs_path):
-        raise Exception("jieba: path does not exist:" + abs_path)
+        raise Exception("jieba: path does not exist: " + abs_path)
     content = open(abs_path,'rb').read().decode('utf-8')
     lines = content.split('\n')
     for line in lines:
         STOP_WORDS.add(line)
-    return
-def extract_tags(sentence,topK=20):
-    global IDF_DICTIONARY
+def extract_tags(sentence, topK=20):
     global STOP_WORDS
-    idf_freq, median_idf = get_idf(IDF_DICTIONARY)
+    idf_freq, median_idf = idf_loader.get_idf()
     words = jieba.cut(sentence)
     freq = {}
     for w in words:
-        if len(w.strip())<2: continue
-        if w.lower() in STOP_WORDS: continue
-        freq[w]=freq.get(w,0.0)+1.0
+        if len(w.strip()) < 2:
+            continue
+        if w.lower() in STOP_WORDS:
+            continue
+        freq[w] = freq.get(w, 0.0) + 1.0
     total = sum(freq.values())
     freq = [(k,v/total) for k,v in freq.items()]
-    tf_idf_list = [(v * idf_freq.get(k,median_idf),k) for k,v in freq]
-    st_list = sorted(tf_idf_list,reverse=True)
-    top_tuples= st_list[:topK]
+    tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
+    st_list = sorted(tf_idf_list, reverse=True)
+    top_tuples = st_list[:topK]
     tags = [a[1] for a in top_tuples]
     return tags
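A brief usage sketch of the keyword-extraction API refactored above; the file paths are placeholders:

    import jieba.analyse

    text = open("some_document.txt", encoding="utf-8").read()
    print(jieba.analyse.extract_tags(text, topK=10))

    # Optionally point the extractor at a custom IDF table or stop-word list:
    jieba.analyse.set_idf_path("my_idf.txt")
    jieba.analyse.set_stop_words("my_stop_words.txt")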

jieba/analyse/analyzer.py
@@ -15,21 +15,19 @@ STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
 accepted_chars = re.compile(r"[\u4E00-\u9FA5]+")
 class ChineseTokenizer(Tokenizer):
-    def __call__(self,text,**kargs):
-        words = jieba.tokenize(text,mode="search")
+    def __call__(self, text, **kargs):
+        words = jieba.tokenize(text, mode="search")
         token = Token()
         for (w,start_pos,stop_pos) in words:
-            if not accepted_chars.match(w):
-                if len(w)>1:
-                    pass
-                else:
-                    continue
+            if not accepted_chars.match(w) and len(w)<=1:
+                continue
             token.original = token.text = w
             token.pos = start_pos
             token.startchar = start_pos
             token.endchar = stop_pos
             yield token
-def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1,stemfn=stem,cachesize=50000):
-    return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)\
-        |StemFilter(stemfn=stemfn, ignore=None,cachesize=cachesize)
+def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
+    return (ChineseTokenizer() | LowercaseFilter() |
+            StopFilter(stoplist=stoplist,minsize=minsize) |
+            StemFilter(stemfn=stemfn, ignore=None,cachesize=cachesize))
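Assuming Whoosh is installed, the analyzer is wired into an index schema in the usual Whoosh way; the index directory and field names below are placeholders:

    from whoosh.fields import Schema, TEXT, ID
    from whoosh.index import create_in
    from jieba.analyse import ChineseAnalyzer

    schema = Schema(title=ID(stored=True),
                    content=TEXT(stored=True, analyzer=ChineseAnalyzer()))
    ix = create_in("indexdir", schema)  # "indexdir" must already exist
    with ix.writer() as writer:
        writer.add_document(title="1", content="我们都是中国人")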

jieba/finalseg/__init__.py
@@ -3,7 +3,7 @@ import os
 import marshal
 import sys
-MIN_FLOAT=-3.14e100
+MIN_FLOAT = -3.14e100
 PROB_START_P = "prob_start.p"
 PROB_TRANS_P = "prob_trans.p"
@@ -18,20 +18,20 @@ PrevStatus = {
 }
 def load_model():
-    _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+    _curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
     start_p = {}
     abs_path = os.path.join(_curpath, PROB_START_P)
     with open(abs_path, mode='rb') as f:
         start_p = marshal.load(f)
     f.closed
     trans_p = {}
     abs_path = os.path.join(_curpath, PROB_TRANS_P)
     with open(abs_path, 'rb') as f:
         trans_p = marshal.load(f)
     f.closed
     emit_p = {}
     abs_path = os.path.join(_curpath, PROB_EMIT_P)
     with open(abs_path, 'rb') as f:
@@ -57,40 +57,40 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
         newpath = {}
         for y in states:
             em_p = emit_p[y].get(obs[t],MIN_FLOAT)
-            (prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + em_p ,y0) for y0 in PrevStatus[y] ])
-            V[t][y] =prob
+            (prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])
+            V[t][y] = prob
             newpath[y] = path[state] + [y]
         path = newpath
-    (prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E','S')])
+    (prob, state) = max([(V[len(obs)-1][y], y) for y in ('E','S')])
     return (prob, path[state])
 def __cut(sentence):
     global emit_P
-    prob, pos_list = viterbi(sentence,('B','M','E','S'), start_P, trans_P, emit_P)
+    prob, pos_list = viterbi(sentence, ('B','M','E','S'), start_P, trans_P, emit_P)
     begin, next = 0,0
     #print pos_list, sentence
     for i,char in enumerate(sentence):
         pos = pos_list[i]
-        if pos=='B':
+        if pos == 'B':
             begin = i
-        elif pos=='E':
+        elif pos == 'E':
             yield sentence[begin:i+1]
             next = i+1
-        elif pos=='S':
+        elif pos == 'S':
             yield char
             next = i+1
-    if next<len(sentence):
+    if next < len(sentence):
         yield sentence[next:]
 def cut(sentence):
-    if not ( type(sentence) is str):
+    if not isinstance(sentence, str):
         try:
             sentence = sentence.decode('utf-8')
-        except:
-            sentence = sentence.decode('gbk','ignore')
+        except UnicodeDecodeError:
+            sentence = sentence.decode('gbk', 'ignore')
     re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)")
     blocks = re_han.split(sentence)
     for blk in blocks:
@@ -100,5 +100,5 @@ def cut(sentence):
         else:
             tmp = re_skip.split(blk)
             for x in tmp:
-                if x!="":
+                if x:
                     yield x
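For reference, the HMM-only segmenter in this module can also be called on its own; the output depends on the bundled model files:

    from jieba import finalseg
    print("/".join(finalseg.cut("这是一个伸手不见五指的黑夜")))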

jieba/posseg/__init__.py
@@ -13,17 +13,17 @@ PROB_TRANS_P = "prob_trans.p"
 PROB_EMIT_P = "prob_emit.p"
 CHAR_STATE_TAB_P = "char_state_tab.p"
-def load_model(f_name,isJython=True):
-    _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+def load_model(f_name, isJython=True):
+    _curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
     result = {}
     with open(f_name, "rb") as f:
         for line in open(f_name,"rb"):
             line = line.strip()
-            if line=="":continue
+            if not line: continue
             line = line.decode("utf-8")
             word, _, tag = line.split(" ")
-            result[word]=tag
+            result[word] = tag
     f.closed
     if not isJython:
         return result
@@ -59,7 +59,7 @@ if sys.platform.startswith("java"):
 else:
     from . import char_state_tab, prob_start, prob_trans, prob_emit
     char_state_tab_P, start_P, trans_P, emit_P = char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P
-    word_tag_tab = load_model(jieba.get_abs_path_dict(),isJython=False)
+    word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
 def makesure_userdict_loaded(fn):
@@ -78,7 +78,7 @@ class pair(object):
         self.flag = flag
     def __unicode__(self):
-        return self.word+"/"+self.flag
+        return '%s/%s' % (self.word, self.flag)
     def __repr__(self):
         return self.__str__()
@@ -90,25 +90,25 @@ class pair(object):
         return self.__unicode__().encode(arg)
 def __cut(sentence):
-    prob, pos_list = viterbi.viterbi(sentence,char_state_tab_P, start_P, trans_P, emit_P)
-    begin, next = 0,0
+    prob, pos_list = viterbi.viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
+    begin, next = 0, 0
     for i,char in enumerate(sentence):
         pos = pos_list[i][0]
-        if pos=='B':
+        if pos == 'B':
             begin = i
-        elif pos=='E':
+        elif pos == 'E':
             yield pair(sentence[begin:i+1], pos_list[i][1])
             next = i+1
-        elif pos=='S':
-            yield pair(char,pos_list[i][1])
+        elif pos == 'S':
+            yield pair(char, pos_list[i][1])
             next = i+1
-    if next<len(sentence):
-        yield pair(sentence[next:], pos_list[next][1] )
+    if next < len(sentence):
+        yield pair(sentence[next:], pos_list[next][1])
 def __cut_detail(sentence):
     re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"([\.0-9]+|[a-zA-Z0-9]+)")
-    re_eng,re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+    re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
     blocks = re_han.split(sentence)
     for blk in blocks:
         if re_han.match(blk):
@@ -117,89 +117,88 @@ def __cut_detail(sentence):
         else:
             tmp = re_skip.split(blk)
             for x in tmp:
-                if x!="":
+                if x:
                     if re_num.match(x):
-                        yield pair(x,'m')
+                        yield pair(x, 'm')
                     elif re_eng.match(x):
-                        yield pair(x,'eng')
+                        yield pair(x, 'eng')
                     else:
-                        yield pair(x,'x')
+                        yield pair(x, 'x')
 def __cut_DAG_NO_HMM(sentence):
     DAG = jieba.get_DAG(sentence)
-    route ={}
-    jieba.calc(sentence,DAG,0,route=route)
+    route = {}
+    jieba.calc(sentence, DAG, 0, route=route)
     x = 0
     N = len(sentence)
-    buf =''
+    buf = ''
     re_eng = re.compile(r'[a-zA-Z0-9]',re.U)
-    while x<N:
+    while x < N:
         y = route[x][1]+1
         l_word = sentence[x:y]
-        if re_eng.match(l_word) and len(l_word)==1:
+        if re_eng.match(l_word) and len(l_word) == 1:
             buf += l_word
             x = y
         else:
-            if len(buf)>0:
+            if buf:
                 yield pair(buf,'eng')
                 buf = ''
-            yield pair(l_word,word_tag_tab.get(l_word,'x'))
-            x =y
-    if len(buf)>0:
+            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
+            x = y
+    if buf:
         yield pair(buf,'eng')
         buf = ''
 def __cut_DAG(sentence):
     DAG = jieba.get_DAG(sentence)
-    route ={}
+    route = {}
     jieba.calc(sentence,DAG,0,route=route)
     x = 0
-    buf =''
+    buf = ''
     N = len(sentence)
-    while x<N:
+    while x < N:
         y = route[x][1]+1
         l_word = sentence[x:y]
-        if y-x==1:
-            buf+= l_word
+        if y-x == 1:
+            buf += l_word
         else:
-            if len(buf)>0:
-                if len(buf)==1:
-                    yield pair(buf,word_tag_tab.get(buf,'x'))
-                    buf=''
+            if buf:
+                if len(buf) == 1:
+                    yield pair(buf, word_tag_tab.get(buf, 'x'))
+                    buf = ''
                 else:
                     if (buf not in jieba.FREQ):
-                        regognized = __cut_detail(buf)
-                        for t in regognized:
+                        recognized = __cut_detail(buf)
+                        for t in recognized:
                             yield t
                     else:
                         for elem in buf:
-                            yield pair(elem,word_tag_tab.get(elem,'x'))
-                    buf=''
-            yield pair(l_word,word_tag_tab.get(l_word,'x'))
-        x =y
-    if len(buf)>0:
-        if len(buf)==1:
-            yield pair(buf,word_tag_tab.get(buf,'x'))
+                            yield pair(elem, word_tag_tab.get(elem, 'x'))
+                    buf = ''
+            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
+        x = y
+    if buf:
+        if len(buf) == 1:
+            yield pair(buf, word_tag_tab.get(buf, 'x'))
+        elif (buf not in jieba.FREQ):
+            recognized = __cut_detail(buf)
+            for t in recognized:
+                yield t
         else:
-            if (buf not in jieba.FREQ):
-                regognized = __cut_detail(buf)
-                for t in regognized:
-                    yield t
-            else:
-                for elem in buf:
-                    yield pair(elem,word_tag_tab.get(elem,'x'))
-def __cut_internal(sentence,HMM=True):
+            for elem in buf:
+                yield pair(elem, word_tag_tab.get(elem, 'x'))
+def __cut_internal(sentence, HMM=True):
     if not isinstance(sentence, str):
         try:
             sentence = sentence.decode('utf-8')
-        except:
-            sentence = sentence.decode('gbk','ignore')
+        except UnicodeDecodeError:
+            sentence = sentence.decode('gbk', 'ignore')
     re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)")
-    re_eng,re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
+    re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
     blocks = re_han.split(sentence)
     if HMM:
         __cut_blk = __cut_DAG
@@ -214,15 +213,15 @@ def __cut_internal(sentence,HMM=True):
             tmp = re_skip.split(blk)
             for x in tmp:
                 if re_skip.match(x):
-                    yield pair(x,'x')
+                    yield pair(x, 'x')
                 else:
                     for xx in x:
                         if re_num.match(xx):
-                            yield pair(xx,'m')
+                            yield pair(xx, 'm')
                         elif re_eng.match(x):
-                            yield pair(xx,'eng')
+                            yield pair(xx, 'eng')
                         else:
-                            yield pair(xx,'x')
+                            yield pair(xx, 'x')
 def __lcut_internal(sentence):
     return list(__cut_internal(sentence))
@@ -231,16 +230,16 @@ def __lcut_internal_no_hmm(sentence):
 @makesure_userdict_loaded
-def cut(sentence,HMM=True):
-    if (not hasattr(jieba,'pool')) or (jieba.pool==None):
-        for w in __cut_internal(sentence,HMM=HMM):
+def cut(sentence, HMM=True):
+    if (not hasattr(jieba, 'pool')) or (jieba.pool is None):
+        for w in __cut_internal(sentence, HMM=HMM):
             yield w
     else:
         parts = re.compile('([\r\n]+)').split(sentence)
         if HMM:
-            result = jieba.pool.map(__lcut_internal,parts)
+            result = jieba.pool.map(__lcut_internal, parts)
         else:
-            result = jieba.pool.map(__lcut_internal_no_hmm,parts)
+            result = jieba.pool.map(__lcut_internal_no_hmm, parts)
         for r in result:
             for w in r:
                 yield w
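A usage sketch for the POS-tagging interface above; each yielded item is a pair exposing .word and .flag:

    import jieba.posseg as pseg

    for p in pseg.cut("我爱北京天安门"):
        print(p.word, p.flag)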

jieba/posseg/viterbi.py
@@ -1,46 +1,45 @@
 import operator
-MIN_FLOAT=-3.14e100
-MIN_INF=float("-inf")
-def get_top_states(t_state_v,K=4):
+MIN_FLOAT = -3.14e100
+MIN_INF = float("-inf")
+def get_top_states(t_state_v, K=4):
     items = t_state_v.items()
-    topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K]
+    topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]
     return [x[0] for x in topK]
 def viterbi(obs, states, start_p, trans_p, emit_p):
     V = [{}] #tabular
     mem_path = [{}]
     all_states = trans_p.keys()
-    for y in states.get(obs[0],all_states): #init
-        V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
+    for y in states.get(obs[0], all_states): #init
+        V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
         mem_path[0][y] = ''
-    for t in range(1,len(obs)):
+    for t in range(1, len(obs)):
         V.append({})
         mem_path.append({})
         #prev_states = get_top_states(V[t-1])
-        prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
-        prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) )
-        obs_states = states.get(obs[t],all_states)
-        obs_states = set(obs_states) & set(prev_states_expect_next)
-        if len(obs_states)==0: obs_states = prev_states_expect_next
-        if len(obs_states)==0: obs_states = all_states
+        prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]
+        prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
+        obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next
+        if not obs_states:
+            obs_states = prev_states_expect_next if prev_states_expect_next else all_states
         for y in obs_states:
-            (prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in prev_states])
-            V[t][y] =prob
+            prob, state = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states])
+            V[t][y] = prob
             mem_path[t][y] = state
-    last = [(V[-1][y], y) for y in mem_path[-1].keys() ]
+    last = [(V[-1][y], y) for y in mem_path[-1].keys()]
     #if len(last)==0:
         #print obs
-    (prob, state) = max(last)
+    prob, state = max(last)
     route = [None] * len(obs)
-    i = len(obs)-1
-    while i>=0:
+    i = len(obs) - 1
+    while i >= 0:
         route[i] = state
         state = mem_path[i][state]
-        i-=1
+        i -= 1
     return (prob, route)