Merge pull request #187 from gumblex/master
Drop the Trie to cut memory use and speed things up; polish code details (不用Trie,减少内存加快速度;优化代码细节)
Sun Junyi 2014-10-19 19:43:30 +08:00
commit 4a93f21918
14 changed files with 382 additions and 365 deletions
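
The core change: the per-character nested-dict trie is replaced by a single flat set ("pfdict") holding every prefix of every dictionary word, so prefix matching becomes a membership test on string slices instead of a node-by-node walk. A minimal sketch of the idea with a toy dictionary (not the library code itself):

    # -*- coding: utf-8 -*-
    # Every prefix of every dictionary word goes into one flat set, so
    # "can this fragment still grow into a word?" is a plain membership test.
    lfreq = {u"北京": 1, u"北京大学": 1, u"大学": 1}   # toy frequency dict

    pfdict = set()
    for word in lfreq:
        for ch in xrange(len(word)):
            pfdict.add(word[:ch+1])   # u"北", u"北京", u"北京大", u"北京大学", ...

    assert u"北京大" in pfdict        # still a prefix: keep extending the match
    assert u"北京大" not in lfreq     # but not a word by itself
    assert u"北京大学" in lfreq       # a complete word: record it as a candidate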
@@ -13,10 +13,10 @@
 2013-07-01: version 0.31
 1. 修改了代码缩进格式遵循PEP8标准
 2. 支持Jython解析器感谢 @piaolingxue
 3. 修复中英混合词汇不能识别数字在前词语的Bug
 4. 部分代码重构,感谢 @chao78787
 5. 多进程并行分词模式下自动检测CPU个数设置合适的进程数感谢@linkerlin
 6. 修复了0.3版中jieba.extra_tags方法对whoosh模块的错误依赖
@@ -55,8 +55,8 @@
 2013-04-27: version 0.28
 ========================
 1) 新增词典lazy load功能用户可以在'import jieba'后再改变词典的路径. 感谢hermanschaaf
 2) 显示词典加载异常时错误的词条信息. 感谢neuront
 3) 修正了词典被vim编辑后会加载失败的bug. 感谢neuront
 2013-04-22: version 0.27
 ========================
@@ -93,7 +93,7 @@
 2012-11-28: version 0.22
 ========================
 1) 新增jieba.cut_for_search方法 该方法在精确分词的基础上对“长词”进行再次切分,适用于搜索引擎领域的分词,比精确分词模式有更高的召回率。
 2) 开始支持Python3.x版。 之前一直是只支持Python2.x系列从这个版本起有一个单独的jieba3k
 2012-11-23: version 0.21
@@ -104,7 +104,7 @@
 2012-11-06: version 0.20
 ========================
 1) 新增词性标注功能
 2012-10-25: version 0.19
@@ -17,14 +17,13 @@ import logging
 DICTIONARY = "dict.txt"
 DICT_LOCK = threading.RLock()
-trie = None # to be initialized
+pfdict = None # to be initialized
 FREQ = {}
 min_freq = 0.0
-total =0.0
-user_word_tag_tab={}
+total = 0.0
+user_word_tag_tab = {}
 initialized = False
 log_console = logging.StreamHandler(sys.stderr)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -34,84 +33,79 @@ def setLogLevel(log_level):
     global logger
     logger.setLevel(log_level)
-def gen_trie(f_name):
+def gen_pfdict(f_name):
     lfreq = {}
-    trie = {}
+    pfdict = set()
     ltotal = 0.0
     with open(f_name, 'rb') as f:
         lineno = 0
         for line in f.read().rstrip().decode('utf-8').split('\n'):
             lineno += 1
             try:
-                word,freq,_ = line.split(' ')
+                word,freq = line.split(' ')[:2]
                 freq = float(freq)
                 lfreq[word] = freq
-                ltotal+=freq
-                p = trie
-                for c in word:
-                    if c not in p:
-                        p[c] ={}
-                    p = p[c]
-                p['']='' #ending flag
+                ltotal += freq
+                for ch in xrange(len(word)):
+                    pfdict.add(word[:ch+1])
             except ValueError, e:
                 logger.debug('%s at line %s %s' % (f_name, lineno, line))
                 raise ValueError, e
-    return trie, lfreq,ltotal
+    return pfdict, lfreq, ltotal
 def initialize(*args):
-    global trie, FREQ, total, min_freq, initialized
-    if len(args)==0:
+    global pfdict, FREQ, total, min_freq, initialized
+    if not args:
         dictionary = DICTIONARY
     else:
         dictionary = args[0]
     with DICT_LOCK:
         if initialized:
             return
-        if trie:
-            del trie
-            trie = None
-        _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+        if pfdict:
+            del pfdict
+            pfdict = None
+        _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
         abs_path = os.path.join(_curpath,dictionary)
-        logger.debug("Building Trie..., from %s" % abs_path)
+        logger.debug("Building prefix dict from %s ..." % abs_path)
         t1 = time.time()
-        if abs_path == os.path.join(_curpath,"dict.txt"): #defautl dictionary
-            cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache")
-        else: #customer dictionary
-            cache_file = os.path.join(tempfile.gettempdir(),"jieba.user."+str(hash(abs_path))+".cache")
+        if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
+            cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
+        else: #custom dictionary
+            cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))
        load_from_cache_fail = True
-        if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(abs_path):
-            logger.debug("loading model from cache %s" % cache_file)
+        if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
+            logger.debug("Loading model from cache %s" % cache_file)
             try:
-                trie,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
-                load_from_cache_fail = False
+                pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
+                # prevent conflict with old version
+                load_from_cache_fail = not isinstance(pfdict, set)
             except:
                 load_from_cache_fail = True
         if load_from_cache_fail:
-            trie,FREQ,total = gen_trie(abs_path)
+            pfdict,FREQ,total = gen_pfdict(abs_path)
             FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize
             min_freq = min(FREQ.itervalues())
-            logger.debug("dumping model to file cache %s" % cache_file)
+            logger.debug("Dumping model to file cache %s" % cache_file)
             try:
                 tmp_suffix = "."+str(random.random())
                 with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
-                    marshal.dump((trie,FREQ,total,min_freq),temp_cache_file)
-                if os.name=='nt':
-                    import shutil
-                    replace_file = shutil.move
+                    marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
+                if os.name == 'nt':
+                    from shutil import move as replace_file
                 else:
                     replace_file = os.rename
-                replace_file(cache_file+tmp_suffix,cache_file)
+                replace_file(cache_file + tmp_suffix, cache_file)
             except:
-                logger.error("dump cache file failed.")
-                logger.exception("")
+                logger.exception("Dump cache file failed.")
         initialized = True
-        logger.debug("loading model cost %s seconds." % (time.time() - t1))
-        logger.debug("Trie has been built succesfully.")
+        logger.debug("Loading model cost %s seconds." % (time.time() - t1))
+        logger.debug("Prefix dict has been built succesfully.")
 def require_initialized(fn):
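
Note the new isinstance() check after the cache loads: a cache file written by an older jieba release still holds a nested-dict trie, so a successful marshal.load() alone does not prove the cache is usable. A hedged sketch of that guard in isolation (the loaded value is simulated here):

    # What an old trie-format cache would deserialize to (simulated, not loaded):
    pfdict = {u"北": {u"京": {'': ''}}}
    load_from_cache_fail = not isinstance(pfdict, set)
    assert load_from_cache_fail   # old-format cache detected -> rebuild from dict.txt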
@@ -132,145 +126,136 @@ def __cut_all(sentence):
     dag = get_DAG(sentence)
     old_j = -1
     for k,L in dag.iteritems():
-        if len(L)==1 and k>old_j:
+        if len(L) == 1 and k > old_j:
             yield sentence[k:L[0]+1]
             old_j = L[0]
         else:
             for j in L:
-                if j>k:
+                if j > k:
                     yield sentence[k:j+1]
                     old_j = j
 def calc(sentence,DAG,idx,route):
     N = len(sentence)
-    route[N] = (0.0,'')
-    for idx in xrange(N-1,-1,-1):
-        candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0],x ) for x in DAG[idx] ]
+    route[N] = (0.0, '')
+    for idx in xrange(N-1, -1, -1):
+        candidates = [(FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx]]
         route[idx] = max(candidates)
 @require_initialized
 def get_DAG(sentence):
-    N = len(sentence)
-    i,j=0,0
-    p = trie
+    global pfdict, FREQ
     DAG = {}
-    while i<N:
-        c = sentence[j]
-        if c in p:
-            p = p[c]
-            if '' in p:
-                if i not in DAG:
-                    DAG[i]=[]
-                DAG[i].append(j)
-            j+=1
-            if j>=N:
-                i+=1
-                j=i
-                p=trie
-        else:
-            p = trie
-            i+=1
-            j=i
-    for i in xrange(len(sentence)):
-        if i not in DAG:
-            DAG[i] =[i]
+    N = len(sentence)
+    for k in xrange(N):
+        tmplist = []
+        i = k
+        frag = sentence[k]
+        while i < N and frag in pfdict:
+            if frag in FREQ:
+                tmplist.append(i)
+            i += 1
+            frag = sentence[k:i+1]
+        if not tmplist:
+            tmplist.append(k)
+        DAG[k] = tmplist
     return DAG
 def __cut_DAG_NO_HMM(sentence):
     re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
     DAG = get_DAG(sentence)
-    route ={}
-    calc(sentence,DAG,0,route=route)
+    route = {}
+    calc(sentence, DAG, 0, route=route)
     x = 0
     N = len(sentence)
     buf = u''
-    while x<N:
-        y = route[x][1]+1
+    while x < N:
+        y = route[x][1] + 1
         l_word = sentence[x:y]
-        if re_eng.match(l_word) and len(l_word)==1:
+        if re_eng.match(l_word) and len(l_word) == 1:
             buf += l_word
-            x =y
+            x = y
         else:
-            if len(buf)>0:
+            if buf:
                 yield buf
                 buf = u''
             yield l_word
-            x =y
-    if len(buf)>0:
+            x = y
+    if buf:
         yield buf
         buf = u''
 def __cut_DAG(sentence):
     DAG = get_DAG(sentence)
-    route ={}
-    calc(sentence,DAG,0,route=route)
+    route = {}
+    calc(sentence, DAG, 0, route=route)
     x = 0
-    buf =u''
+    buf = u''
     N = len(sentence)
-    while x<N:
+    while x < N:
         y = route[x][1]+1
         l_word = sentence[x:y]
-        if y-x==1:
-            buf+= l_word
+        if y-x == 1:
+            buf += l_word
         else:
-            if len(buf)>0:
-                if len(buf)==1:
+            if buf:
+                if len(buf) == 1:
                     yield buf
-                    buf=u''
+                    buf = u''
                 else:
                     if (buf not in FREQ):
-                        regognized = finalseg.cut(buf)
-                        for t in regognized:
+                        recognized = finalseg.cut(buf)
+                        for t in recognized:
                             yield t
                     else:
                         for elem in buf:
                             yield elem
-                    buf=u''
+                    buf = u''
             yield l_word
-            x =y
-    if len(buf)>0:
-        if len(buf)==1:
+            x = y
+    if buf:
+        if len(buf) == 1:
             yield buf
+        elif (buf not in FREQ):
+            recognized = finalseg.cut(buf)
+            for t in recognized:
+                yield t
         else:
-            if (buf not in FREQ):
-                regognized = finalseg.cut(buf)
-                for t in regognized:
-                    yield t
-            else:
-                for elem in buf:
-                    yield elem
+            for elem in buf:
+                yield elem
-def cut(sentence,cut_all=False,HMM=True):
+def cut(sentence, cut_all=False, HMM=True):
     '''The main function that segments an entire sentence that contains
     Chinese characters into seperated words.
     Parameter:
-        - sentence: The String to be segmented
-        - cut_all: Model. True means full pattern, false means accurate pattern.
-        - HMM: Whether use Hidden Markov Model.
+        - sentence: The str/unicode to be segmented.
+        - cut_all: Model type. True for full pattern, False for accurate pattern.
+        - HMM: Whether to use the Hidden Markov Model.
     '''
     if not isinstance(sentence, unicode):
         try:
             sentence = sentence.decode('utf-8')
         except UnicodeDecodeError:
-            sentence = sentence.decode('gbk','ignore')
-    '''
-    \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
-    \r\n|\s : whitespace characters. Will not be Handled.
-    '''
-    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
+            sentence = sentence.decode('gbk', 'ignore')
+    # \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
+    # \r\n|\s : whitespace characters. Will not be handled.
     if cut_all:
         re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
+    else:
+        re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
     blocks = re_han.split(sentence)
-    if HMM:
+    if cut_all:
+        cut_block = __cut_all
+    elif HMM:
         cut_block = __cut_DAG
     else:
         cut_block = __cut_DAG_NO_HMM
-    if cut_all:
-        cut_block = __cut_all
     for blk in blocks:
-        if len(blk)==0:
+        if not blk:
             continue
         if re_han.match(blk):
             for word in cut_block(blk):
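
For reference, the DAG that the rewritten get_DAG() produces maps each start index k to the end indices of every dictionary word beginning at k (falling back to [k] itself). The exact lists below depend on the bundled dict.txt, so treat them as an illustrative assumption:

    # -*- coding: utf-8 -*-
    import jieba

    print jieba.get_DAG(u"我来到北京清华大学")
    # e.g. {0: [0], 1: [1, 2], 2: [2], 3: [3, 4], 4: [4],
    #       5: [5, 6, 8], 6: [6], 7: [7, 8], 8: [8]}
    # calc() then walks this graph right-to-left to pick the best-scoring path.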
@@ -286,15 +271,15 @@ def cut(sentence,cut_all=False,HMM=True):
                 else:
                     yield x
-def cut_for_search(sentence,HMM=True):
-    words = cut(sentence,HMM=HMM)
+def cut_for_search(sentence, HMM=True):
+    words = cut(sentence, HMM=HMM)
     for w in words:
-        if len(w)>2:
+        if len(w) > 2:
             for i in xrange(len(w)-1):
                 gram2 = w[i:i+2]
                 if gram2 in FREQ:
                     yield gram2
-        if len(w)>3:
+        if len(w) > 3:
             for i in xrange(len(w)-2):
                 gram3 = w[i:i+3]
                 if gram3 in FREQ:
@@ -312,79 +297,71 @@ def load_userdict(f):
     ...
     Word type may be ignored
     '''
-    global trie,total,FREQ
     if isinstance(f, (str, unicode)):
         f = open(f, 'rb')
     content = f.read().decode('utf-8')
     line_no = 0
     for line in content.split("\n"):
-        line_no+=1
-        if line.rstrip()=='': continue
-        tup =line.split(" ")
-        word,freq = tup[0],tup[1]
-        if freq.isdigit() is False: continue
-        if line_no==1:
+        line_no += 1
+        if not line.rstrip():
+            continue
+        tup = line.split(" ")
+        word, freq = tup[0], tup[1]
+        if freq.isdigit() is False:
+            continue
+        if line_no == 1:
             word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
-        if len(tup)==3:
-            add_word(word, freq, tup[2])
-        else:
-            add_word(word, freq)
+        add_word(*tup)
 @require_initialized
 def add_word(word, freq, tag=None):
-    global FREQ, trie, total, user_word_tag_tab
-    freq = float(freq)
-    FREQ[word] = log(freq / total)
+    global FREQ, pfdict, total, user_word_tag_tab
+    FREQ[word] = log(float(freq) / total)
     if tag is not None:
         user_word_tag_tab[word] = tag.strip()
-    p = trie
-    for c in word:
-        if c not in p:
-            p[c] = {}
-        p = p[c]
-    p[''] = '' # ending flag
+    for ch in xrange(len(word)):
+        pfdict.add(word[:ch+1])
 __ref_cut = cut
 __ref_cut_for_search = cut_for_search
 def __lcut(sentence):
-    return list(__ref_cut(sentence,False))
+    return list(__ref_cut(sentence, False))
 def __lcut_no_hmm(sentence):
-    return list(__ref_cut(sentence,False,False))
+    return list(__ref_cut(sentence, False, False))
 def __lcut_all(sentence):
-    return list(__ref_cut(sentence,True))
+    return list(__ref_cut(sentence, True))
 def __lcut_for_search(sentence):
     return list(__ref_cut_for_search(sentence))
 @require_initialized
 def enable_parallel(processnum=None):
-    global pool,cut,cut_for_search
-    if os.name=='nt':
+    global pool, cut, cut_for_search
+    if os.name == 'nt':
         raise Exception("jieba: parallel mode only supports posix system")
     if sys.version_info[0]==2 and sys.version_info[1]<6:
-        raise Exception("jieba: the parallel feature needs Python version>2.5 ")
-    from multiprocessing import Pool,cpu_count
-    if processnum==None:
+        raise Exception("jieba: the parallel feature needs Python version>2.5")
+    from multiprocessing import Pool, cpu_count
+    if processnum is None:
         processnum = cpu_count()
     pool = Pool(processnum)
     def pcut(sentence,cut_all=False,HMM=True):
         parts = re.compile('([\r\n]+)').split(sentence)
         if cut_all:
-            result = pool.map(__lcut_all,parts)
+            result = pool.map(__lcut_all, parts)
+        elif HMM:
+            result = pool.map(__lcut, parts)
         else:
-            if HMM:
-                result = pool.map(__lcut,parts)
-            else:
-                result = pool.map(__lcut_no_hmm,parts)
+            result = pool.map(__lcut_no_hmm, parts)
         for r in result:
             for w in r:
                 yield w
     def pcut_for_search(sentence):
         parts = re.compile('([\r\n]+)').split(sentence)
-        result = pool.map(__lcut_for_search,parts)
+        result = pool.map(__lcut_for_search, parts)
         for r in result:
            for w in r:
                 yield w
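
Usage of the parallel mode set up above (POSIX only, as the check enforces); the worker count and file name are illustrative:

    import jieba

    jieba.enable_parallel(4)          # or jieba.enable_parallel() to use cpu_count() workers
    content = open("big_corpus.txt", "rb").read().decode("utf-8")   # placeholder input file
    words = list(jieba.cut(content))
    jieba.disable_parallel()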
@@ -403,40 +380,44 @@ def disable_parallel():
 def set_dictionary(dictionary_path):
     global initialized, DICTIONARY
     with DICT_LOCK:
-        abs_path = os.path.normpath( os.path.join( os.getcwd(), dictionary_path ) )
+        abs_path = os.path.normpath(os.path.join(os.getcwd(), dictionary_path))
         if not os.path.exists(abs_path):
-            raise Exception("jieba: path does not exist:" + abs_path)
+            raise Exception("jieba: path does not exist: " + abs_path)
         DICTIONARY = abs_path
         initialized = False
 def get_abs_path_dict():
-    _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
     abs_path = os.path.join(_curpath,DICTIONARY)
     return abs_path
-def tokenize(unicode_sentence,mode="default",HMM=True):
-    #mode ("default" or "search")
+def tokenize(unicode_sentence, mode="default", HMM=True):
+    """Tokenize a sentence and yields tuples of (word, start, end)
+    Parameter:
+        - sentence: the unicode to be segmented.
+        - mode: "default" or "search", "search" is for finer segmentation.
+        - HMM: whether to use the Hidden Markov Model.
+    """
     if not isinstance(unicode_sentence, unicode):
-        raise Exception("jieba: the input parameter should unicode.")
+        raise Exception("jieba: the input parameter should be unicode.")
     start = 0
-    if mode=='default':
-        for w in cut(unicode_sentence,HMM=HMM):
+    if mode == 'default':
+        for w in cut(unicode_sentence, HMM=HMM):
             width = len(w)
-            yield (w,start,start+width)
-            start+=width
+            yield (w, start, start+width)
+            start += width
     else:
-        for w in cut(unicode_sentence,HMM=HMM):
+        for w in cut(unicode_sentence, HMM=HMM):
             width = len(w)
-            if len(w)>2:
+            if len(w) > 2:
                 for i in xrange(len(w)-1):
                     gram2 = w[i:i+2]
                     if gram2 in FREQ:
-                        yield (gram2,start+i,start+i+2)
-            if len(w)>3:
+                        yield (gram2, start+i, start+i+2)
+            if len(w) > 3:
                 for i in xrange(len(w)-2):
                     gram3 = w[i:i+3]
                     if gram3 in FREQ:
-                        yield (gram3,start+i,start+i+3)
-            yield (w,start,start+width)
-            start+=width
+                        yield (gram3, start+i, start+i+3)
+            yield (w, start, start+width)
+            start += width
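
The newly documented tokenize() in action; offsets are character positions in the unicode input, and the exact words depend on the bundled dictionary, so treat the sample output as an assumption:

    # -*- coding: utf-8 -*-
    import jieba

    for tk in jieba.tokenize(u"永和服装饰品有限公司"):
        print "%s\t start: %d \t end: %d" % (tk[0], tk[1], tk[2])
    # 永和      start: 0   end: 2
    # 服装      start: 2   end: 4
    # 饰品      start: 4   end: 6
    # 有限公司  start: 6   end: 10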
jieba/__main__.py (new file, 37 lines)
@@ -0,0 +1,37 @@
"""Jieba command line interface."""
import sys
import jieba
from argparse import ArgumentParser
parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", epilog="If no filename specified, use STDIN instead.")
parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
nargs='?', const=' ',
help="use DELIM instead of ' / ' for word delimiter; use a space if it is without DELIM")
parser.add_argument("-a", "--cut-all",
action="store_true", dest="cutall", default=False,
help="full pattern cutting")
parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
default=True, help="don't use the Hidden Markov Model")
parser.add_argument("-q", "--quiet", action="store_true", default=False,
help="don't print loading messages to stderr")
parser.add_argument("-V", '--version', action='version',
version="Jieba " + jieba.__version__)
parser.add_argument("filename", nargs='?', help="input file")
args = parser.parse_args()
if args.quiet:
jieba.setLogLevel(60)
delim = unicode(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin
jieba.initialize()
ln = fp.readline()
while ln:
l = ln.rstrip('\r\n')
print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8'))
ln = fp.readline()
fp.close()
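
Typical invocation of this new entry point, per the usage string above: "python -m jieba -d ' / ' news.txt" segments each line of the (placeholder) file news.txt and joins the words with the delimiter; -a switches to full-pattern cutting, -n disables the HMM, -q silences the loading messages, and omitting the file name reads from STDIN.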
@@ -6,12 +6,14 @@ try:
 except ImportError:
     pass
-_curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 abs_path = os.path.join(_curpath, "idf.txt")
-STOP_WORDS = set([
-    "the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
-])
+STOP_WORDS = set((
+    "the","of","is","and","to","in","that","we","for","an","are",
+    "by","be","as","on","with","can","if","from","which","you","it",
+    "this","then","at","have","all","not","one","has","or","that"
+))
 class IDFLoader:
     def __init__(self):
@@ -21,13 +23,13 @@ class IDFLoader:
     def set_new_path(self, new_idf_path):
         if self.path != new_idf_path:
-            content = open(new_idf_path,'rb').read().decode('utf-8')
+            content = open(new_idf_path, 'rb').read().decode('utf-8')
             idf_freq = {}
             lines = content.split('\n')
             if lines and not lines[-1]:
                 lines.pop(-1)
             for line in lines:
-                word,freq = line.split(' ')
+                word, freq = line.split(' ')
                 idf_freq[word] = float(freq)
             median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
             self.idf_freq = idf_freq
@@ -41,24 +43,22 @@ idf_loader = IDFLoader()
 idf_loader.set_new_path(abs_path)
 def set_idf_path(idf_path):
-    new_abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path ) )
+    new_abs_path = os.path.normpath(os.path.join(os.getcwd(), idf_path))
     if not os.path.exists(new_abs_path):
-        raise Exception("jieba: path does not exist:" + new_abs_path)
+        raise Exception("jieba: path does not exist: " + new_abs_path)
     idf_loader.set_new_path(new_abs_path)
-    return
 def set_stop_words(stop_words_path):
     global STOP_WORDS
-    abs_path = os.path.normpath( os.path.join( os.getcwd(), stop_words_path ) )
+    abs_path = os.path.normpath(os.path.join(os.getcwd(), stop_words_path))
     if not os.path.exists(abs_path):
-        raise Exception("jieba: path does not exist:" + abs_path)
+        raise Exception("jieba: path does not exist: " + abs_path)
     content = open(abs_path,'rb').read().decode('utf-8')
     lines = content.split('\n')
     for line in lines:
         STOP_WORDS.add(line)
-    return
-def extract_tags(sentence,topK=20):
+def extract_tags(sentence, topK=20):
     global STOP_WORDS
     idf_freq, median_idf = idf_loader.get_idf()
@@ -66,15 +66,17 @@ def extract_tags(sentence,topK=20):
     words = jieba.cut(sentence)
     freq = {}
     for w in words:
-        if len(w.strip())<2: continue
-        if w.lower() in STOP_WORDS: continue
-        freq[w]=freq.get(w,0.0)+1.0
+        if len(w.strip()) < 2:
+            continue
+        if w.lower() in STOP_WORDS:
+            continue
+        freq[w] = freq.get(w, 0.0) + 1.0
     total = sum(freq.values())
     freq = [(k,v/total) for k,v in freq.iteritems()]
-    tf_idf_list = [(v * idf_freq.get(k,median_idf),k) for k,v in freq]
-    st_list = sorted(tf_idf_list,reverse=True)
-    top_tuples= st_list[:topK]
+    tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
+    st_list = sorted(tf_idf_list, reverse=True)
+    top_tuples = st_list[:topK]
     tags = [a[1] for a in top_tuples]
     return tags
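
Usage of the TF-IDF keyword extractor tidied above; the sample sentence is arbitrary and the returned tags depend on the bundled idf.txt weights:

    # -*- coding: utf-8 -*-
    import jieba.analyse

    text = u"我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。"
    print u"/".join(jieba.analyse.extract_tags(text, topK=5))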
@@ -1,6 +1,6 @@
-#encoding=utf-8
+##encoding=utf-8
 from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
 from whoosh.analysis import Tokenizer,Token
 from whoosh.lang.porter import stem
 import jieba
@@ -10,26 +10,24 @@ STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
                         'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
                         'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
                         'to', 'us', 'we', 'when', 'will', 'with', 'yet',
-                        'you', 'your',u'',u'',u''))
+                        'you', 'your', u'', u'', u''))
 accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")
 class ChineseTokenizer(Tokenizer):
-    def __call__(self,text,**kargs):
-        words = jieba.tokenize(text,mode="search")
+    def __call__(self, text, **kargs):
+        words = jieba.tokenize(text, mode="search")
         token = Token()
         for (w,start_pos,stop_pos) in words:
-            if not accepted_chars.match(w):
-                if len(w)>1:
-                    pass
-                else:
-                    continue
+            if not accepted_chars.match(w) and len(w)<=1:
+                continue
             token.original = token.text = w
             token.pos = start_pos
             token.startchar = start_pos
             token.endchar = stop_pos
             yield token
-def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1,stemfn=stem,cachesize=50000):
-    return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)\
-    |StemFilter(stemfn=stemfn, ignore=None,cachesize=cachesize)
+def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
+    return (ChineseTokenizer() | LowercaseFilter() |
+            StopFilter(stoplist=stoplist,minsize=minsize) |
+            StemFilter(stemfn=stemfn, ignore=None,cachesize=cachesize))
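
A quick check of the reformatted analyzer chain; the token texts depend on the jieba dictionary, so the expected output is an assumption rather than a guarantee:

    # -*- coding: utf-8 -*-
    from jieba.analyse import ChineseAnalyzer

    analyzer = ChineseAnalyzer()
    for t in analyzer(u"我的好朋友是李明"):
        print t.text        # e.g. 好朋友 / 朋友 / 李明 ...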
@@ -4,7 +4,7 @@ import os
 import marshal
 import sys
-MIN_FLOAT=-3.14e100
+MIN_FLOAT = -3.14e100
 PROB_START_P = "prob_start.p"
 PROB_TRANS_P = "prob_trans.p"
@@ -19,30 +19,30 @@ PrevStatus = {
 }
 def load_model():
-    _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+    _curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
     start_p = {}
     abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, mode='rb') as f:
+    with open(abs_path, mode='r') as f:
         start_p = marshal.load(f)
     f.closed
     trans_p = {}
     abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
         trans_p = marshal.load(f)
     f.closed
     emit_p = {}
     abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with file(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
         emit_p = marshal.load(f)
     f.closed
     return start_p, trans_p, emit_p
 if sys.platform.startswith("java"):
     start_P, trans_P, emit_P = load_model()
 else:
     import prob_start,prob_trans,prob_emit
     start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P
@@ -53,45 +53,45 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
     for y in states: #init
         V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
         path[y] = [y]
-    for t in range(1,len(obs)):
+    for t in xrange(1,len(obs)):
         V.append({})
         newpath = {}
         for y in states:
             em_p = emit_p[y].get(obs[t],MIN_FLOAT)
-            (prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + em_p ,y0) for y0 in PrevStatus[y] ])
-            V[t][y] =prob
+            (prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])
+            V[t][y] = prob
             newpath[y] = path[state] + [y]
         path = newpath
-    (prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E','S')])
+    (prob, state) = max([(V[len(obs)-1][y], y) for y in ('E','S')])
     return (prob, path[state])
 def __cut(sentence):
     global emit_P
-    prob, pos_list = viterbi(sentence,('B','M','E','S'), start_P, trans_P, emit_P)
+    prob, pos_list = viterbi(sentence, ('B','M','E','S'), start_P, trans_P, emit_P)
     begin, next = 0,0
     #print pos_list, sentence
     for i,char in enumerate(sentence):
         pos = pos_list[i]
-        if pos=='B':
+        if pos == 'B':
             begin = i
-        elif pos=='E':
+        elif pos == 'E':
             yield sentence[begin:i+1]
             next = i+1
-        elif pos=='S':
+        elif pos == 'S':
             yield char
             next = i+1
-    if next<len(sentence):
+    if next < len(sentence):
         yield sentence[next:]
 def cut(sentence):
-    if not ( type(sentence) is unicode):
+    if not isinstance(sentence, unicode):
         try:
             sentence = sentence.decode('utf-8')
-        except:
-            sentence = sentence.decode('gbk','ignore')
+        except UnicodeDecodeError:
+            sentence = sentence.decode('gbk', 'ignore')
     re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"(\d+\.\d+|[a-zA-Z0-9]+)")
     blocks = re_han.split(sentence)
     for blk in blocks:
@@ -101,5 +101,5 @@ def cut(sentence):
         else:
             tmp = re_skip.split(blk)
             for x in tmp:
-                if x!="":
+                if x:
                     yield x
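
finalseg is the B/M/E/S Viterbi fallback that __cut_DAG hands unrecognized buffers to; it can also be called on its own. The split shown is what the HMM typically yields and is an assumption, not a guaranteed result:

    # -*- coding: utf-8 -*-
    from jieba import finalseg

    print "/".join(finalseg.cut(u"李小福是创新办主任"))   # e.g. 李小福/是/创新/办/主任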
@@ -14,41 +14,42 @@ PROB_TRANS_P = "prob_trans.p"
 PROB_EMIT_P = "prob_emit.p"
 CHAR_STATE_TAB_P = "char_state_tab.p"
-def load_model(f_name,isJython=True):
-    _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+def load_model(f_name, isJython=True):
+    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
     result = {}
-    with file(f_name, "rb") as f:
-        for line in open(f_name,"rb"):
+    with open(f_name, "r") as f:
+        for line in f:
             line = line.strip()
-            if line=="":continue
+            if not line:
+                continue
             word, _, tag = line.split(' ')
-            result[word.decode('utf-8')]=tag
+            result[word.decode('utf-8')] = tag
     f.closed
     if not isJython:
         return result
     start_p = {}
     abs_path = os.path.join(_curpath, PROB_START_P)
-    with open(abs_path, mode='rb') as f:
+    with open(abs_path, mode='r') as f:
         start_p = marshal.load(f)
     f.closed
     trans_p = {}
     abs_path = os.path.join(_curpath, PROB_TRANS_P)
-    with open(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
         trans_p = marshal.load(f)
     f.closed
     emit_p = {}
     abs_path = os.path.join(_curpath, PROB_EMIT_P)
-    with file(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
         emit_p = marshal.load(f)
     f.closed
     state = {}
     abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
-    with file(abs_path, 'rb') as f:
+    with open(abs_path, 'r') as f:
         state = marshal.load(f)
     f.closed
@@ -59,17 +60,17 @@ if sys.platform.startswith("java"):
 else:
     import char_state_tab, prob_start, prob_trans, prob_emit
     char_state_tab_P, start_P, trans_P, emit_P = char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P
-word_tag_tab = load_model(jieba.get_abs_path_dict(),isJython=False)
+word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
 def makesure_userdict_loaded(fn):
     @wraps(fn)
     def wrapped(*args,**kwargs):
-        if len(jieba.user_word_tag_tab)>0:
+        if jieba.user_word_tag_tab:
             word_tag_tab.update(jieba.user_word_tag_tab)
             jieba.user_word_tag_tab = {}
         return fn(*args,**kwargs)
     return wrapped
 class pair(object):
@@ -78,7 +79,7 @@ class pair(object):
         self.flag = flag
     def __unicode__(self):
-        return self.word+u"/"+self.flag
+        return u'%s/%s' % (self.word, self.flag)
     def __repr__(self):
         return self.__str__()
@@ -90,25 +91,25 @@ class pair(object):
         return self.__unicode__().encode(arg)
 def __cut(sentence):
-    prob, pos_list = viterbi.viterbi(sentence,char_state_tab_P, start_P, trans_P, emit_P)
-    begin, next = 0,0
+    prob, pos_list = viterbi.viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
+    begin, next = 0, 0
     for i,char in enumerate(sentence):
         pos = pos_list[i][0]
-        if pos=='B':
+        if pos == 'B':
             begin = i
-        elif pos=='E':
+        elif pos == 'E':
             yield pair(sentence[begin:i+1], pos_list[i][1])
             next = i+1
-        elif pos=='S':
-            yield pair(char,pos_list[i][1])
+        elif pos == 'S':
+            yield pair(char, pos_list[i][1])
             next = i+1
-    if next<len(sentence):
-        yield pair(sentence[next:], pos_list[next][1] )
+    if next < len(sentence):
+        yield pair(sentence[next:], pos_list[next][1])
 def __cut_detail(sentence):
     re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)")
-    re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
+    re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
     blocks = re_han.split(sentence)
     for blk in blocks:
         if re_han.match(blk):
@@ -117,89 +118,88 @@ def __cut_detail(sentence):
         else:
             tmp = re_skip.split(blk)
             for x in tmp:
-                if x!="":
+                if x:
                     if re_num.match(x):
-                        yield pair(x,'m')
+                        yield pair(x, 'm')
                     elif re_eng.match(x):
-                        yield pair(x,'eng')
+                        yield pair(x, 'eng')
                     else:
-                        yield pair(x,'x')
+                        yield pair(x, 'x')
 def __cut_DAG_NO_HMM(sentence):
     DAG = jieba.get_DAG(sentence)
-    route ={}
-    jieba.calc(sentence,DAG,0,route=route)
+    route = {}
+    jieba.calc(sentence, DAG, 0, route=route)
     x = 0
     N = len(sentence)
-    buf =u''
+    buf = u''
     re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
-    while x<N:
+    while x < N:
         y = route[x][1]+1
         l_word = sentence[x:y]
-        if re_eng.match(l_word) and len(l_word)==1:
+        if re_eng.match(l_word) and len(l_word) == 1:
             buf += l_word
             x = y
         else:
-            if len(buf)>0:
+            if buf:
                 yield pair(buf,'eng')
                 buf = u''
-            yield pair(l_word,word_tag_tab.get(l_word,'x'))
-            x =y
-    if len(buf)>0:
+            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
+            x = y
+    if buf:
         yield pair(buf,'eng')
         buf = u''
 def __cut_DAG(sentence):
     DAG = jieba.get_DAG(sentence)
-    route ={}
+    route = {}
     jieba.calc(sentence,DAG,0,route=route)
     x = 0
-    buf =u''
+    buf = u''
     N = len(sentence)
-    while x<N:
+    while x < N:
         y = route[x][1]+1
         l_word = sentence[x:y]
-        if y-x==1:
-            buf+= l_word
+        if y-x == 1:
+            buf += l_word
         else:
-            if len(buf)>0:
-                if len(buf)==1:
-                    yield pair(buf,word_tag_tab.get(buf,'x'))
-                    buf=u''
+            if buf:
+                if len(buf) == 1:
+                    yield pair(buf, word_tag_tab.get(buf, 'x'))
+                    buf = u''
                 else:
                     if (buf not in jieba.FREQ):
-                        regognized = __cut_detail(buf)
-                        for t in regognized:
+                        recognized = __cut_detail(buf)
+                        for t in recognized:
                             yield t
                     else:
                         for elem in buf:
-                            yield pair(elem,word_tag_tab.get(elem,'x'))
-                    buf=u''
-            yield pair(l_word,word_tag_tab.get(l_word,'x'))
-            x =y
-    if len(buf)>0:
-        if len(buf)==1:
-            yield pair(buf,word_tag_tab.get(buf,'x'))
+                            yield pair(elem, word_tag_tab.get(elem, 'x'))
+                    buf = u''
+            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
+            x = y
+    if buf:
+        if len(buf) == 1:
+            yield pair(buf, word_tag_tab.get(buf, 'x'))
+        elif (buf not in jieba.FREQ):
+            recognized = __cut_detail(buf)
+            for t in recognized:
+                yield t
         else:
-            if (buf not in jieba.FREQ):
-                regognized = __cut_detail(buf)
-                for t in regognized:
-                    yield t
-            else:
-                for elem in buf:
-                    yield pair(elem,word_tag_tab.get(elem,'x'))
+            for elem in buf:
+                yield pair(elem, word_tag_tab.get(elem, 'x'))
-def __cut_internal(sentence,HMM=True):
-    if not ( type(sentence) is unicode):
+def __cut_internal(sentence, HMM=True):
+    if not isinstance(sentence, unicode):
         try:
             sentence = sentence.decode('utf-8')
-        except:
-            sentence = sentence.decode('gbk','ignore')
+        except UnicodeDecodeError:
+            sentence = sentence.decode('gbk', 'ignore')
     re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)")
-    re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
+    re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
     blocks = re_han.split(sentence)
     if HMM:
         __cut_blk = __cut_DAG
@@ -214,15 +214,15 @@ def __cut_internal(sentence,HMM=True):
             tmp = re_skip.split(blk)
             for x in tmp:
                 if re_skip.match(x):
-                    yield pair(x,'x')
+                    yield pair(x, 'x')
                 else:
                     for xx in x:
                         if re_num.match(xx):
-                            yield pair(xx,'m')
+                            yield pair(xx, 'm')
                         elif re_eng.match(x):
-                            yield pair(xx,'eng')
+                            yield pair(xx, 'eng')
                         else:
-                            yield pair(xx,'x')
+                            yield pair(xx, 'x')
 def __lcut_internal(sentence):
     return list(__cut_internal(sentence))
@@ -231,16 +231,16 @@ def __lcut_internal_no_hmm(sentence):
 @makesure_userdict_loaded
-def cut(sentence,HMM=True):
-    if (not hasattr(jieba,'pool')) or (jieba.pool==None):
-        for w in __cut_internal(sentence,HMM=HMM):
+def cut(sentence, HMM=True):
+    if (not hasattr(jieba, 'pool')) or (jieba.pool is None):
+        for w in __cut_internal(sentence, HMM=HMM):
             yield w
     else:
         parts = re.compile('([\r\n]+)').split(sentence)
         if HMM:
-            result = jieba.pool.map(__lcut_internal,parts)
+            result = jieba.pool.map(__lcut_internal, parts)
         else:
-            result = jieba.pool.map(__lcut_internal_no_hmm,parts)
+            result = jieba.pool.map(__lcut_internal_no_hmm, parts)
         for r in result:
             for w in r:
                 yield w
@@ -1,46 +1,45 @@
 import operator
-MIN_FLOAT=-3.14e100
-MIN_INF=float("-inf")
-def get_top_states(t_state_v,K=4):
+MIN_FLOAT = -3.14e100
+MIN_INF = float("-inf")
+def get_top_states(t_state_v, K=4):
     items = t_state_v.items()
-    topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K]
+    topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]
     return [x[0] for x in topK]
 def viterbi(obs, states, start_p, trans_p, emit_p):
     V = [{}] #tabular
     mem_path = [{}]
     all_states = trans_p.keys()
-    for y in states.get(obs[0],all_states): #init
-        V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
+    for y in states.get(obs[0], all_states): #init
+        V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
         mem_path[0][y] = ''
-    for t in range(1,len(obs)):
+    for t in xrange(1, len(obs)):
         V.append({})
         mem_path.append({})
         #prev_states = get_top_states(V[t-1])
-        prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
-        prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) )
-        obs_states = states.get(obs[t],all_states)
-        obs_states = set(obs_states) & set(prev_states_expect_next)
-        if len(obs_states)==0: obs_states = prev_states_expect_next
-        if len(obs_states)==0: obs_states = all_states
+        prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]
+        prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
+        obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next
+        if not obs_states:
+            obs_states = prev_states_expect_next if prev_states_expect_next else all_states
         for y in obs_states:
-            (prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in prev_states])
-            V[t][y] =prob
+            prob, state = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states])
+            V[t][y] = prob
             mem_path[t][y] = state
-    last = [(V[-1][y], y) for y in mem_path[-1].keys() ]
+    last = [(V[-1][y], y) for y in mem_path[-1].keys()]
     #if len(last)==0:
     #print obs
-    (prob, state) = max(last)
+    prob, state = max(last)
     route = [None] * len(obs)
-    i = len(obs)-1
-    while i>=0:
+    i = len(obs) - 1
+    while i >= 0:
         route[i] = state
         state = mem_path[i][state]
-        i-=1
+        i -= 1
     return (prob, route)
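
The old code ran two separate emptiness checks after intersecting the candidate states with the states reachable from the previous column; the rewrite folds them into one conditional expression. The fallback in isolation, with placeholder state sets:

    # Prefer states that are both plausible for the current character and
    # reachable from the previous column; otherwise fall back to the reachable
    # set, and only then to all states. (Sets below are hypothetical.)
    char_states = set(['B', 'S'])
    prev_states_expect_next = set(['M', 'E'])
    all_states = set(['B', 'M', 'E', 'S'])

    obs_states = char_states & prev_states_expect_next
    if not obs_states:
        obs_states = prev_states_expect_next if prev_states_expect_next else all_states
    print obs_states    # e.g. set(['M', 'E'])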
@@ -1,11 +1,11 @@
 from distutils.core import setup
 setup(name='jieba',
       version='0.33',
       description='Chinese Words Segementation Utilities',
       author='Sun, Junyi',
       author_email='ccnusjy@gmail.com',
       url='http://github.com/fxsjy',
       packages=['jieba'],
       package_dir={'jieba':'jieba'},
       package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}
 )
@@ -4,14 +4,14 @@ sys.path.append("../")
 import jieba
-seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
-print "Full Mode:", "/ ".join(seg_list) # 全模式
+seg_list = jieba.cut(u"我来到北京清华大学", cut_all=True)
+print u"Full Mode:", u"/ ".join(seg_list) # 全模式
-seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
-print "Default Mode:", "/ ".join(seg_list) # 默认模式
+seg_list = jieba.cut(u"我来到北京清华大学", cut_all=False)
+print u"Default Mode:", u"/ ".join(seg_list) # 默认模式
-seg_list = jieba.cut("他来到了网易杭研大厦")
-print ", ".join(seg_list)
+seg_list = jieba.cut(u"他来到了网易杭研大厦")
+print u", ".join(seg_list)
-seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
-print ", ".join(seg_list)
+seg_list = jieba.cut_for_search(u"小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
+print u", ".join(seg_list)
@@ -6,7 +6,7 @@ import jieba.posseg as pseg
 def cuttest(test_sent):
     result = pseg.cut(test_sent)
     for w in result:
         print w.word, "/", w.flag, ", ",
     print ""
@@ -95,4 +95,4 @@ if __name__ == "__main__":
     cuttest('AT&T是一件不错的公司给你发offer了吗')
     cuttest('C++和c#是什么关系11+122=133是吗PI=3.14159')
     cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
     cuttest('枪杆子中出政权')
@@ -14,7 +14,7 @@ for w in words:
 result = pseg.cut(test_sent)
 for w in result:
     print w.word, "/", w.flag, ", ",
 print "\n========"
@@ -5,7 +5,7 @@ from whoosh.index import create_in,open_dir
 from whoosh.fields import *
 from whoosh.qparser import QueryParser
 from jieba.analyse import ChineseAnalyzer
 analyzer = ChineseAnalyzer()
@@ -18,31 +18,31 @@ ix = create_in("tmp", schema) # for create new index
 writer = ix.writer()
 writer.add_document(
     title=u"document1",
     path=u"/a",
     content=u"This is the first document we've added!"
 )
 writer.add_document(
     title=u"document2",
     path=u"/b",
     content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
 )
 writer.add_document(
     title=u"document3",
     path=u"/c",
     content=u"买水果然后来世博园。"
 )
 writer.add_document(
     title=u"document4",
     path=u"/c",
     content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
 )
 writer.add_document(
     title=u"document4",
     path=u"/c",
     content=u"咱俩交换一下吧。"
 )
@@ -55,7 +55,7 @@ for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交
     print "result of ",keyword
     q = parser.parse(keyword)
     results = searcher.search(q)
     for hit in results:
         print hit.highlights("content")
     print "="*10
@@ -6,7 +6,7 @@ from whoosh.index import create_in
 from whoosh.fields import *
 from whoosh.qparser import QueryParser
 from jieba.analyse import ChineseAnalyzer
 analyzer = ChineseAnalyzer()
@@ -23,7 +23,7 @@ with open(file_name,"rb") as inf:
     for line in inf:
         i+=1
         writer.add_document(
             title=u"line"+str(i),
             path=u"/a",
             content=line.decode('gbk','ignore')
         )
@@ -36,6 +36,6 @@ for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换"
     print "result of ",keyword
     q = parser.parse(keyword)
     results = searcher.search(q)
     for hit in results:
         print hit.highlights("content")
     print "="*10