Mirror of https://github.com/fxsjy/jieba.git (synced 2025-07-10 00:01:33 +08:00)
Commit 4a93f21918

Changelog — 14 changed lines
@@ -13,10 +13,10 @@
2013-07-01: version 0.31
1. Reformatted the code indentation to follow the PEP 8 standard
2. Added support for the Jython interpreter, thanks @piaolingxue
3. Fixed a bug where mixed Chinese/English words starting with digits were not recognized
4. Refactored part of the code, thanks @chao78787
5. Parallel segmentation mode now auto-detects the number of CPUs to choose a suitable process count, thanks @linkerlin
6. Fixed the incorrect dependency of jieba.extract_tags on the whoosh module introduced in 0.3

@@ -55,8 +55,8 @@
2013-04-27: version 0.28
========================
1) Added lazy loading of the dictionary, so users can change the dictionary path after 'import jieba'. Thanks hermanschaaf
2) Report the offending entry when the dictionary fails to load. Thanks neuront
3) Fixed a bug where a dictionary edited with vim failed to load. Thanks neuront

2013-04-22: version 0.27
========================
@@ -93,7 +93,7 @@
2012-11-28: version 0.22
========================
1) Added the jieba.cut_for_search method, which re-segments "long words" on top of the accurate mode; it targets search-engine indexing and has higher recall than the accurate mode.
2) Started supporting Python 3.x (previously only Python 2.x was supported); from this version on there is a separate jieba3k branch.


2012-11-23: version 0.21
@@ -104,7 +104,7 @@

2012-11-06: version 0.20
========================
1) Added part-of-speech tagging


2012-10-25: version 0.19
@@ -17,14 +17,13 @@ import logging

DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
trie = None # to be initialized
pfdict = None # to be initialized
FREQ = {}
min_freq = 0.0
total =0.0
user_word_tag_tab={}
total = 0.0
user_word_tag_tab = {}
initialized = False


log_console = logging.StreamHandler(sys.stderr)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
@@ -34,84 +33,79 @@ def setLogLevel(log_level):
    global logger
    logger.setLevel(log_level)

def gen_trie(f_name):
def gen_pfdict(f_name):
    lfreq = {}
    trie = {}
    pfdict = set()
    ltotal = 0.0
    with open(f_name, 'rb') as f:
        lineno = 0
        for line in f.read().rstrip().decode('utf-8').split('\n'):
            lineno += 1
            try:
                word,freq,_ = line.split(' ')
                word,freq = line.split(' ')[:2]
                freq = float(freq)
                lfreq[word] = freq
                ltotal+=freq
                p = trie
                for c in word:
                    if c not in p:
                        p[c] ={}
                    p = p[c]
                p['']='' #ending flag
                ltotal += freq
                for ch in xrange(len(word)):
                    pfdict.add(word[:ch+1])
            except ValueError, e:
                logger.debug('%s at line %s %s' % (f_name, lineno, line))
                raise ValueError, e
    return trie, lfreq,ltotal
    return pfdict, lfreq, ltotal
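Note (illustrative sketch, not part of this commit): gen_pfdict above replaces the nested-dict trie with a flat set that contains every prefix of every dictionary word, so "can this fragment still grow into a word?" becomes a single set lookup. A toy version of the idea with a made-up three-word lexicon:

    lexicon = {u"北京": 5.0, u"北京大学": 3.0, u"大学": 8.0}

    pfdict = set()
    for word in lexicon:
        for i in range(len(word)):
            pfdict.add(word[:i + 1])      # u"北", u"北京", u"北京大", ...

    # A fragment is only worth extending while it is still a known prefix:
    print(u"北京大" in pfdict)   # True  -> keep scanning
    print(u"京大" in pfdict)     # False -> stop here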
|
||||
def initialize(*args):
|
||||
global trie, FREQ, total, min_freq, initialized
|
||||
if len(args)==0:
|
||||
global pfdict, FREQ, total, min_freq, initialized
|
||||
if not args:
|
||||
dictionary = DICTIONARY
|
||||
else:
|
||||
dictionary = args[0]
|
||||
with DICT_LOCK:
|
||||
if initialized:
|
||||
return
|
||||
if trie:
|
||||
del trie
|
||||
trie = None
|
||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||
if pfdict:
|
||||
del pfdict
|
||||
pfdict = None
|
||||
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
abs_path = os.path.join(_curpath,dictionary)
|
||||
logger.debug("Building Trie..., from %s" % abs_path)
|
||||
logger.debug("Building prefix dict from %s ..." % abs_path)
|
||||
t1 = time.time()
|
||||
if abs_path == os.path.join(_curpath,"dict.txt"): #defautl dictionary
|
||||
cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache")
|
||||
else: #customer dictionary
|
||||
cache_file = os.path.join(tempfile.gettempdir(),"jieba.user."+str(hash(abs_path))+".cache")
|
||||
if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
|
||||
cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
|
||||
else: #custom dictionary
|
||||
cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))
|
||||
|
||||
load_from_cache_fail = True
|
||||
if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(abs_path):
|
||||
logger.debug("loading model from cache %s" % cache_file)
|
||||
if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
|
||||
logger.debug("Loading model from cache %s" % cache_file)
|
||||
try:
|
||||
trie,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
|
||||
load_from_cache_fail = False
|
||||
pfdict,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
|
||||
# prevent conflict with old version
|
||||
load_from_cache_fail = not isinstance(pfdict, set)
|
||||
except:
|
||||
load_from_cache_fail = True
|
||||
|
||||
if load_from_cache_fail:
|
||||
trie,FREQ,total = gen_trie(abs_path)
|
||||
pfdict,FREQ,total = gen_pfdict(abs_path)
|
||||
FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.iteritems()]) #normalize
|
||||
min_freq = min(FREQ.itervalues())
|
||||
logger.debug("dumping model to file cache %s" % cache_file)
|
||||
logger.debug("Dumping model to file cache %s" % cache_file)
|
||||
try:
|
||||
tmp_suffix = "."+str(random.random())
|
||||
with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
|
||||
marshal.dump((trie,FREQ,total,min_freq),temp_cache_file)
|
||||
if os.name=='nt':
|
||||
import shutil
|
||||
replace_file = shutil.move
|
||||
marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
|
||||
if os.name == 'nt':
|
||||
from shutil import move as replace_file
|
||||
else:
|
||||
replace_file = os.rename
|
||||
replace_file(cache_file+tmp_suffix,cache_file)
|
||||
replace_file(cache_file + tmp_suffix, cache_file)
|
||||
except:
|
||||
logger.error("dump cache file failed.")
|
||||
logger.exception("")
|
||||
logger.exception("Dump cache file failed.")
|
||||
|
||||
initialized = True
|
||||
|
||||
logger.debug("loading model cost %s seconds." % (time.time() - t1))
|
||||
logger.debug("Trie has been built succesfully.")
|
||||
logger.debug("Loading model cost %s seconds." % (time.time() - t1))
|
||||
logger.debug("Prefix dict has been built succesfully.")
|
||||
|
||||
|
||||
def require_initialized(fn):
|
||||
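Note (illustrative sketch, not part of this commit): initialize() above keeps the existing cache scheme while switching the payload from the trie to the prefix dict — the parsed dictionary is marshalled to a temporary file, atomically renamed into place, and reused only when the cache is newer than the dictionary file. A minimal sketch of that pattern with made-up data and paths:

    import marshal, os, random, tempfile

    payload = ({u"prefix"}, {u"word": -8.0}, 100.0, -12.0)   # stands in for (pfdict, FREQ, total, min_freq)
    cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache.demo")

    # Dump to a temporary name first, then rename, so readers never see a half-written cache.
    tmp = cache_file + "." + str(random.random())
    with open(tmp, 'wb') as f:
        marshal.dump(payload, f)
    os.rename(tmp, cache_file)   # the code above uses shutil.move instead on Windows

    # Reuse the cache only if it is newer than the source dictionary.
    dict_mtime = 0.0             # stands in for os.path.getmtime(abs_path)
    if os.path.exists(cache_file) and os.path.getmtime(cache_file) > dict_mtime:
        with open(cache_file, 'rb') as f:
            pfdict, FREQ, total, min_freq = marshal.load(f)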
@ -132,145 +126,136 @@ def __cut_all(sentence):
|
||||
dag = get_DAG(sentence)
|
||||
old_j = -1
|
||||
for k,L in dag.iteritems():
|
||||
if len(L)==1 and k>old_j:
|
||||
if len(L) == 1 and k > old_j:
|
||||
yield sentence[k:L[0]+1]
|
||||
old_j = L[0]
|
||||
else:
|
||||
for j in L:
|
||||
if j>k:
|
||||
if j > k:
|
||||
yield sentence[k:j+1]
|
||||
old_j = j
|
||||
|
||||
|
||||
def calc(sentence,DAG,idx,route):
|
||||
N = len(sentence)
|
||||
route[N] = (0.0,'')
|
||||
for idx in xrange(N-1,-1,-1):
|
||||
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0],x ) for x in DAG[idx] ]
|
||||
route[N] = (0.0, '')
|
||||
for idx in xrange(N-1, -1, -1):
|
||||
candidates = [(FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0], x) for x in DAG[idx]]
|
||||
route[idx] = max(candidates)
|
||||
|
||||
@require_initialized
|
||||
def get_DAG(sentence):
|
||||
N = len(sentence)
|
||||
i,j=0,0
|
||||
p = trie
|
||||
global pfdict, FREQ
|
||||
DAG = {}
|
||||
while i<N:
|
||||
c = sentence[j]
|
||||
if c in p:
|
||||
p = p[c]
|
||||
if '' in p:
|
||||
if i not in DAG:
|
||||
DAG[i]=[]
|
||||
DAG[i].append(j)
|
||||
j+=1
|
||||
if j>=N:
|
||||
i+=1
|
||||
j=i
|
||||
p=trie
|
||||
else:
|
||||
p = trie
|
||||
i+=1
|
||||
j=i
|
||||
for i in xrange(len(sentence)):
|
||||
if i not in DAG:
|
||||
DAG[i] =[i]
|
||||
N = len(sentence)
|
||||
for k in xrange(N):
|
||||
tmplist = []
|
||||
i = k
|
||||
frag = sentence[k]
|
||||
while i < N and frag in pfdict:
|
||||
if frag in FREQ:
|
||||
tmplist.append(i)
|
||||
i += 1
|
||||
frag = sentence[k:i+1]
|
||||
if not tmplist:
|
||||
tmplist.append(k)
|
||||
DAG[k] = tmplist
|
||||
return DAG
|
||||
|
||||
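Note (illustrative sketch, not part of this commit): get_DAG above maps each start index of the sentence to the end indices that complete a dictionary word, and calc() earlier in this hunk walks that DAG right to left, keeping the best log-probability route. A toy run with a made-up mini dictionary:

    from math import log

    words = {u"北": 1.0, u"京": 1.0, u"大": 1.0, u"学": 1.0,
             u"北京": 10.0, u"大学": 20.0, u"北京大学": 4.0}
    total = sum(words.values())
    FREQ = dict((w, log(f / total)) for w, f in words.items())
    min_freq = min(FREQ.values())
    pfdict = set(w[:i + 1] for w in words for i in range(len(w)))

    sentence = u"北京大学"
    N = len(sentence)

    DAG = {}                      # DAG[k] = ends j such that sentence[k:j+1] is a word
    for k in range(N):
        tmplist, i, frag = [], k, sentence[k]
        while i < N and frag in pfdict:
            if frag in FREQ:
                tmplist.append(i)
            i += 1
            frag = sentence[k:i + 1]
        DAG[k] = tmplist or [k]

    route = {N: (0.0, '')}        # right-to-left DP, exactly as calc() does
    for idx in range(N - 1, -1, -1):
        route[idx] = max((FREQ.get(sentence[idx:x + 1], min_freq) + route[x + 1][0], x)
                         for x in DAG[idx])

    print(DAG)       # -> {0: [0, 1, 3], 1: [1], 2: [2, 3], 3: [3]}
    print(route[0])  # ends at index 1: the best split here is 北京 / 大学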
def __cut_DAG_NO_HMM(sentence):
|
||||
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
|
||||
DAG = get_DAG(sentence)
|
||||
route ={}
|
||||
calc(sentence,DAG,0,route=route)
|
||||
route = {}
|
||||
calc(sentence, DAG, 0, route=route)
|
||||
x = 0
|
||||
N = len(sentence)
|
||||
buf = u''
|
||||
while x<N:
|
||||
y = route[x][1]+1
|
||||
while x < N:
|
||||
y = route[x][1] + 1
|
||||
l_word = sentence[x:y]
|
||||
if re_eng.match(l_word) and len(l_word)==1:
|
||||
if re_eng.match(l_word) and len(l_word) == 1:
|
||||
buf += l_word
|
||||
x =y
|
||||
x = y
|
||||
else:
|
||||
if len(buf)>0:
|
||||
if buf:
|
||||
yield buf
|
||||
buf = u''
|
||||
yield l_word
|
||||
x =y
|
||||
if len(buf)>0:
|
||||
x = y
|
||||
if buf:
|
||||
yield buf
|
||||
buf = u''
|
||||
|
||||
def __cut_DAG(sentence):
|
||||
DAG = get_DAG(sentence)
|
||||
route ={}
|
||||
calc(sentence,DAG,0,route=route)
|
||||
route = {}
|
||||
calc(sentence, DAG, 0, route=route)
|
||||
x = 0
|
||||
buf =u''
|
||||
buf = u''
|
||||
N = len(sentence)
|
||||
while x<N:
|
||||
while x < N:
|
||||
y = route[x][1]+1
|
||||
l_word = sentence[x:y]
|
||||
if y-x==1:
|
||||
buf+= l_word
|
||||
if y-x == 1:
|
||||
buf += l_word
|
||||
else:
|
||||
if len(buf)>0:
|
||||
if len(buf)==1:
|
||||
if buf:
|
||||
if len(buf) == 1:
|
||||
yield buf
|
||||
buf=u''
|
||||
buf = u''
|
||||
else:
|
||||
if (buf not in FREQ):
|
||||
regognized = finalseg.cut(buf)
|
||||
for t in regognized:
|
||||
recognized = finalseg.cut(buf)
|
||||
for t in recognized:
|
||||
yield t
|
||||
else:
|
||||
for elem in buf:
|
||||
yield elem
|
||||
buf=u''
|
||||
buf = u''
|
||||
yield l_word
|
||||
x =y
|
||||
x = y
|
||||
|
||||
if len(buf)>0:
|
||||
if len(buf)==1:
|
||||
if buf:
|
||||
if len(buf) == 1:
|
||||
yield buf
|
||||
elif (buf not in FREQ):
|
||||
recognized = finalseg.cut(buf)
|
||||
for t in recognized:
|
||||
yield t
|
||||
else:
|
||||
if (buf not in FREQ):
|
||||
regognized = finalseg.cut(buf)
|
||||
for t in regognized:
|
||||
yield t
|
||||
else:
|
||||
for elem in buf:
|
||||
yield elem
|
||||
for elem in buf:
|
||||
yield elem
|
||||
|
||||
def cut(sentence,cut_all=False,HMM=True):
|
||||
def cut(sentence, cut_all=False, HMM=True):
|
||||
'''The main function that segments an entire sentence that contains
|
||||
Chinese characters into seperated words.
|
||||
Parameter:
|
||||
- sentence: The String to be segmented
|
||||
- cut_all: Model. True means full pattern, false means accurate pattern.
|
||||
- HMM: Whether use Hidden Markov Model.
|
||||
- sentence: The str/unicode to be segmented.
|
||||
- cut_all: Model type. True for full pattern, False for accurate pattern.
|
||||
- HMM: Whether to use the Hidden Markov Model.
|
||||
'''
|
||||
if not isinstance(sentence, unicode):
|
||||
try:
|
||||
sentence = sentence.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
'''
|
||||
\u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
|
||||
\r\n|\s : whitespace characters. Will not be Handled.
|
||||
'''
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
|
||||
sentence = sentence.decode('gbk', 'ignore')
|
||||
|
||||
# \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
|
||||
# \r\n|\s : whitespace characters. Will not be handled.
|
||||
|
||||
if cut_all:
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
|
||||
else:
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
|
||||
blocks = re_han.split(sentence)
|
||||
if HMM:
|
||||
if cut_all:
|
||||
cut_block = __cut_all
|
||||
elif HMM:
|
||||
cut_block = __cut_DAG
|
||||
else:
|
||||
cut_block = __cut_DAG_NO_HMM
|
||||
if cut_all:
|
||||
cut_block = __cut_all
|
||||
for blk in blocks:
|
||||
if len(blk)==0:
|
||||
if not blk:
|
||||
continue
|
||||
if re_han.match(blk):
|
||||
for word in cut_block(blk):
|
||||
@ -286,15 +271,15 @@ def cut(sentence,cut_all=False,HMM=True):
|
||||
else:
|
||||
yield x
|
||||
|
||||
def cut_for_search(sentence,HMM=True):
|
||||
words = cut(sentence,HMM=HMM)
|
||||
def cut_for_search(sentence, HMM=True):
|
||||
words = cut(sentence, HMM=HMM)
|
||||
for w in words:
|
||||
if len(w)>2:
|
||||
if len(w) > 2:
|
||||
for i in xrange(len(w)-1):
|
||||
gram2 = w[i:i+2]
|
||||
if gram2 in FREQ:
|
||||
yield gram2
|
||||
if len(w)>3:
|
||||
if len(w) > 3:
|
||||
for i in xrange(len(w)-2):
|
||||
gram3 = w[i:i+3]
|
||||
if gram3 in FREQ:
|
||||
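Note (illustrative sketch, not part of this commit): cut_for_search above first runs the accurate cut, then additionally emits any 2-grams and 3-grams of a long word that are themselves dictionary words, which raises recall for search indexing. A toy version of that expansion with a made-up word set:

    FREQ = set([u"中国", u"科学", u"学院", u"科学院", u"中国科学院"])

    def expand(word):
        if len(word) > 2:
            for i in range(len(word) - 1):
                gram2 = word[i:i + 2]
                if gram2 in FREQ:
                    yield gram2
        if len(word) > 3:
            for i in range(len(word) - 2):
                gram3 = word[i:i + 3]
                if gram3 in FREQ:
                    yield gram3
        yield word    # the full word is kept as well

    print(u"/".join(expand(u"中国科学院")))   # 中国/科学/学院/科学院/中国科学院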
@ -312,79 +297,71 @@ def load_userdict(f):
|
||||
...
|
||||
Word type may be ignored
|
||||
'''
|
||||
global trie,total,FREQ
|
||||
if isinstance(f, (str, unicode)):
|
||||
f = open(f, 'rb')
|
||||
content = f.read().decode('utf-8')
|
||||
line_no = 0
|
||||
for line in content.split("\n"):
|
||||
line_no+=1
|
||||
if line.rstrip()=='': continue
|
||||
tup =line.split(" ")
|
||||
word,freq = tup[0],tup[1]
|
||||
if freq.isdigit() is False: continue
|
||||
if line_no==1:
|
||||
line_no += 1
|
||||
if not line.rstrip():
|
||||
continue
|
||||
tup = line.split(" ")
|
||||
word, freq = tup[0], tup[1]
|
||||
if freq.isdigit() is False:
|
||||
continue
|
||||
if line_no == 1:
|
||||
word = word.replace(u'\ufeff',u"") #remove bom flag if it exists
|
||||
if len(tup)==3:
|
||||
add_word(word, freq, tup[2])
|
||||
else:
|
||||
add_word(word, freq)
|
||||
add_word(*tup)
|
||||
|
||||
@require_initialized
|
||||
def add_word(word, freq, tag=None):
|
||||
global FREQ, trie, total, user_word_tag_tab
|
||||
freq = float(freq)
|
||||
FREQ[word] = log(freq / total)
|
||||
global FREQ, pfdict, total, user_word_tag_tab
|
||||
FREQ[word] = log(float(freq) / total)
|
||||
if tag is not None:
|
||||
user_word_tag_tab[word] = tag.strip()
|
||||
p = trie
|
||||
for c in word:
|
||||
if c not in p:
|
||||
p[c] = {}
|
||||
p = p[c]
|
||||
p[''] = '' # ending flag
|
||||
for ch in xrange(len(word)):
|
||||
pfdict.add(word[:ch+1])
|
||||
|
||||
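Note (illustrative sketch, not part of this commit): load_userdict above reads lines of the form "word freq [tag]" and forwards them to add_word, which updates FREQ, the prefix set and the user tag table in place. A hedged usage sketch — the file name and words are made up, and in this version freq is still a required argument:

    # userdict.txt, one entry per line:
    #   云计算 5 n
    #   创新办 3 i
    import jieba

    jieba.load_userdict("userdict.txt")        # a path or an open file object
    jieba.add_word(u"自定义词", 3, tag="n")    # same effect for a single word
    print(u"/".join(jieba.cut(u"小明来到了创新办")))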
__ref_cut = cut
|
||||
__ref_cut_for_search = cut_for_search
|
||||
|
||||
def __lcut(sentence):
|
||||
return list(__ref_cut(sentence,False))
|
||||
return list(__ref_cut(sentence, False))
|
||||
def __lcut_no_hmm(sentence):
|
||||
return list(__ref_cut(sentence,False,False))
|
||||
return list(__ref_cut(sentence, False, False))
|
||||
def __lcut_all(sentence):
|
||||
return list(__ref_cut(sentence,True))
|
||||
return list(__ref_cut(sentence, True))
|
||||
def __lcut_for_search(sentence):
|
||||
return list(__ref_cut_for_search(sentence))
|
||||
|
||||
|
||||
@require_initialized
|
||||
def enable_parallel(processnum=None):
|
||||
global pool,cut,cut_for_search
|
||||
if os.name=='nt':
|
||||
global pool, cut, cut_for_search
|
||||
if os.name == 'nt':
|
||||
raise Exception("jieba: parallel mode only supports posix system")
|
||||
if sys.version_info[0]==2 and sys.version_info[1]<6:
|
||||
raise Exception("jieba: the parallel feature needs Python version>2.5 ")
|
||||
from multiprocessing import Pool,cpu_count
|
||||
if processnum==None:
|
||||
raise Exception("jieba: the parallel feature needs Python version>2.5")
|
||||
from multiprocessing import Pool, cpu_count
|
||||
if processnum is None:
|
||||
processnum = cpu_count()
|
||||
pool = Pool(processnum)
|
||||
|
||||
def pcut(sentence,cut_all=False,HMM=True):
|
||||
parts = re.compile('([\r\n]+)').split(sentence)
|
||||
if cut_all:
|
||||
result = pool.map(__lcut_all,parts)
|
||||
result = pool.map(__lcut_all, parts)
|
||||
elif HMM:
|
||||
result = pool.map(__lcut, parts)
|
||||
else:
|
||||
if HMM:
|
||||
result = pool.map(__lcut,parts)
|
||||
else:
|
||||
result = pool.map(__lcut_no_hmm,parts)
|
||||
result = pool.map(__lcut_no_hmm, parts)
|
||||
for r in result:
|
||||
for w in r:
|
||||
yield w
|
||||
|
||||
def pcut_for_search(sentence):
|
||||
parts = re.compile('([\r\n]+)').split(sentence)
|
||||
result = pool.map(__lcut_for_search,parts)
|
||||
result = pool.map(__lcut_for_search, parts)
|
||||
for r in result:
|
||||
for w in r:
|
||||
yield w
|
||||
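Note (illustrative sketch, not part of this commit): enable_parallel above swaps cut/cut_for_search for pool-backed versions that split the input on line breaks and map the pieces over a multiprocessing.Pool; it is POSIX-only and defaults to cpu_count() processes. Typical usage:

    import jieba

    jieba.enable_parallel(4)     # or jieba.enable_parallel() to use cpu_count()
    words = list(jieba.cut(u"第一行\n第二行\n第三行"))
    jieba.disable_parallel()
    print(words)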
@ -403,40 +380,44 @@ def disable_parallel():
|
||||
def set_dictionary(dictionary_path):
|
||||
global initialized, DICTIONARY
|
||||
with DICT_LOCK:
|
||||
abs_path = os.path.normpath( os.path.join( os.getcwd(), dictionary_path ) )
|
||||
abs_path = os.path.normpath(os.path.join(os.getcwd(), dictionary_path))
|
||||
if not os.path.exists(abs_path):
|
||||
raise Exception("jieba: path does not exist:" + abs_path)
|
||||
raise Exception("jieba: path does not exist: " + abs_path)
|
||||
DICTIONARY = abs_path
|
||||
initialized = False
|
||||
|
||||
def get_abs_path_dict():
|
||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
abs_path = os.path.join(_curpath,DICTIONARY)
|
||||
return abs_path
|
||||
|
||||
def tokenize(unicode_sentence,mode="default",HMM=True):
|
||||
#mode ("default" or "search")
|
||||
def tokenize(unicode_sentence, mode="default", HMM=True):
|
||||
"""Tokenize a sentence and yields tuples of (word, start, end)
|
||||
Parameter:
|
||||
- sentence: the unicode to be segmented.
|
||||
- mode: "default" or "search", "search" is for finer segmentation.
|
||||
- HMM: whether to use the Hidden Markov Model.
|
||||
"""
|
||||
if not isinstance(unicode_sentence, unicode):
|
||||
raise Exception("jieba: the input parameter should unicode.")
|
||||
raise Exception("jieba: the input parameter should be unicode.")
|
||||
start = 0
|
||||
if mode=='default':
|
||||
for w in cut(unicode_sentence,HMM=HMM):
|
||||
if mode == 'default':
|
||||
for w in cut(unicode_sentence, HMM=HMM):
|
||||
width = len(w)
|
||||
yield (w,start,start+width)
|
||||
start+=width
|
||||
yield (w, start, start+width)
|
||||
start += width
|
||||
else:
|
||||
for w in cut(unicode_sentence,HMM=HMM):
|
||||
for w in cut(unicode_sentence, HMM=HMM):
|
||||
width = len(w)
|
||||
if len(w)>2:
|
||||
if len(w) > 2:
|
||||
for i in xrange(len(w)-1):
|
||||
gram2 = w[i:i+2]
|
||||
if gram2 in FREQ:
|
||||
yield (gram2,start+i,start+i+2)
|
||||
if len(w)>3:
|
||||
yield (gram2, start+i, start+i+2)
|
||||
if len(w) > 3:
|
||||
for i in xrange(len(w)-2):
|
||||
gram3 = w[i:i+3]
|
||||
if gram3 in FREQ:
|
||||
yield (gram3,start+i,start+i+3)
|
||||
yield (w,start,start+width)
|
||||
start+=width
|
||||
|
||||
yield (gram3, start+i, start+i+3)
|
||||
yield (w, start, start+width)
|
||||
start += width
|
||||
|
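Note (illustrative sketch, not part of this commit): tokenize above wraps cut and yields (word, start, end) offsets into the original unicode string; mode="search" additionally reports the in-word 2-grams/3-grams with their own offsets. Example usage:

    import jieba

    for tk in jieba.tokenize(u"永和服装饰品有限公司"):
        print(u"%s\t start: %d \t end: %d" % tk)

    for tk in jieba.tokenize(u"永和服装饰品有限公司", mode="search"):
        print(u"%s\t start: %d \t end: %d" % tk)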
jieba/__main__.py — 37 lines (new file)
@@ -0,0 +1,37 @@
"""Jieba command line interface."""
import sys
import jieba
from argparse import ArgumentParser

parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", epilog="If no filename specified, use STDIN instead.")
parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
                    nargs='?', const=' ',
                    help="use DELIM instead of ' / ' for word delimiter; use a space if it is without DELIM")
parser.add_argument("-a", "--cut-all",
                    action="store_true", dest="cutall", default=False,
                    help="full pattern cutting")
parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
                    default=True, help="don't use the Hidden Markov Model")
parser.add_argument("-q", "--quiet", action="store_true", default=False,
                    help="don't print loading messages to stderr")
parser.add_argument("-V", '--version', action='version',
                    version="Jieba " + jieba.__version__)
parser.add_argument("filename", nargs='?', help="input file")

args = parser.parse_args()

if args.quiet:
    jieba.setLogLevel(60)
delim = unicode(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin

jieba.initialize()
ln = fp.readline()
while ln:
    l = ln.rstrip('\r\n')
    print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8'))
    ln = fp.readline()

fp.close()
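Note (illustrative, not part of this commit): with this new __main__.py the package can be run directly; the file name below is made up, and STDIN is used when no file is given:

    python -m jieba -d ' / ' news.txt > cut_result.txt
    python -m jieba --no-hmm --quiet news.txt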
@ -6,12 +6,14 @@ try:
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
_curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
abs_path = os.path.join(_curpath, "idf.txt")
|
||||
|
||||
STOP_WORDS = set([
|
||||
"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
|
||||
])
|
||||
STOP_WORDS = set((
|
||||
"the","of","is","and","to","in","that","we","for","an","are",
|
||||
"by","be","as","on","with","can","if","from","which","you","it",
|
||||
"this","then","at","have","all","not","one","has","or","that"
|
||||
))
|
||||
|
||||
class IDFLoader:
|
||||
def __init__(self):
|
||||
@ -21,13 +23,13 @@ class IDFLoader:
|
||||
|
||||
def set_new_path(self, new_idf_path):
|
||||
if self.path != new_idf_path:
|
||||
content = open(new_idf_path,'rb').read().decode('utf-8')
|
||||
content = open(new_idf_path, 'rb').read().decode('utf-8')
|
||||
idf_freq = {}
|
||||
lines = content.split('\n')
|
||||
if lines and not lines[-1]:
|
||||
lines.pop(-1)
|
||||
for line in lines:
|
||||
word,freq = line.split(' ')
|
||||
word, freq = line.split(' ')
|
||||
idf_freq[word] = float(freq)
|
||||
median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
|
||||
self.idf_freq = idf_freq
|
||||
@ -41,24 +43,22 @@ idf_loader = IDFLoader()
|
||||
idf_loader.set_new_path(abs_path)
|
||||
|
||||
def set_idf_path(idf_path):
|
||||
new_abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path ) )
|
||||
new_abs_path = os.path.normpath(os.path.join(os.getcwd(), idf_path))
|
||||
if not os.path.exists(new_abs_path):
|
||||
raise Exception("jieba: path does not exist:" + new_abs_path)
|
||||
raise Exception("jieba: path does not exist: " + new_abs_path)
|
||||
idf_loader.set_new_path(new_abs_path)
|
||||
return
|
||||
|
||||
def set_stop_words(stop_words_path):
|
||||
global STOP_WORDS
|
||||
abs_path = os.path.normpath( os.path.join( os.getcwd(), stop_words_path ) )
|
||||
abs_path = os.path.normpath(os.path.join(os.getcwd(), stop_words_path))
|
||||
if not os.path.exists(abs_path):
|
||||
raise Exception("jieba: path does not exist:" + abs_path)
|
||||
raise Exception("jieba: path does not exist: " + abs_path)
|
||||
content = open(abs_path,'rb').read().decode('utf-8')
|
||||
lines = content.split('\n')
|
||||
for line in lines:
|
||||
STOP_WORDS.add(line)
|
||||
return
|
||||
|
||||
def extract_tags(sentence,topK=20):
|
||||
def extract_tags(sentence, topK=20):
|
||||
global STOP_WORDS
|
||||
|
||||
idf_freq, median_idf = idf_loader.get_idf()
|
||||
@ -66,15 +66,17 @@ def extract_tags(sentence,topK=20):
|
||||
words = jieba.cut(sentence)
|
||||
freq = {}
|
||||
for w in words:
|
||||
if len(w.strip())<2: continue
|
||||
if w.lower() in STOP_WORDS: continue
|
||||
freq[w]=freq.get(w,0.0)+1.0
|
||||
if len(w.strip()) < 2:
|
||||
continue
|
||||
if w.lower() in STOP_WORDS:
|
||||
continue
|
||||
freq[w] = freq.get(w, 0.0) + 1.0
|
||||
total = sum(freq.values())
|
||||
freq = [(k,v/total) for k,v in freq.iteritems()]
|
||||
|
||||
tf_idf_list = [(v * idf_freq.get(k,median_idf),k) for k,v in freq]
|
||||
st_list = sorted(tf_idf_list,reverse=True)
|
||||
tf_idf_list = [(v*idf_freq.get(k,median_idf), k) for k,v in freq]
|
||||
st_list = sorted(tf_idf_list, reverse=True)
|
||||
|
||||
top_tuples= st_list[:topK]
|
||||
top_tuples = st_list[:topK]
|
||||
tags = [a[1] for a in top_tuples]
|
||||
return tags
|
||||
|
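Note (illustrative sketch, not part of this commit): extract_tags above scores every remaining word by its term frequency times its IDF, falling back to the median IDF for words missing from idf.txt, and returns the topK words. A compact version of that ranking with made-up numbers:

    counts = {u"分词": 4.0, u"中文": 2.0, u"引擎": 1.0}   # raw term counts
    idf_freq = {u"分词": 8.5, u"引擎": 6.2}               # loaded from idf.txt in the real code
    median_idf = 7.0                                      # fallback for unseen words

    total = sum(counts.values())
    tf = dict((w, c / total) for w, c in counts.items())
    scores = sorted(((tf[w] * idf_freq.get(w, median_idf), w) for w in tf), reverse=True)

    topK = 2
    print([w for _, w in scores[:topK]])    # [u'分词', u'中文']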
@@ -1,6 +1,6 @@
#encoding=utf-8
##encoding=utf-8
from whoosh.analysis import RegexAnalyzer,LowercaseFilter,StopFilter,StemFilter
from whoosh.analysis import Tokenizer,Token
from whoosh.lang.porter import stem

import jieba
@@ -10,26 +10,24 @@ STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
                        'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
                        'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
                        'to', 'us', 'we', 'when', 'will', 'with', 'yet',
                        'you', 'your',u'的',u'了',u'和'))
                        'you', 'your', u'的', u'了', u'和'))

accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")

class ChineseTokenizer(Tokenizer):
    def __call__(self,text,**kargs):
        words = jieba.tokenize(text,mode="search")
        token = Token()
    def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w,start_pos,stop_pos) in words:
            if not accepted_chars.match(w):
                if len(w)>1:
                    pass
                else:
                    continue
            if not accepted_chars.match(w) and len(w)<=1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token

def ChineseAnalyzer(stoplist=STOP_WORDS,minsize=1,stemfn=stem,cachesize=50000):
    return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)\
    |StemFilter(stemfn=stemfn, ignore=None,cachesize=cachesize)
def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
    return (ChineseTokenizer() | LowercaseFilter() |
            StopFilter(stoplist=stoplist,minsize=minsize) |
            StemFilter(stemfn=stemfn, ignore=None,cachesize=cachesize))
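Note (illustrative sketch, not part of this commit): ChineseAnalyzer above chains the jieba-backed tokenizer with Whoosh's lowercase, stop-word and stemming filters, so it can be used like any other Whoosh analyzer. A minimal sketch — the index directory and document are made up; test/test_whoosh.py later in this diff does the same at more length:

    import os
    from whoosh.index import create_in
    from whoosh.fields import Schema, TEXT, ID
    from whoosh.qparser import QueryParser
    from jieba.analyse import ChineseAnalyzer

    schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                    content=TEXT(stored=True, analyzer=ChineseAnalyzer()))
    if not os.path.exists("tmp"):
        os.mkdir("tmp")
    ix = create_in("tmp", schema)

    writer = ix.writer()
    writer.add_document(title=u"doc1", path=u"/a", content=u"买水果然后来世博园。")
    writer.commit()

    with ix.searcher() as searcher:
        q = QueryParser("content", schema=ix.schema).parse(u"水果")
        for hit in searcher.search(q):
            print(hit.highlights("content"))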
@ -4,7 +4,7 @@ import os
|
||||
import marshal
|
||||
import sys
|
||||
|
||||
MIN_FLOAT=-3.14e100
|
||||
MIN_FLOAT = -3.14e100
|
||||
|
||||
PROB_START_P = "prob_start.p"
|
||||
PROB_TRANS_P = "prob_trans.p"
|
||||
@ -19,30 +19,30 @@ PrevStatus = {
|
||||
}
|
||||
|
||||
def load_model():
|
||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||
_curpath=os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
start_p = {}
|
||||
abs_path = os.path.join(_curpath, PROB_START_P)
|
||||
with open(abs_path, mode='rb') as f:
|
||||
with open(abs_path, mode='r') as f:
|
||||
start_p = marshal.load(f)
|
||||
f.closed
|
||||
|
||||
|
||||
trans_p = {}
|
||||
abs_path = os.path.join(_curpath, PROB_TRANS_P)
|
||||
with open(abs_path, 'rb') as f:
|
||||
with open(abs_path, 'r') as f:
|
||||
trans_p = marshal.load(f)
|
||||
f.closed
|
||||
|
||||
|
||||
emit_p = {}
|
||||
abs_path = os.path.join(_curpath, PROB_EMIT_P)
|
||||
with file(abs_path, 'rb') as f:
|
||||
with open(abs_path, 'r') as f:
|
||||
emit_p = marshal.load(f)
|
||||
f.closed
|
||||
|
||||
return start_p, trans_p, emit_p
|
||||
|
||||
if sys.platform.startswith("java"):
|
||||
start_P, trans_P, emit_P = load_model()
|
||||
start_P, trans_P, emit_P = load_model()
|
||||
else:
|
||||
import prob_start,prob_trans,prob_emit
|
||||
start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P
|
||||
@ -53,45 +53,45 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
|
||||
for y in states: #init
|
||||
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
|
||||
path[y] = [y]
|
||||
for t in range(1,len(obs)):
|
||||
for t in xrange(1,len(obs)):
|
||||
V.append({})
|
||||
newpath = {}
|
||||
for y in states:
|
||||
em_p = emit_p[y].get(obs[t],MIN_FLOAT)
|
||||
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + em_p ,y0) for y0 in PrevStatus[y] ])
|
||||
V[t][y] =prob
|
||||
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])
|
||||
V[t][y] = prob
|
||||
newpath[y] = path[state] + [y]
|
||||
path = newpath
|
||||
|
||||
(prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E','S')])
|
||||
|
||||
|
||||
(prob, state) = max([(V[len(obs)-1][y], y) for y in ('E','S')])
|
||||
|
||||
return (prob, path[state])
|
||||
|
||||
|
||||
def __cut(sentence):
|
||||
global emit_P
|
||||
prob, pos_list = viterbi(sentence,('B','M','E','S'), start_P, trans_P, emit_P)
|
||||
prob, pos_list = viterbi(sentence, ('B','M','E','S'), start_P, trans_P, emit_P)
|
||||
begin, next = 0,0
|
||||
#print pos_list, sentence
|
||||
for i,char in enumerate(sentence):
|
||||
pos = pos_list[i]
|
||||
if pos=='B':
|
||||
if pos == 'B':
|
||||
begin = i
|
||||
elif pos=='E':
|
||||
elif pos == 'E':
|
||||
yield sentence[begin:i+1]
|
||||
next = i+1
|
||||
elif pos=='S':
|
||||
elif pos == 'S':
|
||||
yield char
|
||||
next = i+1
|
||||
if next<len(sentence):
|
||||
if next < len(sentence):
|
||||
yield sentence[next:]
|
||||
|
||||
def cut(sentence):
|
||||
if not ( type(sentence) is unicode):
|
||||
if not isinstance(sentence, unicode):
|
||||
try:
|
||||
sentence = sentence.decode('utf-8')
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
except UnicodeDecodeError:
|
||||
sentence = sentence.decode('gbk', 'ignore')
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"(\d+\.\d+|[a-zA-Z0-9]+)")
|
||||
blocks = re_han.split(sentence)
|
||||
for blk in blocks:
|
||||
@ -101,5 +101,5 @@ def cut(sentence):
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if x!="":
|
||||
if x:
|
||||
yield x
|
||||
|
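Note (illustrative sketch, not part of this commit): finalseg above is the HMM fallback that jieba.__cut_DAG (earlier in this diff) hands out-of-vocabulary fragments to — each character is Viterbi-decoded into a B/M/E/S state and the states are regrouped into words. It can also be called on its own:

    from jieba import finalseg

    # Out-of-vocabulary strings still get grouped by the character HMM;
    # the exact output depends on the trained model parameters.
    print(u"/".join(finalseg.cut(u"他来到了网易杭研大厦")))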
@ -14,41 +14,42 @@ PROB_TRANS_P = "prob_trans.p"
|
||||
PROB_EMIT_P = "prob_emit.p"
|
||||
CHAR_STATE_TAB_P = "char_state_tab.p"
|
||||
|
||||
def load_model(f_name,isJython=True):
|
||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||
def load_model(f_name, isJython=True):
|
||||
_curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
result = {}
|
||||
with file(f_name, "rb") as f:
|
||||
for line in open(f_name,"rb"):
|
||||
with open(f_name, "r") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if line=="":continue
|
||||
if not line:
|
||||
continue
|
||||
word, _, tag = line.split(' ')
|
||||
result[word.decode('utf-8')]=tag
|
||||
result[word.decode('utf-8')] = tag
|
||||
f.closed
|
||||
if not isJython:
|
||||
return result
|
||||
|
||||
|
||||
start_p = {}
|
||||
abs_path = os.path.join(_curpath, PROB_START_P)
|
||||
with open(abs_path, mode='rb') as f:
|
||||
with open(abs_path, mode='r') as f:
|
||||
start_p = marshal.load(f)
|
||||
f.closed
|
||||
|
||||
|
||||
trans_p = {}
|
||||
abs_path = os.path.join(_curpath, PROB_TRANS_P)
|
||||
with open(abs_path, 'rb') as f:
|
||||
with open(abs_path, 'r') as f:
|
||||
trans_p = marshal.load(f)
|
||||
f.closed
|
||||
|
||||
|
||||
emit_p = {}
|
||||
abs_path = os.path.join(_curpath, PROB_EMIT_P)
|
||||
with file(abs_path, 'rb') as f:
|
||||
with open(abs_path, 'r') as f:
|
||||
emit_p = marshal.load(f)
|
||||
f.closed
|
||||
|
||||
state = {}
|
||||
abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
|
||||
with file(abs_path, 'rb') as f:
|
||||
with open(abs_path, 'r') as f:
|
||||
state = marshal.load(f)
|
||||
f.closed
|
||||
|
||||
@ -59,17 +60,17 @@ if sys.platform.startswith("java"):
|
||||
else:
|
||||
import char_state_tab, prob_start, prob_trans, prob_emit
|
||||
char_state_tab_P, start_P, trans_P, emit_P = char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P
|
||||
word_tag_tab = load_model(jieba.get_abs_path_dict(),isJython=False)
|
||||
word_tag_tab = load_model(jieba.get_abs_path_dict(), isJython=False)
|
||||
|
||||
def makesure_userdict_loaded(fn):
|
||||
|
||||
|
||||
@wraps(fn)
|
||||
def wrapped(*args,**kwargs):
|
||||
if len(jieba.user_word_tag_tab)>0:
|
||||
if jieba.user_word_tag_tab:
|
||||
word_tag_tab.update(jieba.user_word_tag_tab)
|
||||
jieba.user_word_tag_tab = {}
|
||||
return fn(*args,**kwargs)
|
||||
|
||||
|
||||
return wrapped
|
||||
|
||||
class pair(object):
|
||||
@ -78,7 +79,7 @@ class pair(object):
|
||||
self.flag = flag
|
||||
|
||||
def __unicode__(self):
|
||||
return self.word+u"/"+self.flag
|
||||
return u'%s/%s' % (self.word, self.flag)
|
||||
|
||||
def __repr__(self):
|
||||
return self.__str__()
|
||||
@ -90,25 +91,25 @@ class pair(object):
|
||||
return self.__unicode__().encode(arg)
|
||||
|
||||
def __cut(sentence):
|
||||
prob, pos_list = viterbi.viterbi(sentence,char_state_tab_P, start_P, trans_P, emit_P)
|
||||
begin, next = 0,0
|
||||
prob, pos_list = viterbi.viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
|
||||
begin, next = 0, 0
|
||||
|
||||
for i,char in enumerate(sentence):
|
||||
pos = pos_list[i][0]
|
||||
if pos=='B':
|
||||
if pos == 'B':
|
||||
begin = i
|
||||
elif pos=='E':
|
||||
elif pos == 'E':
|
||||
yield pair(sentence[begin:i+1], pos_list[i][1])
|
||||
next = i+1
|
||||
elif pos=='S':
|
||||
yield pair(char,pos_list[i][1])
|
||||
elif pos == 'S':
|
||||
yield pair(char, pos_list[i][1])
|
||||
next = i+1
|
||||
if next<len(sentence):
|
||||
yield pair(sentence[next:], pos_list[next][1] )
|
||||
if next < len(sentence):
|
||||
yield pair(sentence[next:], pos_list[next][1])
|
||||
|
||||
def __cut_detail(sentence):
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)")
|
||||
re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
|
||||
re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
|
||||
blocks = re_han.split(sentence)
|
||||
for blk in blocks:
|
||||
if re_han.match(blk):
|
||||
@ -117,89 +118,88 @@ def __cut_detail(sentence):
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if x!="":
|
||||
if x:
|
||||
if re_num.match(x):
|
||||
yield pair(x,'m')
|
||||
yield pair(x, 'm')
|
||||
elif re_eng.match(x):
|
||||
yield pair(x,'eng')
|
||||
yield pair(x, 'eng')
|
||||
else:
|
||||
yield pair(x,'x')
|
||||
yield pair(x, 'x')
|
||||
|
||||
def __cut_DAG_NO_HMM(sentence):
|
||||
DAG = jieba.get_DAG(sentence)
|
||||
route ={}
|
||||
jieba.calc(sentence,DAG,0,route=route)
|
||||
route = {}
|
||||
jieba.calc(sentence, DAG, 0, route=route)
|
||||
x = 0
|
||||
N = len(sentence)
|
||||
buf =u''
|
||||
buf = u''
|
||||
re_eng = re.compile(ur'[a-zA-Z0-9]',re.U)
|
||||
while x<N:
|
||||
while x < N:
|
||||
y = route[x][1]+1
|
||||
l_word = sentence[x:y]
|
||||
if re_eng.match(l_word) and len(l_word)==1:
|
||||
if re_eng.match(l_word) and len(l_word) == 1:
|
||||
buf += l_word
|
||||
x = y
|
||||
else:
|
||||
if len(buf)>0:
|
||||
if buf:
|
||||
yield pair(buf,'eng')
|
||||
buf = u''
|
||||
yield pair(l_word,word_tag_tab.get(l_word,'x'))
|
||||
x =y
|
||||
if len(buf)>0:
|
||||
yield pair(l_word, word_tag_tab.get(l_word, 'x'))
|
||||
x = y
|
||||
if buf:
|
||||
yield pair(buf,'eng')
|
||||
buf = u''
|
||||
|
||||
def __cut_DAG(sentence):
|
||||
DAG = jieba.get_DAG(sentence)
|
||||
route ={}
|
||||
|
||||
route = {}
|
||||
|
||||
jieba.calc(sentence,DAG,0,route=route)
|
||||
|
||||
x = 0
|
||||
buf =u''
|
||||
buf = u''
|
||||
N = len(sentence)
|
||||
while x<N:
|
||||
while x < N:
|
||||
y = route[x][1]+1
|
||||
l_word = sentence[x:y]
|
||||
if y-x==1:
|
||||
buf+= l_word
|
||||
if y-x == 1:
|
||||
buf += l_word
|
||||
else:
|
||||
if len(buf)>0:
|
||||
if len(buf)==1:
|
||||
yield pair(buf,word_tag_tab.get(buf,'x'))
|
||||
buf=u''
|
||||
if buf:
|
||||
if len(buf) == 1:
|
||||
yield pair(buf, word_tag_tab.get(buf, 'x'))
|
||||
buf = u''
|
||||
else:
|
||||
if (buf not in jieba.FREQ):
|
||||
regognized = __cut_detail(buf)
|
||||
for t in regognized:
|
||||
recognized = __cut_detail(buf)
|
||||
for t in recognized:
|
||||
yield t
|
||||
else:
|
||||
for elem in buf:
|
||||
yield pair(elem,word_tag_tab.get(elem,'x'))
|
||||
buf=u''
|
||||
yield pair(l_word,word_tag_tab.get(l_word,'x'))
|
||||
x =y
|
||||
yield pair(elem, word_tag_tab.get(elem, 'x'))
|
||||
buf = u''
|
||||
yield pair(l_word, word_tag_tab.get(l_word, 'x'))
|
||||
x = y
|
||||
|
||||
if len(buf)>0:
|
||||
if len(buf)==1:
|
||||
yield pair(buf,word_tag_tab.get(buf,'x'))
|
||||
if buf:
|
||||
if len(buf) == 1:
|
||||
yield pair(buf, word_tag_tab.get(buf, 'x'))
|
||||
elif (buf not in jieba.FREQ):
|
||||
recognized = __cut_detail(buf)
|
||||
for t in recognized:
|
||||
yield t
|
||||
else:
|
||||
if (buf not in jieba.FREQ):
|
||||
regognized = __cut_detail(buf)
|
||||
for t in regognized:
|
||||
yield t
|
||||
else:
|
||||
for elem in buf:
|
||||
yield pair(elem,word_tag_tab.get(elem,'x'))
|
||||
for elem in buf:
|
||||
yield pair(elem, word_tag_tab.get(elem, 'x'))
|
||||
|
||||
def __cut_internal(sentence,HMM=True):
|
||||
if not ( type(sentence) is unicode):
|
||||
def __cut_internal(sentence, HMM=True):
|
||||
if not isinstance(sentence, unicode):
|
||||
try:
|
||||
sentence = sentence.decode('utf-8')
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
except UnicodeDecodeError:
|
||||
sentence = sentence.decode('gbk', 'ignore')
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)")
|
||||
re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
|
||||
re_eng, re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
|
||||
blocks = re_han.split(sentence)
|
||||
if HMM:
|
||||
__cut_blk = __cut_DAG
|
||||
@ -214,15 +214,15 @@ def __cut_internal(sentence,HMM=True):
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if re_skip.match(x):
|
||||
yield pair(x,'x')
|
||||
yield pair(x, 'x')
|
||||
else:
|
||||
for xx in x:
|
||||
if re_num.match(xx):
|
||||
yield pair(xx,'m')
|
||||
yield pair(xx, 'm')
|
||||
elif re_eng.match(x):
|
||||
yield pair(xx,'eng')
|
||||
yield pair(xx, 'eng')
|
||||
else:
|
||||
yield pair(xx,'x')
|
||||
yield pair(xx, 'x')
|
||||
|
||||
def __lcut_internal(sentence):
|
||||
return list(__cut_internal(sentence))
|
||||
@ -231,16 +231,16 @@ def __lcut_internal_no_hmm(sentence):
|
||||
|
||||
|
||||
@makesure_userdict_loaded
|
||||
def cut(sentence,HMM=True):
|
||||
if (not hasattr(jieba,'pool')) or (jieba.pool==None):
|
||||
for w in __cut_internal(sentence,HMM=HMM):
|
||||
def cut(sentence, HMM=True):
|
||||
if (not hasattr(jieba, 'pool')) or (jieba.pool is None):
|
||||
for w in __cut_internal(sentence, HMM=HMM):
|
||||
yield w
|
||||
else:
|
||||
parts = re.compile('([\r\n]+)').split(sentence)
|
||||
if HMM:
|
||||
result = jieba.pool.map(__lcut_internal,parts)
|
||||
result = jieba.pool.map(__lcut_internal, parts)
|
||||
else:
|
||||
result = jieba.pool.map(__lcut_internal_no_hmm,parts)
|
||||
result = jieba.pool.map(__lcut_internal_no_hmm, parts)
|
||||
for r in result:
|
||||
for w in r:
|
||||
yield w
|
||||
|
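Note (illustrative sketch, not part of this commit): jieba.posseg combines segmentation with part-of-speech tagging; cut() yields pair objects that render as word/flag. Typical usage, matching the test scripts later in this diff:

    import jieba.posseg as pseg

    for w in pseg.cut(u"我爱北京天安门"):
        print(u"%s/%s" % (w.word, w.flag))   # e.g. 我/r, 爱/v, 北京/ns, 天安门/ns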
@ -1,46 +1,45 @@
|
||||
import operator
|
||||
MIN_FLOAT=-3.14e100
|
||||
MIN_INF=float("-inf")
|
||||
MIN_FLOAT = -3.14e100
|
||||
MIN_INF = float("-inf")
|
||||
|
||||
def get_top_states(t_state_v,K=4):
|
||||
def get_top_states(t_state_v, K=4):
|
||||
items = t_state_v.items()
|
||||
topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K]
|
||||
topK = sorted(items, key=operator.itemgetter(1), reverse=True)[:K]
|
||||
return [x[0] for x in topK]
|
||||
|
||||
def viterbi(obs, states, start_p, trans_p, emit_p):
|
||||
V = [{}] #tabular
|
||||
mem_path = [{}]
|
||||
all_states = trans_p.keys()
|
||||
for y in states.get(obs[0],all_states): #init
|
||||
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
|
||||
for y in states.get(obs[0], all_states): #init
|
||||
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
|
||||
mem_path[0][y] = ''
|
||||
for t in range(1,len(obs)):
|
||||
for t in xrange(1, len(obs)):
|
||||
V.append({})
|
||||
mem_path.append({})
|
||||
#prev_states = get_top_states(V[t-1])
|
||||
prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
|
||||
prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]
|
||||
|
||||
prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) )
|
||||
obs_states = states.get(obs[t],all_states)
|
||||
obs_states = set(obs_states) & set(prev_states_expect_next)
|
||||
prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
|
||||
obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next
|
||||
|
||||
if len(obs_states)==0: obs_states = prev_states_expect_next
|
||||
if len(obs_states)==0: obs_states = all_states
|
||||
if not obs_states:
|
||||
obs_states = prev_states_expect_next if prev_states_expect_next else all_states
|
||||
|
||||
for y in obs_states:
|
||||
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in prev_states])
|
||||
V[t][y] =prob
|
||||
prob, state = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states])
|
||||
V[t][y] = prob
|
||||
mem_path[t][y] = state
|
||||
|
||||
last = [(V[-1][y], y) for y in mem_path[-1].keys() ]
|
||||
last = [(V[-1][y], y) for y in mem_path[-1].keys()]
|
||||
#if len(last)==0:
|
||||
#print obs
|
||||
(prob, state) = max(last)
|
||||
prob, state = max(last)
|
||||
|
||||
route = [None] * len(obs)
|
||||
i = len(obs)-1
|
||||
while i>=0:
|
||||
i = len(obs) - 1
|
||||
while i >= 0:
|
||||
route[i] = state
|
||||
state = mem_path[i][state]
|
||||
i-=1
|
||||
return (prob, route)
|
||||
i -= 1
|
||||
return (prob, route)
|
||||
|
setup.py — 18 changed lines
@@ -1,11 +1,11 @@
from distutils.core import setup
setup(name='jieba',
      version='0.33',
      description='Chinese Words Segementation Utilities',
      author='Sun, Junyi',
      author_email='ccnusjy@gmail.com',
      url='http://github.com/fxsjy',
      packages=['jieba'],
      package_dir={'jieba':'jieba'},
      package_data={'jieba':['*.*','finalseg/*','analyse/*','posseg/*']}
)
test/demo.py — 16 changed lines
@@ -4,14 +4,14 @@ sys.path.append("../")

import jieba

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print "Full Mode:", "/ ".join(seg_list)  # full mode
seg_list = jieba.cut(u"我来到北京清华大学", cut_all=True)
print u"Full Mode:", u"/ ".join(seg_list)  # full mode

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print "Default Mode:", "/ ".join(seg_list)  # default (accurate) mode
seg_list = jieba.cut(u"我来到北京清华大学", cut_all=False)
print u"Default Mode:", u"/ ".join(seg_list)  # default (accurate) mode

seg_list = jieba.cut("他来到了网易杭研大厦")
print ", ".join(seg_list)
seg_list = jieba.cut(u"他来到了网易杭研大厦")
print u", ".join(seg_list)

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search engine mode
print ", ".join(seg_list)
seg_list = jieba.cut_for_search(u"小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search engine mode
print u", ".join(seg_list)
@ -6,7 +6,7 @@ import jieba.posseg as pseg
|
||||
def cuttest(test_sent):
|
||||
result = pseg.cut(test_sent)
|
||||
for w in result:
|
||||
print w.word, "/", w.flag, ", ",
|
||||
print w.word, "/", w.flag, ", ",
|
||||
print ""
|
||||
|
||||
|
||||
@ -95,4 +95,4 @@ if __name__ == "__main__":
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||
cuttest('枪杆子中出政权')
|
||||
cuttest('枪杆子中出政权')
|
||||
|
@ -14,7 +14,7 @@ for w in words:
|
||||
result = pseg.cut(test_sent)
|
||||
|
||||
for w in result:
|
||||
print w.word, "/", w.flag, ", ",
|
||||
print w.word, "/", w.flag, ", ",
|
||||
|
||||
print "\n========"
|
||||
|
||||
|
@ -5,7 +5,7 @@ from whoosh.index import create_in,open_dir
|
||||
from whoosh.fields import *
|
||||
from whoosh.qparser import QueryParser
|
||||
|
||||
from jieba.analyse import ChineseAnalyzer
|
||||
from jieba.analyse import ChineseAnalyzer
|
||||
|
||||
analyzer = ChineseAnalyzer()
|
||||
|
||||
@ -18,31 +18,31 @@ ix = create_in("tmp", schema) # for create new index
|
||||
writer = ix.writer()
|
||||
|
||||
writer.add_document(
|
||||
title=u"document1",
|
||||
title=u"document1",
|
||||
path=u"/a",
|
||||
content=u"This is the first document we’ve added!"
|
||||
)
|
||||
|
||||
writer.add_document(
|
||||
title=u"document2",
|
||||
title=u"document2",
|
||||
path=u"/b",
|
||||
content=u"The second one 你 中文测试中文 is even more interesting! 吃水果"
|
||||
)
|
||||
|
||||
writer.add_document(
|
||||
title=u"document3",
|
||||
title=u"document3",
|
||||
path=u"/c",
|
||||
content=u"买水果然后来世博园。"
|
||||
)
|
||||
|
||||
writer.add_document(
|
||||
title=u"document4",
|
||||
title=u"document4",
|
||||
path=u"/c",
|
||||
content=u"工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
|
||||
)
|
||||
|
||||
writer.add_document(
|
||||
title=u"document4",
|
||||
title=u"document4",
|
||||
path=u"/c",
|
||||
content=u"咱俩交换一下吧。"
|
||||
)
|
||||
@ -55,7 +55,7 @@ for keyword in (u"水果世博园",u"你",u"first",u"中文",u"交换机",u"交
|
||||
print "result of ",keyword
|
||||
q = parser.parse(keyword)
|
||||
results = searcher.search(q)
|
||||
for hit in results:
|
||||
for hit in results:
|
||||
print hit.highlights("content")
|
||||
print "="*10
|
||||
|
||||
|
@ -6,7 +6,7 @@ from whoosh.index import create_in
|
||||
from whoosh.fields import *
|
||||
from whoosh.qparser import QueryParser
|
||||
|
||||
from jieba.analyse import ChineseAnalyzer
|
||||
from jieba.analyse import ChineseAnalyzer
|
||||
|
||||
analyzer = ChineseAnalyzer()
|
||||
|
||||
@ -23,7 +23,7 @@ with open(file_name,"rb") as inf:
|
||||
for line in inf:
|
||||
i+=1
|
||||
writer.add_document(
|
||||
title=u"line"+str(i),
|
||||
title=u"line"+str(i),
|
||||
path=u"/a",
|
||||
content=line.decode('gbk','ignore')
|
||||
)
|
||||
@ -36,6 +36,6 @@ for keyword in (u"水果小姐",u"你",u"first",u"中文",u"交换机",u"交换"
|
||||
print "result of ",keyword
|
||||
q = parser.parse(keyword)
|
||||
results = searcher.search(q)
|
||||
for hit in results:
|
||||
for hit in results:
|
||||
print hit.highlights("content")
|
||||
print "="*10
|
||||
|