Mirror of https://github.com/fxsjy/jieba.git
commit 56e8336af1
@@ -16,14 +16,13 @@ import logging
 
 DICTIONARY = "dict.txt"
 DICT_LOCK = threading.RLock()
-trie = None # to be initialized
+pfdict = None # to be initialized
 FREQ = {}
 min_freq = 0.0
 total = 0.0
 user_word_tag_tab = {}
 initialized = False
 
-
 log_console = logging.StreamHandler(sys.stderr)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -33,85 +32,80 @@ def setLogLevel(log_level):
     global logger
     logger.setLevel(log_level)
 
-def gen_trie(f_name):
+def gen_pfdict(f_name):
     lfreq = {}
-    trie = {}
+    pfdict = set()
     ltotal = 0.0
     with open(f_name, 'rb') as f:
         lineno = 0
         for line in f.read().rstrip().decode('utf-8').split('\n'):
             lineno += 1
             try:
-                word,freq,_ = line.split(' ')
+                word,freq = line.split(' ')[:2]
                 freq = float(freq)
                 lfreq[word] = freq
                 ltotal += freq
-                p = trie
-                for c in word:
-                    if c not in p:
-                        p[c] ={}
-                    p = p[c]
-                p['']='' #ending flag
+                for ch in range(len(word)):
+                    pfdict.add(word[:ch+1])
             except ValueError as e:
                 logger.debug('%s at line %s %s' % (f_name, lineno, line))
                 raise e
-    return trie, lfreq,ltotal
+    return pfdict, lfreq, ltotal
 
 def initialize(*args):
-    global trie, FREQ, total, min_freq, initialized
-    if len(args)==0:
+    global pfdict, FREQ, total, min_freq, initialized
+    if not args:
         dictionary = DICTIONARY
     else:
         dictionary = args[0]
     with DICT_LOCK:
         if initialized:
             return
-        if trie:
-            del trie
-            trie = None
+        if pfdict:
+            del pfdict
+            pfdict = None
         _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 
         abs_path = os.path.join(_curpath,dictionary)
-        logger.debug("Building Trie..., from %s" % abs_path)
+        logger.debug("Building prefix dict from %s ..." % abs_path)
         t1 = time.time()
-        if abs_path == os.path.join(_curpath,"dict.txt"): #defautl dictionary
+        if abs_path == os.path.join(_curpath, "dict.txt"): #default dictionary
             cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
-        else: #customer dictionary
-            cache_file = os.path.join(tempfile.gettempdir(),"jieba.user."+str(hash(abs_path))+".cache")
+        else: #custom dictionary
+            cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))
 
         load_from_cache_fail = True
         if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
-            logger.debug("loading model from cache %s" % cache_file)
+            logger.debug("Loading model from cache %s" % cache_file)
             try:
                 with open(cache_file, 'rb') as cf:
-                    trie,FREQ,total,min_freq = marshal.load(cf)
-                load_from_cache_fail = False
+                    pfdict,FREQ,total,min_freq = marshal.load(cf)
+                # prevent conflict with old version
+                load_from_cache_fail = not isinstance(pfdict, set)
             except:
                 load_from_cache_fail = True
 
         if load_from_cache_fail:
-            trie,FREQ,total = gen_trie(abs_path)
+            pfdict,FREQ,total = gen_pfdict(abs_path)
             FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.items()]) #normalize
             min_freq = min(FREQ.values())
-            logger.debug("dumping model to file cache %s" % cache_file)
+            logger.debug("Dumping model to file cache %s" % cache_file)
             try:
                 tmp_suffix = "."+str(random.random())
                 with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
-                    marshal.dump((trie,FREQ,total,min_freq),temp_cache_file)
+                    marshal.dump((pfdict,FREQ,total,min_freq), temp_cache_file)
                 if os.name == 'nt':
-                    import shutil
-                    replace_file = shutil.move
+                    from shutil import move as replace_file
                 else:
                     replace_file = os.rename
                 replace_file(cache_file + tmp_suffix, cache_file)
             except:
-                logger.error("dump cache file failed.")
-                logger.exception("")
+                logger.exception("Dump cache file failed.")
 
         initialized = True
 
-        logger.debug("loading model cost %s seconds." % (time.time() - t1))
-        logger.debug("Trie has been built succesfully.")
+        logger.debug("Loading model cost %s seconds." % (time.time() - t1))
+        logger.debug("Prefix dict has been built succesfully.")
 
 
 def require_initialized(fn):
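
The hunk above replaces the character trie with a flat set of word prefixes. As a minimal sketch of the idea gen_pfdict implements, using a tiny in-memory sample instead of dict.txt (the words and frequencies below are hypothetical):

    # Every prefix of every dictionary word goes into the set, so a later
    # lookup can tell in O(1) whether a fragment can still grow into a word.
    sample_lines = ["北京 100", "北京大学 50", "大学 80"]   # hypothetical entries

    lfreq, pfdict, ltotal = {}, set(), 0.0
    for line in sample_lines:
        word, freq = line.split(' ')[:2]
        freq = float(freq)
        lfreq[word] = freq
        ltotal += freq
        for ch in range(len(word)):
            pfdict.add(word[:ch+1])

    print(sorted(pfdict))   # prefixes: 北, 北京, 北京大, 北京大学, 大, 大学
    print(lfreq, ltotal)
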
@@ -151,30 +145,21 @@ def calc(sentence,DAG,idx,route):
 
 @require_initialized
 def get_DAG(sentence):
-    N = len(sentence)
-    i,j=0,0
-    p = trie
+    global pfdict, FREQ
     DAG = {}
-    while i<N:
-        c = sentence[j]
-        if c in p:
-            p = p[c]
-            if '' in p:
-                if i not in DAG:
-                    DAG[i]=[]
-                DAG[i].append(j)
-            j+=1
-            if j>=N:
-                i += 1
-                j=i
-                p=trie
-        else:
-            p = trie
-            i+=1
-            j=i
-    for i in range(len(sentence)):
-        if i not in DAG:
-            DAG[i] =[i]
+    N = len(sentence)
+    for k in range(N):
+        tmplist = []
+        i = k
+        frag = sentence[k]
+        while i < N and frag in pfdict:
+            if frag in FREQ:
+                tmplist.append(i)
+            i += 1
+            frag = sentence[k:i+1]
+        if not tmplist:
+            tmplist.append(k)
+        DAG[k] = tmplist
     return DAG
 
 def __cut_DAG_NO_HMM(sentence):
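
For clarity, a self-contained sketch of what the rewritten loop produces, with hypothetical data rather than the real FREQ/pfdict built from dict.txt: for each start position k it records every end position i such that sentence[k:i+1] is a known word, falling back to the single character when nothing matches.

    # Standalone copy of the new get_DAG loop, driven by toy data.
    FREQ = {"北京": -1.0, "北京大学": -2.0, "大学": -1.5}       # hypothetical log-frequencies
    pfdict = {w[:i+1] for w in FREQ for i in range(len(w))}    # all prefixes of all words

    def get_DAG(sentence):
        DAG = {}
        N = len(sentence)
        for k in range(N):
            tmplist = []
            i = k
            frag = sentence[k]
            while i < N and frag in pfdict:
                if frag in FREQ:
                    tmplist.append(i)
                i += 1
                frag = sentence[k:i+1]
            if not tmplist:
                tmplist.append(k)
            DAG[k] = tmplist
        return DAG

    print(get_DAG("北京大学"))   # {0: [1, 3], 1: [1], 2: [3], 3: [3]}
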
@@ -192,12 +177,12 @@ def __cut_DAG_NO_HMM(sentence):
             buf += l_word
             x = y
         else:
-            if len(buf)>0:
+            if buf:
                 yield buf
                 buf = ''
             yield l_word
             x = y
-    if len(buf)>0:
+    if buf:
         yield buf
         buf = ''
 
@@ -214,14 +199,14 @@ def __cut_DAG(sentence):
         if y-x == 1:
             buf += l_word
         else:
-            if len(buf)>0:
+            if buf:
                 if len(buf) == 1:
                     yield buf
                     buf = ''
                 else:
                     if (buf not in FREQ):
-                        regognized = finalseg.cut(buf)
-                        for t in regognized:
+                        recognized = finalseg.cut(buf)
+                        for t in recognized:
                             yield t
                     else:
                         for elem in buf:
@@ -230,13 +215,12 @@ def __cut_DAG(sentence):
             yield l_word
             x = y
 
-    if len(buf)>0:
+    if buf:
         if len(buf) == 1:
             yield buf
-        else:
-            if (buf not in FREQ):
-                regognized = finalseg.cut(buf)
-                for t in regognized:
-                    yield t
-            else:
-                for elem in buf:
+        elif (buf not in FREQ):
+            recognized = finalseg.cut(buf)
+            for t in recognized:
+                yield t
+        else:
+            for elem in buf:
@@ -246,31 +230,32 @@ def cut(sentence,cut_all=False,HMM=True):
     '''The main function that segments an entire sentence that contains
     Chinese characters into seperated words.
     Parameter:
-        - sentence: The String to be segmented
-        - cut_all: Model. True means full pattern, false means accurate pattern.
-        - HMM: Whether use Hidden Markov Model.
+        - sentence: The str to be segmented.
+        - cut_all: Model type. True for full pattern, False for accurate pattern.
+        - HMM: Whether to use the Hidden Markov Model.
     '''
     if isinstance(sentence, bytes):
         try:
             sentence = sentence.decode('utf-8')
         except UnicodeDecodeError:
             sentence = sentence.decode('gbk', 'ignore')
-    '''
-    \\u4E00-\\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
-    \r\n|\s : whitespace characters. Will not be Handled.
-    '''
-    re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)")
+
+    # \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
+    # \r\n|\s : whitespace characters. Will not be handled.
+
     if cut_all:
-        re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]")
+        re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)", re.U), re.compile(r"[^a-zA-Z0-9+#\n]", re.U)
+    else:
+        re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(r"(\r\n|\s)", re.U)
     blocks = re_han.split(sentence)
-    if HMM:
+    if cut_all:
+        cut_block = __cut_all
+    elif HMM:
         cut_block = __cut_DAG
     else:
         cut_block = __cut_DAG_NO_HMM
-    if cut_all:
-        cut_block = __cut_all
     for blk in blocks:
-        if len(blk)==0:
+        if not blk:
             continue
         if re_han.match(blk):
             for word in cut_block(blk):
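
A minimal usage sketch of the three cutting modes dispatched above (actual output depends on the bundled dictionary, so it is not asserted here):

    import jieba

    sentence = "我来到北京清华大学"                           # example input
    print(" / ".join(jieba.cut(sentence)))                   # accurate mode, HMM on (default)
    print(" / ".join(jieba.cut(sentence, HMM=False)))        # accurate mode, HMM off
    print(" / ".join(jieba.cut(sentence, cut_all=True)))     # full mode
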
@@ -312,37 +297,30 @@ def load_userdict(f):
         ...
     Word type may be ignored
     '''
-    global trie,total,FREQ
     if isinstance(f, str):
         f = open(f, 'rb')
     content = f.read().decode('utf-8')
     line_no = 0
     for line in content.split("\n"):
         line_no += 1
-        if line.rstrip()=='': continue
+        if not line.rstrip():
+            continue
         tup = line.split(" ")
         word, freq = tup[0], tup[1]
-        if freq.isdigit() is False: continue
+        if freq.isdigit() is False:
+            continue
         if line_no == 1:
             word = word.replace('\ufeff',"") #remove bom flag if it exists
-        if len(tup)==3:
-            add_word(word, freq, tup[2])
-        else:
-            add_word(word, freq)
+        add_word(*tup)
 
 @require_initialized
 def add_word(word, freq, tag=None):
-    global FREQ, trie, total, user_word_tag_tab
-    freq = float(freq)
-    FREQ[word] = log(freq / total)
+    global FREQ, pfdict, total, user_word_tag_tab
+    FREQ[word] = log(float(freq) / total)
     if tag is not None:
         user_word_tag_tab[word] = tag.strip()
-    p = trie
-    for c in word:
-        if c not in p:
-            p[c] = {}
-        p = p[c]
-    p[''] = '' # ending flag
+    for ch in range(len(word)):
+        pfdict.add(word[:ch+1])
 
 __ref_cut = cut
 __ref_cut_for_search = cut_for_search
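
A usage sketch for the two entry points above; the dictionary file format, one 'word freq [tag]' entry per line, follows the load_userdict docstring, and the sample words and path are made up:

    import jieba

    # Register a single word at runtime; its weight becomes log(freq / total).
    jieba.add_word("云计算", 5, "n")

    # Or load a whole user dictionary (path and contents are hypothetical).
    with open("userdict.txt", "w", encoding="utf-8") as f:
        f.write("创新办 3 i\n云计算 5\n")
    jieba.load_userdict("userdict.txt")
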
@@ -362,10 +340,8 @@ def enable_parallel(processnum=None):
     global pool, cut, cut_for_search
     if os.name == 'nt':
         raise Exception("jieba: parallel mode only supports posix system")
-    if sys.version_info[0]==2 and sys.version_info[1]<6:
-        raise Exception("jieba: the parallel feature needs Python version>2.5 ")
     from multiprocessing import Pool, cpu_count
-    if processnum==None:
+    if processnum is None:
         processnum = cpu_count()
     pool = Pool(processnum)
 
@@ -373,8 +349,7 @@ def enable_parallel(processnum=None):
         parts = re.compile('([\r\n]+)').split(sentence)
         if cut_all:
             result = pool.map(__lcut_all, parts)
-        else:
-            if HMM:
-                result = pool.map(__lcut, parts)
-            else:
-                result = pool.map(__lcut_no_hmm, parts)
+        elif HMM:
+            result = pool.map(__lcut, parts)
+        else:
+            result = pool.map(__lcut_no_hmm, parts)
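
A short usage sketch of the parallel mode whose dispatch is simplified above (POSIX only, per the check in the previous hunk; the worker count is an arbitrary example):

    import jieba

    jieba.enable_parallel(4)      # replaces jieba.cut with a pool-backed version
    # input is split on line breaks and the parts are mapped to the worker pool
    print(" / ".join(jieba.cut("第一行\n第二行")))
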
@@ -415,7 +390,12 @@ def get_abs_path_dict():
     return abs_path
 
 def tokenize(unicode_sentence, mode="default", HMM=True):
-    #mode ("default" or "search")
+    """Tokenize a sentence and yields tuples of (word, start, end)
+    Parameter:
+        - sentence: the str to be segmented.
+        - mode: "default" or "search", "search" is for finer segmentation.
+        - HMM: whether to use the Hidden Markov Model.
+    """
     if not isinstance(unicode_sentence, str):
         raise Exception("jieba: the input parameter should be str.")
     start = 0
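
A usage sketch for the tokenize API documented above; each yielded tuple is (word, start, end), with offsets into the input string:

    import jieba

    for word, start, end in jieba.tokenize("永和服装饰品有限公司"):
        print(word, start, end)

    # mode="search" additionally yields shorter words found inside long ones
    for word, start, end in jieba.tokenize("永和服装饰品有限公司", mode="search"):
        print(word, start, end)
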
@@ -439,4 +419,3 @@ def tokenize(unicode_sentence,mode="default",HMM=True):
                         yield (gram3, start+i, start+i+3)
             yield (w, start, start+width)
             start += width
-
new file: jieba/__main__.py (36 lines)
@@ -0,0 +1,36 @@
+"""Jieba command line interface."""
+import sys
+import jieba
+from argparse import ArgumentParser
+
+parser = ArgumentParser(usage="%s -m jieba [options] filename" % sys.executable, description="Jieba command line interface.", epilog="If no filename specified, use STDIN instead.")
+parser.add_argument("-d", "--delimiter", metavar="DELIM", default=' / ',
+                    nargs='?', const=' ',
+                    help="use DELIM instead of ' / ' for word delimiter; use a space if it is without DELIM")
+parser.add_argument("-a", "--cut-all",
+                    action="store_true", dest="cutall", default=False,
+                    help="full pattern cutting")
+parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
+                    default=True, help="don't use the Hidden Markov Model")
+parser.add_argument("-q", "--quiet", action="store_true", default=False,
+                    help="don't print loading messages to stderr")
+parser.add_argument("-V", '--version', action='version', version="Jieba " + jieba.__version__)
+parser.add_argument("filename", nargs='?', help="input file")
+
+args = parser.parse_args()
+
+if args.quiet:
+    jieba.setLogLevel(60)
+delim = str(args.delimiter)
+cutall = args.cutall
+hmm = args.hmm
+fp = open(args.filename, 'r') if args.filename else sys.stdin
+
+jieba.initialize()
+ln = fp.readline()
+while ln:
+    l = ln.rstrip('\r\n')
+    print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))
+    ln = fp.readline()
+
+fp.close()
@@ -9,28 +9,44 @@ except ImportError:
 _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 abs_path = os.path.join(_curpath, "idf.txt")
 
-IDF_DICTIONARY = abs_path
-STOP_WORDS = set([
-    "the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
-])
+STOP_WORDS = set((
+    "the","of","is","and","to","in","that","we","for","an","are",
+    "by","be","as","on","with","can","if","from","which","you","it",
+    "this","then","at","have","all","not","one","has","or","that"
+))
 
-def set_idf_path(idf_path):
-    global IDF_DICTIONARY
-    abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path ) )
-    if not os.path.exists(abs_path):
-        raise Exception("jieba: path does not exist:" + abs_path)
-    IDF_DICTIONARY = abs_path
-    return
+class IDFLoader:
+    def __init__(self):
+        self.path = ""
+        self.idf_freq = {}
+        self.median_idf = 0.0
 
-def get_idf(abs_path):
-    content = open(abs_path,'rb').read().decode('utf-8')
-    idf_freq = {}
-    lines = content.split('\n')
-    for line in lines:
-        word, freq = line.split(' ')
-        idf_freq[word] = float(freq)
-    median_idf = sorted(idf_freq.values())[len(idf_freq)//2]
-    return idf_freq, median_idf
+    def set_new_path(self, new_idf_path):
+        if self.path != new_idf_path:
+            content = open(new_idf_path, 'r', encoding='utf-8').read()
+            idf_freq = {}
+            lines = content.split('\n')
+            if lines and not lines[-1]:
+                lines.pop(-1)
+            for line in lines:
+                word, freq = line.split(' ')
+                idf_freq[word] = float(freq)
+            median_idf = sorted(idf_freq.values())[len(idf_freq)//2]
+            self.idf_freq = idf_freq
+            self.median_idf = median_idf
+            self.path = new_idf_path
+
+    def get_idf(self):
+        return self.idf_freq, self.median_idf
+
+idf_loader = IDFLoader()
+idf_loader.set_new_path(abs_path)
+
+def set_idf_path(idf_path):
+    new_abs_path = os.path.normpath(os.path.join(os.getcwd(), idf_path))
+    if not os.path.exists(new_abs_path):
+        raise Exception("jieba: path does not exist: " + new_abs_path)
+    idf_loader.set_new_path(new_abs_path)
 
 def set_stop_words(stop_words_path):
     global STOP_WORDS
@@ -41,19 +57,19 @@ def set_stop_words(stop_words_path):
     lines = content.split('\n')
     for line in lines:
         STOP_WORDS.add(line)
-    return
 
 def extract_tags(sentence, topK=20):
-    global IDF_DICTIONARY
     global STOP_WORDS
 
-    idf_freq, median_idf = get_idf(IDF_DICTIONARY)
+    idf_freq, median_idf = idf_loader.get_idf()
 
     words = jieba.cut(sentence)
     freq = {}
     for w in words:
-        if len(w.strip())<2: continue
-        if w.lower() in STOP_WORDS: continue
+        if len(w.strip()) < 2:
+            continue
+        if w.lower() in STOP_WORDS:
+            continue
         freq[w] = freq.get(w, 0.0) + 1.0
     total = sum(freq.values())
     freq = [(k,v/total) for k,v in freq.items()]
@@ -19,10 +19,7 @@ class ChineseTokenizer(Tokenizer):
         words = jieba.tokenize(text, mode="search")
         token = Token()
         for (w,start_pos,stop_pos) in words:
-            if not accepted_chars.match(w):
-                if len(w)>1:
-                    pass
-                else:
-                    continue
+            if not accepted_chars.match(w) and len(w)<=1:
+                continue
             token.original = token.text = w
             token.pos = start_pos
@@ -31,5 +28,6 @@ class ChineseTokenizer(Tokenizer):
             yield token
 
 def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
-    return ChineseTokenizer() | LowercaseFilter() | StopFilter(stoplist=stoplist,minsize=minsize)\
-        |StemFilter(stemfn=stemfn, ignore=None,cachesize=cachesize)
+    return (ChineseTokenizer() | LowercaseFilter() |
+            StopFilter(stoplist=stoplist,minsize=minsize) |
+            StemFilter(stemfn=stemfn, ignore=None,cachesize=cachesize))
@@ -86,10 +86,10 @@ def __cut(sentence):
         yield sentence[next:]
 
 def cut(sentence):
-    if not ( type(sentence) is str):
+    if not isinstance(sentence, str):
         try:
             sentence = sentence.decode('utf-8')
-        except:
+        except UnicodeDecodeError:
             sentence = sentence.decode('gbk', 'ignore')
     re_han, re_skip = re.compile(r"([\u4E00-\u9FA5]+)"), re.compile(r"(\d+\.\d+|[a-zA-Z0-9]+)")
     blocks = re_han.split(sentence)
@@ -100,5 +100,5 @@ def cut(sentence):
         else:
             tmp = re_skip.split(blk)
             for x in tmp:
-                if x!="":
+                if x:
                     yield x
@@ -20,7 +20,7 @@ def load_model(f_name,isJython=True):
     with open(f_name, "rb") as f:
         for line in open(f_name,"rb"):
             line = line.strip()
-            if line=="":continue
+            if not line: continue
             line = line.decode("utf-8")
             word, _, tag = line.split(" ")
             result[word] = tag
@@ -78,7 +78,7 @@ class pair(object):
         self.flag = flag
 
     def __unicode__(self):
-        return self.word+"/"+self.flag
+        return '%s/%s' % (self.word, self.flag)
 
     def __repr__(self):
         return self.__str__()
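
The pair objects above expose a word and its POS flag; assuming this is the jieba.posseg module, a typical call looks like:

    import jieba.posseg as pseg

    for w in pseg.cut("我爱北京天安门"):
        print(w.word, w.flag)    # each item is a pair(word, flag)
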
@@ -117,7 +117,7 @@ def __cut_detail(sentence):
         else:
             tmp = re_skip.split(blk)
             for x in tmp:
-                if x!="":
+                if x:
                     if re_num.match(x):
                         yield pair(x, 'm')
                     elif re_eng.match(x):
@@ -140,12 +140,12 @@ def __cut_DAG_NO_HMM(sentence):
             buf += l_word
             x = y
         else:
-            if len(buf)>0:
+            if buf:
                 yield pair(buf,'eng')
                 buf = ''
             yield pair(l_word, word_tag_tab.get(l_word, 'x'))
             x = y
-    if len(buf)>0:
+    if buf:
         yield pair(buf,'eng')
         buf = ''
 
@@ -164,14 +164,14 @@ def __cut_DAG(sentence):
         if y-x == 1:
             buf += l_word
         else:
-            if len(buf)>0:
+            if buf:
                 if len(buf) == 1:
                     yield pair(buf, word_tag_tab.get(buf, 'x'))
                     buf = ''
                 else:
                     if (buf not in jieba.FREQ):
-                        regognized = __cut_detail(buf)
-                        for t in regognized:
+                        recognized = __cut_detail(buf)
+                        for t in recognized:
                             yield t
                     else:
                         for elem in buf:
@@ -180,13 +180,12 @@ def __cut_DAG(sentence):
             yield pair(l_word, word_tag_tab.get(l_word, 'x'))
             x = y
 
-    if len(buf)>0:
+    if buf:
         if len(buf) == 1:
             yield pair(buf, word_tag_tab.get(buf, 'x'))
-        else:
-            if (buf not in jieba.FREQ):
-                regognized = __cut_detail(buf)
-                for t in regognized:
-                    yield t
-            else:
-                for elem in buf:
+        elif (buf not in jieba.FREQ):
+            recognized = __cut_detail(buf)
+            for t in recognized:
+                yield t
+        else:
+            for elem in buf:
@@ -196,7 +195,7 @@ def __cut_internal(sentence,HMM=True):
     if not isinstance(sentence, str):
         try:
             sentence = sentence.decode('utf-8')
-        except:
+        except UnicodeDecodeError:
             sentence = sentence.decode('gbk', 'ignore')
     re_han, re_skip = re.compile(r"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(r"(\r\n|\s)")
     re_eng, re_num = re.compile(r"[a-zA-Z0-9]+"), re.compile(r"[\.0-9]+")
@@ -232,7 +231,7 @@ def __lcut_internal_no_hmm(sentence):
 
 @makesure_userdict_loaded
 def cut(sentence, HMM=True):
-    if (not hasattr(jieba,'pool')) or (jieba.pool==None):
+    if (not hasattr(jieba, 'pool')) or (jieba.pool is None):
         for w in __cut_internal(sentence, HMM=HMM):
             yield w
     else:
|
@ -21,21 +21,20 @@ def viterbi(obs, states, start_p, trans_p, emit_p):
|
|||||||
prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]
|
prev_states = [x for x in mem_path[t-1].keys() if len(trans_p[x]) > 0]
|
||||||
|
|
||||||
prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
|
prev_states_expect_next = set((y for x in prev_states for y in trans_p[x].keys()))
|
||||||
obs_states = states.get(obs[t],all_states)
|
obs_states = set(states.get(obs[t], all_states)) & prev_states_expect_next
|
||||||
obs_states = set(obs_states) & set(prev_states_expect_next)
|
|
||||||
|
|
||||||
if len(obs_states)==0: obs_states = prev_states_expect_next
|
if not obs_states:
|
||||||
if len(obs_states)==0: obs_states = all_states
|
obs_states = prev_states_expect_next if prev_states_expect_next else all_states
|
||||||
|
|
||||||
for y in obs_states:
|
for y in obs_states:
|
||||||
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in prev_states])
|
prob, state = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_INF) + emit_p[y].get(obs[t],MIN_FLOAT), y0) for y0 in prev_states])
|
||||||
V[t][y] = prob
|
V[t][y] = prob
|
||||||
mem_path[t][y] = state
|
mem_path[t][y] = state
|
||||||
|
|
||||||
last = [(V[-1][y], y) for y in mem_path[-1].keys()]
|
last = [(V[-1][y], y) for y in mem_path[-1].keys()]
|
||||||
#if len(last)==0:
|
#if len(last)==0:
|
||||||
#print obs
|
#print obs
|
||||||
(prob, state) = max(last)
|
prob, state = max(last)
|
||||||
|
|
||||||
route = [None] * len(obs)
|
route = [None] * len(obs)
|
||||||
i = len(obs) - 1
|
i = len(obs) - 1
|
||||||
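
The recurrence being tidied here is V[t][y] = max over y0 of (V[t-1][y0] + trans_p[y0][y] + emit_p[y][obs[t]]), all in log space. A toy single step with two invented states (probabilities and the observation are made up, and the missing-key fallback is simplified to one constant):

    from math import log

    MIN_FLOAT = -3.14e100                      # simplified stand-in for the fallbacks above
    states = ('A', 'B')                        # hypothetical tag set
    V_prev = {'A': log(0.6), 'B': log(0.4)}    # scores from the previous position
    trans_p = {'A': {'A': log(0.7), 'B': log(0.3)},
               'B': {'A': log(0.4), 'B': log(0.6)}}
    emit_p = {'A': {'x': log(0.5)}, 'B': {'x': log(0.1)}}
    obs_t = 'x'

    V_t, backpointer = {}, {}
    for y in states:
        prob, state = max((V_prev[y0] + trans_p[y0].get(y, MIN_FLOAT) +
                           emit_p[y].get(obs_t, MIN_FLOAT), y0) for y0 in states)
        V_t[y] = prob
        backpointer[y] = state                 # plays the role of mem_path[t][y]

    print(V_t, backpointer)
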