mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
PEP8
This commit is contained in:
parent
ca97b19951
commit
b62f052927
@ -23,313 +23,313 @@ user_word_tag_tab={}
|
||||
initialized = False
|
||||
|
||||
def gen_trie(f_name):
|
||||
lfreq = {}
|
||||
trie = {}
|
||||
ltotal = 0.0
|
||||
with open(f_name, 'rb') as f:
|
||||
lineno = 0
|
||||
for line in f.read().rstrip().decode('utf-8').split('\n'):
|
||||
lineno += 1
|
||||
try:
|
||||
word,freq,_ = line.split(' ')
|
||||
freq = float(freq)
|
||||
lfreq[word] = freq
|
||||
ltotal+=freq
|
||||
p = trie
|
||||
for c in word:
|
||||
if not c in p:
|
||||
p[c] ={}
|
||||
p = p[c]
|
||||
p['']='' #ending flag
|
||||
except ValueError as e:
|
||||
print(f_name,' at line',lineno,line, file=sys.stderr)
|
||||
raise e
|
||||
return trie, lfreq,ltotal
|
||||
lfreq = {}
|
||||
trie = {}
|
||||
ltotal = 0.0
|
||||
with open(f_name, 'rb') as f:
|
||||
lineno = 0
|
||||
for line in f.read().rstrip().decode('utf-8').split('\n'):
|
||||
lineno += 1
|
||||
try:
|
||||
word,freq,_ = line.split(' ')
|
||||
freq = float(freq)
|
||||
lfreq[word] = freq
|
||||
ltotal+=freq
|
||||
p = trie
|
||||
for c in word:
|
||||
if not c in p:
|
||||
p[c] ={}
|
||||
p = p[c]
|
||||
p['']='' #ending flag
|
||||
except ValueError as e:
|
||||
print(f_name,' at line',lineno,line, file=sys.stderr)
|
||||
raise e
|
||||
return trie, lfreq,ltotal
|
||||
|
||||
def initialize(*args):
|
||||
global trie, FREQ, total, min_freq, initialized
|
||||
if len(args)==0:
|
||||
dictionary = DICTIONARY
|
||||
else:
|
||||
dictionary = args[0]
|
||||
with DICT_LOCK:
|
||||
if initialized:
|
||||
return
|
||||
if trie:
|
||||
del trie
|
||||
trie = None
|
||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||
global trie, FREQ, total, min_freq, initialized
|
||||
if len(args)==0:
|
||||
dictionary = DICTIONARY
|
||||
else:
|
||||
dictionary = args[0]
|
||||
with DICT_LOCK:
|
||||
if initialized:
|
||||
return
|
||||
if trie:
|
||||
del trie
|
||||
trie = None
|
||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||
|
||||
abs_path = os.path.join(_curpath,dictionary)
|
||||
print("Building Trie..., from " + abs_path, file=sys.stderr)
|
||||
t1 = time.time()
|
||||
if abs_path == os.path.join(_curpath,"dict.txt"): #defautl dictionary
|
||||
cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache")
|
||||
else: #customer dictionary
|
||||
cache_file = os.path.join(tempfile.gettempdir(),"jieba.user."+str(hash(abs_path))+".cache")
|
||||
abs_path = os.path.join(_curpath,dictionary)
|
||||
print("Building Trie..., from " + abs_path, file=sys.stderr)
|
||||
t1 = time.time()
|
||||
if abs_path == os.path.join(_curpath,"dict.txt"): #defautl dictionary
|
||||
cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache")
|
||||
else: #customer dictionary
|
||||
cache_file = os.path.join(tempfile.gettempdir(),"jieba.user."+str(hash(abs_path))+".cache")
|
||||
|
||||
load_from_cache_fail = True
|
||||
if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(abs_path):
|
||||
print("loading model from cache " + cache_file, file=sys.stderr)
|
||||
try:
|
||||
trie,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
|
||||
load_from_cache_fail = False
|
||||
except:
|
||||
load_from_cache_fail = True
|
||||
load_from_cache_fail = True
|
||||
if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(abs_path):
|
||||
print("loading model from cache " + cache_file, file=sys.stderr)
|
||||
try:
|
||||
trie,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
|
||||
load_from_cache_fail = False
|
||||
except:
|
||||
load_from_cache_fail = True
|
||||
|
||||
if load_from_cache_fail:
|
||||
trie,FREQ,total = gen_trie(abs_path)
|
||||
FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.items()]) #normalize
|
||||
min_freq = min(FREQ.values())
|
||||
print("dumping model to file cache " + cache_file, file=sys.stderr)
|
||||
tmp_suffix = "."+str(random.random())
|
||||
with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
|
||||
marshal.dump((trie,FREQ,total,min_freq),temp_cache_file)
|
||||
if os.name=='nt':
|
||||
import shutil
|
||||
replace_file = shutil.move
|
||||
else:
|
||||
replace_file = os.rename
|
||||
replace_file(cache_file+tmp_suffix,cache_file)
|
||||
if load_from_cache_fail:
|
||||
trie,FREQ,total = gen_trie(abs_path)
|
||||
FREQ = dict([(k,log(float(v)/total)) for k,v in FREQ.items()]) #normalize
|
||||
min_freq = min(FREQ.values())
|
||||
print("dumping model to file cache " + cache_file, file=sys.stderr)
|
||||
tmp_suffix = "."+str(random.random())
|
||||
with open(cache_file+tmp_suffix,'wb') as temp_cache_file:
|
||||
marshal.dump((trie,FREQ,total,min_freq),temp_cache_file)
|
||||
if os.name=='nt':
|
||||
import shutil
|
||||
replace_file = shutil.move
|
||||
else:
|
||||
replace_file = os.rename
|
||||
replace_file(cache_file+tmp_suffix,cache_file)
|
||||
|
||||
initialized = True
|
||||
initialized = True
|
||||
|
||||
print("loading model cost ", time.time() - t1, "seconds.",file=sys.stderr)
|
||||
print("Trie has been built succesfully.", file=sys.stderr)
|
||||
print("loading model cost ", time.time() - t1, "seconds.",file=sys.stderr)
|
||||
print("Trie has been built succesfully.", file=sys.stderr)
|
||||
|
||||
|
||||
def require_initialized(fn):
|
||||
global initialized,DICTIONARY
|
||||
global initialized,DICTIONARY
|
||||
|
||||
@wraps(fn)
|
||||
def wrapped(*args, **kwargs):
|
||||
if initialized:
|
||||
return fn(*args, **kwargs)
|
||||
else:
|
||||
initialize(DICTIONARY)
|
||||
return fn(*args, **kwargs)
|
||||
return wrapped
|
||||
@wraps(fn)
|
||||
def wrapped(*args, **kwargs):
|
||||
if initialized:
|
||||
return fn(*args, **kwargs)
|
||||
else:
|
||||
initialize(DICTIONARY)
|
||||
return fn(*args, **kwargs)
|
||||
return wrapped
|
||||
|
||||
def __cut_all(sentence):
|
||||
dag = get_DAG(sentence)
|
||||
old_j = -1
|
||||
for k,L in dag.items():
|
||||
if len(L)==1 and k>old_j:
|
||||
yield sentence[k:L[0]+1]
|
||||
old_j = L[0]
|
||||
else:
|
||||
for j in L:
|
||||
if j>k:
|
||||
yield sentence[k:j+1]
|
||||
old_j = j
|
||||
dag = get_DAG(sentence)
|
||||
old_j = -1
|
||||
for k,L in dag.items():
|
||||
if len(L)==1 and k>old_j:
|
||||
yield sentence[k:L[0]+1]
|
||||
old_j = L[0]
|
||||
else:
|
||||
for j in L:
|
||||
if j>k:
|
||||
yield sentence[k:j+1]
|
||||
old_j = j
|
||||
|
||||
|
||||
def calc(sentence,DAG,idx,route):
|
||||
N = len(sentence)
|
||||
route[N] = (0.0,'')
|
||||
for idx in range(N-1,-1,-1):
|
||||
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0],x ) for x in DAG[idx] ]
|
||||
route[idx] = max(candidates)
|
||||
N = len(sentence)
|
||||
route[N] = (0.0,'')
|
||||
for idx in range(N-1,-1,-1):
|
||||
candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) + route[x+1][0],x ) for x in DAG[idx] ]
|
||||
route[idx] = max(candidates)
|
||||
|
||||
@require_initialized
|
||||
def get_DAG(sentence):
|
||||
N = len(sentence)
|
||||
i,j=0,0
|
||||
p = trie
|
||||
DAG = {}
|
||||
while i<N:
|
||||
c = sentence[j]
|
||||
if c in p:
|
||||
p = p[c]
|
||||
if '' in p:
|
||||
if not i in DAG:
|
||||
DAG[i]=[]
|
||||
DAG[i].append(j)
|
||||
j+=1
|
||||
if j>=N:
|
||||
i+=1
|
||||
j=i
|
||||
p=trie
|
||||
else:
|
||||
p = trie
|
||||
i+=1
|
||||
j=i
|
||||
for i in range(len(sentence)):
|
||||
if not i in DAG:
|
||||
DAG[i] =[i]
|
||||
return DAG
|
||||
N = len(sentence)
|
||||
i,j=0,0
|
||||
p = trie
|
||||
DAG = {}
|
||||
while i<N:
|
||||
c = sentence[j]
|
||||
if c in p:
|
||||
p = p[c]
|
||||
if '' in p:
|
||||
if not i in DAG:
|
||||
DAG[i]=[]
|
||||
DAG[i].append(j)
|
||||
j+=1
|
||||
if j>=N:
|
||||
i+=1
|
||||
j=i
|
||||
p=trie
|
||||
else:
|
||||
p = trie
|
||||
i+=1
|
||||
j=i
|
||||
for i in range(len(sentence)):
|
||||
if not i in DAG:
|
||||
DAG[i] =[i]
|
||||
return DAG
|
||||
|
||||
|
||||
def __cut_DAG(sentence):
|
||||
DAG = get_DAG(sentence)
|
||||
route ={}
|
||||
calc(sentence,DAG,0,route=route)
|
||||
x = 0
|
||||
buf =''
|
||||
N = len(sentence)
|
||||
while x<N:
|
||||
y = route[x][1]+1
|
||||
l_word = sentence[x:y]
|
||||
if y-x==1:
|
||||
buf+= l_word
|
||||
else:
|
||||
if len(buf)>0:
|
||||
if len(buf)==1:
|
||||
yield buf
|
||||
buf=''
|
||||
else:
|
||||
if not (buf in FREQ):
|
||||
regognized = finalseg.cut(buf)
|
||||
for t in regognized:
|
||||
yield t
|
||||
else:
|
||||
for elem in buf:
|
||||
yield elem
|
||||
buf=''
|
||||
yield l_word
|
||||
x =y
|
||||
DAG = get_DAG(sentence)
|
||||
route ={}
|
||||
calc(sentence,DAG,0,route=route)
|
||||
x = 0
|
||||
buf =''
|
||||
N = len(sentence)
|
||||
while x<N:
|
||||
y = route[x][1]+1
|
||||
l_word = sentence[x:y]
|
||||
if y-x==1:
|
||||
buf+= l_word
|
||||
else:
|
||||
if len(buf)>0:
|
||||
if len(buf)==1:
|
||||
yield buf
|
||||
buf=''
|
||||
else:
|
||||
if not (buf in FREQ):
|
||||
regognized = finalseg.cut(buf)
|
||||
for t in regognized:
|
||||
yield t
|
||||
else:
|
||||
for elem in buf:
|
||||
yield elem
|
||||
buf=''
|
||||
yield l_word
|
||||
x =y
|
||||
|
||||
if len(buf)>0:
|
||||
if len(buf)==1:
|
||||
yield buf
|
||||
else:
|
||||
if not (buf in FREQ):
|
||||
regognized = finalseg.cut(buf)
|
||||
for t in regognized:
|
||||
yield t
|
||||
else:
|
||||
for elem in buf:
|
||||
yield elem
|
||||
if len(buf)>0:
|
||||
if len(buf)==1:
|
||||
yield buf
|
||||
else:
|
||||
if not (buf in FREQ):
|
||||
regognized = finalseg.cut(buf)
|
||||
for t in regognized:
|
||||
yield t
|
||||
else:
|
||||
for elem in buf:
|
||||
yield elem
|
||||
|
||||
def cut(sentence,cut_all=False):
|
||||
if( type(sentence) is bytes):
|
||||
try:
|
||||
sentence = sentence.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
if( type(sentence) is bytes):
|
||||
try:
|
||||
sentence = sentence.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
|
||||
|
||||
if cut_all:
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]")
|
||||
if cut_all:
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("[^a-zA-Z0-9+#\n]")
|
||||
|
||||
blocks = re_han.split(sentence)
|
||||
cut_block = __cut_DAG
|
||||
if cut_all:
|
||||
cut_block = __cut_all
|
||||
for blk in blocks:
|
||||
if re_han.match(blk):
|
||||
#pprint.pprint(__cut_DAG(blk))
|
||||
for word in cut_block(blk):
|
||||
yield word
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if re_skip.match(x):
|
||||
yield x
|
||||
elif not cut_all:
|
||||
for xx in x:
|
||||
yield xx
|
||||
else:
|
||||
yield x
|
||||
blocks = re_han.split(sentence)
|
||||
cut_block = __cut_DAG
|
||||
if cut_all:
|
||||
cut_block = __cut_all
|
||||
for blk in blocks:
|
||||
if re_han.match(blk):
|
||||
#pprint.pprint(__cut_DAG(blk))
|
||||
for word in cut_block(blk):
|
||||
yield word
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if re_skip.match(x):
|
||||
yield x
|
||||
elif not cut_all:
|
||||
for xx in x:
|
||||
yield xx
|
||||
else:
|
||||
yield x
|
||||
|
||||
def cut_for_search(sentence):
|
||||
words = cut(sentence)
|
||||
for w in words:
|
||||
if len(w)>2:
|
||||
for i in range(len(w)-1):
|
||||
gram2 = w[i:i+2]
|
||||
if gram2 in FREQ:
|
||||
yield gram2
|
||||
if len(w)>3:
|
||||
for i in range(len(w)-2):
|
||||
gram3 = w[i:i+3]
|
||||
if gram3 in FREQ:
|
||||
yield gram3
|
||||
yield w
|
||||
words = cut(sentence)
|
||||
for w in words:
|
||||
if len(w)>2:
|
||||
for i in range(len(w)-1):
|
||||
gram2 = w[i:i+2]
|
||||
if gram2 in FREQ:
|
||||
yield gram2
|
||||
if len(w)>3:
|
||||
for i in range(len(w)-2):
|
||||
gram3 = w[i:i+3]
|
||||
if gram3 in FREQ:
|
||||
yield gram3
|
||||
yield w
|
||||
|
||||
@require_initialized
|
||||
def load_userdict(f):
|
||||
global trie,total,FREQ
|
||||
if isinstance(f, (str, )):
|
||||
f = open(f, 'rb')
|
||||
content = f.read().decode('utf-8')
|
||||
line_no = 0
|
||||
for line in content.split("\n"):
|
||||
line_no+=1
|
||||
if line.rstrip()=='': continue
|
||||
tup =line.split(" ")
|
||||
word,freq = tup[0],tup[1]
|
||||
if line_no==1:
|
||||
word = word.replace('\ufeff',"") #remove bom flag if it exists
|
||||
if len(tup)==3:
|
||||
user_word_tag_tab[word]=tup[2].strip()
|
||||
freq = float(freq)
|
||||
FREQ[word] = log(freq / total)
|
||||
p = trie
|
||||
for c in word:
|
||||
if not c in p:
|
||||
p[c] ={}
|
||||
p = p[c]
|
||||
p['']='' #ending flag
|
||||
global trie,total,FREQ
|
||||
if isinstance(f, (str, )):
|
||||
f = open(f, 'rb')
|
||||
content = f.read().decode('utf-8')
|
||||
line_no = 0
|
||||
for line in content.split("\n"):
|
||||
line_no+=1
|
||||
if line.rstrip()=='': continue
|
||||
tup =line.split(" ")
|
||||
word,freq = tup[0],tup[1]
|
||||
if line_no==1:
|
||||
word = word.replace('\ufeff',"") #remove bom flag if it exists
|
||||
if len(tup)==3:
|
||||
user_word_tag_tab[word]=tup[2].strip()
|
||||
freq = float(freq)
|
||||
FREQ[word] = log(freq / total)
|
||||
p = trie
|
||||
for c in word:
|
||||
if not c in p:
|
||||
p[c] ={}
|
||||
p = p[c]
|
||||
p['']='' #ending flag
|
||||
|
||||
__ref_cut = cut
|
||||
__ref_cut_for_search = cut_for_search
|
||||
|
||||
def __lcut(sentence):
|
||||
return list(__ref_cut(sentence,False))
|
||||
return list(__ref_cut(sentence,False))
|
||||
def __lcut_all(sentence):
|
||||
return list(__ref_cut(sentence,True))
|
||||
return list(__ref_cut(sentence,True))
|
||||
def __lcut_for_search(sentence):
|
||||
return list(__ref_cut_for_search(sentence))
|
||||
return list(__ref_cut_for_search(sentence))
|
||||
|
||||
@require_initialized
|
||||
def enable_parallel(processnum):
|
||||
global pool,cut,cut_for_search
|
||||
if os.name=='nt':
|
||||
raise Exception("parallel mode only supports posix system")
|
||||
global pool,cut,cut_for_search
|
||||
if os.name=='nt':
|
||||
raise Exception("parallel mode only supports posix system")
|
||||
|
||||
from multiprocessing import Pool
|
||||
pool = Pool(processnum)
|
||||
from multiprocessing import Pool
|
||||
pool = Pool(processnum)
|
||||
|
||||
def pcut(sentence,cut_all=False):
|
||||
parts = re.compile(b'([\r\n]+)').split(sentence)
|
||||
if cut_all:
|
||||
result = pool.map(__lcut_all,parts)
|
||||
else:
|
||||
result = pool.map(__lcut,parts)
|
||||
for r in result:
|
||||
for w in r:
|
||||
yield w
|
||||
def pcut(sentence,cut_all=False):
|
||||
parts = re.compile(b'([\r\n]+)').split(sentence)
|
||||
if cut_all:
|
||||
result = pool.map(__lcut_all,parts)
|
||||
else:
|
||||
result = pool.map(__lcut,parts)
|
||||
for r in result:
|
||||
for w in r:
|
||||
yield w
|
||||
|
||||
def pcut_for_search(sentence):
|
||||
parts = re.compile(b'([\r\n]+)').split(sentence)
|
||||
result = pool.map(__lcut_for_search,parts)
|
||||
for r in result:
|
||||
for w in r:
|
||||
yield w
|
||||
def pcut_for_search(sentence):
|
||||
parts = re.compile(b'([\r\n]+)').split(sentence)
|
||||
result = pool.map(__lcut_for_search,parts)
|
||||
for r in result:
|
||||
for w in r:
|
||||
yield w
|
||||
|
||||
cut = pcut
|
||||
cut_for_search = pcut_for_search
|
||||
cut = pcut
|
||||
cut_for_search = pcut_for_search
|
||||
|
||||
def disable_parallel():
|
||||
global pool,cut,cut_for_search
|
||||
if 'pool' in globals():
|
||||
pool.close()
|
||||
pool = None
|
||||
cut = __ref_cut
|
||||
cut_for_search = __ref_cut_for_search
|
||||
global pool,cut,cut_for_search
|
||||
if 'pool' in globals():
|
||||
pool.close()
|
||||
pool = None
|
||||
cut = __ref_cut
|
||||
cut_for_search = __ref_cut_for_search
|
||||
|
||||
def set_dictionary(dictionary_path):
|
||||
global initialized, DICTIONARY
|
||||
with DICT_LOCK:
|
||||
abs_path = os.path.normpath( os.path.join( os.getcwd(), dictionary_path ) )
|
||||
if not os.path.exists(abs_path):
|
||||
raise Exception("path does not exists:" + abs_path)
|
||||
DICTIONARY = abs_path
|
||||
initialized = False
|
||||
global initialized, DICTIONARY
|
||||
with DICT_LOCK:
|
||||
abs_path = os.path.normpath( os.path.join( os.getcwd(), dictionary_path ) )
|
||||
if not os.path.exists(abs_path):
|
||||
raise Exception("path does not exists:" + abs_path)
|
||||
DICTIONARY = abs_path
|
||||
initialized = False
|
||||
|
||||
def get_abs_path_dict():
|
||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||
abs_path = os.path.join(_curpath,DICTIONARY)
|
||||
return abs_path
|
||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||
abs_path = os.path.join(_curpath,DICTIONARY)
|
||||
return abs_path
|
||||
|
@ -8,8 +8,8 @@ content = open(f_name,'rb').read().decode('utf-8')
|
||||
idf_freq = {}
|
||||
lines = content.split('\n')
|
||||
for line in lines:
|
||||
word,freq = line.split(' ')
|
||||
idf_freq[word] = float(freq)
|
||||
word,freq = line.split(' ')
|
||||
idf_freq[word] = float(freq)
|
||||
|
||||
median_idf = sorted(idf_freq.values())[int(len(idf_freq)/2)]
|
||||
stop_words= set([
|
||||
@ -17,18 +17,18 @@ stop_words= set([
|
||||
])
|
||||
|
||||
def extract_tags(sentence,topK=20):
|
||||
words = jieba.cut(sentence)
|
||||
freq = {}
|
||||
for w in words:
|
||||
if len(w.strip())<2: continue
|
||||
if w.lower() in stop_words: continue
|
||||
freq[w]=freq.get(w,0.0)+1.0
|
||||
total = sum(freq.values())
|
||||
freq = [(k,v/total) for k,v in freq.items()]
|
||||
words = jieba.cut(sentence)
|
||||
freq = {}
|
||||
for w in words:
|
||||
if len(w.strip())<2: continue
|
||||
if w.lower() in stop_words: continue
|
||||
freq[w]=freq.get(w,0.0)+1.0
|
||||
total = sum(freq.values())
|
||||
freq = [(k,v/total) for k,v in freq.items()]
|
||||
|
||||
tf_idf_list = [(v * idf_freq.get(k,median_idf),k) for k,v in freq]
|
||||
st_list = sorted(tf_idf_list,reverse=True)
|
||||
tf_idf_list = [(v * idf_freq.get(k,median_idf),k) for k,v in freq]
|
||||
st_list = sorted(tf_idf_list,reverse=True)
|
||||
|
||||
top_tuples= st_list[:topK]
|
||||
tags = [a[1] for a in top_tuples]
|
||||
return tags
|
||||
top_tuples= st_list[:topK]
|
||||
tags = [a[1] for a in top_tuples]
|
||||
return tags
|
||||
|
@ -8,66 +8,66 @@ from . import prob_emit
|
||||
MIN_FLOAT=-3.14e100
|
||||
|
||||
PrevStatus = {
|
||||
'B':('E','S'),
|
||||
'M':('M','B'),
|
||||
'S':('S','E'),
|
||||
'E':('B','M')
|
||||
'B':('E','S'),
|
||||
'M':('M','B'),
|
||||
'S':('S','E'),
|
||||
'E':('B','M')
|
||||
}
|
||||
|
||||
def viterbi(obs, states, start_p, trans_p, emit_p):
|
||||
V = [{}] #tabular
|
||||
path = {}
|
||||
for y in states: #init
|
||||
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
|
||||
path[y] = [y]
|
||||
for t in range(1,len(obs)):
|
||||
V.append({})
|
||||
newpath = {}
|
||||
for y in states:
|
||||
em_p = emit_p[y].get(obs[t],MIN_FLOAT)
|
||||
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + em_p ,y0) for y0 in PrevStatus[y] ])
|
||||
V[t][y] =prob
|
||||
newpath[y] = path[state] + [y]
|
||||
path = newpath
|
||||
V = [{}] #tabular
|
||||
path = {}
|
||||
for y in states: #init
|
||||
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
|
||||
path[y] = [y]
|
||||
for t in range(1,len(obs)):
|
||||
V.append({})
|
||||
newpath = {}
|
||||
for y in states:
|
||||
em_p = emit_p[y].get(obs[t],MIN_FLOAT)
|
||||
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + em_p ,y0) for y0 in PrevStatus[y] ])
|
||||
V[t][y] =prob
|
||||
newpath[y] = path[state] + [y]
|
||||
path = newpath
|
||||
|
||||
(prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E','S')])
|
||||
(prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E','S')])
|
||||
|
||||
return (prob, path[state])
|
||||
return (prob, path[state])
|
||||
|
||||
|
||||
def __cut(sentence):
|
||||
prob, pos_list = viterbi(sentence,('B','M','E','S'), prob_start.P, prob_trans.P, prob_emit.P)
|
||||
begin, next = 0,0
|
||||
#print pos_list, sentence
|
||||
for i,char in enumerate(sentence):
|
||||
pos = pos_list[i]
|
||||
if pos=='B':
|
||||
begin = i
|
||||
elif pos=='E':
|
||||
yield sentence[begin:i+1]
|
||||
next = i+1
|
||||
elif pos=='S':
|
||||
yield char
|
||||
next = i+1
|
||||
if next<len(sentence):
|
||||
yield sentence[next:]
|
||||
prob, pos_list = viterbi(sentence,('B','M','E','S'), prob_start.P, prob_trans.P, prob_emit.P)
|
||||
begin, next = 0,0
|
||||
#print pos_list, sentence
|
||||
for i,char in enumerate(sentence):
|
||||
pos = pos_list[i]
|
||||
if pos=='B':
|
||||
begin = i
|
||||
elif pos=='E':
|
||||
yield sentence[begin:i+1]
|
||||
next = i+1
|
||||
elif pos=='S':
|
||||
yield char
|
||||
next = i+1
|
||||
if next<len(sentence):
|
||||
yield sentence[next:]
|
||||
|
||||
def cut(sentence):
|
||||
if not ( type(sentence) is str):
|
||||
try:
|
||||
sentence = sentence.decode('utf-8')
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
if not ( type(sentence) is str):
|
||||
try:
|
||||
sentence = sentence.decode('utf-8')
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
|
||||
|
||||
blocks = re_han.split(sentence)
|
||||
for blk in blocks:
|
||||
if re_han.match(blk):
|
||||
for word in __cut(blk):
|
||||
yield word
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if x!="":
|
||||
yield x
|
||||
blocks = re_han.split(sentence)
|
||||
for blk in blocks:
|
||||
if re_han.match(blk):
|
||||
for word in __cut(blk):
|
||||
yield word
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if x!="":
|
||||
yield x
|
||||
|
@ -11,163 +11,163 @@ from . import char_state_tab
|
||||
default_encoding = sys.getfilesystemencoding()
|
||||
|
||||
def load_model(f_name):
|
||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||
prob_p_path = os.path.join(_curpath,f_name)
|
||||
if f_name.endswith(".py"):
|
||||
return eval(open(prob_p_path,"rb").read())
|
||||
else:
|
||||
result = {}
|
||||
for line in open(f_name,"rb"):
|
||||
line = line.strip()
|
||||
if line=="":continue
|
||||
line = line.decode("utf-8")
|
||||
word, _, tag = line.split(" ")
|
||||
result[word]=tag
|
||||
return result
|
||||
_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||
prob_p_path = os.path.join(_curpath,f_name)
|
||||
if f_name.endswith(".py"):
|
||||
return eval(open(prob_p_path,"rb").read())
|
||||
else:
|
||||
result = {}
|
||||
for line in open(f_name,"rb"):
|
||||
line = line.strip()
|
||||
if line=="":continue
|
||||
line = line.decode("utf-8")
|
||||
word, _, tag = line.split(" ")
|
||||
result[word]=tag
|
||||
return result
|
||||
|
||||
word_tag_tab = load_model(jieba.get_abs_path_dict())
|
||||
|
||||
if jieba.user_word_tag_tab:
|
||||
word_tag_tab.update(jieba.user_word_tag_tab)
|
||||
word_tag_tab.update(jieba.user_word_tag_tab)
|
||||
|
||||
class pair(object):
|
||||
def __init__(self,word,flag):
|
||||
self.word = word
|
||||
self.flag = flag
|
||||
def __init__(self,word,flag):
|
||||
self.word = word
|
||||
self.flag = flag
|
||||
|
||||
def __unicode__(self):
|
||||
return self.word+"/"+self.flag
|
||||
def __unicode__(self):
|
||||
return self.word+"/"+self.flag
|
||||
|
||||
def __repr__(self):
|
||||
return self.__str__()
|
||||
def __repr__(self):
|
||||
return self.__str__()
|
||||
|
||||
def __str__(self):
|
||||
return self.__unicode__().encode(default_encoding)
|
||||
def __str__(self):
|
||||
return self.__unicode__().encode(default_encoding)
|
||||
|
||||
def encode(self,arg):
|
||||
return self.__unicode__().encode(arg)
|
||||
def encode(self,arg):
|
||||
return self.__unicode__().encode(arg)
|
||||
|
||||
def __cut(sentence):
|
||||
prob, pos_list = viterbi.viterbi(sentence,char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P)
|
||||
begin, next = 0,0
|
||||
prob, pos_list = viterbi.viterbi(sentence,char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P)
|
||||
begin, next = 0,0
|
||||
|
||||
for i,char in enumerate(sentence):
|
||||
pos = pos_list[i][0]
|
||||
if pos=='B':
|
||||
begin = i
|
||||
elif pos=='E':
|
||||
yield pair(sentence[begin:i+1], pos_list[i][1])
|
||||
next = i+1
|
||||
elif pos=='S':
|
||||
yield pair(char,pos_list[i][1])
|
||||
next = i+1
|
||||
if next<len(sentence):
|
||||
yield pair(sentence[next:], pos_list[next][1] )
|
||||
for i,char in enumerate(sentence):
|
||||
pos = pos_list[i][0]
|
||||
if pos=='B':
|
||||
begin = i
|
||||
elif pos=='E':
|
||||
yield pair(sentence[begin:i+1], pos_list[i][1])
|
||||
next = i+1
|
||||
elif pos=='S':
|
||||
yield pair(char,pos_list[i][1])
|
||||
next = i+1
|
||||
if next<len(sentence):
|
||||
yield pair(sentence[next:], pos_list[next][1] )
|
||||
|
||||
def __cut_detail(sentence):
|
||||
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
|
||||
re_eng,re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5]+)"), re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
|
||||
re_eng,re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
|
||||
|
||||
blocks = re_han.split(sentence)
|
||||
for blk in blocks:
|
||||
if re_han.match(blk):
|
||||
for word in __cut(blk):
|
||||
yield word
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if x!="":
|
||||
if re_num.match(x):
|
||||
yield pair(x,'m')
|
||||
elif re_eng.match(x):
|
||||
yield pair(x,'eng')
|
||||
else:
|
||||
yield pair(x,'x')
|
||||
blocks = re_han.split(sentence)
|
||||
for blk in blocks:
|
||||
if re_han.match(blk):
|
||||
for word in __cut(blk):
|
||||
yield word
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if x!="":
|
||||
if re_num.match(x):
|
||||
yield pair(x,'m')
|
||||
elif re_eng.match(x):
|
||||
yield pair(x,'eng')
|
||||
else:
|
||||
yield pair(x,'x')
|
||||
|
||||
def __cut_DAG(sentence):
|
||||
DAG = jieba.get_DAG(sentence)
|
||||
route ={}
|
||||
DAG = jieba.get_DAG(sentence)
|
||||
route ={}
|
||||
|
||||
jieba.calc(sentence,DAG,0,route=route)
|
||||
jieba.calc(sentence,DAG,0,route=route)
|
||||
|
||||
x = 0
|
||||
buf =''
|
||||
N = len(sentence)
|
||||
while x<N:
|
||||
y = route[x][1]+1
|
||||
l_word = sentence[x:y]
|
||||
if y-x==1:
|
||||
buf+= l_word
|
||||
else:
|
||||
if len(buf)>0:
|
||||
if len(buf)==1:
|
||||
yield pair(buf,word_tag_tab.get(buf,'x'))
|
||||
buf=''
|
||||
else:
|
||||
if not (buf in jieba.FREQ):
|
||||
regognized = __cut_detail(buf)
|
||||
for t in regognized:
|
||||
yield t
|
||||
else:
|
||||
for elem in buf:
|
||||
yield pair(elem,word_tag_tab.get(elem,'x'))
|
||||
buf=''
|
||||
yield pair(l_word,word_tag_tab.get(l_word,'x'))
|
||||
x =y
|
||||
x = 0
|
||||
buf =''
|
||||
N = len(sentence)
|
||||
while x<N:
|
||||
y = route[x][1]+1
|
||||
l_word = sentence[x:y]
|
||||
if y-x==1:
|
||||
buf+= l_word
|
||||
else:
|
||||
if len(buf)>0:
|
||||
if len(buf)==1:
|
||||
yield pair(buf,word_tag_tab.get(buf,'x'))
|
||||
buf=''
|
||||
else:
|
||||
if not (buf in jieba.FREQ):
|
||||
regognized = __cut_detail(buf)
|
||||
for t in regognized:
|
||||
yield t
|
||||
else:
|
||||
for elem in buf:
|
||||
yield pair(elem,word_tag_tab.get(elem,'x'))
|
||||
buf=''
|
||||
yield pair(l_word,word_tag_tab.get(l_word,'x'))
|
||||
x =y
|
||||
|
||||
if len(buf)>0:
|
||||
if len(buf)==1:
|
||||
yield pair(buf,word_tag_tab.get(buf,'x'))
|
||||
else:
|
||||
if not (buf in jieba.FREQ):
|
||||
regognized = __cut_detail(buf)
|
||||
for t in regognized:
|
||||
yield t
|
||||
else:
|
||||
for elem in buf:
|
||||
yield pair(elem,word_tag_tab.get(elem,'x'))
|
||||
if len(buf)>0:
|
||||
if len(buf)==1:
|
||||
yield pair(buf,word_tag_tab.get(buf,'x'))
|
||||
else:
|
||||
if not (buf in jieba.FREQ):
|
||||
regognized = __cut_detail(buf)
|
||||
for t in regognized:
|
||||
yield t
|
||||
else:
|
||||
for elem in buf:
|
||||
yield pair(elem,word_tag_tab.get(elem,'x'))
|
||||
|
||||
def __cut_internal(sentence):
|
||||
if not ( type(sentence) is str):
|
||||
try:
|
||||
sentence = sentence.decode('utf-8')
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
if not ( type(sentence) is str):
|
||||
try:
|
||||
sentence = sentence.decode('utf-8')
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
|
||||
re_eng,re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
|
||||
re_han, re_skip = re.compile("([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile("(\s+)")
|
||||
re_eng,re_num = re.compile("[a-zA-Z0-9]+"), re.compile("[\.0-9]+")
|
||||
|
||||
blocks = re_han.split(sentence)
|
||||
for blk in blocks:
|
||||
if re_han.match(blk):
|
||||
for word in __cut_DAG(blk):
|
||||
yield word
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if re_skip.match(x):
|
||||
yield pair(x,'x')
|
||||
else:
|
||||
for xx in x:
|
||||
if re_num.match(xx):
|
||||
yield pair(xx,'m')
|
||||
elif re_eng.match(x):
|
||||
yield pair(xx,'eng')
|
||||
else:
|
||||
yield pair(xx,'x')
|
||||
blocks = re_han.split(sentence)
|
||||
for blk in blocks:
|
||||
if re_han.match(blk):
|
||||
for word in __cut_DAG(blk):
|
||||
yield word
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if re_skip.match(x):
|
||||
yield pair(x,'x')
|
||||
else:
|
||||
for xx in x:
|
||||
if re_num.match(xx):
|
||||
yield pair(xx,'m')
|
||||
elif re_eng.match(x):
|
||||
yield pair(xx,'eng')
|
||||
else:
|
||||
yield pair(xx,'x')
|
||||
|
||||
def __lcut_internal(sentence):
|
||||
return list(__cut_internal(sentence))
|
||||
return list(__cut_internal(sentence))
|
||||
|
||||
def cut(sentence):
|
||||
if (not hasattr(jieba,'pool')) or (jieba.pool==None):
|
||||
for w in __cut_internal(sentence):
|
||||
yield w
|
||||
else:
|
||||
parts = re.compile('([\r\n]+)').split(sentence)
|
||||
result = jieba.pool.map(__lcut_internal,parts)
|
||||
for r in result:
|
||||
for w in r:
|
||||
yield w
|
||||
if (not hasattr(jieba,'pool')) or (jieba.pool==None):
|
||||
for w in __cut_internal(sentence):
|
||||
yield w
|
||||
else:
|
||||
parts = re.compile('([\r\n]+)').split(sentence)
|
||||
result = jieba.pool.map(__lcut_internal,parts)
|
||||
for r in result:
|
||||
for w in r:
|
||||
yield w
|
||||
|
||||
|
@ -2,42 +2,42 @@ import operator
|
||||
MIN_FLOAT=-3.14e100
|
||||
|
||||
def get_top_states(t_state_v,K=4):
|
||||
items = t_state_v.items()
|
||||
topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K]
|
||||
return [x[0] for x in topK]
|
||||
items = t_state_v.items()
|
||||
topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K]
|
||||
return [x[0] for x in topK]
|
||||
|
||||
def viterbi(obs, states, start_p, trans_p, emit_p):
|
||||
V = [{}] #tabular
|
||||
mem_path = [{}]
|
||||
all_states = trans_p.keys()
|
||||
for y in states.get(obs[0],all_states): #init
|
||||
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
|
||||
mem_path[0][y] = ''
|
||||
for t in range(1,len(obs)):
|
||||
V.append({})
|
||||
mem_path.append({})
|
||||
prev_states = get_top_states(V[t-1])
|
||||
prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
|
||||
V = [{}] #tabular
|
||||
mem_path = [{}]
|
||||
all_states = trans_p.keys()
|
||||
for y in states.get(obs[0],all_states): #init
|
||||
V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
|
||||
mem_path[0][y] = ''
|
||||
for t in range(1,len(obs)):
|
||||
V.append({})
|
||||
mem_path.append({})
|
||||
prev_states = get_top_states(V[t-1])
|
||||
prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
|
||||
|
||||
prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) )
|
||||
obs_states = states.get(obs[t],all_states)
|
||||
obs_states = set(obs_states) & set(prev_states_expect_next)
|
||||
prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) )
|
||||
obs_states = states.get(obs[t],all_states)
|
||||
obs_states = set(obs_states) & set(prev_states_expect_next)
|
||||
|
||||
if len(obs_states)==0: obs_states = all_states
|
||||
for y in obs_states:
|
||||
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in prev_states])
|
||||
V[t][y] =prob
|
||||
mem_path[t][y] = state
|
||||
if len(obs_states)==0: obs_states = all_states
|
||||
for y in obs_states:
|
||||
(prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + emit_p[y].get(obs[t],MIN_FLOAT) ,y0) for y0 in prev_states])
|
||||
V[t][y] =prob
|
||||
mem_path[t][y] = state
|
||||
|
||||
last = [(V[-1][y], y) for y in mem_path[-1].keys() ]
|
||||
#if len(last)==0:
|
||||
#print obs
|
||||
(prob, state) = max(last)
|
||||
last = [(V[-1][y], y) for y in mem_path[-1].keys() ]
|
||||
#if len(last)==0:
|
||||
#print obs
|
||||
(prob, state) = max(last)
|
||||
|
||||
route = [None] * len(obs)
|
||||
i = len(obs)-1
|
||||
while i>=0:
|
||||
route[i] = state
|
||||
state = mem_path[i][state]
|
||||
i-=1
|
||||
return (prob, route)
|
||||
route = [None] * len(obs)
|
||||
i = len(obs)-1
|
||||
while i>=0:
|
||||
route[i] = state
|
||||
state = mem_path[i][state]
|
||||
i-=1
|
||||
return (prob, route)
|
@ -13,8 +13,8 @@ opt, args = parser.parse_args()
|
||||
|
||||
|
||||
if len(args) <1:
|
||||
print(USAGE)
|
||||
sys.exit(1)
|
||||
print(USAGE)
|
||||
sys.exit(1)
|
||||
|
||||
file_name = args[0]
|
||||
|
||||
|
@ -12,17 +12,17 @@ import os
|
||||
import random
|
||||
|
||||
if len(sys.argv)<2:
|
||||
print "usage: extract_topic.py directory [n_topic] [n_top_words]"
|
||||
sys.exit(0)
|
||||
print "usage: extract_topic.py directory [n_topic] [n_top_words]"
|
||||
sys.exit(0)
|
||||
|
||||
n_topic = 10
|
||||
n_top_words = 25
|
||||
|
||||
if len(sys.argv)>2:
|
||||
n_topic = int(sys.argv[2])
|
||||
n_topic = int(sys.argv[2])
|
||||
|
||||
if len(sys.argv)>3:
|
||||
n_top_words = int(sys.argv[3])
|
||||
n_top_words = int(sys.argv[3])
|
||||
|
||||
count_vect = CountVectorizer()
|
||||
docs = []
|
||||
@ -31,11 +31,11 @@ pattern = os.path.join(sys.argv[1],"*.txt")
|
||||
print "read "+pattern
|
||||
|
||||
for f_name in glob.glob(pattern):
|
||||
with open(f_name) as f:
|
||||
print "read file:", f_name
|
||||
for line in f: #one line as a document
|
||||
words = " ".join(jieba.cut(line))
|
||||
docs.append(words)
|
||||
with open(f_name) as f:
|
||||
print "read file:", f_name
|
||||
for line in f: #one line as a document
|
||||
words = " ".join(jieba.cut(line))
|
||||
docs.append(words)
|
||||
|
||||
random.shuffle(docs)
|
||||
|
||||
|
@ -15,14 +15,14 @@ import jieba
|
||||
default_encoding='utf-8'
|
||||
|
||||
if len(sys.argv)>1:
|
||||
default_encoding = sys.argv[1]
|
||||
default_encoding = sys.argv[1]
|
||||
|
||||
while True:
|
||||
line = sys.stdin.readline()
|
||||
if line=="":
|
||||
break
|
||||
line = line.strip()
|
||||
for word in jieba.cut(line):
|
||||
print(word.encode(default_encoding))
|
||||
line = sys.stdin.readline()
|
||||
if line=="":
|
||||
break
|
||||
line = line.strip()
|
||||
for word in jieba.cut(line):
|
||||
print(word.encode(default_encoding))
|
||||
|
||||
|
||||
|
@ -14,15 +14,15 @@ opt, args = parser.parse_args()
|
||||
|
||||
|
||||
if len(args) <1:
|
||||
print(USAGE)
|
||||
sys.exit(1)
|
||||
print(USAGE)
|
||||
sys.exit(1)
|
||||
|
||||
file_name = args[0]
|
||||
|
||||
if opt.topK==None:
|
||||
topK=10
|
||||
topK=10
|
||||
else:
|
||||
topK = int(opt.topK)
|
||||
topK = int(opt.topK)
|
||||
|
||||
|
||||
content = open(file_name,'rb').read()
|
||||
|
@ -5,92 +5,92 @@ import jieba
|
||||
jieba.enable_parallel(4)
|
||||
|
||||
def cuttest(test_sent):
|
||||
result = jieba.cut(test_sent)
|
||||
print( "/ ".join(result) )
|
||||
result = jieba.cut(test_sent)
|
||||
print( "/ ".join(result) )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||
cuttest("伊藤洋华堂总府店")
|
||||
cuttest("中国科学院计算技术研究所")
|
||||
cuttest("罗密欧与朱丽叶")
|
||||
cuttest("我购买了道具和服装")
|
||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||
cuttest("湖北省石首市")
|
||||
cuttest("湖北省十堰市")
|
||||
cuttest("总经理完成了这件事情")
|
||||
cuttest("电脑修好了")
|
||||
cuttest("做好了这件事情就一了百了了")
|
||||
cuttest("人们审美的观点是不同的")
|
||||
cuttest("我们买了一个美的空调")
|
||||
cuttest("线程初始化时我们要注意")
|
||||
cuttest("一个分子是由好多原子组织成的")
|
||||
cuttest("祝你马到功成")
|
||||
cuttest("他掉进了无底洞里")
|
||||
cuttest("中国的首都是北京")
|
||||
cuttest("孙君意")
|
||||
cuttest("外交部发言人马朝旭")
|
||||
cuttest("领导人会议和第四届东亚峰会")
|
||||
cuttest("在过去的这五年")
|
||||
cuttest("还需要很长的路要走")
|
||||
cuttest("60周年首都阅兵")
|
||||
cuttest("你好人们审美的观点是不同的")
|
||||
cuttest("买水果然后来世博园")
|
||||
cuttest("买水果然后去世博园")
|
||||
cuttest("但是后来我才知道你是对的")
|
||||
cuttest("存在即合理")
|
||||
cuttest("的的的的的在的的的的就以和和和")
|
||||
cuttest("I love你,不以为耻,反以为rong")
|
||||
cuttest("因")
|
||||
cuttest("")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("很好但主要是基于网页形式")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("为什么我不能拥有想要的生活")
|
||||
cuttest("后来我才")
|
||||
cuttest("此次来中国是为了")
|
||||
cuttest("使用了它就可以解决一些问题")
|
||||
cuttest(",使用了它就可以解决一些问题")
|
||||
cuttest("其实使用了它就可以解决一些问题")
|
||||
cuttest("好人使用了它就可以解决一些问题")
|
||||
cuttest("是因为和国家")
|
||||
cuttest("老年搜索还支持")
|
||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||
cuttest("大")
|
||||
cuttest("")
|
||||
cuttest("他说的确实在理")
|
||||
cuttest("长春市长春节讲话")
|
||||
cuttest("结婚的和尚未结婚的")
|
||||
cuttest("结合成分子时")
|
||||
cuttest("旅游和服务是最好的")
|
||||
cuttest("这件事情的确是我的错")
|
||||
cuttest("供大家参考指正")
|
||||
cuttest("哈尔滨政府公布塌桥原因")
|
||||
cuttest("我在机场入口处")
|
||||
cuttest("邢永臣摄影报道")
|
||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||
cuttest("南京市长江大桥")
|
||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||
cuttest('长春市长春药店')
|
||||
cuttest('邓颖超生前最喜欢的衣服')
|
||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||
cuttest("伊藤洋华堂总府店")
|
||||
cuttest("中国科学院计算技术研究所")
|
||||
cuttest("罗密欧与朱丽叶")
|
||||
cuttest("我购买了道具和服装")
|
||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||
cuttest("湖北省石首市")
|
||||
cuttest("湖北省十堰市")
|
||||
cuttest("总经理完成了这件事情")
|
||||
cuttest("电脑修好了")
|
||||
cuttest("做好了这件事情就一了百了了")
|
||||
cuttest("人们审美的观点是不同的")
|
||||
cuttest("我们买了一个美的空调")
|
||||
cuttest("线程初始化时我们要注意")
|
||||
cuttest("一个分子是由好多原子组织成的")
|
||||
cuttest("祝你马到功成")
|
||||
cuttest("他掉进了无底洞里")
|
||||
cuttest("中国的首都是北京")
|
||||
cuttest("孙君意")
|
||||
cuttest("外交部发言人马朝旭")
|
||||
cuttest("领导人会议和第四届东亚峰会")
|
||||
cuttest("在过去的这五年")
|
||||
cuttest("还需要很长的路要走")
|
||||
cuttest("60周年首都阅兵")
|
||||
cuttest("你好人们审美的观点是不同的")
|
||||
cuttest("买水果然后来世博园")
|
||||
cuttest("买水果然后去世博园")
|
||||
cuttest("但是后来我才知道你是对的")
|
||||
cuttest("存在即合理")
|
||||
cuttest("的的的的的在的的的的就以和和和")
|
||||
cuttest("I love你,不以为耻,反以为rong")
|
||||
cuttest("因")
|
||||
cuttest("")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("很好但主要是基于网页形式")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("为什么我不能拥有想要的生活")
|
||||
cuttest("后来我才")
|
||||
cuttest("此次来中国是为了")
|
||||
cuttest("使用了它就可以解决一些问题")
|
||||
cuttest(",使用了它就可以解决一些问题")
|
||||
cuttest("其实使用了它就可以解决一些问题")
|
||||
cuttest("好人使用了它就可以解决一些问题")
|
||||
cuttest("是因为和国家")
|
||||
cuttest("老年搜索还支持")
|
||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||
cuttest("大")
|
||||
cuttest("")
|
||||
cuttest("他说的确实在理")
|
||||
cuttest("长春市长春节讲话")
|
||||
cuttest("结婚的和尚未结婚的")
|
||||
cuttest("结合成分子时")
|
||||
cuttest("旅游和服务是最好的")
|
||||
cuttest("这件事情的确是我的错")
|
||||
cuttest("供大家参考指正")
|
||||
cuttest("哈尔滨政府公布塌桥原因")
|
||||
cuttest("我在机场入口处")
|
||||
cuttest("邢永臣摄影报道")
|
||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||
cuttest("南京市长江大桥")
|
||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||
cuttest('长春市长春药店')
|
||||
cuttest('邓颖超生前最喜欢的衣服')
|
||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||
|
@ -5,88 +5,88 @@ import jieba
|
||||
jieba.enable_parallel(4)
|
||||
|
||||
def cuttest(test_sent):
|
||||
result = jieba.cut(test_sent,cut_all=True)
|
||||
print("/ ".join(result))
|
||||
result = jieba.cut(test_sent,cut_all=True)
|
||||
print("/ ".join(result))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||
cuttest("伊藤洋华堂总府店")
|
||||
cuttest("中国科学院计算技术研究所")
|
||||
cuttest("罗密欧与朱丽叶")
|
||||
cuttest("我购买了道具和服装")
|
||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||
cuttest("湖北省石首市")
|
||||
cuttest("湖北省十堰市")
|
||||
cuttest("总经理完成了这件事情")
|
||||
cuttest("电脑修好了")
|
||||
cuttest("做好了这件事情就一了百了了")
|
||||
cuttest("人们审美的观点是不同的")
|
||||
cuttest("我们买了一个美的空调")
|
||||
cuttest("线程初始化时我们要注意")
|
||||
cuttest("一个分子是由好多原子组织成的")
|
||||
cuttest("祝你马到功成")
|
||||
cuttest("他掉进了无底洞里")
|
||||
cuttest("中国的首都是北京")
|
||||
cuttest("孙君意")
|
||||
cuttest("外交部发言人马朝旭")
|
||||
cuttest("领导人会议和第四届东亚峰会")
|
||||
cuttest("在过去的这五年")
|
||||
cuttest("还需要很长的路要走")
|
||||
cuttest("60周年首都阅兵")
|
||||
cuttest("你好人们审美的观点是不同的")
|
||||
cuttest("买水果然后来世博园")
|
||||
cuttest("买水果然后去世博园")
|
||||
cuttest("但是后来我才知道你是对的")
|
||||
cuttest("存在即合理")
|
||||
cuttest("的的的的的在的的的的就以和和和")
|
||||
cuttest("I love你,不以为耻,反以为rong")
|
||||
cuttest("因")
|
||||
cuttest("")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("很好但主要是基于网页形式")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("为什么我不能拥有想要的生活")
|
||||
cuttest("后来我才")
|
||||
cuttest("此次来中国是为了")
|
||||
cuttest("使用了它就可以解决一些问题")
|
||||
cuttest(",使用了它就可以解决一些问题")
|
||||
cuttest("其实使用了它就可以解决一些问题")
|
||||
cuttest("好人使用了它就可以解决一些问题")
|
||||
cuttest("是因为和国家")
|
||||
cuttest("老年搜索还支持")
|
||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||
cuttest("大")
|
||||
cuttest("")
|
||||
cuttest("他说的确实在理")
|
||||
cuttest("长春市长春节讲话")
|
||||
cuttest("结婚的和尚未结婚的")
|
||||
cuttest("结合成分子时")
|
||||
cuttest("旅游和服务是最好的")
|
||||
cuttest("这件事情的确是我的错")
|
||||
cuttest("供大家参考指正")
|
||||
cuttest("哈尔滨政府公布塌桥原因")
|
||||
cuttest("我在机场入口处")
|
||||
cuttest("邢永臣摄影报道")
|
||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||
cuttest("南京市长江大桥")
|
||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||
cuttest('长春市长春药店')
|
||||
cuttest('邓颖超生前最喜欢的衣服')
|
||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||
cuttest("伊藤洋华堂总府店")
|
||||
cuttest("中国科学院计算技术研究所")
|
||||
cuttest("罗密欧与朱丽叶")
|
||||
cuttest("我购买了道具和服装")
|
||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||
cuttest("湖北省石首市")
|
||||
cuttest("湖北省十堰市")
|
||||
cuttest("总经理完成了这件事情")
|
||||
cuttest("电脑修好了")
|
||||
cuttest("做好了这件事情就一了百了了")
|
||||
cuttest("人们审美的观点是不同的")
|
||||
cuttest("我们买了一个美的空调")
|
||||
cuttest("线程初始化时我们要注意")
|
||||
cuttest("一个分子是由好多原子组织成的")
|
||||
cuttest("祝你马到功成")
|
||||
cuttest("他掉进了无底洞里")
|
||||
cuttest("中国的首都是北京")
|
||||
cuttest("孙君意")
|
||||
cuttest("外交部发言人马朝旭")
|
||||
cuttest("领导人会议和第四届东亚峰会")
|
||||
cuttest("在过去的这五年")
|
||||
cuttest("还需要很长的路要走")
|
||||
cuttest("60周年首都阅兵")
|
||||
cuttest("你好人们审美的观点是不同的")
|
||||
cuttest("买水果然后来世博园")
|
||||
cuttest("买水果然后去世博园")
|
||||
cuttest("但是后来我才知道你是对的")
|
||||
cuttest("存在即合理")
|
||||
cuttest("的的的的的在的的的的就以和和和")
|
||||
cuttest("I love你,不以为耻,反以为rong")
|
||||
cuttest("因")
|
||||
cuttest("")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("很好但主要是基于网页形式")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("为什么我不能拥有想要的生活")
|
||||
cuttest("后来我才")
|
||||
cuttest("此次来中国是为了")
|
||||
cuttest("使用了它就可以解决一些问题")
|
||||
cuttest(",使用了它就可以解决一些问题")
|
||||
cuttest("其实使用了它就可以解决一些问题")
|
||||
cuttest("好人使用了它就可以解决一些问题")
|
||||
cuttest("是因为和国家")
|
||||
cuttest("老年搜索还支持")
|
||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||
cuttest("大")
|
||||
cuttest("")
|
||||
cuttest("他说的确实在理")
|
||||
cuttest("长春市长春节讲话")
|
||||
cuttest("结婚的和尚未结婚的")
|
||||
cuttest("结合成分子时")
|
||||
cuttest("旅游和服务是最好的")
|
||||
cuttest("这件事情的确是我的错")
|
||||
cuttest("供大家参考指正")
|
||||
cuttest("哈尔滨政府公布塌桥原因")
|
||||
cuttest("我在机场入口处")
|
||||
cuttest("邢永臣摄影报道")
|
||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||
cuttest("南京市长江大桥")
|
||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||
cuttest('长春市长春药店')
|
||||
cuttest('邓颖超生前最喜欢的衣服')
|
||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
|
@ -5,88 +5,88 @@ import jieba
|
||||
jieba.enable_parallel(4)
|
||||
|
||||
def cuttest(test_sent):
|
||||
result = jieba.cut_for_search(test_sent)
|
||||
print("/ ".join(result))
|
||||
result = jieba.cut_for_search(test_sent)
|
||||
print("/ ".join(result))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||
cuttest("伊藤洋华堂总府店")
|
||||
cuttest("中国科学院计算技术研究所")
|
||||
cuttest("罗密欧与朱丽叶")
|
||||
cuttest("我购买了道具和服装")
|
||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||
cuttest("湖北省石首市")
|
||||
cuttest("湖北省十堰市")
|
||||
cuttest("总经理完成了这件事情")
|
||||
cuttest("电脑修好了")
|
||||
cuttest("做好了这件事情就一了百了了")
|
||||
cuttest("人们审美的观点是不同的")
|
||||
cuttest("我们买了一个美的空调")
|
||||
cuttest("线程初始化时我们要注意")
|
||||
cuttest("一个分子是由好多原子组织成的")
|
||||
cuttest("祝你马到功成")
|
||||
cuttest("他掉进了无底洞里")
|
||||
cuttest("中国的首都是北京")
|
||||
cuttest("孙君意")
|
||||
cuttest("外交部发言人马朝旭")
|
||||
cuttest("领导人会议和第四届东亚峰会")
|
||||
cuttest("在过去的这五年")
|
||||
cuttest("还需要很长的路要走")
|
||||
cuttest("60周年首都阅兵")
|
||||
cuttest("你好人们审美的观点是不同的")
|
||||
cuttest("买水果然后来世博园")
|
||||
cuttest("买水果然后去世博园")
|
||||
cuttest("但是后来我才知道你是对的")
|
||||
cuttest("存在即合理")
|
||||
cuttest("的的的的的在的的的的就以和和和")
|
||||
cuttest("I love你,不以为耻,反以为rong")
|
||||
cuttest("因")
|
||||
cuttest("")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("很好但主要是基于网页形式")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("为什么我不能拥有想要的生活")
|
||||
cuttest("后来我才")
|
||||
cuttest("此次来中国是为了")
|
||||
cuttest("使用了它就可以解决一些问题")
|
||||
cuttest(",使用了它就可以解决一些问题")
|
||||
cuttest("其实使用了它就可以解决一些问题")
|
||||
cuttest("好人使用了它就可以解决一些问题")
|
||||
cuttest("是因为和国家")
|
||||
cuttest("老年搜索还支持")
|
||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||
cuttest("大")
|
||||
cuttest("")
|
||||
cuttest("他说的确实在理")
|
||||
cuttest("长春市长春节讲话")
|
||||
cuttest("结婚的和尚未结婚的")
|
||||
cuttest("结合成分子时")
|
||||
cuttest("旅游和服务是最好的")
|
||||
cuttest("这件事情的确是我的错")
|
||||
cuttest("供大家参考指正")
|
||||
cuttest("哈尔滨政府公布塌桥原因")
|
||||
cuttest("我在机场入口处")
|
||||
cuttest("邢永臣摄影报道")
|
||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||
cuttest("南京市长江大桥")
|
||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||
cuttest('长春市长春药店')
|
||||
cuttest('邓颖超生前最喜欢的衣服')
|
||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||
cuttest("伊藤洋华堂总府店")
|
||||
cuttest("中国科学院计算技术研究所")
|
||||
cuttest("罗密欧与朱丽叶")
|
||||
cuttest("我购买了道具和服装")
|
||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||
cuttest("湖北省石首市")
|
||||
cuttest("湖北省十堰市")
|
||||
cuttest("总经理完成了这件事情")
|
||||
cuttest("电脑修好了")
|
||||
cuttest("做好了这件事情就一了百了了")
|
||||
cuttest("人们审美的观点是不同的")
|
||||
cuttest("我们买了一个美的空调")
|
||||
cuttest("线程初始化时我们要注意")
|
||||
cuttest("一个分子是由好多原子组织成的")
|
||||
cuttest("祝你马到功成")
|
||||
cuttest("他掉进了无底洞里")
|
||||
cuttest("中国的首都是北京")
|
||||
cuttest("孙君意")
|
||||
cuttest("外交部发言人马朝旭")
|
||||
cuttest("领导人会议和第四届东亚峰会")
|
||||
cuttest("在过去的这五年")
|
||||
cuttest("还需要很长的路要走")
|
||||
cuttest("60周年首都阅兵")
|
||||
cuttest("你好人们审美的观点是不同的")
|
||||
cuttest("买水果然后来世博园")
|
||||
cuttest("买水果然后去世博园")
|
||||
cuttest("但是后来我才知道你是对的")
|
||||
cuttest("存在即合理")
|
||||
cuttest("的的的的的在的的的的就以和和和")
|
||||
cuttest("I love你,不以为耻,反以为rong")
|
||||
cuttest("因")
|
||||
cuttest("")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("很好但主要是基于网页形式")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("为什么我不能拥有想要的生活")
|
||||
cuttest("后来我才")
|
||||
cuttest("此次来中国是为了")
|
||||
cuttest("使用了它就可以解决一些问题")
|
||||
cuttest(",使用了它就可以解决一些问题")
|
||||
cuttest("其实使用了它就可以解决一些问题")
|
||||
cuttest("好人使用了它就可以解决一些问题")
|
||||
cuttest("是因为和国家")
|
||||
cuttest("老年搜索还支持")
|
||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||
cuttest("大")
|
||||
cuttest("")
|
||||
cuttest("他说的确实在理")
|
||||
cuttest("长春市长春节讲话")
|
||||
cuttest("结婚的和尚未结婚的")
|
||||
cuttest("结合成分子时")
|
||||
cuttest("旅游和服务是最好的")
|
||||
cuttest("这件事情的确是我的错")
|
||||
cuttest("供大家参考指正")
|
||||
cuttest("哈尔滨政府公布塌桥原因")
|
||||
cuttest("我在机场入口处")
|
||||
cuttest("邢永臣摄影报道")
|
||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||
cuttest("南京市长江大桥")
|
||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||
cuttest('长春市长春药店')
|
||||
cuttest('邓颖超生前最喜欢的衣服')
|
||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
|
@ -14,6 +14,6 @@ tm_cost = t2-t1
|
||||
|
||||
log_f = open("1.log","wb")
|
||||
for w in words:
|
||||
log_f.write(w.encode("utf-8"))
|
||||
log_f.write(w.encode("utf-8"))
|
||||
print('speed' , len(content)/tm_cost, " bytes/second")
|
||||
|
||||
|
@ -6,94 +6,94 @@ jieba.enable_parallel(4)
|
||||
import jieba.posseg as pseg
|
||||
|
||||
def cuttest(test_sent):
|
||||
result = pseg.cut(test_sent)
|
||||
for w in result:
|
||||
sys.stdout.write(w.word+ "/"+ w.flag + ", ")
|
||||
print("")
|
||||
result = pseg.cut(test_sent)
|
||||
for w in result:
|
||||
sys.stdout.write(w.word+ "/"+ w.flag + ", ")
|
||||
print("")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||
cuttest("伊藤洋华堂总府店")
|
||||
cuttest("中国科学院计算技术研究所")
|
||||
cuttest("罗密欧与朱丽叶")
|
||||
cuttest("我购买了道具和服装")
|
||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||
cuttest("湖北省石首市")
|
||||
cuttest("湖北省十堰市")
|
||||
cuttest("总经理完成了这件事情")
|
||||
cuttest("电脑修好了")
|
||||
cuttest("做好了这件事情就一了百了了")
|
||||
cuttest("人们审美的观点是不同的")
|
||||
cuttest("我们买了一个美的空调")
|
||||
cuttest("线程初始化时我们要注意")
|
||||
cuttest("一个分子是由好多原子组织成的")
|
||||
cuttest("祝你马到功成")
|
||||
cuttest("他掉进了无底洞里")
|
||||
cuttest("中国的首都是北京")
|
||||
cuttest("孙君意")
|
||||
cuttest("外交部发言人马朝旭")
|
||||
cuttest("领导人会议和第四届东亚峰会")
|
||||
cuttest("在过去的这五年")
|
||||
cuttest("还需要很长的路要走")
|
||||
cuttest("60周年首都阅兵")
|
||||
cuttest("你好人们审美的观点是不同的")
|
||||
cuttest("买水果然后来世博园")
|
||||
cuttest("买水果然后去世博园")
|
||||
cuttest("但是后来我才知道你是对的")
|
||||
cuttest("存在即合理")
|
||||
cuttest("的的的的的在的的的的就以和和和")
|
||||
cuttest("I love你,不以为耻,反以为rong")
|
||||
cuttest("因")
|
||||
cuttest("")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("很好但主要是基于网页形式")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("为什么我不能拥有想要的生活")
|
||||
cuttest("后来我才")
|
||||
cuttest("此次来中国是为了")
|
||||
cuttest("使用了它就可以解决一些问题")
|
||||
cuttest(",使用了它就可以解决一些问题")
|
||||
cuttest("其实使用了它就可以解决一些问题")
|
||||
cuttest("好人使用了它就可以解决一些问题")
|
||||
cuttest("是因为和国家")
|
||||
cuttest("老年搜索还支持")
|
||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||
cuttest("大")
|
||||
cuttest("")
|
||||
cuttest("他说的确实在理")
|
||||
cuttest("长春市长春节讲话")
|
||||
cuttest("结婚的和尚未结婚的")
|
||||
cuttest("结合成分子时")
|
||||
cuttest("旅游和服务是最好的")
|
||||
cuttest("这件事情的确是我的错")
|
||||
cuttest("供大家参考指正")
|
||||
cuttest("哈尔滨政府公布塌桥原因")
|
||||
cuttest("我在机场入口处")
|
||||
cuttest("邢永臣摄影报道")
|
||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||
cuttest("南京市长江大桥")
|
||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||
cuttest('长春市长春药店')
|
||||
cuttest('邓颖超生前最喜欢的衣服')
|
||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||
cuttest("伊藤洋华堂总府店")
|
||||
cuttest("中国科学院计算技术研究所")
|
||||
cuttest("罗密欧与朱丽叶")
|
||||
cuttest("我购买了道具和服装")
|
||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||
cuttest("湖北省石首市")
|
||||
cuttest("湖北省十堰市")
|
||||
cuttest("总经理完成了这件事情")
|
||||
cuttest("电脑修好了")
|
||||
cuttest("做好了这件事情就一了百了了")
|
||||
cuttest("人们审美的观点是不同的")
|
||||
cuttest("我们买了一个美的空调")
|
||||
cuttest("线程初始化时我们要注意")
|
||||
cuttest("一个分子是由好多原子组织成的")
|
||||
cuttest("祝你马到功成")
|
||||
cuttest("他掉进了无底洞里")
|
||||
cuttest("中国的首都是北京")
|
||||
cuttest("孙君意")
|
||||
cuttest("外交部发言人马朝旭")
|
||||
cuttest("领导人会议和第四届东亚峰会")
|
||||
cuttest("在过去的这五年")
|
||||
cuttest("还需要很长的路要走")
|
||||
cuttest("60周年首都阅兵")
|
||||
cuttest("你好人们审美的观点是不同的")
|
||||
cuttest("买水果然后来世博园")
|
||||
cuttest("买水果然后去世博园")
|
||||
cuttest("但是后来我才知道你是对的")
|
||||
cuttest("存在即合理")
|
||||
cuttest("的的的的的在的的的的就以和和和")
|
||||
cuttest("I love你,不以为耻,反以为rong")
|
||||
cuttest("因")
|
||||
cuttest("")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("很好但主要是基于网页形式")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("为什么我不能拥有想要的生活")
|
||||
cuttest("后来我才")
|
||||
cuttest("此次来中国是为了")
|
||||
cuttest("使用了它就可以解决一些问题")
|
||||
cuttest(",使用了它就可以解决一些问题")
|
||||
cuttest("其实使用了它就可以解决一些问题")
|
||||
cuttest("好人使用了它就可以解决一些问题")
|
||||
cuttest("是因为和国家")
|
||||
cuttest("老年搜索还支持")
|
||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||
cuttest("大")
|
||||
cuttest("")
|
||||
cuttest("他说的确实在理")
|
||||
cuttest("长春市长春节讲话")
|
||||
cuttest("结婚的和尚未结婚的")
|
||||
cuttest("结合成分子时")
|
||||
cuttest("旅游和服务是最好的")
|
||||
cuttest("这件事情的确是我的错")
|
||||
cuttest("供大家参考指正")
|
||||
cuttest("哈尔滨政府公布塌桥原因")
|
||||
cuttest("我在机场入口处")
|
||||
cuttest("邢永臣摄影报道")
|
||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||
cuttest("南京市长江大桥")
|
||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||
cuttest('长春市长春药店')
|
||||
cuttest('邓颖超生前最喜欢的衣服')
|
||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||
|
@ -16,7 +16,7 @@ tm_cost = t2-t1
|
||||
|
||||
log_f = open("1.log","wb")
|
||||
for w in words:
|
||||
print >> log_f, w.encode("utf-8"), "/" ,
|
||||
print >> log_f, w.encode("utf-8"), "/" ,
|
||||
|
||||
print 'speed' , len(content)/tm_cost, " bytes/second"
|
||||
|
||||
|
@ -4,8 +4,8 @@ sys.path.append("../")
|
||||
import jieba
|
||||
|
||||
def cuttest(test_sent):
|
||||
result = jieba.cut(test_sent)
|
||||
print("/ ".join(result))
|
||||
result = jieba.cut(test_sent)
|
||||
print("/ ".join(result))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -4,24 +4,24 @@ sys.path.append("../")
|
||||
import jieba
|
||||
|
||||
def cuttest(test_sent):
|
||||
result = jieba.cut(test_sent)
|
||||
print(" ".join(result) )
|
||||
result = jieba.cut(test_sent)
|
||||
print(" ".join(result) )
|
||||
|
||||
def testcase():
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
|
||||
if __name__ == "__main__":
|
||||
testcase()
|
||||
jieba.set_dictionary("foobar.txt")
|
||||
print("================================")
|
||||
testcase()
|
||||
testcase()
|
||||
jieba.set_dictionary("foobar.txt")
|
||||
print("================================")
|
||||
testcase()
|
||||
|
||||
|
@ -4,93 +4,93 @@ sys.path.append("../")
|
||||
import jieba
|
||||
|
||||
def cuttest(test_sent):
|
||||
result = jieba.cut_for_search(test_sent)
|
||||
print("/ ".join(result))
|
||||
result = jieba.cut_for_search(test_sent)
|
||||
print("/ ".join(result))
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||
cuttest("伊藤洋华堂总府店")
|
||||
cuttest("中国科学院计算技术研究所")
|
||||
cuttest("罗密欧与朱丽叶")
|
||||
cuttest("我购买了道具和服装")
|
||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||
cuttest("湖北省石首市")
|
||||
cuttest("湖北省十堰市")
|
||||
cuttest("总经理完成了这件事情")
|
||||
cuttest("电脑修好了")
|
||||
cuttest("做好了这件事情就一了百了了")
|
||||
cuttest("人们审美的观点是不同的")
|
||||
cuttest("我们买了一个美的空调")
|
||||
cuttest("线程初始化时我们要注意")
|
||||
cuttest("一个分子是由好多原子组织成的")
|
||||
cuttest("祝你马到功成")
|
||||
cuttest("他掉进了无底洞里")
|
||||
cuttest("中国的首都是北京")
|
||||
cuttest("孙君意")
|
||||
cuttest("外交部发言人马朝旭")
|
||||
cuttest("领导人会议和第四届东亚峰会")
|
||||
cuttest("在过去的这五年")
|
||||
cuttest("还需要很长的路要走")
|
||||
cuttest("60周年首都阅兵")
|
||||
cuttest("你好人们审美的观点是不同的")
|
||||
cuttest("买水果然后来世博园")
|
||||
cuttest("买水果然后去世博园")
|
||||
cuttest("但是后来我才知道你是对的")
|
||||
cuttest("存在即合理")
|
||||
cuttest("的的的的的在的的的的就以和和和")
|
||||
cuttest("I love你,不以为耻,反以为rong")
|
||||
cuttest("因")
|
||||
cuttest("")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("很好但主要是基于网页形式")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("为什么我不能拥有想要的生活")
|
||||
cuttest("后来我才")
|
||||
cuttest("此次来中国是为了")
|
||||
cuttest("使用了它就可以解决一些问题")
|
||||
cuttest(",使用了它就可以解决一些问题")
|
||||
cuttest("其实使用了它就可以解决一些问题")
|
||||
cuttest("好人使用了它就可以解决一些问题")
|
||||
cuttest("是因为和国家")
|
||||
cuttest("老年搜索还支持")
|
||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||
cuttest("大")
|
||||
cuttest("")
|
||||
cuttest("他说的确实在理")
|
||||
cuttest("长春市长春节讲话")
|
||||
cuttest("结婚的和尚未结婚的")
|
||||
cuttest("结合成分子时")
|
||||
cuttest("旅游和服务是最好的")
|
||||
cuttest("这件事情的确是我的错")
|
||||
cuttest("供大家参考指正")
|
||||
cuttest("哈尔滨政府公布塌桥原因")
|
||||
cuttest("我在机场入口处")
|
||||
cuttest("邢永臣摄影报道")
|
||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||
cuttest("南京市长江大桥")
|
||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||
cuttest('长春市长春药店')
|
||||
cuttest('邓颖超生前最喜欢的衣服')
|
||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||
cuttest("伊藤洋华堂总府店")
|
||||
cuttest("中国科学院计算技术研究所")
|
||||
cuttest("罗密欧与朱丽叶")
|
||||
cuttest("我购买了道具和服装")
|
||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||
cuttest("湖北省石首市")
|
||||
cuttest("湖北省十堰市")
|
||||
cuttest("总经理完成了这件事情")
|
||||
cuttest("电脑修好了")
|
||||
cuttest("做好了这件事情就一了百了了")
|
||||
cuttest("人们审美的观点是不同的")
|
||||
cuttest("我们买了一个美的空调")
|
||||
cuttest("线程初始化时我们要注意")
|
||||
cuttest("一个分子是由好多原子组织成的")
|
||||
cuttest("祝你马到功成")
|
||||
cuttest("他掉进了无底洞里")
|
||||
cuttest("中国的首都是北京")
|
||||
cuttest("孙君意")
|
||||
cuttest("外交部发言人马朝旭")
|
||||
cuttest("领导人会议和第四届东亚峰会")
|
||||
cuttest("在过去的这五年")
|
||||
cuttest("还需要很长的路要走")
|
||||
cuttest("60周年首都阅兵")
|
||||
cuttest("你好人们审美的观点是不同的")
|
||||
cuttest("买水果然后来世博园")
|
||||
cuttest("买水果然后去世博园")
|
||||
cuttest("但是后来我才知道你是对的")
|
||||
cuttest("存在即合理")
|
||||
cuttest("的的的的的在的的的的就以和和和")
|
||||
cuttest("I love你,不以为耻,反以为rong")
|
||||
cuttest("因")
|
||||
cuttest("")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("很好但主要是基于网页形式")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("为什么我不能拥有想要的生活")
|
||||
cuttest("后来我才")
|
||||
cuttest("此次来中国是为了")
|
||||
cuttest("使用了它就可以解决一些问题")
|
||||
cuttest(",使用了它就可以解决一些问题")
|
||||
cuttest("其实使用了它就可以解决一些问题")
|
||||
cuttest("好人使用了它就可以解决一些问题")
|
||||
cuttest("是因为和国家")
|
||||
cuttest("老年搜索还支持")
|
||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||
cuttest("大")
|
||||
cuttest("")
|
||||
cuttest("他说的确实在理")
|
||||
cuttest("长春市长春节讲话")
|
||||
cuttest("结婚的和尚未结婚的")
|
||||
cuttest("结合成分子时")
|
||||
cuttest("旅游和服务是最好的")
|
||||
cuttest("这件事情的确是我的错")
|
||||
cuttest("供大家参考指正")
|
||||
cuttest("哈尔滨政府公布塌桥原因")
|
||||
cuttest("我在机场入口处")
|
||||
cuttest("邢永臣摄影报道")
|
||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||
cuttest("南京市长江大桥")
|
||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||
cuttest('长春市长春药店')
|
||||
cuttest('邓颖超生前最喜欢的衣服')
|
||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||
|
@ -4,92 +4,92 @@ sys.path.append("../")
|
||||
import jieba
|
||||
|
||||
def cuttest(test_sent):
|
||||
result = jieba.cut(test_sent,cut_all=True)
|
||||
print("/ ".join(result))
|
||||
result = jieba.cut(test_sent,cut_all=True)
|
||||
print("/ ".join(result))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||
cuttest("伊藤洋华堂总府店")
|
||||
cuttest("中国科学院计算技术研究所")
|
||||
cuttest("罗密欧与朱丽叶")
|
||||
cuttest("我购买了道具和服装")
|
||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||
cuttest("湖北省石首市")
|
||||
cuttest("湖北省十堰市")
|
||||
cuttest("总经理完成了这件事情")
|
||||
cuttest("电脑修好了")
|
||||
cuttest("做好了这件事情就一了百了了")
|
||||
cuttest("人们审美的观点是不同的")
|
||||
cuttest("我们买了一个美的空调")
|
||||
cuttest("线程初始化时我们要注意")
|
||||
cuttest("一个分子是由好多原子组织成的")
|
||||
cuttest("祝你马到功成")
|
||||
cuttest("他掉进了无底洞里")
|
||||
cuttest("中国的首都是北京")
|
||||
cuttest("孙君意")
|
||||
cuttest("外交部发言人马朝旭")
|
||||
cuttest("领导人会议和第四届东亚峰会")
|
||||
cuttest("在过去的这五年")
|
||||
cuttest("还需要很长的路要走")
|
||||
cuttest("60周年首都阅兵")
|
||||
cuttest("你好人们审美的观点是不同的")
|
||||
cuttest("买水果然后来世博园")
|
||||
cuttest("买水果然后去世博园")
|
||||
cuttest("但是后来我才知道你是对的")
|
||||
cuttest("存在即合理")
|
||||
cuttest("的的的的的在的的的的就以和和和")
|
||||
cuttest("I love你,不以为耻,反以为rong")
|
||||
cuttest("因")
|
||||
cuttest("")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("很好但主要是基于网页形式")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("为什么我不能拥有想要的生活")
|
||||
cuttest("后来我才")
|
||||
cuttest("此次来中国是为了")
|
||||
cuttest("使用了它就可以解决一些问题")
|
||||
cuttest(",使用了它就可以解决一些问题")
|
||||
cuttest("其实使用了它就可以解决一些问题")
|
||||
cuttest("好人使用了它就可以解决一些问题")
|
||||
cuttest("是因为和国家")
|
||||
cuttest("老年搜索还支持")
|
||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||
cuttest("大")
|
||||
cuttest("")
|
||||
cuttest("他说的确实在理")
|
||||
cuttest("长春市长春节讲话")
|
||||
cuttest("结婚的和尚未结婚的")
|
||||
cuttest("结合成分子时")
|
||||
cuttest("旅游和服务是最好的")
|
||||
cuttest("这件事情的确是我的错")
|
||||
cuttest("供大家参考指正")
|
||||
cuttest("哈尔滨政府公布塌桥原因")
|
||||
cuttest("我在机场入口处")
|
||||
cuttest("邢永臣摄影报道")
|
||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||
cuttest("南京市长江大桥")
|
||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||
cuttest('长春市长春药店')
|
||||
cuttest('邓颖超生前最喜欢的衣服')
|
||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||
cuttest("伊藤洋华堂总府店")
|
||||
cuttest("中国科学院计算技术研究所")
|
||||
cuttest("罗密欧与朱丽叶")
|
||||
cuttest("我购买了道具和服装")
|
||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||
cuttest("湖北省石首市")
|
||||
cuttest("湖北省十堰市")
|
||||
cuttest("总经理完成了这件事情")
|
||||
cuttest("电脑修好了")
|
||||
cuttest("做好了这件事情就一了百了了")
|
||||
cuttest("人们审美的观点是不同的")
|
||||
cuttest("我们买了一个美的空调")
|
||||
cuttest("线程初始化时我们要注意")
|
||||
cuttest("一个分子是由好多原子组织成的")
|
||||
cuttest("祝你马到功成")
|
||||
cuttest("他掉进了无底洞里")
|
||||
cuttest("中国的首都是北京")
|
||||
cuttest("孙君意")
|
||||
cuttest("外交部发言人马朝旭")
|
||||
cuttest("领导人会议和第四届东亚峰会")
|
||||
cuttest("在过去的这五年")
|
||||
cuttest("还需要很长的路要走")
|
||||
cuttest("60周年首都阅兵")
|
||||
cuttest("你好人们审美的观点是不同的")
|
||||
cuttest("买水果然后来世博园")
|
||||
cuttest("买水果然后去世博园")
|
||||
cuttest("但是后来我才知道你是对的")
|
||||
cuttest("存在即合理")
|
||||
cuttest("的的的的的在的的的的就以和和和")
|
||||
cuttest("I love你,不以为耻,反以为rong")
|
||||
cuttest("因")
|
||||
cuttest("")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("很好但主要是基于网页形式")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("为什么我不能拥有想要的生活")
|
||||
cuttest("后来我才")
|
||||
cuttest("此次来中国是为了")
|
||||
cuttest("使用了它就可以解决一些问题")
|
||||
cuttest(",使用了它就可以解决一些问题")
|
||||
cuttest("其实使用了它就可以解决一些问题")
|
||||
cuttest("好人使用了它就可以解决一些问题")
|
||||
cuttest("是因为和国家")
|
||||
cuttest("老年搜索还支持")
|
||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||
cuttest("大")
|
||||
cuttest("")
|
||||
cuttest("他说的确实在理")
|
||||
cuttest("长春市长春节讲话")
|
||||
cuttest("结婚的和尚未结婚的")
|
||||
cuttest("结合成分子时")
|
||||
cuttest("旅游和服务是最好的")
|
||||
cuttest("这件事情的确是我的错")
|
||||
cuttest("供大家参考指正")
|
||||
cuttest("哈尔滨政府公布塌桥原因")
|
||||
cuttest("我在机场入口处")
|
||||
cuttest("邢永臣摄影报道")
|
||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||
cuttest("南京市长江大桥")
|
||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||
cuttest('长春市长春药店')
|
||||
cuttest('邓颖超生前最喜欢的衣服')
|
||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||
|
@ -6,24 +6,24 @@ sys.path.append("../")
|
||||
import jieba
|
||||
|
||||
class Worker(threading.Thread):
|
||||
def run(self):
|
||||
seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
|
||||
print("Full Mode:" + "/ ".join(seg_list)) #全模式
|
||||
def run(self):
|
||||
seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
|
||||
print("Full Mode:" + "/ ".join(seg_list)) #全模式
|
||||
|
||||
seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
|
||||
print("Default Mode:" + "/ ".join(seg_list)) #默认模式
|
||||
seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
|
||||
print("Default Mode:" + "/ ".join(seg_list)) #默认模式
|
||||
|
||||
seg_list = jieba.cut("他来到了网易杭研大厦")
|
||||
print(", ".join(seg_list))
|
||||
seg_list = jieba.cut("他来到了网易杭研大厦")
|
||||
print(", ".join(seg_list))
|
||||
|
||||
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
|
||||
print(", ".join(seg_list))
|
||||
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
|
||||
print(", ".join(seg_list))
|
||||
workers = []
|
||||
for i in range(10):
|
||||
worker = Worker()
|
||||
workers.append(worker)
|
||||
worker.start()
|
||||
worker = Worker()
|
||||
workers.append(worker)
|
||||
worker.start()
|
||||
|
||||
for worker in workers:
|
||||
worker.join()
|
||||
worker.join()
|
||||
|
||||
|
176
test/test_pos.py
176
test/test_pos.py
@ -4,94 +4,94 @@ sys.path.append("../")
|
||||
import jieba.posseg as pseg
|
||||
|
||||
def cuttest(test_sent):
|
||||
result = pseg.cut(test_sent)
|
||||
for w in result:
|
||||
sys.stdout.write(w.word+ "/"+ w.flag + ", ")
|
||||
print("")
|
||||
result = pseg.cut(test_sent)
|
||||
for w in result:
|
||||
sys.stdout.write(w.word+ "/"+ w.flag + ", ")
|
||||
print("")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||
cuttest("伊藤洋华堂总府店")
|
||||
cuttest("中国科学院计算技术研究所")
|
||||
cuttest("罗密欧与朱丽叶")
|
||||
cuttest("我购买了道具和服装")
|
||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||
cuttest("湖北省石首市")
|
||||
cuttest("湖北省十堰市")
|
||||
cuttest("总经理完成了这件事情")
|
||||
cuttest("电脑修好了")
|
||||
cuttest("做好了这件事情就一了百了了")
|
||||
cuttest("人们审美的观点是不同的")
|
||||
cuttest("我们买了一个美的空调")
|
||||
cuttest("线程初始化时我们要注意")
|
||||
cuttest("一个分子是由好多原子组织成的")
|
||||
cuttest("祝你马到功成")
|
||||
cuttest("他掉进了无底洞里")
|
||||
cuttest("中国的首都是北京")
|
||||
cuttest("孙君意")
|
||||
cuttest("外交部发言人马朝旭")
|
||||
cuttest("领导人会议和第四届东亚峰会")
|
||||
cuttest("在过去的这五年")
|
||||
cuttest("还需要很长的路要走")
|
||||
cuttest("60周年首都阅兵")
|
||||
cuttest("你好人们审美的观点是不同的")
|
||||
cuttest("买水果然后来世博园")
|
||||
cuttest("买水果然后去世博园")
|
||||
cuttest("但是后来我才知道你是对的")
|
||||
cuttest("存在即合理")
|
||||
cuttest("的的的的的在的的的的就以和和和")
|
||||
cuttest("I love你,不以为耻,反以为rong")
|
||||
cuttest("因")
|
||||
cuttest("")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("很好但主要是基于网页形式")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("为什么我不能拥有想要的生活")
|
||||
cuttest("后来我才")
|
||||
cuttest("此次来中国是为了")
|
||||
cuttest("使用了它就可以解决一些问题")
|
||||
cuttest(",使用了它就可以解决一些问题")
|
||||
cuttest("其实使用了它就可以解决一些问题")
|
||||
cuttest("好人使用了它就可以解决一些问题")
|
||||
cuttest("是因为和国家")
|
||||
cuttest("老年搜索还支持")
|
||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||
cuttest("大")
|
||||
cuttest("")
|
||||
cuttest("他说的确实在理")
|
||||
cuttest("长春市长春节讲话")
|
||||
cuttest("结婚的和尚未结婚的")
|
||||
cuttest("结合成分子时")
|
||||
cuttest("旅游和服务是最好的")
|
||||
cuttest("这件事情的确是我的错")
|
||||
cuttest("供大家参考指正")
|
||||
cuttest("哈尔滨政府公布塌桥原因")
|
||||
cuttest("我在机场入口处")
|
||||
cuttest("邢永臣摄影报道")
|
||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||
cuttest("南京市长江大桥")
|
||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||
cuttest('长春市长春药店')
|
||||
cuttest('邓颖超生前最喜欢的衣服')
|
||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||
cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
|
||||
cuttest("我不喜欢日本和服。")
|
||||
cuttest("雷猴回归人间。")
|
||||
cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
|
||||
cuttest("我需要廉租房")
|
||||
cuttest("永和服装饰品有限公司")
|
||||
cuttest("我爱北京天安门")
|
||||
cuttest("abc")
|
||||
cuttest("隐马尔可夫")
|
||||
cuttest("雷猴是个好网站")
|
||||
cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
|
||||
cuttest("草泥马和欺实马是今年的流行词汇")
|
||||
cuttest("伊藤洋华堂总府店")
|
||||
cuttest("中国科学院计算技术研究所")
|
||||
cuttest("罗密欧与朱丽叶")
|
||||
cuttest("我购买了道具和服装")
|
||||
cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍")
|
||||
cuttest("湖北省石首市")
|
||||
cuttest("湖北省十堰市")
|
||||
cuttest("总经理完成了这件事情")
|
||||
cuttest("电脑修好了")
|
||||
cuttest("做好了这件事情就一了百了了")
|
||||
cuttest("人们审美的观点是不同的")
|
||||
cuttest("我们买了一个美的空调")
|
||||
cuttest("线程初始化时我们要注意")
|
||||
cuttest("一个分子是由好多原子组织成的")
|
||||
cuttest("祝你马到功成")
|
||||
cuttest("他掉进了无底洞里")
|
||||
cuttest("中国的首都是北京")
|
||||
cuttest("孙君意")
|
||||
cuttest("外交部发言人马朝旭")
|
||||
cuttest("领导人会议和第四届东亚峰会")
|
||||
cuttest("在过去的这五年")
|
||||
cuttest("还需要很长的路要走")
|
||||
cuttest("60周年首都阅兵")
|
||||
cuttest("你好人们审美的观点是不同的")
|
||||
cuttest("买水果然后来世博园")
|
||||
cuttest("买水果然后去世博园")
|
||||
cuttest("但是后来我才知道你是对的")
|
||||
cuttest("存在即合理")
|
||||
cuttest("的的的的的在的的的的就以和和和")
|
||||
cuttest("I love你,不以为耻,反以为rong")
|
||||
cuttest("因")
|
||||
cuttest("")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("很好但主要是基于网页形式")
|
||||
cuttest("hello你好人们审美的观点是不同的")
|
||||
cuttest("为什么我不能拥有想要的生活")
|
||||
cuttest("后来我才")
|
||||
cuttest("此次来中国是为了")
|
||||
cuttest("使用了它就可以解决一些问题")
|
||||
cuttest(",使用了它就可以解决一些问题")
|
||||
cuttest("其实使用了它就可以解决一些问题")
|
||||
cuttest("好人使用了它就可以解决一些问题")
|
||||
cuttest("是因为和国家")
|
||||
cuttest("老年搜索还支持")
|
||||
cuttest("干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ")
|
||||
cuttest("大")
|
||||
cuttest("")
|
||||
cuttest("他说的确实在理")
|
||||
cuttest("长春市长春节讲话")
|
||||
cuttest("结婚的和尚未结婚的")
|
||||
cuttest("结合成分子时")
|
||||
cuttest("旅游和服务是最好的")
|
||||
cuttest("这件事情的确是我的错")
|
||||
cuttest("供大家参考指正")
|
||||
cuttest("哈尔滨政府公布塌桥原因")
|
||||
cuttest("我在机场入口处")
|
||||
cuttest("邢永臣摄影报道")
|
||||
cuttest("BP神经网络如何训练才能在分类时增加区分度?")
|
||||
cuttest("南京市长江大桥")
|
||||
cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究")
|
||||
cuttest('长春市长春药店')
|
||||
cuttest('邓颖超生前最喜欢的衣服')
|
||||
cuttest('胡锦涛是热爱世界和平的政治局常委')
|
||||
cuttest('程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪')
|
||||
cuttest('一次性交多少钱')
|
||||
cuttest('两块五一套,三块八一斤,四块七一本,五块六一条')
|
||||
cuttest('小和尚留了一个像大和尚一样的和尚头')
|
||||
cuttest('我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站')
|
||||
cuttest('张晓梅去人民医院做了个B超然后去买了件T恤')
|
||||
cuttest('AT&T是一件不错的公司,给你发offer了吗?')
|
||||
cuttest('C++和c#是什么关系?11+122=133,是吗?PI=3.14159')
|
||||
cuttest('你认识那个和主席握手的的哥吗?他开一辆黑色的士。')
|
||||
|
@ -15,7 +15,7 @@ tm_cost = t2-t1
|
||||
|
||||
log_f = open("1.log","wb")
|
||||
for w in words:
|
||||
log_f.write(bytes(w.word+"/"+w.flag+" ",'utf-8'))
|
||||
log_f.write(bytes(w.word+"/"+w.flag+" ",'utf-8'))
|
||||
|
||||
print('speed' , len(content)/tm_cost, " bytes/second")
|
||||
|
||||
|
@ -9,12 +9,12 @@ test_sent = "李小福是创新办主任也是云计算方面的专家;"
|
||||
test_sent += "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类型"
|
||||
words = jieba.cut(test_sent)
|
||||
for w in words:
|
||||
print(w)
|
||||
print(w)
|
||||
|
||||
result = pseg.cut(test_sent)
|
||||
|
||||
for w in result:
|
||||
print(w.word, "/", w.flag, ", ")
|
||||
print(w.word, "/", w.flag, ", ")
|
||||
|
||||
print("\n========")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user