mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
punctuation; improve keywords extraction
This commit is contained in:
parent
7d227da5c4
commit
659326c4e1
@ -153,7 +153,7 @@ def cut(sentence,cut_all=False):
|
||||
sentence = sentence.decode('utf-8')
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"[ ]")
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"(\s+)")
|
||||
if cut_all:
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
|
||||
blocks = re_han.split(sentence)
|
||||
@ -168,8 +168,11 @@ def cut(sentence,cut_all=False):
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if x!="":
|
||||
if re_skip.match(x):
|
||||
yield x
|
||||
else:
|
||||
for xx in x:
|
||||
yield xx
|
||||
|
||||
def cut_for_search(sentence):
|
||||
words = cut(sentence)
|
||||
|
@ -10,21 +10,25 @@ lines = content.split('\n')
|
||||
# Fill the IDF table from the raw text lines: each line is split on a single
# space into a word and its IDF weight.
for line in lines:
    if not line.strip():
        # Skip blank lines (e.g. one produced by a trailing newline in the
        # data), which cannot be unpacked into a (word, weight) pair.
        continue
    word,freq = line.split(' ')
    idf_freq[word] = float(freq)
max_idf = max(idf_freq.values())

# Median IDF weight of the table.  Floor division keeps the index an integer
# on both Python 2 and Python 3.
median_idf = sorted(idf_freq.values())[len(idf_freq)//2]

# Common English function words that are never reported as keywords; tokens
# are lower-cased before being checked against this set.
stop_words= set([
    "the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or"
])
|
||||
|
||||
def extract_tags(sentence,topK=20):
    """Return up to ``topK`` keywords of ``sentence`` ranked by TF-IDF.

    The sentence is segmented with ``jieba.cut``.  Tokens shorter than two
    characters after stripping whitespace, and tokens whose lower-cased form
    is in ``stop_words``, are ignored.  Every remaining token is scored as
    its relative frequency within the sentence multiplied by its weight in
    ``idf_freq``; tokens missing from ``idf_freq`` are weighted with
    ``median_idf``.

    :param sentence: text to analyse; passed unchanged to ``jieba.cut``.
    :param topK: maximum number of tags to return.
    :return: list of tokens, highest score first.  Tokens with equal scores
        are ordered by the token itself, descending, because the
        ``(score, token)`` tuples are sorted in reverse.
    """
    freq = {}
    for w in jieba.cut(sentence):
        if len(w.strip())<2: continue
        if w.lower() in stop_words: continue
        freq[w] = freq.get(w,0.0)+1.0

    if not freq:
        # Nothing survived filtering, so there is nothing to rank.
        return []

    # Counts are floats, so this is true division on Python 2 as well.
    total = sum(freq.values())

    # Unknown words fall back to the median IDF weight.
    tf_idf_list = [
        (count/total * idf_freq.get(word,median_idf), word)
        for word,count in freq.items()
    ]
    tf_idf_list.sort(reverse=True)

    return [word for _, word in tf_idf_list[:topK]]
|
||||
|
||||
|
@ -125,7 +125,7 @@ def cut(sentence):
|
||||
sentence = sentence.decode('utf-8')
|
||||
except:
|
||||
sentence = sentence.decode('gbk','ignore')
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"[ ]")
|
||||
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"(\s+)")
|
||||
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
|
||||
blocks = re_han.split(sentence)
|
||||
for blk in blocks:
|
||||
@ -135,10 +135,13 @@ def cut(sentence):
|
||||
else:
|
||||
tmp = re_skip.split(blk)
|
||||
for x in tmp:
|
||||
if x!="":
|
||||
if re_num.match(x):
|
||||
yield pair(x,'m')
|
||||
elif re_eng.match(x):
|
||||
yield pair(x,'eng')
|
||||
if re_skip.match(x):
|
||||
yield pair(x,'')
|
||||
else:
|
||||
yield pair(x,'x')
|
||||
for xx in x:
|
||||
if re_num.match(xx):
|
||||
yield pair(xx,'m')
|
||||
elif re_eng.match(x):
|
||||
yield pair(xx,'eng')
|
||||
else:
|
||||
yield pair(xx,'x')
|
||||
|
Loading…
x
Reference in New Issue
Block a user