punctuation; improve keywords extraction

This commit is contained in:
Sun Junyi 2013-04-06 14:02:11 +08:00
parent 7d227da5c4
commit 659326c4e1
3 changed files with 23 additions and 13 deletions

View File

@ -153,7 +153,7 @@ def cut(sentence,cut_all=False):
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"[ ]")
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"(\s+)")
if cut_all:
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]")
blocks = re_han.split(sentence)
@ -168,8 +168,11 @@ def cut(sentence,cut_all=False):
else:
tmp = re_skip.split(blk)
for x in tmp:
if x!="":
if re_skip.match(x):
yield x
else:
for xx in x:
yield xx
def cut_for_search(sentence):
words = cut(sentence)

View File

@ -10,21 +10,25 @@ lines = content.split('\n')
# Build the word -> IDF table from the "word freq" lines loaded above.
# Each line is expected to hold exactly one space-separated pair; a line of
# any other shape makes the tuple unpacking raise ValueError.
for line in lines:
    word, freq = line.split(' ')
    idf_freq[word] = float(freq)

# Largest IDF value in the table.
max_idf = max(idf_freq.values())

# Middle element of the sorted IDF values; extract_tags uses it as the
# fallback weight for words missing from the table. Floor division keeps the
# index an integer regardless of the interpreter's division semantics.
median_idf = sorted(idf_freq.values())[len(idf_freq) // 2]

# Common English function words that are never reported as keywords.
stop_words = set([
    "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
    "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
    "this", "then", "at", "have", "all", "not", "one", "has", "or",
])
def extract_tags(sentence,topK=20):
    """Return up to ``topK`` keywords of ``sentence`` ranked by TF-IDF.

    The sentence is segmented with ``jieba.cut``. Tokens shorter than two
    characters after stripping whitespace, and tokens whose lowercase form
    is in ``stop_words``, are ignored. Each remaining token's relative
    frequency is multiplied by its entry in ``idf_freq``; tokens missing
    from that table are weighted with ``median_idf``.

    Args:
        sentence: text to analyse; it is passed unchanged to ``jieba.cut``.
        topK: maximum number of keywords to return.

    Returns:
        A list of tokens ordered from highest to lowest TF-IDF score. Ties
        are ordered by the token itself, descending, because the
        (score, token) tuples are sorted as a whole.
    """
    counts = {}
    for w in jieba.cut(sentence):
        if len(w.strip()) < 2:
            continue
        if w.lower() in stop_words:
            continue
        counts[w] = counts.get(w, 0.0) + 1.0

    # Counts are floats, so the division below is true division on every
    # interpreter. When nothing survived the filters the comprehension is
    # empty and ``total`` (0) is never used as a divisor.
    total = sum(counts.values())
    scored = [
        (count / total * idf_freq.get(word, median_idf), word)
        for word, count in counts.items()
    ]

    ranked = sorted(scored, reverse=True)
    return [word for _, word in ranked[:topK]]

View File

@ -125,7 +125,7 @@ def cut(sentence):
sentence = sentence.decode('utf-8')
except:
sentence = sentence.decode('gbk','ignore')
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"[ ]")
re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"(\s+)")
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
blocks = re_han.split(sentence)
for blk in blocks:
@ -135,10 +135,13 @@ def cut(sentence):
else:
tmp = re_skip.split(blk)
for x in tmp:
if x!="":
if re_num.match(x):
yield pair(x,'m')
elif re_eng.match(x):
yield pair(x,'eng')
else:
yield pair(x,'x')
if re_skip.match(x):
yield pair(x,'')
else:
for xx in x:
if re_num.match(xx):
yield pair(xx,'m')
elif re_eng.match(x):
yield pair(xx,'eng')
else:
yield pair(xx,'x')