diff --git a/jieba/__init__.py b/jieba/__init__.py index b935985..6bd45af 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -153,7 +153,7 @@ def cut(sentence,cut_all=False): sentence = sentence.decode('utf-8') except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"[ ]") + re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"(\s+)") if cut_all: re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"[^a-zA-Z0-9+#\n]") blocks = re_han.split(sentence) @@ -168,8 +168,11 @@ def cut(sentence,cut_all=False): else: tmp = re_skip.split(blk) for x in tmp: - if x!="": + if re_skip.match(x): yield x + else: + for xx in x: + yield xx def cut_for_search(sentence): words = cut(sentence) diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py index 6182c0a..667251c 100644 --- a/jieba/analyse/__init__.py +++ b/jieba/analyse/__init__.py @@ -10,21 +10,25 @@ lines = content.split('\n') for line in lines: word,freq = line.split(' ') idf_freq[word] = float(freq) -max_idf = max(idf_freq.values()) + +median_idf = sorted(idf_freq.values())[len(idf_freq)/2] +stop_words= set([ +"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that" +]) def extract_tags(sentence,topK=20): words = jieba.cut(sentence) freq = {} for w in words: if len(w.strip())<2: continue + if w.lower() in stop_words: continue freq[w]=freq.get(w,0.0)+1.0 total = sum(freq.values()) freq = [(k,v/total) for k,v in freq.iteritems()] - tf_idf_list = [(v * idf_freq.get(k,max_idf),k) for k,v in freq] + tf_idf_list = [(v * idf_freq.get(k,median_idf),k) for k,v in freq] st_list = sorted(tf_idf_list,reverse=True) top_tuples= st_list[:topK] tags = [a[1] for a in top_tuples] return tags - diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index ef247e1..affeac1 
100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -125,7 +125,7 @@ def cut(sentence): sentence = sentence.decode('utf-8') except: sentence = sentence.decode('gbk','ignore') - re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"[ ]") + re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&]+)"), re.compile(ur"(\s+)") re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+") blocks = re_han.split(sentence) for blk in blocks: @@ -135,10 +135,13 @@ def cut(sentence): else: tmp = re_skip.split(blk) for x in tmp: - if x!="": - if re_num.match(x): - yield pair(x,'m') - elif re_eng.match(x): - yield pair(x,'eng') - else: - yield pair(x,'x') + if re_skip.match(x): + yield pair(x,'') + else: + for xx in x: + if re_num.match(xx): + yield pair(xx,'m') + elif re_eng.match(xx): + yield pair(xx,'eng') + else: + yield pair(xx,'x')