From b658ee69cbc12423d0d3413b8bfe439b2d43ecd8 Mon Sep 17 00:00:00 2001
From: Fukuball Lin
Date: Wed, 6 Aug 2014 03:35:16 +0800
Subject: [PATCH] Let users add their own stop words corpus to jieba
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Add a sample stop words corpus
2. Add a set_stop_words method so that jieba can switch stop words corpora,
   and rewrite extract_tags accordingly
3. Add an extract_tags_stop_words.py test example under test/

---
 extra_dict/stop_words.txt       | 51 +++++++++++++++++++++++++++++++++
 jieba/analyse/__init__.py       | 23 ++++++++++++---
 test/extract_tags_stop_words.py | 33 +++++++++++++++++++++
 3 files changed, 103 insertions(+), 4 deletions(-)
 create mode 100644 extra_dict/stop_words.txt
 create mode 100644 test/extract_tags_stop_words.py

diff --git a/extra_dict/stop_words.txt b/extra_dict/stop_words.txt
new file mode 100644
index 0000000..1cf8259
--- /dev/null
+++ b/extra_dict/stop_words.txt
@@ -0,0 +1,51 @@
+the
+of
+is
+and
+to
+in
+that
+we
+for
+an
+are
+by
+be
+as
+on
+with
+can
+if
+from
+which
+you
+it
+this
+then
+at
+have
+all
+not
+one
+has
+or
+that
+的
+了
+和
+是
+就
+都
+而
+及
+與
+著
+或
+一個
+沒有
+我們
+你們
+妳們
+他們
+她們
+是否
\ No newline at end of file
diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py
index 9832e62..2c6a04b 100755
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -8,7 +8,11 @@ except ImportError:
 _curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
 abs_path = os.path.join(_curpath, "idf.txt")
 
+
 IDF_DICTIONARY = abs_path
+STOP_WORDS = set([
+    "the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
+])
 
 def set_idf_path(idf_path):
     global IDF_DICTIONARY
@@ -28,17 +32,28 @@ def get_idf(abs_path):
     median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
     return idf_freq, median_idf
 
+def set_stop_words(stop_words_path):
+    global STOP_WORDS
+    abs_path = os.path.normpath( os.path.join( os.getcwd(), stop_words_path ) )
+    if not os.path.exists(abs_path):
+        raise Exception("jieba: path does not exist:" + abs_path)
+    content = open(abs_path,'rb').read().decode('utf-8')
+    lines = content.split('\n')
+    for line in lines:
+        STOP_WORDS.add(line)
+    return
+
 def extract_tags(sentence,topK=20):
     global IDF_DICTIONARY
+    global STOP_WORDS
+
     idf_freq, median_idf = get_idf(IDF_DICTIONARY)
-    stop_words= set([
-        "the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
-    ])
+
     words = jieba.cut(sentence)
     freq = {}
     for w in words:
         if len(w.strip())<2: continue
-        if w.lower() in stop_words: continue
+        if w.lower() in STOP_WORDS: continue
         freq[w]=freq.get(w,0.0)+1.0
     total = sum(freq.values())
     freq = [(k,v/total) for k,v in freq.iteritems()]
diff --git a/test/extract_tags_stop_words.py b/test/extract_tags_stop_words.py
new file mode 100644
index 0000000..b174a91
--- /dev/null
+++ b/test/extract_tags_stop_words.py
@@ -0,0 +1,33 @@
+import sys
+sys.path.append('../')
+
+import jieba
+import jieba.analyse
+from optparse import OptionParser
+
+USAGE = "usage: python extract_tags_stop_words.py [file name] -k [top k]"
+
+parser = OptionParser(USAGE)
+parser.add_option("-k", dest="topK")
+opt, args = parser.parse_args()
+
+
+if len(args) < 1:
+    print USAGE
+    sys.exit(1)
+
+file_name = args[0]
+
+if opt.topK is None:
+    topK = 10
+else:
+    topK = int(opt.topK)
+
+content = open(file_name, 'rb').read()
+
+jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
+jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
+
+tags = jieba.analyse.extract_tags(content, topK=topK)
+
+print ",".join(tags)
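
For reference, a minimal Python 2 sketch (not part of the patch) of how the new API is meant to be used once this change is applied; the corpus path user_stop_words.txt and the sample sentence are hypothetical placeholders:

    # -*- coding: utf-8 -*-
    import jieba
    import jieba.analyse

    # Load an additional stop words corpus (one word per line, UTF-8).
    # Its entries are added to the module-level STOP_WORDS set.
    jieba.analyse.set_stop_words("user_stop_words.txt")  # hypothetical path

    # Stop words are now skipped during TF-IDF keyword extraction.
    tags = jieba.analyse.extract_tags(u"我們都喜歡用 jieba 做中文斷詞", topK=5)
    print ",".join(tags)

Note that set_stop_words extends the built-in English stop words set rather than replacing it, so the defaults remain in effect alongside the user-supplied corpus.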