From b658ee69cbc12423d0d3413b8bfe439b2d43ecd8 Mon Sep 17 00:00:00 2001
From: Fukuball Lin
Date: Wed, 6 Aug 2014 03:35:16 +0800
Subject: [PATCH] Let users add their own stop words corpus to jieba
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Add a sample stop words corpus
2. Add a set_stop_words method so that jieba can switch stop words corpora,
   and rewrite extract_tags accordingly
3. Add an extract_tags_stop_words.py test example under test/

---
 extra_dict/stop_words.txt       | 51 +++++++++++++++++++++++++++++++++
 jieba/analyse/__init__.py       | 23 ++++++++++++---
 test/extract_tags_stop_words.py | 33 +++++++++++++++++++++
 3 files changed, 103 insertions(+), 4 deletions(-)
 create mode 100644 extra_dict/stop_words.txt
 create mode 100644 test/extract_tags_stop_words.py

diff --git a/extra_dict/stop_words.txt b/extra_dict/stop_words.txt
new file mode 100644
index 0000000..1cf8259
--- /dev/null
+++ b/extra_dict/stop_words.txt
@@ -0,0 +1,51 @@
+the
+of
+is
+and
+to
+in
+that
+we
+for
+an
+are
+by
+be
+as
+on
+with
+can
+if
+from
+which
+you
+it
+this
+then
+at
+have
+all
+not
+one
+has
+or
+that
+的
+了
+和
+是
+就
+都
+而
+及
+與
+著
+或
+一個
+沒有
+我們
+你們
+妳們
+他們
+她們
+是否
\ No newline at end of file
diff --git a/jieba/analyse/__init__.py b/jieba/analyse/__init__.py
index 9832e62..2c6a04b 100755
--- a/jieba/analyse/__init__.py
+++ b/jieba/analyse/__init__.py
@@ -8,7 +8,11 @@ except ImportError:
 _curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
 abs_path = os.path.join(_curpath, "idf.txt")
 
+
 IDF_DICTIONARY = abs_path
+STOP_WORDS = set([
+    "the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
+])
 
 def set_idf_path(idf_path):
     global IDF_DICTIONARY
@@ -28,17 +32,28 @@ def get_idf(abs_path):
     median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
     return idf_freq, median_idf
 
+def set_stop_words(stop_words_path):
+    global STOP_WORDS
+    abs_path = os.path.normpath( os.path.join( os.getcwd(), stop_words_path ) )
+    if not os.path.exists(abs_path):
+        raise Exception("jieba: path does not exist:" + abs_path)
+    content = open(abs_path,'rb').read().decode('utf-8')
+    lines = content.split('\n')
+    for line in lines:
+        STOP_WORDS.add(line)
+    return
+
 def extract_tags(sentence,topK=20):
     global IDF_DICTIONARY
+    global STOP_WORDS
+
     idf_freq, median_idf = get_idf(IDF_DICTIONARY)
-    stop_words= set([
-        "the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
-    ])
+
     words = jieba.cut(sentence)
     freq = {}
     for w in words:
         if len(w.strip())<2: continue
-        if w.lower() in stop_words: continue
+        if w.lower() in STOP_WORDS: continue
         freq[w]=freq.get(w,0.0)+1.0
     total = sum(freq.values())
     freq = [(k,v/total) for k,v in freq.iteritems()]
diff --git a/test/extract_tags_stop_words.py b/test/extract_tags_stop_words.py
new file mode 100644
index 0000000..b174a91
--- /dev/null
+++ b/test/extract_tags_stop_words.py
@@ -0,0 +1,33 @@
+import sys
+sys.path.append('../')
+
+import jieba
+import jieba.analyse
+from optparse import OptionParser
+
+USAGE = "usage: python extract_tags_stop_words.py [file name] -k [top k]"
+
+parser = OptionParser(USAGE)
+parser.add_option("-k", dest="topK")
+opt, args = parser.parse_args()
+
+
+if len(args) < 1:
+    print USAGE
+    sys.exit(1)
+
+file_name = args[0]
+
+if opt.topK is None:
+    topK = 10
+else:
+    topK = int(opt.topK)
+
+content = open(file_name, 'rb').read()
+
+jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
+jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
+
+tags = jieba.analyse.extract_tags(content, topK=topK)
+
+print ",".join(tags)
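
For reference, a minimal Python 2 sketch (not part of the patch) of how the new API is meant to be used once this change is applied; the corpus path user_stop_words.txt and the sample sentence are hypothetical placeholders:

    # -*- coding: utf-8 -*-
    import jieba
    import jieba.analyse

    # Load an additional stop words corpus (one word per line, UTF-8).
    # Its entries are added to the module-level STOP_WORDS set.
    jieba.analyse.set_stop_words("user_stop_words.txt")  # hypothetical path

    # Stop words are now skipped during TF-IDF keyword extraction.
    tags = jieba.analyse.extract_tags(u"我們都喜歡用 jieba 做中文斷詞", topK=5)
    print ",".join(tags)

Note that set_stop_words extends the built-in English stop words set rather than replacing it, so the defaults remain in effect alongside the user-supplied corpus.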