mirror of
https://github.com/fxsjy/jieba.git
synced 2025-07-10 00:01:33 +08:00
讓 jieba 可以自行增加 stop words 語料庫
1. 增加範例 stop words 語料庫 2. 為了讓 jieba 可以切換 stop words 語料庫,新增 set_stop_words 方法,並改寫 extract_tags 3. test 增加 extract_tags_stop_words.py 測試範例
This commit is contained in:
parent
7198d562f1
commit
b658ee69cb
51
extra_dict/stop_words.txt
Normal file
51
extra_dict/stop_words.txt
Normal file
@ -0,0 +1,51 @@
|
||||
the
|
||||
of
|
||||
is
|
||||
and
|
||||
to
|
||||
in
|
||||
that
|
||||
we
|
||||
for
|
||||
an
|
||||
are
|
||||
by
|
||||
be
|
||||
as
|
||||
on
|
||||
with
|
||||
can
|
||||
if
|
||||
from
|
||||
which
|
||||
you
|
||||
it
|
||||
this
|
||||
then
|
||||
at
|
||||
have
|
||||
all
|
||||
not
|
||||
one
|
||||
has
|
||||
or
|
||||
that
|
||||
的
|
||||
了
|
||||
和
|
||||
是
|
||||
就
|
||||
都
|
||||
而
|
||||
及
|
||||
與
|
||||
著
|
||||
或
|
||||
一個
|
||||
沒有
|
||||
我們
|
||||
你們
|
||||
妳們
|
||||
他們
|
||||
她們
|
||||
是否
|
@ -8,7 +8,11 @@ except ImportError:
|
||||
|
||||
_curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
|
||||
abs_path = os.path.join(_curpath, "idf.txt")
|
||||
|
||||
IDF_DICTIONARY = abs_path
|
||||
STOP_WORDS = set([
|
||||
"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
|
||||
])
|
||||
|
||||
def set_idf_path(idf_path):
|
||||
global IDF_DICTIONARY
|
||||
@ -28,17 +32,28 @@ def get_idf(abs_path):
|
||||
median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
|
||||
return idf_freq, median_idf
|
||||
|
||||
def set_stop_words(stop_words_path):
    """Extend the module-level STOP_WORDS set from a UTF-8 text file.

    The file is expected to hold one stop word per line. Words are added
    to (not replacing) the global STOP_WORDS set, so the built-in English
    defaults remain active.

    Args:
        stop_words_path: path to the stop-word file, resolved relative to
            the current working directory (matches set_idf_path behavior).

    Raises:
        Exception: if the resolved path does not exist.
    """
    global STOP_WORDS
    abs_path = os.path.normpath(os.path.join(os.getcwd(), stop_words_path))
    if not os.path.exists(abs_path):
        raise Exception("jieba: path does not exist:" + abs_path)
    # Context manager guarantees the handle is closed (the original
    # relied on garbage collection to release it).
    with open(abs_path, 'rb') as f:
        content = f.read().decode('utf-8')
    for line in content.split('\n'):
        # Strip whitespace so CRLF files don't yield "word\r" entries
        # (which would never match a cut token), and skip blank lines.
        word = line.strip()
        if word:
            STOP_WORDS.add(word)
    return
|
||||
|
||||
def extract_tags(sentence,topK=20):
|
||||
global IDF_DICTIONARY
|
||||
global STOP_WORDS
|
||||
|
||||
idf_freq, median_idf = get_idf(IDF_DICTIONARY)
|
||||
stop_words= set([
|
||||
"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
|
||||
])
|
||||
|
||||
words = jieba.cut(sentence)
|
||||
freq = {}
|
||||
for w in words:
|
||||
if len(w.strip())<2: continue
|
||||
if w.lower() in stop_words: continue
|
||||
if w.lower() in STOP_WORDS: continue
|
||||
freq[w]=freq.get(w,0.0)+1.0
|
||||
total = sum(freq.values())
|
||||
freq = [(k,v/total) for k,v in freq.iteritems()]
|
||||
|
33
test/extract_tags_stop_words.py
Normal file
33
test/extract_tags_stop_words.py
Normal file
@ -0,0 +1,33 @@
|
||||
import sys
sys.path.append('../')

import jieba
import jieba.analyse
from optparse import OptionParser

USAGE = "usage: python extract_tags_stop_words.py [file name] -k [top k]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()

if len(args) < 1:
    # No input file supplied: show usage and exit with an error status.
    # Parenthesized single-argument print works identically on
    # Python 2 and Python 3 (the original bare statement was py2-only).
    print(USAGE)
    sys.exit(1)

file_name = args[0]

# Default to the 10 highest-weighted tags when -k is not given.
if opt.topK is None:
    topK = 10
else:
    topK = int(opt.topK)

# Read raw bytes; jieba decodes encoded input itself. The context
# manager closes the handle (the original leaked it until GC).
with open(file_name, 'rb') as f:
    content = f.read()

# Install the custom stop-word list and the larger IDF dictionary
# BEFORE extracting, so both affect the keyword ranking below.
jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
jieba.analyse.set_idf_path("../extra_dict/idf.txt.big")

tags = jieba.analyse.extract_tags(content, topK=topK)

print(",".join(tags))
|
Loading…
x
Reference in New Issue
Block a user