Mirror of https://github.com/fxsjy/jieba.git (synced 2025-07-10 00:01:33 +08:00)
Merge pull request #174 from fukuball/master
Allow jieba to switch the IDF corpus and the stop words corpus
This commit is contained in:
commit 16d626d347
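In short, this PR adds jieba.analyse.set_idf_path() and jieba.analyse.set_stop_words() so that extract_tags() can run against a custom IDF corpus and a custom stop words list. A minimal usage sketch, assuming the relative paths used by the new test scripts in this PR; the input sentence is illustrative only:

#encoding=utf-8
# Sketch only: paths follow the test scripts added in this PR; the sentence is illustrative.
import jieba
import jieba.analyse

jieba.analyse.set_idf_path("../extra_dict/idf.txt.big")       # switch the IDF corpus
jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")  # switch the stop words list

tags = jieba.analyse.extract_tags(u"我们在野生动物园玩", topK=10)
print ",".join(tags)  # Python 2 print statement, matching the code base of this era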
176239 extra_dict/idf.txt.big Normal file
File diff suppressed because it is too large
51 extra_dict/stop_words.txt Normal file
@@ -0,0 +1,51 @@
the
of
is
and
to
in
that
we
for
an
are
by
be
as
on
with
can
if
from
which
you
it
this
then
at
have
all
not
one
has
or
that
的
了
和
是
就
都
而
及
與
著
或
一個
沒有
我們
你們
妳們
他們
她們
是否
jieba/__init__.py
@@ -39,7 +39,7 @@ def gen_trie(f_name):
     trie = {}
     ltotal = 0.0
     with open(f_name, 'rb') as f:
-        lineno = 0
+        lineno = 0
         for line in f.read().rstrip().decode('utf-8').split('\n'):
             lineno += 1
             try:
@@ -134,7 +134,7 @@ def __cut_all(sentence):
     for k,L in dag.iteritems():
         if len(L)==1 and k>old_j:
             yield sentence[k:L[0]+1]
-            old_j = L[0]
+            old_j = L[0]
         else:
             for j in L:
                 if j>k:
@@ -195,7 +195,7 @@ def __cut_DAG_NO_HMM(sentence):
             if len(buf)>0:
                 yield buf
                 buf = u''
-            yield l_word
+            yield l_word
             x =y
     if len(buf)>0:
         yield buf
@@ -227,7 +227,7 @@ def __cut_DAG(sentence):
                         for elem in buf:
                             yield elem
                     buf=u''
-            yield l_word
+            yield l_word
             x =y

     if len(buf)>0:
@@ -243,8 +243,8 @@ def __cut_DAG(sentence):
                 yield elem

 def cut(sentence,cut_all=False,HMM=True):
-    '''The main function that segments an entire sentence that contains
-    Chinese characters into seperated words.
+    '''The main function that segments an entire sentence that contains
+    Chinese characters into seperated words.
     Parameter:
         - sentence: The String to be segmented
         - cut_all: Model. True means full pattern, false means accurate pattern.
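As an aside, a minimal sketch of the two modes this docstring describes; the sample sentence is illustrative and the syntax is Python 2 to match the code above:

#encoding=utf-8
import jieba

sent = u"我来到北京清华大学"
print "/".join(jieba.cut(sent))                 # accurate pattern (default), HMM enabled
print "/".join(jieba.cut(sent, cut_all=True))   # full pattern
print "/".join(jieba.cut(sent, HMM=False))      # accurate pattern without HMM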
@@ -257,8 +257,8 @@ def cut(sentence,cut_all=False,HMM=True):
             sentence = sentence.decode('gbk','ignore')
     '''
     \u4E00-\u9FA5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
-    \r\n|\s : whitespace characters. Will not be Handled.
-    '''
+    \r\n|\s : whitespace characters. Will not be Handled.
+    '''
     re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
     if cut_all:
         re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
@@ -306,7 +306,7 @@ def load_userdict(f):
     ''' Load personalized dict to improve detect rate.
     Parameter:
         - f : A plain text file contains words and their ocurrences.
-    Structure of dict file:
+    Structure of dict file:
     word1 freq1 word_type1
     word2 freq2 word_type2
     ...
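A small sketch of load_userdict() with the file layout described in this docstring; the file name userdict.txt and its entries are hypothetical:

#encoding=utf-8
import jieba

# userdict.txt, one "word freq word_type" entry per line, for example:
#   云计算 5 n
#   创新办 3 i
jieba.load_userdict("userdict.txt")
print "/".join(jieba.cut(u"创新办主任也是云计算方面的专家"))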
@@ -372,7 +372,7 @@ def enable_parallel(processnum=None):
     def pcut(sentence,cut_all=False,HMM=True):
         parts = re.compile('([\r\n]+)').split(sentence)
         if cut_all:
-            result = pool.map(__lcut_all,parts)
+            result = pool.map(__lcut_all,parts)
         else:
             if HMM:
                 result = pool.map(__lcut,parts)
@@ -418,7 +418,7 @@ def tokenize(unicode_sentence,mode="default",HMM=True):
     #mode ("default" or "search")
     if not isinstance(unicode_sentence, unicode):
         raise Exception("jieba: the input parameter should unicode.")
-    start = 0
+    start = 0
     if mode=='default':
         for w in cut(unicode_sentence,HMM=HMM):
             width = len(w)
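For reference, a minimal sketch of tokenize(), which takes a unicode string and yields (word, start, end) tuples; the sentence is illustrative:

#encoding=utf-8
import jieba

for tk in jieba.tokenize(u"永和服装饰品有限公司"):
    print "word %s\t\t start: %d \t\t end: %d" % (tk[0], tk[1], tk[2])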
jieba/analyse/__init__.py
@@ -1,3 +1,4 @@
+#encoding=utf-8
 import jieba
 import os
 try:
@@ -5,27 +6,54 @@ try:
 except ImportError:
     pass

-_curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
-f_name = os.path.join(_curpath,"idf.txt")
-content = open(f_name,'rb').read().decode('utf-8')
+_curpath = os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
+abs_path = os.path.join(_curpath, "idf.txt")

-idf_freq = {}
-lines = content.split('\n')
-for line in lines:
-    word,freq = line.split(' ')
-    idf_freq[word] = float(freq)
-
-median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
-stop_words= set([
-"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
+IDF_DICTIONARY = abs_path
+STOP_WORDS = set([
+"the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
 ])

+def set_idf_path(idf_path):
+    global IDF_DICTIONARY
+    abs_path = os.path.normpath( os.path.join( os.getcwd(), idf_path ) )
+    if not os.path.exists(abs_path):
+        raise Exception("jieba: path does not exist:" + abs_path)
+    IDF_DICTIONARY = abs_path
+    return
+
+def get_idf(abs_path):
+    content = open(abs_path,'rb').read().decode('utf-8')
+    idf_freq = {}
+    lines = content.split('\n')
+    for line in lines:
+        word,freq = line.split(' ')
+        idf_freq[word] = float(freq)
+    median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
+    return idf_freq, median_idf
+
+def set_stop_words(stop_words_path):
+    global STOP_WORDS
+    abs_path = os.path.normpath( os.path.join( os.getcwd(), stop_words_path ) )
+    if not os.path.exists(abs_path):
+        raise Exception("jieba: path does not exist:" + abs_path)
+    content = open(abs_path,'rb').read().decode('utf-8')
+    lines = content.split('\n')
+    for line in lines:
+        STOP_WORDS.add(line)
+    return
+
 def extract_tags(sentence,topK=20):
+    global IDF_DICTIONARY
+    global STOP_WORDS
+
+    idf_freq, median_idf = get_idf(IDF_DICTIONARY)
+
     words = jieba.cut(sentence)
     freq = {}
     for w in words:
         if len(w.strip())<2: continue
-        if w.lower() in stop_words: continue
+        if w.lower() in STOP_WORDS: continue
         freq[w]=freq.get(w,0.0)+1.0
     total = sum(freq.values())
     freq = [(k,v/total) for k,v in freq.iteritems()]
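As get_idf() above shows, the corpus handed to set_idf_path() is expected to contain one word and its IDF value per line, separated by a single space. A hypothetical snippet with made-up values, parsed the same way:

#encoding=utf-8
# Made-up entries, only to illustrate the "word idf_value" layout that get_idf() parses.
sample = u"劳动防护 13.9007\n生化学 13.9007\n奥萨贝尔 13.9007"
idf_freq = {}
for line in sample.split('\n'):
    word, freq = line.split(' ')
    idf_freq[word] = float(freq)
median_idf = sorted(idf_freq.values())[len(idf_freq)/2]  # integer index under Python 2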
32 test/extract_tags_idfpath.py Normal file
@@ -0,0 +1,32 @@
import sys
sys.path.append('../')

import jieba
import jieba.analyse
from optparse import OptionParser

USAGE = "usage: python extract_tags_idfpath.py [file name] -k [top k]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()


if len(args) < 1:
    print USAGE
    sys.exit(1)

file_name = args[0]

if opt.topK is None:
    topK = 10
else:
    topK = int(opt.topK)

content = open(file_name, 'rb').read()

jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");

tags = jieba.analyse.extract_tags(content, topK=topK)

print ",".join(tags)
33 test/extract_tags_stop_words.py Normal file
@@ -0,0 +1,33 @@
import sys
sys.path.append('../')

import jieba
import jieba.analyse
from optparse import OptionParser

USAGE = "usage: python extract_tags_stop_words.py [file name] -k [top k]"

parser = OptionParser(USAGE)
parser.add_option("-k", dest="topK")
opt, args = parser.parse_args()


if len(args) < 1:
    print USAGE
    sys.exit(1)

file_name = args[0]

if opt.topK is None:
    topK = 10
else:
    topK = int(opt.topK)

content = open(file_name, 'rb').read()

jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");

tags = jieba.analyse.extract_tags(content, topK=topK)

print ",".join(tags)