using idf.utf8 in keywordExtractor

2025-07-18 00:00:12 +08:00 · 2013-12-24 06:55:27 -08:00 · 2013-12-24 06:55:27 -08:00 · 62b83a36a0
commit 62b83a36a0
parent 9229fec6ca
3 changed files with 258843 additions and 45 deletions
--- a/dict/idf.utf8
+++ b/dict/idf.utf8
--- a/src/KeywordExtractor.hpp
+++ b/src/KeywordExtractor.hpp
@@ -11,13 +11,12 @@ namespace CppJieba
    struct KeyWordInfo
    {
        string word;
-        uint freq;
        double idf;
    };
    inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
    {
-        return os << keyword.word << "," << keyword.freq << "," << keyword.idf;
+        return os << keyword.word << "," << keyword.idf;
    }
    class KeywordExtractor
@@ -25,9 +24,7 @@ namespace CppJieba
        private:
            MPSegment _segment;
        private:
-            unordered_map<string, const KeyWordInfo* > _wordIndex;
+            unordered_map<string, double> _idfMap;
-            vector<KeyWordInfo> _wordinfos;
-            size_t _totalFreq;
        protected:
            bool _isInited;
            bool _getInitFlag()const{return _isInited;};
@@ -36,22 +33,19 @@ namespace CppJieba
            operator bool(){return _getInitFlag();};
        public:
            KeywordExtractor(){_setInitFlag(false);};
-            explicit KeywordExtractor(const string& dictPath){_setInitFlag(init(dictPath));};
+            explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));};
            ~KeywordExtractor(){};
        public:
-            bool init(const string& dictPath)
+            bool init(const string& dictPath, const string& idfPath)
            {
-                ifstream ifs(dictPath.c_str());
+                ifstream ifs(idfPath.c_str());
                if(!ifs)
                {
-                    LogError("open %s failed.", dictPath.c_str());
+                    LogError("open %s failed.", idfPath.c_str());
                    return false;
                }
-                _totalFreq = 0;
-                int tfreq;
                string line ;
                vector<string> buf;
-                KeyWordInfo keywordInfo;
                for(uint lineno = 0; getline(ifs, line); lineno++)
                {
                    buf.clear();
@@ -60,33 +54,12 @@ namespace CppJieba
                        LogError("line[%d] empty. skipped.", lineno);
                        continue;
                    }
-                    if(!split(line, buf, " ") || buf.size() != 3)
+                    if(!split(line, buf, " ") || buf.size() != 2)
                    {
                        LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
                        continue;
                    }
-                    keywordInfo.word = buf[0];
+                    _idfMap[buf[0]] = atof(buf[1].c_str());
-                    tfreq= atoi(buf[1].c_str());
-                    if(tfreq <= 0)
-                    {
-                        LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
-                        continue;
-                    }
-                    keywordInfo.freq = tfreq;
-                    _totalFreq += tfreq;
-                    _wordinfos.push_back(keywordInfo);
-                }
-                // calculate idf & make index.
-                for(uint i = 0; i < _wordinfos.size(); i++)
-                {
-                    if(_wordinfos[i].freq <= 0)
-                    {
-                        LogFatal("freq value is not positive.");
-                        return false;
-                    }
-                    _wordinfos[i].idf = -log(_wordinfos[i].freq);
-                    _wordIndex[_wordinfos[i].word] = &(_wordinfos[i]);
                }
                return _setInitFlag(_segment.init(dictPath));
            };
@@ -110,10 +83,10 @@ namespace CppJieba
                for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end();)
                {
-                    unordered_map<string, const KeyWordInfo*>::const_iterator cit = _wordIndex.find(itr->first);
+                    unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
-                    if(cit != _wordIndex.end())
+                    if(cit != _idfMap.end())
                    {
-                        itr->second *= cit->second->idf;
+                        itr->second *= cit->second;
                        itr ++;
                    }
                    else
--- a/test/unittest/TKeywordExtractor.cpp
+++ b/test/unittest/TKeywordExtractor.cpp
@@ -5,7 +5,7 @@ using namespace CppJieba;
 TEST(KeywordExtractorTest, Test1)
 {
-    KeywordExtractor extractor("../dict/jieba.dict.utf8");
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
    const char* str = "我来自北京邮电大学。。。  学号 123456";
    const char* res[] = {"北京邮电大学", "来自"};
    vector<string> words;
@@ -16,9 +16,9 @@ TEST(KeywordExtractorTest, Test1)
 TEST(KeywordExtractorTest, Test2)
 {
-    KeywordExtractor extractor("../dict/jieba.dict.utf8");
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
    const char* str = "我来自北京邮电大学。。。  学号 123456";
-    const char* res[] = {"北京邮电大学", "来自", "学", "号", "我"};
+    const char* res[] = {"北京邮电大学", "来自"};
    vector<string> words;
    ASSERT_TRUE(extractor);
    ASSERT_TRUE(extractor.extract(str, words, 9));
@@ -31,10 +31,9 @@ TEST(KeywordExtractorTest, Test3)
    ifstream ifs("../test/testdata/weicheng.utf8");
    ASSERT_TRUE(ifs);
    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
-    KeywordExtractor extractor("../dict/jieba.dict.utf8");
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
    vector<string> keywords;
-    string res;
+    const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
    extractor.extract(str, keywords, 5);
-    res << keywords;
+    ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
-    ASSERT_EQ("[\"第三性\", \"多愁多病\", \"记挂着\", \"揭去\", \"贫血症\"]", res);
 }