Merge branch 'dev' of https://github.com/aszxqw/cppjieba into dev

2025-07-18 00:00:12 +08:00 · 2014-01-27 01:54:01 +08:00 · 2014-01-27 01:54:01 +08:00 · 8e2c726a8c
commit 8e2c726a8c
parent e23a3f555b 5f96dcf09a
11 changed files with 196 additions and 87 deletions
--- a/13
+++ b/13
@ -1,13 +0,0 @@
-           DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
-                    Version 2, December 2004
-
- Copyright (C) 2013 Yanyi Wu <wuyanyi09@gmail.com>
-
- Everyone is permitted to copy and distribute verbatim or modified
- copies of this license document, and changing it is allowed as long
- as the name is changed.
-
-            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
-   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-
-  0. You just DO WHAT THE FUCK YOU WANT TO.
--- a/20
+++ b/20
@ -0,0 +1,20 @@
+The MIT License (MIT)
+
+Copyright (c) 2013 Yanyi Wu
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/README.md
+++ b/README.md
@ -10,6 +10,11 @@

 ## 安装与使用

+### 依赖
+
+* g++ (version >= 4.6);
+* cmake (version >= 2.8);
+
 ### 下载和安装

 ```sh
--- a/dict/README.md
+++ b/dict/README.md
@ -0,0 +1,29 @@
+# CppJieba字典
+
+文件后缀名代表的是词典的编码方式。
+比如 filename.utf8 是 UTF-8 编码，filename.gbk 是 GBK 编码。
+
+
+## 分词
+
+### jieba.dict.utf8/gbk
+
+作为最大概率法(MPSegment: Max Probability)分词所使用的词典。
+
+### hmm_model.utf8/gbk
+
+作为隐马尔可夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。
+
+__对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__
+
+
+## 关键词抽取
+
+### idf.utf8
+
+IDF(Inverse Document Frequency，逆文档频率)词典。
+在KeywordExtractor中，使用的是经典的TF-IDF算法，所以需要这么一个词典提供IDF信息。
+
+
+
+
--- a/src/KeywordExtractor.hpp
+++ b/src/KeywordExtractor.hpp
@ -1,43 +1,36 @@
 #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
 #define CPPJIEBA_KEYWORD_EXTRACTOR_H

-#include "MPSegment.hpp"
+#include "MixSegment.hpp"
 #include <cmath>
+#include <unordered_set>
 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))

 namespace CppJieba
 {
    using namespace Limonp;

-    //struct KeyWordInfo
-    //{
-    //    string word;
-    //    double tfidf;
-    //};
+    /* UTF-8 encoded words that are never reported as keywords; KeywordExtractor::init copies them into _blackSet. */
+    const char * BLACK_LIST[] = {"我们", "他们"}; // NOTE(review): array of non-const pointers defined in a header has external linkage; including this header from more than one translation unit would define it twice -- consider `const char * const`.

-    //inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
-    //{
-    //    return os << keyword.word << "," << keyword.idf;
-    //}
-
-    class KeywordExtractor
+    class KeywordExtractor: public InitOnOff
    {
        private:
-            MPSegment _segment;
+            MixSegment _segment;
        private:
            unordered_map<string, double> _idfMap;
-        protected:
-            bool _isInited;
-            bool _getInitFlag()const{return _isInited;};
-            bool _setInitFlag(bool flag){return _isInited = flag;};
-        public:
-            operator bool(){return _getInitFlag();};
+            double _idfAverage;
+
+            unordered_set<string> _blackSet;
        public:
            KeywordExtractor(){_setInitFlag(false);};
-            explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));};
+            explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath) // Constructs the extractor and immediately runs init() on the three given paths.
+            {
+                _setInitFlag(init(dictPath, hmmFilePath, idfPath)); // Store init()'s result so the object can afterwards be tested as a bool.
+            };
            ~KeywordExtractor(){};
        public:
-            bool init(const string& dictPath, const string& idfPath)
+            bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath)
            {
                ifstream ifs(idfPath.c_str());
                if(!ifs)
@ -47,7 +40,10 @@ namespace CppJieba
                }
                string line ;
                vector<string> buf;
-                for(uint lineno = 0; getline(ifs, line); lineno++)
+                double idf = 0.0;
+                double idfSum = 0.0;
+                size_t lineno = 0;
+                for(;getline(ifs, line); lineno++)
                {
                    buf.clear();
                    if(line.empty())
@ -60,9 +56,22 @@ namespace CppJieba
                        LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
                        continue;
                    }
-                    _idfMap[buf[0]] = atof(buf[1].c_str());
-                }
-                return _setInitFlag(_segment.init(dictPath));
+                    idf = atof(buf[1].c_str());
+                    _idfMap[buf[0]] = idf;
+                    idfSum += idf;
+
+                } 
+
+                std::copy(
+                            BLACK_LIST, BLACK_LIST + sizeof(BLACK_LIST)/sizeof(BLACK_LIST[0]), 
+                            std::inserter(_blackSet, _blackSet.begin()));
+                
+                assert(lineno);
+                _idfAverage = idfSum / lineno;
+
+                assert(_idfAverage > 0.0);
+                
+                return _setInitFlag(_segment.init(dictPath, hmmFilePath));
            };
        public:

@ -90,30 +99,58 @@ namespace CppJieba
                    return false;
                }

+                // filtering single word.
+                for(vector<string>::iterator iter = words.begin(); iter != words.end(); )
+                {
+                    if(_isSingleWord(*iter))
+                    {
+                        iter = words.erase(iter);
+                    }
+                    else
+                    {
+                        iter++;
+                    }
+                }
+
                unordered_map<string, double> wordmap;
                for(uint i = 0; i < words.size(); i ++)
                {
                    wordmap[ words[i] ] += 1.0;
                }

-                for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end();)
+                for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
                {
+                    if(_blackSet.end() != _blackSet.find(itr->first))
+                    {
+                        itr = wordmap.erase(itr);
+                        continue;
+                    }
+
                    unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
                    if(cit != _idfMap.end())
                    {
                        itr->second *= cit->second;
-                        itr ++;
                    }
                    else
                    {
-                        itr = wordmap.erase(itr);
+                        itr->second *= _idfAverage;
                    }
+                    itr ++;
                }

                keywords.resize(MIN(topN, wordmap.size()));
                partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp);
                return true;
            }
+        private:
+            bool _isSingleWord(const string& str) const // Returns true when str decodes to a Unicode sequence of exactly one element.
+            {
+                Unicode unicode;
+                TransCode::decode(str, unicode); // NOTE(review): whether decode succeeded is not checked here; presumably a failed decode leaves `unicode` empty -- confirm.
+                if(unicode.size() == 1)
+                  return true;
+                return false;
+            }

        private:
            static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
--- a/src/Limonp/CMakeLists.txt
+++ b/src/Limonp/CMakeLists.txt
@ -1 +1,3 @@
-INSTALL(FILES ArgvContext.hpp io_functs.hpp macro_def.hpp MysqlClient.hpp str_functs.hpp cast_functs.hpp Config.hpp logger.hpp map_functs.hpp  std_outbound.hpp DESTINATION include/CppJieba/Limonp)
+INSTALL(FILES ArgvContext.hpp io_functs.hpp macro_def.hpp MysqlClient.hpp
+    str_functs.hpp cast_functs.hpp Config.hpp logger.hpp map_functs.hpp
+    std_outbound.hpp InitOnOff.hpp DESTINATION include/CppJieba/Limonp)
--- a/src/Limonp/InitOnOff.hpp
+++ b/src/Limonp/InitOnOff.hpp
@ -0,0 +1,21 @@
+#ifndef LIMONP_INITONOFF_H
+#define LIMONP_INITONOFF_H
+
+namespace Limonp
+{
+    class InitOnOff // Base class that holds a single "initialized" flag; derived classes set it and callers test the object as a bool.
+    {
+        public:
+            InitOnOff(){_setInitFlag(false);}; // A new object starts with the flag cleared.
+            ~InitOnOff(){}; // NOTE(review): destructor is non-virtual; fine unless objects are ever deleted through an InitOnOff pointer.
+        protected:
+            bool _isInited; // Current value of the flag.
+            bool _getInitFlag()const{return _isInited;}; // Returns the flag.
+            bool _setInitFlag(bool flag){return _isInited = flag;}; // Stores `flag` and returns the stored value.
+        public:
+            operator bool(){return _getInitFlag();}; // Implicit conversion yields the flag. NOTE(review): not const-qualified, so it cannot be called on a const object.
+
+    };
+}
+
+#endif
--- a/src/Limonp/str_functs.hpp
+++ b/src/Limonp/str_functs.hpp
@ -100,7 +100,7 @@ namespace Limonp



-    inline bool split(const string& src, vector<string>& res, const string& pattern)
+    inline bool split(const string& src, vector<string>& res, const string& pattern, size_t offset = 0, size_t len = string::npos)
    {
        if(src.empty())
        {
@ -110,20 +110,28 @@ namespace Limonp

        size_t start = 0;
        size_t end = 0;
-        while(start < src.size())
+        size_t cnt = 0;
+        while(start < src.size() && res.size() < len)
        {
            end = src.find_first_of(pattern, start);
            if(string::npos == end)
            {
-                res.push_back(src.substr(start));
+                if(cnt >= offset)
+                {
+                    res.push_back(src.substr(start));
+                }
                return true;
            }
-            res.push_back(src.substr(start, end - start));
-            if(end == src.size() - 1)
+            //if(end == src.size() - 1)
+            //{
+            //    res.push_back("");
+            //    return true;
+            //}
+            if(cnt >= offset)
            {
-                res.push_back("");
-                break;
+                res.push_back(src.substr(start, end - start));
            }
+            cnt ++;
            start = end + 1;
        }
        return true;
@ -158,12 +166,8 @@ namespace Limonp
        return ltrim(rtrim(s));
    }

-
-
-
    inline bool startsWith(const string& str, const string& prefix)
    {
-        //return str.substr(0, prefix.size()) == prefix;
        if(prefix.length() > str.length())
        {
            return false;
--- a/src/SegmentBase.hpp
+++ b/src/SegmentBase.hpp
@ -3,6 +3,7 @@

 #include "TransCode.hpp"
 #include "Limonp/logger.hpp"
+#include "Limonp/InitOnOff.hpp"
 #include "ISegment.hpp"
 #include <cassert>

@ -10,17 +11,11 @@
 namespace CppJieba
 {
    using namespace Limonp;
-    class SegmentBase: public ISegment
+    class SegmentBase: public ISegment, public InitOnOff
    {
        public:
-            SegmentBase(){_setInitFlag(false);};
+            SegmentBase(){};
            virtual ~SegmentBase(){};
-        protected:
-            bool _isInited;
-            bool _getInitFlag()const{return _isInited;};
-            bool _setInitFlag(bool flag){return _isInited = flag;};
-        public:
-            operator bool(){return _getInitFlag();};

        public:
            virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
--- a/test/unittest/TKeywordExtractor.cpp
+++ b/test/unittest/TKeywordExtractor.cpp
@ -3,25 +3,25 @@

 using namespace CppJieba;

+const char* KEYWORD_EXT_TEST_SENTENCE = "我来自北京邮电大学。 学号123456";
+
 TEST(KeywordExtractorTest, Test1)
 {
-    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
-    const char* str = "我来自北京邮电大学。。。  学号 123456";
-    const char* res[] = {"北京邮电大学", "来自"};
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
+    const char* res[] = {"学号", "北京邮电大学"};
    vector<string> words;
    ASSERT_TRUE(extractor);
-    ASSERT_TRUE(extractor.extract(str, words, 2));
+    ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 2));
    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
 }

 TEST(KeywordExtractorTest, Test2)
 {
-    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
-    const char* str = "我来自北京邮电大学。。。  学号 123456";
-    const char* res[] = {"北京邮电大学", "来自"};
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
+    const char* res[] = {"学号", "北京邮电大学", "123456", "来自"};
    vector<string> words;
    ASSERT_TRUE(extractor);
-    ASSERT_TRUE(extractor.extract(str, words, 9));
+    ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9));
    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
 }

@ -31,26 +31,35 @@ TEST(KeywordExtractorTest, Test3)
    ifstream ifs("../test/testdata/weicheng.utf8");
    ASSERT_TRUE(!!ifs);
    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
-    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
-    const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
+    const char* res[] = {"柔嘉", "小姐", "孙小姐", "方鸿渐", "鸿渐"};
+    const char* res2 = "[\"柔嘉:5611.34\", \"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"鸿渐:2552.93\"]";
    vector<string> keywords;
+    string resStr;
+    vector<pair<string,double> >  keywords2;
    extractor.extract(str, keywords, 5);
+    extractor.extract(str, keywords2, 5);
    ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+    resStr << keywords2;
+    ASSERT_EQ(res2, resStr);

 }

-TEST(KeywordExtractorTest, Test4)
-{
-    ifstream ifs("../test/testdata/weicheng.utf8");
-    ASSERT_TRUE(!!ifs);
-    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
-    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
-    //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
-    vector<pair<string,double> >  keywords;
-    extractor.extract(str, keywords, 5);
-    //print(keywords);
-    string res;
-    res << keywords;
-    ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
-
-}
+//TEST(KeywordExtractorTest, Test4)
+//{
+//    ifstream ifs("../test/testdata/weicheng.utf8");
+//    ASSERT_TRUE(!!ifs);
+//    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
+//    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
+//    //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
+//    vector<pair<string,double> >  keywords;
+//    extractor.extract(str, keywords, 5);
+//    //print(keywords);
+//    string res;
+//    res << keywords;
+//    print(keywords);
+//    print(__LINE__);
+//    exit(1);
+//    ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
+//
+//}
--- a/test/unittest/TMd5.cpp
+++ b/test/unittest/TMd5.cpp
@ -19,7 +19,7 @@ TEST(Md5Test, Test1)
 {
    ASSERT_EQ(sizeof(DICT_FILE)/sizeof(DICT_FILE[0]), sizeof(DICT_FILE_MD5)/sizeof(DICT_FILE_MD5[0]));
    string tmp;
-    for (int i = 0; i < sizeof(DICT_FILE)/sizeof(DICT_FILE[0]); i++)
+    for (uint i = 0; i < sizeof(DICT_FILE)/sizeof(DICT_FILE[0]); i++)
    {
        md5File(DICT_FILE[i], tmp);
        ASSERT_EQ(tmp, string(DICT_FILE_MD5[i]));