Merge branch 'dev' of https://github.com/aszxqw/cppjieba into dev

2025-07-18 00:00:12 +08:00 · 2014-01-27 01:54:01 +08:00 · 2014-01-27 01:54:01 +08:00 · 8e2c726a8c
commit 8e2c726a8c
parent e23a3f555b 5f96dcf09a
11 changed files with 196 additions and 87 deletions
--- a/13
+++ b/13
@ -1,13 +0,0 @@
           DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
                    Version 2, December 2004
 Copyright (C) 2013 Yanyi Wu <wuyanyi09@gmail.com>
 Everyone is permitted to copy and distribute verbatim or modified
 copies of this license document, and changing it is allowed as long
 as the name is changed.
            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
  0. You just DO WHAT THE FUCK YOU WANT TO.
--- a/20
+++ b/20
@ -0,0 +1,20 @@
 The MIT License (MIT)
 Copyright (c) 2013 Yanyi Wu
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
 the Software without restriction, including without limitation the rights to
 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 the Software, and to permit persons to whom the Software is furnished to do so,
 subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/README.md
+++ b/README.md
@ -10,6 +10,11 @@
 ## 安装与使用
 ### 依赖
 * g++ (version >= 4.6);
 * cmake (version >= 2.8);
 ### 下载和安装
 ```sh
--- a/dict/README.md
+++ b/dict/README.md
@ -0,0 +1,29 @@
 # CppJieba字典
 文件后缀名代表的是词典的编码方式。
 比如filename.utf8 是 utf8编码，filename.gbk 是 gbk编码方式。
 ## 分词
 ### jieba.dict.utf8/gbk
 作为最大概率法(MPSegment: Max Probability)分词所使用的词典。
 ### hmm_model.utf8/gbk
 作为隐马尔可夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。
 __对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__
 ## 关键词抽取
 ### idf.utf8
 IDF(Inverse Document Frequency)
 在KeywordExtractor中，使用的是经典的TF-IDF算法，所以需要这么一个词典提供IDF信息。
--- a/src/KeywordExtractor.hpp
+++ b/src/KeywordExtractor.hpp
@ -1,43 +1,36 @@
 #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
 #define CPPJIEBA_KEYWORD_EXTRACTOR_H
-#include "MPSegment.hpp"
+#include "MixSegment.hpp"
 #include <cmath>
 #include <unordered_set>
 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
 namespace CppJieba
 {
    using namespace Limonp;
-    //struct KeyWordInfo
+    /*utf8*/
-    //{
+    const char * BLACK_LIST[] = {"我们", "他们"};
    //    string word;
    //    double tfidf;
    //};
-    //inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
+    class KeywordExtractor: public InitOnOff
    //{
    //    return os << keyword.word << "," << keyword.idf;
    //}
    class KeywordExtractor
    {
        private:
-            MPSegment _segment;
+            MixSegment _segment;
        private:
            unordered_map<string, double> _idfMap;
-        protected:
+            double _idfAverage;
-            bool _isInited;
+
-            bool _getInitFlag()const{return _isInited;};
+            unordered_set<string> _blackSet;
            bool _setInitFlag(bool flag){return _isInited = flag;};
        public:
            operator bool(){return _getInitFlag();};
        public:
            KeywordExtractor(){_setInitFlag(false);};
-            explicit KeywordExtractor(const string& dictPath, const string& idfPath){_setInitFlag(init(dictPath, idfPath));};
+            explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath)
            {
                _setInitFlag(init(dictPath, hmmFilePath, idfPath));
            };
            ~KeywordExtractor(){};
        public:
-            bool init(const string& dictPath, const string& idfPath)
+            bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath)
            {
                ifstream ifs(idfPath.c_str());
                if(!ifs)
@ -47,7 +40,10 @@ namespace CppJieba
                }
                string line ;
                vector<string> buf;
-                for(uint lineno = 0; getline(ifs, line); lineno++)
+                double idf = 0.0;
                double idfSum = 0.0;
                size_t lineno = 0;
                for(;getline(ifs, line); lineno++)
                {
                    buf.clear();
                    if(line.empty())
@ -60,9 +56,22 @@ namespace CppJieba
                        LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
                        continue;
                    }
-                    _idfMap[buf[0]] = atof(buf[1].c_str());
+                    idf = atof(buf[1].c_str());
-                }
+                    _idfMap[buf[0]] = idf;
-                return _setInitFlag(_segment.init(dictPath));
+                    idfSum += idf;
                } 
                std::copy(
                            BLACK_LIST, BLACK_LIST + sizeof(BLACK_LIST)/sizeof(BLACK_LIST[0]), 
                            std::inserter(_blackSet, _blackSet.begin()));
                assert(lineno);
                _idfAverage = idfSum / lineno;
                assert(_idfAverage > 0.0);
                return _setInitFlag(_segment.init(dictPath, hmmFilePath));
            };
        public:
@ -90,30 +99,58 @@ namespace CppJieba
                    return false;
                }
                // filtering single word.
                for(vector<string>::iterator iter = words.begin(); iter != words.end(); )
                {
                    if(_isSingleWord(*iter))
                    {
                        iter = words.erase(iter);
                    }
                    else
                    {
                        iter++;
                    }
                }
                unordered_map<string, double> wordmap;
                for(uint i = 0; i < words.size(); i ++)
                {
                    wordmap[ words[i] ] += 1.0;
                }
-                for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end();)
+                for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
                {
                    if(_blackSet.end() != _blackSet.find(itr->first))
                    {
                        itr = wordmap.erase(itr);
                        continue;
                    }
                    unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
                    if(cit != _idfMap.end())
                    {
                        itr->second *= cit->second;
                        itr ++;
                    }
                    else
                    {
-                        itr = wordmap.erase(itr);
+                        itr->second *= _idfAverage;
                    }
                    itr ++;
                }
                keywords.resize(MIN(topN, wordmap.size()));
                partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp);
                return true;
            }
        private:
            bool _isSingleWord(const string& str) const
            {
                Unicode unicode;
                TransCode::decode(str, unicode);
                if(unicode.size() == 1)
                  return true;
                return false;
            }
        private:
            static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
--- a/src/Limonp/CMakeLists.txt
+++ b/src/Limonp/CMakeLists.txt
@ -1 +1,3 @@
-INSTALL(FILES ArgvContext.hpp io_functs.hpp macro_def.hpp MysqlClient.hpp str_functs.hpp cast_functs.hpp Config.hpp logger.hpp map_functs.hpp  std_outbound.hpp DESTINATION include/CppJieba/Limonp)
+INSTALL(FILES ArgvContext.hpp io_functs.hpp macro_def.hpp MysqlClient.hpp
    str_functs.hpp cast_functs.hpp Config.hpp logger.hpp map_functs.hpp
    std_outbound.hpp InitOnOff.hpp DESTINATION include/CppJieba/Limonp)
--- a/src/Limonp/InitOnOff.hpp
+++ b/src/Limonp/InitOnOff.hpp
@ -0,0 +1,21 @@
 #ifndef LIMONP_INITONOFF_H
 #define LIMONP_INITONOFF_H
 namespace Limonp
 {
    /**
     * Small base class that stores a single "has been initialised" flag.
     *
     * A freshly constructed object reports false. Derived classes flip the
     * flag through _setInitFlag() and callers test the object in a boolean
     * context (e.g. `if(!obj) ...`).
     *
     * The destructor is public and non-virtual, so derived objects must not
     * be deleted through an InitOnOff pointer.
     */
    class InitOnOff
    {
        public:
            // Start in the "not initialised" state.
            InitOnOff(): _isInited(false) {}
            ~InitOnOff() {}
        protected:
            bool _isInited;
            // Returns the current value of the flag.
            bool _getInitFlag() const { return _isInited; }
            // Stores `flag` and returns the value just stored, so callers
            // can write `return _setInitFlag(init(...));`.
            bool _setInitFlag(bool flag) { return _isInited = flag; }
        public:
            // const-qualified so the state can also be queried through
            // const objects and const references.
            operator bool() const { return _getInitFlag(); }
    };
 }
 #endif
--- a/src/Limonp/str_functs.hpp
+++ b/src/Limonp/str_functs.hpp
@ -100,7 +100,7 @@ namespace Limonp
-    inline bool split(const string& src, vector<string>& res, const string& pattern)
+    inline bool split(const string& src, vector<string>& res, const string& pattern, size_t offset = 0, size_t len = string::npos)
    {
        if(src.empty())
        {
@ -110,20 +110,28 @@ namespace Limonp
        size_t start = 0;
        size_t end = 0;
-        while(start < src.size())
+        size_t cnt = 0;
        while(start < src.size() && res.size() < len)
        {
            end = src.find_first_of(pattern, start);
            if(string::npos == end)
            {
-                res.push_back(src.substr(start));
+                if(cnt >= offset)
                {
                    res.push_back(src.substr(start));
                }
                return true;
            }
-            res.push_back(src.substr(start, end - start));
+            //if(end == src.size() - 1)
-            if(end == src.size() - 1)
+            //{
            //    res.push_back("");
            //    return true;
            //}
            if(cnt >= offset)
            {
-                res.push_back("");
+                res.push_back(src.substr(start, end - start));
                break;
            }
            cnt ++;
            start = end + 1;
        }
        return true;
@ -158,12 +166,8 @@ namespace Limonp
        return ltrim(rtrim(s));
    }
    inline bool startsWith(const string& str, const string& prefix)
    {
        //return str.substr(0, prefix.size()) == prefix;
        if(prefix.length() > str.length())
        {
            return false;
--- a/src/SegmentBase.hpp
+++ b/src/SegmentBase.hpp
@ -3,6 +3,7 @@
 #include "TransCode.hpp"
 #include "Limonp/logger.hpp"
 #include "Limonp/InitOnOff.hpp"
 #include "ISegment.hpp"
 #include <cassert>
@ -10,17 +11,11 @@
 namespace CppJieba
 {
    using namespace Limonp;
-    class SegmentBase: public ISegment
+    class SegmentBase: public ISegment, public InitOnOff
    {
        public:
-            SegmentBase(){_setInitFlag(false);};
+            SegmentBase(){};
            virtual ~SegmentBase(){};
        protected:
            bool _isInited;
            bool _getInitFlag()const{return _isInited;};
            bool _setInitFlag(bool flag){return _isInited = flag;};
        public:
            operator bool(){return _getInitFlag();};
        public:
            virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
--- a/test/unittest/TKeywordExtractor.cpp
+++ b/test/unittest/TKeywordExtractor.cpp
@ -3,25 +3,25 @@
 using namespace CppJieba;
 const char* KEYWORD_EXT_TEST_SENTENCE = "我来自北京邮电大学。 学号123456";
 TEST(KeywordExtractorTest, Test1)
 {
-    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
-    const char* str = "我来自北京邮电大学。。。  学号 123456";
+    const char* res[] = {"学号", "北京邮电大学"};
    const char* res[] = {"北京邮电大学", "来自"};
    vector<string> words;
    ASSERT_TRUE(extractor);
-    ASSERT_TRUE(extractor.extract(str, words, 2));
+    ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 2));
    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
 }
 TEST(KeywordExtractorTest, Test2)
 {
-    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
-    const char* str = "我来自北京邮电大学。。。  学号 123456";
+    const char* res[] = {"学号", "北京邮电大学", "123456", "来自"};
    const char* res[] = {"北京邮电大学", "来自"};
    vector<string> words;
    ASSERT_TRUE(extractor);
-    ASSERT_TRUE(extractor.extract(str, words, 9));
+    ASSERT_TRUE(extractor.extract(KEYWORD_EXT_TEST_SENTENCE, words, 9));
    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
 }
@ -31,26 +31,35 @@ TEST(KeywordExtractorTest, Test3)
    ifstream ifs("../test/testdata/weicheng.utf8");
    ASSERT_TRUE(!!ifs);
    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
-    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
+    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
-    const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
+    const char* res[] = {"柔嘉", "小姐", "孙小姐", "方鸿渐", "鸿渐"};
    const char* res2 = "[\"柔嘉:5611.34\", \"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"鸿渐:2552.93\"]";
    vector<string> keywords;
    string resStr;
    vector<pair<string,double> >  keywords2;
    extractor.extract(str, keywords, 5);
    extractor.extract(str, keywords2, 5);
    ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
    resStr << keywords2;
    ASSERT_EQ(res2, resStr);
 }
-TEST(KeywordExtractorTest, Test4)
+//TEST(KeywordExtractorTest, Test4)
-{
+//{
-    ifstream ifs("../test/testdata/weicheng.utf8");
+//    ifstream ifs("../test/testdata/weicheng.utf8");
-    ASSERT_TRUE(!!ifs);
+//    ASSERT_TRUE(!!ifs);
-    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
+//    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
-    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
+//    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
-    //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
+//    //const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
-    vector<pair<string,double> >  keywords;
+//    vector<pair<string,double> >  keywords;
-    extractor.extract(str, keywords, 5);
+//    extractor.extract(str, keywords, 5);
-    //print(keywords);
+//    //print(keywords);
-    string res;
+//    string res;
-    res << keywords;
+//    res << keywords;
-    ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
+//    print(keywords);
-
+//    print(__LINE__);
-}
+//    exit(1);
 //    ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
 //
 //}
--- a/test/unittest/TMd5.cpp
+++ b/test/unittest/TMd5.cpp
@ -19,7 +19,7 @@ TEST(Md5Test, Test1)
 {
    ASSERT_EQ(sizeof(DICT_FILE)/sizeof(DICT_FILE[0]), sizeof(DICT_FILE_MD5)/sizeof(DICT_FILE_MD5[0]));
    string tmp;
-    for (int i = 0; i < sizeof(DICT_FILE)/sizeof(DICT_FILE[0]); i++)
+    for (uint i = 0; i < sizeof(DICT_FILE)/sizeof(DICT_FILE[0]); i++)
    {
        md5File(DICT_FILE[i], tmp);
        ASSERT_EQ(tmp, string(DICT_FILE_MD5[i]));