KeywordExtractor 支持自定义词典（可选参数）。

2025-07-18 00:00:12 +08:00 · 2015-01-24 15:34:34 +08:00 · 2015-01-24 15:34:34 +08:00 · 4e72d4a06f
commit 4e72d4a06f
parent 269bc0fd0d
5 changed files with 57 additions and 14 deletions
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,5 +1,10 @@
 # CppJieba ChangeLog

+## v2.4.5 (untagged)
+
+1. 使得 QuerySegment 支持自定义词典（可选参数）。
+2. 使得 KeywordExtractor 支持自定义词典（可选参数）。
+
 ## v2.4.4

 1. 修改两条更细粒度的特殊过滤规则，将连续的数字（包括浮点数）和连续的字母单独切分出来（而不会混在一起）。
--- a/src/DictTrie.hpp
+++ b/src/DictTrie.hpp
@@ -115,7 +115,7 @@ namespace CppJieba
            void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag)
            {
                ifstream ifs(filePath.c_str());
-                assert(ifs);
+                assert(ifs.is_open());
                string line;
                DictUnit nodeInfo;
                vector<string> buf;
@@ -143,7 +143,7 @@ namespace CppJieba
            void _loadDict(const string& filePath) 
            {
                ifstream ifs(filePath.c_str());
-                assert(ifs);
+                assert(ifs.is_open());
                string line;
                vector<string> buf;

--- a/src/KeywordExtractor.hpp
+++ b/src/KeywordExtractor.hpp
@@ -21,19 +21,18 @@ namespace CppJieba
            unordered_set<string> _stopWords;
        public:
            KeywordExtractor(){};
-            KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
+            KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
            {
-                LIMONP_CHECK(init(dictPath, hmmFilePath, idfPath, stopWordPath));
+                init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict);
            };
            ~KeywordExtractor(){};

        public:
-            bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
+            void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
            {
                _loadIdfDict(idfPath);
                _loadStopWordDict(stopWordPath);
-                LIMONP_CHECK(_segment.init(dictPath, hmmFilePath));
-                return true;
+                LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
            };
        public:

--- a/test/keyword_demo.cpp
+++ b/test/keyword_demo.cpp
@@ -4,6 +4,7 @@ using namespace CppJieba;
 int main(int argc, char ** argv)
 {
    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
+    //KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../dict/user.dict.utf8");
    string s("我是拖拉机学院手扶拖拉机专业的。不用多久，我就会升职加薪，当上CEO，走上人生巅峰。");
    vector<pair<string, double> > wordweights;
    vector<string> words;
--- a/test/unittest/TKeywordExtractor.cpp
+++ b/test/unittest/TKeywordExtractor.cpp
@@ -8,6 +8,8 @@ using namespace CppJieba;
 TEST(KeywordExtractorTest, Test1)
 {
    KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
+
+    {
        string s("我是拖拉机学院手扶拖拉机专业的。不用多久，我就会升职加薪，当上CEO，走上人生巅峰。");
        string res;
        vector<pair<string, double> > wordweights;
@@ -15,4 +17,40 @@ TEST(KeywordExtractorTest, Test1)
        extractor.extract(s, wordweights, topN);
        res << wordweights;
        ASSERT_EQ(res, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]");
+    }
+
+    {
+        string s("我毕业于蓝翔，日薪一部iPhone6");
+        string res;
+        vector<pair<string, double> > wordweights;
+        size_t topN = 5;
+        extractor.extract(s, wordweights, topN);
+        res << wordweights;
+        ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"日薪:11.7392\", \"iPhone:11.7392\", \"一部:6.47592\", \"毕业:6.23165\"]");
+    }
+}
+
+TEST(KeywordExtractorTest, Test2)
+{
+    KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../test/testdata/userdict.utf8");
+
+    {
+        string s("蓝翔优秀毕业生");
+        string res;
+        vector<pair<string, double> > wordweights;
+        size_t topN = 5;
+        extractor.extract(s, wordweights, topN);
+        res << wordweights;
+        ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"毕业生:8.13549\", \"优秀:6.78347\"]");
+    }
+
+    {
+        string s("我毕业于蓝翔，日薪一部iPhone6");
+        string res;
+        vector<pair<string, double> > wordweights;
+        size_t topN = 5;
+        extractor.extract(s, wordweights, topN);
+        res << wordweights;
+        ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"日薪:11.7392\", \"iPhone6:11.7392\", \"一部:6.47592\", \"毕业:6.23165\"]");
+    }
 }