diff --git a/ChangeLog.md b/ChangeLog.md index e768ab1..32bfa0a 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,5 +1,10 @@ # CppJieba ChangeLog +## v2.4.5 (untagged) + +1. 使得 QuerySegment 支持自定义词典(可选参数)。 +2. 使得 KeywordExtractor 支持自定义词典(可选参数)。 + ## v2.4.4 1. 修改两条更细粒度的特殊过滤规则,将连续的数字(包括浮点数)和连续的字母单独切分出来(而不会混在一起)。 diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 10d86b5..e433bc0 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -115,7 +115,7 @@ namespace CppJieba void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) { ifstream ifs(filePath.c_str()); - assert(ifs); + assert(ifs.is_open()); string line; DictUnit nodeInfo; vector buf; @@ -143,7 +143,7 @@ namespace CppJieba void _loadDict(const string& filePath) { ifstream ifs(filePath.c_str()); - assert(ifs); + assert(ifs.is_open()); string line; vector buf; diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index 36c7da0..e1c6bf4 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -21,19 +21,18 @@ namespace CppJieba unordered_set _stopWords; public: KeywordExtractor(){}; - KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath) + KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") { - LIMONP_CHECK(init(dictPath, hmmFilePath, idfPath, stopWordPath)); + init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict); }; ~KeywordExtractor(){}; public: - bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath) + void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") { _loadIdfDict(idfPath); _loadStopWordDict(stopWordPath); - LIMONP_CHECK(_segment.init(dictPath, hmmFilePath)); - return true; + LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict)); }; public: diff --git a/test/keyword_demo.cpp b/test/keyword_demo.cpp index f1758f0..e48fc4a 100644 --- a/test/keyword_demo.cpp +++ b/test/keyword_demo.cpp @@ -4,6 +4,7 @@ using namespace CppJieba; int main(int argc, char ** argv) { KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8"); + //KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../dict/user.dict.utf8"); string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。"); vector > wordweights; vector words; diff --git a/test/unittest/TKeywordExtractor.cpp b/test/unittest/TKeywordExtractor.cpp index 5b62303..bb92958 100644 --- a/test/unittest/TKeywordExtractor.cpp +++ b/test/unittest/TKeywordExtractor.cpp @@ -8,11 +8,49 @@ using namespace CppJieba; TEST(KeywordExtractorTest, Test1) { KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8"); - string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。"); - string res; - vector > wordweights; - size_t topN = 5; - extractor.extract(s, wordweights, topN); - res << wordweights; - ASSERT_EQ(res, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]"); + + { + string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。"); + string res; + vector > wordweights; + size_t topN = 5; + extractor.extract(s, wordweights, topN); + res << wordweights; + ASSERT_EQ(res, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]"); + } + + { + string s("我毕业于蓝翔,日薪一部iPhone6"); + string res; + vector > wordweights; + size_t topN = 5; + extractor.extract(s, wordweights, topN); + res << wordweights; + ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"日薪:11.7392\", \"iPhone:11.7392\", \"一部:6.47592\", \"毕业:6.23165\"]"); + } +} + +TEST(KeywordExtractorTest, Test2) +{ + KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../test/testdata/userdict.utf8"); + + { + string s("蓝翔优秀毕业生"); + string res; + vector > wordweights; + size_t topN = 5; + extractor.extract(s, wordweights, topN); + res << wordweights; + ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"毕业生:8.13549\", \"优秀:6.78347\"]"); + } + + { + string s("我毕业于蓝翔,日薪一部iPhone6"); + string res; + vector > wordweights; + size_t topN = 5; + extractor.extract(s, wordweights, topN); + res << wordweights; + ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"日薪:11.7392\", \"iPhone6:11.7392\", \"一部:6.47592\", \"毕业:6.23165\"]"); + } }