From c19736995ce0ee6b0aa313150b7fdf39ce6a7dcf Mon Sep 17 00:00:00 2001
From: yanyiwu
Date: Sat, 26 Mar 2016 22:12:40 +0800
Subject: [PATCH] Add KeywordExtractor::Word and add more overrided
 KeywordExtractor::Extract

---
 ChangeLog.md                             |  1 +
 include/cppjieba/KeywordExtractor.hpp    | 68 ++++++++++++++++--------
 test/demo.cpp                            |  2 +-
 test/unittest/keyword_extractor_test.cpp | 43 ++++++++++++---
 4 files changed, 82 insertions(+), 32 deletions(-)

diff --git a/ChangeLog.md b/ChangeLog.md
index 3a587f8..5e7b6dc 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -4,6 +4,7 @@
 
 + Change Jieba::Locate to be static function.
 + Change the return value of KeywordExtractor::Extract from bool to void.
++ Add KeywordExtractor::Word and add more overrided KeywordExtractor::Extract
 
 ## v4.5.3
 
diff --git a/include/cppjieba/KeywordExtractor.hpp b/include/cppjieba/KeywordExtractor.hpp
index 20e7b74..4af429f 100644
--- a/include/cppjieba/KeywordExtractor.hpp
+++ b/include/cppjieba/KeywordExtractor.hpp
@@ -11,6 +11,12 @@ using namespace limonp;
 /*utf8*/
 class KeywordExtractor {
  public:
+  struct Word {
+    string word;
+    vector<size_t> offsets;
+    double weight;
+  }; // struct Word
+
   KeywordExtractor(const string& dictPath,
                    const string& hmmFilePath,
                    const string& idfPath,
@@ -39,42 +45,53 @@ class KeywordExtractor {
   }
 
   void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
-    vector<pair<string, double> > topWords;
+    vector<Word> topWords;
     Extract(sentence, topWords, topN);
     for (size_t i = 0; i < topWords.size(); i++) {
-      keywords.push_back(topWords[i].first);
+      keywords.push_back(topWords[i].word);
     }
   }
 
   void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
+    vector<Word> topWords;
+    Extract(sentence, topWords, topN);
+    for (size_t i = 0; i < topWords.size(); i++) {
+      keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
+    }
+  }
+
+  void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
     vector<string> words;
     segment_.Cut(sentence, words);
 
-    map<string, double> wordmap;
-    for (vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
-      if (IsSingleWord(*iter)) {
+    map<string, Word> wordmap;
+    size_t offset = 0;
+    for (size_t i = 0; i < words.size(); ++i) {
+      size_t t = offset;
+      offset += words[i].size();
+      if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
         continue;
       }
-      wordmap[*iter] += 1.0;
+      wordmap[words[i]].offsets.push_back(t);
+      wordmap[words[i]].weight += 1.0;
     }
-
-    for (map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
-      if (stopWords_.end() != stopWords_.find(itr->first)) {
-        wordmap.erase(itr++);
-        continue;
-      }
-
-      unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
-      if (cit != idfMap_.end()) {
-        itr->second *= cit->second;
-      } else {
-        itr->second *= idfAverage_;
-      }
-      itr ++;
+    if (offset != sentence.size()) {
+      XLOG(ERROR) << "words illegal";
+      return;
     }
 
     keywords.clear();
-    std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
+    keywords.reserve(wordmap.size());
+    for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
+      unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
+      if (cit != idfMap_.end()) {
+        itr->second.weight *= cit->second;
+      } else {
+        itr->second.weight *= idfAverage_;
+      }
+      itr->second.word = itr->first;
+      keywords.push_back(itr->second);
+    }
     topN = min(topN, keywords.size());
     partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
     keywords.resize(topN);
@@ -127,8 +144,8 @@ class KeywordExtractor {
     return false;
   }
 
-  static bool Compare(const pair<string, double>& lhs, const pair<string, double>& rhs) {
-    return lhs.second > rhs.second;
+  static bool Compare(const Word& lhs, const Word& rhs) {
+    return lhs.weight > rhs.weight;
   }
 
   MixSegment segment_;
@@ -137,6 +154,11 @@ class KeywordExtractor {
 
   unordered_set<string> stopWords_;
 }; // class Jieba
+
+inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
+  return os << word.word << '|' << word.offsets << '|' << word.weight;
+}
+
 } // namespace cppjieba
 
 #endif
diff --git a/test/demo.cpp b/test/demo.cpp
index 77194ba..930647a 100644
--- a/test/demo.cpp
+++ b/test/demo.cpp
@@ -62,7 +62,7 @@ int main(int argc, char** argv) {
         STOP_WORD_PATH);
   cout << "[demo] KEYWORD" << endl;
   const size_t topk = 5;
-  vector<pair<string, double> > keywordres;
+  vector<cppjieba::KeywordExtractor::Word> keywordres;
   extractor.Extract(s, keywordres, topk);
   cout << s << endl;
   cout << keywordres << endl;
diff --git a/test/unittest/keyword_extractor_test.cpp b/test/unittest/keyword_extractor_test.cpp
index 1e7e60d..0062d9b 100644
--- a/test/unittest/keyword_extractor_test.cpp
+++ b/test/unittest/keyword_extractor_test.cpp
@@ -6,24 +6,51 @@ using namespace cppjieba;
 
 TEST(KeywordExtractorTest, Test1) {
   KeywordExtractor Extractor("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
+  {
+    string s("你好世界世界而且而且");
+    string res;
+    size_t topN = 5;
+
+    {
+      vector<string> words;
+      Extractor.Extract(s, words, topN);
+      res << words;
+      ASSERT_EQ(res, "[\"世界\", \"你好\"]");
+    }
+
+    {
+      vector<pair<string, double> > words;
+      Extractor.Extract(s, words, topN);
+      res << words;
+      ASSERT_EQ(res, "[\"世界:8.73506\", \"你好:7.95788\"]");
+    }
+
+    {
+      vector<KeywordExtractor::Word> words;
+      Extractor.Extract(s, words, topN);
+      res << words;
+      ASSERT_EQ(res, "[\"世界|[\"6\", \"12\"]|8.73506\", \"你好|[\"0\"]|7.95788\"]");
+    }
+  }
+
   {
     string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。");
     string res;
-    vector<pair<string, double> > wordweights;
+    vector<KeywordExtractor::Word> wordweights;
     size_t topN = 5;
     Extractor.Extract(s, wordweights, topN);
     res << wordweights;
-    ASSERT_EQ(res, "[\"CEO:11.7392\", \"升职:10.8562\", \"加薪:10.6426\", \"手扶拖拉机:10.0089\", \"巅峰:9.49396\"]");
+    ASSERT_EQ(res, "[\"CEO|[\"93\"]|11.7392\", \"\xE5\x8D\x87\xE8\x81\x8C|[\"72\"]|10.8562\", \"\xE5\x8A\xA0\xE8\x96\xAA|[\"78\"]|10.6426\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|10.0089\", \"\xE5\xB7\x85\xE5\xB3\xB0|[\"111\"]|9.49396\"]");
   }
 
   {
     string s("一部iPhone6");
     string res;
-    vector<pair<string, double> > wordweights;
+    vector<KeywordExtractor::Word> wordweights;
     size_t topN = 5;
     Extractor.Extract(s, wordweights, topN);
     res << wordweights;
-    ASSERT_EQ(res, "[\"iPhone6:11.7392\", \"一部:6.47592\"]");
+    ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|11.7392\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|6.47592\"]");
   }
 }
 
@@ -33,20 +60,20 @@ TEST(KeywordExtractorTest, Test2) {
   {
     string s("蓝翔优秀毕业生");
     string res;
-    vector<pair<string, double> > wordweights;
+    vector<KeywordExtractor::Word> wordweights;
     size_t topN = 5;
     Extractor.Extract(s, wordweights, topN);
     res << wordweights;
-    ASSERT_EQ(res, "[\"蓝翔:11.7392\", \"毕业生:8.13549\", \"优秀:6.78347\"]");
+    ASSERT_EQ(res, "[\"\xE8\x93\x9D\xE7\xBF\x94|[\"0\"]|11.7392\", \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F|[\"12\"]|8.13549\", \"\xE4\xBC\x98\xE7\xA7\x80|[\"6\"]|6.78347\"]");
   }
 
   {
     string s("一部iPhone6");
     string res;
-    vector<pair<string, double> > wordweights;
+    vector<KeywordExtractor::Word> wordweights;
     size_t topN = 5;
     Extractor.Extract(s, wordweights, topN);
     res << wordweights;
-    ASSERT_EQ(res, "[\"iPhone6:11.7392\", \"一部:6.47592\"]");
+    ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|11.7392\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|6.47592\"]");
   }
 }
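Usage note (not part of the patch): the sketch below is a minimal, self-contained example of calling the new Extract overload that fills a vector<KeywordExtractor::Word>, once the patch is applied. The dictionary paths are placeholders and must point at a real cppjieba dict/ directory.

// Example only, not included in this commit: exercises
// KeywordExtractor::Extract(const string&, vector<Word>&, size_t).
#include <iostream>
#include <vector>
#include "cppjieba/KeywordExtractor.hpp"

int main() {
  // Placeholder dictionary paths; adjust to your checkout.
  cppjieba::KeywordExtractor extractor("dict/jieba.dict.utf8",
                                       "dict/hmm_model.utf8",
                                       "dict/idf.utf8",
                                       "dict/stop_words.utf8");

  std::vector<cppjieba::KeywordExtractor::Word> keywords;
  extractor.Extract("我是拖拉机学院手扶拖拉机专业的。", keywords, 5);

  // Each Word carries the keyword text, every byte offset at which it
  // occurred in the input sentence, and its tf-idf weight.
  for (size_t i = 0; i < keywords.size(); i++) {
    std::cout << keywords[i].word << "\t" << keywords[i].weight << "\t";
    for (size_t j = 0; j < keywords[i].offsets.size(); j++) {
      std::cout << keywords[i].offsets[j] << " ";
    }
    std::cout << std::endl;
  }
  return 0;
}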