From 6d105a864df9fc197c884e2a1aecc132cc36f425 Mon Sep 17 00:00:00 2001 From: Yanyi Wu Date: Tue, 3 May 2016 19:53:40 +0800 Subject: [PATCH 1/4] Update TextRankExtractor.hpp remove unused function which using c++11 keyword `auto` --- include/cppjieba/TextRankExtractor.hpp | 71 ++++++++++++-------------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/include/cppjieba/TextRankExtractor.hpp b/include/cppjieba/TextRankExtractor.hpp index 82e56f3..948f3ad 100644 --- a/include/cppjieba/TextRankExtractor.hpp +++ b/include/cppjieba/TextRankExtractor.hpp @@ -1,4 +1,4 @@ -#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H +#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H #define CPPJIEBA_TEXTRANK_EXTRACTOR_H #include @@ -82,40 +82,40 @@ namespace cppjieba { } }; - public: - TextRankExtractor(const string& dictPath, - const string& hmmFilePath, - const string& stopWordPath, - const string& userDict = "") - : segment_(dictPath, hmmFilePath, userDict) { - LoadStopWordDict(stopWordPath); - } - TextRankExtractor(const DictTrie* dictTrie, - const HMMModel* model, - const string& stopWordPath) - : segment_(dictTrie, model) { - LoadStopWordDict(stopWordPath); + public: + TextRankExtractor(const string& dictPath, + const string& hmmFilePath, + const string& stopWordPath, + const string& userDict = "") + : segment_(dictPath, hmmFilePath, userDict) { + LoadStopWordDict(stopWordPath); + } + TextRankExtractor(const DictTrie* dictTrie, + const HMMModel* model, + const string& stopWordPath) + : segment_(dictTrie, model) { + LoadStopWordDict(stopWordPath); } TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) { LoadStopWordDict(stopWordPath); } ~TextRankExtractor() { } - - void Extract(const string& sentence, vector& keywords, size_t topN) const { - vector topWords; - Extract(sentence, topWords, topN); - for (size_t i = 0; i < topWords.size(); i++) { - keywords.push_back(topWords[i].word); - } - } - - void Extract(const string& sentence, vector >& keywords, size_t topN) const { - vector topWords; - Extract(sentence, topWords, topN); - for (size_t i = 0; i < topWords.size(); i++) { - keywords.push_back(pair(topWords[i].word, topWords[i].weight)); - } + + void Extract(const string& sentence, vector& keywords, size_t topN) const { + vector topWords; + Extract(sentence, topWords, topN); + for (size_t i = 0; i < topWords.size(); i++) { + keywords.push_back(topWords[i].word); + } + } + + void Extract(const string& sentence, vector >& keywords, size_t topN) const { + vector topWords; + Extract(sentence, topWords, topN); + for (size_t i = 0; i < topWords.size(); i++) { + keywords.push_back(pair(topWords[i].word, topWords[i].weight)); + } } void Extract(const string& sentence, vector& keywords, size_t topN, size_t span=5,size_t rankTime=10) const { @@ -176,13 +176,6 @@ namespace cppjieba { return false; } - static void sortMapValue(WordMap &map,vector& result,size_t topN){ - for(auto i=map.begin();i!=map.end();i++){ - result.push_back(i->second); - } - partial_sort(result.begin(),result.begin()+topN,result.end(),Compare); - } - static bool Compare(const Word &x,const Word &y){ return x.weight > y.weight; } @@ -190,9 +183,9 @@ namespace cppjieba { MixSegment segment_; unordered_set stopWords_; }; - - inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) { - return os << word.word << '|' << word.offsets << '|' << word.weight; + + inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) { + return os << word.word << '|' << word.offsets << '|' << word.weight; } } // namespace cppjieba From a1ea1d075778ec049d55e5f47eb749e6a4909ae9 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Tue, 3 May 2016 20:01:44 +0800 Subject: [PATCH 2/4] add textrank unittest into cmake --- test/unittest/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index 2655215..ef19de4 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -13,6 +13,7 @@ ADD_EXECUTABLE(test.run pos_tagger_test.cpp jieba_test.cpp pre_filter_test.cpp + textrank_test.cpp ) TARGET_LINK_LIBRARIES(test.run gtest pthread) From 39316114c526ed55dcb889dc9ab3eb3c1600000b Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Tue, 3 May 2016 20:49:47 +0800 Subject: [PATCH 3/4] correct unittest case --- test/unittest/textrank_test.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/unittest/textrank_test.cpp b/test/unittest/textrank_test.cpp index 39b2163..c4ae193 100644 --- a/test/unittest/textrank_test.cpp +++ b/test/unittest/textrank_test.cpp @@ -42,7 +42,8 @@ TEST(TextRankExtractorTest, Test1) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.953149\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.794203\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.78716\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.767636\"]"); + ASSERT_EQ(res, "[\"专业|[\"36\"]|1\", \"CEO|[\"94\"]|0.94764\", \"当上|[\"87\"]|0.79271\", \"手扶拖拉机|[\"21\"]|0.789347\", \"走上|[\"100\"]|0.768261\"]"); + // ASSERT_EQ(res, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.953149\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.794203\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.78716\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.767636\"]"); } { @@ -70,7 +71,8 @@ TEST(TextRankExtractorTest, Test2) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"\xE4\xBC\x98\xE7\xA7\x80|[\"6\"]|1\", \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F|[\"12\"]|0.996685\", \"\xE8\x93\x9D\xE7\xBF\x94|[\"0\"]|0.992994\"]"); + ASSERT_EQ(res, "[\"蓝翔|[\"0\"]|1\", \"毕业生|[\"12\"]|0.996685\", \"优秀|[\"6\"]|0.992994\"]"); + //ASSERT_EQ(res, "[\"\xE4\xBC\x98\xE7\xA7\x80|[\"6\"]|1\", \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F|[\"12\"]|0.996685\", \"\xE8\x93\x9D\xE7\xBF\x94|[\"0\"]|0.992994\"]"); } { From f253db0133a8ba680acad0ad7e1f8e4f64e10059 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Tue, 3 May 2016 21:24:40 +0800 Subject: [PATCH 4/4] use map/set instead of unordered_map/unordered_set to make result stable --- include/cppjieba/TextRankExtractor.hpp | 11 +++++++---- test/unittest/textrank_test.cpp | 14 +++++++++----- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/include/cppjieba/TextRankExtractor.hpp b/include/cppjieba/TextRankExtractor.hpp index 948f3ad..a625695 100644 --- a/include/cppjieba/TextRankExtractor.hpp +++ b/include/cppjieba/TextRankExtractor.hpp @@ -12,15 +12,18 @@ namespace cppjieba { public: typedef struct _Word {string word;vector offsets;double weight;} Word; // struct Word private: - typedef std::unordered_map WordMap; + typedef std::map WordMap; class WordGraph{ private: typedef double Score; typedef string Node; - typedef std::unordered_set NodeSet; - typedef std::unordered_map Edges; - typedef std::unordered_map Graph; + typedef std::set NodeSet; + + typedef std::map Edges; + typedef std::map Graph; + //typedef std::unordered_map Edges; + //typedef std::unordered_map Graph; double d; Graph graph; diff --git a/test/unittest/textrank_test.cpp b/test/unittest/textrank_test.cpp index c4ae193..70dbc52 100644 --- a/test/unittest/textrank_test.cpp +++ b/test/unittest/textrank_test.cpp @@ -24,14 +24,16 @@ TEST(TextRankExtractorTest, Test1) { vector > words; Extractor.Extract(s, words, topN); res << words; - ASSERT_EQ(res, "[\"世界:1\", \"你好:0.514286\"]"); + //ASSERT_EQ(res, "[\"世界:1\", \"你好:0.514286\"]"); + ASSERT_EQ(res, "[\"\xE4\xB8\x96\xE7\x95\x8C:1\", \"\xE4\xBD\xA0\xE5\xA5\xBD:0.519787\"]"); } { vector words; Extractor.Extract(s, words, topN); res << words; - ASSERT_EQ(res, "[\"世界|[\"6\", \"12\"]|1\", \"你好|[\"0\"]|0.514286\"]"); + //ASSERT_EQ(res, "[\"世界|[\"6\", \"12\"]|1\", \"你好|[\"0\"]|0.514286\"]"); + ASSERT_EQ(res, "[\"\xE4\xB8\x96\xE7\x95\x8C|[\"6\", \"12\"]|1\", \"\xE4\xBD\xA0\xE5\xA5\xBD|[\"0\"]|0.519787\"]"); } } @@ -42,7 +44,7 @@ TEST(TextRankExtractorTest, Test1) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"专业|[\"36\"]|1\", \"CEO|[\"94\"]|0.94764\", \"当上|[\"87\"]|0.79271\", \"手扶拖拉机|[\"21\"]|0.789347\", \"走上|[\"100\"]|0.768261\"]"); + ASSERT_EQ(res, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.95375\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.801701\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.798968\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.775505\"]"); // ASSERT_EQ(res, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.953149\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.794203\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.78716\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.767636\"]"); } @@ -53,7 +55,8 @@ TEST(TextRankExtractorTest, Test1) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]"); + ASSERT_EQ(res, "[\"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|1\", \"iPhone6|[\"6\"]|0.996126\"]"); + //ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]"); } } @@ -82,6 +85,7 @@ TEST(TextRankExtractorTest, Test2) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]"); + //ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]"); + ASSERT_EQ(res, "[\"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|1\", \"iPhone6|[\"6\"]|0.996126\"]"); } }