From 669e971e3e4d6b60a36081f0610dfff5f6eca96b Mon Sep 17 00:00:00 2001 From: mayunyun <121999660@qq.com> Date: Mon, 25 Apr 2016 20:20:50 +0800 Subject: [PATCH 01/10] new file: include/cppjieba/TextRankExtractor.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add TextRank Keyword Extractor to JiebaCpp 新增TextRank关键词提取 --- include/cppjieba/TextRankExtractor.hpp | 168 +++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 include/cppjieba/TextRankExtractor.hpp diff --git a/include/cppjieba/TextRankExtractor.hpp b/include/cppjieba/TextRankExtractor.hpp new file mode 100644 index 0000000..a97c537 --- /dev/null +++ b/include/cppjieba/TextRankExtractor.hpp @@ -0,0 +1,168 @@ +#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H +#define CPPJIEBA_TEXTRANK_EXTRACTOR_H + +#include +#include "Jieba.hpp" + +namespace cppjieba { + using namespace limonp; + using namespace std; + + class TextRankExtractor { + public: + typedef struct _Word {string word;vector offsets;double weight;} Word; // struct Word + private: + typedef std::unordered_map WordMap; + + class WordGraph{ + private: + typedef double Score; + typedef string Node; + typedef std::unordered_set NodeSet; + typedef std::unordered_map Edges; + typedef std::unordered_map Graph; + + double d; + Graph graph; + NodeSet nodeSet; + public: + WordGraph(): d(0.85) {}; + WordGraph(double in_d): d(in_d) {}; + + void addEdge(Node start,Node end,double weight){ + Edges temp; + Edges::iterator gotEdges; + nodeSet.insert(start); + nodeSet.insert(end); + graph[start][end]+=weight; + graph[end][start]+=weight; + } + + void rank(WordMap &ws,size_t rankTime=10){ + WordMap outSum; + Score wsdef, min_rank, max_rank; + + if( graph.size() == 0) + return; + + wsdef = 1.0 / graph.size(); + + for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){ + // edges->first start节点;edge->first end节点;edge->second 权重 + ws[edges->first].word=edges->first; + ws[edges->first].weight=wsdef; + outSum[edges->first].weight=0; + for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){ + outSum[edges->first].weight+=edge->second; + } + } + //sort(nodeSet.begin(),nodeSet.end()); 是否需要排序? + for( size_t i=0; ifirst end节点;edge->second 权重 + s += edge->second / outSum[edge->first].weight * ws[edge->first].weight; + ws[*node].weight = (1 - d) + d * s; + } + } + + min_rank=max_rank=ws.begin()->second.weight; + for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ + if( i->second.weight < min_rank ){ + min_rank = i->second.weight; + } + if( i->second.weight > max_rank ){ + max_rank = i->second.weight; + } + } + for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ + ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0); + } + } + }; + + public: + TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) { + LoadStopWordDict(stopWordPath); + } + ~TextRankExtractor() { + } + + void Extract(const string& sentence, vector& keywords, size_t topN, size_t span=5,size_t rankTime=10) const { + vector words; + segment_.Cut(sentence, words); + + TextRankExtractor::WordGraph graph; + WordMap wordmap; + size_t offset = 0; + + for(size_t i=0; i < words.size(); i++){ + size_t t = offset; + offset += words[i].size(); + if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { + continue; + } + for(size_t j=i+1;jsecond); + } + + topN = min(topN, keywords.size()); + partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); + keywords.resize(topN); + } + private: + void LoadStopWordDict(const string& filePath) { + ifstream ifs(filePath.c_str()); + XCHECK(ifs.is_open()) << "open " << filePath << " failed"; + string line ; + while (getline(ifs, line)) { + stopWords_.insert(line); + } + assert(stopWords_.size()); + } + + bool IsSingleWord(const string& str) const { + Unicode unicode; + TransCode::Decode(str, unicode); + if (unicode.size() == 1) + return true; + return false; + } + + static void sortMapValue(WordMap &map,vector& result,size_t topN){ + for(auto i=map.begin();i!=map.end();i++){ + result.push_back(i->second); + } + partial_sort(result.begin(),result.begin()+topN,result.end(),Compare); + } + + static bool Compare(const Word &x,const Word &y){ + return x.weight > y.weight; + } + + MixSegment segment_; + unordered_set stopWords_; + }; +} // namespace cppjieba + +#endif + + From 1aa0a32d900133b732f70f46f23c3e3cc1dc22df Mon Sep 17 00:00:00 2001 From: mayunyun <121999660@qq.com> Date: Mon, 25 Apr 2016 20:28:47 +0800 Subject: [PATCH 02/10] code format check --- include/cppjieba/TextRankExtractor.hpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/include/cppjieba/TextRankExtractor.hpp b/include/cppjieba/TextRankExtractor.hpp index a97c537..20d295b 100644 --- a/include/cppjieba/TextRankExtractor.hpp +++ b/include/cppjieba/TextRankExtractor.hpp @@ -12,15 +12,15 @@ namespace cppjieba { public: typedef struct _Word {string word;vector offsets;double weight;} Word; // struct Word private: - typedef std::unordered_map WordMap; + typedef std::unordered_map WordMap; class WordGraph{ private: - typedef double Score; - typedef string Node; - typedef std::unordered_set NodeSet; - typedef std::unordered_map Edges; - typedef std::unordered_map Graph; + typedef double Score; + typedef string Node; + typedef std::unordered_set NodeSet; + typedef std::unordered_map Edges; + typedef std::unordered_map Graph; double d; Graph graph; @@ -117,15 +117,15 @@ namespace cppjieba { } graph.rank(wordmap,rankTime); - - keywords.clear(); + + keywords.clear(); keywords.reserve(wordmap.size()); - for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) { - keywords.push_back(itr->second); + for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) { + keywords.push_back(itr->second); } - - topN = min(topN, keywords.size()); - partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); + + topN = min(topN, keywords.size()); + partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); keywords.resize(topN); } private: From f2de41c15e96c4ec2c4b2a47913705d35c8b323e Mon Sep 17 00:00:00 2001 From: mayunyun <121999660@qq.com> Date: Tue, 3 May 2016 09:03:16 +0800 Subject: [PATCH 03/10] code layout change: tab -> space --- include/cppjieba/TextRankExtractor.hpp | 278 ++++++++++++------------- 1 file changed, 139 insertions(+), 139 deletions(-) diff --git a/include/cppjieba/TextRankExtractor.hpp b/include/cppjieba/TextRankExtractor.hpp index 20d295b..8e7d18a 100644 --- a/include/cppjieba/TextRankExtractor.hpp +++ b/include/cppjieba/TextRankExtractor.hpp @@ -5,162 +5,162 @@ #include "Jieba.hpp" namespace cppjieba { - using namespace limonp; - using namespace std; + using namespace limonp; + using namespace std; - class TextRankExtractor { - public: - typedef struct _Word {string word;vector offsets;double weight;} Word; // struct Word - private: - typedef std::unordered_map WordMap; - - class WordGraph{ - private: - typedef double Score; - typedef string Node; - typedef std::unordered_set NodeSet; - typedef std::unordered_map Edges; - typedef std::unordered_map Graph; + class TextRankExtractor { + public: + typedef struct _Word {string word;vector offsets;double weight;} Word; // struct Word + private: + typedef std::unordered_map WordMap; + + class WordGraph{ + private: + typedef double Score; + typedef string Node; + typedef std::unordered_set NodeSet; + typedef std::unordered_map Edges; + typedef std::unordered_map Graph; - double d; - Graph graph; - NodeSet nodeSet; - public: - WordGraph(): d(0.85) {}; - WordGraph(double in_d): d(in_d) {}; + double d; + Graph graph; + NodeSet nodeSet; + public: + WordGraph(): d(0.85) {}; + WordGraph(double in_d): d(in_d) {}; - void addEdge(Node start,Node end,double weight){ - Edges temp; - Edges::iterator gotEdges; - nodeSet.insert(start); - nodeSet.insert(end); - graph[start][end]+=weight; - graph[end][start]+=weight; - } + void addEdge(Node start,Node end,double weight){ + Edges temp; + Edges::iterator gotEdges; + nodeSet.insert(start); + nodeSet.insert(end); + graph[start][end]+=weight; + graph[end][start]+=weight; + } - void rank(WordMap &ws,size_t rankTime=10){ - WordMap outSum; - Score wsdef, min_rank, max_rank; + void rank(WordMap &ws,size_t rankTime=10){ + WordMap outSum; + Score wsdef, min_rank, max_rank; - if( graph.size() == 0) - return; + if( graph.size() == 0) + return; - wsdef = 1.0 / graph.size(); + wsdef = 1.0 / graph.size(); - for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){ - // edges->first start节点;edge->first end节点;edge->second 权重 - ws[edges->first].word=edges->first; - ws[edges->first].weight=wsdef; - outSum[edges->first].weight=0; - for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){ - outSum[edges->first].weight+=edge->second; - } - } - //sort(nodeSet.begin(),nodeSet.end()); 是否需要排序? - for( size_t i=0; ifirst end节点;edge->second 权重 - s += edge->second / outSum[edge->first].weight * ws[edge->first].weight; - ws[*node].weight = (1 - d) + d * s; - } - } + for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){ + // edges->first start节点;edge->first end节点;edge->second 权重 + ws[edges->first].word=edges->first; + ws[edges->first].weight=wsdef; + outSum[edges->first].weight=0; + for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){ + outSum[edges->first].weight+=edge->second; + } + } + //sort(nodeSet.begin(),nodeSet.end()); 是否需要排序? + for( size_t i=0; ifirst end节点;edge->second 权重 + s += edge->second / outSum[edge->first].weight * ws[edge->first].weight; + ws[*node].weight = (1 - d) + d * s; + } + } - min_rank=max_rank=ws.begin()->second.weight; - for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ - if( i->second.weight < min_rank ){ - min_rank = i->second.weight; - } - if( i->second.weight > max_rank ){ - max_rank = i->second.weight; - } - } - for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ - ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0); - } - } - }; + min_rank=max_rank=ws.begin()->second.weight; + for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ + if( i->second.weight < min_rank ){ + min_rank = i->second.weight; + } + if( i->second.weight > max_rank ){ + max_rank = i->second.weight; + } + } + for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ + ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0); + } + } + }; - public: - TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) { - LoadStopWordDict(stopWordPath); - } - ~TextRankExtractor() { - } + public: + TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) { + LoadStopWordDict(stopWordPath); + } + ~TextRankExtractor() { + } - void Extract(const string& sentence, vector& keywords, size_t topN, size_t span=5,size_t rankTime=10) const { - vector words; - segment_.Cut(sentence, words); + void Extract(const string& sentence, vector& keywords, size_t topN, size_t span=5,size_t rankTime=10) const { + vector words; + segment_.Cut(sentence, words); - TextRankExtractor::WordGraph graph; - WordMap wordmap; - size_t offset = 0; + TextRankExtractor::WordGraph graph; + WordMap wordmap; + size_t offset = 0; - for(size_t i=0; i < words.size(); i++){ - size_t t = offset; - offset += words[i].size(); - if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { - continue; - } - for(size_t j=i+1;jsecond); - } - - topN = min(topN, keywords.size()); - partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); - keywords.resize(topN); - } - private: - void LoadStopWordDict(const string& filePath) { - ifstream ifs(filePath.c_str()); - XCHECK(ifs.is_open()) << "open " << filePath << " failed"; - string line ; - while (getline(ifs, line)) { - stopWords_.insert(line); - } - assert(stopWords_.size()); - } + graph.rank(wordmap,rankTime); + + keywords.clear(); + keywords.reserve(wordmap.size()); + for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) { + keywords.push_back(itr->second); + } + + topN = min(topN, keywords.size()); + partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); + keywords.resize(topN); + } + private: + void LoadStopWordDict(const string& filePath) { + ifstream ifs(filePath.c_str()); + XCHECK(ifs.is_open()) << "open " << filePath << " failed"; + string line ; + while (getline(ifs, line)) { + stopWords_.insert(line); + } + assert(stopWords_.size()); + } - bool IsSingleWord(const string& str) const { - Unicode unicode; - TransCode::Decode(str, unicode); - if (unicode.size() == 1) - return true; - return false; - } + bool IsSingleWord(const string& str) const { + Unicode unicode; + TransCode::Decode(str, unicode); + if (unicode.size() == 1) + return true; + return false; + } - static void sortMapValue(WordMap &map,vector& result,size_t topN){ - for(auto i=map.begin();i!=map.end();i++){ - result.push_back(i->second); - } - partial_sort(result.begin(),result.begin()+topN,result.end(),Compare); - } + static void sortMapValue(WordMap &map,vector& result,size_t topN){ + for(auto i=map.begin();i!=map.end();i++){ + result.push_back(i->second); + } + partial_sort(result.begin(),result.begin()+topN,result.end(),Compare); + } - static bool Compare(const Word &x,const Word &y){ - return x.weight > y.weight; - } + static bool Compare(const Word &x,const Word &y){ + return x.weight > y.weight; + } - MixSegment segment_; - unordered_set stopWords_; - }; + MixSegment segment_; + unordered_set stopWords_; + }; } // namespace cppjieba #endif From 0f66a923b368cbb4f81b49fb12e45f48aba7469c Mon Sep 17 00:00:00 2001 From: mayunyun <121999660@qq.com> Date: Tue, 3 May 2016 18:06:14 +0800 Subject: [PATCH 04/10] =?UTF-8?q?1.=E5=A2=9E=E5=8A=A0=E5=8D=95=E5=85=83?= =?UTF-8?q?=E6=B5=8B=E8=AF=95=202.=E5=A2=9E=E5=8A=A0=E4=BA=86=E6=9E=84?= =?UTF-8?q?=E9=80=A0=E5=87=BD=E6=95=B0=E7=9A=84=E9=87=8D=E8=BD=BD=EF=BC=8C?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=BA=86=E6=8F=90=E5=8F=96=E5=87=BD=E6=95=B0?= =?UTF-8?q?=E7=9A=84=E9=87=8D=E8=BD=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/cppjieba/TextRankExtractor.hpp | 35 ++++++++++- test/unittest/textrank_test.cpp | 85 ++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 test/unittest/textrank_test.cpp diff --git a/include/cppjieba/TextRankExtractor.hpp b/include/cppjieba/TextRankExtractor.hpp index 8e7d18a..82e56f3 100644 --- a/include/cppjieba/TextRankExtractor.hpp +++ b/include/cppjieba/TextRankExtractor.hpp @@ -82,12 +82,41 @@ namespace cppjieba { } }; - public: + public: + TextRankExtractor(const string& dictPath, + const string& hmmFilePath, + const string& stopWordPath, + const string& userDict = "") + : segment_(dictPath, hmmFilePath, userDict) { + LoadStopWordDict(stopWordPath); + } + TextRankExtractor(const DictTrie* dictTrie, + const HMMModel* model, + const string& stopWordPath) + : segment_(dictTrie, model) { + LoadStopWordDict(stopWordPath); + } TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) { LoadStopWordDict(stopWordPath); } ~TextRankExtractor() { } + + void Extract(const string& sentence, vector& keywords, size_t topN) const { + vector topWords; + Extract(sentence, topWords, topN); + for (size_t i = 0; i < topWords.size(); i++) { + keywords.push_back(topWords[i].word); + } + } + + void Extract(const string& sentence, vector >& keywords, size_t topN) const { + vector topWords; + Extract(sentence, topWords, topN); + for (size_t i = 0; i < topWords.size(); i++) { + keywords.push_back(pair(topWords[i].word, topWords[i].weight)); + } + } void Extract(const string& sentence, vector& keywords, size_t topN, size_t span=5,size_t rankTime=10) const { vector words; @@ -161,6 +190,10 @@ namespace cppjieba { MixSegment segment_; unordered_set stopWords_; }; + + inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) { + return os << word.word << '|' << word.offsets << '|' << word.weight; + } } // namespace cppjieba #endif diff --git a/test/unittest/textrank_test.cpp b/test/unittest/textrank_test.cpp new file mode 100644 index 0000000..39b2163 --- /dev/null +++ b/test/unittest/textrank_test.cpp @@ -0,0 +1,85 @@ +#include "cppjieba/TextRankExtractor.hpp" +#include "gtest/gtest.h" + +using namespace cppjieba; + +TEST(TextRankExtractorTest, Test1) { + TextRankExtractor Extractor( + "../test/testdata/extra_dict/jieba.dict.small.utf8", + "../dict/hmm_model.utf8", + "../dict/stop_words.utf8"); + { + string s("你好世界世界而且而且"); + string res; + size_t topN = 5; + + { + vector words; + Extractor.Extract(s, words, topN); + res << words; + ASSERT_EQ(res, "[\"世界\", \"你好\"]"); + } + + { + vector > words; + Extractor.Extract(s, words, topN); + res << words; + ASSERT_EQ(res, "[\"世界:1\", \"你好:0.514286\"]"); + } + + { + vector words; + Extractor.Extract(s, words, topN); + res << words; + ASSERT_EQ(res, "[\"世界|[\"6\", \"12\"]|1\", \"你好|[\"0\"]|0.514286\"]"); + } + } + + { + string s("\xe6\x88\x91\xe6\x98\xaf\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe5\xad\xa6\xe9\x99\xa2\xe6\x89\x8b\xe6\x89\xb6\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe4\xb8\x93\xe4\xb8\x9a\xe7\x9a\x84\xe3\x80\x82\xe4\xb8\x8d\xe7\x94\xa8\xe5\xa4\x9a\xe4\xb9\x85\xef\xbc\x8c\xe6\x88\x91\xe5\xb0\xb1\xe4\xbc\x9a\xe5\x8d\x87\xe8\x81\x8c\xe5\x8a\xa0\xe8\x96\xaa\xef\xbc\x8c\xe5\xbd\x93\xe4\xb8\x8a CEO\xef\xbc\x8c\xe8\xb5\xb0\xe4\xb8\x8a\xe4\xba\xba\xe7\x94\x9f\xe5\xb7\x85\xe5\xb3\xb0"); + string res; + vector wordweights; + size_t topN = 5; + Extractor.Extract(s, wordweights, topN); + res << wordweights; + ASSERT_EQ(res, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.953149\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.794203\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.78716\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.767636\"]"); + } + + { + string s("一部iPhone6"); + string res; + vector wordweights; + size_t topN = 5; + Extractor.Extract(s, wordweights, topN); + res << wordweights; + ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]"); + } +} + +TEST(TextRankExtractorTest, Test2) { + TextRankExtractor Extractor( + "../test/testdata/extra_dict/jieba.dict.small.utf8", + "../dict/hmm_model.utf8", + "../dict/stop_words.utf8", + "../test/testdata/userdict.utf8"); + + { + string s("\xe8\x93\x9d\xe7\xbf\x94\xe4\xbc\x98\xe7\xa7\x80\xe6\xaf\x95\xe4\xb8\x9a\xe7\x94\x9f"); + string res; + vector wordweights; + size_t topN = 5; + Extractor.Extract(s, wordweights, topN); + res << wordweights; + ASSERT_EQ(res, "[\"\xE4\xBC\x98\xE7\xA7\x80|[\"6\"]|1\", \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F|[\"12\"]|0.996685\", \"\xE8\x93\x9D\xE7\xBF\x94|[\"0\"]|0.992994\"]"); + } + + { + string s("一部iPhone6"); + string res; + vector wordweights; + size_t topN = 5; + Extractor.Extract(s, wordweights, topN); + res << wordweights; + ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]"); + } +} From 6d105a864df9fc197c884e2a1aecc132cc36f425 Mon Sep 17 00:00:00 2001 From: Yanyi Wu Date: Tue, 3 May 2016 19:53:40 +0800 Subject: [PATCH 05/10] Update TextRankExtractor.hpp remove unused function which using c++11 keyword `auto` --- include/cppjieba/TextRankExtractor.hpp | 71 ++++++++++++-------------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/include/cppjieba/TextRankExtractor.hpp b/include/cppjieba/TextRankExtractor.hpp index 82e56f3..948f3ad 100644 --- a/include/cppjieba/TextRankExtractor.hpp +++ b/include/cppjieba/TextRankExtractor.hpp @@ -1,4 +1,4 @@ -#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H +#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H #define CPPJIEBA_TEXTRANK_EXTRACTOR_H #include @@ -82,40 +82,40 @@ namespace cppjieba { } }; - public: - TextRankExtractor(const string& dictPath, - const string& hmmFilePath, - const string& stopWordPath, - const string& userDict = "") - : segment_(dictPath, hmmFilePath, userDict) { - LoadStopWordDict(stopWordPath); - } - TextRankExtractor(const DictTrie* dictTrie, - const HMMModel* model, - const string& stopWordPath) - : segment_(dictTrie, model) { - LoadStopWordDict(stopWordPath); + public: + TextRankExtractor(const string& dictPath, + const string& hmmFilePath, + const string& stopWordPath, + const string& userDict = "") + : segment_(dictPath, hmmFilePath, userDict) { + LoadStopWordDict(stopWordPath); + } + TextRankExtractor(const DictTrie* dictTrie, + const HMMModel* model, + const string& stopWordPath) + : segment_(dictTrie, model) { + LoadStopWordDict(stopWordPath); } TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) { LoadStopWordDict(stopWordPath); } ~TextRankExtractor() { } - - void Extract(const string& sentence, vector& keywords, size_t topN) const { - vector topWords; - Extract(sentence, topWords, topN); - for (size_t i = 0; i < topWords.size(); i++) { - keywords.push_back(topWords[i].word); - } - } - - void Extract(const string& sentence, vector >& keywords, size_t topN) const { - vector topWords; - Extract(sentence, topWords, topN); - for (size_t i = 0; i < topWords.size(); i++) { - keywords.push_back(pair(topWords[i].word, topWords[i].weight)); - } + + void Extract(const string& sentence, vector& keywords, size_t topN) const { + vector topWords; + Extract(sentence, topWords, topN); + for (size_t i = 0; i < topWords.size(); i++) { + keywords.push_back(topWords[i].word); + } + } + + void Extract(const string& sentence, vector >& keywords, size_t topN) const { + vector topWords; + Extract(sentence, topWords, topN); + for (size_t i = 0; i < topWords.size(); i++) { + keywords.push_back(pair(topWords[i].word, topWords[i].weight)); + } } void Extract(const string& sentence, vector& keywords, size_t topN, size_t span=5,size_t rankTime=10) const { @@ -176,13 +176,6 @@ namespace cppjieba { return false; } - static void sortMapValue(WordMap &map,vector& result,size_t topN){ - for(auto i=map.begin();i!=map.end();i++){ - result.push_back(i->second); - } - partial_sort(result.begin(),result.begin()+topN,result.end(),Compare); - } - static bool Compare(const Word &x,const Word &y){ return x.weight > y.weight; } @@ -190,9 +183,9 @@ namespace cppjieba { MixSegment segment_; unordered_set stopWords_; }; - - inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) { - return os << word.word << '|' << word.offsets << '|' << word.weight; + + inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) { + return os << word.word << '|' << word.offsets << '|' << word.weight; } } // namespace cppjieba From a1ea1d075778ec049d55e5f47eb749e6a4909ae9 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Tue, 3 May 2016 20:01:44 +0800 Subject: [PATCH 06/10] add textrank unittest into cmake --- test/unittest/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index 2655215..ef19de4 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -13,6 +13,7 @@ ADD_EXECUTABLE(test.run pos_tagger_test.cpp jieba_test.cpp pre_filter_test.cpp + textrank_test.cpp ) TARGET_LINK_LIBRARIES(test.run gtest pthread) From 39316114c526ed55dcb889dc9ab3eb3c1600000b Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Tue, 3 May 2016 20:49:47 +0800 Subject: [PATCH 07/10] correct unittest case --- test/unittest/textrank_test.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/unittest/textrank_test.cpp b/test/unittest/textrank_test.cpp index 39b2163..c4ae193 100644 --- a/test/unittest/textrank_test.cpp +++ b/test/unittest/textrank_test.cpp @@ -42,7 +42,8 @@ TEST(TextRankExtractorTest, Test1) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.953149\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.794203\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.78716\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.767636\"]"); + ASSERT_EQ(res, "[\"专业|[\"36\"]|1\", \"CEO|[\"94\"]|0.94764\", \"当上|[\"87\"]|0.79271\", \"手扶拖拉机|[\"21\"]|0.789347\", \"走上|[\"100\"]|0.768261\"]"); + // ASSERT_EQ(res, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.953149\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.794203\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.78716\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.767636\"]"); } { @@ -70,7 +71,8 @@ TEST(TextRankExtractorTest, Test2) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"\xE4\xBC\x98\xE7\xA7\x80|[\"6\"]|1\", \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F|[\"12\"]|0.996685\", \"\xE8\x93\x9D\xE7\xBF\x94|[\"0\"]|0.992994\"]"); + ASSERT_EQ(res, "[\"蓝翔|[\"0\"]|1\", \"毕业生|[\"12\"]|0.996685\", \"优秀|[\"6\"]|0.992994\"]"); + //ASSERT_EQ(res, "[\"\xE4\xBC\x98\xE7\xA7\x80|[\"6\"]|1\", \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F|[\"12\"]|0.996685\", \"\xE8\x93\x9D\xE7\xBF\x94|[\"0\"]|0.992994\"]"); } { From f253db0133a8ba680acad0ad7e1f8e4f64e10059 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Tue, 3 May 2016 21:24:40 +0800 Subject: [PATCH 08/10] use map/set instead of unordered_map/unordered_set to make result stable --- include/cppjieba/TextRankExtractor.hpp | 11 +++++++---- test/unittest/textrank_test.cpp | 14 +++++++++----- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/include/cppjieba/TextRankExtractor.hpp b/include/cppjieba/TextRankExtractor.hpp index 948f3ad..a625695 100644 --- a/include/cppjieba/TextRankExtractor.hpp +++ b/include/cppjieba/TextRankExtractor.hpp @@ -12,15 +12,18 @@ namespace cppjieba { public: typedef struct _Word {string word;vector offsets;double weight;} Word; // struct Word private: - typedef std::unordered_map WordMap; + typedef std::map WordMap; class WordGraph{ private: typedef double Score; typedef string Node; - typedef std::unordered_set NodeSet; - typedef std::unordered_map Edges; - typedef std::unordered_map Graph; + typedef std::set NodeSet; + + typedef std::map Edges; + typedef std::map Graph; + //typedef std::unordered_map Edges; + //typedef std::unordered_map Graph; double d; Graph graph; diff --git a/test/unittest/textrank_test.cpp b/test/unittest/textrank_test.cpp index c4ae193..70dbc52 100644 --- a/test/unittest/textrank_test.cpp +++ b/test/unittest/textrank_test.cpp @@ -24,14 +24,16 @@ TEST(TextRankExtractorTest, Test1) { vector > words; Extractor.Extract(s, words, topN); res << words; - ASSERT_EQ(res, "[\"世界:1\", \"你好:0.514286\"]"); + //ASSERT_EQ(res, "[\"世界:1\", \"你好:0.514286\"]"); + ASSERT_EQ(res, "[\"\xE4\xB8\x96\xE7\x95\x8C:1\", \"\xE4\xBD\xA0\xE5\xA5\xBD:0.519787\"]"); } { vector words; Extractor.Extract(s, words, topN); res << words; - ASSERT_EQ(res, "[\"世界|[\"6\", \"12\"]|1\", \"你好|[\"0\"]|0.514286\"]"); + //ASSERT_EQ(res, "[\"世界|[\"6\", \"12\"]|1\", \"你好|[\"0\"]|0.514286\"]"); + ASSERT_EQ(res, "[\"\xE4\xB8\x96\xE7\x95\x8C|[\"6\", \"12\"]|1\", \"\xE4\xBD\xA0\xE5\xA5\xBD|[\"0\"]|0.519787\"]"); } } @@ -42,7 +44,7 @@ TEST(TextRankExtractorTest, Test1) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"专业|[\"36\"]|1\", \"CEO|[\"94\"]|0.94764\", \"当上|[\"87\"]|0.79271\", \"手扶拖拉机|[\"21\"]|0.789347\", \"走上|[\"100\"]|0.768261\"]"); + ASSERT_EQ(res, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.95375\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.801701\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.798968\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.775505\"]"); // ASSERT_EQ(res, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.953149\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.794203\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.78716\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.767636\"]"); } @@ -53,7 +55,8 @@ TEST(TextRankExtractorTest, Test1) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]"); + ASSERT_EQ(res, "[\"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|1\", \"iPhone6|[\"6\"]|0.996126\"]"); + //ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]"); } } @@ -82,6 +85,7 @@ TEST(TextRankExtractorTest, Test2) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]"); + //ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]"); + ASSERT_EQ(res, "[\"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|1\", \"iPhone6|[\"6\"]|0.996126\"]"); } } From d5a52a8e7b69ff70a16fb60bcf23c44a0649dd06 Mon Sep 17 00:00:00 2001 From: mayunyun <121999660@qq.com> Date: Wed, 4 May 2016 17:52:30 +0800 Subject: [PATCH 09/10] 1. remove stopword from span windows 2. update unittest --- include/cppjieba/TextRankExtractor.hpp | 3 ++- test/unittest/textrank_test.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/cppjieba/TextRankExtractor.hpp b/include/cppjieba/TextRankExtractor.hpp index a625695..34c6aae 100644 --- a/include/cppjieba/TextRankExtractor.hpp +++ b/include/cppjieba/TextRankExtractor.hpp @@ -135,8 +135,9 @@ namespace cppjieba { if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { continue; } - for(size_t j=i+1;j Date: Wed, 4 May 2016 19:33:05 +0800 Subject: [PATCH 10/10] update unittest to pass 'make test' --- test/unittest/textrank_test.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/unittest/textrank_test.cpp b/test/unittest/textrank_test.cpp index d62b2e5..ef7ac27 100644 --- a/test/unittest/textrank_test.cpp +++ b/test/unittest/textrank_test.cpp @@ -42,7 +42,8 @@ TEST(TextRankExtractorTest, Test1) { size_t topN = 5; Extractor.Extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[{\"word\": \"专业\", \"offset\": [36], \"weight\": 1}, {\"word\": \"CEO\", \"offset\": [94], \"weight\": 0.95375}, {\"word\": \"手扶拖拉机\", \"offset\": [21], \"weight\": 0.801701}, {\"word\": \"当上\", \"offset\": [87], \"weight\": 0.798968}, {\"word\": \"走上\", \"offset\": [100], \"weight\": 0.775505}]"); + ASSERT_EQ(res, "[{\"word\": \"当上\", \"offset\": [87], \"weight\": 1}, {\"word\": \"不用\", \"offset\": [48], \"weight\": 0.989848}, {\"word\": \"多久\", \"offset\": [54], \"weight\": 0.985126}, {\"word\": \"加薪\", \"offset\": [78], \"weight\": 0.983046}, {\"word\": \"升职\", \"offset\": [72], \"weight\": 0.980278}]"); + //ASSERT_EQ(res, "[{\"word\": \"专业\", \"offset\": [36], \"weight\": 1}, {\"word\": \"CEO\", \"offset\": [94], \"weight\": 0.95375}, {\"word\": \"手扶拖拉机\", \"offset\": [21], \"weight\": 0.801701}, {\"word\": \"当上\", \"offset\": [87], \"weight\": 0.798968}, {\"word\": \"走上\", \"offset\": [100], \"weight\": 0.775505}]"); } {