diff --git a/include/cppjieba/TextRankExtractor.hpp b/include/cppjieba/TextRankExtractor.hpp index 20d295b..8e7d18a 100644 --- a/include/cppjieba/TextRankExtractor.hpp +++ b/include/cppjieba/TextRankExtractor.hpp @@ -5,162 +5,162 @@ #include "Jieba.hpp" namespace cppjieba { - using namespace limonp; - using namespace std; + using namespace limonp; + using namespace std; - class TextRankExtractor { - public: - typedef struct _Word {string word;vector offsets;double weight;} Word; // struct Word - private: - typedef std::unordered_map WordMap; - - class WordGraph{ - private: - typedef double Score; - typedef string Node; - typedef std::unordered_set NodeSet; - typedef std::unordered_map Edges; - typedef std::unordered_map Graph; + class TextRankExtractor { + public: + typedef struct _Word {string word;vector offsets;double weight;} Word; // struct Word + private: + typedef std::unordered_map WordMap; + + class WordGraph{ + private: + typedef double Score; + typedef string Node; + typedef std::unordered_set NodeSet; + typedef std::unordered_map Edges; + typedef std::unordered_map Graph; - double d; - Graph graph; - NodeSet nodeSet; - public: - WordGraph(): d(0.85) {}; - WordGraph(double in_d): d(in_d) {}; + double d; + Graph graph; + NodeSet nodeSet; + public: + WordGraph(): d(0.85) {}; + WordGraph(double in_d): d(in_d) {}; - void addEdge(Node start,Node end,double weight){ - Edges temp; - Edges::iterator gotEdges; - nodeSet.insert(start); - nodeSet.insert(end); - graph[start][end]+=weight; - graph[end][start]+=weight; - } + void addEdge(Node start,Node end,double weight){ + Edges temp; + Edges::iterator gotEdges; + nodeSet.insert(start); + nodeSet.insert(end); + graph[start][end]+=weight; + graph[end][start]+=weight; + } - void rank(WordMap &ws,size_t rankTime=10){ - WordMap outSum; - Score wsdef, min_rank, max_rank; + void rank(WordMap &ws,size_t rankTime=10){ + WordMap outSum; + Score wsdef, min_rank, max_rank; - if( graph.size() == 0) - return; + if( graph.size() == 0) + return; - wsdef = 1.0 / graph.size(); + wsdef = 1.0 / graph.size(); - for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){ - // edges->first start节点;edge->first end节点;edge->second 权重 - ws[edges->first].word=edges->first; - ws[edges->first].weight=wsdef; - outSum[edges->first].weight=0; - for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){ - outSum[edges->first].weight+=edge->second; - } - } - //sort(nodeSet.begin(),nodeSet.end()); 是否需要排序? - for( size_t i=0; ifirst end节点;edge->second 权重 - s += edge->second / outSum[edge->first].weight * ws[edge->first].weight; - ws[*node].weight = (1 - d) + d * s; - } - } + for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){ + // edges->first start节点;edge->first end节点;edge->second 权重 + ws[edges->first].word=edges->first; + ws[edges->first].weight=wsdef; + outSum[edges->first].weight=0; + for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){ + outSum[edges->first].weight+=edge->second; + } + } + //sort(nodeSet.begin(),nodeSet.end()); 是否需要排序? + for( size_t i=0; ifirst end节点;edge->second 权重 + s += edge->second / outSum[edge->first].weight * ws[edge->first].weight; + ws[*node].weight = (1 - d) + d * s; + } + } - min_rank=max_rank=ws.begin()->second.weight; - for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ - if( i->second.weight < min_rank ){ - min_rank = i->second.weight; - } - if( i->second.weight > max_rank ){ - max_rank = i->second.weight; - } - } - for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ - ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0); - } - } - }; + min_rank=max_rank=ws.begin()->second.weight; + for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ + if( i->second.weight < min_rank ){ + min_rank = i->second.weight; + } + if( i->second.weight > max_rank ){ + max_rank = i->second.weight; + } + } + for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ + ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0); + } + } + }; - public: - TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) { - LoadStopWordDict(stopWordPath); - } - ~TextRankExtractor() { - } + public: + TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) { + LoadStopWordDict(stopWordPath); + } + ~TextRankExtractor() { + } - void Extract(const string& sentence, vector& keywords, size_t topN, size_t span=5,size_t rankTime=10) const { - vector words; - segment_.Cut(sentence, words); + void Extract(const string& sentence, vector& keywords, size_t topN, size_t span=5,size_t rankTime=10) const { + vector words; + segment_.Cut(sentence, words); - TextRankExtractor::WordGraph graph; - WordMap wordmap; - size_t offset = 0; + TextRankExtractor::WordGraph graph; + WordMap wordmap; + size_t offset = 0; - for(size_t i=0; i < words.size(); i++){ - size_t t = offset; - offset += words[i].size(); - if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { - continue; - } - for(size_t j=i+1;jsecond); - } - - topN = min(topN, keywords.size()); - partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); - keywords.resize(topN); - } - private: - void LoadStopWordDict(const string& filePath) { - ifstream ifs(filePath.c_str()); - XCHECK(ifs.is_open()) << "open " << filePath << " failed"; - string line ; - while (getline(ifs, line)) { - stopWords_.insert(line); - } - assert(stopWords_.size()); - } + graph.rank(wordmap,rankTime); + + keywords.clear(); + keywords.reserve(wordmap.size()); + for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) { + keywords.push_back(itr->second); + } + + topN = min(topN, keywords.size()); + partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); + keywords.resize(topN); + } + private: + void LoadStopWordDict(const string& filePath) { + ifstream ifs(filePath.c_str()); + XCHECK(ifs.is_open()) << "open " << filePath << " failed"; + string line ; + while (getline(ifs, line)) { + stopWords_.insert(line); + } + assert(stopWords_.size()); + } - bool IsSingleWord(const string& str) const { - Unicode unicode; - TransCode::Decode(str, unicode); - if (unicode.size() == 1) - return true; - return false; - } + bool IsSingleWord(const string& str) const { + Unicode unicode; + TransCode::Decode(str, unicode); + if (unicode.size() == 1) + return true; + return false; + } - static void sortMapValue(WordMap &map,vector& result,size_t topN){ - for(auto i=map.begin();i!=map.end();i++){ - result.push_back(i->second); - } - partial_sort(result.begin(),result.begin()+topN,result.end(),Compare); - } + static void sortMapValue(WordMap &map,vector& result,size_t topN){ + for(auto i=map.begin();i!=map.end();i++){ + result.push_back(i->second); + } + partial_sort(result.begin(),result.begin()+topN,result.end(),Compare); + } - static bool Compare(const Word &x,const Word &y){ - return x.weight > y.weight; - } + static bool Compare(const Word &x,const Word &y){ + return x.weight > y.weight; + } - MixSegment segment_; - unordered_set stopWords_; - }; + MixSegment segment_; + unordered_set stopWords_; + }; } // namespace cppjieba #endif