diff --git a/include/cppjieba/KeywordExtractor.hpp b/include/cppjieba/KeywordExtractor.hpp
index da67ea2..6d26223 100644
--- a/include/cppjieba/KeywordExtractor.hpp
+++ b/include/cppjieba/KeywordExtractor.hpp
@@ -145,7 +145,7 @@ class KeywordExtractor {
   double idfAverage_;
 
   unordered_set<string> stopWords_;
-}; // class Jieba
+}; // class KeywordExtractor
 
 inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
   return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
diff --git a/include/cppjieba/TextRankExtractor.hpp b/include/cppjieba/TextRankExtractor.hpp
new file mode 100644
index 0000000..292d0a8
--- /dev/null
+++ b/include/cppjieba/TextRankExtractor.hpp
@@ -0,0 +1,190 @@
+#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H
+#define CPPJIEBA_TEXTRANK_EXTRACTOR_H
+
+#include <cmath>
+#include "Jieba.hpp"
+
+namespace cppjieba {
+  using namespace limonp;
+  using namespace std;
+
+  class TextRankExtractor {
+  public:
+    typedef struct _Word {string word;vector<size_t> offsets;double weight;} Word; // struct Word
+  private:
+    typedef std::map<string,Word> WordMap;
+
+    class WordGraph{
+    private:
+      typedef double Score;
+      typedef string Node;
+      typedef std::set<Node> NodeSet;
+
+      typedef std::map<Node,double> Edges;
+      typedef std::map<Node,Edges> Graph;
+      //typedef std::unordered_map<Node,double> Edges;
+      //typedef std::unordered_map<Node,Edges> Graph;
+
+      double d;
+      Graph graph;
+      NodeSet nodeSet;
+    public:
+      WordGraph(): d(0.85) {};
+      WordGraph(double in_d): d(in_d) {};
+
+      void addEdge(Node start,Node end,double weight){
+        Edges temp;
+        Edges::iterator gotEdges;
+        nodeSet.insert(start);
+        nodeSet.insert(end);
+        graph[start][end]+=weight;
+        graph[end][start]+=weight;
+      }
+
+      void rank(WordMap &ws,size_t rankTime=10){
+        WordMap outSum;
+        Score wsdef, min_rank, max_rank;
+
+        if( graph.size() == 0)
+          return;
+
+        wsdef = 1.0 / graph.size();
+
+        for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){
+          // edges->first is the start node; edge->first is the end node; edge->second is the weight
+          ws[edges->first].word=edges->first;
+          ws[edges->first].weight=wsdef;
+          outSum[edges->first].weight=0;
+          for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){
+            outSum[edges->first].weight+=edge->second;
+          }
+        }
+        //sort(nodeSet.begin(),nodeSet.end()); is sorting needed here?
+        for( size_t i=0; i<rankTime; i++ ){
+          for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++ ){
+            double s = 0;
+            for( Edges::iterator edge = graph[*node].begin(); edge != graph[*node].end(); edge++ )
+              // edge->first is the end node; edge->second is the weight
+              s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
+            ws[*node].weight = (1 - d) + d * s;
+          }
+        }
+
+        min_rank=max_rank=ws.begin()->second.weight;
+        for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
+          if( i->second.weight < min_rank ){
+            min_rank = i->second.weight;
+          }
+          if( i->second.weight > max_rank ){
+            max_rank = i->second.weight;
+          }
+        }
+        for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
+          ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
+        }
+      }
+    };
+
+  public:
+    TextRankExtractor(const string& dictPath,
+          const string& hmmFilePath,
+          const string& stopWordPath,
+          const string& userDict = "")
+      : segment_(dictPath, hmmFilePath, userDict) {
+      LoadStopWordDict(stopWordPath);
+    }
+    TextRankExtractor(const DictTrie* dictTrie,
+          const HMMModel* model,
+          const string& stopWordPath)
+      : segment_(dictTrie, model) {
+      LoadStopWordDict(stopWordPath);
+    }
+    TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
+      LoadStopWordDict(stopWordPath);
+    }
+    ~TextRankExtractor() {
+    }
+
+    void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
+      vector<Word> topWords;
+      Extract(sentence, topWords, topN);
+      for (size_t i = 0; i < topWords.size(); i++) {
+        keywords.push_back(topWords[i].word);
+      }
+    }
+
+    void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
+      vector<Word> topWords;
+      Extract(sentence, topWords, topN);
+      for (size_t i = 0; i < topWords.size(); i++) {
+        keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
+      }
+    }
+
+    void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const {
+      vector<string> words;
+      segment_.Cut(sentence, words);
+
+      TextRankExtractor::WordGraph graph;
+      WordMap wordmap;
+      size_t offset = 0;
+
+      for(size_t i=0; i < words.size(); i++){
+        size_t t = offset;
+        offset += words[i].size();
+        if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
+          continue;
+        }
+        for(size_t j=i+1,skip=0;j<i+span+skip && j<words.size();j++){
+          if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
+            skip++;
+            continue;
+          }
+          graph.addEdge(words[i],words[j],1);
+        }
+        wordmap[words[i]].offsets.push_back(t);
+      }
+      if (offset != sentence.size()) {
+        XLOG(ERROR) << "words illegal";
+        return;
+      }
+
+      graph.rank(wordmap,rankTime);
+
+      keywords.clear();
+      keywords.reserve(wordmap.size());
+      for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
+        keywords.push_back(itr->second);
+      }
+
+      topN = min(topN, keywords.size());
+      partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
+      keywords.resize(topN);
+    }
+  private:
+    void LoadStopWordDict(const string& filePath) {
+      ifstream ifs(filePath.c_str());
+      XCHECK(ifs.is_open()) << "open " << filePath << " failed";
+      string line ;
+      while (getline(ifs, line)) {
+        stopWords_.insert(line);
+      }
+      assert(stopWords_.size());
+    }
+
+    static bool Compare(const Word &x,const Word &y){
+      return x.weight > y.weight;
+    }
+
+    MixSegment segment_;
+    unordered_set<string> stopWords_;
+  }; // class TextRankExtractor
+
+  inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
+    return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
+  }
+} // namespace cppjieba
+
+#endif
+
diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt
index 80c86af..802e80a 100644
--- a/test/unittest/CMakeLists.txt
+++ b/test/unittest/CMakeLists.txt
@@ -14,6 +14,7 @@ ADD_EXECUTABLE(test.run
   jieba_test.cpp
   pre_filter_test.cpp
   unicode_test.cpp
+  textrank_test.cpp
 )
 
 if(MSVC)
diff --git a/test/unittest/textrank_test.cpp b/test/unittest/textrank_test.cpp
new file mode 100644
index 0000000..ef7ac27
--- /dev/null
+++ b/test/unittest/textrank_test.cpp
@@ -0,0 +1,86 @@
+#include "cppjieba/TextRankExtractor.hpp"
"gtest/gtest.h" + +using namespace cppjieba; + +TEST(TextRankExtractorTest, Test1) { + TextRankExtractor Extractor( + "../test/testdata/extra_dict/jieba.dict.small.utf8", + "../dict/hmm_model.utf8", + "../dict/stop_words.utf8"); + { + string s("你好世界世界而且而且"); + string res; + size_t topN = 5; + + { + vector words; + Extractor.Extract(s, words, topN); + res << words; + ASSERT_EQ(res, "[\"世界\", \"你好\"]"); + } + + { + vector > words; + Extractor.Extract(s, words, topN); + res << words; + ASSERT_EQ(res, "[世界:1, 你好:0.519787]"); + } + + { + vector words; + Extractor.Extract(s, words, topN); + res << words; + ASSERT_EQ(res, "[{\"word\": \"世界\", \"offset\": [6, 12], \"weight\": 1}, {\"word\": \"你好\", \"offset\": [0], \"weight\": 0.519787}]"); + } + } + + { + string s("\xe6\x88\x91\xe6\x98\xaf\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe5\xad\xa6\xe9\x99\xa2\xe6\x89\x8b\xe6\x89\xb6\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe4\xb8\x93\xe4\xb8\x9a\xe7\x9a\x84\xe3\x80\x82\xe4\xb8\x8d\xe7\x94\xa8\xe5\xa4\x9a\xe4\xb9\x85\xef\xbc\x8c\xe6\x88\x91\xe5\xb0\xb1\xe4\xbc\x9a\xe5\x8d\x87\xe8\x81\x8c\xe5\x8a\xa0\xe8\x96\xaa\xef\xbc\x8c\xe5\xbd\x93\xe4\xb8\x8a CEO\xef\xbc\x8c\xe8\xb5\xb0\xe4\xb8\x8a\xe4\xba\xba\xe7\x94\x9f\xe5\xb7\x85\xe5\xb3\xb0"); + string res; + vector wordweights; + size_t topN = 5; + Extractor.Extract(s, wordweights, topN); + res << wordweights; + ASSERT_EQ(res, "[{\"word\": \"当上\", \"offset\": [87], \"weight\": 1}, {\"word\": \"不用\", \"offset\": [48], \"weight\": 0.989848}, {\"word\": \"多久\", \"offset\": [54], \"weight\": 0.985126}, {\"word\": \"加薪\", \"offset\": [78], \"weight\": 0.983046}, {\"word\": \"升职\", \"offset\": [72], \"weight\": 0.980278}]"); + //ASSERT_EQ(res, "[{\"word\": \"专业\", \"offset\": [36], \"weight\": 1}, {\"word\": \"CEO\", \"offset\": [94], \"weight\": 0.95375}, {\"word\": \"手扶拖拉机\", \"offset\": [21], \"weight\": 0.801701}, {\"word\": \"当上\", \"offset\": [87], \"weight\": 0.798968}, {\"word\": \"走上\", \"offset\": [100], \"weight\": 0.775505}]"); + } + + { + string s("一部iPhone6"); + string res; + vector wordweights; + size_t topN = 5; + Extractor.Extract(s, wordweights, topN); + res << wordweights; + ASSERT_EQ(res, "[{\"word\": \"一部\", \"offset\": [0], \"weight\": 1}, {\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 0.996126}]"); + } +} + +TEST(TextRankExtractorTest, Test2) { + TextRankExtractor Extractor( + "../test/testdata/extra_dict/jieba.dict.small.utf8", + "../dict/hmm_model.utf8", + "../dict/stop_words.utf8", + "../test/testdata/userdict.utf8"); + + { + string s("\xe8\x93\x9d\xe7\xbf\x94\xe4\xbc\x98\xe7\xa7\x80\xe6\xaf\x95\xe4\xb8\x9a\xe7\x94\x9f"); + string res; + vector wordweights; + size_t topN = 5; + Extractor.Extract(s, wordweights, topN); + res << wordweights; + ASSERT_EQ(res, "[{\"word\": \"蓝翔\", \"offset\": [0], \"weight\": 1}, {\"word\": \"毕业生\", \"offset\": [12], \"weight\": 0.996685}, {\"word\": \"优秀\", \"offset\": [6], \"weight\": 0.992994}]"); + } + + { + string s("一部iPhone6"); + string res; + vector wordweights; + size_t topN = 5; + Extractor.Extract(s, wordweights, topN); + res << wordweights; + ASSERT_EQ(res, "[{\"word\": \"一部\", \"offset\": [0], \"weight\": 1}, {\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 0.996126}]"); + } +}