mirror of https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Merge pull request #1 from yanyiwu/patch-1
Update TextRankExtractor.hpp: use yanyiwu's correction
commit 04c176de08
TextRankExtractor.hpp
@@ -1,4 +1,4 @@
 #ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H
 #define CPPJIEBA_TEXTRANK_EXTRACTOR_H

 #include <cmath>
@@ -12,15 +12,18 @@ namespace cppjieba {
 public:
   typedef struct _Word {string word;vector<size_t> offsets;double weight;} Word; // struct Word
 private:
-  typedef std::unordered_map<string,Word> WordMap;
+  typedef std::map<string,Word> WordMap;

   class WordGraph{
   private:
     typedef double Score;
     typedef string Node;
-    typedef std::unordered_set<Node> NodeSet;
-    typedef std::unordered_map<Node,double> Edges;
-    typedef std::unordered_map<Node,Edges> Graph;
+    typedef std::set<Node> NodeSet;
+
+    typedef std::map<Node,double> Edges;
+    typedef std::map<Node,Edges> Graph;
+    //typedef std::unordered_map<Node,double> Edges;
+    //typedef std::unordered_map<Node,Edges> Graph;

     double d;
     Graph graph;
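The container swap above is the substance of the correction: std::map and std::set iterate in sorted key order, whereas std::unordered_map and std::unordered_set leave iteration order unspecified, so it can differ between standard libraries and even between runs. Making the word map and graph ordered makes keyword extraction reproducible, which is presumably why the expected strings in textrank_test.cpp change further down. A minimal sketch of the difference (illustration only, not part of the commit):

#include <iostream>
#include <map>
#include <string>
#include <unordered_map>

int main() {
  // Identical contents, different iteration guarantees.
  std::map<std::string, double> ordered = {{"b", 2}, {"c", 3}, {"a", 1}};
  std::unordered_map<std::string, double> hashed = {{"b", 2}, {"c", 3}, {"a", 1}};

  // Always prints "a b c": std::map iterates in sorted key order.
  for (const auto& kv : ordered) std::cout << kv.first << ' ';
  std::cout << '\n';

  // Order here is unspecified and may differ between libstdc++, libc++,
  // and MSVC, so anything derived from it is non-deterministic.
  for (const auto& kv : hashed) std::cout << kv.first << ' ';
  std::cout << '\n';
  return 0;
}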
@@ -82,40 +85,40 @@
     }
   };

 public:
   TextRankExtractor(const string& dictPath,
                     const string& hmmFilePath,
                     const string& stopWordPath,
                     const string& userDict = "")
     : segment_(dictPath, hmmFilePath, userDict) {
     LoadStopWordDict(stopWordPath);
   }
   TextRankExtractor(const DictTrie* dictTrie,
                     const HMMModel* model,
                     const string& stopWordPath)
     : segment_(dictTrie, model) {
     LoadStopWordDict(stopWordPath);
   }
   TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
     LoadStopWordDict(stopWordPath);
   }
   ~TextRankExtractor() {
   }

   void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
     vector<Word> topWords;
     Extract(sentence, topWords, topN);
     for (size_t i = 0; i < topWords.size(); i++) {
       keywords.push_back(topWords[i].word);
     }
   }

   void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
     vector<Word> topWords;
     Extract(sentence, topWords, topN);
     for (size_t i = 0; i < topWords.size(); i++) {
       keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
     }
   }

   void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const {
@@ -176,13 +179,6 @@ namespace cppjieba {
       return false;
     }

-    static void sortMapValue(WordMap &map,vector<Word>& result,size_t topN){
-      for(auto i=map.begin();i!=map.end();i++){
-        result.push_back(i->second);
-      }
-      partial_sort(result.begin(),result.begin()+topN,result.end(),Compare);
-    }
-
     static bool Compare(const Word &x,const Word &y){
       return x.weight > y.weight;
     }
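The deleted sortMapValue helper also harbored a latent bug: result.begin()+topN is undefined whenever topN exceeds result.size(), which std::partial_sort does not tolerate. A self-contained, guarded sketch of the same idea (sortMapValueSafe is a hypothetical name, not code from this commit; Word, WordMap, and Compare mirror the declarations above):

#include <algorithm>
#include <cstddef>
#include <map>
#include <string>
#include <vector>

struct Word { std::string word; std::vector<std::size_t> offsets; double weight; };
typedef std::map<std::string, Word> WordMap;  // matches the typedef introduced above

static bool Compare(const Word& x, const Word& y) { return x.weight > y.weight; }

// Guarded variant of the removed helper: collect all words, then clamp
// topN so the middle iterator handed to partial_sort never passes end().
static void sortMapValueSafe(const WordMap& m, std::vector<Word>& result, std::size_t topN) {
  for (WordMap::const_iterator i = m.begin(); i != m.end(); ++i) {
    result.push_back(i->second);
  }
  const std::size_t n = std::min(topN, result.size());
  std::partial_sort(result.begin(), result.begin() + n, result.end(), Compare);
}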
@@ -190,9 +186,9 @@ namespace cppjieba {
   MixSegment segment_;
   unordered_set<string> stopWords_;
 };

 inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
   return os << word.word << '|' << word.offsets << '|' << word.weight;
 }
 } // namespace cppjieba
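The operator<< above also streams word.offsets, a vector<size_t>, so it depends on a vector printer defined elsewhere in the codebase (presumably limonp's) that renders offsets as ["6", "12"]. A self-contained sketch that reproduces the output format seen in the test expectations below; the vector printer here is an illustrative stand-in, not cppjieba's actual implementation:

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct Word { std::string word; std::vector<std::size_t> offsets; double weight; };

// Stand-in for the vector<size_t> printer the header relies on;
// it renders each offset as a quoted number inside brackets.
std::ostream& operator<<(std::ostream& os, const std::vector<std::size_t>& v) {
  os << '[';
  for (std::size_t i = 0; i < v.size(); i++) {
    if (i > 0) os << ", ";
    os << '"' << v[i] << '"';
  }
  return os << ']';
}

std::ostream& operator<<(std::ostream& os, const Word& w) {
  return os << w.word << '|' << w.offsets << '|' << w.weight;
}

int main() {
  Word w = {"CEO", {94}, 0.95375};
  std::cout << w << std::endl;  // prints: CEO|["94"]|0.95375
  return 0;
}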
CMakeLists.txt (unit test target)
@@ -13,6 +13,7 @@ ADD_EXECUTABLE(test.run
   pos_tagger_test.cpp
   jieba_test.cpp
   pre_filter_test.cpp
+  textrank_test.cpp
 )
 TARGET_LINK_LIBRARIES(test.run gtest pthread)
textrank_test.cpp
@@ -24,14 +24,16 @@ TEST(TextRankExtractorTest, Test1) {
     vector<pair<string, double> > words;
     Extractor.Extract(s, words, topN);
     res << words;
-    ASSERT_EQ(res, "[\"世界:1\", \"你好:0.514286\"]");
+    //ASSERT_EQ(res, "[\"世界:1\", \"你好:0.514286\"]");
+    ASSERT_EQ(res, "[\"\xE4\xB8\x96\xE7\x95\x8C:1\", \"\xE4\xBD\xA0\xE5\xA5\xBD:0.519787\"]");
   }

   {
     vector<TextRankExtractor::Word> words;
     Extractor.Extract(s, words, topN);
     res << words;
-    ASSERT_EQ(res, "[\"世界|[\"6\", \"12\"]|1\", \"你好|[\"0\"]|0.514286\"]");
+    //ASSERT_EQ(res, "[\"世界|[\"6\", \"12\"]|1\", \"你好|[\"0\"]|0.514286\"]");
+    ASSERT_EQ(res, "[\"\xE4\xB8\x96\xE7\x95\x8C|[\"6\", \"12\"]|1\", \"\xE4\xBD\xA0\xE5\xA5\xBD|[\"0\"]|0.519787\"]");
   }
 }
@@ -42,7 +44,8 @@ TEST(TextRankExtractorTest, Test1) {
     size_t topN = 5;
     Extractor.Extract(s, wordweights, topN);
     res << wordweights;
-    ASSERT_EQ(res, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.953149\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.794203\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.78716\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.767636\"]");
+    ASSERT_EQ(res, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.95375\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.801701\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.798968\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.775505\"]");
+    // ASSERT_EQ(res, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.953149\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.794203\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.78716\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.767636\"]");
   }

   {
@@ -52,7 +55,8 @@ TEST(TextRankExtractorTest, Test1) {
     size_t topN = 5;
     Extractor.Extract(s, wordweights, topN);
     res << wordweights;
-    ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]");
+    ASSERT_EQ(res, "[\"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|1\", \"iPhone6|[\"6\"]|0.996126\"]");
+    //ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]");
   }
 }
@@ -70,7 +74,8 @@ TEST(TextRankExtractorTest, Test2) {
     size_t topN = 5;
     Extractor.Extract(s, wordweights, topN);
     res << wordweights;
-    ASSERT_EQ(res, "[\"\xE4\xBC\x98\xE7\xA7\x80|[\"6\"]|1\", \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F|[\"12\"]|0.996685\", \"\xE8\x93\x9D\xE7\xBF\x94|[\"0\"]|0.992994\"]");
+    ASSERT_EQ(res, "[\"蓝翔|[\"0\"]|1\", \"毕业生|[\"12\"]|0.996685\", \"优秀|[\"6\"]|0.992994\"]");
+    //ASSERT_EQ(res, "[\"\xE4\xBC\x98\xE7\xA7\x80|[\"6\"]|1\", \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F|[\"12\"]|0.996685\", \"\xE8\x93\x9D\xE7\xBF\x94|[\"0\"]|0.992994\"]");
   }

   {
@@ -80,6 +85,7 @@ TEST(TextRankExtractorTest, Test2) {
     size_t topN = 5;
     Extractor.Extract(s, wordweights, topN);
     res << wordweights;
-    ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]");
+    //ASSERT_EQ(res, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]");
+    ASSERT_EQ(res, "[\"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|1\", \"iPhone6|[\"6\"]|0.996126\"]");
   }
 }
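For reference, a minimal program exercising the public API covered by these tests; the include path and dictionary paths are placeholders that must point at the headers and data files in your cppjieba checkout:

#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include "cppjieba/TextRankExtractor.hpp"  // include path assumed; adjust to your layout

int main() {
  // Dictionary paths are placeholders for the data files bundled with cppjieba.
  cppjieba::TextRankExtractor extractor("dict/jieba.dict.utf8",
                                        "dict/hmm_model.utf8",
                                        "dict/stop_words.utf8");
  std::vector<std::pair<std::string, double> > keywords;
  // Extract the top 5 keywords with their TextRank weights.
  extractor.Extract("我是拖拉机学院手扶拖拉机专业的。不用多久，我就会升职加薪，当上CEO，走上人生巅峰。",
                    keywords, 5);
  for (size_t i = 0; i < keywords.size(); i++) {
    std::cout << keywords[i].first << ": " << keywords[i].second << std::endl;
  }
  return 0;
}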