mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
1.增加单元测试
2.增加了构造函数的重载,增加了提取函数的重载
This commit is contained in:
parent
f2de41c15e
commit
0f66a923b3
@ -82,12 +82,41 @@ namespace cppjieba {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
TextRankExtractor(const string& dictPath,
|
||||||
|
const string& hmmFilePath,
|
||||||
|
const string& stopWordPath,
|
||||||
|
const string& userDict = "")
|
||||||
|
: segment_(dictPath, hmmFilePath, userDict) {
|
||||||
|
LoadStopWordDict(stopWordPath);
|
||||||
|
}
|
||||||
|
TextRankExtractor(const DictTrie* dictTrie,
|
||||||
|
const HMMModel* model,
|
||||||
|
const string& stopWordPath)
|
||||||
|
: segment_(dictTrie, model) {
|
||||||
|
LoadStopWordDict(stopWordPath);
|
||||||
|
}
|
||||||
TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
|
TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
|
||||||
LoadStopWordDict(stopWordPath);
|
LoadStopWordDict(stopWordPath);
|
||||||
}
|
}
|
||||||
// Destructor: performs no explicit work of its own.
~TextRankExtractor() {}
|
||||||
|
|
||||||
|
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
||||||
|
vector<Word> topWords;
|
||||||
|
Extract(sentence, topWords, topN);
|
||||||
|
for (size_t i = 0; i < topWords.size(); i++) {
|
||||||
|
keywords.push_back(topWords[i].word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
||||||
|
vector<Word> topWords;
|
||||||
|
Extract(sentence, topWords, topN);
|
||||||
|
for (size_t i = 0; i < topWords.size(); i++) {
|
||||||
|
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const {
|
void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const {
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
@ -161,6 +190,10 @@ namespace cppjieba {
|
|||||||
MixSegment segment_;
|
MixSegment segment_;
|
||||||
unordered_set<string> stopWords_;
|
unordered_set<string> stopWords_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Streams a TextRankExtractor::Word as "<word>|<offsets>|<weight>".
// The offsets field is written with whatever operator<< is in scope for its
// type; the stream is returned to allow chaining.
inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
  os << word.word;
  os << '|';
  os << word.offsets;
  os << '|';
  os << word.weight;
  return os;
}
|
||||||
} // namespace cppjieba
|
} // namespace cppjieba
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
85
test/unittest/textrank_test.cpp
Normal file
85
test/unittest/textrank_test.cpp
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
#include "cppjieba/TextRankExtractor.hpp"
|
||||||
|
#include "gtest/gtest.h"
|
||||||
|
|
||||||
|
using namespace cppjieba;
|
||||||
|
|
||||||
|
// Exercises TextRankExtractor built from dictionary files only (no user
// dictionary): all three Extract overloads on a short sentence, then the
// vector<Word> overload on a longer sentence and on mixed CJK/ASCII input.
TEST(TextRankExtractorTest, Test1) {
  TextRankExtractor extractor(
    "../test/testdata/extra_dict/jieba.dict.small.utf8",
    "../dict/hmm_model.utf8",
    "../dict/stop_words.utf8");
  const size_t kTopN = 5;

  // Same sentence through each overload; only the output shape differs.
  {
    const string sentence("你好世界世界而且而且");

    // Keyword text only.
    {
      vector<string> keywords;
      string rendered;
      extractor.Extract(sentence, keywords, kTopN);
      rendered << keywords;
      ASSERT_EQ(rendered, "[\"世界\", \"你好\"]");
    }

    // Keyword text paired with its weight.
    {
      vector<pair<string, double> > keywords;
      string rendered;
      extractor.Extract(sentence, keywords, kTopN);
      rendered << keywords;
      ASSERT_EQ(rendered, "[\"世界:1\", \"你好:0.514286\"]");
    }

    // Full Word records: text, offsets and weight.
    {
      vector<TextRankExtractor::Word> keywords;
      string rendered;
      extractor.Extract(sentence, keywords, kTopN);
      rendered << keywords;
      ASSERT_EQ(rendered, "[\"世界|[\"6\", \"12\"]|1\", \"你好|[\"0\"]|0.514286\"]");
    }
  }

  // Longer sentence given as escaped UTF-8 bytes.
  {
    const string sentence("\xe6\x88\x91\xe6\x98\xaf\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe5\xad\xa6\xe9\x99\xa2\xe6\x89\x8b\xe6\x89\xb6\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe4\xb8\x93\xe4\xb8\x9a\xe7\x9a\x84\xe3\x80\x82\xe4\xb8\x8d\xe7\x94\xa8\xe5\xa4\x9a\xe4\xb9\x85\xef\xbc\x8c\xe6\x88\x91\xe5\xb0\xb1\xe4\xbc\x9a\xe5\x8d\x87\xe8\x81\x8c\xe5\x8a\xa0\xe8\x96\xaa\xef\xbc\x8c\xe5\xbd\x93\xe4\xb8\x8a CEO\xef\xbc\x8c\xe8\xb5\xb0\xe4\xb8\x8a\xe4\xba\xba\xe7\x94\x9f\xe5\xb7\x85\xe5\xb3\xb0");
    vector<TextRankExtractor::Word> ranked;
    string rendered;
    extractor.Extract(sentence, ranked, kTopN);
    rendered << ranked;
    ASSERT_EQ(rendered, "[\"\xE4\xB8\x93\xE4\xB8\x9A|[\"36\"]|1\", \"CEO|[\"94\"]|0.953149\", \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA|[\"21\"]|0.794203\", \"\xE5\xBD\x93\xE4\xB8\x8A|[\"87\"]|0.78716\", \"\xE8\xB5\xB0\xE4\xB8\x8A|[\"100\"]|0.767636\"]");
  }

  // Mixed CJK and ASCII/digit token.
  {
    const string sentence("一部iPhone6");
    vector<TextRankExtractor::Word> ranked;
    string rendered;
    extractor.Extract(sentence, ranked, kTopN);
    rendered << ranked;
    ASSERT_EQ(rendered, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]");
  }
}
|
||||||
|
|
||||||
|
// Exercises TextRankExtractor built with the optional fourth constructor
// argument (a user dictionary path), using the vector<Word> overload.
TEST(TextRankExtractorTest, Test2) {
  TextRankExtractor extractor(
    "../test/testdata/extra_dict/jieba.dict.small.utf8",
    "../dict/hmm_model.utf8",
    "../dict/stop_words.utf8",
    "../test/testdata/userdict.utf8");
  const size_t kTopN = 5;

  // Sentence given as escaped UTF-8 bytes.
  {
    const string sentence("\xe8\x93\x9d\xe7\xbf\x94\xe4\xbc\x98\xe7\xa7\x80\xe6\xaf\x95\xe4\xb8\x9a\xe7\x94\x9f");
    vector<TextRankExtractor::Word> ranked;
    string rendered;
    extractor.Extract(sentence, ranked, kTopN);
    rendered << ranked;
    ASSERT_EQ(rendered, "[\"\xE4\xBC\x98\xE7\xA7\x80|[\"6\"]|1\", \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F|[\"12\"]|0.996685\", \"\xE8\x93\x9D\xE7\xBF\x94|[\"0\"]|0.992994\"]");
  }

  // Mixed CJK and ASCII/digit token; same expectation as in Test1.
  {
    const string sentence("一部iPhone6");
    vector<TextRankExtractor::Word> ranked;
    string rendered;
    extractor.Extract(sentence, ranked, kTopN);
    rendered << ranked;
    ASSERT_EQ(rendered, "[\"iPhone6|[\"6\"]|1\", \"\xE4\xB8\x80\xE9\x83\xA8|[\"0\"]|0.996126\"]");
  }
}
|
Loading…
x
Reference in New Issue
Block a user