mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Add a second extract() overload in keywordextractor.hpp that returns (word, score) pairs; unit tests pass
This commit is contained in:
parent
62b83a36a0
commit
229fcd715f
@ -8,16 +8,16 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
using namespace Limonp;
|
using namespace Limonp;
|
||||||
|
|
||||||
struct KeyWordInfo
|
//struct KeyWordInfo
|
||||||
{
|
//{
|
||||||
string word;
|
// string word;
|
||||||
double idf;
|
// double tfidf;
|
||||||
};
|
//};
|
||||||
|
|
||||||
inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
|
//inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
|
||||||
{
|
//{
|
||||||
return os << keyword.word << "," << keyword.idf;
|
// return os << keyword.word << "," << keyword.idf;
|
||||||
}
|
//}
|
||||||
|
|
||||||
class KeywordExtractor
|
class KeywordExtractor
|
||||||
{
|
{
|
||||||
@ -64,10 +64,24 @@ namespace CppJieba
|
|||||||
return _setInitFlag(_segment.init(dictPath));
|
return _setInitFlag(_segment.init(dictPath));
|
||||||
};
|
};
|
||||||
public:
|
public:
|
||||||
|
|
||||||
bool extract(const string& str, vector<string>& keywords, uint topN) const
|
bool extract(const string& str, vector<string>& keywords, uint topN) const
|
||||||
{
|
{
|
||||||
assert(_getInitFlag());
|
assert(_getInitFlag());
|
||||||
|
vector<pair<string, double> > topWords;
|
||||||
|
if(!extract(str, topWords, topN))
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
for(uint i = 0; i < topWords.size(); i++)
|
||||||
|
{
|
||||||
|
keywords.push_back(topWords[i].first);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool extract(const string& str, vector<pair<string, double> >& keywords, uint topN) const
|
||||||
|
{
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
if(!_segment.cut(str, words))
|
if(!_segment.cut(str, words))
|
||||||
{
|
{
|
||||||
@ -95,16 +109,11 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<pair<string, double> > topWords(min(topN, wordmap.size()));
|
keywords.resize(min(topN, wordmap.size()));
|
||||||
partial_sort_copy(wordmap.begin(), wordmap.end(), topWords.begin(), topWords.end(), _cmp);
|
partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp);
|
||||||
|
|
||||||
keywords.clear();
|
|
||||||
for(uint i = 0; i < topWords.size(); i++)
|
|
||||||
{
|
|
||||||
keywords.push_back(topWords[i].first);
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
|
static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
|
||||||
{
|
{
|
||||||
|
@ -32,8 +32,25 @@ TEST(KeywordExtractorTest, Test3)
|
|||||||
ASSERT_TRUE(ifs);
|
ASSERT_TRUE(ifs);
|
||||||
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
|
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
|
||||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
|
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
|
||||||
vector<string> keywords;
|
|
||||||
const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
|
const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
|
||||||
|
vector<string> keywords;
|
||||||
extractor.extract(str, keywords, 5);
|
extractor.extract(str, keywords, 5);
|
||||||
ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(KeywordExtractorTest, Test4)
|
||||||
|
{
|
||||||
|
ifstream ifs("../test/testdata/weicheng.utf8");
|
||||||
|
ASSERT_TRUE(ifs);
|
||||||
|
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
|
||||||
|
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
|
||||||
|
//const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
|
||||||
|
vector<pair<string,double> > keywords;
|
||||||
|
extractor.extract(str, keywords, 5);
|
||||||
|
//print(keywords);
|
||||||
|
string res;
|
||||||
|
res << keywords;
|
||||||
|
ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user