mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add another extract function in keywordextractor.hpp and ut ok
This commit is contained in:
parent
62b83a36a0
commit
229fcd715f
@ -8,16 +8,16 @@ namespace CppJieba
|
||||
{
|
||||
using namespace Limonp;
|
||||
|
||||
// A keyword paired with a numeric weight; plain aggregate with no invariants.
struct KeyWordInfo
|
||||
{
|
||||
string word; // the keyword text
|
||||
double idf; // weight stored for `word`; NOTE(review): named idf, how it is computed is not visible here -- confirm
|
||||
};
|
||||
//struct KeyWordInfo
|
||||
//{
|
||||
// string word;
|
||||
// double tfidf;
|
||||
//};
|
||||
|
||||
// Streams a KeyWordInfo as "<word>,<idf>" and hands the stream back so that
// insertions can be chained by the caller.
inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
{
    os << keyword.word;
    os << ",";
    os << keyword.idf;
    return os;
}
|
||||
//inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
|
||||
//{
|
||||
// return os << keyword.word << "," << keyword.idf;
|
||||
//}
|
||||
|
||||
class KeywordExtractor
|
||||
{
|
||||
@ -64,10 +64,24 @@ namespace CppJieba
|
||||
return _setInitFlag(_segment.init(dictPath));
|
||||
};
|
||||
public:
|
||||
|
||||
bool extract(const string& str, vector<string>& keywords, uint topN) const
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
vector<pair<string, double> > topWords;
|
||||
if(!extract(str, topWords, topN))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
for(uint i = 0; i < topWords.size(); i++)
|
||||
{
|
||||
keywords.push_back(topWords[i].first);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool extract(const string& str, vector<pair<string, double> >& keywords, uint topN) const
|
||||
{
|
||||
vector<string> words;
|
||||
if(!_segment.cut(str, words))
|
||||
{
|
||||
@ -95,16 +109,11 @@ namespace CppJieba
|
||||
}
|
||||
}
|
||||
|
||||
vector<pair<string, double> > topWords(min(topN, wordmap.size()));
|
||||
partial_sort_copy(wordmap.begin(), wordmap.end(), topWords.begin(), topWords.end(), _cmp);
|
||||
|
||||
keywords.clear();
|
||||
for(uint i = 0; i < topWords.size(); i++)
|
||||
{
|
||||
keywords.push_back(topWords[i].first);
|
||||
}
|
||||
keywords.resize(min(topN, wordmap.size()));
|
||||
partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
|
||||
{
|
||||
|
@ -32,8 +32,25 @@ TEST(KeywordExtractorTest, Test3)
|
||||
ASSERT_TRUE(ifs);
|
||||
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
|
||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
|
||||
vector<string> keywords;
|
||||
const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
|
||||
vector<string> keywords;
|
||||
extractor.extract(str, keywords, 5);
|
||||
ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||
|
||||
}
|
||||
|
||||
// Runs the weighted extract() overload on the weicheng sample text, asking for
// five keywords, and compares the rendered (word, weight) pairs with a stored
// reference string.
TEST(KeywordExtractorTest, Test4)
{
    ifstream ifs("../test/testdata/weicheng.utf8");
    ASSERT_TRUE(ifs);
    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
    vector<pair<string,double> > keywords;
    // extract() signals failure through its return value; check it so a failed
    // extraction is reported as such rather than as a string mismatch below.
    ASSERT_TRUE(extractor.extract(str, keywords, 5));
    // Render the pairs into a string with the project's `string << container`
    // helper (not defined in this file).
    string res;
    res << keywords;
    ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user