Add a second extract() overload to keywordextractor.hpp; unit tests pass

This commit is contained in:
wyy 2013-12-24 19:03:52 -08:00
parent 62b83a36a0
commit 229fcd715f
2 changed files with 44 additions and 18 deletions

View File

@ -8,16 +8,16 @@ namespace CppJieba
{
using namespace Limonp;
// A keyword paired with a numeric score.
struct KeyWordInfo
{
// The keyword text.
string word;
// Score attached to the word; the name suggests an inverse-document-frequency
// value -- NOTE(review): confirm against the code that fills it in.
double idf;
};
//struct KeyWordInfo
//{
// string word;
// double tfidf;
//};
// Streams a KeyWordInfo as "<word>,<idf>" and returns the stream so that
// insertions can be chained.
inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
{
    os << keyword.word;
    os << ",";
    os << keyword.idf;
    return os;
}
//inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
//{
// return os << keyword.word << "," << keyword.idf;
//}
class KeywordExtractor
{
@ -64,10 +64,24 @@ namespace CppJieba
return _setInitFlag(_segment.init(dictPath));
};
public:
// Extracts up to topN keywords from str and returns only the words.
//
// Delegates to the (word, weight) overload and drops the weights, keeping the
// order that overload produced.
//
// str      : text to extract keywords from.
// keywords : output; on success its previous contents are replaced by the
//            extracted words. Left untouched when extraction fails.
// topN     : maximum number of keywords to return.
// returns  : false if the weighted extraction fails, true otherwise.
bool extract(const string& str, vector<string>& keywords, uint topN) const
{
    assert(_getInitFlag());
    vector<pair<string, double> > topWords;
    if(!extract(str, topWords, topN))
    {
        return false;
    }
    // Replace instead of append: the weighted overload overwrites its output
    // (resize + partial_sort_copy), so both overloads should treat the
    // out-parameter the same way when a caller reuses a vector.
    keywords.clear();
    keywords.reserve(topWords.size());
    for(uint i = 0; i < topWords.size(); i++)
    {
        keywords.push_back(topWords[i].first);
    }
    return true;
}
bool extract(const string& str, vector<pair<string, double> >& keywords, uint topN) const
{
vector<string> words;
if(!_segment.cut(str, words))
{
@ -95,16 +109,11 @@ namespace CppJieba
}
}
vector<pair<string, double> > topWords(min(topN, wordmap.size()));
partial_sort_copy(wordmap.begin(), wordmap.end(), topWords.begin(), topWords.end(), _cmp);
keywords.clear();
for(uint i = 0; i < topWords.size(); i++)
{
keywords.push_back(topWords[i].first);
}
keywords.resize(min(topN, wordmap.size()));
partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp);
return true;
}
private:
static bool _cmp(const pair<string, uint>& lhs, const pair<string, uint>& rhs)
{

View File

@ -32,8 +32,25 @@ TEST(KeywordExtractorTest, Test3)
ASSERT_TRUE(ifs);
string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
vector<string> keywords;
const char* res[] = {"小姐", "孙小姐", "方鸿渐", "自己", "没有"};
vector<string> keywords;
extractor.extract(str, keywords, 5);
ASSERT_EQ(keywords, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
// Exercises the (word, weight) overload of KeywordExtractor::extract on the
// weicheng.utf8 sample and checks the five returned keywords with their weights.
TEST(KeywordExtractorTest, Test4)
{
    ifstream ifs("../test/testdata/weicheng.utf8");
    ASSERT_TRUE(ifs);
    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
    KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/idf.utf8");
    vector<pair<string,double> > keywords;
    // Stop here if extraction itself fails, instead of reporting a confusing
    // string mismatch further down.
    ASSERT_TRUE(extractor.extract(str, keywords, 5));
    // Render the pairs into one string (string << container is provided
    // elsewhere in the project) so words and weights are compared together.
    string res;
    res << keywords;
    ASSERT_EQ(res, "[\"小姐:4268.75\", \"孙小姐:3789.41\", \"方鸿渐:3030.35\", \"自己:2300.54\", \"没有:2104.27\"]");
}