finished KeywordExtractor and its ut

commit 0db2dfa6b8
parent 0f7947d1e3
KeywordExtractor.hpp:

@@ -2,6 +2,7 @@
 #define CPPJIEBA_KEYWORD_EXTRACTOR_H
 
 #include "MPSegment.hpp"
+#include <cmath>
 
 namespace CppJieba
 {
@@ -9,18 +10,24 @@ namespace CppJieba
 
     struct KeyWordInfo
     {
+        string word;
         uint freq;
-        double weight;
+        double idf;
     };
 
-    class KeywordExtractor//: public MPSegment
+    inline ostream& operator << (ostream& os, const KeyWordInfo & keyword)
+    {
+        return os << keyword.word << "," << keyword.freq << "," << keyword.idf;
+    }
+
+    class KeywordExtractor
     {
         private:
             MPSegment _segment;
         private:
-            unordered_map<string, uint> _wordIndex;
-            vector<KeyWordInfo> _words;
+            unordered_map<string, const KeyWordInfo* > _wordIndex;
+            vector<KeyWordInfo> _wordinfos;
+            size_t _totalFreq;
         protected:
             bool _isInited;
             bool _getInitFlag()const{return _isInited;};
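The members above replace the old word-to-count index with `_wordIndex`, which maps each word to a pointer at its `KeyWordInfo` record inside `_wordinfos`, so `extract()` can look up a word's idf in constant time. Below is a minimal, self-contained sketch of that layout; the names are illustrative, not cppjieba's API. The key point, which the commit respects by building the index in a second pass, is that pointers into a vector are only stable once the vector has stopped growing:

#include <cmath>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;

// Illustrative stand-in for KeyWordInfo.
struct Info
{
    string word;
    unsigned int freq;
    double idf;
};

int main()
{
    vector<Info> infos;                       // owns the records, like _wordinfos
    unordered_map<string, const Info*> index; // word -> record,   like _wordIndex

    // First pass: fill the vector completely.
    Info a = {"来自", 1000, 0.0};
    Info b = {"北京邮电大学", 10, 0.0};
    infos.push_back(a);
    infos.push_back(b);

    // Second pass: compute idf and build the index.
    // Indexing only after all push_back calls keeps the pointers valid,
    // because a growing vector may reallocate and move its elements.
    for (size_t i = 0; i < infos.size(); i++)
    {
        infos[i].idf = -log((double)infos[i].freq);
        index[infos[i].word] = &infos[i];
    }

    unordered_map<string, const Info*>::const_iterator it = index.find("来自");
    if (it != index.end())
    {
        cout << it->second->word << " idf=" << it->second->idf << endl;
    }
    return 0;
}

Storing vector indices instead of raw pointers would sidestep the reallocation concern entirely; the pointer form sketched here is what the commit itself uses.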
@@ -40,16 +47,51 @@ namespace CppJieba
                     LogError("open %s failed.", dictPath.c_str());
                     return false;
                 }
+                _totalFreq = 0;
+                int tfreq;
                 string line ;
                 vector<string> buf;
+                KeyWordInfo keywordInfo;
                 for(uint lineno = 0; getline(ifs, line); lineno++)
                 {
                     buf.clear();
+                    if(line.empty())
+                    {
+                        LogError("line[%d] empty. skipped.", lineno);
+                        continue;
+                    }
+                    if(!split(line, buf, " ") || buf.size() != 3)
+                    {
+                        LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
+                        continue;
+                    }
+                    keywordInfo.word = buf[0];
+                    tfreq = atoi(buf[1].c_str());
+                    if(tfreq <= 0)
+                    {
+                        LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
+                        continue;
+                    }
+                    keywordInfo.freq = tfreq;
+                    _totalFreq += tfreq;
+                    _wordinfos.push_back(keywordInfo);
+                }
+
+                // calculate idf & make index.
+                for(uint i = 0; i < _wordinfos.size(); i++)
+                {
+                    if(_wordinfos[i].freq <= 0)
+                    {
+                        LogFatal("freq value is not positive.");
+                        return false;
+                    }
+                    _wordinfos[i].idf = -log(_wordinfos[i].freq);
+                    _wordIndex[_wordinfos[i].word] = &(_wordinfos[i]);
                 }
                 return _setInitFlag(_segment.init(dictPath));
             };
         public:
-            bool extract(const string& str, vector<string>& keywords, uint topN)
+            bool extract(const string& str, vector<string>& keywords, uint topN) const
            {
                assert(_getInitFlag());
 
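The new init body reads the dictionary one line at a time, expects three space-separated fields per line (that is what the `split(line, buf, " ") || buf.size() != 3` check enforces), accumulates `_totalFreq`, and assigns each entry `idf = -log(freq)`, i.e. `log(1/freq)`, so rarer words get larger (less negative) values. A standalone sketch of that parsing-and-weighting step, using made-up sample lines rather than the real dictionary:

#include <cmath>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
using namespace std;

int main()
{
    // Made-up sample in the expected three-column "word freq tag" layout.
    istringstream dict("来自 1000 v\n北京邮电大学 10 nt\nbadline\n");

    size_t totalFreq = 0;
    string line;
    for (unsigned int lineno = 0; getline(dict, line); lineno++)
    {
        // Split on single spaces, mirroring split(line, buf, " ").
        vector<string> buf;
        string field;
        istringstream iss(line);
        while (getline(iss, field, ' '))
        {
            buf.push_back(field);
        }
        if (buf.size() != 3)
        {
            cerr << "line " << lineno << " [" << line << "] illegal. skipped." << endl;
            continue;
        }
        int freq = atoi(buf[1].c_str());
        if (freq <= 0)
        {
            cerr << "line " << lineno << " [" << line << "] illegal. skipped." << endl;
            continue;
        }
        totalFreq += freq;
        double idf = -log((double)freq); // rarer word -> larger (less negative) idf
        cout << buf[0] << " freq=" << freq << " idf=" << idf << endl;
    }
    cout << "totalFreq=" << totalFreq << endl;
    return 0;
}

Note that in the hunks shown here `_totalFreq` is recorded but not yet folded into the formula; a conventional idf of `log(totalFreq / freq)` would differ from `-log(freq)` only by the additive constant `log(totalFreq)`.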
@@ -60,14 +102,28 @@ namespace CppJieba
                    return false;
                }
 
-               unordered_map<string, uint> wordcnt;
+               unordered_map<string, double> wordmap;
                for(uint i = 0; i < words.size(); i ++)
                {
-                   wordcnt[ words[i] ] ++;
+                   wordmap[ words[i] ] += 1.0;
                }
 
-               vector<pair<string, uint> > topWords(topN);
-               partial_sort_copy(wordcnt.begin(), wordcnt.end(), topWords.begin(), topWords.end(), _cmp);
+               for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end();)
+               {
+                   unordered_map<string, const KeyWordInfo*>::const_iterator cit = _wordIndex.find(itr->first);
+                   if(cit != _wordIndex.end())
+                   {
+                       itr->second *= cit->second->idf;
+                       itr ++;
+                   }
+                   else
+                   {
+                       itr = wordmap.erase(itr);
+                   }
+               }
+
+               vector<pair<string, double> > topWords(min(topN, wordmap.size()));
+               partial_sort_copy(wordmap.begin(), wordmap.end(), topWords.begin(), topWords.end(), _cmp);
+
                keywords.clear();
                for(uint i = 0; i < topWords.size(); i++)
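With those changes, extract() counts the term frequency of every segmented word, multiplies each count by the word's idf while discarding words that are not in `_wordIndex`, and then keeps the `topN` heaviest pairs with `partial_sort_copy`. The sketch below walks through the same weighting-and-ranking step with a toy idf table and a descending-by-weight comparator standing in for `_cmp`; the numbers are invented, but chosen so the result matches Test1's expected {"北京邮电大学", "来自"}:

#include <algorithm>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
using namespace std;

// Stand-in for _cmp: order pairs by descending weight.
static bool weightGreater(const pair<string, double>& a, const pair<string, double>& b)
{
    return a.second > b.second;
}

int main()
{
    // Toy idf table (stand-in for _wordIndex lookups); values are invented.
    unordered_map<string, double> idf;
    idf["北京邮电大学"] = -2.3;
    idf["来自"] = -4.6;
    idf["我"] = -9.2;

    // Pretend these came out of the segmenter.
    const char* segmented[] = {"我", "来自", "北京邮电大学", "123456"};
    vector<string> words(segmented, segmented + 4);

    // 1. term frequency
    unordered_map<string, double> wordmap;
    for (size_t i = 0; i < words.size(); i++)
    {
        wordmap[words[i]] += 1.0;
    }

    // 2. tf * idf; erase words the dictionary does not know
    for (unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end();)
    {
        unordered_map<string, double>::const_iterator cit = idf.find(itr->first);
        if (cit != idf.end())
        {
            itr->second *= cit->second;
            ++itr;
        }
        else
        {
            itr = wordmap.erase(itr); // C++11: erase returns the next iterator
        }
    }

    // 3. keep the topN best-weighted words
    size_t topN = 2;
    vector<pair<string, double> > topWords(min(topN, wordmap.size()));
    partial_sort_copy(wordmap.begin(), wordmap.end(), topWords.begin(), topWords.end(), weightGreater);

    for (size_t i = 0; i < topWords.size(); i++)
    {
        cout << topWords[i].first << " " << topWords[i].second << endl;
    }
    return 0;
}

The `itr = wordmap.erase(itr)` pattern relies on the C++11 `unordered_map::erase` overload that returns the iterator following the erased element, which is what makes erasing while iterating safe here.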
test CMakeLists.txt:

@@ -3,6 +3,7 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test/lib)
 
 SET(GTEST_ROOT_DIR gtest-1.6.0)
 
+ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN)
 INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
 ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
 FILE(GLOB SRCFILES *.cpp)
KeywordExtractor unit test:

@@ -7,13 +7,34 @@ TEST(KeywordExtractorTest, Test1)
 {
     KeywordExtractor extractor("../dicts/jieba.dict.utf8");
     const char* str = "我来自北京邮电大学。。。 学号 123456";
-    const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。"," ","学","号", " 123456"};
+    const char* res[] = {"北京邮电大学", "来自"};
     vector<string> words;
     ASSERT_TRUE(extractor);
     ASSERT_TRUE(extractor.extract(str, words, 2));
-    //print(words);
-    //exit(0);
-    //print(words);
     ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
 }
 
+TEST(KeywordExtractorTest, Test2)
+{
+    KeywordExtractor extractor("../dicts/jieba.dict.utf8");
+    const char* str = "我来自北京邮电大学。。。 学号 123456";
+    const char* res[] = {"北京邮电大学", "来自", "学", "号", "我"};
+    vector<string> words;
+    ASSERT_TRUE(extractor);
+    ASSERT_TRUE(extractor.extract(str, words, 9));
+    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+}
+
+
+TEST(KeywordExtractorTest, Test3)
+{
+    ifstream ifs("../test/testdata/weicheng.utf8");
+    ASSERT_TRUE(ifs);
+    string str((istreambuf_iterator<char>(ifs)), (istreambuf_iterator<char>()));
+    KeywordExtractor extractor("../dicts/jieba.dict.utf8");
+    vector<string> keywords;
+    string res;
+    extractor.extract(str, keywords, 5);
+    res << keywords;
+    ASSERT_EQ("[\"第三性\", \"多愁多病\", \"记挂着\", \"揭去\", \"贫血症\"]", res);
+}
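Test3 turns the keyword vector into a string with `res << keywords` and compares it against a ["…", "…"]-style rendering; that stream-into-string operator comes from the project's utility headers and is not part of this diff. The helper below is only a hypothetical illustration of the expected output format, not the library's implementation:

#include <iostream>
#include <string>
#include <vector>
using namespace std;

// Hypothetical helper that renders a vector<string> the way the
// ASSERT_EQ string in Test3 is formatted: ["w1", "w2", ...].
static string renderKeywords(const vector<string>& words)
{
    string out = "[";
    for (size_t i = 0; i < words.size(); i++)
    {
        if (i) out += ", ";
        out += "\"" + words[i] + "\"";
    }
    out += "]";
    return out;
}

int main()
{
    const char* kw[] = {"第三性", "多愁多病"};
    vector<string> keywords(kw, kw + 2);
    cout << renderKeywords(keywords) << endl; // ["第三性", "多愁多病"]
    return 0;
}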