Add stop-word support to KeywordExtractor

This commit is contained in:
wyy 2014-03-15 23:31:59 +08:00
parent 4a559e7858
commit 6de292a56d
3 changed files with 62 additions and 48 deletions

View File

@ -11,8 +11,6 @@ namespace CppJieba
using namespace Limonp; using namespace Limonp;
/*utf8*/ /*utf8*/
const char * const BLACK_LIST[] = {"我们", "他们"};
class KeywordExtractor: public InitOnOff class KeywordExtractor: public InitOnOff
{ {
private: private:
@ -21,56 +19,20 @@ namespace CppJieba
unordered_map<string, double> _idfMap; unordered_map<string, double> _idfMap;
double _idfAverage; double _idfAverage;
unordered_set<string> _blackSet; unordered_set<string> _stopWords;
public: public:
KeywordExtractor(){_setInitFlag(false);}; KeywordExtractor(){_setInitFlag(false);};
explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath) explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
{ {
_setInitFlag(init(dictPath, hmmFilePath, idfPath)); _setInitFlag(init(dictPath, hmmFilePath, idfPath, stopWordPath));
}; };
~KeywordExtractor(){}; ~KeywordExtractor(){};
public: public:
bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath) bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
{ {
ifstream ifs(idfPath.c_str()); _loadIdfDict(idfPath);
if(!ifs) _loadStopWordDict(stopWordPath);
{
LogError("open %s failed.", idfPath.c_str());
return false;
}
string line ;
vector<string> buf;
double idf = 0.0;
double idfSum = 0.0;
size_t lineno = 0;
for(;getline(ifs, line); lineno++)
{
buf.clear();
if(line.empty())
{
LogError("line[%d] empty. skipped.", lineno);
continue;
}
if(!split(line, buf, " ") || buf.size() != 2)
{
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
continue;
}
idf = atof(buf[1].c_str());
_idfMap[buf[0]] = idf;
idfSum += idf;
}
std::copy(
BLACK_LIST, BLACK_LIST + sizeof(BLACK_LIST)/sizeof(BLACK_LIST[0]),
std::inserter(_blackSet, _blackSet.begin()));
assert(lineno);
_idfAverage = idfSum / lineno;
assert(_idfAverage > 0.0);
return _setInitFlag(_segment.init(dictPath, hmmFilePath)); return _setInitFlag(_segment.init(dictPath, hmmFilePath));
}; };
public: public:
@ -120,7 +82,7 @@ namespace CppJieba
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
{ {
if(_blackSet.end() != _blackSet.find(itr->first)) if(_stopWords.end() != _stopWords.find(itr->first))
{ {
itr = wordmap.erase(itr); itr = wordmap.erase(itr);
continue; continue;
@ -145,6 +107,58 @@ namespace CppJieba
keywords.resize(topN); keywords.resize(topN);
return true; return true;
} }
private:
/**
 * Load the IDF dictionary at idfPath into _idfMap and compute _idfAverage.
 *
 * Every usable line consists of exactly two space-separated fields: the word
 * and its IDF value. Empty lines and lines with a different field count are
 * logged and skipped.
 *
 * _idfAverage is the mean IDF of the accepted entries only, so skipped lines
 * do not pull the average down.
 *
 * Failures (file cannot be opened, no usable entry, non-positive average)
 * are reported through assert, which is compiled out when NDEBUG is defined.
 */
void _loadIdfDict(const string& idfPath)
{
    ifstream ifs(idfPath.c_str());
    if(!ifs)
    {
        LogError("open %s failed.", idfPath.c_str());
        assert(false);
    }
    string line;
    vector<string> buf;
    double idf = 0.0;
    double idfSum = 0.0;
    size_t lineno = 0;   // zero-based index of the current line, diagnostics only
    size_t entries = 0;  // number of lines that contributed to idfSum
    for(; getline(ifs, line); lineno++)
    {
        buf.clear();
        if(line.empty())
        {
            // size_t is cast so the argument type matches the format specifier.
            LogError("line[%lu] empty. skipped.", static_cast<unsigned long>(lineno));
            continue;
        }
        if(!split(line, buf, " ") || buf.size() != 2)
        {
            LogError("line %lu [%s] illegal. skipped.", static_cast<unsigned long>(lineno), line.c_str());
            continue;
        }
        idf = atof(buf[1].c_str());
        _idfMap[buf[0]] = idf;
        idfSum += idf;
        entries++;
    }
    // Divide by the number of accepted entries, not by the raw line count:
    // lineno is also advanced for the lines skipped above.
    assert(entries);
    _idfAverage = idfSum / entries;
    assert(_idfAverage > 0.0);
}
/**
 * Load the stop-word list at filePath into _stopWords, one word per line.
 *
 * A trailing carriage return is removed from each line and blank lines are
 * ignored, so neither "" nor "word\r" ends up in the set.
 *
 * Failures (file cannot be opened, no stop word loaded) are reported through
 * assert, which is compiled out when NDEBUG is defined.
 */
void _loadStopWordDict(const string& filePath)
{
    ifstream ifs(filePath.c_str());
    if(!ifs)
    {
        LogError("open %s failed.", filePath.c_str());
        assert(false);
    }
    string line;
    while(getline(ifs, line))
    {
        // getline only strips '\n'; drop the '\r' left behind by CRLF files
        // so the stored word is the bare word.
        if(!line.empty() && line[line.size() - 1] == '\r')
        {
            line.erase(line.size() - 1);
        }
        // A blank line is not a word and must not satisfy the size check below.
        if(line.empty())
        {
            continue;
        }
        _stopWords.insert(line);
    }
    assert(_stopWords.size());
}
private: private:
bool _isSingleWord(const string& str) const bool _isSingleWord(const string& str) const
{ {

View File

@ -3,7 +3,7 @@ using namespace CppJieba;
int main(int argc, char ** argv) int main(int argc, char ** argv)
{ {
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上总经理出任CEO迎娶白富美走上人生巅峰。"); string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久我就会升职加薪当上总经理出任CEO迎娶白富美走上人生巅峰。");
vector<pair<string, double> > wordweights; vector<pair<string, double> > wordweights;
size_t topN = 5; size_t topN = 5;

View File

@ -7,7 +7,7 @@ using namespace CppJieba;
TEST(KeywordExtractorTest, Test1) TEST(KeywordExtractorTest, Test1)
{ {
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,迎娶白富美,走上人生巅峰。"); string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,迎娶白富美,走上人生巅峰。");
string res; string res;
vector<pair<string, double> > wordweights; vector<pair<string, double> > wordweights;