mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add stopword in KeywordExtractor
This commit is contained in:
parent
4a559e7858
commit
6de292a56d
@ -11,8 +11,6 @@ namespace CppJieba
|
|||||||
using namespace Limonp;
|
using namespace Limonp;
|
||||||
|
|
||||||
/*utf8*/
|
/*utf8*/
|
||||||
const char * const BLACK_LIST[] = {"我们", "他们"};
|
|
||||||
|
|
||||||
class KeywordExtractor: public InitOnOff
|
class KeywordExtractor: public InitOnOff
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
@ -21,56 +19,20 @@ namespace CppJieba
|
|||||||
unordered_map<string, double> _idfMap;
|
unordered_map<string, double> _idfMap;
|
||||||
double _idfAverage;
|
double _idfAverage;
|
||||||
|
|
||||||
unordered_set<string> _blackSet;
|
unordered_set<string> _stopWords;
|
||||||
public:
|
public:
|
||||||
KeywordExtractor(){_setInitFlag(false);};
|
KeywordExtractor(){_setInitFlag(false);};
|
||||||
explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath)
|
explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
|
||||||
{
|
{
|
||||||
_setInitFlag(init(dictPath, hmmFilePath, idfPath));
|
_setInitFlag(init(dictPath, hmmFilePath, idfPath, stopWordPath));
|
||||||
};
|
};
|
||||||
~KeywordExtractor(){};
|
~KeywordExtractor(){};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath)
|
bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
|
||||||
{
|
{
|
||||||
ifstream ifs(idfPath.c_str());
|
_loadIdfDict(idfPath);
|
||||||
if(!ifs)
|
_loadStopWordDict(stopWordPath);
|
||||||
{
|
|
||||||
LogError("open %s failed.", idfPath.c_str());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
string line ;
|
|
||||||
vector<string> buf;
|
|
||||||
double idf = 0.0;
|
|
||||||
double idfSum = 0.0;
|
|
||||||
size_t lineno = 0;
|
|
||||||
for(;getline(ifs, line); lineno++)
|
|
||||||
{
|
|
||||||
buf.clear();
|
|
||||||
if(line.empty())
|
|
||||||
{
|
|
||||||
LogError("line[%d] empty. skipped.", lineno);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if(!split(line, buf, " ") || buf.size() != 2)
|
|
||||||
{
|
|
||||||
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
idf = atof(buf[1].c_str());
|
|
||||||
_idfMap[buf[0]] = idf;
|
|
||||||
idfSum += idf;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
std::copy(
|
|
||||||
BLACK_LIST, BLACK_LIST + sizeof(BLACK_LIST)/sizeof(BLACK_LIST[0]),
|
|
||||||
std::inserter(_blackSet, _blackSet.begin()));
|
|
||||||
|
|
||||||
assert(lineno);
|
|
||||||
_idfAverage = idfSum / lineno;
|
|
||||||
|
|
||||||
assert(_idfAverage > 0.0);
|
|
||||||
|
|
||||||
return _setInitFlag(_segment.init(dictPath, hmmFilePath));
|
return _setInitFlag(_segment.init(dictPath, hmmFilePath));
|
||||||
};
|
};
|
||||||
public:
|
public:
|
||||||
@ -120,7 +82,7 @@ namespace CppJieba
|
|||||||
|
|
||||||
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
|
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
|
||||||
{
|
{
|
||||||
if(_blackSet.end() != _blackSet.find(itr->first))
|
if(_stopWords.end() != _stopWords.find(itr->first))
|
||||||
{
|
{
|
||||||
itr = wordmap.erase(itr);
|
itr = wordmap.erase(itr);
|
||||||
continue;
|
continue;
|
||||||
@ -145,6 +107,58 @@ namespace CppJieba
|
|||||||
keywords.resize(topN);
|
keywords.resize(topN);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
private:
|
||||||
|
void _loadIdfDict(const string& idfPath)
|
||||||
|
{
|
||||||
|
ifstream ifs(idfPath.c_str());
|
||||||
|
if(!ifs)
|
||||||
|
{
|
||||||
|
LogError("open %s failed.", idfPath.c_str());
|
||||||
|
assert(false);
|
||||||
|
}
|
||||||
|
string line ;
|
||||||
|
vector<string> buf;
|
||||||
|
double idf = 0.0;
|
||||||
|
double idfSum = 0.0;
|
||||||
|
size_t lineno = 0;
|
||||||
|
for(;getline(ifs, line); lineno++)
|
||||||
|
{
|
||||||
|
buf.clear();
|
||||||
|
if(line.empty())
|
||||||
|
{
|
||||||
|
LogError("line[%d] empty. skipped.", lineno);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if(!split(line, buf, " ") || buf.size() != 2)
|
||||||
|
{
|
||||||
|
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
idf = atof(buf[1].c_str());
|
||||||
|
_idfMap[buf[0]] = idf;
|
||||||
|
idfSum += idf;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(lineno);
|
||||||
|
_idfAverage = idfSum / lineno;
|
||||||
|
assert(_idfAverage > 0.0);
|
||||||
|
}
|
||||||
|
void _loadStopWordDict(const string& filePath)
|
||||||
|
{
|
||||||
|
ifstream ifs(filePath.c_str());
|
||||||
|
if(!ifs)
|
||||||
|
{
|
||||||
|
LogError("open %s failed.", filePath.c_str());
|
||||||
|
assert(false);
|
||||||
|
}
|
||||||
|
string line ;
|
||||||
|
while(getline(ifs, line))
|
||||||
|
{
|
||||||
|
_stopWords.insert(line);
|
||||||
|
}
|
||||||
|
assert(_stopWords.size());
|
||||||
|
}
|
||||||
private:
|
private:
|
||||||
bool _isSingleWord(const string& str) const
|
bool _isSingleWord(const string& str) const
|
||||||
{
|
{
|
||||||
|
@ -3,7 +3,7 @@ using namespace CppJieba;
|
|||||||
|
|
||||||
int main(int argc, char ** argv)
|
int main(int argc, char ** argv)
|
||||||
{
|
{
|
||||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
|
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
||||||
string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。");
|
string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。");
|
||||||
vector<pair<string, double> > wordweights;
|
vector<pair<string, double> > wordweights;
|
||||||
size_t topN = 5;
|
size_t topN = 5;
|
||||||
|
@ -7,7 +7,7 @@ using namespace CppJieba;
|
|||||||
|
|
||||||
TEST(KeywordExtractorTest, Test1)
|
TEST(KeywordExtractorTest, Test1)
|
||||||
{
|
{
|
||||||
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
|
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
||||||
string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,迎娶白富美,走上人生巅峰。");
|
string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,迎娶白富美,走上人生巅峰。");
|
||||||
string res;
|
string res;
|
||||||
vector<pair<string, double> > wordweights;
|
vector<pair<string, double> > wordweights;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user