mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add stopword in KeywordExtractor
This commit is contained in:
parent
4a559e7858
commit
6de292a56d
@ -11,8 +11,6 @@ namespace CppJieba
|
||||
using namespace Limonp;
|
||||
|
||||
/*utf8*/
|
||||
const char * const BLACK_LIST[] = {"我们", "他们"};
|
||||
|
||||
class KeywordExtractor: public InitOnOff
|
||||
{
|
||||
private:
|
||||
@ -21,56 +19,20 @@ namespace CppJieba
|
||||
unordered_map<string, double> _idfMap;
|
||||
double _idfAverage;
|
||||
|
||||
unordered_set<string> _blackSet;
|
||||
unordered_set<string> _stopWords;
|
||||
public:
|
||||
// Default constructor: builds an extractor with no dictionaries loaded and
// records the init flag as false; init() has to be called before use.
// NOTE(review): _setInitFlag presumably comes from the InitOnOff base - confirm.
KeywordExtractor()
{
    _setInitFlag(false);
}
|
||||
explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath)
|
||||
explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
|
||||
{
|
||||
_setInitFlag(init(dictPath, hmmFilePath, idfPath));
|
||||
_setInitFlag(init(dictPath, hmmFilePath, idfPath, stopWordPath));
|
||||
};
|
||||
// Destructor: nothing to release explicitly; members clean up after themselves.
~KeywordExtractor()
{
}
|
||||
|
||||
public:
|
||||
bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath)
|
||||
bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
|
||||
{
|
||||
ifstream ifs(idfPath.c_str());
|
||||
if(!ifs)
|
||||
{
|
||||
LogError("open %s failed.", idfPath.c_str());
|
||||
return false;
|
||||
}
|
||||
string line ;
|
||||
vector<string> buf;
|
||||
double idf = 0.0;
|
||||
double idfSum = 0.0;
|
||||
size_t lineno = 0;
|
||||
for(;getline(ifs, line); lineno++)
|
||||
{
|
||||
buf.clear();
|
||||
if(line.empty())
|
||||
{
|
||||
LogError("line[%d] empty. skipped.", lineno);
|
||||
continue;
|
||||
}
|
||||
if(!split(line, buf, " ") || buf.size() != 2)
|
||||
{
|
||||
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
|
||||
continue;
|
||||
}
|
||||
idf = atof(buf[1].c_str());
|
||||
_idfMap[buf[0]] = idf;
|
||||
idfSum += idf;
|
||||
|
||||
}
|
||||
|
||||
std::copy(
|
||||
BLACK_LIST, BLACK_LIST + sizeof(BLACK_LIST)/sizeof(BLACK_LIST[0]),
|
||||
std::inserter(_blackSet, _blackSet.begin()));
|
||||
|
||||
assert(lineno);
|
||||
_idfAverage = idfSum / lineno;
|
||||
|
||||
assert(_idfAverage > 0.0);
|
||||
|
||||
_loadIdfDict(idfPath);
|
||||
_loadStopWordDict(stopWordPath);
|
||||
return _setInitFlag(_segment.init(dictPath, hmmFilePath));
|
||||
};
|
||||
public:
|
||||
@ -120,7 +82,7 @@ namespace CppJieba
|
||||
|
||||
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
|
||||
{
|
||||
if(_blackSet.end() != _blackSet.find(itr->first))
|
||||
if(_stopWords.end() != _stopWords.find(itr->first))
|
||||
{
|
||||
itr = wordmap.erase(itr);
|
||||
continue;
|
||||
@ -145,6 +107,58 @@ namespace CppJieba
|
||||
keywords.resize(topN);
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
void _loadIdfDict(const string& idfPath)
|
||||
{
|
||||
ifstream ifs(idfPath.c_str());
|
||||
if(!ifs)
|
||||
{
|
||||
LogError("open %s failed.", idfPath.c_str());
|
||||
assert(false);
|
||||
}
|
||||
string line ;
|
||||
vector<string> buf;
|
||||
double idf = 0.0;
|
||||
double idfSum = 0.0;
|
||||
size_t lineno = 0;
|
||||
for(;getline(ifs, line); lineno++)
|
||||
{
|
||||
buf.clear();
|
||||
if(line.empty())
|
||||
{
|
||||
LogError("line[%d] empty. skipped.", lineno);
|
||||
continue;
|
||||
}
|
||||
if(!split(line, buf, " ") || buf.size() != 2)
|
||||
{
|
||||
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
|
||||
continue;
|
||||
}
|
||||
idf = atof(buf[1].c_str());
|
||||
_idfMap[buf[0]] = idf;
|
||||
idfSum += idf;
|
||||
|
||||
}
|
||||
|
||||
assert(lineno);
|
||||
_idfAverage = idfSum / lineno;
|
||||
assert(_idfAverage > 0.0);
|
||||
}
|
||||
void _loadStopWordDict(const string& filePath)
|
||||
{
|
||||
ifstream ifs(filePath.c_str());
|
||||
if(!ifs)
|
||||
{
|
||||
LogError("open %s failed.", filePath.c_str());
|
||||
assert(false);
|
||||
}
|
||||
string line ;
|
||||
while(getline(ifs, line))
|
||||
{
|
||||
_stopWords.insert(line);
|
||||
}
|
||||
assert(_stopWords.size());
|
||||
}
|
||||
private:
|
||||
bool _isSingleWord(const string& str) const
|
||||
{
|
||||
|
@ -3,7 +3,7 @@ using namespace CppJieba;
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
|
||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
||||
string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。");
|
||||
vector<pair<string, double> > wordweights;
|
||||
size_t topN = 5;
|
||||
|
@ -7,7 +7,7 @@ using namespace CppJieba;
|
||||
|
||||
TEST(KeywordExtractorTest, Test1)
|
||||
{
|
||||
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8");
|
||||
KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
||||
string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,迎娶白富美,走上人生巅峰。");
|
||||
string res;
|
||||
vector<pair<string, double> > wordweights;
|
||||
|
Loading…
x
Reference in New Issue
Block a user