mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Add _filterStopWords to keywordext.cpp/h
This commit is contained in:
parent
1feeef81cb
commit
251b0b9334
@ -14,8 +14,37 @@ namespace CppJieba
|
||||
|
||||
bool KeyWordExt::init(const char * const filePath)
|
||||
{
|
||||
if(!checkFileExist(filePath))
|
||||
{
|
||||
LogError(string_format("cann't find fiel[%s].",filePath));
|
||||
return false;
|
||||
}
|
||||
return _segment.init(filePath);
|
||||
}
|
||||
|
||||
bool KeyWordExt::loadStopWords(const char * const filePath)
|
||||
{
|
||||
if(!_stopWords.empty())
|
||||
{
|
||||
LogError("_stopWords has been loaded before! ");
|
||||
return false;
|
||||
}
|
||||
if(!checkFileExist(filePath))
|
||||
{
|
||||
LogError(string_format("cann't find fiel[%s].",filePath));
|
||||
return false;
|
||||
}
|
||||
|
||||
ifstream ifile(filePath);
|
||||
string line;
|
||||
while(getline(ifile, line))
|
||||
{
|
||||
_stopWords.insert(line);
|
||||
}
|
||||
LogDebug(string_format("load stopwords[%d] finished.", _stopWords.size()));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool KeyWordExt::destroy()
|
||||
{
|
||||
@ -109,6 +138,14 @@ namespace CppJieba
|
||||
}
|
||||
LogDebug(string_format("_filterSingleWord res:[%s]", joinStr(utf8Strs, ",").c_str()));
|
||||
|
||||
retFlag = _filterStopWords(utf8Strs);
|
||||
if(!retFlag)
|
||||
{
|
||||
LogError("_filterStopWords failed.");
|
||||
return false;
|
||||
}
|
||||
LogDebug(string_format("_filterStopWords res:[%s]", joinStr(utf8Strs, ",").c_str()));
|
||||
|
||||
retFlag = _filterSubstr(utf8Strs);
|
||||
if(!retFlag)
|
||||
{
|
||||
@ -120,6 +157,26 @@ namespace CppJieba
|
||||
return true;
|
||||
}
|
||||
|
||||
// Remove every element of utf8Strs that is present in _stopWords.
//
// The relative order of the remaining elements is preserved. When no stop
// words have been loaded the vector is left untouched.
//
// utf8Strs: candidate words, filtered in place.
// Returns true in all cases.
bool KeyWordExt::_filterStopWords(vector<string>& utf8Strs)
{
    if(_stopWords.empty())
    {
        return true;
    }

    // Compact the survivors towards the front and erase the tail once.
    // Erasing inside the loop would shift the remaining elements on every
    // hit, making the pass quadratic in the number of words.
    VSI keep = utf8Strs.begin();
    for(VSI it = utf8Strs.begin(); it != utf8Strs.end(); ++it)
    {
        if(_stopWords.find(*it) == _stopWords.end())
        {
            if(keep != it)
            {
                // swap() moves the string buffer without copying it.
                keep->swap(*it);
            }
            ++keep;
        }
    }
    utf8Strs.erase(keep, utf8Strs.end());

    return true;
}
|
||||
|
||||
bool KeyWordExt::_filterDuplicate(vector<string>& utf8Strs)
|
||||
{
|
||||
set<string> st;
|
||||
@ -206,6 +263,7 @@ int main()
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
ext.loadStopWords("stopwords.tmp");
|
||||
//segment.init("dicts/jieba.dict.utf8");
|
||||
|
||||
vector<string> res;
|
||||
|
@ -5,14 +5,18 @@
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
|
||||
class KeyWordExt
|
||||
{
|
||||
private:
|
||||
Segment _segment;
|
||||
set<string> _stopWords;
|
||||
public:
|
||||
KeyWordExt();
|
||||
~KeyWordExt();
|
||||
bool init(const char * const filePath);
|
||||
|
||||
bool loadStopWords(const char * const filePath);
|
||||
bool destroy();
|
||||
|
||||
public:
|
||||
@ -24,6 +28,7 @@ namespace CppJieba
|
||||
bool _filterDuplicate(vector<string>& utf8Strs);
|
||||
bool _filterSingleWord(vector<string>& utf8Strs);
|
||||
bool _filterSubstr(vector<string>& utf8Strs);
|
||||
bool _filterStopWords(vector<string>& utf8Strs);
|
||||
|
||||
};
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user