mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add _filterStopwords keywordext.cpp/h
This commit is contained in:
parent
1feeef81cb
commit
251b0b9334
@ -14,9 +14,38 @@ namespace CppJieba
|
|||||||
|
|
||||||
bool KeyWordExt::init(const char * const filePath)
|
bool KeyWordExt::init(const char * const filePath)
|
||||||
{
|
{
|
||||||
|
if(!checkFileExist(filePath))
|
||||||
|
{
|
||||||
|
LogError(string_format("cann't find fiel[%s].",filePath));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
return _segment.init(filePath);
|
return _segment.init(filePath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool KeyWordExt::loadStopWords(const char * const filePath)
|
||||||
|
{
|
||||||
|
if(!_stopWords.empty())
|
||||||
|
{
|
||||||
|
LogError("_stopWords has been loaded before! ");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if(!checkFileExist(filePath))
|
||||||
|
{
|
||||||
|
LogError(string_format("cann't find fiel[%s].",filePath));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
ifstream ifile(filePath);
|
||||||
|
string line;
|
||||||
|
while(getline(ifile, line))
|
||||||
|
{
|
||||||
|
_stopWords.insert(line);
|
||||||
|
}
|
||||||
|
LogDebug(string_format("load stopwords[%d] finished.", _stopWords.size()));
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool KeyWordExt::destroy()
|
bool KeyWordExt::destroy()
|
||||||
{
|
{
|
||||||
_segment.destroy();
|
_segment.destroy();
|
||||||
@ -109,6 +138,14 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
LogDebug(string_format("_filterSingleWord res:[%s]", joinStr(utf8Strs, ",").c_str()));
|
LogDebug(string_format("_filterSingleWord res:[%s]", joinStr(utf8Strs, ",").c_str()));
|
||||||
|
|
||||||
|
retFlag = _filterStopWords(utf8Strs);
|
||||||
|
if(!retFlag)
|
||||||
|
{
|
||||||
|
LogError("_filterStopWords failed.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
LogDebug(string_format("_filterStopWords res:[%s]", joinStr(utf8Strs, ",").c_str()));
|
||||||
|
|
||||||
retFlag = _filterSubstr(utf8Strs);
|
retFlag = _filterSubstr(utf8Strs);
|
||||||
if(!retFlag)
|
if(!retFlag)
|
||||||
{
|
{
|
||||||
@ -120,6 +157,26 @@ namespace CppJieba
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Remove every string that appears in _stopWords from utf8Strs.
 *
 * The relative order of the surviving strings is preserved. When no stop
 * words have been loaded the vector is left untouched.
 *
 * @param utf8Strs candidate words; filtered in place.
 * @return always true.
 */
bool KeyWordExt::_filterStopWords(vector<string>& utf8Strs)
{
    if(_stopWords.empty())
    {
        return true;
    }

    // Single-pass compaction: survivors are swapped down to the front and the
    // tail is cut off once at the end. Calling vector::erase per match would
    // shift the remainder of the vector every time (quadratic overall).
    size_t keep = 0;
    for(size_t i = 0; i < utf8Strs.size(); ++i)
    {
        if(_stopWords.find(utf8Strs[i]) != _stopWords.end())
        {
            continue;
        }
        if(keep != i)
        {
            utf8Strs[keep].swap(utf8Strs[i]);
        }
        ++keep;
    }
    utf8Strs.resize(keep);
    return true;
}
|
||||||
|
|
||||||
bool KeyWordExt::_filterDuplicate(vector<string>& utf8Strs)
|
bool KeyWordExt::_filterDuplicate(vector<string>& utf8Strs)
|
||||||
{
|
{
|
||||||
set<string> st;
|
set<string> st;
|
||||||
@ -206,6 +263,7 @@ int main()
|
|||||||
{
|
{
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
ext.loadStopWords("stopwords.tmp");
|
||||||
//segment.init("dicts/jieba.dict.utf8");
|
//segment.init("dicts/jieba.dict.utf8");
|
||||||
|
|
||||||
vector<string> res;
|
vector<string> res;
|
||||||
|
@ -5,14 +5,18 @@
|
|||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba
|
||||||
{
|
{
|
||||||
|
|
||||||
class KeyWordExt
|
class KeyWordExt
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
Segment _segment;
|
Segment _segment;
|
||||||
|
set<string> _stopWords;
|
||||||
public:
|
public:
|
||||||
KeyWordExt();
|
KeyWordExt();
|
||||||
~KeyWordExt();
|
~KeyWordExt();
|
||||||
bool init(const char * const filePath);
|
bool init(const char * const filePath);
|
||||||
|
|
||||||
|
bool loadStopWords(const char * const filePath);
|
||||||
bool destroy();
|
bool destroy();
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -24,6 +28,7 @@ namespace CppJieba
|
|||||||
bool _filterDuplicate(vector<string>& utf8Strs);
|
bool _filterDuplicate(vector<string>& utf8Strs);
|
||||||
bool _filterSingleWord(vector<string>& utf8Strs);
|
bool _filterSingleWord(vector<string>& utf8Strs);
|
||||||
bool _filterSubstr(vector<string>& utf8Strs);
|
bool _filterSubstr(vector<string>& utf8Strs);
|
||||||
|
bool _filterStopWords(vector<string>& utf8Strs);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user