add _filterStopWords to keywordext.cpp/h

This commit is contained in:
gwdwyy 2013-07-15 16:19:58 +08:00
parent 1feeef81cb
commit 251b0b9334
2 changed files with 63 additions and 0 deletions

View File

@ -14,9 +14,38 @@ namespace CppJieba
bool KeyWordExt::init(const char * const filePath) bool KeyWordExt::init(const char * const filePath)
{ {
if(!checkFileExist(filePath))
{
LogError(string_format("cann't find fiel[%s].",filePath));
return false;
}
return _segment.init(filePath); return _segment.init(filePath);
} }
// Load the stop-word list from a text file, one entry per line.
//
// filePath: path of the stop-word file; must not be NULL.
// Returns false (after logging an error) when _stopWords is already
// populated, when filePath is NULL, when checkFileExist() rejects the path,
// or when the file cannot be opened for reading. Returns true once every
// line has been consumed.
bool KeyWordExt::loadStopWords(const char * const filePath)
{
    // Refuse to merge a second list into an already populated set.
    if(!_stopWords.empty())
    {
        LogError("_stopWords has been loaded before! ");
        return false;
    }
    // A null pointer must never reach the "%s" formatting below.
    if(!filePath)
    {
        LogError("filePath is NULL.");
        return false;
    }
    if(!checkFileExist(filePath))
    {
        LogError(string_format("can't find file[%s].", filePath));
        return false;
    }
    ifstream ifile(filePath);
    // The stream itself can still fail to open (e.g. missing read
    // permission), so check it instead of silently loading nothing.
    if(!ifile)
    {
        LogError(string_format("open file[%s] failed.", filePath));
        return false;
    }
    string line;
    while(getline(ifile, line))
    {
        // getline() only strips '\n'; drop the '\r' left by CRLF files so
        // the stored word can actually match a segmented token.
        if(!line.empty() && line[line.size() - 1] == '\r')
        {
            line.erase(line.size() - 1);
        }
        // Blank lines carry no stop word; do not store an empty entry.
        if(line.empty())
        {
            continue;
        }
        _stopWords.insert(line);
    }
    // size() yields a size_t; convert explicitly so the argument type matches
    // the "%d" conversion (assumes string_format follows printf conventions).
    LogDebug(string_format("load stopwords[%d] finished.", static_cast<int>(_stopWords.size())));
    return true;
}
bool KeyWordExt::destroy() bool KeyWordExt::destroy()
{ {
_segment.destroy(); _segment.destroy();
@ -109,6 +138,14 @@ namespace CppJieba
} }
LogDebug(string_format("_filterSingleWord res:[%s]", joinStr(utf8Strs, ",").c_str())); LogDebug(string_format("_filterSingleWord res:[%s]", joinStr(utf8Strs, ",").c_str()));
retFlag = _filterStopWords(utf8Strs);
if(!retFlag)
{
LogError("_filterStopWords failed.");
return false;
}
LogDebug(string_format("_filterStopWords res:[%s]", joinStr(utf8Strs, ",").c_str()));
retFlag = _filterSubstr(utf8Strs); retFlag = _filterSubstr(utf8Strs);
if(!retFlag) if(!retFlag)
{ {
@ -120,6 +157,26 @@ namespace CppJieba
return true; return true;
} }
// Remove every entry of utf8Strs that is present in _stopWords. The
// surviving entries keep their original relative order.
//
// utf8Strs: candidate words, filtered in place.
// Always returns true; when _stopWords is empty utf8Strs is left untouched.
bool KeyWordExt::_filterStopWords(vector<string>& utf8Strs)
{
    if(_stopWords.empty())
    {
        return true;
    }
    // Single-pass compaction: survivors are moved towards the front and the
    // tail is trimmed once at the end. Calling vector::erase() per match
    // would shift the remaining elements every time (quadratic overall).
    vector<string>::size_type kept = 0;
    for(vector<string>::size_type i = 0; i < utf8Strs.size(); ++i)
    {
        if(_stopWords.find(utf8Strs[i]) != _stopWords.end())
        {
            continue; // stop word: leave it behind to be trimmed
        }
        if(kept != i)
        {
            // swap rather than assign so no character data is copied
            utf8Strs[kept].swap(utf8Strs[i]);
        }
        ++kept;
    }
    utf8Strs.erase(utf8Strs.begin() + kept, utf8Strs.end());
    return true;
}
bool KeyWordExt::_filterDuplicate(vector<string>& utf8Strs) bool KeyWordExt::_filterDuplicate(vector<string>& utf8Strs)
{ {
set<string> st; set<string> st;
@ -206,6 +263,7 @@ int main()
{ {
return 1; return 1;
} }
ext.loadStopWords("stopwords.tmp");
//segment.init("dicts/jieba.dict.utf8"); //segment.init("dicts/jieba.dict.utf8");
vector<string> res; vector<string> res;

View File

@ -5,14 +5,18 @@
namespace CppJieba namespace CppJieba
{ {
class KeyWordExt class KeyWordExt
{ {
private: private:
Segment _segment; Segment _segment;
set<string> _stopWords;
public: public:
KeyWordExt(); KeyWordExt();
~KeyWordExt(); ~KeyWordExt();
bool init(const char * const filePath); bool init(const char * const filePath);
bool loadStopWords(const char * const filePath);
bool destroy(); bool destroy();
public: public:
@ -24,6 +28,7 @@ namespace CppJieba
bool _filterDuplicate(vector<string>& utf8Strs); bool _filterDuplicate(vector<string>& utf8Strs);
bool _filterSingleWord(vector<string>& utf8Strs); bool _filterSingleWord(vector<string>& utf8Strs);
bool _filterSubstr(vector<string>& utf8Strs); bool _filterSubstr(vector<string>& utf8Strs);
bool _filterStopWords(vector<string>& utf8Strs);
}; };