From 251b0b93347b6aa67af982667e1caa65ff2ba91b Mon Sep 17 00:00:00 2001
From: gwdwyy
Date: Mon, 15 Jul 2013 16:19:58 +0800
Subject: [PATCH] add _filterStopwords keywordext.cpp/h

---
Review notes (free text placed before the first "diff --git" line is not
part of the commit message or of the diff):
 * Line breaks, indentation, blank context lines and the template arguments
   (vector<string>, set<string>) were reconstructed from a whitespace-collapsed
   copy of this patch. Verify them against the target tree, or apply with
   whitespace tolerance.
 * Added lines were revised: log-message typos fixed, the stop-word file
   stream is checked after opening, CRLF endings and blank lines are handled,
   the size_t/%d mismatch is removed, and _filterStopWords no longer erases
   inside its loop. Hunk headers and the diffstat were recomputed; the "index"
   post-image hash of KeyWordExt.cpp no longer matches the revised content.

 src/KeyWordExt.cpp | 83 ++++++++++++++++++++++++++++++++++++++++++++++
 src/KeyWordExt.h   |  5 +++
 2 files changed, 88 insertions(+)

diff --git a/src/KeyWordExt.cpp b/src/KeyWordExt.cpp
index 7eaaaf7..bd188a3 100644
--- a/src/KeyWordExt.cpp
+++ b/src/KeyWordExt.cpp
@@ -14,8 +14,57 @@ namespace CppJieba
 
     bool KeyWordExt::init(const char * const filePath)
     {
+        if(!checkFileExist(filePath))
+        {
+            LogError(string_format("can't find file[%s].",filePath));
+            return false;
+        }
         return _segment.init(filePath);
     }
+
+    // Load stop words from filePath, one word per line, into _stopWords.
+    // Returns false (after logging) when stop words have already been
+    // loaded or the file is missing or cannot be opened; true otherwise.
+    bool KeyWordExt::loadStopWords(const char * const filePath)
+    {
+        if(!_stopWords.empty())
+        {
+            LogError("_stopWords has been loaded before! ");
+            return false;
+        }
+        if(!checkFileExist(filePath))
+        {
+            LogError(string_format("can't find file[%s].",filePath));
+            return false;
+        }
+
+        ifstream ifile(filePath);
+        if(!ifile)
+        {
+            // the file exists but the stream failed to open (e.g. no read permission)
+            LogError(string_format("open file[%s] failed.",filePath));
+            return false;
+        }
+        string line;
+        while(getline(ifile, line))
+        {
+            // drop the '\r' left by CRLF line endings so the word can still match
+            if(!line.empty() && line[line.size() - 1] == '\r')
+            {
+                line.erase(line.size() - 1);
+            }
+            // a blank line is not a stop word
+            if(line.empty())
+            {
+                continue;
+            }
+            _stopWords.insert(line);
+        }
+        // size() returns size_t while %d expects an int, hence the cast
+        LogDebug(string_format("load stopwords[%d] finished.", static_cast<int>(_stopWords.size())));
+
+        return true;
+    }
 
     bool KeyWordExt::destroy()
     {
@@ -109,6 +158,14 @@ namespace CppJieba
         }
         LogDebug(string_format("_filterSingleWord res:[%s]", joinStr(utf8Strs, ",").c_str()));
 
+        retFlag = _filterStopWords(utf8Strs);
+        if(!retFlag)
+        {
+            LogError("_filterStopWords failed.");
+            return false;
+        }
+        LogDebug(string_format("_filterStopWords res:[%s]", joinStr(utf8Strs, ",").c_str()));
+
         retFlag = _filterSubstr(utf8Strs);
         if(!retFlag)
         {
@@ -120,6 +177,31 @@ namespace CppJieba
         return true;
     }
 
+    // Remove every word found in _stopWords from utf8Strs, keeping the
+    // remaining words in their original order. Always returns true.
+    bool KeyWordExt::_filterStopWords(vector<string>& utf8Strs)
+    {
+        if(_stopWords.empty())
+        {
+            return true;
+        }
+        // single pass: move kept words to the front, then truncate the tail
+        size_t kept = 0;
+        for(size_t i = 0; i < utf8Strs.size(); i++)
+        {
+            if(_stopWords.find(utf8Strs[i]) == _stopWords.end())
+            {
+                if(kept != i)
+                {
+                    utf8Strs[kept] = utf8Strs[i];
+                }
+                kept++;
+            }
+        }
+        utf8Strs.resize(kept);
+        return true;
+    }
+
     bool KeyWordExt::_filterDuplicate(vector<string>& utf8Strs)
     {
         set<string> st;
@@ -206,6 +288,7 @@ int main()
     {
         return 1;
     }
+    ext.loadStopWords("stopwords.tmp");
     //segment.init("dicts/jieba.dict.utf8");
 
     vector<string> res;
diff --git a/src/KeyWordExt.h b/src/KeyWordExt.h
index cb5fbf7..8bc5cce 100644
--- a/src/KeyWordExt.h
+++ b/src/KeyWordExt.h
@@ -5,14 +5,18 @@
 
 namespace CppJieba
 {
+
     class KeyWordExt
     {
         private:
             Segment _segment;
+            set<string> _stopWords;
         public:
             KeyWordExt();
             ~KeyWordExt();
             bool init(const char * const filePath);
+
+            bool loadStopWords(const char * const filePath);
             bool destroy();
 
         public:
@@ -24,6 +28,7 @@ namespace CppJieba
             bool _filterDuplicate(vector<string>& utf8Strs);
             bool _filterSingleWord(vector<string>& utf8Strs);
             bool _filterSubstr(vector<string>& utf8Strs);
+            bool _filterStopWords(vector<string>& utf8Strs);
 
     };
 