From 6de292a56d5084a8f174994e1b495630e17e021b Mon Sep 17 00:00:00 2001 From: wyy Date: Sat, 15 Mar 2014 23:31:59 +0800 Subject: [PATCH] add stopword in KeywordExtractor --- src/KeywordExtractor.hpp | 106 ++++++++++++++++------------ test/keyword_demo.cpp | 2 +- test/unittest/TKeywordExtractor.cpp | 2 +- 3 files changed, 62 insertions(+), 48 deletions(-) diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index 2e988be..a59fd6b 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -11,8 +11,6 @@ namespace CppJieba using namespace Limonp; /*utf8*/ - const char * const BLACK_LIST[] = {"我们", "他们"}; - class KeywordExtractor: public InitOnOff { private: @@ -21,56 +19,20 @@ namespace CppJieba unordered_map _idfMap; double _idfAverage; - unordered_set _blackSet; + unordered_set _stopWords; public: KeywordExtractor(){_setInitFlag(false);}; - explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath) + explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath) { - _setInitFlag(init(dictPath, hmmFilePath, idfPath)); + _setInitFlag(init(dictPath, hmmFilePath, idfPath, stopWordPath)); }; ~KeywordExtractor(){}; + public: - bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath) + bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath) { - ifstream ifs(idfPath.c_str()); - if(!ifs) - { - LogError("open %s failed.", idfPath.c_str()); - return false; - } - string line ; - vector buf; - double idf = 0.0; - double idfSum = 0.0; - size_t lineno = 0; - for(;getline(ifs, line); lineno++) - { - buf.clear(); - if(line.empty()) - { - LogError("line[%d] empty. skipped.", lineno); - continue; - } - if(!split(line, buf, " ") || buf.size() != 2) - { - LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); - continue; - } - idf = atof(buf[1].c_str()); - _idfMap[buf[0]] = idf; - idfSum += idf; - - } - - std::copy( - BLACK_LIST, BLACK_LIST + sizeof(BLACK_LIST)/sizeof(BLACK_LIST[0]), - std::inserter(_blackSet, _blackSet.begin())); - - assert(lineno); - _idfAverage = idfSum / lineno; - - assert(_idfAverage > 0.0); - + _loadIdfDict(idfPath); + _loadStopWordDict(stopWordPath); return _setInitFlag(_segment.init(dictPath, hmmFilePath)); }; public: @@ -120,7 +82,7 @@ namespace CppJieba for(map::iterator itr = wordmap.begin(); itr != wordmap.end(); ) { - if(_blackSet.end() != _blackSet.find(itr->first)) + if(_stopWords.end() != _stopWords.find(itr->first)) { itr = wordmap.erase(itr); continue; @@ -145,6 +107,58 @@ namespace CppJieba keywords.resize(topN); return true; } + private: + void _loadIdfDict(const string& idfPath) + { + ifstream ifs(idfPath.c_str()); + if(!ifs) + { + LogError("open %s failed.", idfPath.c_str()); + assert(false); + } + string line ; + vector buf; + double idf = 0.0; + double idfSum = 0.0; + size_t lineno = 0; + for(;getline(ifs, line); lineno++) + { + buf.clear(); + if(line.empty()) + { + LogError("line[%d] empty. skipped.", lineno); + continue; + } + if(!split(line, buf, " ") || buf.size() != 2) + { + LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); + continue; + } + idf = atof(buf[1].c_str()); + _idfMap[buf[0]] = idf; + idfSum += idf; + + } + + assert(lineno); + _idfAverage = idfSum / lineno; + assert(_idfAverage > 0.0); + } + void _loadStopWordDict(const string& filePath) + { + ifstream ifs(filePath.c_str()); + if(!ifs) + { + LogError("open %s failed.", filePath.c_str()); + assert(false); + } + string line ; + while(getline(ifs, line)) + { + _stopWords.insert(line); + } + assert(_stopWords.size()); + } private: bool _isSingleWord(const string& str) const { diff --git a/test/keyword_demo.cpp b/test/keyword_demo.cpp index 9e0d9eb..7f38f20 100644 --- a/test/keyword_demo.cpp +++ b/test/keyword_demo.cpp @@ -3,7 +3,7 @@ using namespace CppJieba; int main(int argc, char ** argv) { - KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); + KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8"); string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。"); vector > wordweights; size_t topN = 5; diff --git a/test/unittest/TKeywordExtractor.cpp b/test/unittest/TKeywordExtractor.cpp index c2d78f0..4363fb5 100644 --- a/test/unittest/TKeywordExtractor.cpp +++ b/test/unittest/TKeywordExtractor.cpp @@ -7,7 +7,7 @@ using namespace CppJieba; TEST(KeywordExtractorTest, Test1) { - KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8"); + KeywordExtractor extractor("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8"); string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,迎娶白富美,走上人生巅峰。"); string res; vector > wordweights;