#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H #define CPPJIEBA_KEYWORD_EXTRACTOR_H #include "MixSegment.hpp" #include #include #define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) namespace CppJieba { using namespace Limonp; /*utf8*/ const char * BLACK_LIST[] = {"我们", "他们"}; class KeywordExtractor: public InitOnOff { private: MixSegment _segment; private: unordered_map _idfMap; double _idfAverage; unordered_set _blackSet; public: KeywordExtractor(){_setInitFlag(false);}; explicit KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath) { _setInitFlag(init(dictPath, hmmFilePath, idfPath)); }; ~KeywordExtractor(){}; public: bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath) { ifstream ifs(idfPath.c_str()); if(!ifs) { LogError("open %s failed.", idfPath.c_str()); return false; } string line ; vector buf; double idf = 0.0; double idfSum = 0.0; size_t lineno = 0; for(;getline(ifs, line); lineno++) { buf.clear(); if(line.empty()) { LogError("line[%d] empty. skipped.", lineno); continue; } if(!split(line, buf, " ") || buf.size() != 2) { LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); continue; } idf = atof(buf[1].c_str()); _idfMap[buf[0]] = idf; idfSum += idf; } std::copy( BLACK_LIST, BLACK_LIST + sizeof(BLACK_LIST)/sizeof(BLACK_LIST[0]), std::inserter(_blackSet, _blackSet.begin())); assert(lineno); _idfAverage = idfSum / lineno; assert(_idfAverage > 0.0); return _setInitFlag(_segment.init(dictPath, hmmFilePath)); }; public: bool extract(const string& str, vector& keywords, uint topN) const { assert(_getInitFlag()); vector > topWords; if(!extract(str, topWords, topN)) { return false; } for(uint i = 0; i < topWords.size(); i++) { keywords.push_back(topWords[i].first); } return true; } bool extract(const string& str, vector >& keywords, uint topN) const { vector words; if(!_segment.cut(str, words)) { LogError("segment cut(%s) failed.", str.c_str()); return false; } // filtering single word. for(vector::iterator iter = words.begin(); iter != words.end(); ) { if(_isSingleWord(*iter)) { iter = words.erase(iter); } else { iter++; } } unordered_map wordmap; for(uint i = 0; i < words.size(); i ++) { wordmap[ words[i] ] += 1.0; } for(unordered_map::iterator itr = wordmap.begin(); itr != wordmap.end(); ) { if(_blackSet.end() != _blackSet.find(itr->first)) { itr = wordmap.erase(itr); continue; } unordered_map::const_iterator cit = _idfMap.find(itr->first); if(cit != _idfMap.end()) { itr->second *= cit->second; } else { itr->second *= _idfAverage; } itr ++; } keywords.clear(); std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin())); topN = MIN(topN, keywords.size()); partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp); return true; } private: bool _isSingleWord(const string& str) const { Unicode unicode; TransCode::decode(str, unicode); if(unicode.size() == 1) return true; return false; } private: static bool _cmp(const pair& lhs, const pair& rhs) { return lhs.second > rhs.second; } }; } #endif