#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H #define CPPJIEBA_KEYWORD_EXTRACTOR_H #include "MixSegment.hpp" #include #include namespace CppJieba { using namespace Limonp; /*utf8*/ class KeywordExtractor { public: KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") : segment_(dictPath, hmmFilePath, userDict) { loadIdfDict_(idfPath); loadStopWordDict_(stopWordPath); } KeywordExtractor(const DictTrie* dictTrie, const HMMModel* model, const string& idfPath, const string& stopWordPath) : segment_(dictTrie, model){ loadIdfDict_(idfPath); loadStopWordDict_(stopWordPath); } ~KeywordExtractor() { } bool extract(const string& str, vector& keywords, size_t topN) const { vector > topWords; if(!extract(str, topWords, topN)) { return false; } for(size_t i = 0; i < topWords.size(); i++) { keywords.push_back(topWords[i].first); } return true; } bool extract(const string& str, vector >& keywords, size_t topN) const { vector words; if(!segment_.cut(str, words)) { LogError("segment cut(%s) failed.", str.c_str()); return false; } map wordmap; for(vector::iterator iter = words.begin(); iter != words.end(); iter++) { if(isSingleWord_(*iter)) { continue; } wordmap[*iter] += 1.0; } for(map::iterator itr = wordmap.begin(); itr != wordmap.end(); ) { if(stopWords_.end() != stopWords_.find(itr->first)) { wordmap.erase(itr++); continue; } unordered_map::const_iterator cit = idfMap_.find(itr->first); if(cit != idfMap_.end()) { itr->second *= cit->second; } else { itr->second *= idfAverage_; } itr ++; } keywords.clear(); std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin())); topN = min(topN, keywords.size()); partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), cmp_); keywords.resize(topN); return true; } private: void loadIdfDict_(const string& idfPath) { ifstream ifs(idfPath.c_str()); if(!ifs.is_open()) { LogFatal("open %s failed.", idfPath.c_str()); } string line ; vector buf; double idf = 0.0; double idfSum = 0.0; size_t lineno = 0; for(; getline(ifs, line); lineno++) { buf.clear(); if(line.empty()) { LogError("line[%d] empty. skipped.", lineno); continue; } if(!split(line, buf, " ") || buf.size() != 2) { LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); continue; } idf = atof(buf[1].c_str()); idfMap_[buf[0]] = idf; idfSum += idf; } assert(lineno); idfAverage_ = idfSum / lineno; assert(idfAverage_ > 0.0); } void loadStopWordDict_(const string& filePath) { ifstream ifs(filePath.c_str()); if(!ifs.is_open()) { LogFatal("open %s failed.", filePath.c_str()); } string line ; while(getline(ifs, line)) { stopWords_.insert(line); } assert(stopWords_.size()); } bool isSingleWord_(const string& str) const { Unicode unicode; TransCode::decode(str, unicode); if(unicode.size() == 1) return true; return false; } static bool cmp_(const pair& lhs, const pair& rhs) { return lhs.second > rhs.second; } private: MixSegment segment_; unordered_map idfMap_; double idfAverage_; unordered_set stopWords_; }; } #endif