/************************************ * file enc : utf8 * author : wuyanyi09@gmail.com ************************************/ #ifndef CPPJIEBA_KEYWORDEXT_H #define CPPJIEBA_KEYWORDEXT_H #include "Segment.h" namespace CppJieba { struct WordInfo { string word; size_t wLen; double weight; double idf; WordInfo() { word = ""; wLen = 0; weight = 0.0; idf = 0.0; } string getInfoStr() const { return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf); } }; inline string joinWordInfos(const vector& vec) { vector tmp; for(uint i = 0; i < vec.size(); i++) { tmp.push_back(vec[i].getInfoStr()); } return joinStr(tmp, ","); } class KeyWordExt { private: Segment _segment; vector _priorSubWords; set _stopWords; public: KeyWordExt(); ~KeyWordExt(); bool init(); bool loadSegDict(const string& filePath); //load stopwords bool loadStopWords(const string& filePath); //load prior words' prefix bool loadPriorSubWords(const string& filePath); bool dispose(); public: bool extract(const string& title, vector& keywords, uint topN); private: static bool _wordInfoCompare(const WordInfo& a, const WordInfo& b); private: bool _extractTopN(const vector& words, vector& keywords, uint topN); private: //sort by word len - idf bool _sortWLIDF(vector& wordInfos); private: bool _filter(vector& strs); bool _filterDuplicate(vector& strs); bool _filterSingleWord(vector& strs); bool _filterSubstr(vector& strs); bool _filterStopWords(vector& strs); private: bool _prioritizeSubWords(vector& wordInfos); bool _isContainSubWords(const string& word); }; } #endif