diff --git a/demo/keywordext_demo.cpp b/demo/keywordext_demo.cpp index c305c67..648c1e2 100644 --- a/demo/keywordext_demo.cpp +++ b/demo/keywordext_demo.cpp @@ -21,7 +21,7 @@ void testKeyWordExt(const char * dictPath, const char * filePath) return ; } ifstream ifile(filePath); - vector res; + vector res; string line; while(getline(ifile, line)) { @@ -29,7 +29,7 @@ void testKeyWordExt(const char * dictPath, const char * filePath) if(!line.empty()) { ext.extract(line, res, 20); - cout< words; - vector keywords; + vector res; string line; while(getline(ifile, line)) { if(!line.empty()) { seg.cutDAG(line, words); - ext.extract(words, keywords, 20); - cout<\n" + <<"options:\n" + <<"\t--dictpath\tIf is not specified, the default is "<& words, vector& keywords, uint topN) + bool KeyWordExt::_extTopN(vector& wordInfos, uint topN) { - keywords.clear(); - vector wordInfos; - for(uint i = 0; i < words.size(); i++) + int dis = wordInfos.size() - topN; + if(dis <= 0) { - KeyWordInfo wInfo; - wInfo.word = words[i]; - wordInfos.push_back(wInfo); + return true; } - _sortWLIDF(wordInfos); -#ifdef DEBUG - LogDebug(string_format("calc weight & sorted:%s",joinWordInfos(wordInfos).c_str())); -#endif - - _prioritizeSubWords(wordInfos); -#ifdef DEBUG - LogDebug(string_format("_prioritizeSubWords res:%s", joinWordInfos(wordInfos).c_str())); -#endif - //extract TopN - for(uint i = 0; i < topN && i < wordInfos.size(); i++) + if(uint(dis) <= topN) { - keywords.push_back(wordInfos[i].word); + for(int i = 0; i< dis; i++) + { + wordInfos.pop_back(); + } + } + else// in case that topN << size; + { + + vector tmp(wordInfos.begin(), wordInfos.begin() + topN); + wordInfos.swap(tmp); } return true; } - bool KeyWordExt::extract(const vector& _words, vector& keywords, uint topN) + bool KeyWordExt::extract(const vector& words, vector& keyWordInfos, uint topN) { - if(_words.empty()) + if(words.empty()) { return false; } - vector words(_words); - -#ifdef DEBUG +#ifdef DEBU LogDebug(string_format("words:[%s]", joinStr(words, ",").c_str())); #endif - bool retFlag = _filter(words); - if(!retFlag) + keyWordInfos.clear(); + for(uint i = 0; i < words.size(); i++) { - LogError("_filter failed."); - return false; + keyWordInfos.push_back(words[i]); } -#ifdef DEBUG - LogDebug(string_format("_filter res:[%s]", joinStr(words, ",").c_str())); -#endif - - retFlag = _extractTopN(words, keywords, topN); - if(!retFlag) - { - LogError("_extractTopN failed."); - return false; - } - //LogDebug("_extractTopN finished."); - -#ifdef DEBUG - LogDebug(string_format("ext res:[%s]", joinStr(keywords, ",").c_str())); -#endif - - return true; + return _extract(keyWordInfos, topN); } - bool KeyWordExt::extract(const string& title, vector& keywords, uint topN) + bool KeyWordExt::extract(const string& title, vector& keyWordInfos, uint topN) { if(title.empty()) { return false; } + + vector trieNodeInfos; + _segment.cutDAG(title, trieNodeInfos); -#ifdef DEBUG - LogDebug(string_format("title:[%s]",title.c_str())); -#endif - - bool retFlag; - vector words; - retFlag = _segment.cutDAG(title, words); - if(!retFlag) + keyWordInfos.clear(); + for(uint i = 0; i < trieNodeInfos.size(); i++) { - LogError(string_format("cutDAG(%s) failed.", title.c_str())); - return false; + keyWordInfos.push_back(trieNodeInfos[i]); } -#ifdef DEBUG - LogDebug(string_format("cutDAG result:[%s]", joinStr(words, ",").c_str())); -#endif + return _extract(keyWordInfos, topN); + } - retFlag = _filter(words); - if(!retFlag) + bool KeyWordExt::_extract(vector& keyWordInfos, uint topN) + { + if(!_filter(keyWordInfos)) { LogError("_filter failed."); return false; } -#ifdef DEBUG - LogDebug(string_format("_filter res:[%s]", joinStr(words, ",").c_str())); -#endif - - retFlag = _extractTopN(words, keywords, topN); - if(!retFlag) + if(!_sortWLIDF(keyWordInfos)) { - LogError("_extractTopN failed."); + LogError("_sortWLIDF failed."); + return false; + } + + if(!_extTopN(keyWordInfos, topN)) + { + LogError("_extTopN failed."); return false; } - //LogDebug("_extractTopN finished."); -#ifdef DEBUG - LogDebug(string_format("ext res:[%s]", joinStr(keywords, ",").c_str())); -#endif return true; } - bool KeyWordExt::_filter(vector& strs) + bool KeyWordExt::_filter(vector& wordInfos) { - bool retFlag; - retFlag = _filterDuplicate(strs); - if(!retFlag) + if(!_filterDuplicate(wordInfos)) { LogError("_filterDuplicate failed."); return false; } - //LogDebug(string_format("_filterDuplicate res:[%s]", joinStr(strs, ",").c_str())); - retFlag = _filterSingleWord(strs); - if(!retFlag) + if(!_filterSingleWord(wordInfos)) { LogError("_filterSingleWord failed."); return false; } - //LogDebug(string_format("_filterSingleWord res:[%s]", joinStr(strs, ",").c_str())); - retFlag = _filterStopWords(strs); - if(!retFlag) + if(!_filterStopWords(wordInfos)) { LogError("_filterStopWords failed."); return false; } - //LogDebug(string_format("_filterStopWords res:[%s]", joinStr(strs, ",").c_str())); - retFlag = _filterSubstr(strs); - if(!retFlag) + if(!_filterSubstr(wordInfos)) { LogError("_filterSubstr failed."); return false; } - //LogDebug(string_format("_filterSubstr res:[%s]", joinStr(strs, ",").c_str())); return true; } - bool KeyWordExt::_filterStopWords(vector& strs) + bool KeyWordExt::_filterStopWords(vector& wordInfos) { if(_stopWords.empty()) { return true; } - for(VSI it = strs.begin(); it != strs.end();) + for(vector::iterator it = wordInfos.begin(); it != wordInfos.end();) { - if(_stopWords.find(*it) != _stopWords.end()) + if(_stopWords.find(it->word) != _stopWords.end()) { - it = strs.erase(it); + it = wordInfos.erase(it); } else { @@ -280,33 +240,33 @@ namespace CppJieba } - bool KeyWordExt::_filterDuplicate(vector& strs) + bool KeyWordExt::_filterDuplicate(vector& wordInfos) { set st; - for(VSI it = strs.begin(); it != strs.end(); ) + for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) { - if(st.find(*it) != st.end()) + if(st.find(it->word) != st.end()) { - it = strs.erase(it); + it = wordInfos.erase(it); } else { - st.insert(*it); + st.insert(it->word); it++; } } return true; } - bool KeyWordExt::_filterSingleWord(vector& strs) + bool KeyWordExt::_filterSingleWord(vector& wordInfos) { - for(vector::iterator it = strs.begin(); it != strs.end();) + for(vector::iterator it = wordInfos.begin(); it != wordInfos.end();) { // filter single word - if(1 == TransCode::getWordLength(*it)) + if(1 == it->wLen) { - it = strs.erase(it); + it = wordInfos.erase(it); } else { @@ -316,27 +276,31 @@ namespace CppJieba return true; } - bool KeyWordExt::_filterSubstr(vector& strs) + bool KeyWordExt::_filterSubstr(vector& wordInfos) { - vector tmp = strs; + vector tmp ; + for(uint i = 0; i < wordInfos.size(); i++) + { + tmp.push_back(wordInfos[i].word); + } set subs; - for(VSI it = strs.begin(); it != strs.end(); it ++) + for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); it ++) { for(uint j = 0; j < tmp.size(); j++) { - if(*it != tmp[j] && string::npos != tmp[j].find(*it, 0)) + if(it->word != tmp[j] && string::npos != tmp[j].find(it->word, 0)) { - subs.insert(*it); + subs.insert(it->word); } } } //erase subs from strs - for(VSI it = strs.begin(); it != strs.end(); ) + for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) { - if(subs.end() != subs.find(*it)) + if(subs.end() != subs.find(it->word)) { - it = strs.erase(it); + it = wordInfos.erase(it); } else { diff --git a/src/KeyWordExt.h b/src/KeyWordExt.h index 3e8854f..eabf276 100644 --- a/src/KeyWordExt.h +++ b/src/KeyWordExt.h @@ -33,21 +33,22 @@ namespace CppJieba bool dispose(); public: - bool extract(const string& title, vector& keywords, uint topN); - bool extract(const vector& words, vector& keywords, uint topN); + bool extract(const string& title, vector& keyWordInfos, uint topN); + bool extract(const vector& words, vector& keyWordInfos, uint topN); private: static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b); private: - bool _extractTopN(const vector& words, vector& keywords, uint topN); + bool _extract(vector& keyWordInfos, uint topN); + bool _extTopN(vector& wordInfos, uint topN); private: //sort by word len - idf bool _sortWLIDF(vector& wordInfos); private: - bool _filter(vector& strs); - bool _filterDuplicate(vector& strs); - bool _filterSingleWord(vector& strs); - bool _filterSubstr(vector& strs); - bool _filterStopWords(vector& strs); + bool _filter(vector& ); + bool _filterDuplicate(vector& ); + bool _filterSingleWord(vector& ); + bool _filterSubstr(vector& ); + bool _filterStopWords(vector& ); private: bool _prioritizeSubWords(vector& wordInfos); bool _isContainSubWords(const string& word); diff --git a/src/structs.h b/src/structs.h index 9eed706..983c75e 100644 --- a/src/structs.h +++ b/src/structs.h @@ -1,7 +1,9 @@ #ifndef CPPJIEBA_STRUCTS_H #define CPPJIEBA_STRUCTS_H +#include #include "globals.h" +#include "Trie.h" namespace CppJieba { @@ -19,6 +21,13 @@ namespace CppJieba freq = 0; logFreq = 0.0; } + TrieNodeInfo(const string& _word) + { + word = _word; + wLen = TransCode::getWordLength(_word); + freq = 0; + logFreq = -numeric_limits::max(); + } }; @@ -36,7 +45,6 @@ namespace CppJieba }; */ - struct KeyWordInfo: public TrieNodeInfo { @@ -47,10 +55,34 @@ namespace CppJieba idf = 0.0; weight = 0.0; } + KeyWordInfo(const string& _word):TrieNodeInfo(_word) + { + idf = 0.0; + weight = 0.0; + } + KeyWordInfo(const TrieNodeInfo& trieNodeInfo) + { + word = trieNodeInfo.word; + freq = trieNodeInfo.freq; + wLen = trieNodeInfo.wLen; + tag = trieNodeInfo.tag; + logFreq = trieNodeInfo.logFreq; + idf = 0.0; + weight = 0.0; + } string toString() const { return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf); } + KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo) + { + word = trieNodeInfo.word; + freq = trieNodeInfo.freq; + wLen = trieNodeInfo.wLen; + tag = trieNodeInfo.tag; + logFreq = trieNodeInfo.logFreq; + return *this; + } }; inline string joinWordInfos(const vector& vec)