diff --git a/src/KeyWordExt.cpp b/src/KeyWordExt.cpp index df435bf..80bdb1e 100644 --- a/src/KeyWordExt.cpp +++ b/src/KeyWordExt.cpp @@ -69,62 +69,68 @@ namespace CppJieba return true; } - bool KeyWordExt::_pair_compare(const pair& a, const pair& b) + bool KeyWordExt::_wordInfoCompare(const WordInfo& a, const WordInfo& b) { - return a.second < b.second; + return a.weight < b.weight; } bool KeyWordExt::_extractTopN(const vector& words, vector& keywords, uint topN) { keywords.clear(); - vector > tmp; - + vector wordInfos; for(uint i = 0; i < words.size(); i++) { double w = _segment.getUtf8WordWeight(words[i]); - tmp.push_back(pair(words[i], w)); + WordInfo wInfo; + wInfo.word = words[i]; + wInfo.weight = w; + wInfo.idf = w; + wordInfos.push_back(wInfo); } - sort(tmp.begin(), tmp.end(), _pair_compare); + sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare); //logging result + /* vector logBuf;//for LogDebug for(uint i = 0; i < tmp.size(); i++) { logBuf.push_back(string_format("(%s,%lf)", tmp[i].first.c_str(), tmp[i].second)); } LogDebug(string_format("calc weight:%s",joinStr(logBuf, ",").c_str())); + */ + LogDebug(string_format("calc weight & sorted:%s",joinWordInfos(wordInfos).c_str())); //extract TopN - for(uint i = 0; i < topN && i < tmp.size(); i++) + for(uint i = 0; i < topN && i < wordInfos.size(); i++) { - keywords.push_back(tmp[i].first); + keywords.push_back(wordInfos[i].word); } return true; } bool KeyWordExt::extract(const string& utf8Str, vector& keywords, uint topN) { - LogInfo(utf8Str); + LogInfo(string_format("title:[%s]",utf8Str.c_str())); + bool retFlag; - vector tmp; - retFlag = _segment.cutDAG(utf8Str, tmp); + vector words; + retFlag = _segment.cutDAG(utf8Str, words); if(!retFlag) { LogError(string_format("cutDAG(%s) failed.", utf8Str.c_str())); return false; } - // like str.join([]) in python - LogDebug(string_format("cutDAG result:[%s]", joinStr(tmp, ",").c_str())); + LogDebug(string_format("cutDAG result:[%s]", joinStr(words, ",").c_str())); - retFlag = _filter(tmp); + retFlag = _filter(words); if(!retFlag) { LogError("_filter failed."); return false; } - LogDebug(string_format("_filter res:[%s]", joinStr(tmp, ",").c_str())); + LogDebug(string_format("_filter res:[%s]", joinStr(words, ",").c_str())); - retFlag = _extractTopN(tmp, keywords, topN); + retFlag = _extractTopN(words, keywords, topN); if(!retFlag) { LogError("_extractTopN failed."); diff --git a/src/KeyWordExt.h b/src/KeyWordExt.h index d6a5e4a..90658c7 100644 --- a/src/KeyWordExt.h +++ b/src/KeyWordExt.h @@ -6,6 +6,33 @@ namespace CppJieba { + struct WordInfo + { + string word; + double weight; + double idf; + WordInfo() + { + word = ""; + weight = 0.0; + idf = 0.0; + } + string getInfoStr() const + { + return string_format("{word:%s, weight:%lf, idf:%lf}", word.c_str(), weight, idf); + } + }; + + inline string joinWordInfos(const vector& vec) + { + vector tmp; + for(uint i = 0; i < vec.size(); i++) + { + tmp.push_back(vec[i].getInfoStr()); + }; + return joinStr(tmp, ","); + } + class KeyWordExt { private: @@ -28,7 +55,8 @@ namespace CppJieba public: bool extract(const string& utf8Str, vector& keywords, uint topN); private: - static bool _pair_compare(const pair& a, const pair& b); + static bool _wordInfoCompare(const WordInfo& a, const WordInfo& b); + private: bool _extractTopN(const vector& words, vector& keywords, uint topN); bool _filter(vector& utf8Strs); bool _filterDuplicate(vector& utf8Strs);