mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add struct WordInfo
This commit is contained in:
parent
5c14c3c07f
commit
081c9904c7
@ -69,62 +69,68 @@ namespace CppJieba
|
||||
return true;
|
||||
}
|
||||
|
||||
// Ordering predicate handed to sort() in _extractTopN.
// This span is a rendered diff, so two versions are interleaved: the
// _pair_compare signature with `a.second < b.second` works on
// pair<string, double>, while the _wordInfoCompare signature with
// `a.weight < b.weight` works on WordInfo.
// Both return true when a's numeric value is strictly less than b's, which
// makes sort() arrange the elements in ascending order of that value.
// NOTE(review): _extractTopN keeps the FIRST topN elements after sorting, so
// the smallest weights are the ones selected -- confirm that a lower weight is
// meant to indicate a more significant keyword.
bool KeyWordExt::_pair_compare(const pair<string, double>& a, const pair<string, double>& b)
|
||||
bool KeyWordExt::_wordInfoCompare(const WordInfo& a, const WordInfo& b)
|
||||
{
|
||||
return a.second < b.second;
|
||||
return a.weight < b.weight;
|
||||
}
|
||||
// Scores every entry of `words`, sorts the scored entries, and copies the
// leading ones into `keywords`.
//
// words    - input words; each one is passed to _segment.getUtf8WordWeight().
// keywords - output; cleared first, then filled with at most topN words.
// topN     - upper bound on the number of words copied out.
// Returns true on every path visible here.
//
// This span is a rendered diff, so two versions are interleaved: the lines
// using `tmp` / pair<string, double> / _pair_compare appear to be the
// pre-change code, and the lines using `wordInfos` / WordInfo /
// _wordInfoCompare the post-change code.
bool KeyWordExt::_extractTopN(const vector<string>& words, vector<string>& keywords, uint topN)
|
||||
{
|
||||
keywords.clear();
|
||||
vector<pair<string, double> > tmp;
|
||||
|
||||
vector<WordInfo> wordInfos;
|
||||
for(uint i = 0; i < words.size(); i++)
|
||||
{
|
||||
double w = _segment.getUtf8WordWeight(words[i]);
|
||||
tmp.push_back(pair<string, double>(words[i], w));
|
||||
WordInfo wInfo;
|
||||
wInfo.word = words[i];
|
||||
|
||||
wInfo.weight = w;
|
||||
// idf receives the same value as weight here; no separate idf is computed.
|
||||
wInfo.idf = w;
|
||||
wordInfos.push_back(wInfo);
|
||||
}
|
||||
|
||||
sort(tmp.begin(), tmp.end(), _pair_compare);
|
||||
// _wordInfoCompare is `a.weight < b.weight`, so this orders by ascending weight.
sort(wordInfos.begin(), wordInfos.end(), _wordInfoCompare);
|
||||
|
||||
//logging result
|
||||
/*
|
||||
vector<string> logBuf;//for LogDebug
|
||||
for(uint i = 0; i < tmp.size(); i++)
|
||||
{
|
||||
logBuf.push_back(string_format("(%s,%lf)", tmp[i].first.c_str(), tmp[i].second));
|
||||
}
|
||||
LogDebug(string_format("calc weight:%s",joinStr(logBuf, ",").c_str()));
|
||||
*/
|
||||
LogDebug(string_format("calc weight & sorted:%s",joinWordInfos(wordInfos).c_str()));
|
||||
|
||||
//extract TopN
|
||||
for(uint i = 0; i < topN && i < tmp.size(); i++)
|
||||
// Stops at whichever is smaller: topN or the number of scored words.
for(uint i = 0; i < topN && i < wordInfos.size(); i++)
|
||||
{
|
||||
keywords.push_back(tmp[i].first);
|
||||
keywords.push_back(wordInfos[i].word);
|
||||
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool KeyWordExt::extract(const string& utf8Str, vector<string>& keywords, uint topN)
|
||||
{
|
||||
LogInfo(utf8Str);
|
||||
LogInfo(string_format("title:[%s]",utf8Str.c_str()));
|
||||
|
||||
bool retFlag;
|
||||
vector<string> tmp;
|
||||
retFlag = _segment.cutDAG(utf8Str, tmp);
|
||||
vector<string> words;
|
||||
retFlag = _segment.cutDAG(utf8Str, words);
|
||||
if(!retFlag)
|
||||
{
|
||||
LogError(string_format("cutDAG(%s) failed.", utf8Str.c_str()));
|
||||
return false;
|
||||
}
|
||||
// like str.join([]) in python
|
||||
LogDebug(string_format("cutDAG result:[%s]", joinStr(tmp, ",").c_str()));
|
||||
LogDebug(string_format("cutDAG result:[%s]", joinStr(words, ",").c_str()));
|
||||
|
||||
retFlag = _filter(tmp);
|
||||
retFlag = _filter(words);
|
||||
if(!retFlag)
|
||||
{
|
||||
LogError("_filter failed.");
|
||||
return false;
|
||||
}
|
||||
LogDebug(string_format("_filter res:[%s]", joinStr(tmp, ",").c_str()));
|
||||
LogDebug(string_format("_filter res:[%s]", joinStr(words, ",").c_str()));
|
||||
|
||||
retFlag = _extractTopN(tmp, keywords, topN);
|
||||
retFlag = _extractTopN(words, keywords, topN);
|
||||
if(!retFlag)
|
||||
{
|
||||
LogError("_extractTopN failed.");
|
||||
|
@ -6,6 +6,33 @@
|
||||
namespace CppJieba
|
||||
{
|
||||
|
||||
struct WordInfo
|
||||
{
|
||||
string word;
|
||||
double weight;
|
||||
double idf;
|
||||
WordInfo()
|
||||
{
|
||||
word = "";
|
||||
weight = 0.0;
|
||||
idf = 0.0;
|
||||
}
|
||||
string getInfoStr() const
|
||||
{
|
||||
return string_format("{word:%s, weight:%lf, idf:%lf}", word.c_str(), weight, idf);
|
||||
}
|
||||
};
|
||||
|
||||
inline string joinWordInfos(const vector<WordInfo>& vec)
|
||||
{
|
||||
vector<string> tmp;
|
||||
for(uint i = 0; i < vec.size(); i++)
|
||||
{
|
||||
tmp.push_back(vec[i].getInfoStr());
|
||||
};
|
||||
return joinStr(tmp, ",");
|
||||
}
|
||||
|
||||
class KeyWordExt
|
||||
{
|
||||
private:
|
||||
@ -28,7 +55,8 @@ namespace CppJieba
|
||||
public:
|
||||
bool extract(const string& utf8Str, vector<string>& keywords, uint topN);
|
||||
private:
|
||||
static bool _pair_compare(const pair<string, double>& a, const pair<string, double>& b);
|
||||
static bool _wordInfoCompare(const WordInfo& a, const WordInfo& b);
|
||||
private:
|
||||
bool _extractTopN(const vector<string>& words, vector<string>& keywords, uint topN);
|
||||
bool _filter(vector<string>& utf8Strs);
|
||||
bool _filterDuplicate(vector<string>& utf8Strs);
|
||||
|
Loading…
x
Reference in New Issue
Block a user