mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
85 lines
1.8 KiB
C++
85 lines
1.8 KiB
C++
/************************************
|
|
* file enc : utf8
|
|
* author : wuyanyi09@gmail.com
|
|
************************************/
|
|
#ifndef CPPJIEBA_KEYWORDEXT_H
|
|
#define CPPJIEBA_KEYWORDEXT_H
|
|
|
|
#include "Segment.h"
|
|
|
|
namespace CppJieba
|
|
{
|
|
|
|
struct WordInfo
|
|
{
|
|
string word;
|
|
size_t wLen;
|
|
double weight;
|
|
double idf;
|
|
WordInfo()
|
|
{
|
|
word = "";
|
|
wLen = 0;
|
|
weight = 0.0;
|
|
idf = 0.0;
|
|
}
|
|
string getInfoStr() const
|
|
{
|
|
return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
|
|
}
|
|
};
|
|
|
|
inline string joinWordInfos(const vector<WordInfo>& vec)
|
|
{
|
|
vector<string> tmp;
|
|
for(uint i = 0; i < vec.size(); i++)
|
|
{
|
|
tmp.push_back(vec[i].getInfoStr());
|
|
}
|
|
return joinStr(tmp, ",");
|
|
}
|
|
|
|
class KeyWordExt
|
|
{
|
|
private:
|
|
Segment _segment;
|
|
vector<string> _priorSubWords;
|
|
set<string> _stopWords;
|
|
public:
|
|
KeyWordExt();
|
|
~KeyWordExt();
|
|
bool init(const char * const filePath);
|
|
|
|
//load stopwords
|
|
bool loadStopWords(const char * const filePath);
|
|
|
|
//load prior words' prefix
|
|
bool loadPriorSubWords( const char * const filePath);
|
|
|
|
bool destroy();
|
|
|
|
public:
|
|
bool extract(const string& utf8Str, vector<string>& keywords, uint topN);
|
|
private:
|
|
static bool _wordInfoCompare(const WordInfo& a, const WordInfo& b);
|
|
private:
|
|
bool _extractTopN(const vector<string>& words, vector<string>& keywords, uint topN);
|
|
private:
|
|
//sort by word len - idf
|
|
bool _sortWLIDF(vector<WordInfo>& wordInfos);
|
|
private:
|
|
bool _filter(vector<string>& utf8Strs);
|
|
bool _filterDuplicate(vector<string>& utf8Strs);
|
|
bool _filterSingleWord(vector<string>& utf8Strs);
|
|
bool _filterSubstr(vector<string>& utf8Strs);
|
|
bool _filterStopWords(vector<string>& utf8Strs);
|
|
private:
|
|
bool _prioritizeSubWords(vector<WordInfo>& wordInfos);
|
|
bool _isContainSubWords(const string& word);
|
|
|
|
};
|
|
|
|
}
|
|
|
|
#endif
|