// Mirror of https://github.com/yanyiwu/cppjieba.git
// (synced 2025-07-18 00:00:12 +08:00; 61 lines, 1.5 KiB, C++)
/************************************
 * file enc : ASCII
 * author   : wuyanyi09@gmail.com
************************************/
#ifndef CPPJIEBA_KEYWORDEXT_H
#define CPPJIEBA_KEYWORDEXT_H

#include "Segment.h"
#include "structs.h"

namespace CppJieba
|
|
{
|
|
|
|
class KeyWordExt
|
|
{
|
|
private:
|
|
Segment _segment;
|
|
vector<string> _priorSubWords;
|
|
set<string> _stopWords;
|
|
public:
|
|
KeyWordExt();
|
|
~KeyWordExt();
|
|
bool init();
|
|
|
|
bool loadSegDict(const char * const filePath);
|
|
|
|
//load stopwords
|
|
bool loadStopWords(const char * const filePath);
|
|
|
|
//load prior words' prefix
|
|
bool loadPriorSubWords(const char * const filePath);
|
|
|
|
bool dispose();
|
|
|
|
public:
|
|
bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
|
|
bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);
|
|
private:
|
|
static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);
|
|
private:
|
|
bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
|
|
bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);
|
|
private:
|
|
//sort by word len - idf
|
|
bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);
|
|
private:
|
|
bool _filter(vector<KeyWordInfo>& );
|
|
bool _filterDuplicate(vector<KeyWordInfo>& );
|
|
bool _filterSingleWord(vector<KeyWordInfo>& );
|
|
bool _filterSubstr(vector<KeyWordInfo>& );
|
|
bool _filterStopWords(vector<KeyWordInfo>& );
|
|
private:
|
|
bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
|
|
bool _isContainSubWords(const string& word);
|
|
|
|
};
|
|
|
|
}

#endif