cppjieba/src/KeyWordExt.h
2013-08-19 19:01:53 +08:00

61 lines
1.5 KiB
C++

/************************************
* file enc : ASCII
* author : wuyanyi09@gmail.com
************************************/
#ifndef CPPJIEBA_KEYWORDEXT_H
#define CPPJIEBA_KEYWORDEXT_H
#include "Segment.h"
#include "structs.h"
namespace CppJieba
{
// Keyword extractor that owns a Segment plus two word lists: a set of stop
// words and a list of "prior" sub-words, each loadable from a file.
// NOTE(review): only declarations are visible in this header; the behavioural
// notes below come from the names and the original inline comments and should
// be confirmed against the implementation file.
class KeyWordExt
{
public:
    KeyWordExt();
    ~KeyWordExt();

    // Set-up / tear-down. Both report their outcome as a bool
    // (presumably true on success -- TODO confirm).
    bool init();
    bool dispose();

    // Resource loaders; each takes the path of the file to read.
    bool loadSegDict(const char* const filePath);
    // Load stop words.
    bool loadStopWords(const char* const filePath);
    // Load the prefixes of prior words.
    bool loadPriorSubWords(const char* const filePath);

    // Extraction entry points. Results are written into keyWordInfos and
    // topN bounds how many are requested (presumably an upper limit --
    // TODO confirm). One overload takes a title string, the other a list of
    // words.
    bool extract(const string& title, vector<KeyWordInfo>& keyWordInfos, uint topN);
    bool extract(const vector<string>& words, vector<KeyWordInfo>& keyWordInfos, uint topN);

private:
    // Binary predicate over two KeyWordInfo entries
    // (presumably the ordering used when sorting -- TODO confirm).
    static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b);

    // Internal extraction steps operating in place on the candidate list.
    bool _extract(vector<KeyWordInfo>& keyWordInfos, uint topN);
    bool _extTopN(vector<KeyWordInfo>& wordInfos, uint topN);

    // Sort by word length and IDF.
    bool _sortWLIDF(vector<KeyWordInfo>& wordInfos);

    // Filtering passes over the candidate list; the names indicate what each
    // pass targets (duplicates, single words, substrings, stop words).
    bool _filter(vector<KeyWordInfo>& wordInfos);
    bool _filterDuplicate(vector<KeyWordInfo>& wordInfos);
    bool _filterSingleWord(vector<KeyWordInfo>& wordInfos);
    bool _filterSubstr(vector<KeyWordInfo>& wordInfos);
    bool _filterStopWords(vector<KeyWordInfo>& wordInfos);

    // Prior sub-word handling.
    bool _prioritizeSubWords(vector<KeyWordInfo>& wordInfos);
    bool _isContainSubWords(const string& word);

    // State. The relative order of these members is kept as originally
    // declared so construction/destruction order is unchanged.
    Segment _segment;
    vector<string> _priorSubWords;
    set<string> _stopWords;
};
}
#endif