mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
code style
This commit is contained in:
parent
bb32234654
commit
2b18a582fc
@ -26,60 +26,60 @@ class DictTrie {
|
||||
public:
|
||||
|
||||
DictTrie() {
|
||||
_trie = NULL;
|
||||
_minWeight = MAX_DOUBLE;
|
||||
trie_ = NULL;
|
||||
minWeight_ = MAX_DOUBLE;
|
||||
}
|
||||
DictTrie(const string& dictPath, const string& userDictPath = "") {
|
||||
new (this) DictTrie();
|
||||
init(dictPath, userDictPath);
|
||||
}
|
||||
~DictTrie() {
|
||||
if(_trie) {
|
||||
delete _trie;
|
||||
if(trie_) {
|
||||
delete trie_;
|
||||
}
|
||||
}
|
||||
|
||||
bool init(const string& dictPath, const string& userDictPath = "") {
|
||||
if(_trie != NULL) {
|
||||
if(trie_ != NULL) {
|
||||
LogFatal("trie already initted");
|
||||
}
|
||||
_loadDict(dictPath);
|
||||
_calculateWeight(_nodeInfos);
|
||||
_minWeight = _findMinWeight(_nodeInfos);
|
||||
loadDict_(dictPath);
|
||||
calculateWeight_(nodeInfos_);
|
||||
minWeight_ = findMinWeight_(nodeInfos_);
|
||||
|
||||
if(userDictPath.size()) {
|
||||
double maxWeight = _findMaxWeight(_nodeInfos);
|
||||
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
|
||||
double maxWeight = findMaxWeight_(nodeInfos_);
|
||||
loadUserDict_(userDictPath, maxWeight, UNKNOWN_TAG);
|
||||
}
|
||||
_shrink(_nodeInfos);
|
||||
_trie = _createTrie(_nodeInfos);
|
||||
assert(_trie);
|
||||
shrink_(nodeInfos_);
|
||||
trie_ = createTrie_(nodeInfos_);
|
||||
assert(trie_);
|
||||
return true;
|
||||
}
|
||||
|
||||
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||
return _trie->find(begin, end);
|
||||
return trie_->find(begin, end);
|
||||
}
|
||||
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const {
|
||||
return _trie->find(begin, end, dag, offset);
|
||||
return trie_->find(begin, end, dag, offset);
|
||||
}
|
||||
void find(
|
||||
Unicode::const_iterator begin,
|
||||
Unicode::const_iterator end,
|
||||
vector<SegmentChar>& res
|
||||
) const {
|
||||
_trie->find(begin, end, res);
|
||||
trie_->find(begin, end, res);
|
||||
}
|
||||
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
|
||||
return isIn(_userDictSingleChineseWord, word);
|
||||
return isIn(userDictSingleChineseWord_, word);
|
||||
}
|
||||
double getMinWeight() const {
|
||||
return _minWeight;
|
||||
return minWeight_;
|
||||
};
|
||||
|
||||
|
||||
private:
|
||||
Trie * _createTrie(const vector<DictUnit>& dictUnits) {
|
||||
Trie * createTrie_(const vector<DictUnit>& dictUnits) {
|
||||
assert(dictUnits.size());
|
||||
vector<Unicode> words;
|
||||
vector<const DictUnit*> valuePointers;
|
||||
@ -91,7 +91,7 @@ class DictTrie {
|
||||
Trie * trie = new Trie(words, valuePointers);
|
||||
return trie;
|
||||
}
|
||||
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) {
|
||||
void loadUserDict_(const string& filePath, double defaultWeight, const string& defaultTag) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
if(!ifs.is_open()) {
|
||||
LogFatal("file %s open failed.", filePath.c_str());
|
||||
@ -111,15 +111,15 @@ class DictTrie {
|
||||
continue;
|
||||
}
|
||||
if(nodeInfo.word.size() == 1) {
|
||||
_userDictSingleChineseWord.insert(nodeInfo.word[0]);
|
||||
userDictSingleChineseWord_.insert(nodeInfo.word[0]);
|
||||
}
|
||||
nodeInfo.weight = defaultWeight;
|
||||
nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag);
|
||||
_nodeInfos.push_back(nodeInfo);
|
||||
nodeInfos_.push_back(nodeInfo);
|
||||
}
|
||||
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
|
||||
}
|
||||
void _loadDict(const string& filePath) {
|
||||
void loadDict_(const string& filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
if(!ifs.is_open()) {
|
||||
LogFatal("file %s open failed.", filePath.c_str());
|
||||
@ -141,17 +141,17 @@ class DictTrie {
|
||||
nodeInfo.weight = atof(buf[1].c_str());
|
||||
nodeInfo.tag = buf[2];
|
||||
|
||||
_nodeInfos.push_back(nodeInfo);
|
||||
nodeInfos_.push_back(nodeInfo);
|
||||
}
|
||||
}
|
||||
double _findMinWeight(const vector<DictUnit>& nodeInfos) const {
|
||||
double findMinWeight_(const vector<DictUnit>& nodeInfos) const {
|
||||
double ret = MAX_DOUBLE;
|
||||
for(size_t i = 0; i < nodeInfos.size(); i++) {
|
||||
ret = min(nodeInfos[i].weight, ret);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const {
|
||||
double findMaxWeight_(const vector<DictUnit>& nodeInfos) const {
|
||||
double ret = MIN_DOUBLE;
|
||||
for(size_t i = 0; i < nodeInfos.size(); i++) {
|
||||
ret = max(nodeInfos[i].weight, ret);
|
||||
@ -159,7 +159,7 @@ class DictTrie {
|
||||
return ret;
|
||||
}
|
||||
|
||||
void _calculateWeight(vector<DictUnit>& nodeInfos) const {
|
||||
void calculateWeight_(vector<DictUnit>& nodeInfos) const {
|
||||
double sum = 0.0;
|
||||
for(size_t i = 0; i < nodeInfos.size(); i++) {
|
||||
sum += nodeInfos[i].weight;
|
||||
@ -172,16 +172,16 @@ class DictTrie {
|
||||
}
|
||||
}
|
||||
|
||||
void _shrink(vector<DictUnit>& units) const {
|
||||
void shrink_(vector<DictUnit>& units) const {
|
||||
vector<DictUnit>(units.begin(), units.end()).swap(units);
|
||||
}
|
||||
|
||||
private:
|
||||
vector<DictUnit> _nodeInfos;
|
||||
Trie * _trie;
|
||||
vector<DictUnit> nodeInfos_;
|
||||
Trie * trie_;
|
||||
|
||||
double _minWeight;
|
||||
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
|
||||
double minWeight_;
|
||||
unordered_set<Unicode::value_type> userDictSingleChineseWord_;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -14,40 +14,40 @@ namespace CppJieba {
|
||||
class FullSegment: public SegmentBase {
|
||||
public:
|
||||
FullSegment() {
|
||||
_dictTrie = NULL;
|
||||
_isBorrowed = false;
|
||||
dictTrie_ = NULL;
|
||||
isBorrowed_ = false;
|
||||
}
|
||||
explicit FullSegment(const string& dictPath) {
|
||||
_dictTrie = NULL;
|
||||
dictTrie_ = NULL;
|
||||
init(dictPath);
|
||||
}
|
||||
explicit FullSegment(const DictTrie* dictTrie) {
|
||||
_dictTrie = NULL;
|
||||
dictTrie_ = NULL;
|
||||
init(dictTrie);
|
||||
}
|
||||
virtual ~FullSegment() {
|
||||
if(_dictTrie && ! _isBorrowed) {
|
||||
delete _dictTrie;
|
||||
if(dictTrie_ && ! isBorrowed_) {
|
||||
delete dictTrie_;
|
||||
}
|
||||
|
||||
};
|
||||
bool init(const string& dictPath) {
|
||||
assert(_dictTrie == NULL);
|
||||
_dictTrie = new DictTrie(dictPath);
|
||||
_isBorrowed = false;
|
||||
assert(dictTrie_ == NULL);
|
||||
dictTrie_ = new DictTrie(dictPath);
|
||||
isBorrowed_ = false;
|
||||
return true;
|
||||
}
|
||||
bool init(const DictTrie* dictTrie) {
|
||||
assert(_dictTrie == NULL);
|
||||
assert(dictTrie_ == NULL);
|
||||
assert(dictTrie);
|
||||
_dictTrie = dictTrie;
|
||||
_isBorrowed = true;
|
||||
dictTrie_ = dictTrie;
|
||||
isBorrowed_ = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
using SegmentBase::cut;
|
||||
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||
assert(_dictTrie);
|
||||
assert(dictTrie_);
|
||||
if (begin >= end) {
|
||||
LogError("begin >= end");
|
||||
return false;
|
||||
@ -66,7 +66,7 @@ class FullSegment: public SegmentBase {
|
||||
int wordLen = 0;
|
||||
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
|
||||
//find word start from uItr
|
||||
if (_dictTrie->find(uItr, end, tRes, 0)) {
|
||||
if (dictTrie_->find(uItr, end, tRes, 0)) {
|
||||
for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||
//for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||
{
|
||||
@ -93,7 +93,7 @@ class FullSegment: public SegmentBase {
|
||||
}
|
||||
|
||||
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
|
||||
assert(_dictTrie);
|
||||
assert(dictTrie_);
|
||||
if (begin >= end) {
|
||||
LogError("begin >= end");
|
||||
return false;
|
||||
@ -117,8 +117,8 @@ class FullSegment: public SegmentBase {
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
const DictTrie* _dictTrie;
|
||||
bool _isBorrowed;
|
||||
const DictTrie* dictTrie_;
|
||||
bool isBorrowed_;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -31,17 +31,17 @@ class HMMSegment: public SegmentBase {
|
||||
virtual ~HMMSegment() {}
|
||||
public:
|
||||
bool init(const string& filePath) {
|
||||
memset(_startProb, 0, sizeof(_startProb));
|
||||
memset(_transProb, 0, sizeof(_transProb));
|
||||
_statMap[0] = 'B';
|
||||
_statMap[1] = 'E';
|
||||
_statMap[2] = 'M';
|
||||
_statMap[3] = 'S';
|
||||
_emitProbVec.push_back(&_emitProbB);
|
||||
_emitProbVec.push_back(&_emitProbE);
|
||||
_emitProbVec.push_back(&_emitProbM);
|
||||
_emitProbVec.push_back(&_emitProbS);
|
||||
LIMONP_CHECK(_loadModel(filePath.c_str()));
|
||||
memset(startProb_, 0, sizeof(startProb_));
|
||||
memset(transProb_, 0, sizeof(transProb_));
|
||||
statMap_[0] = 'B';
|
||||
statMap_[1] = 'E';
|
||||
statMap_[2] = 'M';
|
||||
statMap_[3] = 'S';
|
||||
emitProbVec_.push_back(&emitProbB_);
|
||||
emitProbVec_.push_back(&emitProbE_);
|
||||
emitProbVec_.push_back(&emitProbM_);
|
||||
emitProbVec_.push_back(&emitProbS_);
|
||||
LIMONP_CHECK(loadModel_(filePath.c_str()));
|
||||
LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
|
||||
return true;
|
||||
}
|
||||
@ -53,16 +53,16 @@ class HMMSegment: public SegmentBase {
|
||||
Unicode::const_iterator right = begin;
|
||||
while(right != end) {
|
||||
if(*right < 0x80) {
|
||||
if(left != right && !_cut(left, right, res)) {
|
||||
if(left != right && !cut_(left, right, res)) {
|
||||
return false;
|
||||
}
|
||||
left = right;
|
||||
do {
|
||||
right = _sequentialLetterRule(left, end);
|
||||
right = sequentialLetterRule_(left, end);
|
||||
if(right != left) {
|
||||
break;
|
||||
}
|
||||
right = _numbersRule(left, end);
|
||||
right = numbersRule_(left, end);
|
||||
if(right != left) {
|
||||
break;
|
||||
}
|
||||
@ -74,7 +74,7 @@ class HMMSegment: public SegmentBase {
|
||||
right++;
|
||||
}
|
||||
}
|
||||
if(left != right && !_cut(left, right, res)) {
|
||||
if(left != right && !cut_(left, right, res)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@ -100,7 +100,7 @@ class HMMSegment: public SegmentBase {
|
||||
}
|
||||
private:
|
||||
// sequential letters rule
|
||||
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||
Unicode::const_iterator sequentialLetterRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||
Unicode::value_type x = *begin;
|
||||
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
|
||||
begin ++;
|
||||
@ -118,7 +118,7 @@ class HMMSegment: public SegmentBase {
|
||||
return begin;
|
||||
}
|
||||
//
|
||||
Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||
Unicode::const_iterator numbersRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||
Unicode::value_type x = *begin;
|
||||
if('0' <= x && x <= '9') {
|
||||
begin ++;
|
||||
@ -135,10 +135,10 @@ class HMMSegment: public SegmentBase {
|
||||
}
|
||||
return begin;
|
||||
}
|
||||
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||
bool cut_(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||
vector<size_t> status;
|
||||
if(!_viterbi(begin, end, status)) {
|
||||
LogError("_viterbi failed.");
|
||||
if(!viterbi_(begin, end, status)) {
|
||||
LogError("viterbi_ failed.");
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -154,7 +154,7 @@ class HMMSegment: public SegmentBase {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
|
||||
bool viterbi_(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
|
||||
if(begin == end) {
|
||||
return false;
|
||||
}
|
||||
@ -171,7 +171,7 @@ class HMMSegment: public SegmentBase {
|
||||
|
||||
//start
|
||||
for(size_t y = 0; y < Y; y++) {
|
||||
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
|
||||
weight[0 + y * X] = startProb_[y] + getEmitProb_(emitProbVec_[y], *begin, MIN_DOUBLE);
|
||||
path[0 + y * X] = -1;
|
||||
}
|
||||
|
||||
@ -183,10 +183,10 @@ class HMMSegment: public SegmentBase {
|
||||
now = x + y*X;
|
||||
weight[now] = MIN_DOUBLE;
|
||||
path[now] = E; // warning
|
||||
emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
|
||||
emitProb = getEmitProb_(emitProbVec_[y], *(begin+x), MIN_DOUBLE);
|
||||
for(size_t preY = 0; preY < Y; preY++) {
|
||||
old = x - 1 + preY * X;
|
||||
tmp = weight[old] + _transProb[preY][y] + emitProb;
|
||||
tmp = weight[old] + transProb_[preY][y] + emitProb;
|
||||
if(tmp > weight[now]) {
|
||||
weight[now] = tmp;
|
||||
path[now] = preY;
|
||||
@ -212,13 +212,13 @@ class HMMSegment: public SegmentBase {
|
||||
|
||||
return true;
|
||||
}
|
||||
bool _loadModel(const char* const filePath) {
|
||||
bool loadModel_(const char* const filePath) {
|
||||
ifstream ifile(filePath);
|
||||
string line;
|
||||
vector<string> tmp;
|
||||
vector<string> tmp2;
|
||||
//load _startProb
|
||||
if(!_getLine(ifile, line)) {
|
||||
//load startProb_
|
||||
if(!getLine_(ifile, line)) {
|
||||
return false;
|
||||
}
|
||||
split(line, tmp, " ");
|
||||
@ -227,12 +227,12 @@ class HMMSegment: public SegmentBase {
|
||||
return false;
|
||||
}
|
||||
for(size_t j = 0; j< tmp.size(); j++) {
|
||||
_startProb[j] = atof(tmp[j].c_str());
|
||||
startProb_[j] = atof(tmp[j].c_str());
|
||||
}
|
||||
|
||||
//load _transProb
|
||||
//load transProb_
|
||||
for(size_t i = 0; i < STATUS_SUM; i++) {
|
||||
if(!_getLine(ifile, line)) {
|
||||
if(!getLine_(ifile, line)) {
|
||||
return false;
|
||||
}
|
||||
split(line, tmp, " ");
|
||||
@ -241,33 +241,33 @@ class HMMSegment: public SegmentBase {
|
||||
return false;
|
||||
}
|
||||
for(size_t j =0; j < STATUS_SUM; j++) {
|
||||
_transProb[i][j] = atof(tmp[j].c_str());
|
||||
transProb_[i][j] = atof(tmp[j].c_str());
|
||||
}
|
||||
}
|
||||
|
||||
//load _emitProbB
|
||||
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) {
|
||||
//load emitProbB_
|
||||
if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbB_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
//load _emitProbE
|
||||
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) {
|
||||
//load emitProbE_
|
||||
if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbE_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
//load _emitProbM
|
||||
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) {
|
||||
//load emitProbM_
|
||||
if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbM_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
//load _emitProbS
|
||||
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) {
|
||||
//load emitProbS_
|
||||
if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbS_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
bool _getLine(ifstream& ifile, string& line) {
|
||||
bool getLine_(ifstream& ifile, string& line) {
|
||||
while(getline(ifile, line)) {
|
||||
trim(line);
|
||||
if(line.empty()) {
|
||||
@ -280,7 +280,7 @@ class HMMSegment: public SegmentBase {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
bool _loadEmitProb(const string& line, EmitProbMap& mp) {
|
||||
bool loadEmitProb_(const string& line, EmitProbMap& mp) {
|
||||
if(line.empty()) {
|
||||
return false;
|
||||
}
|
||||
@ -290,7 +290,7 @@ class HMMSegment: public SegmentBase {
|
||||
for(size_t i = 0; i < tmp.size(); i++) {
|
||||
split(tmp[i], tmp2, ":");
|
||||
if(2 != tmp2.size()) {
|
||||
LogError("_emitProb illegal.");
|
||||
LogError("emitProb_ illegal.");
|
||||
return false;
|
||||
}
|
||||
if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) {
|
||||
@ -301,7 +301,7 @@ class HMMSegment: public SegmentBase {
|
||||
}
|
||||
return true;
|
||||
}
|
||||
double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
|
||||
double getEmitProb_(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
|
||||
EmitProbMap::const_iterator cit = ptMp->find(key);
|
||||
if(cit == ptMp->end()) {
|
||||
return defVal;
|
||||
@ -311,14 +311,14 @@ class HMMSegment: public SegmentBase {
|
||||
}
|
||||
|
||||
private:
|
||||
char _statMap[STATUS_SUM];
|
||||
double _startProb[STATUS_SUM];
|
||||
double _transProb[STATUS_SUM][STATUS_SUM];
|
||||
EmitProbMap _emitProbB;
|
||||
EmitProbMap _emitProbE;
|
||||
EmitProbMap _emitProbM;
|
||||
EmitProbMap _emitProbS;
|
||||
vector<EmitProbMap* > _emitProbVec;
|
||||
char statMap_[STATUS_SUM];
|
||||
double startProb_[STATUS_SUM];
|
||||
double transProb_[STATUS_SUM][STATUS_SUM];
|
||||
EmitProbMap emitProbB_;
|
||||
EmitProbMap emitProbE_;
|
||||
EmitProbMap emitProbM_;
|
||||
EmitProbMap emitProbS_;
|
||||
vector<EmitProbMap* > emitProbVec_;
|
||||
|
||||
};
|
||||
}
|
||||
|
@ -18,9 +18,9 @@ class KeywordExtractor {
|
||||
~KeywordExtractor() {};
|
||||
|
||||
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
|
||||
_loadIdfDict(idfPath);
|
||||
_loadStopWordDict(stopWordPath);
|
||||
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
|
||||
loadIdfDict_(idfPath);
|
||||
loadStopWordDict_(stopWordPath);
|
||||
LIMONP_CHECK(segment_.init(dictPath, hmmFilePath, userDict));
|
||||
};
|
||||
|
||||
bool extract(const string& str, vector<string>& keywords, size_t topN) const {
|
||||
@ -36,30 +36,30 @@ class KeywordExtractor {
|
||||
|
||||
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
|
||||
vector<string> words;
|
||||
if(!_segment.cut(str, words)) {
|
||||
if(!segment_.cut(str, words)) {
|
||||
LogError("segment cut(%s) failed.", str.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
map<string, double> wordmap;
|
||||
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
|
||||
if(_isSingleWord(*iter)) {
|
||||
if(isSingleWord_(*iter)) {
|
||||
continue;
|
||||
}
|
||||
wordmap[*iter] += 1.0;
|
||||
}
|
||||
|
||||
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
|
||||
if(_stopWords.end() != _stopWords.find(itr->first)) {
|
||||
if(stopWords_.end() != stopWords_.find(itr->first)) {
|
||||
wordmap.erase(itr++);
|
||||
continue;
|
||||
}
|
||||
|
||||
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
|
||||
if(cit != _idfMap.end()) {
|
||||
unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
|
||||
if(cit != idfMap_.end()) {
|
||||
itr->second *= cit->second;
|
||||
} else {
|
||||
itr->second *= _idfAverage;
|
||||
itr->second *= idfAverage_;
|
||||
}
|
||||
itr ++;
|
||||
}
|
||||
@ -67,12 +67,12 @@ class KeywordExtractor {
|
||||
keywords.clear();
|
||||
std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
|
||||
topN = min(topN, keywords.size());
|
||||
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
|
||||
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), cmp_);
|
||||
keywords.resize(topN);
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
void _loadIdfDict(const string& idfPath) {
|
||||
void loadIdfDict_(const string& idfPath) {
|
||||
ifstream ifs(idfPath.c_str());
|
||||
if(!ifs.is_open()) {
|
||||
LogFatal("open %s failed.", idfPath.c_str());
|
||||
@ -93,28 +93,28 @@ class KeywordExtractor {
|
||||
continue;
|
||||
}
|
||||
idf = atof(buf[1].c_str());
|
||||
_idfMap[buf[0]] = idf;
|
||||
idfMap_[buf[0]] = idf;
|
||||
idfSum += idf;
|
||||
|
||||
}
|
||||
|
||||
assert(lineno);
|
||||
_idfAverage = idfSum / lineno;
|
||||
assert(_idfAverage > 0.0);
|
||||
idfAverage_ = idfSum / lineno;
|
||||
assert(idfAverage_ > 0.0);
|
||||
}
|
||||
void _loadStopWordDict(const string& filePath) {
|
||||
void loadStopWordDict_(const string& filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
if(!ifs.is_open()) {
|
||||
LogFatal("open %s failed.", filePath.c_str());
|
||||
}
|
||||
string line ;
|
||||
while(getline(ifs, line)) {
|
||||
_stopWords.insert(line);
|
||||
stopWords_.insert(line);
|
||||
}
|
||||
assert(_stopWords.size());
|
||||
assert(stopWords_.size());
|
||||
}
|
||||
|
||||
bool _isSingleWord(const string& str) const {
|
||||
bool isSingleWord_(const string& str) const {
|
||||
Unicode unicode;
|
||||
TransCode::decode(str, unicode);
|
||||
if(unicode.size() == 1)
|
||||
@ -122,16 +122,16 @@ class KeywordExtractor {
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) {
|
||||
static bool cmp_(const pair<string, double>& lhs, const pair<string, double>& rhs) {
|
||||
return lhs.second > rhs.second;
|
||||
}
|
||||
|
||||
private:
|
||||
MixSegment _segment;
|
||||
unordered_map<string, double> _idfMap;
|
||||
double _idfAverage;
|
||||
MixSegment segment_;
|
||||
unordered_map<string, double> idfMap_;
|
||||
double idfAverage_;
|
||||
|
||||
unordered_set<string> _stopWords;
|
||||
unordered_set<string> stopWords_;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -21,12 +21,12 @@ class MPSegment: public SegmentBase {
|
||||
virtual ~MPSegment() {};
|
||||
|
||||
bool init(const string& dictPath, const string& userDictPath = "") {
|
||||
LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
|
||||
LIMONP_CHECK(dictTrie_.init(dictPath, userDictPath));
|
||||
LogInfo("MPSegment init(%s) ok", dictPath.c_str());
|
||||
return true;
|
||||
}
|
||||
bool isUserDictSingleChineseWord(const Unicode::value_type & value) const {
|
||||
return _dictTrie.isUserDictSingleChineseWord(value);
|
||||
return dictTrie_.isUserDictSingleChineseWord(value);
|
||||
}
|
||||
|
||||
using SegmentBase::cut;
|
||||
@ -57,20 +57,20 @@ class MPSegment: public SegmentBase {
|
||||
}
|
||||
vector<SegmentChar> segmentChars;
|
||||
|
||||
_dictTrie.find(begin, end, segmentChars);
|
||||
dictTrie_.find(begin, end, segmentChars);
|
||||
|
||||
_calcDP(segmentChars);
|
||||
calcDP_(segmentChars);
|
||||
|
||||
_cut(segmentChars, res);
|
||||
cut_(segmentChars, res);
|
||||
|
||||
return true;
|
||||
}
|
||||
const DictTrie* getDictTrie() const {
|
||||
return &_dictTrie;
|
||||
return &dictTrie_;
|
||||
}
|
||||
|
||||
private:
|
||||
void _calcDP(vector<SegmentChar>& segmentChars) const {
|
||||
void calcDP_(vector<SegmentChar>& segmentChars) const {
|
||||
size_t nextPos;
|
||||
const DictUnit* p;
|
||||
double val;
|
||||
@ -90,7 +90,7 @@ class MPSegment: public SegmentBase {
|
||||
if(p) {
|
||||
val += p->weight;
|
||||
} else {
|
||||
val += _dictTrie.getMinWeight();
|
||||
val += dictTrie_.getMinWeight();
|
||||
}
|
||||
if(val > segmentChars[i].weight) {
|
||||
segmentChars[i].pInfo = p;
|
||||
@ -99,7 +99,7 @@ class MPSegment: public SegmentBase {
|
||||
}
|
||||
}
|
||||
}
|
||||
void _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const {
|
||||
void cut_(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const {
|
||||
size_t i = 0;
|
||||
while(i < segmentChars.size()) {
|
||||
const DictUnit* p = segmentChars[i].pInfo;
|
||||
@ -114,7 +114,7 @@ class MPSegment: public SegmentBase {
|
||||
}
|
||||
|
||||
private:
|
||||
DictTrie _dictTrie;
|
||||
DictTrie dictTrie_;
|
||||
|
||||
};
|
||||
}
|
||||
|
@ -17,8 +17,8 @@ class MixSegment: public SegmentBase {
|
||||
virtual ~MixSegment() {
|
||||
}
|
||||
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") {
|
||||
LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
|
||||
LIMONP_CHECK(_hmmSeg.init(hmmSegDict));
|
||||
LIMONP_CHECK(mpSeg_.init(mpSegDict, userDict));
|
||||
LIMONP_CHECK(hmmSeg_.init(hmmSegDict));
|
||||
LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
|
||||
return true;
|
||||
}
|
||||
@ -26,7 +26,7 @@ class MixSegment: public SegmentBase {
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||
vector<Unicode> words;
|
||||
words.reserve(end - begin);
|
||||
if(!_mpSeg.cut(begin, end, words)) {
|
||||
if(!mpSeg_.cut(begin, end, words)) {
|
||||
LogError("mpSeg cutDAG failed.");
|
||||
return false;
|
||||
}
|
||||
@ -37,21 +37,21 @@ class MixSegment: public SegmentBase {
|
||||
piece.reserve(end - begin);
|
||||
for (size_t i = 0, j = 0; i < words.size(); i++) {
|
||||
//if mp get a word, it's ok, put it into result
|
||||
if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) {
|
||||
if (1 != words[i].size() || (words[i].size() == 1 && mpSeg_.isUserDictSingleChineseWord(words[i][0]))) {
|
||||
res.push_back(words[i]);
|
||||
continue;
|
||||
}
|
||||
|
||||
// if mp get a single one and it is not in userdict, collect it in sequence
|
||||
j = i;
|
||||
while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) {
|
||||
while (j < words.size() && 1 == words[j].size() && !mpSeg_.isUserDictSingleChineseWord(words[j][0])) {
|
||||
piece.push_back(words[j][0]);
|
||||
j++;
|
||||
}
|
||||
|
||||
// cut the sequence with hmm
|
||||
if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) {
|
||||
LogError("_hmmSeg cut failed.");
|
||||
if (!hmmSeg_.cut(piece.begin(), piece.end(), hmmRes)) {
|
||||
LogError("hmmSeg_ cut failed.");
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -92,11 +92,11 @@ class MixSegment: public SegmentBase {
|
||||
}
|
||||
|
||||
const DictTrie* getDictTrie() const {
|
||||
return _mpSeg.getDictTrie();
|
||||
return mpSeg_.getDictTrie();
|
||||
}
|
||||
private:
|
||||
MPSegment _mpSeg;
|
||||
HMMSegment _hmmSeg;
|
||||
MPSegment mpSeg_;
|
||||
HMMSegment hmmSeg_;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -30,16 +30,16 @@ class PosTagger {
|
||||
const string& hmmFilePath,
|
||||
const string& userDictPath = ""
|
||||
) {
|
||||
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));
|
||||
_dictTrie = _segment.getDictTrie();
|
||||
LIMONP_CHECK(_dictTrie);
|
||||
LIMONP_CHECK(segment_.init(dictPath, hmmFilePath, userDictPath));
|
||||
dictTrie_ = segment_.getDictTrie();
|
||||
LIMONP_CHECK(dictTrie_);
|
||||
};
|
||||
|
||||
|
||||
bool tag(const string& src, vector<pair<string, string> >& res) const {
|
||||
vector<string> cutRes;
|
||||
if (!_segment.cut(src, cutRes)) {
|
||||
LogError("_mixSegment cut failed");
|
||||
if (!segment_.cut(src, cutRes)) {
|
||||
LogError("mixSegment_ cut failed");
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -50,9 +50,9 @@ class PosTagger {
|
||||
LogError("decode failed.");
|
||||
return false;
|
||||
}
|
||||
tmp = _dictTrie->find(unico.begin(), unico.end());
|
||||
tmp = dictTrie_->find(unico.begin(), unico.end());
|
||||
if(tmp == NULL || tmp->tag.empty()) {
|
||||
res.push_back(make_pair(*itr, _specialRule(unico)));
|
||||
res.push_back(make_pair(*itr, specialRule_(unico)));
|
||||
} else {
|
||||
res.push_back(make_pair(*itr, tmp->tag));
|
||||
}
|
||||
@ -60,7 +60,7 @@ class PosTagger {
|
||||
return !res.empty();
|
||||
}
|
||||
private:
|
||||
const char* _specialRule(const Unicode& unicode) const {
|
||||
const char* specialRule_(const Unicode& unicode) const {
|
||||
size_t m = 0;
|
||||
size_t eng = 0;
|
||||
for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
|
||||
@ -83,8 +83,8 @@ class PosTagger {
|
||||
return POS_ENG;
|
||||
}
|
||||
private:
|
||||
MixSegment _segment;
|
||||
const DictTrie * _dictTrie;
|
||||
MixSegment segment_;
|
||||
const DictTrie * dictTrie_;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -22,10 +22,10 @@ class QuerySegment: public SegmentBase {
|
||||
};
|
||||
virtual ~QuerySegment() {};
|
||||
bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "") {
|
||||
LIMONP_CHECK(_mixSeg.init(dict, model, userDict));
|
||||
LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie()));
|
||||
LIMONP_CHECK(mixSeg_.init(dict, model, userDict));
|
||||
LIMONP_CHECK(fullSeg_.init(mixSeg_.getDictTrie()));
|
||||
assert(maxWordLen);
|
||||
_maxWordLen = maxWordLen;
|
||||
maxWordLen_ = maxWordLen;
|
||||
return true;
|
||||
}
|
||||
using SegmentBase::cut;
|
||||
@ -37,17 +37,17 @@ class QuerySegment: public SegmentBase {
|
||||
|
||||
//use mix cut first
|
||||
vector<Unicode> mixRes;
|
||||
if (!_mixSeg.cut(begin, end, mixRes)) {
|
||||
LogError("_mixSeg cut failed.");
|
||||
if (!mixSeg_.cut(begin, end, mixRes)) {
|
||||
LogError("mixSeg_ cut failed.");
|
||||
return false;
|
||||
}
|
||||
|
||||
vector<Unicode> fullRes;
|
||||
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
||||
|
||||
// if it's too long, cut with _fullSeg, put fullRes in res
|
||||
if (mixResItr->size() > _maxWordLen) {
|
||||
if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes)) {
|
||||
// if it's too long, cut with fullSeg_, put fullRes in res
|
||||
if (mixResItr->size() > maxWordLen_) {
|
||||
if (fullSeg_.cut(mixResItr->begin(), mixResItr->end(), fullRes)) {
|
||||
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) {
|
||||
res.push_back(*fullResItr);
|
||||
}
|
||||
@ -88,9 +88,9 @@ class QuerySegment: public SegmentBase {
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
MixSegment _mixSeg;
|
||||
FullSegment _fullSeg;
|
||||
size_t _maxWordLen;
|
||||
MixSegment mixSeg_;
|
||||
FullSegment fullSeg_;
|
||||
size_t maxWordLen_;
|
||||
|
||||
};
|
||||
}
|
||||
|
@ -22,7 +22,7 @@ const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u};
|
||||
class SegmentBase: public ISegment, public NonCopyable {
|
||||
public:
|
||||
SegmentBase() {
|
||||
_loadSpecialSymbols();
|
||||
loadSpecialSymbols_();
|
||||
};
|
||||
virtual ~SegmentBase() {};
|
||||
public:
|
||||
@ -39,7 +39,7 @@ class SegmentBase: public ISegment, public NonCopyable {
|
||||
Unicode::const_iterator right;
|
||||
|
||||
for(right = unicode.begin(); right != unicode.end(); right++) {
|
||||
if(isIn(_specialSymbols, *right)) {
|
||||
if(isIn(specialSymbols_, *right)) {
|
||||
if(left != right) {
|
||||
cut(left, right, res);
|
||||
}
|
||||
@ -55,15 +55,15 @@ class SegmentBase: public ISegment, public NonCopyable {
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
void _loadSpecialSymbols() {
|
||||
void loadSpecialSymbols_() {
|
||||
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
||||
for(size_t i = 0; i < size; i ++) {
|
||||
_specialSymbols.insert(SPECIAL_SYMBOL[i]);
|
||||
specialSymbols_.insert(SPECIAL_SYMBOL[i]);
|
||||
}
|
||||
assert(_specialSymbols.size());
|
||||
assert(specialSymbols_.size());
|
||||
}
|
||||
private:
|
||||
unordered_set<UnicodeValueType> _specialSymbols;
|
||||
unordered_set<UnicodeValueType> specialSymbols_;
|
||||
|
||||
};
|
||||
}
|
||||
|
50
src/Trie.hpp
50
src/Trie.hpp
@ -61,19 +61,19 @@ class TrieNode {
|
||||
class Trie {
|
||||
public:
|
||||
Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
|
||||
_root = new TrieNode;
|
||||
_createTrie(keys, valuePointers);
|
||||
_build();// build automation
|
||||
root_ = new TrieNode;
|
||||
createTrie_(keys, valuePointers);
|
||||
build_();// build automation
|
||||
}
|
||||
~Trie() {
|
||||
if(_root) {
|
||||
_deleteNode(_root);
|
||||
if(root_) {
|
||||
deleteNode_(root_);
|
||||
}
|
||||
}
|
||||
public:
|
||||
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||
TrieNode::NextMap::const_iterator citer;
|
||||
const TrieNode* ptNode = _root;
|
||||
const TrieNode* ptNode = root_;
|
||||
for(Unicode::const_iterator it = begin; it != end; it++) {
|
||||
// build automation
|
||||
assert(ptNode);
|
||||
@ -91,7 +91,7 @@ class Trie {
|
||||
vector<struct SegmentChar>& res
|
||||
) const {
|
||||
res.resize(end - begin);
|
||||
const TrieNode * now = _root;
|
||||
const TrieNode * now = root_;
|
||||
const TrieNode* node;
|
||||
// compiler will complain warnings if only "i < end - begin" .
|
||||
for (size_t i = 0; i < size_t(end - begin); i++) {
|
||||
@ -102,7 +102,7 @@ class Trie {
|
||||
bool flag = false;
|
||||
|
||||
// rollback
|
||||
while( now != _root ) {
|
||||
while( now != root_ ) {
|
||||
node = now->findNext(ch);
|
||||
if (node != NULL) {
|
||||
flag = true;
|
||||
@ -116,11 +116,11 @@ class Trie {
|
||||
node = now->findNext(ch);
|
||||
}
|
||||
if(node == NULL) {
|
||||
now = _root;
|
||||
now = root_;
|
||||
} else {
|
||||
now = node;
|
||||
const TrieNode * temp = now;
|
||||
while(temp != _root) {
|
||||
while(temp != root_) {
|
||||
if (temp->ptValue) {
|
||||
size_t pos = i - temp->ptValue->word.size() + 1;
|
||||
res[pos].dag.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(i, temp->ptValue));
|
||||
@ -139,7 +139,7 @@ class Trie {
|
||||
Unicode::const_iterator end,
|
||||
DagType & res,
|
||||
size_t offset = 0) const {
|
||||
const TrieNode * ptNode = _root;
|
||||
const TrieNode * ptNode = root_;
|
||||
TrieNode::NextMap::const_iterator citer;
|
||||
for(Unicode::const_iterator itr = begin; itr != end ; itr++) {
|
||||
assert(ptNode);
|
||||
@ -158,13 +158,13 @@ class Trie {
|
||||
return !res.empty();
|
||||
}
|
||||
private:
|
||||
void _build() {
|
||||
void build_() {
|
||||
queue<TrieNode*> que;
|
||||
assert(_root->ptValue == NULL);
|
||||
assert(_root->next);
|
||||
_root->fail = NULL;
|
||||
for(TrieNode::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) {
|
||||
iter->second->fail = _root;
|
||||
assert(root_->ptValue == NULL);
|
||||
assert(root_->next);
|
||||
root_->fail = NULL;
|
||||
for(TrieNode::NextMap::iterator iter = root_->next->begin(); iter != root_->next->end(); iter++) {
|
||||
iter->second->fail = root_;
|
||||
que.push(iter->second);
|
||||
}
|
||||
TrieNode* back = NULL;
|
||||
@ -185,24 +185,24 @@ class Trie {
|
||||
back = back->fail;
|
||||
}
|
||||
if(back == NULL) {
|
||||
iter->second->fail = _root;
|
||||
iter->second->fail = root_;
|
||||
}
|
||||
que.push(iter->second);
|
||||
}
|
||||
}
|
||||
}
|
||||
void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
|
||||
void createTrie_(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
|
||||
if(valuePointers.empty() || keys.empty()) {
|
||||
return;
|
||||
}
|
||||
assert(keys.size() == valuePointers.size());
|
||||
|
||||
for(size_t i = 0; i < keys.size(); i++) {
|
||||
_insertNode(keys[i], valuePointers[i]);
|
||||
insertNode_(keys[i], valuePointers[i]);
|
||||
}
|
||||
}
|
||||
void _insertNode(const Unicode& key, const DictUnit* ptValue) {
|
||||
TrieNode* ptNode = _root;
|
||||
void insertNode_(const Unicode& key, const DictUnit* ptValue) {
|
||||
TrieNode* ptNode = root_;
|
||||
|
||||
TrieNode::NextMap::const_iterator kmIter;
|
||||
|
||||
@ -224,21 +224,21 @@ class Trie {
|
||||
}
|
||||
ptNode->ptValue = ptValue;
|
||||
}
|
||||
void _deleteNode(TrieNode* node) {
|
||||
void deleteNode_(TrieNode* node) {
|
||||
if(!node) {
|
||||
return;
|
||||
}
|
||||
if(node->next) {
|
||||
TrieNode::NextMap::iterator it;
|
||||
for(it = node->next->begin(); it != node->next->end(); it++) {
|
||||
_deleteNode(it->second);
|
||||
deleteNode_(it->second);
|
||||
}
|
||||
delete node->next;
|
||||
}
|
||||
delete node;
|
||||
}
|
||||
private:
|
||||
TrieNode* _root;
|
||||
TrieNode* root_;
|
||||
};
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user