code style

This commit is contained in:
yanyiwu 2015-05-06 23:02:03 +08:00
parent bb32234654
commit 2b18a582fc
10 changed files with 195 additions and 195 deletions

View File

@ -26,60 +26,60 @@ class DictTrie {
public:
// Default constructor: leaves the trie unbuilt (NULL) and seeds the minimum
// weight with MAX_DOUBLE. init() later assigns the real values.
DictTrie() {
_trie = NULL;
_minWeight = MAX_DOUBLE;
trie_ = NULL;
minWeight_ = MAX_DOUBLE;
}
// Convenience constructor: resets the members by running the default
// constructor on this object, then loads the dictionaries through init().
// NOTE(review): placement-new on `this` constructs the container members a
// second time without destroying them first -- confirm this is intended.
DictTrie(const string& dictPath, const string& userDictPath = "") {
new (this) DictTrie();
init(dictPath, userDictPath);
}
// Destructor: releases the trie that init() stored, if one was built.
~DictTrie() {
if(_trie) {
delete _trie;
if(trie_) {
delete trie_;
}
}
// Loads the main dictionary at dictPath (and, when userDictPath is non-empty,
// a user dictionary) and builds the lookup trie from the collected entries.
// Always returns true; problems are reported through LogFatal / assert.
bool init(const string& dictPath, const string& userDictPath = "") {
// A second init() on the same object is reported as fatal.
if(_trie != NULL) {
if(trie_ != NULL) {
LogFatal("trie already initted");
}
_loadDict(dictPath);
_calculateWeight(_nodeInfos);
_minWeight = _findMinWeight(_nodeInfos);
loadDict_(dictPath);
calculateWeight_(nodeInfos_);
// The minimum is taken over the main dictionary only, before user entries
// are appended.
minWeight_ = findMinWeight_(nodeInfos_);
if(userDictPath.size()) {
// User entries are loaded with the largest main-dictionary weight as their
// default weight.
double maxWeight = _findMaxWeight(_nodeInfos);
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
double maxWeight = findMaxWeight_(nodeInfos_);
loadUserDict_(userDictPath, maxWeight, UNKNOWN_TAG);
}
// Trim the entry vector, then build the trie over it.
_shrink(_nodeInfos);
_trie = _createTrie(_nodeInfos);
assert(_trie);
shrink_(nodeInfos_);
trie_ = createTrie_(nodeInfos_);
assert(trie_);
return true;
}
// Looks up the sequence [begin, end) by forwarding to the trie and returns
// the trie's result unchanged.
// The trie pointer is dereferenced without a NULL check, so init() must have
// run first.
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
return _trie->find(begin, end);
return trie_->find(begin, end);
}
// Forwards to the trie overload that fills `dag`, passing `offset` through
// unchanged, and returns that overload's boolean result.
// The trie pointer is dereferenced without a NULL check, so init() must have
// run first.
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const {
return _trie->find(begin, end, dag, offset);
return trie_->find(begin, end, dag, offset);
}
// Forwards to the trie overload that writes its per-character results for
// [begin, end) into `res`.
// The trie pointer is dereferenced without a NULL check, so init() must have
// run first.
void find(
Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<SegmentChar>& res
) const {
_trie->find(begin, end, res);
trie_->find(begin, end, res);
}
// Reports whether `word` belongs to the set of single-character words that
// were collected while loading the user dictionary.
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
return isIn(_userDictSingleChineseWord, word);
return isIn(userDictSingleChineseWord_, word);
}
// Returns the minimum weight recorded by init(); it stays at MAX_DOUBLE until
// init() has run.
double getMinWeight() const {
return _minWeight;
return minWeight_;
};
private:
Trie * _createTrie(const vector<DictUnit>& dictUnits) {
Trie * createTrie_(const vector<DictUnit>& dictUnits) {
assert(dictUnits.size());
vector<Unicode> words;
vector<const DictUnit*> valuePointers;
@ -91,7 +91,7 @@ class DictTrie {
Trie * trie = new Trie(words, valuePointers);
return trie;
}
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) {
void loadUserDict_(const string& filePath, double defaultWeight, const string& defaultTag) {
ifstream ifs(filePath.c_str());
if(!ifs.is_open()) {
LogFatal("file %s open failed.", filePath.c_str());
@ -111,15 +111,15 @@ class DictTrie {
continue;
}
if(nodeInfo.word.size() == 1) {
_userDictSingleChineseWord.insert(nodeInfo.word[0]);
userDictSingleChineseWord_.insert(nodeInfo.word[0]);
}
nodeInfo.weight = defaultWeight;
nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag);
_nodeInfos.push_back(nodeInfo);
nodeInfos_.push_back(nodeInfo);
}
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
}
void _loadDict(const string& filePath) {
void loadDict_(const string& filePath) {
ifstream ifs(filePath.c_str());
if(!ifs.is_open()) {
LogFatal("file %s open failed.", filePath.c_str());
@ -141,17 +141,17 @@ class DictTrie {
nodeInfo.weight = atof(buf[1].c_str());
nodeInfo.tag = buf[2];
_nodeInfos.push_back(nodeInfo);
nodeInfos_.push_back(nodeInfo);
}
}
// Returns the smallest `weight` among nodeInfos, or MAX_DOUBLE when the
// vector is empty.
double _findMinWeight(const vector<DictUnit>& nodeInfos) const {
double findMinWeight_(const vector<DictUnit>& nodeInfos) const {
double ret = MAX_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++) {
ret = min(nodeInfos[i].weight, ret);
}
return ret;
}
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const {
double findMaxWeight_(const vector<DictUnit>& nodeInfos) const {
double ret = MIN_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++) {
ret = max(nodeInfos[i].weight, ret);
@ -159,7 +159,7 @@ class DictTrie {
return ret;
}
void _calculateWeight(vector<DictUnit>& nodeInfos) const {
void calculateWeight_(vector<DictUnit>& nodeInfos) const {
double sum = 0.0;
for(size_t i = 0; i < nodeInfos.size(); i++) {
sum += nodeInfos[i].weight;
@ -172,16 +172,16 @@ class DictTrie {
}
}
// Replaces `units` with a freshly built copy of its own elements by swapping
// with a temporary; intended to drop spare capacity left over from repeated
// push_back calls.
void _shrink(vector<DictUnit>& units) const {
void shrink_(vector<DictUnit>& units) const {
vector<DictUnit>(units.begin(), units.end()).swap(units);
}
private:
vector<DictUnit> _nodeInfos;
Trie * _trie;
vector<DictUnit> nodeInfos_;
Trie * trie_;
double _minWeight;
unordered_set<Unicode::value_type> _userDictSingleChineseWord;
double minWeight_;
unordered_set<Unicode::value_type> userDictSingleChineseWord_;
};
}

View File

@ -14,40 +14,40 @@ namespace CppJieba {
class FullSegment: public SegmentBase {
public:
// Default constructor: no dictionary attached yet and nothing borrowed.
// One of the init() overloads must be called before cut().
FullSegment() {
_dictTrie = NULL;
_isBorrowed = false;
dictTrie_ = NULL;
isBorrowed_ = false;
}
// Constructs a segmenter that owns its dictionary: clears the pointer so the
// assert in init() holds, then lets init(dictPath) allocate the DictTrie.
explicit FullSegment(const string& dictPath) {
_dictTrie = NULL;
dictTrie_ = NULL;
init(dictPath);
}
// Constructs a segmenter on top of an existing dictionary: clears the pointer
// so the assert in init() holds, then lets init(dictTrie) store the pointer
// and mark it as borrowed.
explicit FullSegment(const DictTrie* dictTrie) {
_dictTrie = NULL;
dictTrie_ = NULL;
init(dictTrie);
}
// Destructor: deletes the dictionary only when this object allocated it; a
// borrowed dictionary is left for its owner to release.
virtual ~FullSegment() {
if(_dictTrie && ! _isBorrowed) {
delete _dictTrie;
if(dictTrie_ && ! isBorrowed_) {
delete dictTrie_;
}
};
// Allocates a DictTrie from the file at dictPath and takes ownership of it
// (the destructor will delete it). Asserts that no dictionary has been
// attached yet. Always returns true.
bool init(const string& dictPath) {
assert(_dictTrie == NULL);
_dictTrie = new DictTrie(dictPath);
_isBorrowed = false;
assert(dictTrie_ == NULL);
dictTrie_ = new DictTrie(dictPath);
isBorrowed_ = false;
return true;
}
// Attaches an externally owned dictionary. Asserts that none is attached yet
// and that dictTrie is non-NULL. The pointer is flagged as borrowed, so the
// destructor will not delete it; the caller must keep it alive while this
// segmenter uses it. Always returns true.
bool init(const DictTrie* dictTrie) {
assert(_dictTrie == NULL);
assert(dictTrie_ == NULL);
assert(dictTrie);
_dictTrie = dictTrie;
_isBorrowed = true;
dictTrie_ = dictTrie;
isBorrowed_ = true;
return true;
}
using SegmentBase::cut;
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
assert(_dictTrie);
assert(dictTrie_);
if (begin >= end) {
LogError("begin >= end");
return false;
@ -66,7 +66,7 @@ class FullSegment: public SegmentBase {
int wordLen = 0;
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
//find word start from uItr
if (_dictTrie->find(uItr, end, tRes, 0)) {
if (dictTrie_->find(uItr, end, tRes, 0)) {
for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
//for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
{
@ -93,7 +93,7 @@ class FullSegment: public SegmentBase {
}
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
assert(_dictTrie);
assert(dictTrie_);
if (begin >= end) {
LogError("begin >= end");
return false;
@ -117,8 +117,8 @@ class FullSegment: public SegmentBase {
return true;
}
private:
const DictTrie* _dictTrie;
bool _isBorrowed;
const DictTrie* dictTrie_;
bool isBorrowed_;
};
}

View File

@ -31,17 +31,17 @@ class HMMSegment: public SegmentBase {
// Virtual destructor with an empty body.
virtual ~HMMSegment() {}
public:
// Prepares the model tables and loads the HMM model from filePath.
// Zeroes the start/transition probability arrays, assigns the state letters
// 'B', 'E', 'M', 'S' to indices 0..3, registers the four emission maps in that
// same order, then reads the model file.
// Returns true once loading has passed LIMONP_CHECK (defined elsewhere;
// presumably it aborts or throws on a false result -- confirm).
// NOTE(review): every call appends four more pointers to the emission-map
// vector, so init() looks intended to be called only once per object.
bool init(const string& filePath) {
memset(_startProb, 0, sizeof(_startProb));
memset(_transProb, 0, sizeof(_transProb));
_statMap[0] = 'B';
_statMap[1] = 'E';
_statMap[2] = 'M';
_statMap[3] = 'S';
_emitProbVec.push_back(&_emitProbB);
_emitProbVec.push_back(&_emitProbE);
_emitProbVec.push_back(&_emitProbM);
_emitProbVec.push_back(&_emitProbS);
LIMONP_CHECK(_loadModel(filePath.c_str()));
memset(startProb_, 0, sizeof(startProb_));
memset(transProb_, 0, sizeof(transProb_));
statMap_[0] = 'B';
statMap_[1] = 'E';
statMap_[2] = 'M';
statMap_[3] = 'S';
// Index order here must match the state indices used above.
emitProbVec_.push_back(&emitProbB_);
emitProbVec_.push_back(&emitProbE_);
emitProbVec_.push_back(&emitProbM_);
emitProbVec_.push_back(&emitProbS_);
LIMONP_CHECK(loadModel_(filePath.c_str()));
LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
return true;
}
@ -53,16 +53,16 @@ class HMMSegment: public SegmentBase {
Unicode::const_iterator right = begin;
while(right != end) {
if(*right < 0x80) {
if(left != right && !_cut(left, right, res)) {
if(left != right && !cut_(left, right, res)) {
return false;
}
left = right;
do {
right = _sequentialLetterRule(left, end);
right = sequentialLetterRule_(left, end);
if(right != left) {
break;
}
right = _numbersRule(left, end);
right = numbersRule_(left, end);
if(right != left) {
break;
}
@ -74,7 +74,7 @@ class HMMSegment: public SegmentBase {
right++;
}
}
if(left != right && !_cut(left, right, res)) {
if(left != right && !cut_(left, right, res)) {
return false;
}
return true;
@ -100,7 +100,7 @@ class HMMSegment: public SegmentBase {
}
private:
// sequential letters rule
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
Unicode::const_iterator sequentialLetterRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
Unicode::value_type x = *begin;
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
begin ++;
@ -118,7 +118,7 @@ class HMMSegment: public SegmentBase {
return begin;
}
//
Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
Unicode::const_iterator numbersRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
Unicode::value_type x = *begin;
if('0' <= x && x <= '9') {
begin ++;
@ -135,10 +135,10 @@ class HMMSegment: public SegmentBase {
}
return begin;
}
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
bool cut_(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
vector<size_t> status;
if(!_viterbi(begin, end, status)) {
LogError("_viterbi failed.");
if(!viterbi_(begin, end, status)) {
LogError("viterbi_ failed.");
return false;
}
@ -154,7 +154,7 @@ class HMMSegment: public SegmentBase {
return true;
}
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
bool viterbi_(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
if(begin == end) {
return false;
}
@ -171,7 +171,7 @@ class HMMSegment: public SegmentBase {
//start
for(size_t y = 0; y < Y; y++) {
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
weight[0 + y * X] = startProb_[y] + getEmitProb_(emitProbVec_[y], *begin, MIN_DOUBLE);
path[0 + y * X] = -1;
}
@ -183,10 +183,10 @@ class HMMSegment: public SegmentBase {
now = x + y*X;
weight[now] = MIN_DOUBLE;
path[now] = E; // warning
emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
emitProb = getEmitProb_(emitProbVec_[y], *(begin+x), MIN_DOUBLE);
for(size_t preY = 0; preY < Y; preY++) {
old = x - 1 + preY * X;
tmp = weight[old] + _transProb[preY][y] + emitProb;
tmp = weight[old] + transProb_[preY][y] + emitProb;
if(tmp > weight[now]) {
weight[now] = tmp;
path[now] = preY;
@ -212,13 +212,13 @@ class HMMSegment: public SegmentBase {
return true;
}
bool _loadModel(const char* const filePath) {
bool loadModel_(const char* const filePath) {
ifstream ifile(filePath);
string line;
vector<string> tmp;
vector<string> tmp2;
//load _startProb
if(!_getLine(ifile, line)) {
//load startProb_
if(!getLine_(ifile, line)) {
return false;
}
split(line, tmp, " ");
@ -227,12 +227,12 @@ class HMMSegment: public SegmentBase {
return false;
}
for(size_t j = 0; j< tmp.size(); j++) {
_startProb[j] = atof(tmp[j].c_str());
startProb_[j] = atof(tmp[j].c_str());
}
//load _transProb
//load transProb_
for(size_t i = 0; i < STATUS_SUM; i++) {
if(!_getLine(ifile, line)) {
if(!getLine_(ifile, line)) {
return false;
}
split(line, tmp, " ");
@ -241,33 +241,33 @@ class HMMSegment: public SegmentBase {
return false;
}
for(size_t j =0; j < STATUS_SUM; j++) {
_transProb[i][j] = atof(tmp[j].c_str());
transProb_[i][j] = atof(tmp[j].c_str());
}
}
//load _emitProbB
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) {
//load emitProbB_
if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbB_)) {
return false;
}
//load _emitProbE
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) {
//load emitProbE_
if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbE_)) {
return false;
}
//load _emitProbM
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) {
//load emitProbM_
if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbM_)) {
return false;
}
//load _emitProbS
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) {
//load emitProbS_
if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbS_)) {
return false;
}
return true;
}
bool _getLine(ifstream& ifile, string& line) {
bool getLine_(ifstream& ifile, string& line) {
while(getline(ifile, line)) {
trim(line);
if(line.empty()) {
@ -280,7 +280,7 @@ class HMMSegment: public SegmentBase {
}
return false;
}
bool _loadEmitProb(const string& line, EmitProbMap& mp) {
bool loadEmitProb_(const string& line, EmitProbMap& mp) {
if(line.empty()) {
return false;
}
@ -290,7 +290,7 @@ class HMMSegment: public SegmentBase {
for(size_t i = 0; i < tmp.size(); i++) {
split(tmp[i], tmp2, ":");
if(2 != tmp2.size()) {
LogError("_emitProb illegal.");
LogError("emitProb_ illegal.");
return false;
}
if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) {
@ -301,7 +301,7 @@ class HMMSegment: public SegmentBase {
}
return true;
}
double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
double getEmitProb_(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
EmitProbMap::const_iterator cit = ptMp->find(key);
if(cit == ptMp->end()) {
return defVal;
@ -311,14 +311,14 @@ class HMMSegment: public SegmentBase {
}
private:
char _statMap[STATUS_SUM];
double _startProb[STATUS_SUM];
double _transProb[STATUS_SUM][STATUS_SUM];
EmitProbMap _emitProbB;
EmitProbMap _emitProbE;
EmitProbMap _emitProbM;
EmitProbMap _emitProbS;
vector<EmitProbMap* > _emitProbVec;
char statMap_[STATUS_SUM];
double startProb_[STATUS_SUM];
double transProb_[STATUS_SUM][STATUS_SUM];
EmitProbMap emitProbB_;
EmitProbMap emitProbE_;
EmitProbMap emitProbM_;
EmitProbMap emitProbS_;
vector<EmitProbMap* > emitProbVec_;
};
}

View File

@ -18,9 +18,9 @@ class KeywordExtractor {
// Destructor with an empty body.
~KeywordExtractor() {};
// Loads everything the extractor needs, in this order: the IDF table from
// idfPath, the stop-word list from stopWordPath, and finally the segmenter
// (dictPath, hmmFilePath and the optional userDict).
// The segmenter's init() result is passed through LIMONP_CHECK.
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
_loadIdfDict(idfPath);
_loadStopWordDict(stopWordPath);
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
loadIdfDict_(idfPath);
loadStopWordDict_(stopWordPath);
LIMONP_CHECK(segment_.init(dictPath, hmmFilePath, userDict));
};
bool extract(const string& str, vector<string>& keywords, size_t topN) const {
@ -36,30 +36,30 @@ class KeywordExtractor {
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
vector<string> words;
if(!_segment.cut(str, words)) {
if(!segment_.cut(str, words)) {
LogError("segment cut(%s) failed.", str.c_str());
return false;
}
map<string, double> wordmap;
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
if(_isSingleWord(*iter)) {
if(isSingleWord_(*iter)) {
continue;
}
wordmap[*iter] += 1.0;
}
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
if(_stopWords.end() != _stopWords.find(itr->first)) {
if(stopWords_.end() != stopWords_.find(itr->first)) {
wordmap.erase(itr++);
continue;
}
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
if(cit != _idfMap.end()) {
unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
if(cit != idfMap_.end()) {
itr->second *= cit->second;
} else {
itr->second *= _idfAverage;
itr->second *= idfAverage_;
}
itr ++;
}
@ -67,12 +67,12 @@ class KeywordExtractor {
keywords.clear();
std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
topN = min(topN, keywords.size());
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), cmp_);
keywords.resize(topN);
return true;
}
private:
void _loadIdfDict(const string& idfPath) {
void loadIdfDict_(const string& idfPath) {
ifstream ifs(idfPath.c_str());
if(!ifs.is_open()) {
LogFatal("open %s failed.", idfPath.c_str());
@ -93,28 +93,28 @@ class KeywordExtractor {
continue;
}
idf = atof(buf[1].c_str());
_idfMap[buf[0]] = idf;
idfMap_[buf[0]] = idf;
idfSum += idf;
}
assert(lineno);
_idfAverage = idfSum / lineno;
assert(_idfAverage > 0.0);
idfAverage_ = idfSum / lineno;
assert(idfAverage_ > 0.0);
}
// Reads the stop-word file at filePath, one entry per line, into the
// stop-word set. Lines are inserted exactly as read (no trimming here).
// Reports a fatal log message if the file cannot be opened and asserts that
// at least one entry was loaded.
void _loadStopWordDict(const string& filePath) {
void loadStopWordDict_(const string& filePath) {
ifstream ifs(filePath.c_str());
if(!ifs.is_open()) {
LogFatal("open %s failed.", filePath.c_str());
}
string line ;
while(getline(ifs, line)) {
_stopWords.insert(line);
stopWords_.insert(line);
}
assert(_stopWords.size());
assert(stopWords_.size());
}
bool _isSingleWord(const string& str) const {
bool isSingleWord_(const string& str) const {
Unicode unicode;
TransCode::decode(str, unicode);
if(unicode.size() == 1)
@ -122,16 +122,16 @@ class KeywordExtractor {
return false;
}
// Ordering predicate for (word, score) pairs: true when lhs has the strictly
// larger score, which sorts results in descending score order.
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) {
static bool cmp_(const pair<string, double>& lhs, const pair<string, double>& rhs) {
return lhs.second > rhs.second;
}
private:
MixSegment _segment;
unordered_map<string, double> _idfMap;
double _idfAverage;
MixSegment segment_;
unordered_map<string, double> idfMap_;
double idfAverage_;
unordered_set<string> _stopWords;
unordered_set<string> stopWords_;
};
}

View File

@ -21,12 +21,12 @@ class MPSegment: public SegmentBase {
// Virtual destructor with an empty body.
virtual ~MPSegment() {};
// Initializes the embedded dictionary trie from dictPath and the optional
// userDictPath, logs the outcome, and returns true.
// The trie's init() result is passed through LIMONP_CHECK.
bool init(const string& dictPath, const string& userDictPath = "") {
LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
LIMONP_CHECK(dictTrie_.init(dictPath, userDictPath));
LogInfo("MPSegment init(%s) ok", dictPath.c_str());
return true;
}
// Forwards to the dictionary trie's query of the same name and returns its
// answer for `value`.
bool isUserDictSingleChineseWord(const Unicode::value_type & value) const {
return _dictTrie.isUserDictSingleChineseWord(value);
return dictTrie_.isUserDictSingleChineseWord(value);
}
using SegmentBase::cut;
@ -57,20 +57,20 @@ class MPSegment: public SegmentBase {
}
vector<SegmentChar> segmentChars;
_dictTrie.find(begin, end, segmentChars);
dictTrie_.find(begin, end, segmentChars);
_calcDP(segmentChars);
calcDP_(segmentChars);
_cut(segmentChars, res);
cut_(segmentChars, res);
return true;
}
// Returns a read-only pointer to the embedded dictionary trie. The trie is a
// member of this object, so the pointer is only valid while this MPSegment
// is alive.
const DictTrie* getDictTrie() const {
return &_dictTrie;
return &dictTrie_;
}
private:
void _calcDP(vector<SegmentChar>& segmentChars) const {
void calcDP_(vector<SegmentChar>& segmentChars) const {
size_t nextPos;
const DictUnit* p;
double val;
@ -90,7 +90,7 @@ class MPSegment: public SegmentBase {
if(p) {
val += p->weight;
} else {
val += _dictTrie.getMinWeight();
val += dictTrie_.getMinWeight();
}
if(val > segmentChars[i].weight) {
segmentChars[i].pInfo = p;
@ -99,7 +99,7 @@ class MPSegment: public SegmentBase {
}
}
}
void _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const {
void cut_(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const {
size_t i = 0;
while(i < segmentChars.size()) {
const DictUnit* p = segmentChars[i].pInfo;
@ -114,7 +114,7 @@ class MPSegment: public SegmentBase {
}
private:
DictTrie _dictTrie;
DictTrie dictTrie_;
};
}

View File

@ -17,8 +17,8 @@ class MixSegment: public SegmentBase {
// Virtual destructor with an empty body.
virtual ~MixSegment() {
}
// Initializes both underlying segmenters: the MP segmenter from mpSegDict
// (plus the optional userDict) and the HMM segmenter from hmmSegDict.
// Each init() result is passed through LIMONP_CHECK; logs and returns true
// afterwards.
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") {
LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
LIMONP_CHECK(_hmmSeg.init(hmmSegDict));
LIMONP_CHECK(mpSeg_.init(mpSegDict, userDict));
LIMONP_CHECK(hmmSeg_.init(hmmSegDict));
LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
return true;
}
@ -26,7 +26,7 @@ class MixSegment: public SegmentBase {
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
vector<Unicode> words;
words.reserve(end - begin);
if(!_mpSeg.cut(begin, end, words)) {
if(!mpSeg_.cut(begin, end, words)) {
LogError("mpSeg cutDAG failed.");
return false;
}
@ -37,21 +37,21 @@ class MixSegment: public SegmentBase {
piece.reserve(end - begin);
for (size_t i = 0, j = 0; i < words.size(); i++) {
//if mp get a word, it's ok, put it into result
if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) {
if (1 != words[i].size() || (words[i].size() == 1 && mpSeg_.isUserDictSingleChineseWord(words[i][0]))) {
res.push_back(words[i]);
continue;
}
// if mp get a single one and it is not in userdict, collect it in sequence
j = i;
while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) {
while (j < words.size() && 1 == words[j].size() && !mpSeg_.isUserDictSingleChineseWord(words[j][0])) {
piece.push_back(words[j][0]);
j++;
}
// cut the sequence with hmm
if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) {
LogError("_hmmSeg cut failed.");
if (!hmmSeg_.cut(piece.begin(), piece.end(), hmmRes)) {
LogError("hmmSeg_ cut failed.");
return false;
}
@ -92,11 +92,11 @@ class MixSegment: public SegmentBase {
}
// Returns the dictionary trie held by the MP segmenter, so other components
// can reuse it instead of loading their own copy.
const DictTrie* getDictTrie() const {
return _mpSeg.getDictTrie();
return mpSeg_.getDictTrie();
}
private:
MPSegment _mpSeg;
HMMSegment _hmmSeg;
MPSegment mpSeg_;
HMMSegment hmmSeg_;
};
}

View File

@ -30,16 +30,16 @@ class PosTagger {
const string& hmmFilePath,
const string& userDictPath = ""
) {
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));
_dictTrie = _segment.getDictTrie();
LIMONP_CHECK(_dictTrie);
LIMONP_CHECK(segment_.init(dictPath, hmmFilePath, userDictPath));
dictTrie_ = segment_.getDictTrie();
LIMONP_CHECK(dictTrie_);
};
bool tag(const string& src, vector<pair<string, string> >& res) const {
vector<string> cutRes;
if (!_segment.cut(src, cutRes)) {
LogError("_mixSegment cut failed");
if (!segment_.cut(src, cutRes)) {
LogError("mixSegment_ cut failed");
return false;
}
@ -50,9 +50,9 @@ class PosTagger {
LogError("decode failed.");
return false;
}
tmp = _dictTrie->find(unico.begin(), unico.end());
tmp = dictTrie_->find(unico.begin(), unico.end());
if(tmp == NULL || tmp->tag.empty()) {
res.push_back(make_pair(*itr, _specialRule(unico)));
res.push_back(make_pair(*itr, specialRule_(unico)));
} else {
res.push_back(make_pair(*itr, tmp->tag));
}
@ -60,7 +60,7 @@ class PosTagger {
return !res.empty();
}
private:
const char* _specialRule(const Unicode& unicode) const {
const char* specialRule_(const Unicode& unicode) const {
size_t m = 0;
size_t eng = 0;
for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
@ -83,8 +83,8 @@ class PosTagger {
return POS_ENG;
}
private:
MixSegment _segment;
const DictTrie * _dictTrie;
MixSegment segment_;
const DictTrie * dictTrie_;
};
}

View File

@ -22,10 +22,10 @@ class QuerySegment: public SegmentBase {
};
// Virtual destructor with an empty body.
virtual ~QuerySegment() {};
// Initializes the mix segmenter from dict / model / userDict, then hands the
// mix segmenter's dictionary trie to the full segmenter so both share one
// trie. Asserts that maxWordLen is non-zero and stores it as the word-length
// threshold. Returns true.
bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "") {
LIMONP_CHECK(_mixSeg.init(dict, model, userDict));
LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie()));
LIMONP_CHECK(mixSeg_.init(dict, model, userDict));
LIMONP_CHECK(fullSeg_.init(mixSeg_.getDictTrie()));
assert(maxWordLen);
_maxWordLen = maxWordLen;
maxWordLen_ = maxWordLen;
return true;
}
using SegmentBase::cut;
@ -37,17 +37,17 @@ class QuerySegment: public SegmentBase {
//use mix cut first
vector<Unicode> mixRes;
if (!_mixSeg.cut(begin, end, mixRes)) {
LogError("_mixSeg cut failed.");
if (!mixSeg_.cut(begin, end, mixRes)) {
LogError("mixSeg_ cut failed.");
return false;
}
vector<Unicode> fullRes;
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
// if it's too long, cut with _fullSeg, put fullRes in res
if (mixResItr->size() > _maxWordLen) {
if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes)) {
// if it's too long, cut with fullSeg_, put fullRes in res
if (mixResItr->size() > maxWordLen_) {
if (fullSeg_.cut(mixResItr->begin(), mixResItr->end(), fullRes)) {
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) {
res.push_back(*fullResItr);
}
@ -88,9 +88,9 @@ class QuerySegment: public SegmentBase {
return true;
}
private:
MixSegment _mixSeg;
FullSegment _fullSeg;
size_t _maxWordLen;
MixSegment mixSeg_;
FullSegment fullSeg_;
size_t maxWordLen_;
};
}

View File

@ -22,7 +22,7 @@ const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u};
class SegmentBase: public ISegment, public NonCopyable {
public:
// Constructor: fills the special-symbol set used as separators by cut().
SegmentBase() {
_loadSpecialSymbols();
loadSpecialSymbols_();
};
// Virtual destructor with an empty body.
virtual ~SegmentBase() {};
public:
@ -39,7 +39,7 @@ class SegmentBase: public ISegment, public NonCopyable {
Unicode::const_iterator right;
for(right = unicode.begin(); right != unicode.end(); right++) {
if(isIn(_specialSymbols, *right)) {
if(isIn(specialSymbols_, *right)) {
if(left != right) {
cut(left, right, res);
}
@ -55,15 +55,15 @@ class SegmentBase: public ISegment, public NonCopyable {
return true;
}
private:
// Copies every element of the SPECIAL_SYMBOL array into the special-symbol
// set and asserts that the set ended up non-empty.
void _loadSpecialSymbols() {
void loadSpecialSymbols_() {
// Element count of the fixed-size array.
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
for(size_t i = 0; i < size; i ++) {
_specialSymbols.insert(SPECIAL_SYMBOL[i]);
specialSymbols_.insert(SPECIAL_SYMBOL[i]);
}
assert(_specialSymbols.size());
assert(specialSymbols_.size());
}
private:
unordered_set<UnicodeValueType> _specialSymbols;
unordered_set<UnicodeValueType> specialSymbols_;
};
}

View File

@ -61,19 +61,19 @@ class TrieNode {
class Trie {
public:
// Builds the trie: allocates the root node, inserts every (key, value
// pointer) pair, then runs the build step that sets up the nodes' fail links.
// keys[i] is paired with valuePointers[i]; only the pointers are stored.
Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
_root = new TrieNode;
_createTrie(keys, valuePointers);
_build();// build automaton (fail links)
root_ = new TrieNode;
createTrie_(keys, valuePointers);
build_();// build automaton (fail links)
}
// Destructor: frees the whole node tree, starting from the root.
~Trie() {
if(_root) {
_deleteNode(_root);
if(root_) {
deleteNode_(root_);
}
}
public:
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
TrieNode::NextMap::const_iterator citer;
const TrieNode* ptNode = _root;
const TrieNode* ptNode = root_;
for(Unicode::const_iterator it = begin; it != end; it++) {
// build automation
assert(ptNode);
@ -91,7 +91,7 @@ class Trie {
vector<struct SegmentChar>& res
) const {
res.resize(end - begin);
const TrieNode * now = _root;
const TrieNode * now = root_;
const TrieNode* node;
// compiler will complain warnings if only "i < end - begin" .
for (size_t i = 0; i < size_t(end - begin); i++) {
@ -102,7 +102,7 @@ class Trie {
bool flag = false;
// rollback
while( now != _root ) {
while( now != root_ ) {
node = now->findNext(ch);
if (node != NULL) {
flag = true;
@ -116,11 +116,11 @@ class Trie {
node = now->findNext(ch);
}
if(node == NULL) {
now = _root;
now = root_;
} else {
now = node;
const TrieNode * temp = now;
while(temp != _root) {
while(temp != root_) {
if (temp->ptValue) {
size_t pos = i - temp->ptValue->word.size() + 1;
res[pos].dag.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(i, temp->ptValue));
@ -139,7 +139,7 @@ class Trie {
Unicode::const_iterator end,
DagType & res,
size_t offset = 0) const {
const TrieNode * ptNode = _root;
const TrieNode * ptNode = root_;
TrieNode::NextMap::const_iterator citer;
for(Unicode::const_iterator itr = begin; itr != end ; itr++) {
assert(ptNode);
@ -158,13 +158,13 @@ class Trie {
return !res.empty();
}
private:
void _build() {
void build_() {
queue<TrieNode*> que;
assert(_root->ptValue == NULL);
assert(_root->next);
_root->fail = NULL;
for(TrieNode::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) {
iter->second->fail = _root;
assert(root_->ptValue == NULL);
assert(root_->next);
root_->fail = NULL;
for(TrieNode::NextMap::iterator iter = root_->next->begin(); iter != root_->next->end(); iter++) {
iter->second->fail = root_;
que.push(iter->second);
}
TrieNode* back = NULL;
@ -185,24 +185,24 @@ class Trie {
back = back->fail;
}
if(back == NULL) {
iter->second->fail = _root;
iter->second->fail = root_;
}
que.push(iter->second);
}
}
}
// Inserts each key together with the value pointer at the same index.
// Does nothing when either vector is empty; otherwise asserts that both have
// the same length.
// NOTE(review): the early return runs before the size assert, so a mismatch
// in which exactly one vector is empty goes unreported.
void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
void createTrie_(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
if(valuePointers.empty() || keys.empty()) {
return;
}
assert(keys.size() == valuePointers.size());
for(size_t i = 0; i < keys.size(); i++) {
_insertNode(keys[i], valuePointers[i]);
insertNode_(keys[i], valuePointers[i]);
}
}
void _insertNode(const Unicode& key, const DictUnit* ptValue) {
TrieNode* ptNode = _root;
void insertNode_(const Unicode& key, const DictUnit* ptValue) {
TrieNode* ptNode = root_;
TrieNode::NextMap::const_iterator kmIter;
@ -224,21 +224,21 @@ class Trie {
}
ptNode->ptValue = ptValue;
}
// Recursively frees `node` and everything below it: children first, then the
// node's child map, then the node itself. A NULL node is ignored.
void _deleteNode(TrieNode* node) {
void deleteNode_(TrieNode* node) {
if(!node) {
return;
}
// The child map is allocated separately and may be absent.
if(node->next) {
TrieNode::NextMap::iterator it;
for(it = node->next->begin(); it != node->next->end(); it++) {
_deleteNode(it->second);
deleteNode_(it->second);
}
delete node->next;
}
delete node;
}
private:
TrieNode* _root;
TrieNode* root_;
};
}