code style

yanyiwu 2015-05-06 23:02:03 +08:00
parent bb32234654
commit 2b18a582fc
10 changed files with 195 additions and 195 deletions
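This is a pure style rename: private data members and private helper methods move from a leading-underscore convention (_trie, _loadDict) to a trailing-underscore convention (trie_, loadDict_); no behavior changes. A minimal sketch of the target convention, using illustrative names that do not appear in this commit:

  class Example {
   public:
    int value() const { return value_; }  // public interface keeps plain names
   private:
    void reload_();                       // private helper: trailing underscore
    int value_;                           // data member: trailing underscore
  };

The trailing-underscore form also stays clear of the identifier patterns C++ reserves for the implementation (a leading underscore followed by an uppercase letter, or any double underscore).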

View File

@@ -26,60 +26,60 @@ class DictTrie {
 public:
 DictTrie() {
-_trie = NULL;
+trie_ = NULL;
-_minWeight = MAX_DOUBLE;
+minWeight_ = MAX_DOUBLE;
 }
 DictTrie(const string& dictPath, const string& userDictPath = "") {
 new (this) DictTrie();
 init(dictPath, userDictPath);
 }
 ~DictTrie() {
-if(_trie) {
+if(trie_) {
-delete _trie;
+delete trie_;
 }
 }
 bool init(const string& dictPath, const string& userDictPath = "") {
-if(_trie != NULL) {
+if(trie_ != NULL) {
 LogFatal("trie already initted");
 }
-_loadDict(dictPath);
+loadDict_(dictPath);
-_calculateWeight(_nodeInfos);
+calculateWeight_(nodeInfos_);
-_minWeight = _findMinWeight(_nodeInfos);
+minWeight_ = findMinWeight_(nodeInfos_);
 if(userDictPath.size()) {
-double maxWeight = _findMaxWeight(_nodeInfos);
+double maxWeight = findMaxWeight_(nodeInfos_);
-_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
+loadUserDict_(userDictPath, maxWeight, UNKNOWN_TAG);
 }
-_shrink(_nodeInfos);
+shrink_(nodeInfos_);
-_trie = _createTrie(_nodeInfos);
+trie_ = createTrie_(nodeInfos_);
-assert(_trie);
+assert(trie_);
 return true;
 }
 const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
-return _trie->find(begin, end);
+return trie_->find(begin, end);
 }
 bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const {
-return _trie->find(begin, end, dag, offset);
+return trie_->find(begin, end, dag, offset);
 }
 void find(
 Unicode::const_iterator begin,
 Unicode::const_iterator end,
 vector<SegmentChar>& res
 ) const {
-_trie->find(begin, end, res);
+trie_->find(begin, end, res);
 }
 bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
-return isIn(_userDictSingleChineseWord, word);
+return isIn(userDictSingleChineseWord_, word);
 }
 double getMinWeight() const {
-return _minWeight;
+return minWeight_;
 };
 private:
-Trie * _createTrie(const vector<DictUnit>& dictUnits) {
+Trie * createTrie_(const vector<DictUnit>& dictUnits) {
 assert(dictUnits.size());
 vector<Unicode> words;
 vector<const DictUnit*> valuePointers;
@@ -91,7 +91,7 @@ class DictTrie {
 Trie * trie = new Trie(words, valuePointers);
 return trie;
 }
-void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) {
+void loadUserDict_(const string& filePath, double defaultWeight, const string& defaultTag) {
 ifstream ifs(filePath.c_str());
 if(!ifs.is_open()) {
 LogFatal("file %s open failed.", filePath.c_str());
@@ -111,15 +111,15 @@ class DictTrie {
 continue;
 }
 if(nodeInfo.word.size() == 1) {
-_userDictSingleChineseWord.insert(nodeInfo.word[0]);
+userDictSingleChineseWord_.insert(nodeInfo.word[0]);
 }
 nodeInfo.weight = defaultWeight;
 nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag);
-_nodeInfos.push_back(nodeInfo);
+nodeInfos_.push_back(nodeInfo);
 }
 LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
 }
-void _loadDict(const string& filePath) {
+void loadDict_(const string& filePath) {
 ifstream ifs(filePath.c_str());
 if(!ifs.is_open()) {
 LogFatal("file %s open failed.", filePath.c_str());
@@ -141,17 +141,17 @@ class DictTrie {
 nodeInfo.weight = atof(buf[1].c_str());
 nodeInfo.tag = buf[2];
-_nodeInfos.push_back(nodeInfo);
+nodeInfos_.push_back(nodeInfo);
 }
 }
-double _findMinWeight(const vector<DictUnit>& nodeInfos) const {
+double findMinWeight_(const vector<DictUnit>& nodeInfos) const {
 double ret = MAX_DOUBLE;
 for(size_t i = 0; i < nodeInfos.size(); i++) {
 ret = min(nodeInfos[i].weight, ret);
 }
 return ret;
 }
-double _findMaxWeight(const vector<DictUnit>& nodeInfos) const {
+double findMaxWeight_(const vector<DictUnit>& nodeInfos) const {
 double ret = MIN_DOUBLE;
 for(size_t i = 0; i < nodeInfos.size(); i++) {
 ret = max(nodeInfos[i].weight, ret);
@@ -159,7 +159,7 @@ class DictTrie {
 return ret;
 }
-void _calculateWeight(vector<DictUnit>& nodeInfos) const {
+void calculateWeight_(vector<DictUnit>& nodeInfos) const {
 double sum = 0.0;
 for(size_t i = 0; i < nodeInfos.size(); i++) {
 sum += nodeInfos[i].weight;
@@ -172,16 +172,16 @@ class DictTrie {
 }
 }
-void _shrink(vector<DictUnit>& units) const {
+void shrink_(vector<DictUnit>& units) const {
 vector<DictUnit>(units.begin(), units.end()).swap(units);
 }
 private:
-vector<DictUnit> _nodeInfos;
+vector<DictUnit> nodeInfos_;
-Trie * _trie;
+Trie * trie_;
-double _minWeight;
+double minWeight_;
-unordered_set<Unicode::value_type> _userDictSingleChineseWord;
+unordered_set<Unicode::value_type> userDictSingleChineseWord_;
 };
 }
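For orientation, a minimal usage sketch of the renamed DictTrie interface above; the dictionary paths are placeholders and error handling is omitted:

  DictTrie trie("dict.utf8", "user.dict.utf8");   // hypothetical dictionary files
  Unicode word;                                   // filled elsewhere, e.g. via TransCode::decode
  const DictUnit* unit = trie.find(word.begin(), word.end());
  if (unit != NULL) {
    // unit carries the entry's weight and tag, as loaded by loadDict_
  }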

View File

@@ -14,40 +14,40 @@ namespace CppJieba {
 class FullSegment: public SegmentBase {
 public:
 FullSegment() {
-_dictTrie = NULL;
+dictTrie_ = NULL;
-_isBorrowed = false;
+isBorrowed_ = false;
 }
 explicit FullSegment(const string& dictPath) {
-_dictTrie = NULL;
+dictTrie_ = NULL;
 init(dictPath);
 }
 explicit FullSegment(const DictTrie* dictTrie) {
-_dictTrie = NULL;
+dictTrie_ = NULL;
 init(dictTrie);
 }
 virtual ~FullSegment() {
-if(_dictTrie && ! _isBorrowed) {
+if(dictTrie_ && ! isBorrowed_) {
-delete _dictTrie;
+delete dictTrie_;
 }
 };
 bool init(const string& dictPath) {
-assert(_dictTrie == NULL);
+assert(dictTrie_ == NULL);
-_dictTrie = new DictTrie(dictPath);
+dictTrie_ = new DictTrie(dictPath);
-_isBorrowed = false;
+isBorrowed_ = false;
 return true;
 }
 bool init(const DictTrie* dictTrie) {
-assert(_dictTrie == NULL);
+assert(dictTrie_ == NULL);
 assert(dictTrie);
-_dictTrie = dictTrie;
+dictTrie_ = dictTrie;
-_isBorrowed = true;
+isBorrowed_ = true;
 return true;
 }
 using SegmentBase::cut;
 bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
-assert(_dictTrie);
+assert(dictTrie_);
 if (begin >= end) {
 LogError("begin >= end");
 return false;
@@ -66,7 +66,7 @@ class FullSegment: public SegmentBase {
 int wordLen = 0;
 for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
 //find word start from uItr
-if (_dictTrie->find(uItr, end, tRes, 0)) {
+if (dictTrie_->find(uItr, end, tRes, 0)) {
 for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
 //for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
 {
@@ -93,7 +93,7 @@ class FullSegment: public SegmentBase {
 }
 bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
-assert(_dictTrie);
+assert(dictTrie_);
 if (begin >= end) {
 LogError("begin >= end");
 return false;
@@ -117,8 +117,8 @@ class FullSegment: public SegmentBase {
 return true;
 }
 private:
-const DictTrie* _dictTrie;
+const DictTrie* dictTrie_;
-bool _isBorrowed;
+bool isBorrowed_;
 };
 }

View File

@@ -31,17 +31,17 @@ class HMMSegment: public SegmentBase {
 virtual ~HMMSegment() {}
 public:
 bool init(const string& filePath) {
-memset(_startProb, 0, sizeof(_startProb));
+memset(startProb_, 0, sizeof(startProb_));
-memset(_transProb, 0, sizeof(_transProb));
+memset(transProb_, 0, sizeof(transProb_));
-_statMap[0] = 'B';
+statMap_[0] = 'B';
-_statMap[1] = 'E';
+statMap_[1] = 'E';
-_statMap[2] = 'M';
+statMap_[2] = 'M';
-_statMap[3] = 'S';
+statMap_[3] = 'S';
-_emitProbVec.push_back(&_emitProbB);
+emitProbVec_.push_back(&emitProbB_);
-_emitProbVec.push_back(&_emitProbE);
+emitProbVec_.push_back(&emitProbE_);
-_emitProbVec.push_back(&_emitProbM);
+emitProbVec_.push_back(&emitProbM_);
-_emitProbVec.push_back(&_emitProbS);
+emitProbVec_.push_back(&emitProbS_);
-LIMONP_CHECK(_loadModel(filePath.c_str()));
+LIMONP_CHECK(loadModel_(filePath.c_str()));
 LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
 return true;
 }
@@ -53,16 +53,16 @@ class HMMSegment: public SegmentBase {
 Unicode::const_iterator right = begin;
 while(right != end) {
 if(*right < 0x80) {
-if(left != right && !_cut(left, right, res)) {
+if(left != right && !cut_(left, right, res)) {
 return false;
 }
 left = right;
 do {
-right = _sequentialLetterRule(left, end);
+right = sequentialLetterRule_(left, end);
 if(right != left) {
 break;
 }
-right = _numbersRule(left, end);
+right = numbersRule_(left, end);
 if(right != left) {
 break;
 }
@@ -74,7 +74,7 @@ class HMMSegment: public SegmentBase {
 right++;
 }
 }
-if(left != right && !_cut(left, right, res)) {
+if(left != right && !cut_(left, right, res)) {
 return false;
 }
 return true;
@@ -100,7 +100,7 @@ class HMMSegment: public SegmentBase {
 }
 private:
 // sequential letters rule
-Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
+Unicode::const_iterator sequentialLetterRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
 Unicode::value_type x = *begin;
 if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
 begin ++;
@@ -118,7 +118,7 @@ class HMMSegment: public SegmentBase {
 return begin;
 }
 //
-Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
+Unicode::const_iterator numbersRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
 Unicode::value_type x = *begin;
 if('0' <= x && x <= '9') {
 begin ++;
@@ -135,10 +135,10 @@ class HMMSegment: public SegmentBase {
 }
 return begin;
 }
-bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
+bool cut_(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
 vector<size_t> status;
-if(!_viterbi(begin, end, status)) {
+if(!viterbi_(begin, end, status)) {
-LogError("_viterbi failed.");
+LogError("viterbi_ failed.");
 return false;
 }
@@ -154,7 +154,7 @@ class HMMSegment: public SegmentBase {
 return true;
 }
-bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
+bool viterbi_(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
 if(begin == end) {
 return false;
 }
@@ -171,7 +171,7 @@ class HMMSegment: public SegmentBase {
 //start
 for(size_t y = 0; y < Y; y++) {
-weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
+weight[0 + y * X] = startProb_[y] + getEmitProb_(emitProbVec_[y], *begin, MIN_DOUBLE);
 path[0 + y * X] = -1;
 }
@@ -183,10 +183,10 @@ class HMMSegment: public SegmentBase {
 now = x + y*X;
 weight[now] = MIN_DOUBLE;
 path[now] = E; // warning
-emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
+emitProb = getEmitProb_(emitProbVec_[y], *(begin+x), MIN_DOUBLE);
 for(size_t preY = 0; preY < Y; preY++) {
 old = x - 1 + preY * X;
-tmp = weight[old] + _transProb[preY][y] + emitProb;
+tmp = weight[old] + transProb_[preY][y] + emitProb;
 if(tmp > weight[now]) {
 weight[now] = tmp;
 path[now] = preY;
@@ -212,13 +212,13 @@ class HMMSegment: public SegmentBase {
 return true;
 }
-bool _loadModel(const char* const filePath) {
+bool loadModel_(const char* const filePath) {
 ifstream ifile(filePath);
 string line;
 vector<string> tmp;
 vector<string> tmp2;
-//load _startProb
+//load startProb_
-if(!_getLine(ifile, line)) {
+if(!getLine_(ifile, line)) {
 return false;
 }
 split(line, tmp, " ");
@@ -227,12 +227,12 @@ class HMMSegment: public SegmentBase {
 return false;
 }
 for(size_t j = 0; j< tmp.size(); j++) {
-_startProb[j] = atof(tmp[j].c_str());
+startProb_[j] = atof(tmp[j].c_str());
 }
-//load _transProb
+//load transProb_
 for(size_t i = 0; i < STATUS_SUM; i++) {
-if(!_getLine(ifile, line)) {
+if(!getLine_(ifile, line)) {
 return false;
 }
 split(line, tmp, " ");
@@ -241,33 +241,33 @@ class HMMSegment: public SegmentBase {
 return false;
 }
 for(size_t j =0; j < STATUS_SUM; j++) {
-_transProb[i][j] = atof(tmp[j].c_str());
+transProb_[i][j] = atof(tmp[j].c_str());
 }
 }
-//load _emitProbB
+//load emitProbB_
-if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) {
+if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbB_)) {
 return false;
 }
-//load _emitProbE
+//load emitProbE_
-if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) {
+if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbE_)) {
 return false;
 }
-//load _emitProbM
+//load emitProbM_
-if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) {
+if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbM_)) {
 return false;
 }
-//load _emitProbS
+//load emitProbS_
-if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) {
+if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbS_)) {
 return false;
 }
 return true;
 }
-bool _getLine(ifstream& ifile, string& line) {
+bool getLine_(ifstream& ifile, string& line) {
 while(getline(ifile, line)) {
 trim(line);
 if(line.empty()) {
@@ -280,7 +280,7 @@ class HMMSegment: public SegmentBase {
 }
 return false;
 }
-bool _loadEmitProb(const string& line, EmitProbMap& mp) {
+bool loadEmitProb_(const string& line, EmitProbMap& mp) {
 if(line.empty()) {
 return false;
 }
@@ -290,7 +290,7 @@ class HMMSegment: public SegmentBase {
 for(size_t i = 0; i < tmp.size(); i++) {
 split(tmp[i], tmp2, ":");
 if(2 != tmp2.size()) {
-LogError("_emitProb illegal.");
+LogError("emitProb_ illegal.");
 return false;
 }
 if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) {
@@ -301,7 +301,7 @@ class HMMSegment: public SegmentBase {
 }
 return true;
 }
-double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
+double getEmitProb_(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
 EmitProbMap::const_iterator cit = ptMp->find(key);
 if(cit == ptMp->end()) {
 return defVal;
@@ -311,14 +311,14 @@ class HMMSegment: public SegmentBase {
 }
 private:
-char _statMap[STATUS_SUM];
+char statMap_[STATUS_SUM];
-double _startProb[STATUS_SUM];
+double startProb_[STATUS_SUM];
-double _transProb[STATUS_SUM][STATUS_SUM];
+double transProb_[STATUS_SUM][STATUS_SUM];
-EmitProbMap _emitProbB;
+EmitProbMap emitProbB_;
-EmitProbMap _emitProbE;
+EmitProbMap emitProbE_;
-EmitProbMap _emitProbM;
+EmitProbMap emitProbM_;
-EmitProbMap _emitProbS;
+EmitProbMap emitProbS_;
-vector<EmitProbMap* > _emitProbVec;
+vector<EmitProbMap* > emitProbVec_;
 };
 }
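The viterbi_ hunk above works with log probabilities, so products become sums and MIN_DOUBLE stands in for log(0). Writing V(t, y) for weight[t + y*X], with Y = 4 hidden states (B, E, M, S) and emit(y, c) = getEmitProb_(emitProbVec_[y], c, MIN_DOUBLE), the loops implement the standard recurrence (a restatement of the code above, not new behavior):

  V(0, y) = startProb_[y] + emit(y, c0)
  V(t, y) = max over preY of ( V(t-1, preY) + transProb_[preY][y] ) + emit(y, ct)
  path[t + y*X] = the preY attaining that maximum

The remainder of the function, outside this hunk, backtracks through path to fill status, which cut_ then turns into word boundaries.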

View File

@@ -18,9 +18,9 @@ class KeywordExtractor {
 ~KeywordExtractor() {};
 void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
-_loadIdfDict(idfPath);
+loadIdfDict_(idfPath);
-_loadStopWordDict(stopWordPath);
+loadStopWordDict_(stopWordPath);
-LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
+LIMONP_CHECK(segment_.init(dictPath, hmmFilePath, userDict));
 };
 bool extract(const string& str, vector<string>& keywords, size_t topN) const {
@@ -36,30 +36,30 @@ class KeywordExtractor {
 bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
 vector<string> words;
-if(!_segment.cut(str, words)) {
+if(!segment_.cut(str, words)) {
 LogError("segment cut(%s) failed.", str.c_str());
 return false;
 }
 map<string, double> wordmap;
 for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
-if(_isSingleWord(*iter)) {
+if(isSingleWord_(*iter)) {
 continue;
 }
 wordmap[*iter] += 1.0;
 }
 for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
-if(_stopWords.end() != _stopWords.find(itr->first)) {
+if(stopWords_.end() != stopWords_.find(itr->first)) {
 wordmap.erase(itr++);
 continue;
 }
-unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
+unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
-if(cit != _idfMap.end()) {
+if(cit != idfMap_.end()) {
 itr->second *= cit->second;
 } else {
-itr->second *= _idfAverage;
+itr->second *= idfAverage_;
 }
 itr ++;
 }
@@ -67,12 +67,12 @@ class KeywordExtractor {
 keywords.clear();
 std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
 topN = min(topN, keywords.size());
-partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
+partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), cmp_);
 keywords.resize(topN);
 return true;
 }
 private:
-void _loadIdfDict(const string& idfPath) {
+void loadIdfDict_(const string& idfPath) {
 ifstream ifs(idfPath.c_str());
 if(!ifs.is_open()) {
 LogFatal("open %s failed.", idfPath.c_str());
@@ -93,28 +93,28 @@ class KeywordExtractor {
 continue;
 }
 idf = atof(buf[1].c_str());
-_idfMap[buf[0]] = idf;
+idfMap_[buf[0]] = idf;
 idfSum += idf;
 }
 assert(lineno);
-_idfAverage = idfSum / lineno;
+idfAverage_ = idfSum / lineno;
-assert(_idfAverage > 0.0);
+assert(idfAverage_ > 0.0);
 }
-void _loadStopWordDict(const string& filePath) {
+void loadStopWordDict_(const string& filePath) {
 ifstream ifs(filePath.c_str());
 if(!ifs.is_open()) {
 LogFatal("open %s failed.", filePath.c_str());
 }
 string line ;
 while(getline(ifs, line)) {
-_stopWords.insert(line);
+stopWords_.insert(line);
 }
-assert(_stopWords.size());
+assert(stopWords_.size());
 }
-bool _isSingleWord(const string& str) const {
+bool isSingleWord_(const string& str) const {
 Unicode unicode;
 TransCode::decode(str, unicode);
 if(unicode.size() == 1)
@@ -122,16 +122,16 @@ class KeywordExtractor {
 return false;
 }
-static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) {
+static bool cmp_(const pair<string, double>& lhs, const pair<string, double>& rhs) {
 return lhs.second > rhs.second;
 }
 private:
-MixSegment _segment;
+MixSegment segment_;
-unordered_map<string, double> _idfMap;
+unordered_map<string, double> idfMap_;
-double _idfAverage;
+double idfAverage_;
-unordered_set<string> _stopWords;
+unordered_set<string> stopWords_;
 };
 }
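The pair-returning extract above is a plain TF-IDF ranking: term frequencies are counted over the MixSegment output (single-character words and stop words are dropped), each count is multiplied by the word's IDF from idfMap_ (or by idfAverage_ when the word is missing from the IDF file), and partial_sort keeps the topN highest-scoring pairs. A minimal call sketch, with hypothetical resource paths:

  KeywordExtractor extractor;
  extractor.init("jieba.dict.utf8", "hmm_model.utf8", "idf.utf8", "stop_words.utf8");
  string sentence = "...";                    // UTF-8 text to analyze
  vector<pair<string, double> > keywords;
  extractor.extract(sentence, keywords, 5);   // top 5 (word, tf * idf) pairs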

View File

@@ -21,12 +21,12 @@ class MPSegment: public SegmentBase {
 virtual ~MPSegment() {};
 bool init(const string& dictPath, const string& userDictPath = "") {
-LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
+LIMONP_CHECK(dictTrie_.init(dictPath, userDictPath));
 LogInfo("MPSegment init(%s) ok", dictPath.c_str());
 return true;
 }
 bool isUserDictSingleChineseWord(const Unicode::value_type & value) const {
-return _dictTrie.isUserDictSingleChineseWord(value);
+return dictTrie_.isUserDictSingleChineseWord(value);
 }
 using SegmentBase::cut;
@@ -57,20 +57,20 @@ class MPSegment: public SegmentBase {
 }
 vector<SegmentChar> segmentChars;
-_dictTrie.find(begin, end, segmentChars);
+dictTrie_.find(begin, end, segmentChars);
-_calcDP(segmentChars);
+calcDP_(segmentChars);
-_cut(segmentChars, res);
+cut_(segmentChars, res);
 return true;
 }
 const DictTrie* getDictTrie() const {
-return &_dictTrie;
+return &dictTrie_;
 }
 private:
-void _calcDP(vector<SegmentChar>& segmentChars) const {
+void calcDP_(vector<SegmentChar>& segmentChars) const {
 size_t nextPos;
 const DictUnit* p;
 double val;
@@ -90,7 +90,7 @@ class MPSegment: public SegmentBase {
 if(p) {
 val += p->weight;
 } else {
-val += _dictTrie.getMinWeight();
+val += dictTrie_.getMinWeight();
 }
 if(val > segmentChars[i].weight) {
 segmentChars[i].pInfo = p;
@@ -99,7 +99,7 @@ class MPSegment: public SegmentBase {
 }
 }
 }
-void _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const {
+void cut_(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const {
 size_t i = 0;
 while(i < segmentChars.size()) {
 const DictUnit* p = segmentChars[i].pInfo;
@@ -114,7 +114,7 @@ class MPSegment: public SegmentBase {
 }
 private:
-DictTrie _dictTrie;
+DictTrie dictTrie_;
 };
 }
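calcDP_ and cut_ above implement the max-weight path over the per-character DAG that dictTrie_.find fills in: for each position i, the best score for the remaining suffix is

  best(i) = max over dictionary words w starting at i of ( weight(w) + best(i + len(w)) )

with dictTrie_.getMinWeight() standing in for characters that match no dictionary entry; the winning word is recorded in pInfo, and cut_ walks those records from the start of the range to emit the segmentation.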

View File

@@ -17,8 +17,8 @@ class MixSegment: public SegmentBase {
 virtual ~MixSegment() {
 }
 bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") {
-LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
+LIMONP_CHECK(mpSeg_.init(mpSegDict, userDict));
-LIMONP_CHECK(_hmmSeg.init(hmmSegDict));
+LIMONP_CHECK(hmmSeg_.init(hmmSegDict));
 LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
 return true;
 }
@@ -26,7 +26,7 @@ class MixSegment: public SegmentBase {
 virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
 vector<Unicode> words;
 words.reserve(end - begin);
-if(!_mpSeg.cut(begin, end, words)) {
+if(!mpSeg_.cut(begin, end, words)) {
 LogError("mpSeg cutDAG failed.");
 return false;
 }
@@ -37,21 +37,21 @@ class MixSegment: public SegmentBase {
 piece.reserve(end - begin);
 for (size_t i = 0, j = 0; i < words.size(); i++) {
 //if mp get a word, it's ok, put it into result
-if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) {
+if (1 != words[i].size() || (words[i].size() == 1 && mpSeg_.isUserDictSingleChineseWord(words[i][0]))) {
 res.push_back(words[i]);
 continue;
 }
 // if mp get a single one and it is not in userdict, collect it in sequence
 j = i;
-while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) {
+while (j < words.size() && 1 == words[j].size() && !mpSeg_.isUserDictSingleChineseWord(words[j][0])) {
 piece.push_back(words[j][0]);
 j++;
 }
 // cut the sequence with hmm
-if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) {
+if (!hmmSeg_.cut(piece.begin(), piece.end(), hmmRes)) {
-LogError("_hmmSeg cut failed.");
+LogError("hmmSeg_ cut failed.");
 return false;
 }
@@ -92,11 +92,11 @@ class MixSegment: public SegmentBase {
 }
 const DictTrie* getDictTrie() const {
-return _mpSeg.getDictTrie();
+return mpSeg_.getDictTrie();
 }
 private:
-MPSegment _mpSeg;
+MPSegment mpSeg_;
-HMMSegment _hmmSeg;
+HMMSegment hmmSeg_;
 };
 }

View File

@@ -30,16 +30,16 @@ class PosTagger {
 const string& hmmFilePath,
 const string& userDictPath = ""
 ) {
-LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));
+LIMONP_CHECK(segment_.init(dictPath, hmmFilePath, userDictPath));
-_dictTrie = _segment.getDictTrie();
+dictTrie_ = segment_.getDictTrie();
-LIMONP_CHECK(_dictTrie);
+LIMONP_CHECK(dictTrie_);
 };
 bool tag(const string& src, vector<pair<string, string> >& res) const {
 vector<string> cutRes;
-if (!_segment.cut(src, cutRes)) {
+if (!segment_.cut(src, cutRes)) {
-LogError("_mixSegment cut failed");
+LogError("mixSegment_ cut failed");
 return false;
 }
@@ -50,9 +50,9 @@ class PosTagger {
 LogError("decode failed.");
 return false;
 }
-tmp = _dictTrie->find(unico.begin(), unico.end());
+tmp = dictTrie_->find(unico.begin(), unico.end());
 if(tmp == NULL || tmp->tag.empty()) {
-res.push_back(make_pair(*itr, _specialRule(unico)));
+res.push_back(make_pair(*itr, specialRule_(unico)));
 } else {
 res.push_back(make_pair(*itr, tmp->tag));
 }
@@ -60,7 +60,7 @@ class PosTagger {
 return !res.empty();
 }
 private:
-const char* _specialRule(const Unicode& unicode) const {
+const char* specialRule_(const Unicode& unicode) const {
 size_t m = 0;
 size_t eng = 0;
 for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
@@ -83,8 +83,8 @@ class PosTagger {
 return POS_ENG;
 }
 private:
-MixSegment _segment;
+MixSegment segment_;
-const DictTrie * _dictTrie;
+const DictTrie * dictTrie_;
 };
 }

View File

@@ -22,10 +22,10 @@ class QuerySegment: public SegmentBase {
 };
 virtual ~QuerySegment() {};
 bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "") {
-LIMONP_CHECK(_mixSeg.init(dict, model, userDict));
+LIMONP_CHECK(mixSeg_.init(dict, model, userDict));
-LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie()));
+LIMONP_CHECK(fullSeg_.init(mixSeg_.getDictTrie()));
 assert(maxWordLen);
-_maxWordLen = maxWordLen;
+maxWordLen_ = maxWordLen;
 return true;
 }
 using SegmentBase::cut;
@@ -37,17 +37,17 @@ class QuerySegment: public SegmentBase {
 //use mix cut first
 vector<Unicode> mixRes;
-if (!_mixSeg.cut(begin, end, mixRes)) {
+if (!mixSeg_.cut(begin, end, mixRes)) {
-LogError("_mixSeg cut failed.");
+LogError("mixSeg_ cut failed.");
 return false;
 }
 vector<Unicode> fullRes;
 for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
-// if it's too long, cut with _fullSeg, put fullRes in res
+// if it's too long, cut with fullSeg_, put fullRes in res
-if (mixResItr->size() > _maxWordLen) {
+if (mixResItr->size() > maxWordLen_) {
-if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes)) {
+if (fullSeg_.cut(mixResItr->begin(), mixResItr->end(), fullRes)) {
 for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) {
 res.push_back(*fullResItr);
 }
@@ -88,9 +88,9 @@ class QuerySegment: public SegmentBase {
 return true;
 }
 private:
-MixSegment _mixSeg;
+MixSegment mixSeg_;
-FullSegment _fullSeg;
+FullSegment fullSeg_;
-size_t _maxWordLen;
+size_t maxWordLen_;
 };
 }
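QuerySegment composes the two segmenters above: mixSeg_ produces the base segmentation, and any word longer than maxWordLen_ is additionally run through fullSeg_ so its shorter dictionary substrings are emitted as well, which suits search-style queries. A minimal call sketch with hypothetical paths; it assumes the string overload of cut inherited from SegmentBase:

  QuerySegment seg;
  seg.init("jieba.dict.utf8", "hmm_model.utf8", 4);  // 4 = maxWordLen threshold
  vector<string> words;
  seg.cut("...", words);  // long words are also expanded into dictionary substrings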

View File

@@ -22,7 +22,7 @@ const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u};
 class SegmentBase: public ISegment, public NonCopyable {
 public:
 SegmentBase() {
-_loadSpecialSymbols();
+loadSpecialSymbols_();
 };
 virtual ~SegmentBase() {};
 public:
@@ -39,7 +39,7 @@ class SegmentBase: public ISegment, public NonCopyable {
 Unicode::const_iterator right;
 for(right = unicode.begin(); right != unicode.end(); right++) {
-if(isIn(_specialSymbols, *right)) {
+if(isIn(specialSymbols_, *right)) {
 if(left != right) {
 cut(left, right, res);
 }
@@ -55,15 +55,15 @@ class SegmentBase: public ISegment, public NonCopyable {
 return true;
 }
 private:
-void _loadSpecialSymbols() {
+void loadSpecialSymbols_() {
 size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
 for(size_t i = 0; i < size; i ++) {
-_specialSymbols.insert(SPECIAL_SYMBOL[i]);
+specialSymbols_.insert(SPECIAL_SYMBOL[i]);
 }
-assert(_specialSymbols.size());
+assert(specialSymbols_.size());
 }
 private:
-unordered_set<UnicodeValueType> _specialSymbols;
+unordered_set<UnicodeValueType> specialSymbols_;
 };
 }

View File

@@ -61,19 +61,19 @@ class TrieNode {
 class Trie {
 public:
 Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
-_root = new TrieNode;
+root_ = new TrieNode;
-_createTrie(keys, valuePointers);
+createTrie_(keys, valuePointers);
-_build();// build automation
+build_();// build automation
 }
 ~Trie() {
-if(_root) {
+if(root_) {
-_deleteNode(_root);
+deleteNode_(root_);
 }
 }
 public:
 const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
 TrieNode::NextMap::const_iterator citer;
-const TrieNode* ptNode = _root;
+const TrieNode* ptNode = root_;
 for(Unicode::const_iterator it = begin; it != end; it++) {
 // build automation
 assert(ptNode);
@@ -91,7 +91,7 @@ class Trie {
 vector<struct SegmentChar>& res
 ) const {
 res.resize(end - begin);
-const TrieNode * now = _root;
+const TrieNode * now = root_;
 const TrieNode* node;
 // compiler will complain warnings if only "i < end - begin" .
 for (size_t i = 0; i < size_t(end - begin); i++) {
@@ -102,7 +102,7 @@ class Trie {
 bool flag = false;
 // rollback
-while( now != _root ) {
+while( now != root_ ) {
 node = now->findNext(ch);
 if (node != NULL) {
 flag = true;
@@ -116,11 +116,11 @@ class Trie {
 node = now->findNext(ch);
 }
 if(node == NULL) {
-now = _root;
+now = root_;
 } else {
 now = node;
 const TrieNode * temp = now;
-while(temp != _root) {
+while(temp != root_) {
 if (temp->ptValue) {
 size_t pos = i - temp->ptValue->word.size() + 1;
 res[pos].dag.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(i, temp->ptValue));
@@ -139,7 +139,7 @@ class Trie {
 Unicode::const_iterator end,
 DagType & res,
 size_t offset = 0) const {
-const TrieNode * ptNode = _root;
+const TrieNode * ptNode = root_;
 TrieNode::NextMap::const_iterator citer;
 for(Unicode::const_iterator itr = begin; itr != end ; itr++) {
 assert(ptNode);
@@ -158,13 +158,13 @@ class Trie {
 return !res.empty();
 }
 private:
-void _build() {
+void build_() {
 queue<TrieNode*> que;
-assert(_root->ptValue == NULL);
+assert(root_->ptValue == NULL);
-assert(_root->next);
+assert(root_->next);
-_root->fail = NULL;
+root_->fail = NULL;
-for(TrieNode::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) {
+for(TrieNode::NextMap::iterator iter = root_->next->begin(); iter != root_->next->end(); iter++) {
-iter->second->fail = _root;
+iter->second->fail = root_;
 que.push(iter->second);
 }
 TrieNode* back = NULL;
@@ -185,24 +185,24 @@ class Trie {
 back = back->fail;
 }
 if(back == NULL) {
-iter->second->fail = _root;
+iter->second->fail = root_;
 }
 que.push(iter->second);
 }
 }
 }
-void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
+void createTrie_(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
 if(valuePointers.empty() || keys.empty()) {
 return;
 }
 assert(keys.size() == valuePointers.size());
 for(size_t i = 0; i < keys.size(); i++) {
-_insertNode(keys[i], valuePointers[i]);
+insertNode_(keys[i], valuePointers[i]);
 }
 }
-void _insertNode(const Unicode& key, const DictUnit* ptValue) {
+void insertNode_(const Unicode& key, const DictUnit* ptValue) {
-TrieNode* ptNode = _root;
+TrieNode* ptNode = root_;
 TrieNode::NextMap::const_iterator kmIter;
@@ -224,21 +224,21 @@ class Trie {
 }
 ptNode->ptValue = ptValue;
 }
-void _deleteNode(TrieNode* node) {
+void deleteNode_(TrieNode* node) {
 if(!node) {
 return;
 }
 if(node->next) {
 TrieNode::NextMap::iterator it;
 for(it = node->next->begin(); it != node->next->end(); it++) {
-_deleteNode(it->second);
+deleteNode_(it->second);
 }
 delete node->next;
 }
 delete node;
 }
 private:
-TrieNode* _root;
+TrieNode* root_;
 };
 }
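build_ above wires up Aho-Corasick failure links with a breadth-first pass from root_: the root's children fail back to root_, and every other node inherits a fail pointer by walking its parent's fail chain until a node with a matching child is found, falling back to root_ otherwise. The multi-result find overload then uses those links to roll back on a mismatch instead of restarting, collecting every dictionary word that ends at each position into that position's DAG entry. A sketch of the fail-link rule for a node u reached from parent p via character c (schematic only, not a drop-in excerpt from this file):

  // f = p->fail;
  // while (f != NULL && f->findNext(c) == NULL) f = f->fail;
  // u->fail = (f == NULL) ? root_ : f->findNext(c);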