From fae951a95d72c8dd14d1035ecbf8748b2a7c4121 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Fri, 28 Aug 2015 11:17:38 +0800 Subject: [PATCH] =?UTF-8?q?=E7=BB=9F=E4=B8=80=E7=A7=81=E6=9C=89=E5=87=BD?= =?UTF-8?q?=E6=95=B0=E7=9A=84=E5=91=BD=E5=90=8D=E9=A3=8E=E6=A0=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/DictTrie.hpp | 39 +++++++++++++++++++-------------------- src/HMMSegment.hpp | 19 +++++++++---------- src/KeywordExtractor.hpp | 21 ++++++++++----------- src/MPSegment.hpp | 13 ++++++------- src/PosTagger.hpp | 6 +++--- src/SegmentBase.hpp | 11 ++++++----- src/Trie.hpp | 10 +++++----- 7 files changed, 58 insertions(+), 61 deletions(-) diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 35ce7c7..41cbe27 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -41,21 +41,21 @@ class DictTrie { if(trie_ != NULL) { LogFatal("trie already initted"); } - loadDict_(dictPath); - calculateWeight_(staticNodeInfos_); - minWeight_ = findMinWeight_(staticNodeInfos_); - maxWeight_ = findMaxWeight_(staticNodeInfos_); + LoadDict(dictPath); + CalculateWeight(staticNodeInfos_); + minWeight_ = FindMinWeight(staticNodeInfos_); + maxWeight_ = FindMaxWeight(staticNodeInfos_); if(userDictPath.size()) { - loadUserDict_(userDictPath); + LoadUserDict(userDictPath); } - shrink_(staticNodeInfos_); - createTrie_(staticNodeInfos_); + Shrink(staticNodeInfos_); + CreateTrie(staticNodeInfos_); } bool insertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { DictUnit nodeInfo; - if(!makeUserNodeInfo_(nodeInfo, word, tag)) { + if(!MakeUserNodeInfo(nodeInfo, word, tag)) { return false; } activeNodeInfos_.push_back(nodeInfo); @@ -83,7 +83,7 @@ class DictTrie { } private: - void createTrie_(const vector& dictUnits) { + void CreateTrie(const vector& dictUnits) { assert(dictUnits.size()); vector words; vector valuePointers; @@ -94,7 +94,7 @@ class DictTrie { trie_ = new Trie(words, valuePointers); } - void loadUserDict_(const string& filePath) { + void LoadUserDict(const string& filePath) { ifstream ifs(filePath.c_str()); if(!ifs.is_open()) { LogFatal("file %s open failed.", filePath.c_str()); @@ -110,13 +110,13 @@ class DictTrie { LogFatal("split [%s] result illegal", line.c_str()); } DictUnit nodeInfo; - makeUserNodeInfo_(nodeInfo, buf[0], + MakeUserNodeInfo(nodeInfo, buf[0], (buf.size() == 2 ? buf[1] : UNKNOWN_TAG)); staticNodeInfos_.push_back(nodeInfo); } LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno); } - bool makeNodeInfo(DictUnit& nodeInfo, + bool MakeNodeInfo(DictUnit& nodeInfo, const string& word, double weight, const string& tag) { @@ -128,7 +128,7 @@ class DictTrie { nodeInfo.tag = tag; return true; } - bool makeUserNodeInfo_(DictUnit& nodeInfo, + bool MakeUserNodeInfo(DictUnit& nodeInfo, const string& word, const string& tag = UNKNOWN_TAG) { if(!TransCode::decode(word, nodeInfo.word)) { @@ -142,7 +142,7 @@ class DictTrie { nodeInfo.tag = tag; return true; } - void loadDict_(const string& filePath) { + void LoadDict(const string& filePath) { ifstream ifs(filePath.c_str()); if(!ifs.is_open()) { LogFatal("file %s open failed.", filePath.c_str()); @@ -156,21 +156,21 @@ class DictTrie { if(buf.size() != DICT_COLUMN_NUM) { LogFatal("split result illegal, line: %s, result size: %u", line.c_str(), buf.size()); } - makeNodeInfo(nodeInfo, + MakeNodeInfo(nodeInfo, buf[0], atof(buf[1].c_str()), buf[2]); staticNodeInfos_.push_back(nodeInfo); } } - double findMinWeight_(const vector& nodeInfos) const { + double FindMinWeight(const vector& nodeInfos) const { double ret = MAX_DOUBLE; for(size_t i = 0; i < nodeInfos.size(); i++) { ret = min(nodeInfos[i].weight, ret); } return ret; } - double findMaxWeight_(const vector& nodeInfos) const { + double FindMaxWeight(const vector& nodeInfos) const { double ret = MIN_DOUBLE; for(size_t i = 0; i < nodeInfos.size(); i++) { ret = max(nodeInfos[i].weight, ret); @@ -178,7 +178,7 @@ class DictTrie { return ret; } - void calculateWeight_(vector& nodeInfos) const { + void CalculateWeight(vector& nodeInfos) const { double sum = 0.0; for(size_t i = 0; i < nodeInfos.size(); i++) { sum += nodeInfos[i].weight; @@ -191,11 +191,10 @@ class DictTrie { } } - void shrink_(vector& units) const { + void Shrink(vector& units) const { vector(units.begin(), units.end()).swap(units); } - private: vector staticNodeInfos_; deque activeNodeInfos_; // must not be vector Trie * trie_; diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index 8adbafb..a354fae 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -30,15 +30,15 @@ class HMMSegment: public SegmentBase { while(right != end) { if(*right < 0x80) { if(left != right) { - cut_(left, right, res); + Cut(left, right, res); } left = right; do { - right = sequentialLetterRule_(left, end); + right = SequentialLetterRule(left, end); if(right != left) { break; } - right = numbersRule_(left, end); + right = NumbersRule(left, end); if(right != left) { break; } @@ -51,12 +51,12 @@ class HMMSegment: public SegmentBase { } } if(left != right) { - cut_(left, right, res); + Cut(left, right, res); } } private: // sequential letters rule - Unicode::const_iterator sequentialLetterRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const { + Unicode::const_iterator SequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const { Rune x = *begin; if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) { begin ++; @@ -74,7 +74,7 @@ class HMMSegment: public SegmentBase { return begin; } // - Unicode::const_iterator numbersRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const { + Unicode::const_iterator NumbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const { Rune x = *begin; if('0' <= x && x <= '9') { begin ++; @@ -91,9 +91,9 @@ class HMMSegment: public SegmentBase { } return begin; } - void cut_(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { vector status; - viterbi_(begin, end, status); + Viterbi(begin, end, status); Unicode::const_iterator left = begin; Unicode::const_iterator right; @@ -106,7 +106,7 @@ class HMMSegment: public SegmentBase { } } - void viterbi_(Unicode::const_iterator begin, + void Viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector& status) const { size_t Y = HMMModel::STATUS_SUM; @@ -160,7 +160,6 @@ class HMMSegment: public SegmentBase { } } - private: const HMMModel* model_; bool isNeedDestroy_; }; // class HMMSegment diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index 34c0096..7626313 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -17,16 +17,16 @@ class KeywordExtractor { const string& stopWordPath, const string& userDict = "") : segment_(dictPath, hmmFilePath, userDict) { - loadIdfDict_(idfPath); - loadStopWordDict_(stopWordPath); + LoadIdfDict(idfPath); + LoadStopWordDict(stopWordPath); } KeywordExtractor(const DictTrie* dictTrie, const HMMModel* model, const string& idfPath, const string& stopWordPath) : segment_(dictTrie, model){ - loadIdfDict_(idfPath); - loadStopWordDict_(stopWordPath); + LoadIdfDict(idfPath); + LoadStopWordDict(stopWordPath); } ~KeywordExtractor() { } @@ -51,7 +51,7 @@ class KeywordExtractor { map wordmap; for(vector::iterator iter = words.begin(); iter != words.end(); iter++) { - if(isSingleWord_(*iter)) { + if(IsSingleWord(*iter)) { continue; } wordmap[*iter] += 1.0; @@ -75,12 +75,12 @@ class KeywordExtractor { keywords.clear(); std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin())); topN = min(topN, keywords.size()); - partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), cmp_); + partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); keywords.resize(topN); return true; } private: - void loadIdfDict_(const string& idfPath) { + void LoadIdfDict(const string& idfPath) { ifstream ifs(idfPath.c_str()); if(!ifs.is_open()) { LogFatal("open %s failed.", idfPath.c_str()); @@ -111,7 +111,7 @@ class KeywordExtractor { idfAverage_ = idfSum / lineno; assert(idfAverage_ > 0.0); } - void loadStopWordDict_(const string& filePath) { + void LoadStopWordDict(const string& filePath) { ifstream ifs(filePath.c_str()); if(!ifs.is_open()) { LogFatal("open %s failed.", filePath.c_str()); @@ -123,7 +123,7 @@ class KeywordExtractor { assert(stopWords_.size()); } - bool isSingleWord_(const string& str) const { + bool IsSingleWord(const string& str) const { Unicode unicode; TransCode::decode(str, unicode); if(unicode.size() == 1) @@ -131,11 +131,10 @@ class KeywordExtractor { return false; } - static bool cmp_(const pair& lhs, const pair& rhs) { + static bool Compare(const pair& lhs, const pair& rhs) { return lhs.second > rhs.second; } - private: MixSegment segment_; unordered_map idfMap_; double idfAverage_; diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 48a8ea0..919b393 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -38,9 +38,9 @@ class MPSegment: public SegmentBase { dictTrie_->find(begin, end, dags); - calcDP_(dags); + CalcDP(dags); - cut_(dags, res); + Cut(dags, res); } void cut(Unicode::const_iterator begin, Unicode::const_iterator end, @@ -51,15 +51,15 @@ class MPSegment: public SegmentBase { end, dags, max_word_len); - calcDP_(dags); - cut_(dags, res); + CalcDP(dags); + Cut(dags, res); } const DictTrie* getDictTrie() const { return dictTrie_; } private: - void calcDP_(vector& dags) const { + void CalcDP(vector& dags) const { size_t nextPos; const DictUnit* p; double val; @@ -88,7 +88,7 @@ class MPSegment: public SegmentBase { } } } - void cut_(const vector& dags, + void Cut(const vector& dags, vector& res) const { size_t i = 0; while(i < dags.size()) { @@ -103,7 +103,6 @@ class MPSegment: public SegmentBase { } } - private: const DictTrie* dictTrie_; bool isNeedDestroy_; }; // class MPSegment diff --git a/src/PosTagger.hpp b/src/PosTagger.hpp index 39b4709..94b89b9 100644 --- a/src/PosTagger.hpp +++ b/src/PosTagger.hpp @@ -43,7 +43,7 @@ class PosTagger { } tmp = dict->find(unico.begin(), unico.end()); if(tmp == NULL || tmp->tag.empty()) { - res.push_back(make_pair(*itr, specialRule_(unico))); + res.push_back(make_pair(*itr, SpecialRule(unico))); } else { res.push_back(make_pair(*itr, tmp->tag)); } @@ -51,7 +51,7 @@ class PosTagger { return !res.empty(); } private: - const char* specialRule_(const Unicode& unicode) const { + const char* SpecialRule(const Unicode& unicode) const { size_t m = 0; size_t eng = 0; for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) { @@ -73,7 +73,7 @@ class PosTagger { // the ascii chars contain english letter return POS_ENG; } - private: + MixSegment segment_; }; // class PosTagger diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp index 531e0dd..c28f0a3 100644 --- a/src/SegmentBase.hpp +++ b/src/SegmentBase.hpp @@ -22,7 +22,7 @@ const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u}; class SegmentBase: public ISegment, public NonCopyable { public: SegmentBase() { - loadSpecialSymbols_(); + LoadSpecialSymbols(); }; virtual ~SegmentBase() { }; @@ -69,17 +69,18 @@ class SegmentBase: public ISegment, public NonCopyable { } } private: - void loadSpecialSymbols_() { + void LoadSpecialSymbols() { size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL); for(size_t i = 0; i < size; i ++) { specialSymbols_.insert(SPECIAL_SYMBOL[i]); } assert(specialSymbols_.size()); } - private: + unordered_set specialSymbols_; -}; -} +}; // class SegmentBase + +} // CppJieba #endif diff --git a/src/Trie.hpp b/src/Trie.hpp index 7311817..6052232 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -49,7 +49,7 @@ class Trie { public: static const size_t BASE_SIZE = (1 << (8 * (sizeof(TrieKey)))); Trie(const vector& keys, const vector& valuePointers) { - _createTrie(keys, valuePointers); + CreateTrie(keys, valuePointers); } ~Trie() { for (size_t i = 0; i < BASE_SIZE; i++) { @@ -57,7 +57,7 @@ class Trie { continue; } for (TrieNode::NextMap::iterator it = _base[i].next->begin(); it != _base[i].next->end(); it++) { - _deleteNode(it->second); + DeleteNode(it->second); it->second = NULL; } delete _base[i].next; @@ -143,7 +143,7 @@ class Trie { } private: - void _createTrie(const vector& keys, const vector& valuePointers) { + void CreateTrie(const vector& keys, const vector& valuePointers) { if (valuePointers.empty() || keys.empty()) { return; } @@ -154,14 +154,14 @@ class Trie { } } - void _deleteNode(TrieNode* node) { + void DeleteNode(TrieNode* node) { if (NULL == node) { return; } if (NULL != node->next) { TrieNode::NextMap::iterator it; for (it = node->next->begin(); it != node->next->end(); it++) { - _deleteNode(it->second); + DeleteNode(it->second); } delete node->next; node->next = NULL;