From e85a3ef8d35416fd38b273951e79cde510e5b9ac Mon Sep 17 00:00:00 2001 From: aholic Date: Sat, 25 Oct 2014 18:29:04 +0800 Subject: [PATCH 1/6] fix bug for map.erase --- src/KeywordExtractor.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index 36c7da0..c148145 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -74,7 +74,7 @@ namespace CppJieba { if(_stopWords.end() != _stopWords.find(itr->first)) { - wordmap.erase(itr++); + itr = wordmap.erase(itr); continue; } From 283c65db0a3b909b8792e71e948709e3c6cced3b Mon Sep 17 00:00:00 2001 From: aholic Date: Wed, 5 Nov 2014 11:13:00 +0800 Subject: [PATCH 2/6] fetch ahead --- src/KeywordExtractor.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index c148145..e24fd40 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -74,7 +74,7 @@ namespace CppJieba { if(_stopWords.end() != _stopWords.find(itr->first)) { - itr = wordmap.erase(itr); + wordmap.erase(itr); continue; } From d1a112c0c4ef3404d09c32086046a7464fbd6e5d Mon Sep 17 00:00:00 2001 From: aholic Date: Sun, 19 Apr 2015 21:44:50 +0800 Subject: [PATCH 3/6] improve efficiency for trie tree in ugly way --- src/DictTrie.hpp | 9 +- src/UglyTrie.hpp | 253 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 258 insertions(+), 4 deletions(-) create mode 100644 src/UglyTrie.hpp diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 15b04fb..350fcd4 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -11,7 +11,8 @@ #include "Limonp/StringUtil.hpp" #include "Limonp/Logger.hpp" #include "TransCode.hpp" -#include "Trie.hpp" +//#include "Trie.hpp" +#include "UglyTrie.hpp" @@ -87,7 +88,7 @@ namespace CppJieba private: - Trie * _createTrie(const vector& dictUnits) + UglyTrie * _createTrie(const vector& dictUnits) { assert(dictUnits.size()); vector words; @@ -98,7 +99,7 @@ namespace CppJieba valuePointers.push_back(&dictUnits[i]); } - Trie * trie = new Trie(words, valuePointers); + UglyTrie * trie = new UglyTrie(words, valuePointers); return trie; } void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) @@ -195,7 +196,7 @@ namespace CppJieba private: vector _nodeInfos; - Trie * _trie; + UglyTrie * _trie; double _minWeight; unordered_set _userDictSingleChineseWord; diff --git a/src/UglyTrie.hpp b/src/UglyTrie.hpp new file mode 100644 index 0000000..9b6bab6 --- /dev/null +++ b/src/UglyTrie.hpp @@ -0,0 +1,253 @@ +#ifndef CPPJIEBA_UGLY_TRIE_HPP +#define CPPJIEBA_UGLY_TRIE_HPP + +#include "Limonp/StdExtension.hpp" +#include +#include + +namespace CppJieba +{ + using namespace std; + + struct DictUnit + { + Unicode word; + double weight; + string tag; + }; + + // for debugging + inline ostream & operator << (ostream& os, const DictUnit& unit) + { + string s; + s << unit.word; + return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); + } + + typedef LocalVector > DagType; + + struct SegmentChar + { + uint16_t uniCh; + DagType dag; + const DictUnit * pInfo; + double weight; + size_t nextPos; + SegmentChar() : uniCh(), pInfo(NULL), weight(0.0), nextPos(0){} + ~SegmentChar() {} + }; + + typedef Unicode::value_type TrieKey; + + class TrieNode + { + public : + TrieNode(): next(NULL), ptValue(NULL) {} + public: + typedef unordered_map NextMap; + NextMap *next; + const DictUnit *ptValue; + }; + + class UglyTrie + { + public: + static const size_t BASE_SIZE = (1 << (8 * (sizeof(TrieKey)))); + public: + UglyTrie(const vector& keys, const vector& valuePointers) + { + _createTrie(keys, valuePointers); + } + const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const + { + if (begin == end) + { + return NULL; + } + + const TrieNode* ptNode = _base + (*(begin++)); + TrieNode::NextMap::const_iterator citer; + for (Unicode::const_iterator it = begin; it != end; it++) + { + if (NULL == ptNode->next) + { + return NULL; + } + citer = ptNode->next->find(*it); + if (ptNode->next->end() == citer) + { + return NULL; + } + ptNode = citer->second; + } + return ptNode->ptValue; + } + + void find( + Unicode::const_iterator begin, + Unicode::const_iterator end, + vector& res + ) const + { + res.resize(end - begin); + + const TrieNode *ptNode = NULL; + TrieNode::NextMap::const_iterator citer; + for (size_t i = 0; i < size_t(end - begin); i++) + { + Unicode::value_type ch = *(begin + i); + ptNode = _base + ch; + res[i].uniCh = ch; + assert(res[i].dag.empty()); + + res[i].dag.push_back(DagType::value_type(i, ptNode->ptValue)); + + for (size_t j = i + 1; j < size_t(end - begin); j++) + { + if (ptNode->next == NULL) + { + break; + } + citer = ptNode->next->find(*(begin + j)); + if (ptNode->next->end() == citer) + { + break; + } + ptNode = citer->second; + if (NULL != ptNode->ptValue) + { + res[i].dag.push_back(DagType::value_type(j, ptNode->ptValue)); + } + } + } + } + bool find( + Unicode::const_iterator begin, + Unicode::const_iterator end, + DagType & res, + size_t offset = 0) const + { + if (begin == end) + { + return !res.empty(); + } + + const TrieNode* ptNode = _base + (*(begin++)); + if (ptNode->ptValue != NULL && res.size() == 1) + { + res[0].second = ptNode->ptValue; + } + else if (ptNode->ptValue != NULL) + { + res.push_back(DagType::value_type(offset, ptNode->ptValue)); + } + + TrieNode::NextMap::const_iterator citer; + for (Unicode::const_iterator itr = begin; itr != end; itr++) + { + if (NULL == ptNode->next) + { + break; + } + citer = ptNode->next->find(*itr); + if (citer == ptNode->next->end()) + { + break; + } + ptNode = citer->second; + if (NULL != ptNode->ptValue) + { + res.push_back(DagType::value_type(itr - begin + offset, ptNode->ptValue)); + } + } + return !res.empty(); + } + ~UglyTrie() + { + for (size_t i = 0; i < BASE_SIZE; i++) + { + if (_base[i].next == NULL) + { + continue; + } + for (TrieNode::NextMap::iterator it = _base[i].next->begin(); it != _base[i].next->end(); it++) + { + _deleteNode(it->second); + it->second = NULL; + } + delete _base[i].next; + _base[i].next = NULL; + } + } + + private: + void _insertNode(const Unicode& key, const DictUnit* ptValue) + { + if (key.begin() == key.end()) + { + return; + } + + TrieNode::NextMap::const_iterator kmIter; + Unicode::const_iterator citer= key.begin(); + TrieNode *ptNode = _base + (*(citer++)); + for (; citer != key.end(); citer++) + { + if (NULL == ptNode->next) + { + ptNode->next = new TrieNode::NextMap; + } + kmIter = ptNode->next->find(*citer); + if (ptNode->next->end() == kmIter) + { + TrieNode *nextNode = new TrieNode; + + (*(ptNode->next))[*citer] = nextNode; + ptNode = nextNode; + } + else + { + ptNode = kmIter->second; + } + } + ptNode->ptValue = ptValue; + } + + void _createTrie(const vector& keys, const vector& valuePointers) + { + if (valuePointers.empty() || keys.empty()) + { + return; + } + assert(keys.size() == valuePointers.size()); + + for (size_t i = 0; i < keys.size(); i++) + { + _insertNode(keys[i], valuePointers[i]); + } + } + + void _deleteNode(TrieNode* node) + { + if (NULL == node) + { + return; + } + if (NULL != node->next) + { + TrieNode::NextMap::iterator it; + for (it = node->next->begin(); it != node->next->end(); it++) + { + _deleteNode(it->second); + } + delete node->next; + node->next = NULL; + } + delete node; + } + + TrieNode _base[BASE_SIZE]; + }; +} + +#endif From 931db7d1e5dbe6b613db7170e463ccd88b3a943a Mon Sep 17 00:00:00 2001 From: xuangong Date: Mon, 20 Jul 2015 23:54:20 +0800 Subject: [PATCH 4/6] astyle --- src/DictTrie.hpp | 321 +++++++++---------- src/FullSegment.hpp | 220 ++++++------- src/HMMSegment.hpp | 646 +++++++++++++++++---------------------- src/ISegment.hpp | 16 +- src/KeywordExtractor.hpp | 260 +++++++--------- 5 files changed, 656 insertions(+), 807 deletions(-) diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 350fcd4..40602bf 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -16,191 +16,164 @@ -namespace CppJieba -{ - using namespace Limonp; - const double MIN_DOUBLE = -3.14e+100; - const double MAX_DOUBLE = 3.14e+100; - const size_t DICT_COLUMN_NUM = 3; - const char* const UNKNOWN_TAG = ""; +namespace CppJieba { +using namespace Limonp; +const double MIN_DOUBLE = -3.14e+100; +const double MAX_DOUBLE = 3.14e+100; +const size_t DICT_COLUMN_NUM = 3; +const char* const UNKNOWN_TAG = ""; - class DictTrie - { - public: +class DictTrie { + public: - DictTrie() - { - _trie = NULL; - _minWeight = MAX_DOUBLE; - } - DictTrie(const string& dictPath, const string& userDictPath = "") - { - new (this) DictTrie(); - init(dictPath, userDictPath); - } - ~DictTrie() - { - if(_trie) - { - delete _trie; - } - } - - bool init(const string& dictPath, const string& userDictPath = "") - { - assert(!_trie); - _loadDict(dictPath); - _calculateWeight(_nodeInfos); - _minWeight = _findMinWeight(_nodeInfos); - - if(userDictPath.size()) - { - double maxWeight = _findMaxWeight(_nodeInfos); - _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG); - } - _shrink(_nodeInfos); - _trie = _createTrie(_nodeInfos); - assert(_trie); - return true; - } + DictTrie() { + _trie = NULL; + _minWeight = MAX_DOUBLE; + } + DictTrie(const string& dictPath, const string& userDictPath = "") { + new (this) DictTrie(); + init(dictPath, userDictPath); + } + ~DictTrie() { + if(_trie) { + delete _trie; + } + } - const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const - { - return _trie->find(begin, end); - } - bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const - { - return _trie->find(begin, end, dag, offset); - } - void find( - Unicode::const_iterator begin, - Unicode::const_iterator end, - vector& res - ) const - { - _trie->find(begin, end, res); - } - bool isUserDictSingleChineseWord(const Unicode::value_type& word) const - { - return isIn(_userDictSingleChineseWord, word); - } - double getMinWeight() const {return _minWeight;}; + bool init(const string& dictPath, const string& userDictPath = "") { + assert(!_trie); + _loadDict(dictPath); + _calculateWeight(_nodeInfos); + _minWeight = _findMinWeight(_nodeInfos); + + if(userDictPath.size()) { + double maxWeight = _findMaxWeight(_nodeInfos); + _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG); + } + _shrink(_nodeInfos); + _trie = _createTrie(_nodeInfos); + assert(_trie); + return true; + } + + const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const { + return _trie->find(begin, end); + } + bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const { + return _trie->find(begin, end, dag, offset); + } + void find( + Unicode::const_iterator begin, + Unicode::const_iterator end, + vector& res + ) const { + _trie->find(begin, end, res); + } + bool isUserDictSingleChineseWord(const Unicode::value_type& word) const { + return isIn(_userDictSingleChineseWord, word); + } + double getMinWeight() const { + return _minWeight; + }; - private: - UglyTrie * _createTrie(const vector& dictUnits) - { - assert(dictUnits.size()); - vector words; - vector valuePointers; - for(size_t i = 0 ; i < dictUnits.size(); i ++) - { - words.push_back(dictUnits[i].word); - valuePointers.push_back(&dictUnits[i]); - } + private: + UglyTrie * _createTrie(const vector& dictUnits) { + assert(dictUnits.size()); + vector words; + vector valuePointers; + for(size_t i = 0 ; i < dictUnits.size(); i ++) { + words.push_back(dictUnits[i].word); + valuePointers.push_back(&dictUnits[i]); + } - UglyTrie * trie = new UglyTrie(words, valuePointers); - return trie; - } - void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) - { - ifstream ifs(filePath.c_str()); - assert(ifs.is_open()); - string line; - DictUnit nodeInfo; - vector buf; - size_t lineno; - for(lineno = 0; getline(ifs, line); lineno++) - { - buf.clear(); - split(line, buf, " "); - assert(buf.size() >= 1); - if(!TransCode::decode(buf[0], nodeInfo.word)) - { - LogError("line[%u:%s] illegal.", lineno, line.c_str()); - continue; - } - if(nodeInfo.word.size() == 1) - { - _userDictSingleChineseWord.insert(nodeInfo.word[0]); - } - nodeInfo.weight = defaultWeight; - nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag); - _nodeInfos.push_back(nodeInfo); - } - LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno); - } - void _loadDict(const string& filePath) - { - ifstream ifs(filePath.c_str()); - assert(ifs.is_open()); - string line; - vector buf; + UglyTrie * trie = new UglyTrie(words, valuePointers); + return trie; + } + void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) { + ifstream ifs(filePath.c_str()); + assert(ifs.is_open()); + string line; + DictUnit nodeInfo; + vector buf; + size_t lineno; + for(lineno = 0; getline(ifs, line); lineno++) { + buf.clear(); + split(line, buf, " "); + assert(buf.size() >= 1); + if(!TransCode::decode(buf[0], nodeInfo.word)) { + LogError("line[%u:%s] illegal.", lineno, line.c_str()); + continue; + } + if(nodeInfo.word.size() == 1) { + _userDictSingleChineseWord.insert(nodeInfo.word[0]); + } + nodeInfo.weight = defaultWeight; + nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag); + _nodeInfos.push_back(nodeInfo); + } + LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno); + } + void _loadDict(const string& filePath) { + ifstream ifs(filePath.c_str()); + assert(ifs.is_open()); + string line; + vector buf; - DictUnit nodeInfo; - for(size_t lineno = 0 ; getline(ifs, line); lineno++) - { - split(line, buf, " "); - assert(buf.size() == DICT_COLUMN_NUM); - - if(!TransCode::decode(buf[0], nodeInfo.word)) - { - LogError("line[%u:%s] illegal.", lineno, line.c_str()); - continue; - } - nodeInfo.weight = atof(buf[1].c_str()); - nodeInfo.tag = buf[2]; - - _nodeInfos.push_back(nodeInfo); - } - } - double _findMinWeight(const vector& nodeInfos) const - { - double ret = MAX_DOUBLE; - for(size_t i = 0; i < nodeInfos.size(); i++) - { - ret = min(nodeInfos[i].weight, ret); - } - return ret; - } - double _findMaxWeight(const vector& nodeInfos) const - { - double ret = MIN_DOUBLE; - for(size_t i = 0; i < nodeInfos.size(); i++) - { - ret = max(nodeInfos[i].weight, ret); - } - return ret; - } + DictUnit nodeInfo; + for(size_t lineno = 0 ; getline(ifs, line); lineno++) { + split(line, buf, " "); + assert(buf.size() == DICT_COLUMN_NUM); - void _calculateWeight(vector& nodeInfos) const - { - double sum = 0.0; - for(size_t i = 0; i < nodeInfos.size(); i++) - { - sum += nodeInfos[i].weight; - } - assert(sum); - for(size_t i = 0; i < nodeInfos.size(); i++) - { - DictUnit& nodeInfo = nodeInfos[i]; - assert(nodeInfo.weight); - nodeInfo.weight = log(double(nodeInfo.weight)/double(sum)); - } - } + if(!TransCode::decode(buf[0], nodeInfo.word)) { + LogError("line[%u:%s] illegal.", lineno, line.c_str()); + continue; + } + nodeInfo.weight = atof(buf[1].c_str()); + nodeInfo.tag = buf[2]; - void _shrink(vector& units) const - { - vector(units.begin(), units.end()).swap(units); - } + _nodeInfos.push_back(nodeInfo); + } + } + double _findMinWeight(const vector& nodeInfos) const { + double ret = MAX_DOUBLE; + for(size_t i = 0; i < nodeInfos.size(); i++) { + ret = min(nodeInfos[i].weight, ret); + } + return ret; + } + double _findMaxWeight(const vector& nodeInfos) const { + double ret = MIN_DOUBLE; + for(size_t i = 0; i < nodeInfos.size(); i++) { + ret = max(nodeInfos[i].weight, ret); + } + return ret; + } - private: - vector _nodeInfos; - UglyTrie * _trie; + void _calculateWeight(vector& nodeInfos) const { + double sum = 0.0; + for(size_t i = 0; i < nodeInfos.size(); i++) { + sum += nodeInfos[i].weight; + } + assert(sum); + for(size_t i = 0; i < nodeInfos.size(); i++) { + DictUnit& nodeInfo = nodeInfos[i]; + assert(nodeInfo.weight); + nodeInfo.weight = log(double(nodeInfo.weight)/double(sum)); + } + } - double _minWeight; - unordered_set _userDictSingleChineseWord; - }; + void _shrink(vector& units) const { + vector(units.begin(), units.end()).swap(units); + } + + private: + vector _nodeInfos; + UglyTrie * _trie; + + double _minWeight; + unordered_set _userDictSingleChineseWord; +}; } #endif diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index 0a3e747..a8b60a1 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -10,140 +10,116 @@ #include "SegmentBase.hpp" #include "TransCode.hpp" -namespace CppJieba -{ - class FullSegment: public SegmentBase - { - public: - FullSegment() - { - _dictTrie = NULL; - _isBorrowed = false; - } - explicit FullSegment(const string& dictPath) - { - _dictTrie = NULL; - init(dictPath); - } - explicit FullSegment(const DictTrie* dictTrie) - { - _dictTrie = NULL; - init(dictTrie); - } - virtual ~FullSegment() - { - if(_dictTrie && ! _isBorrowed) - { - delete _dictTrie; - } +namespace CppJieba { +class FullSegment: public SegmentBase { + public: + FullSegment() { + _dictTrie = NULL; + _isBorrowed = false; + } + explicit FullSegment(const string& dictPath) { + _dictTrie = NULL; + init(dictPath); + } + explicit FullSegment(const DictTrie* dictTrie) { + _dictTrie = NULL; + init(dictTrie); + } + virtual ~FullSegment() { + if(_dictTrie && ! _isBorrowed) { + delete _dictTrie; + } - }; - bool init(const string& dictPath) - { - assert(_dictTrie == NULL); - _dictTrie = new DictTrie(dictPath); - _isBorrowed = false; - return true; - } - bool init(const DictTrie* dictTrie) - { - assert(_dictTrie == NULL); - assert(dictTrie); - _dictTrie = dictTrie; - _isBorrowed = true; - return true; - } + }; + bool init(const string& dictPath) { + assert(_dictTrie == NULL); + _dictTrie = new DictTrie(dictPath); + _isBorrowed = false; + return true; + } + bool init(const DictTrie* dictTrie) { + assert(_dictTrie == NULL); + assert(dictTrie); + _dictTrie = dictTrie; + _isBorrowed = true; + return true; + } - using SegmentBase::cut; - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const - { - assert(_dictTrie); - if (begin >= end) - { - LogError("begin >= end"); - return false; - } + using SegmentBase::cut; + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + assert(_dictTrie); + if (begin >= end) { + LogError("begin >= end"); + return false; + } - //resut of searching in trie tree - DagType tRes; + //resut of searching in trie tree + DagType tRes; - //max index of res's words - int maxIdx = 0; + //max index of res's words + int maxIdx = 0; - // always equals to (uItr - begin) - int uIdx = 0; + // always equals to (uItr - begin) + int uIdx = 0; - //tmp variables - int wordLen = 0; - for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) - { - //find word start from uItr - if (_dictTrie->find(uItr, end, tRes, 0)) - { - for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) - //for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) - { - wordLen = itr->second->word.size(); - if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) - { - res.push_back(itr->second->word); - } - maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; - } - tRes.clear(); - } - else // not found word start from uItr - { - if (maxIdx <= uIdx) // never exist in prev results - { - //put itr itself in res - res.push_back(Unicode(1, *uItr)); + //tmp variables + int wordLen = 0; + for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) { + //find word start from uItr + if (_dictTrie->find(uItr, end, tRes, 0)) { + for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) + //for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) + { + wordLen = itr->second->word.size(); + if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) { + res.push_back(itr->second->word); + } + maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; + } + tRes.clear(); + } else { // not found word start from uItr + if (maxIdx <= uIdx) { // never exist in prev results + //put itr itself in res + res.push_back(Unicode(1, *uItr)); - //mark it exits - ++maxIdx; - } - } - ++uIdx; - } + //mark it exits + ++maxIdx; + } + } + ++uIdx; + } - return true; - } + return true; + } - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const - { - assert(_dictTrie); - if (begin >= end) - { - LogError("begin >= end"); - return false; - } + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + assert(_dictTrie); + if (begin >= end) { + LogError("begin >= end"); + return false; + } - vector uRes; - if (!cut(begin, end, uRes)) - { - LogError("get unicode cut result error."); - return false; - } + vector uRes; + if (!cut(begin, end, uRes)) { + LogError("get unicode cut result error."); + return false; + } - string tmp; - for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) - { - if (TransCode::encode(*uItr, tmp)) - { - res.push_back(tmp); - } - else - { - LogError("encode failed."); - } - } + string tmp; + for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) { + if (TransCode::encode(*uItr, tmp)) { + res.push_back(tmp); + } else { + LogError("encode failed."); + } + } - return true; - } - private: - const DictTrie* _dictTrie; - bool _isBorrowed; - }; + return true; + } + private: + const DictTrie* _dictTrie; + bool _isBorrowed; +}; } #endif diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index d7c8c89..d000bce 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -12,387 +12,315 @@ #include "SegmentBase.hpp" #include "DictTrie.hpp" -namespace CppJieba -{ - using namespace Limonp; - typedef unordered_map EmitProbMap; - class HMMSegment: public SegmentBase - { - public: - /* - * STATUS: - * 0:B, 1:E, 2:M, 3:S - * */ - enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; +namespace CppJieba { +using namespace Limonp; +typedef unordered_map EmitProbMap; +class HMMSegment: public SegmentBase { + public: + /* + * STATUS: + * 0:B, 1:E, 2:M, 3:S + * */ + enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; - public: - HMMSegment(){} - explicit HMMSegment(const string& filePath) - { - LIMONP_CHECK(init(filePath)); - } - virtual ~HMMSegment(){} - public: - bool init(const string& filePath) - { - memset(_startProb, 0, sizeof(_startProb)); - memset(_transProb, 0, sizeof(_transProb)); - _statMap[0] = 'B'; - _statMap[1] = 'E'; - _statMap[2] = 'M'; - _statMap[3] = 'S'; - _emitProbVec.push_back(&_emitProbB); - _emitProbVec.push_back(&_emitProbE); - _emitProbVec.push_back(&_emitProbM); - _emitProbVec.push_back(&_emitProbS); - LIMONP_CHECK(_loadModel(filePath.c_str())); - LogInfo("HMMSegment init(%s) ok.", filePath.c_str()); - return true; - } - public: - using SegmentBase::cut; - public: - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const - { - Unicode::const_iterator left = begin; - Unicode::const_iterator right = begin; - while(right != end) - { - if(*right < 0x80) - { - if(left != right && !_cut(left, right, res)) - { - return false; - } - left = right; - do { - right = _sequentialLetterRule(left, end); - if(right != left) - { - break; - } - right = _numbersRule(left, end); - if(right != left) - { - break; - } - right ++; - } while(false); - res.push_back(Unicode(left, right)); - left = right; - } - else - { - right++; - } - } - if(left != right && !_cut(left, right, res)) - { - return false; - } - return true; - } - public: - virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const - { - if(begin == end) - { - return false; - } - vector words; - words.reserve(end - begin); - if(!cut(begin, end, words)) - { - return false; - } - size_t offset = res.size(); - res.resize(res.size() + words.size()); - for(size_t i = 0; i < words.size(); i++) - { - if(!TransCode::encode(words[i], res[offset + i])) - { - LogError("encode failed."); - } - } - return true; - } - private: - // sequential letters rule - Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const - { - Unicode::value_type x = *begin; - if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) - { - begin ++; - } - else - { - return begin; - } - while(begin != end) - { - x = *begin; - if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) - { - begin ++; - } - else - { - break; - } - } - return begin; - } - // - Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const - { - Unicode::value_type x = *begin; - if('0' <= x && x <= '9') - { - begin ++; - } - else - { - return begin; - } - while(begin != end) - { - x = *begin; - if( ('0' <= x && x <= '9') || x == '.') - { - begin++; - } - else - { - break; - } - } - return begin; - } - bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const - { - vector status; - if(!_viterbi(begin, end, status)) - { - LogError("_viterbi failed."); - return false; - } + public: + HMMSegment() {} + explicit HMMSegment(const string& filePath) { + LIMONP_CHECK(init(filePath)); + } + virtual ~HMMSegment() {} + public: + bool init(const string& filePath) { + memset(_startProb, 0, sizeof(_startProb)); + memset(_transProb, 0, sizeof(_transProb)); + _statMap[0] = 'B'; + _statMap[1] = 'E'; + _statMap[2] = 'M'; + _statMap[3] = 'S'; + _emitProbVec.push_back(&_emitProbB); + _emitProbVec.push_back(&_emitProbE); + _emitProbVec.push_back(&_emitProbM); + _emitProbVec.push_back(&_emitProbS); + LIMONP_CHECK(_loadModel(filePath.c_str())); + LogInfo("HMMSegment init(%s) ok.", filePath.c_str()); + return true; + } + public: + using SegmentBase::cut; + public: + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { + Unicode::const_iterator left = begin; + Unicode::const_iterator right = begin; + while(right != end) { + if(*right < 0x80) { + if(left != right && !_cut(left, right, res)) { + return false; + } + left = right; + do { + right = _sequentialLetterRule(left, end); + if(right != left) { + break; + } + right = _numbersRule(left, end); + if(right != left) { + break; + } + right ++; + } while(false); + res.push_back(Unicode(left, right)); + left = right; + } else { + right++; + } + } + if(left != right && !_cut(left, right, res)) { + return false; + } + return true; + } + public: + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { + if(begin == end) { + return false; + } + vector words; + words.reserve(end - begin); + if(!cut(begin, end, words)) { + return false; + } + size_t offset = res.size(); + res.resize(res.size() + words.size()); + for(size_t i = 0; i < words.size(); i++) { + if(!TransCode::encode(words[i], res[offset + i])) { + LogError("encode failed."); + } + } + return true; + } + private: + // sequential letters rule + Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const { + Unicode::value_type x = *begin; + if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) { + begin ++; + } else { + return begin; + } + while(begin != end) { + x = *begin; + if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) { + begin ++; + } else { + break; + } + } + return begin; + } + // + Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const { + Unicode::value_type x = *begin; + if('0' <= x && x <= '9') { + begin ++; + } else { + return begin; + } + while(begin != end) { + x = *begin; + if( ('0' <= x && x <= '9') || x == '.') { + begin++; + } else { + break; + } + } + return begin; + } + bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + vector status; + if(!_viterbi(begin, end, status)) { + LogError("_viterbi failed."); + return false; + } - Unicode::const_iterator left = begin; - Unicode::const_iterator right; - for(size_t i = 0; i < status.size(); i++) - { - if(status[i] % 2) //if(E == status[i] || S == status[i]) - { - right = begin + i + 1; - res.push_back(Unicode(left, right)); - left = right; - } - } - return true; - } + Unicode::const_iterator left = begin; + Unicode::const_iterator right; + for(size_t i = 0; i < status.size(); i++) { + if(status[i] % 2) { //if(E == status[i] || S == status[i]) + right = begin + i + 1; + res.push_back(Unicode(left, right)); + left = right; + } + } + return true; + } - bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector& status)const - { - if(begin == end) - { - return false; - } + bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector& status)const { + if(begin == end) { + return false; + } - size_t Y = STATUS_SUM; - size_t X = end - begin; + size_t Y = STATUS_SUM; + size_t X = end - begin; - size_t XYSize = X * Y; - size_t now, old, stat; - double tmp, endE, endS; + size_t XYSize = X * Y; + size_t now, old, stat; + double tmp, endE, endS; - vector path(XYSize); - vector weight(XYSize); + vector path(XYSize); + vector weight(XYSize); - //start - for(size_t y = 0; y < Y; y++) - { - weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); - path[0 + y * X] = -1; - } + //start + for(size_t y = 0; y < Y; y++) { + weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); + path[0 + y * X] = -1; + } - double emitProb; + double emitProb; - for(size_t x = 1; x < X; x++) - { - for(size_t y = 0; y < Y; y++) - { - now = x + y*X; - weight[now] = MIN_DOUBLE; - path[now] = E; // warning - emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); - for(size_t preY = 0; preY < Y; preY++) - { - old = x - 1 + preY * X; - tmp = weight[old] + _transProb[preY][y] + emitProb; - if(tmp > weight[now]) - { - weight[now] = tmp; - path[now] = preY; - } - } - } - } + for(size_t x = 1; x < X; x++) { + for(size_t y = 0; y < Y; y++) { + now = x + y*X; + weight[now] = MIN_DOUBLE; + path[now] = E; // warning + emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); + for(size_t preY = 0; preY < Y; preY++) { + old = x - 1 + preY * X; + tmp = weight[old] + _transProb[preY][y] + emitProb; + if(tmp > weight[now]) { + weight[now] = tmp; + path[now] = preY; + } + } + } + } - endE = weight[X-1+E*X]; - endS = weight[X-1+S*X]; - stat = 0; - if(endE >= endS) - { - stat = E; - } - else - { - stat = S; - } + endE = weight[X-1+E*X]; + endS = weight[X-1+S*X]; + stat = 0; + if(endE >= endS) { + stat = E; + } else { + stat = S; + } - status.resize(X); - for(int x = X -1 ; x >= 0; x--) - { - status[x] = stat; - stat = path[x + stat*X]; - } + status.resize(X); + for(int x = X -1 ; x >= 0; x--) { + status[x] = stat; + stat = path[x + stat*X]; + } - return true; - } - bool _loadModel(const char* const filePath) - { - ifstream ifile(filePath); - string line; - vector tmp; - vector tmp2; - //load _startProb - if(!_getLine(ifile, line)) - { - return false; - } - split(line, tmp, " "); - if(tmp.size() != STATUS_SUM) - { - LogError("start_p illegal"); - return false; - } - for(size_t j = 0; j< tmp.size(); j++) - { - _startProb[j] = atof(tmp[j].c_str()); - } + return true; + } + bool _loadModel(const char* const filePath) { + ifstream ifile(filePath); + string line; + vector tmp; + vector tmp2; + //load _startProb + if(!_getLine(ifile, line)) { + return false; + } + split(line, tmp, " "); + if(tmp.size() != STATUS_SUM) { + LogError("start_p illegal"); + return false; + } + for(size_t j = 0; j< tmp.size(); j++) { + _startProb[j] = atof(tmp[j].c_str()); + } - //load _transProb - for(size_t i = 0; i < STATUS_SUM; i++) - { - if(!_getLine(ifile, line)) - { - return false; - } - split(line, tmp, " "); - if(tmp.size() != STATUS_SUM) - { - LogError("trans_p illegal"); - return false; - } - for(size_t j =0; j < STATUS_SUM; j++) - { - _transProb[i][j] = atof(tmp[j].c_str()); - } - } + //load _transProb + for(size_t i = 0; i < STATUS_SUM; i++) { + if(!_getLine(ifile, line)) { + return false; + } + split(line, tmp, " "); + if(tmp.size() != STATUS_SUM) { + LogError("trans_p illegal"); + return false; + } + for(size_t j =0; j < STATUS_SUM; j++) { + _transProb[i][j] = atof(tmp[j].c_str()); + } + } - //load _emitProbB - if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) - { - return false; - } + //load _emitProbB + if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) { + return false; + } - //load _emitProbE - if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) - { - return false; - } + //load _emitProbE + if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) { + return false; + } - //load _emitProbM - if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) - { - return false; - } + //load _emitProbM + if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) { + return false; + } - //load _emitProbS - if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) - { - return false; - } + //load _emitProbS + if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) { + return false; + } - return true; - } - bool _getLine(ifstream& ifile, string& line) - { - while(getline(ifile, line)) - { - trim(line); - if(line.empty()) - { - continue; - } - if(startsWith(line, "#")) - { - continue; - } - return true; - } - return false; - } - bool _loadEmitProb(const string& line, EmitProbMap& mp) - { - if(line.empty()) - { - return false; - } - vector tmp, tmp2; - Unicode unicode; - split(line, tmp, ","); - for(size_t i = 0; i < tmp.size(); i++) - { - split(tmp[i], tmp2, ":"); - if(2 != tmp2.size()) - { - LogError("_emitProb illegal."); - return false; - } - if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) - { - LogError("TransCode failed."); - return false; - } - mp[unicode[0]] = atof(tmp2[1].c_str()); - } - return true; - } - double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const - { - EmitProbMap::const_iterator cit = ptMp->find(key); - if(cit == ptMp->end()) - { - return defVal; - } - return cit->second; + return true; + } + bool _getLine(ifstream& ifile, string& line) { + while(getline(ifile, line)) { + trim(line); + if(line.empty()) { + continue; + } + if(startsWith(line, "#")) { + continue; + } + return true; + } + return false; + } + bool _loadEmitProb(const string& line, EmitProbMap& mp) { + if(line.empty()) { + return false; + } + vector tmp, tmp2; + Unicode unicode; + split(line, tmp, ","); + for(size_t i = 0; i < tmp.size(); i++) { + split(tmp[i], tmp2, ":"); + if(2 != tmp2.size()) { + LogError("_emitProb illegal."); + return false; + } + if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) { + LogError("TransCode failed."); + return false; + } + mp[unicode[0]] = atof(tmp2[1].c_str()); + } + return true; + } + double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const { + EmitProbMap::const_iterator cit = ptMp->find(key); + if(cit == ptMp->end()) { + return defVal; + } + return cit->second; - } + } - private: - char _statMap[STATUS_SUM]; - double _startProb[STATUS_SUM]; - double _transProb[STATUS_SUM][STATUS_SUM]; - EmitProbMap _emitProbB; - EmitProbMap _emitProbE; - EmitProbMap _emitProbM; - EmitProbMap _emitProbS; - vector _emitProbVec; + private: + char _statMap[STATUS_SUM]; + double _startProb[STATUS_SUM]; + double _transProb[STATUS_SUM][STATUS_SUM]; + EmitProbMap _emitProbB; + EmitProbMap _emitProbE; + EmitProbMap _emitProbM; + EmitProbMap _emitProbS; + vector _emitProbVec; - }; +}; } #endif diff --git a/src/ISegment.hpp b/src/ISegment.hpp index 167e2f9..4faded5 100644 --- a/src/ISegment.hpp +++ b/src/ISegment.hpp @@ -2,15 +2,13 @@ #define CPPJIEBA_SEGMENTINTERFACE_H -namespace CppJieba -{ - class ISegment - { - public: - virtual ~ISegment(){}; - virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& res) const = 0; - virtual bool cut(const string& str, vector& res) const = 0; - }; +namespace CppJieba { +class ISegment { + public: + virtual ~ISegment() {}; + virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& res) const = 0; + virtual bool cut(const string& str, vector& res) const = 0; +}; } #endif diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index e3726d9..10f900e 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -5,162 +5,136 @@ #include #include -namespace CppJieba -{ - using namespace Limonp; +namespace CppJieba { +using namespace Limonp; - /*utf8*/ - class KeywordExtractor - { - public: - KeywordExtractor(){}; - KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") - { - init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict); - }; - ~KeywordExtractor(){}; +/*utf8*/ +class KeywordExtractor { + public: + KeywordExtractor() {}; + KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") { + init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict); + }; + ~KeywordExtractor() {}; - void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") - { - _loadIdfDict(idfPath); - _loadStopWordDict(stopWordPath); - LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict)); - }; + void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") { + _loadIdfDict(idfPath); + _loadStopWordDict(stopWordPath); + LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict)); + }; - bool extract(const string& str, vector& keywords, size_t topN) const - { - vector > topWords; - if(!extract(str, topWords, topN)) - { - return false; - } - for(size_t i = 0; i < topWords.size(); i++) - { - keywords.push_back(topWords[i].first); - } - return true; - } + bool extract(const string& str, vector& keywords, size_t topN) const { + vector > topWords; + if(!extract(str, topWords, topN)) { + return false; + } + for(size_t i = 0; i < topWords.size(); i++) { + keywords.push_back(topWords[i].first); + } + return true; + } - bool extract(const string& str, vector >& keywords, size_t topN) const - { - vector words; - if(!_segment.cut(str, words)) - { - LogError("segment cut(%s) failed.", str.c_str()); - return false; - } + bool extract(const string& str, vector >& keywords, size_t topN) const { + vector words; + if(!_segment.cut(str, words)) { + LogError("segment cut(%s) failed.", str.c_str()); + return false; + } - map wordmap; - for(vector::iterator iter = words.begin(); iter != words.end(); iter++) - { - if(_isSingleWord(*iter)) - { - continue; - } - wordmap[*iter] += 1.0; - } + map wordmap; + for(vector::iterator iter = words.begin(); iter != words.end(); iter++) { + if(_isSingleWord(*iter)) { + continue; + } + wordmap[*iter] += 1.0; + } - for(map::iterator itr = wordmap.begin(); itr != wordmap.end(); ) - { - if(_stopWords.end() != _stopWords.find(itr->first)) - { - wordmap.erase(itr); - continue; - } + for(map::iterator itr = wordmap.begin(); itr != wordmap.end(); ) { + if(_stopWords.end() != _stopWords.find(itr->first)) { + wordmap.erase(itr); + continue; + } - unordered_map::const_iterator cit = _idfMap.find(itr->first); - if(cit != _idfMap.end()) - { - itr->second *= cit->second; - } - else - { - itr->second *= _idfAverage; - } - itr ++; - } + unordered_map::const_iterator cit = _idfMap.find(itr->first); + if(cit != _idfMap.end()) { + itr->second *= cit->second; + } else { + itr->second *= _idfAverage; + } + itr ++; + } - keywords.clear(); - std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin())); - topN = min(topN, keywords.size()); - partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp); - keywords.resize(topN); - return true; - } - private: - void _loadIdfDict(const string& idfPath) - { - ifstream ifs(idfPath.c_str()); - if(!ifs) - { - LogError("open %s failed.", idfPath.c_str()); - assert(false); - } - string line ; - vector buf; - double idf = 0.0; - double idfSum = 0.0; - size_t lineno = 0; - for(;getline(ifs, line); lineno++) - { - buf.clear(); - if(line.empty()) - { - LogError("line[%d] empty. skipped.", lineno); - continue; - } - if(!split(line, buf, " ") || buf.size() != 2) - { - LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); - continue; - } - idf = atof(buf[1].c_str()); - _idfMap[buf[0]] = idf; - idfSum += idf; + keywords.clear(); + std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin())); + topN = min(topN, keywords.size()); + partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp); + keywords.resize(topN); + return true; + } + private: + void _loadIdfDict(const string& idfPath) { + ifstream ifs(idfPath.c_str()); + if(!ifs) { + LogError("open %s failed.", idfPath.c_str()); + assert(false); + } + string line ; + vector buf; + double idf = 0.0; + double idfSum = 0.0; + size_t lineno = 0; + for(; getline(ifs, line); lineno++) { + buf.clear(); + if(line.empty()) { + LogError("line[%d] empty. skipped.", lineno); + continue; + } + if(!split(line, buf, " ") || buf.size() != 2) { + LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); + continue; + } + idf = atof(buf[1].c_str()); + _idfMap[buf[0]] = idf; + idfSum += idf; - } + } - assert(lineno); - _idfAverage = idfSum / lineno; - assert(_idfAverage > 0.0); - } - void _loadStopWordDict(const string& filePath) - { - ifstream ifs(filePath.c_str()); - if(!ifs) - { - LogError("open %s failed.", filePath.c_str()); - assert(false); - } - string line ; - while(getline(ifs, line)) - { - _stopWords.insert(line); - } - assert(_stopWords.size()); - } + assert(lineno); + _idfAverage = idfSum / lineno; + assert(_idfAverage > 0.0); + } + void _loadStopWordDict(const string& filePath) { + ifstream ifs(filePath.c_str()); + if(!ifs) { + LogError("open %s failed.", filePath.c_str()); + assert(false); + } + string line ; + while(getline(ifs, line)) { + _stopWords.insert(line); + } + assert(_stopWords.size()); + } - bool _isSingleWord(const string& str) const - { - Unicode unicode; - TransCode::decode(str, unicode); - if(unicode.size() == 1) - return true; - return false; - } + bool _isSingleWord(const string& str) const { + Unicode unicode; + TransCode::decode(str, unicode); + if(unicode.size() == 1) + return true; + return false; + } - static bool _cmp(const pair& lhs, const pair& rhs) - { - return lhs.second > rhs.second; - } - - private: - MixSegment _segment; - unordered_map _idfMap; - double _idfAverage; + static bool _cmp(const pair& lhs, const pair& rhs) { + return lhs.second > rhs.second; + } - unordered_set _stopWords; - }; + private: + MixSegment _segment; + unordered_map _idfMap; + double _idfAverage; + + unordered_set _stopWords; +}; } #endif From cf9cc45c198c7e31c0e5b9dc350beba7bbbca957 Mon Sep 17 00:00:00 2001 From: xuangong Date: Tue, 21 Jul 2015 00:11:13 +0800 Subject: [PATCH 5/6] astyle --- src/UglyTrie.hpp | 407 +++++++++++++++++++++-------------------------- 1 file changed, 181 insertions(+), 226 deletions(-) diff --git a/src/UglyTrie.hpp b/src/UglyTrie.hpp index 9b6bab6..39a3e89 100644 --- a/src/UglyTrie.hpp +++ b/src/UglyTrie.hpp @@ -5,249 +5,204 @@ #include #include -namespace CppJieba -{ - using namespace std; +namespace CppJieba { +using namespace std; - struct DictUnit - { - Unicode word; - double weight; - string tag; - }; +struct DictUnit { + Unicode word; + double weight; + string tag; +}; - // for debugging - inline ostream & operator << (ostream& os, const DictUnit& unit) - { - string s; - s << unit.word; - return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); +// for debugging +inline ostream & operator << (ostream& os, const DictUnit& unit) { + string s; + s << unit.word; + return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); +} + +typedef LocalVector > DagType; + +struct SegmentChar { + uint16_t uniCh; + DagType dag; + const DictUnit * pInfo; + double weight; + size_t nextPos; + SegmentChar() : uniCh(), pInfo(NULL), weight(0.0), nextPos(0) {} + ~SegmentChar() {} +}; + +typedef Unicode::value_type TrieKey; + +class TrieNode { + public : + TrieNode(): next(NULL), ptValue(NULL) {} + public: + typedef unordered_map NextMap; + NextMap *next; + const DictUnit *ptValue; +}; + +class UglyTrie { + public: + static const size_t BASE_SIZE = (1 << (8 * (sizeof(TrieKey)))); + public: + UglyTrie(const vector& keys, const vector& valuePointers) { + _createTrie(keys, valuePointers); + } + const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const { + if (begin == end) { + return NULL; } - typedef LocalVector > DagType; + const TrieNode* ptNode = _base + (*(begin++)); + TrieNode::NextMap::const_iterator citer; + for (Unicode::const_iterator it = begin; it != end; it++) { + if (NULL == ptNode->next) { + return NULL; + } + citer = ptNode->next->find(*it); + if (ptNode->next->end() == citer) { + return NULL; + } + ptNode = citer->second; + } + return ptNode->ptValue; + } - struct SegmentChar - { - uint16_t uniCh; - DagType dag; - const DictUnit * pInfo; - double weight; - size_t nextPos; - SegmentChar() : uniCh(), pInfo(NULL), weight(0.0), nextPos(0){} - ~SegmentChar() {} - }; + void find( + Unicode::const_iterator begin, + Unicode::const_iterator end, + vector& res + ) const { + res.resize(end - begin); - typedef Unicode::value_type TrieKey; + const TrieNode *ptNode = NULL; + TrieNode::NextMap::const_iterator citer; + for (size_t i = 0; i < size_t(end - begin); i++) { + Unicode::value_type ch = *(begin + i); + ptNode = _base + ch; + res[i].uniCh = ch; + assert(res[i].dag.empty()); - class TrieNode - { - public : - TrieNode(): next(NULL), ptValue(NULL) {} - public: - typedef unordered_map NextMap; - NextMap *next; - const DictUnit *ptValue; - }; + res[i].dag.push_back(DagType::value_type(i, ptNode->ptValue)); - class UglyTrie - { - public: - static const size_t BASE_SIZE = (1 << (8 * (sizeof(TrieKey)))); - public: - UglyTrie(const vector& keys, const vector& valuePointers) - { - _createTrie(keys, valuePointers); - } - const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const - { - if (begin == end) - { - return NULL; - } + for (size_t j = i + 1; j < size_t(end - begin); j++) { + if (ptNode->next == NULL) { + break; + } + citer = ptNode->next->find(*(begin + j)); + if (ptNode->next->end() == citer) { + break; + } + ptNode = citer->second; + if (NULL != ptNode->ptValue) { + res[i].dag.push_back(DagType::value_type(j, ptNode->ptValue)); + } + } + } + } + bool find( + Unicode::const_iterator begin, + Unicode::const_iterator end, + DagType & res, + size_t offset = 0) const { + if (begin == end) { + return !res.empty(); + } - const TrieNode* ptNode = _base + (*(begin++)); - TrieNode::NextMap::const_iterator citer; - for (Unicode::const_iterator it = begin; it != end; it++) - { - if (NULL == ptNode->next) - { - return NULL; - } - citer = ptNode->next->find(*it); - if (ptNode->next->end() == citer) - { - return NULL; - } - ptNode = citer->second; - } - return ptNode->ptValue; - } + const TrieNode* ptNode = _base + (*(begin++)); + if (ptNode->ptValue != NULL && res.size() == 1) { + res[0].second = ptNode->ptValue; + } else if (ptNode->ptValue != NULL) { + res.push_back(DagType::value_type(offset, ptNode->ptValue)); + } - void find( - Unicode::const_iterator begin, - Unicode::const_iterator end, - vector& res - ) const - { - res.resize(end - begin); + TrieNode::NextMap::const_iterator citer; + for (Unicode::const_iterator itr = begin; itr != end; itr++) { + if (NULL == ptNode->next) { + break; + } + citer = ptNode->next->find(*itr); + if (citer == ptNode->next->end()) { + break; + } + ptNode = citer->second; + if (NULL != ptNode->ptValue) { + res.push_back(DagType::value_type(itr - begin + offset, ptNode->ptValue)); + } + } + return !res.empty(); + } + ~UglyTrie() { + for (size_t i = 0; i < BASE_SIZE; i++) { + if (_base[i].next == NULL) { + continue; + } + for (TrieNode::NextMap::iterator it = _base[i].next->begin(); it != _base[i].next->end(); it++) { + _deleteNode(it->second); + it->second = NULL; + } + delete _base[i].next; + _base[i].next = NULL; + } + } - const TrieNode *ptNode = NULL; - TrieNode::NextMap::const_iterator citer; - for (size_t i = 0; i < size_t(end - begin); i++) - { - Unicode::value_type ch = *(begin + i); - ptNode = _base + ch; - res[i].uniCh = ch; - assert(res[i].dag.empty()); + private: + void _insertNode(const Unicode& key, const DictUnit* ptValue) { + if (key.begin() == key.end()) { + return; + } - res[i].dag.push_back(DagType::value_type(i, ptNode->ptValue)); - - for (size_t j = i + 1; j < size_t(end - begin); j++) - { - if (ptNode->next == NULL) - { - break; - } - citer = ptNode->next->find(*(begin + j)); - if (ptNode->next->end() == citer) - { - break; - } - ptNode = citer->second; - if (NULL != ptNode->ptValue) - { - res[i].dag.push_back(DagType::value_type(j, ptNode->ptValue)); - } - } - } - } - bool find( - Unicode::const_iterator begin, - Unicode::const_iterator end, - DagType & res, - size_t offset = 0) const - { - if (begin == end) - { - return !res.empty(); - } + TrieNode::NextMap::const_iterator kmIter; + Unicode::const_iterator citer= key.begin(); + TrieNode *ptNode = _base + (*(citer++)); + for (; citer != key.end(); citer++) { + if (NULL == ptNode->next) { + ptNode->next = new TrieNode::NextMap; + } + kmIter = ptNode->next->find(*citer); + if (ptNode->next->end() == kmIter) { + TrieNode *nextNode = new TrieNode; - const TrieNode* ptNode = _base + (*(begin++)); - if (ptNode->ptValue != NULL && res.size() == 1) - { - res[0].second = ptNode->ptValue; - } - else if (ptNode->ptValue != NULL) - { - res.push_back(DagType::value_type(offset, ptNode->ptValue)); - } + (*(ptNode->next))[*citer] = nextNode; + ptNode = nextNode; + } else { + ptNode = kmIter->second; + } + } + ptNode->ptValue = ptValue; + } - TrieNode::NextMap::const_iterator citer; - for (Unicode::const_iterator itr = begin; itr != end; itr++) - { - if (NULL == ptNode->next) - { - break; - } - citer = ptNode->next->find(*itr); - if (citer == ptNode->next->end()) - { - break; - } - ptNode = citer->second; - if (NULL != ptNode->ptValue) - { - res.push_back(DagType::value_type(itr - begin + offset, ptNode->ptValue)); - } - } - return !res.empty(); - } - ~UglyTrie() - { - for (size_t i = 0; i < BASE_SIZE; i++) - { - if (_base[i].next == NULL) - { - continue; - } - for (TrieNode::NextMap::iterator it = _base[i].next->begin(); it != _base[i].next->end(); it++) - { - _deleteNode(it->second); - it->second = NULL; - } - delete _base[i].next; - _base[i].next = NULL; - } - } + void _createTrie(const vector& keys, const vector& valuePointers) { + if (valuePointers.empty() || keys.empty()) { + return; + } + assert(keys.size() == valuePointers.size()); - private: - void _insertNode(const Unicode& key, const DictUnit* ptValue) - { - if (key.begin() == key.end()) - { - return; - } + for (size_t i = 0; i < keys.size(); i++) { + _insertNode(keys[i], valuePointers[i]); + } + } - TrieNode::NextMap::const_iterator kmIter; - Unicode::const_iterator citer= key.begin(); - TrieNode *ptNode = _base + (*(citer++)); - for (; citer != key.end(); citer++) - { - if (NULL == ptNode->next) - { - ptNode->next = new TrieNode::NextMap; - } - kmIter = ptNode->next->find(*citer); - if (ptNode->next->end() == kmIter) - { - TrieNode *nextNode = new TrieNode; + void _deleteNode(TrieNode* node) { + if (NULL == node) { + return; + } + if (NULL != node->next) { + TrieNode::NextMap::iterator it; + for (it = node->next->begin(); it != node->next->end(); it++) { + _deleteNode(it->second); + } + delete node->next; + node->next = NULL; + } + delete node; + } - (*(ptNode->next))[*citer] = nextNode; - ptNode = nextNode; - } - else - { - ptNode = kmIter->second; - } - } - ptNode->ptValue = ptValue; - } - - void _createTrie(const vector& keys, const vector& valuePointers) - { - if (valuePointers.empty() || keys.empty()) - { - return; - } - assert(keys.size() == valuePointers.size()); - - for (size_t i = 0; i < keys.size(); i++) - { - _insertNode(keys[i], valuePointers[i]); - } - } - - void _deleteNode(TrieNode* node) - { - if (NULL == node) - { - return; - } - if (NULL != node->next) - { - TrieNode::NextMap::iterator it; - for (it = node->next->begin(); it != node->next->end(); it++) - { - _deleteNode(it->second); - } - delete node->next; - node->next = NULL; - } - delete node; - } - - TrieNode _base[BASE_SIZE]; - }; + TrieNode _base[BASE_SIZE]; +}; } #endif From f5e74a3f4615885d355724adf44b53849e7ac1b2 Mon Sep 17 00:00:00 2001 From: aholic Date: Tue, 21 Jul 2015 00:29:49 +0800 Subject: [PATCH 6/6] replace old trie --- src/Trie.hpp | 253 +++++++++++++++++++---------------------------- src/UglyTrie.hpp | 208 -------------------------------------- 2 files changed, 102 insertions(+), 359 deletions(-) delete mode 100644 src/UglyTrie.hpp diff --git a/src/Trie.hpp b/src/Trie.hpp index eb20b36..3afda9b 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -29,228 +29,179 @@ struct SegmentChar { const DictUnit * pInfo; double weight; size_t nextPos; - SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) { - } - ~SegmentChar() { - } + SegmentChar() : uniCh(), pInfo(NULL), weight(0.0), nextPos(0) {} + ~SegmentChar() {} }; typedef Unicode::value_type TrieKey; class TrieNode { + public : + TrieNode(): next(NULL), ptValue(NULL) {} public: - TrieNode(): fail(NULL), next(NULL), ptValue(NULL) { - } - const TrieNode * findNext(TrieKey key) const { - if(next == NULL) { - return NULL; - } - NextMap::const_iterator iter = next->find(key); - if(iter == next->end()) { - return NULL; - } - return iter->second; - } - public: - typedef unordered_map NextMap; - TrieNode * fail; - NextMap * next; - const DictUnit * ptValue; + typedef unordered_map NextMap; + NextMap *next; + const DictUnit *ptValue; }; class Trie { public: - Trie(const vector& keys, const vector & valuePointers) { - root_ = new TrieNode; - createTrie_(keys, valuePointers); - build_();// build automation - } - ~Trie() { - if(root_) { - deleteNode_(root_); - } - } + static const size_t BASE_SIZE = (1 << (8 * (sizeof(TrieKey)))); public: + Trie(const vector& keys, const vector& valuePointers) { + _createTrie(keys, valuePointers); + } const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const { + if (begin == end) { + return NULL; + } + + const TrieNode* ptNode = _base + (*(begin++)); TrieNode::NextMap::const_iterator citer; - const TrieNode* ptNode = root_; - for(Unicode::const_iterator it = begin; it != end; it++) { - // build automation - assert(ptNode); - if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it))) { + for (Unicode::const_iterator it = begin; it != end; it++) { + if (NULL == ptNode->next) { + return NULL; + } + citer = ptNode->next->find(*it); + if (ptNode->next->end() == citer) { return NULL; } ptNode = citer->second; } return ptNode->ptValue; } - // aho-corasick-automation - void find(Unicode::const_iterator begin, + + void find( + Unicode::const_iterator begin, Unicode::const_iterator end, - vector& res) const { + vector& res + ) const { res.resize(end - begin); - const TrieNode* now = root_; - const TrieNode* node; - // compiler will complain warnings if only "i < end - begin" . + + const TrieNode *ptNode = NULL; + TrieNode::NextMap::const_iterator citer; for (size_t i = 0; i < size_t(end - begin); i++) { Unicode::value_type ch = *(begin + i); + ptNode = _base + ch; res[i].uniCh = ch; assert(res[i].dag.empty()); - res[i].dag.push_back(pair::size_type, const DictUnit* >(i, (const DictUnit*)NULL)); - bool flag = false; - // rollback - while( now != root_ ) { - node = now->findNext(ch); - if (node != NULL) { - flag = true; + res[i].dag.push_back(DagType::value_type(i, ptNode->ptValue)); + + for (size_t j = i + 1; j < size_t(end - begin); j++) { + if (ptNode->next == NULL) { break; - } else { - now = now->fail; } - } - - if(!flag) { - node = now->findNext(ch); - } - if(node == NULL) { - now = root_; - } else { - now = node; - const TrieNode * temp = now; - while(temp != root_) { - if (temp->ptValue) { - size_t pos = i - temp->ptValue->word.size() + 1; - res[pos].dag.push_back(pair::size_type, const DictUnit* >(i, temp->ptValue)); - if(pos == i) { - res[pos].dag[0].second = temp->ptValue; - } - } - temp = temp->fail; - assert(temp); + citer = ptNode->next->find(*(begin + j)); + if (ptNode->next->end() == citer) { + break; + } + ptNode = citer->second; + if (NULL != ptNode->ptValue) { + res[i].dag.push_back(DagType::value_type(j, ptNode->ptValue)); } } } } - bool find(Unicode::const_iterator begin, + bool find( + Unicode::const_iterator begin, Unicode::const_iterator end, DagType & res, size_t offset = 0) const { - const TrieNode * ptNode = root_; + if (begin == end) { + return !res.empty(); + } + + const TrieNode* ptNode = _base + (*(begin++)); + if (ptNode->ptValue != NULL && res.size() == 1) { + res[0].second = ptNode->ptValue; + } else if (ptNode->ptValue != NULL) { + res.push_back(DagType::value_type(offset, ptNode->ptValue)); + } + TrieNode::NextMap::const_iterator citer; - for(Unicode::const_iterator itr = begin; itr != end ; itr++) { - assert(ptNode); - if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr))) { + for (Unicode::const_iterator itr = begin; itr != end; itr++) { + if (NULL == ptNode->next) { + break; + } + citer = ptNode->next->find(*itr); + if (citer == ptNode->next->end()) { break; } ptNode = citer->second; - if(ptNode->ptValue) { - if(itr == begin && res.size() == 1) { // first singleword - res[0].second = ptNode->ptValue; - } else { - res.push_back(pair::size_type, const DictUnit* >(itr - begin + offset, ptNode->ptValue)); - } + if (NULL != ptNode->ptValue) { + res.push_back(DagType::value_type(itr - begin + offset, ptNode->ptValue)); } } return !res.empty(); } - void insertNode(const Unicode& key, const DictUnit* ptValue) { - TrieNode* newAddedNode = insertNode_(key, ptValue); - if (newAddedNode) { - build_(newAddedNode); - } - } - private: - void build_() { - assert(root_->ptValue == NULL); - assert(root_->next); - root_->fail = NULL; - for(TrieNode::NextMap::iterator iter = root_->next->begin(); iter != root_->next->end(); iter++) { - build_(iter->second); - } - } - void build_(TrieNode* node) { - node->fail = root_; - queue que; - que.push(node); - TrieNode* back = NULL; - TrieNode::NextMap::iterator backiter; - while(!que.empty()) { - TrieNode * now = que.front(); - que.pop(); - if(now->next == NULL) { + ~Trie() { + for (size_t i = 0; i < BASE_SIZE; i++) { + if (_base[i].next == NULL) { continue; } - for(TrieNode::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) { - back = now->fail; - while(back != NULL) { - if(back->next && (backiter = back->next->find(iter->first)) != back->next->end()) { - iter->second->fail = backiter->second; - break; - } - back = back->fail; - } - if(back == NULL) { - iter->second->fail = root_; - } - que.push(iter->second); + for (TrieNode::NextMap::iterator it = _base[i].next->begin(); it != _base[i].next->end(); it++) { + _deleteNode(it->second); + it->second = NULL; } + delete _base[i].next; + _base[i].next = NULL; } } - void createTrie_(const vector& keys, - const vector & valuePointers) { - if(valuePointers.empty() || keys.empty()) { + + void insertNode(const Unicode& key, const DictUnit* ptValue) { + if (key.begin() == key.end()) { return; } - assert(keys.size() == valuePointers.size()); - - for(size_t i = 0; i < keys.size(); i++) { - insertNode_(keys[i], valuePointers[i]); - } - } - TrieNode* insertNode_(const Unicode& key, const DictUnit* ptValue) { - TrieNode* ptNode = root_; - TrieNode* newAddedNode = NULL; TrieNode::NextMap::const_iterator kmIter; - - for(Unicode::const_iterator citer = key.begin(); citer != key.end(); citer++) { - if(NULL == ptNode->next) { + Unicode::const_iterator citer= key.begin(); + TrieNode *ptNode = _base + (*(citer++)); + for (; citer != key.end(); citer++) { + if (NULL == ptNode->next) { ptNode->next = new TrieNode::NextMap; } kmIter = ptNode->next->find(*citer); - if(ptNode->next->end() == kmIter) { - TrieNode * nextNode = new TrieNode; - nextNode->next = NULL; - nextNode->ptValue = NULL; + if (ptNode->next->end() == kmIter) { + TrieNode *nextNode = new TrieNode; - if(newAddedNode == NULL) { - newAddedNode = nextNode; - } - (*ptNode->next)[*citer] = nextNode; + (*(ptNode->next))[*citer] = nextNode; ptNode = nextNode; } else { ptNode = kmIter->second; } } ptNode->ptValue = ptValue; - return newAddedNode; } - void deleteNode_(TrieNode* node) { - if(!node) { + + private: + void _createTrie(const vector& keys, const vector& valuePointers) { + if (valuePointers.empty() || keys.empty()) { return; } - if(node->next) { + assert(keys.size() == valuePointers.size()); + + for (size_t i = 0; i < keys.size(); i++) { + insertNode(keys[i], valuePointers[i]); + } + } + + void _deleteNode(TrieNode* node) { + if (NULL == node) { + return; + } + if (NULL != node->next) { TrieNode::NextMap::iterator it; - for(it = node->next->begin(); it != node->next->end(); it++) { - deleteNode_(it->second); + for (it = node->next->begin(); it != node->next->end(); it++) { + _deleteNode(it->second); } delete node->next; + node->next = NULL; } delete node; } - private: - TrieNode* root_; + + TrieNode _base[BASE_SIZE]; }; } diff --git a/src/UglyTrie.hpp b/src/UglyTrie.hpp deleted file mode 100644 index 39a3e89..0000000 --- a/src/UglyTrie.hpp +++ /dev/null @@ -1,208 +0,0 @@ -#ifndef CPPJIEBA_UGLY_TRIE_HPP -#define CPPJIEBA_UGLY_TRIE_HPP - -#include "Limonp/StdExtension.hpp" -#include -#include - -namespace CppJieba { -using namespace std; - -struct DictUnit { - Unicode word; - double weight; - string tag; -}; - -// for debugging -inline ostream & operator << (ostream& os, const DictUnit& unit) { - string s; - s << unit.word; - return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); -} - -typedef LocalVector > DagType; - -struct SegmentChar { - uint16_t uniCh; - DagType dag; - const DictUnit * pInfo; - double weight; - size_t nextPos; - SegmentChar() : uniCh(), pInfo(NULL), weight(0.0), nextPos(0) {} - ~SegmentChar() {} -}; - -typedef Unicode::value_type TrieKey; - -class TrieNode { - public : - TrieNode(): next(NULL), ptValue(NULL) {} - public: - typedef unordered_map NextMap; - NextMap *next; - const DictUnit *ptValue; -}; - -class UglyTrie { - public: - static const size_t BASE_SIZE = (1 << (8 * (sizeof(TrieKey)))); - public: - UglyTrie(const vector& keys, const vector& valuePointers) { - _createTrie(keys, valuePointers); - } - const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const { - if (begin == end) { - return NULL; - } - - const TrieNode* ptNode = _base + (*(begin++)); - TrieNode::NextMap::const_iterator citer; - for (Unicode::const_iterator it = begin; it != end; it++) { - if (NULL == ptNode->next) { - return NULL; - } - citer = ptNode->next->find(*it); - if (ptNode->next->end() == citer) { - return NULL; - } - ptNode = citer->second; - } - return ptNode->ptValue; - } - - void find( - Unicode::const_iterator begin, - Unicode::const_iterator end, - vector& res - ) const { - res.resize(end - begin); - - const TrieNode *ptNode = NULL; - TrieNode::NextMap::const_iterator citer; - for (size_t i = 0; i < size_t(end - begin); i++) { - Unicode::value_type ch = *(begin + i); - ptNode = _base + ch; - res[i].uniCh = ch; - assert(res[i].dag.empty()); - - res[i].dag.push_back(DagType::value_type(i, ptNode->ptValue)); - - for (size_t j = i + 1; j < size_t(end - begin); j++) { - if (ptNode->next == NULL) { - break; - } - citer = ptNode->next->find(*(begin + j)); - if (ptNode->next->end() == citer) { - break; - } - ptNode = citer->second; - if (NULL != ptNode->ptValue) { - res[i].dag.push_back(DagType::value_type(j, ptNode->ptValue)); - } - } - } - } - bool find( - Unicode::const_iterator begin, - Unicode::const_iterator end, - DagType & res, - size_t offset = 0) const { - if (begin == end) { - return !res.empty(); - } - - const TrieNode* ptNode = _base + (*(begin++)); - if (ptNode->ptValue != NULL && res.size() == 1) { - res[0].second = ptNode->ptValue; - } else if (ptNode->ptValue != NULL) { - res.push_back(DagType::value_type(offset, ptNode->ptValue)); - } - - TrieNode::NextMap::const_iterator citer; - for (Unicode::const_iterator itr = begin; itr != end; itr++) { - if (NULL == ptNode->next) { - break; - } - citer = ptNode->next->find(*itr); - if (citer == ptNode->next->end()) { - break; - } - ptNode = citer->second; - if (NULL != ptNode->ptValue) { - res.push_back(DagType::value_type(itr - begin + offset, ptNode->ptValue)); - } - } - return !res.empty(); - } - ~UglyTrie() { - for (size_t i = 0; i < BASE_SIZE; i++) { - if (_base[i].next == NULL) { - continue; - } - for (TrieNode::NextMap::iterator it = _base[i].next->begin(); it != _base[i].next->end(); it++) { - _deleteNode(it->second); - it->second = NULL; - } - delete _base[i].next; - _base[i].next = NULL; - } - } - - private: - void _insertNode(const Unicode& key, const DictUnit* ptValue) { - if (key.begin() == key.end()) { - return; - } - - TrieNode::NextMap::const_iterator kmIter; - Unicode::const_iterator citer= key.begin(); - TrieNode *ptNode = _base + (*(citer++)); - for (; citer != key.end(); citer++) { - if (NULL == ptNode->next) { - ptNode->next = new TrieNode::NextMap; - } - kmIter = ptNode->next->find(*citer); - if (ptNode->next->end() == kmIter) { - TrieNode *nextNode = new TrieNode; - - (*(ptNode->next))[*citer] = nextNode; - ptNode = nextNode; - } else { - ptNode = kmIter->second; - } - } - ptNode->ptValue = ptValue; - } - - void _createTrie(const vector& keys, const vector& valuePointers) { - if (valuePointers.empty() || keys.empty()) { - return; - } - assert(keys.size() == valuePointers.size()); - - for (size_t i = 0; i < keys.size(); i++) { - _insertNode(keys[i], valuePointers[i]); - } - } - - void _deleteNode(TrieNode* node) { - if (NULL == node) { - return; - } - if (NULL != node->next) { - TrieNode::NextMap::iterator it; - for (it = node->next->begin(); it != node->next->end(); it++) { - _deleteNode(it->second); - } - delete node->next; - node->next = NULL; - } - delete node; - } - - TrieNode _base[BASE_SIZE]; -}; -} - -#endif