From f70b654b66bba031a1e7b87fe9c7fc5febe49738 Mon Sep 17 00:00:00 2001 From: wyy Date: Thu, 10 Apr 2014 21:05:01 +0800 Subject: [PATCH 1/5] split Trie.hpp into (Trie.hpp & DictTrie.hpp) --- src/DictTrie.hpp | 145 ++++++++++++++++++++++ src/Trie.hpp | 316 +++++++++++++++-------------------------------- 2 files changed, 248 insertions(+), 213 deletions(-) create mode 100644 src/DictTrie.hpp diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp new file mode 100644 index 0000000..4f5700a --- /dev/null +++ b/src/DictTrie.hpp @@ -0,0 +1,145 @@ +#ifndef CPPJIEBA_DICT_TRIE_HPP +#define CPPJIEBA_DICT_TRIE_HPP + +#include +#include +#include +#include +#include +#include +#include +#include "Limonp/str_functs.hpp" +#include "Limonp/logger.hpp" +#include "Limonp/InitOnOff.hpp" +#include "TransCode.hpp" +#include "Trie.hpp" + + + +namespace CppJieba +{ + using namespace Limonp; + const double MIN_DOUBLE = -3.14e+100; + const double MAX_DOUBLE = 3.14e+100; + const size_t DICT_COLUMN_NUM = 3; + + + struct DictUnit + { + Unicode word; + size_t freq; + string tag; + double logFreq; //logFreq = log(freq/sum(freq)); + }; + + typedef map DagType; + + class DictTrie: InitOnOff + { + + private: + DictTrieNode* _root; + vector _nodeInfos; + + int64_t _freqSum; + double _minLogFreq; + + public: + DictTrie() + { + _root = new DictTrieNode; + _root.ptKeyMap = NULL; + _root.offset = 0; + _freqSum = 0; + _minLogFreq = MAX_DOUBLE; + _setInitFlag(false); + } + DictTrie(const string& filePath) + { + new (this) DictTrie(); + _setInitFlag(init(filePath)); + } + ~DictTrie() + { + _deleteNode(_root); + } + private: + + + public: + bool init(const string& filePath) + { + assert(!_getInitFlag()); + _loadDict(filePath, _nodeInfos); + _createDictTrie(_nodeInfos, _root); + _freqSum = _calculateFreqSum(_nodeInfos); + assert(_freqSum); + _minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum); + return _setInitFlag(true); + } + + + public: + double getMinLogFreq() const {return _minLogFreq;}; + + private: + void _loadDict(const string& filePath, vector& nodeInfos) const + { + ifstream ifs(filePath.c_str()); + if(!ifs) + { + LogFatal("open %s failed.", filePath.c_str()); + exit(1); + } + string line; + vector buf; + + nodeInfos.clear(); + DictUnit nodeInfo; + for(size_t lineno = 0 ; getline(ifs, line); lineno++) + { + split(line, buf, " "); + assert(buf.size() == DICT_COLUMN_NUM); + + if(!TransCode::decode(buf[0], nodeInfo.word)) + { + LogError("line[%u:%s] illegal.", lineno, line.c_str()); + continue; + } + nodeInfo.freq = atoi(buf[1].c_str()); + nodeInfo.tag = buf[2]; + + nodeInfos.push_back(nodeInfo); + } + } + size_t _calculateFreqSum(const vector& nodeInfos) const + { + size_t freqSum = 0; + for(size_t i = 0; i < nodeInfos.size(); i++) + { + freqSum += nodeInfos[i].freq; + } + return freqSum; + } + double _calculateLogFreqAndGetMinValue(vector& nodeInfos, size_t freqSum) const + { + assert(freqSum); + double minLogFreq = MAX_DOUBLE; + for(size_t i = 0; i < nodeInfos.size(); i++) + { + DictUnit& nodeInfo = nodeInfos[i]; + assert(nodeInfo.freq); + nodeInfo.logFreq = log(double(nodeInfo.freq)/double(freqSum)); + if(minLogFreq > nodeInfo.logFreq) + { + minLogFreq = nodeInfo.logFreq; + } + } + return minLogFreq; + } + + + }; +} + +#endif diff --git a/src/Trie.hpp b/src/Trie.hpp index 714b2c6..8a9080d 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -1,241 +1,131 @@ -/************************************ - * file enc : ASCII - * author : wuyanyi09@gmail.com - ************************************/ -#ifndef CPPJIEBA_TRIE_H -#define CPPJIEBA_TRIE_H - -#include -#include -#include -#include -#include -#include -#include -#include "Limonp/str_functs.hpp" -#include "Limonp/logger.hpp" -#include "Limonp/InitOnOff.hpp" -#include "TransCode.hpp" - +#ifndef CPPJIEBA_TRIE_HPP +#define CPPJIEBA_TRIE_HPP +#include "Limonp/std_outbound.hpp" +#include namespace CppJieba { - using namespace Limonp; - const double MIN_DOUBLE = -3.14e+100; - const double MAX_DOUBLE = 3.14e+100; - const size_t DICT_COLUMN_NUM = 3; - typedef unordered_map TrieNodeMap; - struct TrieNodeInfo; - struct TrieNode - { - TrieNodeMap hmap; - const TrieNodeInfo * ptTrieNodeInfo; - TrieNode(): ptTrieNodeInfo(NULL) - {} - }; + template + class TrieNode + { + public: + typedef unordered_map TrieNodeMapType; + public: + TrieNodeMap * ptKeyMap; + const ValueType * ptValue; + }; - struct TrieNodeInfo - { - Unicode word; - size_t freq; - string tag; - double logFreq; //logFreq = log(freq/sum(freq)); - }; - - inline ostream& operator << (ostream& os, const TrieNodeInfo & nodeInfo) - { - return os << nodeInfo.word << ":" << nodeInfo.freq << ":" << nodeInfo.tag << ":" << nodeInfo.logFreq ; - } - - typedef map DagType; - - class Trie: public InitOnOff - { - - private: - TrieNode* _root; - vector _nodeInfos; - - int64_t _freqSum; - double _minLogFreq; - - public: - Trie() - { - _root = new TrieNode; - _freqSum = 0; - _minLogFreq = MAX_DOUBLE; - _setInitFlag(false); - } - Trie(const string& filePath) - { - new (this) Trie(); - _setInitFlag(init(filePath)); - } - ~Trie() - { - _deleteNode(_root); - } - private: - - - public: - bool init(const string& filePath) - { - assert(!_getInitFlag()); - _loadDict(filePath, _nodeInfos); - _createTrie(_nodeInfos, _root); - _freqSum = _calculateFreqSum(_nodeInfos); - assert(_freqSum); - _minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum); - return _setInitFlag(true); - } - - public: - const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const - { - TrieNodeMap::const_iterator citer; - const TrieNode* p = _root; - for(Unicode::const_iterator it = begin; it != end; it++) + template + class Trie + { + private: + TrieNode* _root; + private: + public: + Trie(const vector& keys, const vector& valuePointers) { - citer = p->hmap.find(*it); - if(p->hmap.end() == citer) - { - return NULL; - } - p = citer->second; + _root = new TrieNode; + _root->ptKeyMap = NULL; + _root->ptValue = NULL; + + _createTrie(keys, valuePointers); } - return p->ptTrieNodeInfo; - } - - bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType & res, size_t offset = 0) const - { - const TrieNode* p = _root; - TrieNodeMap::const_iterator citer; - for (Unicode::const_iterator itr = begin; itr != end; itr++) + ~Trie() { - citer = p->hmap.find(*itr); - if(p->hmap.end() == citer) - { - break; - } - p = citer->second; - if(p->ptTrieNodeInfo) - { - res[itr - begin + offset] = p->ptTrieNodeInfo; - } } - return !res.empty(); - } - - public: - double getMinLogFreq() const {return _minLogFreq;}; - - private: - void _insertNode(const TrieNodeInfo& nodeInfo, TrieNode* ptNode) const - { - const Unicode& unico = nodeInfo.word; - TrieNodeMap::const_iterator citer; - for(size_t i = 0; i < unico.size(); i++) + public: + const ValueType* find(KeyType::const_iterator begin; KeyType::const_iterator end) const { - uint16_t cu = unico[i]; - assert(ptNode); - citer = ptNode->hmap.find(cu); - if(ptNode->hmap.end() == citer) + TrieNodeMapType::const_iterator citer; + const TrieNode* ptNode = _root; + for(KeyType::const_iterator it = begin; it != end; it++) { - TrieNode * next = new TrieNode; - ptNode->hmap[cu] = next; - ptNode = next; + citer = ptNode->ptKeyMap->find(*it); + if(ptNode->ptKeyMap->end() == citer) + { + return NULL; + } + ptNode= citer->second; } - else + return ptNode->ptValue; + } + bool find(KeyType::const_iterator begin, KeyType::const_iterator end, map& ordererMap) const + { + const TrieNode * ptNode = _root; + TrieNodeMapType::const_iterator citer; + for(KeyType::const_iterator itr = begin; itr != end ; itr++) { + citer = ptNode->ptKeyMap->find(*itr); + if(ptNode->ptKeyMap->end() == citer) + { + break; + } ptNode = citer->second; + if(ptNode->ptValue) + { + ordererMap[itr - begin] = ptNode->ptValue; + } } - } - - ptNode->ptTrieNodeInfo = &nodeInfo; - } - - private: - void _loadDict(const string& filePath, vector& nodeInfos) const - { - ifstream ifs(filePath.c_str()); - if(!ifs) + private: + void _createTrie(const vector& keys, const vector& valuePointers) { - LogFatal("open %s failed.", filePath.c_str()); - exit(1); - } - string line; - vector buf; - - nodeInfos.clear(); - TrieNodeInfo nodeInfo; - for(size_t lineno = 0 ; getline(ifs, line); lineno++) - { - split(line, buf, " "); - assert(buf.size() == DICT_COLUMN_NUM); - if(!TransCode::decode(buf[0], nodeInfo.word)) + if(values.empty() || keys.empty()) { - LogError("line[%u:%s] illegal.", lineno, line.c_str()); - continue; + return; } - nodeInfo.freq = atoi(buf[1].c_str()); - nodeInfo.tag = buf[2]; + assert(keys.size() == valuePointers.size()); - nodeInfos.push_back(nodeInfo); - } - } - bool _createTrie(const vector& nodeInfos, TrieNode * ptNode) - { - for(size_t i = 0; i < _nodeInfos.size(); i++) - { - _insertNode(_nodeInfos[i], ptNode); - } - return true; - } - size_t _calculateFreqSum(const vector& nodeInfos) const - { - size_t freqSum = 0; - for(size_t i = 0; i < nodeInfos.size(); i++) - { - freqSum += nodeInfos[i].freq; - } - return freqSum; - } - double _calculateLogFreqAndGetMinValue(vector& nodeInfos, size_t freqSum) const - { - assert(freqSum); - double minLogFreq = MAX_DOUBLE; - for(size_t i = 0; i < nodeInfos.size(); i++) - { - TrieNodeInfo& nodeInfo = nodeInfos[i]; - assert(nodeInfo.freq); - nodeInfo.logFreq = log(double(nodeInfo.freq)/double(freqSum)); - if(minLogFreq > nodeInfo.logFreq) + for(size_t i = 0; i < keys.size(); i++) { - minLogFreq = nodeInfo.logFreq; + _insertNode(keys[i], valuePointers[i]); } } - return minLogFreq; - } - - void _deleteNode(TrieNode* node) - { - if(!node) + private: + void _insertNode(const KeyType& key, const Value* ptValue) { - return; - } - for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++) - { - TrieNode* next = it->second; - _deleteNode(next); - } - delete node; - } + TrieNode* ptNode = _root; - }; + TrieNode::KeyMapType::const_iterator kmIter; + + for(KeyType::const_iterator citer = key.begin(); citer != key.end(); citer++) + { + if(NULL == ptNode->ptKeyMap) + { + ptNode->ptKeyMap = new TrieNode::KeyMapType; + } + kmIter = ptNode->ptKeyMap->find(*citer); + if(ptNode->ptKeyMap->end() == kmIter) + { + TrieNode * nextNode = new TrieNode; + nextNode->ptKeyMap = NULL; + nextNode->ptValue = NULL; + + ptNode->ptKeyMap[*citer] = nextNode; + ptNode = next; + } + else + { + ptNode = kmIter->second; + } + } + ptNode->ptValue = ptValue; + } + void _deleteNode(TrieNode* node) + { + if(!node) + { + return; + } + for(TrieNodeMapType::iterator it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++) + { + _deleteNode(it->second); + } + delete node->ptKeyMap; + delete node; + } + } } #endif From abd23a4d79febe54c317ce58da657dc17cc33b0c Mon Sep 17 00:00:00 2001 From: wyy Date: Thu, 10 Apr 2014 21:07:11 +0800 Subject: [PATCH 2/5] rename Trie -> DictTrie --- src/DictTrie.hpp | 2 +- src/FullSegment.hpp | 10 +++++----- src/HMMSegment.hpp | 2 +- src/MPSegment.hpp | 14 +++++++------- src/PosTagger.hpp | 10 +++++----- src/QuerySegment.hpp | 4 ++-- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 4f5700a..59c2e69 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -12,7 +12,7 @@ #include "Limonp/logger.hpp" #include "Limonp/InitOnOff.hpp" #include "TransCode.hpp" -#include "Trie.hpp" +#include "DictTrie.hpp" diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index ec97067..fa55cde 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -5,7 +5,7 @@ #include #include #include "Limonp/logger.hpp" -#include "Trie.hpp" +#include "DictTrie.hpp" #include "ISegment.hpp" #include "SegmentBase.hpp" #include "TransCode.hpp" @@ -15,7 +15,7 @@ namespace CppJieba class FullSegment: public SegmentBase { private: - Trie _trie; + DictTrie _dictTrie; public: FullSegment(){_setInitFlag(false);}; @@ -29,8 +29,8 @@ namespace CppJieba LogError("already inited before now."); return false; } - _trie.init(dictPath.c_str()); - assert(_trie); + _dictTrie.init(dictPath.c_str()); + assert(_dictTrie); return _setInitFlag(true); } @@ -61,7 +61,7 @@ namespace CppJieba for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) { //find word start from uItr - if (_trie.find(uItr, end, tRes, 0)) + if (_dictTrie.find(uItr, end, tRes, 0)) { for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) //for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index 41c5a77..6fe25c5 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -10,7 +10,7 @@ #include "TransCode.hpp" #include "ISegment.hpp" #include "SegmentBase.hpp" -#include "Trie.hpp" +#include "DictTrie.hpp" namespace CppJieba { diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 8a648a9..b22d21d 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -9,8 +9,8 @@ #include #include #include "Limonp/logger.hpp" -#include "Trie.hpp" -#include "Trie.hpp" +#include "DictTrie.hpp" +#include "DictTrie.hpp" #include "ISegment.hpp" #include "SegmentBase.hpp" @@ -32,7 +32,7 @@ namespace CppJieba class MPSegment: public SegmentBase { protected: - Trie _trie; + DictTrie _dictTrie; public: MPSegment(){_setInitFlag(false);}; @@ -49,8 +49,8 @@ namespace CppJieba LogError("already inited before now."); return false; } - _trie.init(dictPath); - assert(_trie); + _dictTrie.init(dictPath); + assert(_dictTrie); LogInfo("MPSegment init(%s) ok", dictPath.c_str()); return _setInitFlag(true); } @@ -124,7 +124,7 @@ namespace CppJieba schar.uniCh = *it; offset = it - begin; schar.dag.clear(); - _trie.find(it, end, schar.dag, offset); + _dictTrie.find(it, end, schar.dag, offset); if(!isIn(schar.dag, offset)) { schar.dag[offset] = NULL; @@ -165,7 +165,7 @@ namespace CppJieba } else { - val += _trie.getMinLogFreq(); + val += _dictTrie.getMinLogFreq(); } if(val > segContext[i].weight) { diff --git a/src/PosTagger.hpp b/src/PosTagger.hpp index 79bacd9..c760c0b 100644 --- a/src/PosTagger.hpp +++ b/src/PosTagger.hpp @@ -3,7 +3,7 @@ #include "MixSegment.hpp" #include "Limonp/str_functs.hpp" -#include "Trie.hpp" +#include "DictTrie.hpp" namespace CppJieba { @@ -13,7 +13,7 @@ namespace CppJieba { private: MixSegment _segment; - Trie _trie; + DictTrie _dictTrie; public: PosTagger(){_setInitFlag(false);}; @@ -27,8 +27,8 @@ namespace CppJieba { assert(!_getInitFlag()); - _trie.init(dictPath); - assert(_trie); + _dictTrie.init(dictPath); + assert(_dictTrie); return _setInitFlag(_segment.init(dictPath, hmmFilePath)); }; @@ -51,7 +51,7 @@ namespace CppJieba LogError("decode failed."); return false; } - tmp = _trie.find(unico.begin(), unico.end()); + tmp = _dictTrie.find(unico.begin(), unico.end()); res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag)); } tmp = NULL; diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index 93a207a..0e8fbac 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -5,13 +5,13 @@ #include #include #include "Limonp/logger.hpp" -#include "Trie.hpp" +#include "DictTrie.hpp" #include "ISegment.hpp" #include "SegmentBase.hpp" #include "FullSegment.hpp" #include "MixSegment.hpp" #include "TransCode.hpp" -#include "Trie.hpp" +#include "DictTrie.hpp" namespace CppJieba { From 776191b3755c552348e78a14fafa132210f0eb06 Mon Sep 17 00:00:00 2001 From: wyy Date: Thu, 10 Apr 2014 22:32:39 +0800 Subject: [PATCH 3/5] ci --- src/DictTrie.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 59c2e69..1dae1e9 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -38,7 +38,6 @@ namespace CppJieba { private: - DictTrieNode* _root; vector _nodeInfos; int64_t _freqSum; @@ -47,9 +46,6 @@ namespace CppJieba public: DictTrie() { - _root = new DictTrieNode; - _root.ptKeyMap = NULL; - _root.offset = 0; _freqSum = 0; _minLogFreq = MAX_DOUBLE; _setInitFlag(false); @@ -61,7 +57,6 @@ namespace CppJieba } ~DictTrie() { - _deleteNode(_root); } private: @@ -71,7 +66,6 @@ namespace CppJieba { assert(!_getInitFlag()); _loadDict(filePath, _nodeInfos); - _createDictTrie(_nodeInfos, _root); _freqSum = _calculateFreqSum(_nodeInfos); assert(_freqSum); _minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum); From 24120c92b1f9e4e3eb22b6512d7202964c3f2c31 Mon Sep 17 00:00:00 2001 From: wyy Date: Thu, 10 Apr 2014 09:16:35 -0700 Subject: [PATCH 4/5] compile ok --- src/DictTrie.hpp | 50 +++++++++++++++++++++++--- src/FullSegment.hpp | 2 +- src/MPSegment.hpp | 6 ++-- src/PosTagger.hpp | 2 +- src/Trie.hpp | 78 ++++++++++++++++++++++++----------------- test/unittest/TTrie.cpp | 26 +++++++------- 6 files changed, 108 insertions(+), 56 deletions(-) diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 1dae1e9..0564d17 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -12,7 +12,7 @@ #include "Limonp/logger.hpp" #include "Limonp/InitOnOff.hpp" #include "TransCode.hpp" -#include "DictTrie.hpp" +#include "Trie.hpp" @@ -32,20 +32,30 @@ namespace CppJieba double logFreq; //logFreq = log(freq/sum(freq)); }; + inline ostream & operator << (ostream& os, const DictUnit& unit) + { + string s; + s << unit.word; + return os << string_format("%s %u %s %llf", s.c_str(), unit.freq, unit.tag.c_str(), unit.logFreq); + } + typedef map DagType; - class DictTrie: InitOnOff + class DictTrie: public InitOnOff { - + public: + typedef Trie TrieType; private: vector _nodeInfos; + TrieType * _trie; - int64_t _freqSum; + size_t _freqSum; double _minLogFreq; public: DictTrie() { + _trie = NULL; _freqSum = 0; _minLogFreq = MAX_DOUBLE; _setInitFlag(false); @@ -57,6 +67,10 @@ namespace CppJieba } ~DictTrie() { + if(_trie) + { + delete _trie; + } } private: @@ -69,14 +83,40 @@ namespace CppJieba _freqSum = _calculateFreqSum(_nodeInfos); assert(_freqSum); _minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum); - return _setInitFlag(true); + _trie = _creatTrie(_nodeInfos); + return _setInitFlag(_trie); } + public: + const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const + { + return _trie->find(begin, end); + } + bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const + { + return _trie->find(begin, end, dag, offset); + } public: double getMinLogFreq() const {return _minLogFreq;}; private: + TrieType * _creatTrie(const vector& dictUnits) + { + if(dictUnits.empty()) + { + return NULL; + } + vector words; + vector valuePointers; + for(size_t i = 0 ; i < dictUnits.size(); i ++) + { + words.push_back(dictUnits[i].word); + valuePointers.push_back(&dictUnits[i]); + } + TrieType * trie = new TrieType(words, valuePointers); + return trie; + } void _loadDict(const string& filePath, vector& nodeInfos) const { ifstream ifs(filePath.c_str()); diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index fa55cde..d0eedb1 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -64,7 +64,7 @@ namespace CppJieba if (_dictTrie.find(uItr, end, tRes, 0)) { for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) - //for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) + //for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) { wordLen = itr->second->word.size(); if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index b22d21d..7705227 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -21,7 +21,7 @@ namespace CppJieba { uint16_t uniCh; DagType dag; - const TrieNodeInfo * pInfo; + const DictUnit * pInfo; double weight; SegmentChar():uniCh(0), pInfo(NULL), weight(0.0) @@ -142,7 +142,7 @@ namespace CppJieba } size_t nextPos; - const TrieNodeInfo* p; + const DictUnit* p; double val; for(int i = segContext.size() - 1; i >= 0; i--) @@ -182,7 +182,7 @@ namespace CppJieba size_t i = 0; while(i < segContext.size()) { - const TrieNodeInfo* p = segContext[i].pInfo; + const DictUnit* p = segContext[i].pInfo; if(p) { res.push_back(p->word); diff --git a/src/PosTagger.hpp b/src/PosTagger.hpp index c760c0b..f6ee8fd 100644 --- a/src/PosTagger.hpp +++ b/src/PosTagger.hpp @@ -42,7 +42,7 @@ namespace CppJieba return false; } - const TrieNodeInfo *tmp = NULL; + const DictUnit *tmp = NULL; Unicode unico; for (vector::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { diff --git a/src/Trie.hpp b/src/Trie.hpp index 8a9080d..7d6feaf 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -6,26 +6,28 @@ namespace CppJieba { + using namespace std; template class TrieNode { public: - typedef unordered_map TrieNodeMapType; + typedef unordered_map* > KeyMapType; public: - TrieNodeMap * ptKeyMap; + KeyMapType * ptKeyMap; const ValueType * ptValue; }; template class Trie { - private: - TrieNode* _root; - private: public: - Trie(const vector& keys, const vector& valuePointers) + typedef TrieNode TrieNodeType; + private: + TrieNodeType* _root; + public: + Trie(const vector >& keys, const vector& valuePointers) { - _root = new TrieNode; + _root = new TrieNodeType; _root->ptKeyMap = NULL; _root->ptValue = NULL; @@ -33,28 +35,33 @@ namespace CppJieba } ~Trie() { + if(_root) + { + _deleteNode(_root); + } } public: - const ValueType* find(KeyType::const_iterator begin; KeyType::const_iterator end) const + const ValueType* find(typename vector::const_iterator begin, typename vector::const_iterator end) const { - TrieNodeMapType::const_iterator citer; - const TrieNode* ptNode = _root; - for(KeyType::const_iterator it = begin; it != end; it++) + typename TrieNodeType::KeyMapType::const_iterator citer; + const TrieNodeType* ptNode = _root; + for(typename vector::const_iterator it = begin; it != end; it++) { citer = ptNode->ptKeyMap->find(*it); if(ptNode->ptKeyMap->end() == citer) { return NULL; } - ptNode= citer->second; + ptNode = citer->second; } return ptNode->ptValue; } - bool find(KeyType::const_iterator begin, KeyType::const_iterator end, map& ordererMap) const + bool find(typename vector::const_iterator begin, typename vector ::const_iterator end, map::size_type, const ValueType* >& ordererMap, size_t offset = 0) const { - const TrieNode * ptNode = _root; - TrieNodeMapType::const_iterator citer; - for(KeyType::const_iterator itr = begin; itr != end ; itr++) + const TrieNodeType * ptNode = _root; + typename TrieNodeType::KeyMapType::const_iterator citer; + ordererMap.clear(); + for(typename vector::const_iterator itr = begin; itr != end ; itr++) { citer = ptNode->ptKeyMap->find(*itr); if(ptNode->ptKeyMap->end() == citer) @@ -64,46 +71,47 @@ namespace CppJieba ptNode = citer->second; if(ptNode->ptValue) { - ordererMap[itr - begin] = ptNode->ptValue; + ordererMap[itr - begin + offset] = ptNode->ptValue; } } + return ordererMap.size(); } private: - void _createTrie(const vector& keys, const vector& valuePointers) + void _createTrie(const vector >& keys, const vector& valuePointers) { - if(values.empty() || keys.empty()) + if(valuePointers.empty() || keys.empty()) { return; } assert(keys.size() == valuePointers.size()); - + for(size_t i = 0; i < keys.size(); i++) { _insertNode(keys[i], valuePointers[i]); } } private: - void _insertNode(const KeyType& key, const Value* ptValue) + void _insertNode(const vector& key, const ValueType* ptValue) { - TrieNode* ptNode = _root; + TrieNodeType* ptNode = _root; - TrieNode::KeyMapType::const_iterator kmIter; + typename TrieNodeType::KeyMapType::const_iterator kmIter; - for(KeyType::const_iterator citer = key.begin(); citer != key.end(); citer++) + for(typename vector::const_iterator citer = key.begin(); citer != key.end(); citer++) { if(NULL == ptNode->ptKeyMap) { - ptNode->ptKeyMap = new TrieNode::KeyMapType; + ptNode->ptKeyMap = new typename TrieNodeType::KeyMapType; } kmIter = ptNode->ptKeyMap->find(*citer); if(ptNode->ptKeyMap->end() == kmIter) { - TrieNode * nextNode = new TrieNode; + TrieNodeType * nextNode = new TrieNodeType; nextNode->ptKeyMap = NULL; nextNode->ptValue = NULL; - ptNode->ptKeyMap[*citer] = nextNode; - ptNode = next; + (*ptNode->ptKeyMap)[*citer] = nextNode; + ptNode = nextNode; } else { @@ -112,20 +120,24 @@ namespace CppJieba } ptNode->ptValue = ptValue; } - void _deleteNode(TrieNode* node) + void _deleteNode(TrieNodeType* node) { if(!node) { return; } - for(TrieNodeMapType::iterator it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++) + if(node->ptKeyMap) { - _deleteNode(it->second); + typename TrieNodeType::KeyMapType::iterator it; + for(it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++) + { + _deleteNode(it->second); + } + delete node->ptKeyMap; } - delete node->ptKeyMap; delete node; } - } + }; } #endif diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp index 1ccbb98..5e56a44 100644 --- a/test/unittest/TTrie.cpp +++ b/test/unittest/TTrie.cpp @@ -1,30 +1,30 @@ -#include "src/Trie.hpp" +#include "src/DictTrie.hpp" #include "gtest/gtest.h" using namespace CppJieba; static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8"; -TEST(TrieTest, NewAndDelete) +TEST(DictTrieTest, NewAndDelete) { - Trie * trie; - trie = new Trie(DICT_FILE); + DictTrie * trie; + trie = new DictTrie(DICT_FILE); delete trie; - trie = new Trie(); + trie = new DictTrie(); delete trie; } -TEST(TrieTest, Test1) +TEST(DictTrieTest, Test1) { string s1, s2; - Trie trie; + DictTrie trie; ASSERT_TRUE(trie.init(DICT_FILE)); ASSERT_LT(trie.getMinLogFreq() + 15.6479, 0.001); string word("来到"); Unicode uni; ASSERT_TRUE(TransCode::decode(word, uni)); - TrieNodeInfo nodeInfo; + DictUnit nodeInfo; nodeInfo.word = uni; nodeInfo.freq = 8779; nodeInfo.tag = "v"; @@ -34,9 +34,9 @@ TEST(TrieTest, Test1) EXPECT_EQ("[\"26469\", \"21040\"]:8779:v:-8.87033", s2); word = "清华大学"; - vector > res; - map resMap; - map mp; + vector > res; + map resMap; + map mp; const char * words[] = {"清", "清华", "清华大学"}; for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { @@ -44,10 +44,10 @@ TEST(TrieTest, Test1) res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end()))); resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end()); } - //TrieNodeInfo + //DictUnit //res.push_back(make_pair(0, )) - vector > vec; + vector > vec; ASSERT_TRUE(TransCode::decode(word, uni)); //print(uni); ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0)); From cae1503725ab702fefa3329f8e5ba421152a63e8 Mon Sep 17 00:00:00 2001 From: wyy Date: Fri, 11 Apr 2014 12:08:46 +0800 Subject: [PATCH 5/5] split Trie.hpp into (Trie.hpp & DictTrie.hpp) test ok --- src/DictTrie.hpp | 2 +- src/Trie.hpp | 8 ++++---- test/unittest/TTrie.cpp | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 0564d17..729364c 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -36,7 +36,7 @@ namespace CppJieba { string s; s << unit.word; - return os << string_format("%s %u %s %llf", s.c_str(), unit.freq, unit.tag.c_str(), unit.logFreq); + return os << string_format("%s %u %s %.3lf", s.c_str(), unit.freq, unit.tag.c_str(), unit.logFreq); } typedef map DagType; diff --git a/src/Trie.hpp b/src/Trie.hpp index 7d6feaf..0f8282a 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -47,8 +47,8 @@ namespace CppJieba const TrieNodeType* ptNode = _root; for(typename vector::const_iterator it = begin; it != end; it++) { - citer = ptNode->ptKeyMap->find(*it); - if(ptNode->ptKeyMap->end() == citer) + assert(ptNode); + if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*it))) { return NULL; } @@ -63,8 +63,8 @@ namespace CppJieba ordererMap.clear(); for(typename vector::const_iterator itr = begin; itr != end ; itr++) { - citer = ptNode->ptKeyMap->find(*itr); - if(ptNode->ptKeyMap->end() == citer) + assert(ptNode); + if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*itr))) { break; } diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp index 5e56a44..5ae91b2 100644 --- a/test/unittest/TTrie.cpp +++ b/test/unittest/TTrie.cpp @@ -32,7 +32,7 @@ TEST(DictTrieTest, Test1) s1 << nodeInfo; s2 << (*trie.find(uni.begin(), uni.end())); - EXPECT_EQ("[\"26469\", \"21040\"]:8779:v:-8.87033", s2); + EXPECT_EQ("[\"26469\", \"21040\"] 8779 v -8.870", s2); word = "清华大学"; vector > res; map resMap;