From abfc3b4b6cee4a16bc150733445f2518637f6de6 Mon Sep 17 00:00:00 2001 From: wyy Date: Sat, 30 Nov 2013 12:17:36 +0800 Subject: [PATCH] merge trie.h/cpp into trie.hpp --- src/CMakeLists.txt | 2 +- src/MPSegment.h | 2 +- src/Trie.cpp | 390 ------------------------------------------ src/Trie.h | 85 --------- src/Trie.hpp | 416 +++++++++++++++++++++++++++++++++++++++++++++ src/structs.h | 2 +- 6 files changed, 419 insertions(+), 478 deletions(-) delete mode 100644 src/Trie.cpp delete mode 100644 src/Trie.h create mode 100644 src/Trie.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1196cef..c32ad6f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,7 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) -SET(LIBCPPJIEBA_SRC HMMSegment.cpp MixSegment.cpp MPSegment.cpp Trie.cpp) +SET(LIBCPPJIEBA_SRC HMMSegment.cpp MixSegment.cpp MPSegment.cpp ) ADD_LIBRARY(cppjieba STATIC ${LIBCPPJIEBA_SRC}) ADD_EXECUTABLE(cjsegment segment.cpp) ADD_EXECUTABLE(cjserver server.cpp) diff --git a/src/MPSegment.h b/src/MPSegment.h index a3eaae3..9dd0a7c 100644 --- a/src/MPSegment.h +++ b/src/MPSegment.h @@ -8,7 +8,7 @@ #include #include #include "Limonp/logger.hpp" -#include "Trie.h" +#include "Trie.hpp" #include "globals.h" #include "ISegment.hpp" #include "SegmentBase.hpp" diff --git a/src/Trie.cpp b/src/Trie.cpp deleted file mode 100644 index b72ff3b..0000000 --- a/src/Trie.cpp +++ /dev/null @@ -1,390 +0,0 @@ -/************************************ - * file enc : ASCII - * author : wuyanyi09@gmail.com -************************************/ -#include "Trie.h" - -namespace CppJieba -{ - - Trie::Trie() - { - - _root = NULL; - _freqSum = 0; - _minLogFreq = MAX_DOUBLE; - _initFlag = false; - } - - Trie::~Trie() - { - dispose(); - } - - bool Trie::init() - { - if(_getInitFlag()) - { - LogError("already initted!"); - return false; - } - - try - { - _root = new TrieNode; - } - catch(const bad_alloc& e) - { - return false; - } - if(NULL == _root) - { - return false; - } - _setInitFlag(true); - return true; - } - - bool Trie::loadDict(const char * const filePath) - { - if(!_getInitFlag()) - { - LogError("not initted."); - return false; - } - - if(!checkFileExist(filePath)) - { - LogError("cann't find fiel[%s].",filePath); - return false; - } - bool res = false; - res = _trieInsert(filePath); - if(!res) - { - LogError("_trieInsert failed."); - return false; - } - res = _countWeight(); - if(!res) - { - LogError("_countWeight failed."); - return false; - } - return true; - } - - bool Trie::_trieInsert(const char * const filePath) - { - - ifstream ifile(filePath); - string line; - vector vecBuf; - - TrieNodeInfo nodeInfo; - while(getline(ifile, line)) - { - vecBuf.clear(); - splitStr(line, vecBuf, " "); - if(3 < vecBuf.size()) - { - LogError("line[%s] illegal.", line.c_str()); - return false; - } - if(!TransCode::decode(vecBuf[0], nodeInfo.word)) - { - return false; - } - nodeInfo.freq = atoi(vecBuf[1].c_str()); - if(3 == vecBuf.size()) - { - nodeInfo.tag = vecBuf[2]; - } - - //insert node - if(!insert(nodeInfo)) - { - LogError("insert node failed!"); - } - } - return true; - } - - bool Trie::dispose() - { - if(!_getInitFlag()) - { - return false; - } - bool ret = _deleteNode(_root); - if(!ret) - { - LogFatal("_deleteNode failed!"); - return false; - } - _root = NULL; - _nodeInfoVec.clear(); - - _setInitFlag(false); - return ret; - } - - const TrieNodeInfo* Trie::findPrefix(const string& str)const - { - if(!_getInitFlag()) - { - LogFatal("trie not initted!"); - return NULL; - } - Unicode uintVec; - - if(!TransCode::decode(str, uintVec)) - { - LogError("TransCode::decode failed."); - return NULL; - } - - //find - TrieNode* p = _root; - uint pos = 0; - uint16_t chUni = 0; - const TrieNodeInfo * res = NULL; - for(uint i = 0; i < uintVec.size(); i++) - { - chUni = uintVec[i]; - if(p->isLeaf) - { - pos = p->nodeInfoVecPos; - if(pos >= _nodeInfoVec.size()) - { - LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); - return NULL; - } - res = &(_nodeInfoVec[pos]); - - } - if(p->hmap.find(chUni) == p->hmap.end()) - { - break; - } - else - { - p = p->hmap[chUni]; - } - } - return res; - } - - const TrieNodeInfo* Trie::find(const string& str)const - { - Unicode uintVec; - if(!TransCode::decode(str, uintVec)) - { - return NULL; - } - return find(uintVec); - } - - const TrieNodeInfo* Trie::find(const Unicode& uintVec)const - { - if(uintVec.empty()) - { - return NULL; - } - return find(uintVec.begin(), uintVec.end()); - } - - const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)const - { - - if(!_getInitFlag()) - { - LogFatal("trie not initted!"); - return NULL; - } - if(begin >= end) - { - return NULL; - } - TrieNode* p = _root; - for(Unicode::const_iterator it = begin; it != end; it++) - { - uint16_t chUni = *it; - if(p->hmap.find(chUni) == p-> hmap.end()) - { - return NULL; - } - else - { - p = p->hmap[chUni]; - } - } - if(p->isLeaf) - { - uint pos = p->nodeInfoVecPos; - if(pos < _nodeInfoVec.size()) - { - return &(_nodeInfoVec[pos]); - } - else - { - LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); - return NULL; - } - } - return NULL; - } - - bool Trie::find(const Unicode& unico, vector >& res)const - { - if(!_getInitFlag()) - { - LogFatal("trie not initted!"); - return false; - } - TrieNode* p = _root; - //for(Unicode::const_iterator it = begin; it != end; it++) - for(uint i = 0; i < unico.size(); i++) - { - if(p->hmap.find(unico[i]) == p-> hmap.end()) - { - break; - } - p = p->hmap[unico[i]]; - if(p->isLeaf) - { - uint pos = p->nodeInfoVecPos; - if(pos < _nodeInfoVec.size()) - { - res.push_back(make_pair(i, &_nodeInfoVec[pos])); - } - else - { - LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); - return false; - } - } - } - return !res.empty(); - } - - bool Trie::_deleteNode(TrieNode* node) - { - for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++) - { - TrieNode* next = it->second; - _deleteNode(next); - } - - delete node; - return true; - } - - - bool Trie::insert(const TrieNodeInfo& nodeInfo) - { - if(!_getInitFlag()) - { - LogFatal("not initted!"); - return false; - } - - - const Unicode& uintVec = nodeInfo.word; - TrieNode* p = _root; - for(uint i = 0; i < uintVec.size(); i++) - { - uint16_t cu = uintVec[i]; - if(NULL == p) - { - return false; - } - if(p->hmap.end() == p->hmap.find(cu)) - { - TrieNode * next = NULL; - try - { - next = new TrieNode; - } - catch(const bad_alloc& e) - { - return false; - } - p->hmap[cu] = next; - p = next; - } - else - { - p = p->hmap[cu]; - } - } - if(NULL == p) - { - return false; - } - if(p->isLeaf) - { - LogError("this node already inserted"); - return false; - } - - p->isLeaf = true; - _nodeInfoVec.push_back(nodeInfo); - p->nodeInfoVecPos = _nodeInfoVec.size() - 1; - - return true; - } - - bool Trie::_countWeight() - { - if(_nodeInfoVec.empty() || 0 != _freqSum) - { - LogError("_nodeInfoVec is empty or _freqSum has been counted already."); - return false; - } - - //freq total freq - for(size_t i = 0; i < _nodeInfoVec.size(); i++) - { - _freqSum += _nodeInfoVec[i].freq; - } - - if(0 == _freqSum) - { - LogError("_freqSum == 0 ."); - return false; - } - - //normalize - for(uint i = 0; i < _nodeInfoVec.size(); i++) - { - TrieNodeInfo& nodeInfo = _nodeInfoVec[i]; - if(0 == nodeInfo.freq) - { - LogFatal("nodeInfo.freq == 0!"); - return false; - } - nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum)); - if(_minLogFreq > nodeInfo.logFreq) - { - _minLogFreq = nodeInfo.logFreq; - } - } - - return true; - } -} - -#ifdef TRIE_UT -using namespace CppJieba; -int main() -{ - Trie trie; - trie.init(); - trie.loadDict("../dicts/segdict.gbk.v2.1"); - //trie.loadDict("tmp"); - cout< -#include -#include -#include -#include -#include -#include -#include "Limonp/str_functs.hpp" -#include "Limonp/logger.hpp" -#include "TransCode.hpp" -#include "globals.h" -#include "structs.h" - - -namespace CppJieba -{ - using namespace Limonp; - struct TrieNode - { - TrieNodeMap hmap; - bool isLeaf; - uint nodeInfoVecPos; - TrieNode() - { - isLeaf = false; - nodeInfoVecPos = 0; - } - }; - - class Trie - { - - private: - TrieNode* _root; - vector _nodeInfoVec; - - bool _initFlag; - int64_t _freqSum; - double _minLogFreq; - - public: - Trie(); - ~Trie(); - bool init(); - bool loadDict(const char * const filePath); - bool dispose(); - - private: - void _setInitFlag(bool on){_initFlag = on;}; - bool _getInitFlag()const{return _initFlag;}; - - public: - const TrieNodeInfo* find(const string& str)const; - const TrieNodeInfo* find(const Unicode& uintVec)const; - const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const; - bool find(const Unicode& unico, vector >& res)const; - - const TrieNodeInfo* findPrefix(const string& str)const; - - public: - //double getWeight(const string& str); - //double getWeight(const Unicode& uintVec); - //double getWeight(Unicode::const_iterator begin, Unicode::const_iterator end); - double getMinLogFreq()const{return _minLogFreq;}; - - //int64_t getTotalCount(){return _freqSum;}; - - bool insert(const TrieNodeInfo& nodeInfo); - - private: - bool _trieInsert(const char * const filePath); - bool _countWeight(); - bool _deleteNode(TrieNode* node); - - }; -} - -#endif diff --git a/src/Trie.hpp b/src/Trie.hpp new file mode 100644 index 0000000..7d91112 --- /dev/null +++ b/src/Trie.hpp @@ -0,0 +1,416 @@ +/************************************ + * file enc : ASCII + * author : wuyanyi09@gmail.com + ************************************/ +#ifndef CPPJIEBA_TRIE_H +#define CPPJIEBA_TRIE_H + +#include +#include +#include +#include +#include +#include +#include +#include "Limonp/str_functs.hpp" +#include "Limonp/logger.hpp" +#include "TransCode.hpp" +#include "globals.h" +#include "structs.h" + + +namespace CppJieba +{ + using namespace Limonp; + struct TrieNode + { + TrieNodeMap hmap; + bool isLeaf; + uint nodeInfoVecPos; + TrieNode() + { + isLeaf = false; + nodeInfoVecPos = 0; + } + }; + + class Trie + { + + private: + TrieNode* _root; + vector _nodeInfoVec; + + bool _initFlag; + int64_t _freqSum; + double _minLogFreq; + + public: + Trie() + { + _root = NULL; + _freqSum = 0; + _minLogFreq = MAX_DOUBLE; + _initFlag = false; + } + ~Trie() + { + dispose(); + } + bool init() + { + if(_getInitFlag()) + { + LogError("already initted!"); + return false; + } + + try + { + _root = new TrieNode; + } + catch(const bad_alloc& e) + { + return false; + } + if(NULL == _root) + { + return false; + } + _setInitFlag(true); + return true; + } + bool dispose() + { + if(!_getInitFlag()) + { + return false; + } + bool ret = _deleteNode(_root); + if(!ret) + { + LogFatal("_deleteNode failed!"); + return false; + } + _root = NULL; + _nodeInfoVec.clear(); + + _setInitFlag(false); + return ret; + } + bool loadDict(const char * const filePath) + { + if(!_getInitFlag()) + { + LogError("not initted."); + return false; + } + + if(!checkFileExist(filePath)) + { + LogError("cann't find fiel[%s].",filePath); + return false; + } + bool res = false; + res = _trieInsert(filePath); + if(!res) + { + LogError("_trieInsert failed."); + return false; + } + res = _countWeight(); + if(!res) + { + LogError("_countWeight failed."); + return false; + } + return true; + } + + private: + void _setInitFlag(bool on){_initFlag = on;}; + bool _getInitFlag()const{return _initFlag;}; + + public: + const TrieNodeInfo* find(const string& str)const + { + Unicode uintVec; + if(!TransCode::decode(str, uintVec)) + { + return NULL; + } + return find(uintVec); + } + const TrieNodeInfo* find(const Unicode& uintVec)const + { + if(uintVec.empty()) + { + return NULL; + } + return find(uintVec.begin(), uintVec.end()); + } + const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const + { + + if(!_getInitFlag()) + { + LogFatal("trie not initted!"); + return NULL; + } + if(begin >= end) + { + return NULL; + } + TrieNode* p = _root; + for(Unicode::const_iterator it = begin; it != end; it++) + { + uint16_t chUni = *it; + if(p->hmap.find(chUni) == p-> hmap.end()) + { + return NULL; + } + else + { + p = p->hmap[chUni]; + } + } + if(p->isLeaf) + { + uint pos = p->nodeInfoVecPos; + if(pos < _nodeInfoVec.size()) + { + return &(_nodeInfoVec[pos]); + } + else + { + LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); + return NULL; + } + } + return NULL; + } + bool find(const Unicode& unico, vector >& res)const + { + if(!_getInitFlag()) + { + LogFatal("trie not initted!"); + return false; + } + TrieNode* p = _root; + for(uint i = 0; i < unico.size(); i++) + { + if(p->hmap.find(unico[i]) == p-> hmap.end()) + { + break; + } + p = p->hmap[unico[i]]; + if(p->isLeaf) + { + uint pos = p->nodeInfoVecPos; + if(pos < _nodeInfoVec.size()) + { + res.push_back(make_pair(i, &_nodeInfoVec[pos])); + } + else + { + LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); + return false; + } + } + } + return !res.empty(); + } + + const TrieNodeInfo* findPrefix(const string& str)const + { + if(!_getInitFlag()) + { + LogFatal("trie not initted!"); + return NULL; + } + Unicode uintVec; + + if(!TransCode::decode(str, uintVec)) + { + LogError("TransCode::decode failed."); + return NULL; + } + + //find + TrieNode* p = _root; + uint pos = 0; + uint16_t chUni = 0; + const TrieNodeInfo * res = NULL; + for(uint i = 0; i < uintVec.size(); i++) + { + chUni = uintVec[i]; + if(p->isLeaf) + { + pos = p->nodeInfoVecPos; + if(pos >= _nodeInfoVec.size()) + { + LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); + return NULL; + } + res = &(_nodeInfoVec[pos]); + + } + if(p->hmap.find(chUni) == p->hmap.end()) + { + break; + } + else + { + p = p->hmap[chUni]; + } + } + return res; + } + + public: + double getMinLogFreq()const{return _minLogFreq;}; + + bool insert(const TrieNodeInfo& nodeInfo) + { + if(!_getInitFlag()) + { + LogFatal("not initted!"); + return false; + } + + + const Unicode& uintVec = nodeInfo.word; + TrieNode* p = _root; + for(uint i = 0; i < uintVec.size(); i++) + { + uint16_t cu = uintVec[i]; + if(NULL == p) + { + return false; + } + if(p->hmap.end() == p->hmap.find(cu)) + { + TrieNode * next = NULL; + try + { + next = new TrieNode; + } + catch(const bad_alloc& e) + { + return false; + } + p->hmap[cu] = next; + p = next; + } + else + { + p = p->hmap[cu]; + } + } + if(NULL == p) + { + return false; + } + if(p->isLeaf) + { + LogError("this node already inserted"); + return false; + } + + p->isLeaf = true; + _nodeInfoVec.push_back(nodeInfo); + p->nodeInfoVecPos = _nodeInfoVec.size() - 1; + + return true; + } + + private: + bool _trieInsert(const char * const filePath) + { + ifstream ifile(filePath); + string line; + vector vecBuf; + + TrieNodeInfo nodeInfo; + while(getline(ifile, line)) + { + vecBuf.clear(); + splitStr(line, vecBuf, " "); + if(3 < vecBuf.size()) + { + LogError("line[%s] illegal.", line.c_str()); + return false; + } + if(!TransCode::decode(vecBuf[0], nodeInfo.word)) + { + return false; + } + nodeInfo.freq = atoi(vecBuf[1].c_str()); + if(3 == vecBuf.size()) + { + nodeInfo.tag = vecBuf[2]; + } + + //insert node + if(!insert(nodeInfo)) + { + LogError("insert node failed!"); + } + } + return true; + } + bool _countWeight() + { + if(_nodeInfoVec.empty() || 0 != _freqSum) + { + LogError("_nodeInfoVec is empty or _freqSum has been counted already."); + return false; + } + + //freq total freq + for(size_t i = 0; i < _nodeInfoVec.size(); i++) + { + _freqSum += _nodeInfoVec[i].freq; + } + + if(0 == _freqSum) + { + LogError("_freqSum == 0 ."); + return false; + } + + //normalize + for(uint i = 0; i < _nodeInfoVec.size(); i++) + { + TrieNodeInfo& nodeInfo = _nodeInfoVec[i]; + if(0 == nodeInfo.freq) + { + LogFatal("nodeInfo.freq == 0!"); + return false; + } + nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum)); + if(_minLogFreq > nodeInfo.logFreq) + { + _minLogFreq = nodeInfo.logFreq; + } + } + + return true; + } + + bool _deleteNode(TrieNode* node) + { + for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++) + { + TrieNode* next = it->second; + _deleteNode(next); + } + + delete node; + return true; + } + + }; +} + +#endif diff --git a/src/structs.h b/src/structs.h index 88c5894..e02b294 100644 --- a/src/structs.h +++ b/src/structs.h @@ -3,7 +3,7 @@ #include #include "globals.h" -#include "Trie.h" +#include "Trie.hpp" #include "TransCode.hpp" namespace CppJieba