From f70b654b66bba031a1e7b87fe9c7fc5febe49738 Mon Sep 17 00:00:00 2001 From: wyy Date: Thu, 10 Apr 2014 21:05:01 +0800 Subject: [PATCH] split Trie.hpp into (Trie.hpp & DictTrie.hpp) --- src/DictTrie.hpp | 145 ++++++++++++++++++++++ src/Trie.hpp | 316 +++++++++++++++-------------------------------- 2 files changed, 248 insertions(+), 213 deletions(-) create mode 100644 src/DictTrie.hpp diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp new file mode 100644 index 0000000..4f5700a --- /dev/null +++ b/src/DictTrie.hpp @@ -0,0 +1,145 @@ +#ifndef CPPJIEBA_DICT_TRIE_HPP +#define CPPJIEBA_DICT_TRIE_HPP + +#include +#include +#include +#include +#include +#include +#include +#include "Limonp/str_functs.hpp" +#include "Limonp/logger.hpp" +#include "Limonp/InitOnOff.hpp" +#include "TransCode.hpp" +#include "Trie.hpp" + + + +namespace CppJieba +{ + using namespace Limonp; + const double MIN_DOUBLE = -3.14e+100; + const double MAX_DOUBLE = 3.14e+100; + const size_t DICT_COLUMN_NUM = 3; + + + struct DictUnit + { + Unicode word; + size_t freq; + string tag; + double logFreq; //logFreq = log(freq/sum(freq)); + }; + + typedef map DagType; + + class DictTrie: InitOnOff + { + + private: + DictTrieNode* _root; + vector _nodeInfos; + + int64_t _freqSum; + double _minLogFreq; + + public: + DictTrie() + { + _root = new DictTrieNode; + _root.ptKeyMap = NULL; + _root.offset = 0; + _freqSum = 0; + _minLogFreq = MAX_DOUBLE; + _setInitFlag(false); + } + DictTrie(const string& filePath) + { + new (this) DictTrie(); + _setInitFlag(init(filePath)); + } + ~DictTrie() + { + _deleteNode(_root); + } + private: + + + public: + bool init(const string& filePath) + { + assert(!_getInitFlag()); + _loadDict(filePath, _nodeInfos); + _createDictTrie(_nodeInfos, _root); + _freqSum = _calculateFreqSum(_nodeInfos); + assert(_freqSum); + _minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum); + return _setInitFlag(true); + } + + + public: + double getMinLogFreq() const {return _minLogFreq;}; + + private: + void _loadDict(const string& filePath, vector& nodeInfos) const + { + ifstream ifs(filePath.c_str()); + if(!ifs) + { + LogFatal("open %s failed.", filePath.c_str()); + exit(1); + } + string line; + vector buf; + + nodeInfos.clear(); + DictUnit nodeInfo; + for(size_t lineno = 0 ; getline(ifs, line); lineno++) + { + split(line, buf, " "); + assert(buf.size() == DICT_COLUMN_NUM); + + if(!TransCode::decode(buf[0], nodeInfo.word)) + { + LogError("line[%u:%s] illegal.", lineno, line.c_str()); + continue; + } + nodeInfo.freq = atoi(buf[1].c_str()); + nodeInfo.tag = buf[2]; + + nodeInfos.push_back(nodeInfo); + } + } + size_t _calculateFreqSum(const vector& nodeInfos) const + { + size_t freqSum = 0; + for(size_t i = 0; i < nodeInfos.size(); i++) + { + freqSum += nodeInfos[i].freq; + } + return freqSum; + } + double _calculateLogFreqAndGetMinValue(vector& nodeInfos, size_t freqSum) const + { + assert(freqSum); + double minLogFreq = MAX_DOUBLE; + for(size_t i = 0; i < nodeInfos.size(); i++) + { + DictUnit& nodeInfo = nodeInfos[i]; + assert(nodeInfo.freq); + nodeInfo.logFreq = log(double(nodeInfo.freq)/double(freqSum)); + if(minLogFreq > nodeInfo.logFreq) + { + minLogFreq = nodeInfo.logFreq; + } + } + return minLogFreq; + } + + + }; +} + +#endif diff --git a/src/Trie.hpp b/src/Trie.hpp index 714b2c6..8a9080d 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -1,241 +1,131 @@ -/************************************ - * file enc : ASCII - * author : wuyanyi09@gmail.com - ************************************/ -#ifndef CPPJIEBA_TRIE_H -#define CPPJIEBA_TRIE_H - -#include -#include -#include -#include -#include -#include -#include -#include "Limonp/str_functs.hpp" -#include "Limonp/logger.hpp" -#include "Limonp/InitOnOff.hpp" -#include "TransCode.hpp" - +#ifndef CPPJIEBA_TRIE_HPP +#define CPPJIEBA_TRIE_HPP +#include "Limonp/std_outbound.hpp" +#include namespace CppJieba { - using namespace Limonp; - const double MIN_DOUBLE = -3.14e+100; - const double MAX_DOUBLE = 3.14e+100; - const size_t DICT_COLUMN_NUM = 3; - typedef unordered_map TrieNodeMap; - struct TrieNodeInfo; - struct TrieNode - { - TrieNodeMap hmap; - const TrieNodeInfo * ptTrieNodeInfo; - TrieNode(): ptTrieNodeInfo(NULL) - {} - }; + template + class TrieNode + { + public: + typedef unordered_map TrieNodeMapType; + public: + TrieNodeMap * ptKeyMap; + const ValueType * ptValue; + }; - struct TrieNodeInfo - { - Unicode word; - size_t freq; - string tag; - double logFreq; //logFreq = log(freq/sum(freq)); - }; - - inline ostream& operator << (ostream& os, const TrieNodeInfo & nodeInfo) - { - return os << nodeInfo.word << ":" << nodeInfo.freq << ":" << nodeInfo.tag << ":" << nodeInfo.logFreq ; - } - - typedef map DagType; - - class Trie: public InitOnOff - { - - private: - TrieNode* _root; - vector _nodeInfos; - - int64_t _freqSum; - double _minLogFreq; - - public: - Trie() - { - _root = new TrieNode; - _freqSum = 0; - _minLogFreq = MAX_DOUBLE; - _setInitFlag(false); - } - Trie(const string& filePath) - { - new (this) Trie(); - _setInitFlag(init(filePath)); - } - ~Trie() - { - _deleteNode(_root); - } - private: - - - public: - bool init(const string& filePath) - { - assert(!_getInitFlag()); - _loadDict(filePath, _nodeInfos); - _createTrie(_nodeInfos, _root); - _freqSum = _calculateFreqSum(_nodeInfos); - assert(_freqSum); - _minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum); - return _setInitFlag(true); - } - - public: - const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const - { - TrieNodeMap::const_iterator citer; - const TrieNode* p = _root; - for(Unicode::const_iterator it = begin; it != end; it++) + template + class Trie + { + private: + TrieNode* _root; + private: + public: + Trie(const vector& keys, const vector& valuePointers) { - citer = p->hmap.find(*it); - if(p->hmap.end() == citer) - { - return NULL; - } - p = citer->second; + _root = new TrieNode; + _root->ptKeyMap = NULL; + _root->ptValue = NULL; + + _createTrie(keys, valuePointers); } - return p->ptTrieNodeInfo; - } - - bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType & res, size_t offset = 0) const - { - const TrieNode* p = _root; - TrieNodeMap::const_iterator citer; - for (Unicode::const_iterator itr = begin; itr != end; itr++) + ~Trie() { - citer = p->hmap.find(*itr); - if(p->hmap.end() == citer) - { - break; - } - p = citer->second; - if(p->ptTrieNodeInfo) - { - res[itr - begin + offset] = p->ptTrieNodeInfo; - } } - return !res.empty(); - } - - public: - double getMinLogFreq() const {return _minLogFreq;}; - - private: - void _insertNode(const TrieNodeInfo& nodeInfo, TrieNode* ptNode) const - { - const Unicode& unico = nodeInfo.word; - TrieNodeMap::const_iterator citer; - for(size_t i = 0; i < unico.size(); i++) + public: + const ValueType* find(KeyType::const_iterator begin; KeyType::const_iterator end) const { - uint16_t cu = unico[i]; - assert(ptNode); - citer = ptNode->hmap.find(cu); - if(ptNode->hmap.end() == citer) + TrieNodeMapType::const_iterator citer; + const TrieNode* ptNode = _root; + for(KeyType::const_iterator it = begin; it != end; it++) { - TrieNode * next = new TrieNode; - ptNode->hmap[cu] = next; - ptNode = next; + citer = ptNode->ptKeyMap->find(*it); + if(ptNode->ptKeyMap->end() == citer) + { + return NULL; + } + ptNode= citer->second; } - else + return ptNode->ptValue; + } + bool find(KeyType::const_iterator begin, KeyType::const_iterator end, map& ordererMap) const + { + const TrieNode * ptNode = _root; + TrieNodeMapType::const_iterator citer; + for(KeyType::const_iterator itr = begin; itr != end ; itr++) { + citer = ptNode->ptKeyMap->find(*itr); + if(ptNode->ptKeyMap->end() == citer) + { + break; + } ptNode = citer->second; + if(ptNode->ptValue) + { + ordererMap[itr - begin] = ptNode->ptValue; + } } - } - - ptNode->ptTrieNodeInfo = &nodeInfo; - } - - private: - void _loadDict(const string& filePath, vector& nodeInfos) const - { - ifstream ifs(filePath.c_str()); - if(!ifs) + private: + void _createTrie(const vector& keys, const vector& valuePointers) { - LogFatal("open %s failed.", filePath.c_str()); - exit(1); - } - string line; - vector buf; - - nodeInfos.clear(); - TrieNodeInfo nodeInfo; - for(size_t lineno = 0 ; getline(ifs, line); lineno++) - { - split(line, buf, " "); - assert(buf.size() == DICT_COLUMN_NUM); - if(!TransCode::decode(buf[0], nodeInfo.word)) + if(values.empty() || keys.empty()) { - LogError("line[%u:%s] illegal.", lineno, line.c_str()); - continue; + return; } - nodeInfo.freq = atoi(buf[1].c_str()); - nodeInfo.tag = buf[2]; + assert(keys.size() == valuePointers.size()); - nodeInfos.push_back(nodeInfo); - } - } - bool _createTrie(const vector& nodeInfos, TrieNode * ptNode) - { - for(size_t i = 0; i < _nodeInfos.size(); i++) - { - _insertNode(_nodeInfos[i], ptNode); - } - return true; - } - size_t _calculateFreqSum(const vector& nodeInfos) const - { - size_t freqSum = 0; - for(size_t i = 0; i < nodeInfos.size(); i++) - { - freqSum += nodeInfos[i].freq; - } - return freqSum; - } - double _calculateLogFreqAndGetMinValue(vector& nodeInfos, size_t freqSum) const - { - assert(freqSum); - double minLogFreq = MAX_DOUBLE; - for(size_t i = 0; i < nodeInfos.size(); i++) - { - TrieNodeInfo& nodeInfo = nodeInfos[i]; - assert(nodeInfo.freq); - nodeInfo.logFreq = log(double(nodeInfo.freq)/double(freqSum)); - if(minLogFreq > nodeInfo.logFreq) + for(size_t i = 0; i < keys.size(); i++) { - minLogFreq = nodeInfo.logFreq; + _insertNode(keys[i], valuePointers[i]); } } - return minLogFreq; - } - - void _deleteNode(TrieNode* node) - { - if(!node) + private: + void _insertNode(const KeyType& key, const Value* ptValue) { - return; - } - for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++) - { - TrieNode* next = it->second; - _deleteNode(next); - } - delete node; - } + TrieNode* ptNode = _root; - }; + TrieNode::KeyMapType::const_iterator kmIter; + + for(KeyType::const_iterator citer = key.begin(); citer != key.end(); citer++) + { + if(NULL == ptNode->ptKeyMap) + { + ptNode->ptKeyMap = new TrieNode::KeyMapType; + } + kmIter = ptNode->ptKeyMap->find(*citer); + if(ptNode->ptKeyMap->end() == kmIter) + { + TrieNode * nextNode = new TrieNode; + nextNode->ptKeyMap = NULL; + nextNode->ptValue = NULL; + + ptNode->ptKeyMap[*citer] = nextNode; + ptNode = next; + } + else + { + ptNode = kmIter->second; + } + } + ptNode->ptValue = ptValue; + } + void _deleteNode(TrieNode* node) + { + if(!node) + { + return; + } + for(TrieNodeMapType::iterator it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++) + { + _deleteNode(it->second); + } + delete node->ptKeyMap; + delete node; + } + } } #endif