From 24120c92b1f9e4e3eb22b6512d7202964c3f2c31 Mon Sep 17 00:00:00 2001 From: wyy Date: Thu, 10 Apr 2014 09:16:35 -0700 Subject: [PATCH] compile ok --- src/DictTrie.hpp | 50 +++++++++++++++++++++++--- src/FullSegment.hpp | 2 +- src/MPSegment.hpp | 6 ++-- src/PosTagger.hpp | 2 +- src/Trie.hpp | 78 ++++++++++++++++++++++++----------------- test/unittest/TTrie.cpp | 26 +++++++------- 6 files changed, 108 insertions(+), 56 deletions(-) diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 1dae1e9..0564d17 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -12,7 +12,7 @@ #include "Limonp/logger.hpp" #include "Limonp/InitOnOff.hpp" #include "TransCode.hpp" -#include "DictTrie.hpp" +#include "Trie.hpp" @@ -32,20 +32,30 @@ namespace CppJieba double logFreq; //logFreq = log(freq/sum(freq)); }; + inline ostream & operator << (ostream& os, const DictUnit& unit) + { + string s; + s << unit.word; + return os << string_format("%s %u %s %llf", s.c_str(), unit.freq, unit.tag.c_str(), unit.logFreq); + } + typedef map DagType; - class DictTrie: InitOnOff + class DictTrie: public InitOnOff { - + public: + typedef Trie TrieType; private: vector _nodeInfos; + TrieType * _trie; - int64_t _freqSum; + size_t _freqSum; double _minLogFreq; public: DictTrie() { + _trie = NULL; _freqSum = 0; _minLogFreq = MAX_DOUBLE; _setInitFlag(false); @@ -57,6 +67,10 @@ namespace CppJieba } ~DictTrie() { + if(_trie) + { + delete _trie; + } } private: @@ -69,14 +83,40 @@ namespace CppJieba _freqSum = _calculateFreqSum(_nodeInfos); assert(_freqSum); _minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum); - return _setInitFlag(true); + _trie = _creatTrie(_nodeInfos); + return _setInitFlag(_trie); } + public: + const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const + { + return _trie->find(begin, end); + } + bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const + { + return _trie->find(begin, end, dag, offset); + } public: double getMinLogFreq() const {return _minLogFreq;}; private: + TrieType * _creatTrie(const vector& dictUnits) + { + if(dictUnits.empty()) + { + return NULL; + } + vector words; + vector valuePointers; + for(size_t i = 0 ; i < dictUnits.size(); i ++) + { + words.push_back(dictUnits[i].word); + valuePointers.push_back(&dictUnits[i]); + } + TrieType * trie = new TrieType(words, valuePointers); + return trie; + } void _loadDict(const string& filePath, vector& nodeInfos) const { ifstream ifs(filePath.c_str()); diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index fa55cde..d0eedb1 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -64,7 +64,7 @@ namespace CppJieba if (_dictTrie.find(uItr, end, tRes, 0)) { for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) - //for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) + //for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) { wordLen = itr->second->word.size(); if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index b22d21d..7705227 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -21,7 +21,7 @@ namespace CppJieba { uint16_t uniCh; DagType dag; - const TrieNodeInfo * pInfo; + const DictUnit * pInfo; double weight; SegmentChar():uniCh(0), pInfo(NULL), weight(0.0) @@ -142,7 +142,7 @@ namespace CppJieba } size_t nextPos; - const TrieNodeInfo* p; + const DictUnit* p; double val; for(int i = segContext.size() - 1; i >= 0; i--) @@ -182,7 +182,7 @@ namespace CppJieba size_t i = 0; while(i < segContext.size()) { - const TrieNodeInfo* p = segContext[i].pInfo; + const DictUnit* p = segContext[i].pInfo; if(p) { res.push_back(p->word); diff --git a/src/PosTagger.hpp b/src/PosTagger.hpp index c760c0b..f6ee8fd 100644 --- a/src/PosTagger.hpp +++ b/src/PosTagger.hpp @@ -42,7 +42,7 @@ namespace CppJieba return false; } - const TrieNodeInfo *tmp = NULL; + const DictUnit *tmp = NULL; Unicode unico; for (vector::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { diff --git a/src/Trie.hpp b/src/Trie.hpp index 8a9080d..7d6feaf 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -6,26 +6,28 @@ namespace CppJieba { + using namespace std; template class TrieNode { public: - typedef unordered_map TrieNodeMapType; + typedef unordered_map* > KeyMapType; public: - TrieNodeMap * ptKeyMap; + KeyMapType * ptKeyMap; const ValueType * ptValue; }; template class Trie { - private: - TrieNode* _root; - private: public: - Trie(const vector& keys, const vector& valuePointers) + typedef TrieNode TrieNodeType; + private: + TrieNodeType* _root; + public: + Trie(const vector >& keys, const vector& valuePointers) { - _root = new TrieNode; + _root = new TrieNodeType; _root->ptKeyMap = NULL; _root->ptValue = NULL; @@ -33,28 +35,33 @@ namespace CppJieba } ~Trie() { + if(_root) + { + _deleteNode(_root); + } } public: - const ValueType* find(KeyType::const_iterator begin; KeyType::const_iterator end) const + const ValueType* find(typename vector::const_iterator begin, typename vector::const_iterator end) const { - TrieNodeMapType::const_iterator citer; - const TrieNode* ptNode = _root; - for(KeyType::const_iterator it = begin; it != end; it++) + typename TrieNodeType::KeyMapType::const_iterator citer; + const TrieNodeType* ptNode = _root; + for(typename vector::const_iterator it = begin; it != end; it++) { citer = ptNode->ptKeyMap->find(*it); if(ptNode->ptKeyMap->end() == citer) { return NULL; } - ptNode= citer->second; + ptNode = citer->second; } return ptNode->ptValue; } - bool find(KeyType::const_iterator begin, KeyType::const_iterator end, map& ordererMap) const + bool find(typename vector::const_iterator begin, typename vector ::const_iterator end, map::size_type, const ValueType* >& ordererMap, size_t offset = 0) const { - const TrieNode * ptNode = _root; - TrieNodeMapType::const_iterator citer; - for(KeyType::const_iterator itr = begin; itr != end ; itr++) + const TrieNodeType * ptNode = _root; + typename TrieNodeType::KeyMapType::const_iterator citer; + ordererMap.clear(); + for(typename vector::const_iterator itr = begin; itr != end ; itr++) { citer = ptNode->ptKeyMap->find(*itr); if(ptNode->ptKeyMap->end() == citer) @@ -64,46 +71,47 @@ namespace CppJieba ptNode = citer->second; if(ptNode->ptValue) { - ordererMap[itr - begin] = ptNode->ptValue; + ordererMap[itr - begin + offset] = ptNode->ptValue; } } + return ordererMap.size(); } private: - void _createTrie(const vector& keys, const vector& valuePointers) + void _createTrie(const vector >& keys, const vector& valuePointers) { - if(values.empty() || keys.empty()) + if(valuePointers.empty() || keys.empty()) { return; } assert(keys.size() == valuePointers.size()); - + for(size_t i = 0; i < keys.size(); i++) { _insertNode(keys[i], valuePointers[i]); } } private: - void _insertNode(const KeyType& key, const Value* ptValue) + void _insertNode(const vector& key, const ValueType* ptValue) { - TrieNode* ptNode = _root; + TrieNodeType* ptNode = _root; - TrieNode::KeyMapType::const_iterator kmIter; + typename TrieNodeType::KeyMapType::const_iterator kmIter; - for(KeyType::const_iterator citer = key.begin(); citer != key.end(); citer++) + for(typename vector::const_iterator citer = key.begin(); citer != key.end(); citer++) { if(NULL == ptNode->ptKeyMap) { - ptNode->ptKeyMap = new TrieNode::KeyMapType; + ptNode->ptKeyMap = new typename TrieNodeType::KeyMapType; } kmIter = ptNode->ptKeyMap->find(*citer); if(ptNode->ptKeyMap->end() == kmIter) { - TrieNode * nextNode = new TrieNode; + TrieNodeType * nextNode = new TrieNodeType; nextNode->ptKeyMap = NULL; nextNode->ptValue = NULL; - ptNode->ptKeyMap[*citer] = nextNode; - ptNode = next; + (*ptNode->ptKeyMap)[*citer] = nextNode; + ptNode = nextNode; } else { @@ -112,20 +120,24 @@ namespace CppJieba } ptNode->ptValue = ptValue; } - void _deleteNode(TrieNode* node) + void _deleteNode(TrieNodeType* node) { if(!node) { return; } - for(TrieNodeMapType::iterator it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++) + if(node->ptKeyMap) { - _deleteNode(it->second); + typename TrieNodeType::KeyMapType::iterator it; + for(it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++) + { + _deleteNode(it->second); + } + delete node->ptKeyMap; } - delete node->ptKeyMap; delete node; } - } + }; } #endif diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp index 1ccbb98..5e56a44 100644 --- a/test/unittest/TTrie.cpp +++ b/test/unittest/TTrie.cpp @@ -1,30 +1,30 @@ -#include "src/Trie.hpp" +#include "src/DictTrie.hpp" #include "gtest/gtest.h" using namespace CppJieba; static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8"; -TEST(TrieTest, NewAndDelete) +TEST(DictTrieTest, NewAndDelete) { - Trie * trie; - trie = new Trie(DICT_FILE); + DictTrie * trie; + trie = new DictTrie(DICT_FILE); delete trie; - trie = new Trie(); + trie = new DictTrie(); delete trie; } -TEST(TrieTest, Test1) +TEST(DictTrieTest, Test1) { string s1, s2; - Trie trie; + DictTrie trie; ASSERT_TRUE(trie.init(DICT_FILE)); ASSERT_LT(trie.getMinLogFreq() + 15.6479, 0.001); string word("来到"); Unicode uni; ASSERT_TRUE(TransCode::decode(word, uni)); - TrieNodeInfo nodeInfo; + DictUnit nodeInfo; nodeInfo.word = uni; nodeInfo.freq = 8779; nodeInfo.tag = "v"; @@ -34,9 +34,9 @@ TEST(TrieTest, Test1) EXPECT_EQ("[\"26469\", \"21040\"]:8779:v:-8.87033", s2); word = "清华大学"; - vector > res; - map resMap; - map mp; + vector > res; + map resMap; + map mp; const char * words[] = {"清", "清华", "清华大学"}; for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { @@ -44,10 +44,10 @@ TEST(TrieTest, Test1) res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end()))); resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end()); } - //TrieNodeInfo + //DictUnit //res.push_back(make_pair(0, )) - vector > vec; + vector > vec; ASSERT_TRUE(TransCode::decode(word, uni)); //print(uni); ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0));