From 3ced4512124e4ccea5a236ceeda5394d668471ff Mon Sep 17 00:00:00 2001 From: wyy Date: Wed, 12 Nov 2014 18:55:17 +0800 Subject: [PATCH] use automation --- src/DictTrie.hpp | 26 ++----- src/MPSegment.hpp | 47 ++++------- src/Trie.hpp | 150 +++++++++++++++++++++++++++++++----- test/unittest/TSegments.cpp | 1 - test/unittest/TTrie.cpp | 12 +++ 5 files changed, 162 insertions(+), 74 deletions(-) diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 8c2d44f..01ff370 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -23,24 +23,6 @@ namespace CppJieba const size_t DICT_COLUMN_NUM = 3; const char* const UNKNOWN_TAG = "x"; - - - struct DictUnit - { - Unicode word; - double weight; - string tag; - }; - - inline ostream & operator << (ostream& os, const DictUnit& unit) - { - string s; - s << unit.word; - return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); - } - - typedef std::vector > DagType; - class DictTrie { public: @@ -107,6 +89,14 @@ namespace CppJieba { return _trie->find(begin, end, dag, offset); } + void find( + Unicode::const_iterator begin, + Unicode::const_iterator end, + vector& res + ) const + { + _trie->find(begin, end, res); + } private: diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index f397928..60c76bd 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -1,7 +1,3 @@ -/************************************ - * file enc : ASCII - * author : wuyanyi09@gmail.com - ************************************/ #ifndef CPPJIEBA_MPSEGMENT_H #define CPPJIEBA_MPSEGMENT_H @@ -17,17 +13,6 @@ namespace CppJieba { - struct SegmentChar - { - uint16_t uniCh; - DagType dag; - const DictUnit * pInfo; - double weight; - size_t nextPos; - SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) - {} - }; - class MPSegment: public SegmentBase { private: @@ -85,16 +70,9 @@ namespace CppJieba { return false; } - vector segmentChars(end - begin); + vector segmentChars; - //calc DAG - for(size_t i = 0; i < segmentChars.size(); i ++) - { - segmentChars[i].uniCh = *(begin + i); - segmentChars[i].dag.clear(); - segmentChars[i].dag.push_back(std::pair(i, NULL)); - _dictTrie.find(begin + i, end, segmentChars[i].dag, i); - } + _dictTrie.find(begin, end, segmentChars); _calcDP(segmentChars); @@ -112,24 +90,25 @@ namespace CppJieba } private: - void _calcDP(vector& SegmentChars) const + void _calcDP(vector& segmentChars) const { size_t nextPos; const DictUnit* p; double val; - for(int i = SegmentChars.size() - 1; i >= 0; i--) + for(ssize_t i = segmentChars.size() - 1; i >= 0; i--) { - SegmentChars[i].pInfo = NULL; - SegmentChars[i].weight = MIN_DOUBLE; - for(DagType::const_iterator it = SegmentChars[i].dag.begin(); it != SegmentChars[i].dag.end(); it++) + segmentChars[i].pInfo = NULL; + segmentChars[i].weight = MIN_DOUBLE; + assert(!segmentChars[i].dag.empty()); + for(DagType::const_iterator it = segmentChars[i].dag.begin(); it != segmentChars[i].dag.end(); it++) { nextPos = it->first; p = it->second; val = 0.0; - if(nextPos + 1 < SegmentChars.size()) + if(nextPos + 1 < segmentChars.size()) { - val += SegmentChars[nextPos + 1].weight; + val += segmentChars[nextPos + 1].weight; } if(p) @@ -140,10 +119,10 @@ namespace CppJieba { val += _dictTrie.getMinWeight(); } - if(val > SegmentChars[i].weight) + if(val > segmentChars[i].weight) { - SegmentChars[i].pInfo = p; - SegmentChars[i].weight = val; + segmentChars[i].pInfo = p; + segmentChars[i].weight = val; } } } diff --git a/src/Trie.hpp b/src/Trie.hpp index d89b38b..9d10267 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -3,18 +3,51 @@ #include "Limonp/StdExtension.hpp" #include +#include namespace CppJieba { using namespace std; + + struct DictUnit + { + Unicode word; + double weight; + string tag; + }; + + inline ostream & operator << (ostream& os, const DictUnit& unit) + { + string s; + s << unit.word; + return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); + } + + typedef std::vector > DagType; + + struct SegmentChar + { + uint16_t uniCh; + DagType dag; + const DictUnit * pInfo; + double weight; + size_t nextPos; + SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) + {} + }; + template class TrieNode { public: - typedef unordered_map* > KeyMapType; + typedef unordered_map* > NextMap; public: - KeyMapType * ptKeyMap; + TrieNode * fail; + NextMap * next; const ValueType * ptValue; + public: + TrieNode(): fail(NULL), next(NULL), ptValue(NULL) { + } }; template , class KeysContainerType = vector, class ValueContainerType = vector > @@ -28,10 +61,8 @@ namespace CppJieba Trie(const KeysContainerType& keys, const ValueContainerType& valuePointers) { _root = new TrieNodeType; - _root->ptKeyMap = NULL; - _root->ptValue = NULL; - _createTrie(keys, valuePointers); + _build();// build automation } ~Trie() { @@ -43,12 +74,12 @@ namespace CppJieba public: const ValueType* find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end) const { - typename TrieNodeType::KeyMapType::const_iterator citer; + typename TrieNodeType::NextMap::const_iterator citer; const TrieNodeType* ptNode = _root; for(typename KeyContainerType::const_iterator it = begin; it != end; it++) - { + {// build automation assert(ptNode); - if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*it))) + if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it))) { return NULL; } @@ -56,6 +87,47 @@ namespace CppJieba } return ptNode->ptValue; } + void find( + typename KeyContainerType::const_iterator begin, + typename KeyContainerType::const_iterator end, + vector& res + ) const + { + res.resize(end - begin); + const TrieNodeType * now = _root; + typename TrieNodeType::NextMap::const_iterator iter; + for (size_t i = 0; i < end - begin; i++) { + bool flag = false; + res[i].uniCh = *(begin + i); + assert(res[i].dag.empty()); + res[i].dag.reserve(4);//TODO + while( now != _root && (now->next == NULL || (iter = now->next->find(*(begin + i))) == now->next->end())) { + now = now->fail; + } + if(now->next == NULL || (iter = now->next->find(*(begin + i))) == now->next->end()) { + now = _root; + } else { + now = iter->second; + const TrieNodeType * temp = now; + while(temp != _root) { + if (temp->ptValue) { + string str; + TransCode::encode(temp->ptValue->word, str); + size_t pos = i - temp->ptValue->word.size() + 1; + res[pos].dag.push_back(pair(i, temp->ptValue)); + if(temp->ptValue->word.size() == 1) { + flag = true; + } + } + temp = temp->fail; + assert(temp); + } + } + if(!flag) { + res[i].dag.push_back(pair(i, NULL)); + } + } + } bool find( typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end, @@ -63,11 +135,11 @@ namespace CppJieba size_t offset = 0) const { const TrieNodeType * ptNode = _root; - typename TrieNodeType::KeyMapType::const_iterator citer; + typename TrieNodeType::NextMap::const_iterator citer; for(typename KeyContainerType::const_iterator itr = begin; itr != end ; itr++) { assert(ptNode); - if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*itr))) + if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr))) { break; } @@ -86,6 +158,42 @@ namespace CppJieba } return !res.empty(); } + private: + void _build() + { + queue que; + assert(_root->ptValue == NULL); + assert(_root->next); + _root->fail = NULL; + for(typename TrieNodeType::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) { + iter->second->fail = _root; + que.push(iter->second); + } + TrieNodeType* back = NULL; + typename TrieNodeType::NextMap::iterator backiter; + while(!que.empty()) { + TrieNodeType * now = que.front(); + que.pop(); + if(now->next == NULL) { + continue; + } + for(typename TrieNodeType::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) { + back = now->fail; + while(back != NULL) { + if(back->next && (backiter = back->next->find(iter->first)) != back->next->end()) + { + iter->second->fail = backiter->second; + break; + } + back = back->fail; + } + if(back == NULL) { + iter->second->fail = _root; + } + que.push(iter->second); + } + } + } private: void _createTrie(const KeysContainerType& keys, const ValueContainerType& valuePointers) { @@ -105,22 +213,22 @@ namespace CppJieba { TrieNodeType* ptNode = _root; - typename TrieNodeType::KeyMapType::const_iterator kmIter; + typename TrieNodeType::NextMap::const_iterator kmIter; for(typename KeyContainerType::const_iterator citer = key.begin(); citer != key.end(); citer++) { - if(NULL == ptNode->ptKeyMap) + if(NULL == ptNode->next) { - ptNode->ptKeyMap = new typename TrieNodeType::KeyMapType; + ptNode->next = new typename TrieNodeType::NextMap; } - kmIter = ptNode->ptKeyMap->find(*citer); - if(ptNode->ptKeyMap->end() == kmIter) + kmIter = ptNode->next->find(*citer); + if(ptNode->next->end() == kmIter) { TrieNodeType * nextNode = new TrieNodeType; - nextNode->ptKeyMap = NULL; + nextNode->next = NULL; nextNode->ptValue = NULL; - (*ptNode->ptKeyMap)[*citer] = nextNode; + (*ptNode->next)[*citer] = nextNode; ptNode = nextNode; } else @@ -136,14 +244,14 @@ namespace CppJieba { return; } - if(node->ptKeyMap) + if(node->next) { - typename TrieNodeType::KeyMapType::iterator it; - for(it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++) + typename TrieNodeType::NextMap::iterator it; + for(it = node->next->begin(); it != node->next->end(); it++) { _deleteNode(it->second); } - delete node->ptKeyMap; + delete node->next; } delete node; } diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp index 36eb36f..1955267 100644 --- a/test/unittest/TSegments.cpp +++ b/test/unittest/TSegments.cpp @@ -21,7 +21,6 @@ TEST(MixSegmentTest, Test1) ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); ASSERT_TRUE(segment.cut(str2, words)); ASSERT_EQ(words, vector(res2, res2 + sizeof(res2)/sizeof(res2[0]))); - //exit(0); } TEST(MixSegmentTest, NoUserDict) diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp index 18ff9a9..5ca12fa 100644 --- a/test/unittest/TTrie.cpp +++ b/test/unittest/TTrie.cpp @@ -1,4 +1,5 @@ #include "src/DictTrie.hpp" +#include "src/MPSegment.hpp" #include "gtest/gtest.h" using namespace CppJieba; @@ -64,3 +65,14 @@ TEST(DictTrieTest, UserDict) res << *unit; ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] x -2.975", res); } + +TEST(DictTrieTest, automation) +{ + DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); + //string word = "yasherhs"; + string word = "abcderf"; + Unicode unicode; + ASSERT_TRUE(TransCode::decode(word, unicode)); + vector res; + trie.find(unicode.begin(), unicode.end(), res); +}