From 11b041ed523b50d82b653783a29f458b7f52573e Mon Sep 17 00:00:00 2001 From: wyy Date: Wed, 5 Nov 2014 14:57:34 +0800 Subject: [PATCH 1/8] make load_test test time longer --- test/load_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/load_test.cpp b/test/load_test.cpp index 2d9543d..5abe2e7 100644 --- a/test/load_test.cpp +++ b/test/load_test.cpp @@ -8,7 +8,7 @@ using namespace CppJieba; -void cut(size_t times = 20) +void cut(size_t times = 50) { MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8"); vector res; From b9736ee132a34af3b944ded5f02f65b60afe2109 Mon Sep 17 00:00:00 2001 From: wyy Date: Wed, 5 Nov 2014 15:31:09 +0800 Subject: [PATCH 2/8] update trie and dag , make cut faster . see details in changelog.md --- ChangeLog.md | 5 +++++ src/DictTrie.hpp | 2 +- src/MPSegment.hpp | 2 +- src/Trie.hpp | 18 ++++++++++++++---- test/unittest/TTrie.cpp | 10 +++++----- 5 files changed, 26 insertions(+), 11 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index c968114..5e5f78b 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,5 +1,10 @@ # CppJieba ChangeLog +## v2.4.4 (is coming) + +1. 修改两条更细粒度的特殊过滤规则,将连续的数字(包括浮点数)和连续的字母单独切分出来(而不会混在一起)。 +2. 修改最大概率法时动态规划过程需要使用的 DAG 数据结构(同时也修改 Trie 的 DAG 查询函数),提高分词速度 8% 。 + ## v2.4.3 1. 更新 [Husky] 服务代码,新 [Husky] 为基于线程池的服务器简易框架。并且修复当 HTTP POST 请求时 body 过长数据可能丢失的问题。 diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 16a46e5..8c2d44f 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -39,7 +39,7 @@ namespace CppJieba return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); } - typedef map DagType; + typedef std::vector > DagType; class DictTrie { diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 3b62f4a..f397928 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -92,8 +92,8 @@ namespace CppJieba { segmentChars[i].uniCh = *(begin + i); segmentChars[i].dag.clear(); + segmentChars[i].dag.push_back(std::pair(i, NULL)); _dictTrie.find(begin + i, end, segmentChars[i].dag, i); - segmentChars[i].dag.insert(pair(i, NULL)); } _calcDP(segmentChars); diff --git a/src/Trie.hpp b/src/Trie.hpp index dbaf989..d89b38b 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -56,11 +56,14 @@ namespace CppJieba } return ptNode->ptValue; } - bool find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end, map& ordererMap, size_t offset = 0) const + bool find( + typename KeyContainerType::const_iterator begin, + typename KeyContainerType::const_iterator end, + std::vector >& res, + size_t offset = 0) const { const TrieNodeType * ptNode = _root; typename TrieNodeType::KeyMapType::const_iterator citer; - ordererMap.clear(); for(typename KeyContainerType::const_iterator itr = begin; itr != end ; itr++) { assert(ptNode); @@ -71,10 +74,17 @@ namespace CppJieba ptNode = citer->second; if(ptNode->ptValue) { - ordererMap[itr - begin + offset] = ptNode->ptValue; + if(itr == begin && res.size() == 1) // first singleword + { + res[0].second = ptNode->ptValue; + } + else + { + res.push_back(pair(itr - begin + offset, ptNode->ptValue)); + } } } - return ordererMap.size(); + return !res.empty(); } private: void _createTrie(const KeysContainerType& keys, const ValueContainerType& valuePointers) diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp index 2791884..18ff9a9 100644 --- a/test/unittest/TTrie.cpp +++ b/test/unittest/TTrie.cpp @@ -34,22 +34,22 @@ TEST(DictTrieTest, Test1) EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2); word = "清华大学"; vector > res; - map resMap; - map mp; + //vector resMap; + vector > res2; const char * words[] = {"清", "清华", "清华大学"}; for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { ASSERT_TRUE(TransCode::decode(words[i], uni)); res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end()))); - resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end()); + //resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end()); } //DictUnit //res.push_back(make_pair(0, )) vector > vec; ASSERT_TRUE(TransCode::decode(word, uni)); - ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0)); - ASSERT_EQ(mp, resMap); + ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0)); + ASSERT_EQ(res, res2); } TEST(DictTrieTest, UserDict) From 3ced4512124e4ccea5a236ceeda5394d668471ff Mon Sep 17 00:00:00 2001 From: wyy Date: Wed, 12 Nov 2014 18:55:17 +0800 Subject: [PATCH 3/8] use automation --- src/DictTrie.hpp | 26 ++----- src/MPSegment.hpp | 47 ++++------- src/Trie.hpp | 150 +++++++++++++++++++++++++++++++----- test/unittest/TSegments.cpp | 1 - test/unittest/TTrie.cpp | 12 +++ 5 files changed, 162 insertions(+), 74 deletions(-) diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 8c2d44f..01ff370 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -23,24 +23,6 @@ namespace CppJieba const size_t DICT_COLUMN_NUM = 3; const char* const UNKNOWN_TAG = "x"; - - - struct DictUnit - { - Unicode word; - double weight; - string tag; - }; - - inline ostream & operator << (ostream& os, const DictUnit& unit) - { - string s; - s << unit.word; - return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); - } - - typedef std::vector > DagType; - class DictTrie { public: @@ -107,6 +89,14 @@ namespace CppJieba { return _trie->find(begin, end, dag, offset); } + void find( + Unicode::const_iterator begin, + Unicode::const_iterator end, + vector& res + ) const + { + _trie->find(begin, end, res); + } private: diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index f397928..60c76bd 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -1,7 +1,3 @@ -/************************************ - * file enc : ASCII - * author : wuyanyi09@gmail.com - ************************************/ #ifndef CPPJIEBA_MPSEGMENT_H #define CPPJIEBA_MPSEGMENT_H @@ -17,17 +13,6 @@ namespace CppJieba { - struct SegmentChar - { - uint16_t uniCh; - DagType dag; - const DictUnit * pInfo; - double weight; - size_t nextPos; - SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) - {} - }; - class MPSegment: public SegmentBase { private: @@ -85,16 +70,9 @@ namespace CppJieba { return false; } - vector segmentChars(end - begin); + vector segmentChars; - //calc DAG - for(size_t i = 0; i < segmentChars.size(); i ++) - { - segmentChars[i].uniCh = *(begin + i); - segmentChars[i].dag.clear(); - segmentChars[i].dag.push_back(std::pair(i, NULL)); - _dictTrie.find(begin + i, end, segmentChars[i].dag, i); - } + _dictTrie.find(begin, end, segmentChars); _calcDP(segmentChars); @@ -112,24 +90,25 @@ namespace CppJieba } private: - void _calcDP(vector& SegmentChars) const + void _calcDP(vector& segmentChars) const { size_t nextPos; const DictUnit* p; double val; - for(int i = SegmentChars.size() - 1; i >= 0; i--) + for(ssize_t i = segmentChars.size() - 1; i >= 0; i--) { - SegmentChars[i].pInfo = NULL; - SegmentChars[i].weight = MIN_DOUBLE; - for(DagType::const_iterator it = SegmentChars[i].dag.begin(); it != SegmentChars[i].dag.end(); it++) + segmentChars[i].pInfo = NULL; + segmentChars[i].weight = MIN_DOUBLE; + assert(!segmentChars[i].dag.empty()); + for(DagType::const_iterator it = segmentChars[i].dag.begin(); it != segmentChars[i].dag.end(); it++) { nextPos = it->first; p = it->second; val = 0.0; - if(nextPos + 1 < SegmentChars.size()) + if(nextPos + 1 < segmentChars.size()) { - val += SegmentChars[nextPos + 1].weight; + val += segmentChars[nextPos + 1].weight; } if(p) @@ -140,10 +119,10 @@ namespace CppJieba { val += _dictTrie.getMinWeight(); } - if(val > SegmentChars[i].weight) + if(val > segmentChars[i].weight) { - SegmentChars[i].pInfo = p; - SegmentChars[i].weight = val; + segmentChars[i].pInfo = p; + segmentChars[i].weight = val; } } } diff --git a/src/Trie.hpp b/src/Trie.hpp index d89b38b..9d10267 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -3,18 +3,51 @@ #include "Limonp/StdExtension.hpp" #include +#include namespace CppJieba { using namespace std; + + struct DictUnit + { + Unicode word; + double weight; + string tag; + }; + + inline ostream & operator << (ostream& os, const DictUnit& unit) + { + string s; + s << unit.word; + return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); + } + + typedef std::vector > DagType; + + struct SegmentChar + { + uint16_t uniCh; + DagType dag; + const DictUnit * pInfo; + double weight; + size_t nextPos; + SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) + {} + }; + template class TrieNode { public: - typedef unordered_map* > KeyMapType; + typedef unordered_map* > NextMap; public: - KeyMapType * ptKeyMap; + TrieNode * fail; + NextMap * next; const ValueType * ptValue; + public: + TrieNode(): fail(NULL), next(NULL), ptValue(NULL) { + } }; template , class KeysContainerType = vector, class ValueContainerType = vector > @@ -28,10 +61,8 @@ namespace CppJieba Trie(const KeysContainerType& keys, const ValueContainerType& valuePointers) { _root = new TrieNodeType; - _root->ptKeyMap = NULL; - _root->ptValue = NULL; - _createTrie(keys, valuePointers); + _build();// build automation } ~Trie() { @@ -43,12 +74,12 @@ namespace CppJieba public: const ValueType* find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end) const { - typename TrieNodeType::KeyMapType::const_iterator citer; + typename TrieNodeType::NextMap::const_iterator citer; const TrieNodeType* ptNode = _root; for(typename KeyContainerType::const_iterator it = begin; it != end; it++) - { + {// build automation assert(ptNode); - if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*it))) + if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it))) { return NULL; } @@ -56,6 +87,47 @@ namespace CppJieba } return ptNode->ptValue; } + void find( + typename KeyContainerType::const_iterator begin, + typename KeyContainerType::const_iterator end, + vector& res + ) const + { + res.resize(end - begin); + const TrieNodeType * now = _root; + typename TrieNodeType::NextMap::const_iterator iter; + for (size_t i = 0; i < end - begin; i++) { + bool flag = false; + res[i].uniCh = *(begin + i); + assert(res[i].dag.empty()); + res[i].dag.reserve(4);//TODO + while( now != _root && (now->next == NULL || (iter = now->next->find(*(begin + i))) == now->next->end())) { + now = now->fail; + } + if(now->next == NULL || (iter = now->next->find(*(begin + i))) == now->next->end()) { + now = _root; + } else { + now = iter->second; + const TrieNodeType * temp = now; + while(temp != _root) { + if (temp->ptValue) { + string str; + TransCode::encode(temp->ptValue->word, str); + size_t pos = i - temp->ptValue->word.size() + 1; + res[pos].dag.push_back(pair(i, temp->ptValue)); + if(temp->ptValue->word.size() == 1) { + flag = true; + } + } + temp = temp->fail; + assert(temp); + } + } + if(!flag) { + res[i].dag.push_back(pair(i, NULL)); + } + } + } bool find( typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end, @@ -63,11 +135,11 @@ namespace CppJieba size_t offset = 0) const { const TrieNodeType * ptNode = _root; - typename TrieNodeType::KeyMapType::const_iterator citer; + typename TrieNodeType::NextMap::const_iterator citer; for(typename KeyContainerType::const_iterator itr = begin; itr != end ; itr++) { assert(ptNode); - if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*itr))) + if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr))) { break; } @@ -86,6 +158,42 @@ namespace CppJieba } return !res.empty(); } + private: + void _build() + { + queue que; + assert(_root->ptValue == NULL); + assert(_root->next); + _root->fail = NULL; + for(typename TrieNodeType::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) { + iter->second->fail = _root; + que.push(iter->second); + } + TrieNodeType* back = NULL; + typename TrieNodeType::NextMap::iterator backiter; + while(!que.empty()) { + TrieNodeType * now = que.front(); + que.pop(); + if(now->next == NULL) { + continue; + } + for(typename TrieNodeType::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) { + back = now->fail; + while(back != NULL) { + if(back->next && (backiter = back->next->find(iter->first)) != back->next->end()) + { + iter->second->fail = backiter->second; + break; + } + back = back->fail; + } + if(back == NULL) { + iter->second->fail = _root; + } + que.push(iter->second); + } + } + } private: void _createTrie(const KeysContainerType& keys, const ValueContainerType& valuePointers) { @@ -105,22 +213,22 @@ namespace CppJieba { TrieNodeType* ptNode = _root; - typename TrieNodeType::KeyMapType::const_iterator kmIter; + typename TrieNodeType::NextMap::const_iterator kmIter; for(typename KeyContainerType::const_iterator citer = key.begin(); citer != key.end(); citer++) { - if(NULL == ptNode->ptKeyMap) + if(NULL == ptNode->next) { - ptNode->ptKeyMap = new typename TrieNodeType::KeyMapType; + ptNode->next = new typename TrieNodeType::NextMap; } - kmIter = ptNode->ptKeyMap->find(*citer); - if(ptNode->ptKeyMap->end() == kmIter) + kmIter = ptNode->next->find(*citer); + if(ptNode->next->end() == kmIter) { TrieNodeType * nextNode = new TrieNodeType; - nextNode->ptKeyMap = NULL; + nextNode->next = NULL; nextNode->ptValue = NULL; - (*ptNode->ptKeyMap)[*citer] = nextNode; + (*ptNode->next)[*citer] = nextNode; ptNode = nextNode; } else @@ -136,14 +244,14 @@ namespace CppJieba { return; } - if(node->ptKeyMap) + if(node->next) { - typename TrieNodeType::KeyMapType::iterator it; - for(it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++) + typename TrieNodeType::NextMap::iterator it; + for(it = node->next->begin(); it != node->next->end(); it++) { _deleteNode(it->second); } - delete node->ptKeyMap; + delete node->next; } delete node; } diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp index 36eb36f..1955267 100644 --- a/test/unittest/TSegments.cpp +++ b/test/unittest/TSegments.cpp @@ -21,7 +21,6 @@ TEST(MixSegmentTest, Test1) ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); ASSERT_TRUE(segment.cut(str2, words)); ASSERT_EQ(words, vector(res2, res2 + sizeof(res2)/sizeof(res2[0]))); - //exit(0); } TEST(MixSegmentTest, NoUserDict) diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp index 18ff9a9..5ca12fa 100644 --- a/test/unittest/TTrie.cpp +++ b/test/unittest/TTrie.cpp @@ -1,4 +1,5 @@ #include "src/DictTrie.hpp" +#include "src/MPSegment.hpp" #include "gtest/gtest.h" using namespace CppJieba; @@ -64,3 +65,14 @@ TEST(DictTrieTest, UserDict) res << *unit; ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] x -2.975", res); } + +TEST(DictTrieTest, automation) +{ + DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); + //string word = "yasherhs"; + string word = "abcderf"; + Unicode unicode; + ASSERT_TRUE(TransCode::decode(word, unicode)); + vector res; + trie.find(unicode.begin(), unicode.end(), res); +} From 75367a20c91ad4cb0f852fc059d8893e834a5537 Mon Sep 17 00:00:00 2001 From: wyy Date: Wed, 12 Nov 2014 19:45:20 +0800 Subject: [PATCH 4/8] little modification --- src/MPSegment.hpp | 9 ++------- src/Trie.hpp | 30 +++++++++++++++++++----------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 60c76bd..333fe65 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -76,11 +76,7 @@ namespace CppJieba _calcDP(segmentChars); - if(!_cut(segmentChars, res)) - { - LogError("_cut failed."); - return false; - } + _cut(segmentChars, res); return true; } @@ -127,7 +123,7 @@ namespace CppJieba } } } - bool _cut(const vector& segmentChars, vector& res)const + void _cut(const vector& segmentChars, vector& res) const { size_t i = 0; while(i < segmentChars.size()) @@ -144,7 +140,6 @@ namespace CppJieba i++; } } - return true; } diff --git a/src/Trie.hpp b/src/Trie.hpp index 9d10267..39651c2 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -87,6 +87,7 @@ namespace CppJieba } return ptNode->ptValue; } + // aho-corasick-automation void find( typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end, @@ -96,26 +97,32 @@ namespace CppJieba res.resize(end - begin); const TrieNodeType * now = _root; typename TrieNodeType::NextMap::const_iterator iter; - for (size_t i = 0; i < end - begin; i++) { + for (size_t i = 0; i < end - begin; i++) + { bool flag = false; res[i].uniCh = *(begin + i); assert(res[i].dag.empty()); - res[i].dag.reserve(4);//TODO - while( now != _root && (now->next == NULL || (iter = now->next->find(*(begin + i))) == now->next->end())) { + res[i].dag.reserve(2); + while( now != _root && (now->next == NULL || (iter = now->next->find(*(begin + i))) == now->next->end())) + { now = now->fail; } - if(now->next == NULL || (iter = now->next->find(*(begin + i))) == now->next->end()) { + if(now->next == NULL || (iter = now->next->find(*(begin + i))) == now->next->end()) + { now = _root; - } else { + } + else + { now = iter->second; const TrieNodeType * temp = now; - while(temp != _root) { - if (temp->ptValue) { - string str; - TransCode::encode(temp->ptValue->word, str); + while(temp != _root) + { + if (temp->ptValue) + { size_t pos = i - temp->ptValue->word.size() + 1; res[pos].dag.push_back(pair(i, temp->ptValue)); - if(temp->ptValue->word.size() == 1) { + if(pos == i) + { flag = true; } } @@ -123,7 +130,8 @@ namespace CppJieba assert(temp); } } - if(!flag) { + if(!flag) + { res[i].dag.push_back(pair(i, NULL)); } } From 99c3405e13ef4363948f689743347e9b900b37ee Mon Sep 17 00:00:00 2001 From: wyy Date: Wed, 12 Nov 2014 20:03:32 +0800 Subject: [PATCH 5/8] move flag --- src/Trie.hpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/Trie.hpp b/src/Trie.hpp index 39651c2..c97d96d 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -99,15 +99,15 @@ namespace CppJieba typename TrieNodeType::NextMap::const_iterator iter; for (size_t i = 0; i < end - begin; i++) { - bool flag = false; res[i].uniCh = *(begin + i); assert(res[i].dag.empty()); res[i].dag.reserve(2); - while( now != _root && (now->next == NULL || (iter = now->next->find(*(begin + i))) == now->next->end())) + res[i].dag.push_back(pair(i, NULL)); + while( now != _root && (now->next == NULL || (iter = now->next->find(res[i].uniCh)) == now->next->end())) { now = now->fail; } - if(now->next == NULL || (iter = now->next->find(*(begin + i))) == now->next->end()) + if(now->next == NULL || (iter = now->next->find(res[i].uniCh)) == now->next->end()) { now = _root; } @@ -123,17 +123,13 @@ namespace CppJieba res[pos].dag.push_back(pair(i, temp->ptValue)); if(pos == i) { - flag = true; + res[pos].dag[0].second = temp->ptValue; } } temp = temp->fail; assert(temp); } } - if(!flag) - { - res[i].dag.push_back(pair(i, NULL)); - } } } bool find( From c119dc0a932da33f804df8b425aff3024049b65f Mon Sep 17 00:00:00 2001 From: wyy Date: Wed, 12 Nov 2014 21:18:30 +0800 Subject: [PATCH 6/8] use localvector in dag --- src/Trie.hpp | 52 ++++++++++++++++++++++++++++++++++------- test/unittest/TTrie.cpp | 8 ++++--- 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/src/Trie.hpp b/src/Trie.hpp index c97d96d..0e71973 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -23,7 +23,7 @@ namespace CppJieba return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); } - typedef std::vector > DagType; + typedef LocalVector > DagType; struct SegmentChar { @@ -34,6 +34,8 @@ namespace CppJieba size_t nextPos; SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) {} + ~SegmentChar() + {} }; template @@ -45,6 +47,20 @@ namespace CppJieba TrieNode * fail; NextMap * next; const ValueType * ptValue; + public: + const TrieNode * findNext(KeyType key) const + { + if(next == NULL) + { + return NULL; + } + typename NextMap::const_iterator iter = next->find(key); + if(iter == next->end()) + { + return NULL; + } + return iter->second; + } public: TrieNode(): fail(NULL), next(NULL), ptValue(NULL) { } @@ -96,24 +112,42 @@ namespace CppJieba { res.resize(end - begin); const TrieNodeType * now = _root; - typename TrieNodeType::NextMap::const_iterator iter; + //typename TrieNodeType::NextMap::const_iterator iter; + const TrieNodeType* node; for (size_t i = 0; i < end - begin; i++) { - res[i].uniCh = *(begin + i); + Unicode::value_type ch = *(begin + i); + res[i].uniCh = ch; assert(res[i].dag.empty()); - res[i].dag.reserve(2); res[i].dag.push_back(pair(i, NULL)); - while( now != _root && (now->next == NULL || (iter = now->next->find(res[i].uniCh)) == now->next->end())) + bool flag = false; + + // rollback + while( now != _root ) { - now = now->fail; + node = now->findNext(ch); + if (node != NULL) + { + flag = true; + break; + } + else + { + now = now->fail; + } } - if(now->next == NULL || (iter = now->next->find(res[i].uniCh)) == now->next->end()) + + if(!flag) + { + node = now->findNext(ch); + } + if(node == NULL) { now = _root; } else { - now = iter->second; + now = node; const TrieNodeType * temp = now; while(temp != _root) { @@ -135,7 +169,7 @@ namespace CppJieba bool find( typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end, - std::vector >& res, + DagType & res, size_t offset = 0) const { const TrieNodeType * ptNode = _root; diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp index 5ca12fa..5baea4a 100644 --- a/test/unittest/TTrie.cpp +++ b/test/unittest/TTrie.cpp @@ -34,9 +34,9 @@ TEST(DictTrieTest, Test1) EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2); word = "清华大学"; - vector > res; + LocalVector > res; //vector resMap; - vector > res2; + LocalVector > res2; const char * words[] = {"清", "清华", "清华大学"}; for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { @@ -50,7 +50,9 @@ TEST(DictTrieTest, Test1) vector > vec; ASSERT_TRUE(TransCode::decode(word, uni)); ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0)); - ASSERT_EQ(res, res2); + s1 << res; + s2 << res; + ASSERT_EQ(s1, s2); } TEST(DictTrieTest, UserDict) From 7868f7cdff4983141ae3002dabb863c60c0ce488 Mon Sep 17 00:00:00 2001 From: wyy Date: Thu, 13 Nov 2014 01:16:38 +0800 Subject: [PATCH 7/8] =?UTF-8?q?=E5=8E=BB=E9=99=A4=E4=B8=80=E4=BA=9B=20temp?= =?UTF-8?q?late=20=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/DictTrie.hpp | 8 +- src/Trie.hpp | 444 +++++++++++++++++++++++------------------------ 2 files changed, 224 insertions(+), 228 deletions(-) diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 01ff370..9d78dd6 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -25,11 +25,9 @@ namespace CppJieba class DictTrie { - public: - typedef Trie, vector > TrieType; private: vector _nodeInfos; - TrieType * _trie; + Trie * _trie; double _minWeight; private: @@ -100,7 +98,7 @@ namespace CppJieba private: - TrieType * _createTrie(const vector& dictUnits) + Trie * _createTrie(const vector& dictUnits) { assert(dictUnits.size()); vector words; @@ -111,7 +109,7 @@ namespace CppJieba valuePointers.push_back(&dictUnits[i]); } - TrieType * trie = new TrieType(words, valuePointers); + Trie * trie = new Trie(words, valuePointers); return trie; } void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) diff --git a/src/Trie.hpp b/src/Trie.hpp index 0e71973..80a1f15 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -16,6 +16,7 @@ namespace CppJieba string tag; }; + // for debugging inline ostream & operator << (ostream& os, const DictUnit& unit) { string s; @@ -38,262 +39,259 @@ namespace CppJieba {} }; - template - class TrieNode - { - public: - typedef unordered_map* > NextMap; - public: - TrieNode * fail; - NextMap * next; - const ValueType * ptValue; - public: - const TrieNode * findNext(KeyType key) const + typedef Unicode::value_type TrieKey; + + class TrieNode + { + public: + typedef unordered_map NextMap; + public: + TrieNode * fail; + NextMap * next; + const DictUnit * ptValue; + public: + TrieNode(): fail(NULL), next(NULL), ptValue(NULL) + {} + const TrieNode * findNext(TrieKey key) const + { + if(next == NULL) { - if(next == NULL) + return NULL; + } + typename NextMap::const_iterator iter = next->find(key); + if(iter == next->end()) + { + return NULL; + } + return iter->second; + } + }; + + class Trie + { + private: + TrieNode* _root; + public: + Trie(const vector& keys, const vector & valuePointers) + { + _root = new TrieNode; + _createTrie(keys, valuePointers); + _build();// build automation + } + ~Trie() + { + if(_root) + { + _deleteNode(_root); + } + } + public: + const DictUnit* find(typename Unicode::const_iterator begin, typename Unicode::const_iterator end) const + { + typename TrieNode::NextMap::const_iterator citer; + const TrieNode* ptNode = _root; + for(typename Unicode::const_iterator it = begin; it != end; it++) + {// build automation + assert(ptNode); + if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it))) { return NULL; } - typename NextMap::const_iterator iter = next->find(key); - if(iter == next->end()) - { - return NULL; - } - return iter->second; + ptNode = citer->second; } - public: - TrieNode(): fail(NULL), next(NULL), ptValue(NULL) { - } - }; + return ptNode->ptValue; + } + // aho-corasick-automation + void find( + typename Unicode::const_iterator begin, + typename Unicode::const_iterator end, + vector& res + ) const + { + res.resize(end - begin); + const TrieNode * now = _root; + //typename TrieNode::NextMap::const_iterator iter; + const TrieNode* node; + for (size_t i = 0; i < end - begin; i++) + { + Unicode::value_type ch = *(begin + i); + res[i].uniCh = ch; + assert(res[i].dag.empty()); + res[i].dag.push_back(pair::size_type, const DictUnit* >(i, NULL)); + bool flag = false; - template , class KeysContainerType = vector, class ValueContainerType = vector > - class Trie - { - public: - typedef TrieNode TrieNodeType; - private: - TrieNodeType* _root; - public: - Trie(const KeysContainerType& keys, const ValueContainerType& valuePointers) - { - _root = new TrieNodeType; - _createTrie(keys, valuePointers); - _build();// build automation - } - ~Trie() - { - if(_root) + // rollback + while( now != _root ) { - _deleteNode(_root); - } - } - public: - const ValueType* find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end) const - { - typename TrieNodeType::NextMap::const_iterator citer; - const TrieNodeType* ptNode = _root; - for(typename KeyContainerType::const_iterator it = begin; it != end; it++) - {// build automation - assert(ptNode); - if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it))) - { - return NULL; - } - ptNode = citer->second; - } - return ptNode->ptValue; - } - // aho-corasick-automation - void find( - typename KeyContainerType::const_iterator begin, - typename KeyContainerType::const_iterator end, - vector& res - ) const - { - res.resize(end - begin); - const TrieNodeType * now = _root; - //typename TrieNodeType::NextMap::const_iterator iter; - const TrieNodeType* node; - for (size_t i = 0; i < end - begin; i++) - { - Unicode::value_type ch = *(begin + i); - res[i].uniCh = ch; - assert(res[i].dag.empty()); - res[i].dag.push_back(pair(i, NULL)); - bool flag = false; - - // rollback - while( now != _root ) - { - node = now->findNext(ch); - if (node != NULL) - { - flag = true; - break; - } - else - { - now = now->fail; - } - } - - if(!flag) - { - node = now->findNext(ch); - } - if(node == NULL) - { - now = _root; - } - else - { - now = node; - const TrieNodeType * temp = now; - while(temp != _root) - { - if (temp->ptValue) - { - size_t pos = i - temp->ptValue->word.size() + 1; - res[pos].dag.push_back(pair(i, temp->ptValue)); - if(pos == i) - { - res[pos].dag[0].second = temp->ptValue; - } - } - temp = temp->fail; - assert(temp); - } - } - } - } - bool find( - typename KeyContainerType::const_iterator begin, - typename KeyContainerType::const_iterator end, - DagType & res, - size_t offset = 0) const - { - const TrieNodeType * ptNode = _root; - typename TrieNodeType::NextMap::const_iterator citer; - for(typename KeyContainerType::const_iterator itr = begin; itr != end ; itr++) - { - assert(ptNode); - if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr))) + node = now->findNext(ch); + if (node != NULL) { + flag = true; break; } - ptNode = citer->second; - if(ptNode->ptValue) + else { - if(itr == begin && res.size() == 1) // first singleword - { - res[0].second = ptNode->ptValue; - } - else - { - res.push_back(pair(itr - begin + offset, ptNode->ptValue)); - } + now = now->fail; } } - return !res.empty(); - } - private: - void _build() - { - queue que; - assert(_root->ptValue == NULL); - assert(_root->next); - _root->fail = NULL; - for(typename TrieNodeType::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) { - iter->second->fail = _root; - que.push(iter->second); + + if(!flag) + { + node = now->findNext(ch); } - TrieNodeType* back = NULL; - typename TrieNodeType::NextMap::iterator backiter; - while(!que.empty()) { - TrieNodeType * now = que.front(); - que.pop(); - if(now->next == NULL) { - continue; - } - for(typename TrieNodeType::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) { - back = now->fail; - while(back != NULL) { - if(back->next && (backiter = back->next->find(iter->first)) != back->next->end()) + if(node == NULL) + { + now = _root; + } + else + { + now = node; + const TrieNode * temp = now; + while(temp != _root) + { + if (temp->ptValue) + { + size_t pos = i - temp->ptValue->word.size() + 1; + res[pos].dag.push_back(pair::size_type, const DictUnit* >(i, temp->ptValue)); + if(pos == i) { - iter->second->fail = backiter->second; - break; + res[pos].dag[0].second = temp->ptValue; } - back = back->fail; } - if(back == NULL) { - iter->second->fail = _root; - } - que.push(iter->second); + temp = temp->fail; + assert(temp); } } } - private: - void _createTrie(const KeysContainerType& keys, const ValueContainerType& valuePointers) + } + bool find( + typename Unicode::const_iterator begin, + typename Unicode::const_iterator end, + DagType & res, + size_t offset = 0) const + { + const TrieNode * ptNode = _root; + typename TrieNode::NextMap::const_iterator citer; + for(typename Unicode::const_iterator itr = begin; itr != end ; itr++) { - if(valuePointers.empty() || keys.empty()) + assert(ptNode); + if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr))) { - return; + break; } - assert(keys.size() == valuePointers.size()); - - for(size_t i = 0; i < keys.size(); i++) + ptNode = citer->second; + if(ptNode->ptValue) { - _insertNode(keys[i], valuePointers[i]); - } - } - private: - void _insertNode(const KeyContainerType& key, const ValueType* ptValue) - { - TrieNodeType* ptNode = _root; - - typename TrieNodeType::NextMap::const_iterator kmIter; - - for(typename KeyContainerType::const_iterator citer = key.begin(); citer != key.end(); citer++) - { - if(NULL == ptNode->next) + if(itr == begin && res.size() == 1) // first singleword { - ptNode->next = new typename TrieNodeType::NextMap; - } - kmIter = ptNode->next->find(*citer); - if(ptNode->next->end() == kmIter) - { - TrieNodeType * nextNode = new TrieNodeType; - nextNode->next = NULL; - nextNode->ptValue = NULL; - - (*ptNode->next)[*citer] = nextNode; - ptNode = nextNode; + res[0].second = ptNode->ptValue; } else { - ptNode = kmIter->second; + res.push_back(pair::size_type, const DictUnit* >(itr - begin + offset, ptNode->ptValue)); } } - ptNode->ptValue = ptValue; } - void _deleteNode(TrieNodeType* node) + return !res.empty(); + } + private: + void _build() + { + queue que; + assert(_root->ptValue == NULL); + assert(_root->next); + _root->fail = NULL; + for(typename TrieNode::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) { + iter->second->fail = _root; + que.push(iter->second); + } + TrieNode* back = NULL; + typename TrieNode::NextMap::iterator backiter; + while(!que.empty()) { + TrieNode * now = que.front(); + que.pop(); + if(now->next == NULL) { + continue; + } + for(typename TrieNode::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) { + back = now->fail; + while(back != NULL) { + if(back->next && (backiter = back->next->find(iter->first)) != back->next->end()) + { + iter->second->fail = backiter->second; + break; + } + back = back->fail; + } + if(back == NULL) { + iter->second->fail = _root; + } + que.push(iter->second); + } + } + } + private: + void _createTrie(const vector& keys, const vector & valuePointers) + { + if(valuePointers.empty() || keys.empty()) { - if(!node) - { - return; - } - if(node->next) - { - typename TrieNodeType::NextMap::iterator it; - for(it = node->next->begin(); it != node->next->end(); it++) - { - _deleteNode(it->second); - } - delete node->next; - } - delete node; + return; } - }; + assert(keys.size() == valuePointers.size()); + + for(size_t i = 0; i < keys.size(); i++) + { + _insertNode(keys[i], valuePointers[i]); + } + } + private: + void _insertNode(const Unicode& key, const DictUnit* ptValue) + { + TrieNode* ptNode = _root; + + typename TrieNode::NextMap::const_iterator kmIter; + + for(typename Unicode::const_iterator citer = key.begin(); citer != key.end(); citer++) + { + if(NULL == ptNode->next) + { + ptNode->next = new typename TrieNode::NextMap; + } + kmIter = ptNode->next->find(*citer); + if(ptNode->next->end() == kmIter) + { + TrieNode * nextNode = new TrieNode; + nextNode->next = NULL; + nextNode->ptValue = NULL; + + (*ptNode->next)[*citer] = nextNode; + ptNode = nextNode; + } + else + { + ptNode = kmIter->second; + } + } + ptNode->ptValue = ptValue; + } + void _deleteNode(TrieNode* node) + { + if(!node) + { + return; + } + if(node->next) + { + typename TrieNode::NextMap::iterator it; + for(it = node->next->begin(); it != node->next->end(); it++) + { + _deleteNode(it->second); + } + delete node->next; + } + delete node; + } + }; } #endif From 9d5359fc34c2284e0b51e8ddc2fc7176dd173b24 Mon Sep 17 00:00:00 2001 From: wyy Date: Thu, 13 Nov 2014 01:32:38 +0800 Subject: [PATCH 8/8] update changelog.md --- ChangeLog.md | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog.md b/ChangeLog.md index 5e5f78b..dcbd172 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -4,6 +4,7 @@ 1. 修改两条更细粒度的特殊过滤规则,将连续的数字(包括浮点数)和连续的字母单独切分出来(而不会混在一起)。 2. 修改最大概率法时动态规划过程需要使用的 DAG 数据结构(同时也修改 Trie 的 DAG 查询函数),提高分词速度 8% 。 +3. 使用了 `Aho-Corasick-Automation` 算法提速 Trie 查找的过程等优化,提升性能。 ## v2.4.3