diff --git a/ChangeLog.md b/ChangeLog.md index b7e16b7..bdd44d3 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -2,7 +2,8 @@ ## next version -+ 升级 [limonp] 并使用定制化的 limonp::HashMap 。 ++ 开始在 Trie 中使用定制化的 cppjieba::HashMap ,并去除之前糟糕的针对 uint16 优化的用数组代替 map 的设计, +该设计的主要问题是它以 unicode 每个字符必须是 uint16 为前提,因而无法更全面地支持 unicode 多国字符。 ## v4.4.1 diff --git a/deps/limonp/HashMap.hpp b/include/cppjieba/HashMap.hpp similarity index 99% rename from deps/limonp/HashMap.hpp rename to include/cppjieba/HashMap.hpp index 187903d..9aaced8 100644 --- a/deps/limonp/HashMap.hpp +++ b/include/cppjieba/HashMap.hpp @@ -8,7 +8,7 @@ #include #include -namespace limonp { +namespace cppjieba { static size_t PRIME_NUMBERS[] = {3, 7, 17, 37, 79, 163, 331, 673, 1361, 2729, 471, 10949, @@ -224,6 +224,6 @@ class HashMap { size_t size_; }; // class HashMap -} // namespace limonp +} // namespace cppjieba #endif // LIMONP_HASH_MAP_HPP diff --git a/include/cppjieba/Trie.hpp b/include/cppjieba/Trie.hpp index c2c0fea..7e1c49c 100644 --- a/include/cppjieba/Trie.hpp +++ b/include/cppjieba/Trie.hpp @@ -3,8 +3,7 @@ #include #include -#include "limonp/StdExtension.hpp" -#include "limonp/HashMap.hpp" +#include "HashMap.hpp" namespace cppjieba { using namespace std; @@ -41,28 +40,19 @@ class TrieNode { TrieNode(): next(NULL), ptValue(NULL) { } public: - typedef limonp::HashMap NextMap; + typedef HashMap NextMap; NextMap *next; const DictUnit *ptValue; }; class Trie { public: - static const size_t BASE_SIZE = (1 << (8 * (sizeof(TrieKey)))); - Trie(const vector& keys, const vector& valuePointers) { + Trie(const vector& keys, const vector& valuePointers) + : root_(new TrieNode) { CreateTrie(keys, valuePointers); } ~Trie() { - for (size_t i = 0; i < BASE_SIZE; i++) { - if (_base[i].next == NULL) { - continue; - } - for (TrieNode::NextMap::const_iterator it = _base[i].next->Begin(); it != _base[i].next->End(); ++it) { - DeleteNode(it->second); - } - delete _base[i].next; - _base[i].next = NULL; - } + DeleteNode(root_); } const
DictUnit* Find(Unicode::const_iterator begin, Unicode::const_iterator end) const { @@ -70,7 +60,7 @@ class Trie { return NULL; } - const TrieNode* ptNode = _base + (*(begin++)); + const TrieNode* ptNode = root_; TrieNode::NextMap::const_iterator citer; for (Unicode::const_iterator it = begin; it != end; it++) { if (NULL == ptNode->next) { @@ -89,20 +79,28 @@ class Trie { Unicode::const_iterator end, vector&res, size_t max_word_len = MAX_WORD_LENGTH) const { + assert(root_ != NULL); res.resize(end - begin); const TrieNode *ptNode = NULL; TrieNode::NextMap::const_iterator citer; for (size_t i = 0; i < size_t(end - begin); i++) { Rune rune = *(begin + i); - ptNode = _base + rune; res[i].rune = rune; - assert(res[i].nexts.empty()); - res[i].nexts.push_back(pair(i, ptNode->ptValue)); + if (root_->next != NULL && root_->next->End() != (citer = root_->next->Find(rune))) { + ptNode = citer->second; + } else { + ptNode = NULL; + } + if (ptNode != NULL) { + res[i].nexts.push_back(pair(i, ptNode->ptValue)); + } else { + res[i].nexts.push_back(pair(i, static_cast(NULL))); + } - for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len ; j++) { - if (ptNode->next == NULL) { + for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) { + if (ptNode == NULL || ptNode->next == NULL) { break; } citer = ptNode->next->Find(*(begin + j)); @@ -123,9 +121,8 @@ class Trie { } TrieNode::NextMap::const_iterator kmIter; - Unicode::const_iterator citer= key.begin(); - TrieNode *ptNode = _base + (*(citer++)); - for (; citer != key.end(); citer++) { + TrieNode *ptNode = root_; + for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) { if (NULL == ptNode->next) { ptNode->next = new TrieNode::NextMap; } @@ -139,6 +136,7 @@ class Trie { ptNode = kmIter->second; } } + assert(ptNode != NULL); ptNode->ptValue = ptValue; } @@ -167,8 +165,8 @@ class Trie { delete node; } - TrieNode _base[BASE_SIZE]; -}; -} + TrieNode* root_; +}; // 
class Trie +} // namespace cppjieba -#endif +#endif // CPPJIEBA_TRIE_HPP diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index b81f53a..2655215 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -3,7 +3,7 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/deps/gtest/include) -ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN) +ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARNING) ADD_EXECUTABLE(test.run gtest_main.cpp