mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Use HashMap in Trie and remove the base array of the trie root node; see the Changelog for details
This commit is contained in:
parent
2d3c51dba7
commit
e6454fef77
@ -2,7 +2,8 @@
|
|||||||
|
|
||||||
## next version
|
## next version
|
||||||
|
|
||||||
+ 升级 [limonp] 并使用定制化的 limonp::HashMap 。
|
+ 开始在 Trie 中使用定制化的 cppjieba::HashMap ,并去除之前糟糕的针对 uint16 优化的用数组代替 map 的设计,
|
||||||
|
该设计的主要问题是其前提为 unicode 每个字符必须是 uint16 ,因此无法更全面地支持 unicode 多国字符。
|
||||||
|
|
||||||
## v4.4.1
|
## v4.4.1
|
||||||
|
|
||||||
|
@ -8,7 +8,7 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
namespace limonp {
|
namespace cppjieba {
|
||||||
|
|
||||||
static size_t PRIME_NUMBERS[] = {3, 7, 17, 37, 79, 163, 331,
|
static size_t PRIME_NUMBERS[] = {3, 7, 17, 37, 79, 163, 331,
|
||||||
673, 1361, 2729, 471, 10949,
|
673, 1361, 2729, 471, 10949,
|
||||||
@ -224,6 +224,6 @@ class HashMap {
|
|||||||
size_t size_;
|
size_t size_;
|
||||||
}; // class HashMap
|
}; // class HashMap
|
||||||
|
|
||||||
} // namespace limonp
|
} // namespace cppjieba
|
||||||
|
|
||||||
#endif // LIMONP_HASH_MAP_HPP
|
#endif // LIMONP_HASH_MAP_HPP
|
@ -3,8 +3,7 @@
|
|||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <queue>
|
#include <queue>
|
||||||
#include "limonp/StdExtension.hpp"
|
#include "HashMap.hpp"
|
||||||
#include "limonp/HashMap.hpp"
|
|
||||||
|
|
||||||
namespace cppjieba {
|
namespace cppjieba {
|
||||||
using namespace std;
|
using namespace std;
|
||||||
@ -41,28 +40,19 @@ class TrieNode {
|
|||||||
TrieNode(): next(NULL), ptValue(NULL) {
|
TrieNode(): next(NULL), ptValue(NULL) {
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
typedef limonp::HashMap<TrieKey, TrieNode*> NextMap;
|
typedef HashMap<TrieKey, TrieNode*> NextMap;
|
||||||
NextMap *next;
|
NextMap *next;
|
||||||
const DictUnit *ptValue;
|
const DictUnit *ptValue;
|
||||||
};
|
};
|
||||||
|
|
||||||
class Trie {
|
class Trie {
|
||||||
public:
|
public:
|
||||||
static const size_t BASE_SIZE = (1 << (8 * (sizeof(TrieKey))));
|
Trie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers)
|
||||||
Trie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
|
: root_(new TrieNode) {
|
||||||
CreateTrie(keys, valuePointers);
|
CreateTrie(keys, valuePointers);
|
||||||
}
|
}
|
||||||
~Trie() {
|
~Trie() {
|
||||||
for (size_t i = 0; i < BASE_SIZE; i++) {
|
DeleteNode(root_);
|
||||||
if (_base[i].next == NULL) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
for (TrieNode::NextMap::const_iterator it = _base[i].next->Begin(); it != _base[i].next->End(); ++it) {
|
|
||||||
DeleteNode(it->second);
|
|
||||||
}
|
|
||||||
delete _base[i].next;
|
|
||||||
_base[i].next = NULL;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const DictUnit* Find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
const DictUnit* Find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||||
@ -70,7 +60,7 @@ class Trie {
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
const TrieNode* ptNode = _base + (*(begin++));
|
const TrieNode* ptNode = root_;
|
||||||
TrieNode::NextMap::const_iterator citer;
|
TrieNode::NextMap::const_iterator citer;
|
||||||
for (Unicode::const_iterator it = begin; it != end; it++) {
|
for (Unicode::const_iterator it = begin; it != end; it++) {
|
||||||
if (NULL == ptNode->next) {
|
if (NULL == ptNode->next) {
|
||||||
@ -89,20 +79,28 @@ class Trie {
|
|||||||
Unicode::const_iterator end,
|
Unicode::const_iterator end,
|
||||||
vector<struct Dag>&res,
|
vector<struct Dag>&res,
|
||||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||||
|
assert(root_ != NULL);
|
||||||
res.resize(end - begin);
|
res.resize(end - begin);
|
||||||
|
|
||||||
const TrieNode *ptNode = NULL;
|
const TrieNode *ptNode = NULL;
|
||||||
TrieNode::NextMap::const_iterator citer;
|
TrieNode::NextMap::const_iterator citer;
|
||||||
for (size_t i = 0; i < size_t(end - begin); i++) {
|
for (size_t i = 0; i < size_t(end - begin); i++) {
|
||||||
Rune rune = *(begin + i);
|
Rune rune = *(begin + i);
|
||||||
ptNode = _base + rune;
|
|
||||||
res[i].rune = rune;
|
res[i].rune = rune;
|
||||||
assert(res[i].nexts.empty());
|
|
||||||
|
|
||||||
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
if (root_->next != NULL && root_->next->End() != (citer = root_->next->Find(rune))) {
|
||||||
|
ptNode = citer->second;
|
||||||
|
} else {
|
||||||
|
ptNode = NULL;
|
||||||
|
}
|
||||||
|
if (ptNode != NULL) {
|
||||||
|
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
||||||
|
} else {
|
||||||
|
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, static_cast<const DictUnit*>(NULL)));
|
||||||
|
}
|
||||||
|
|
||||||
for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len ; j++) {
|
for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) {
|
||||||
if (ptNode->next == NULL) {
|
if (ptNode == NULL || ptNode->next == NULL) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
citer = ptNode->next->Find(*(begin + j));
|
citer = ptNode->next->Find(*(begin + j));
|
||||||
@ -123,9 +121,8 @@ class Trie {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TrieNode::NextMap::const_iterator kmIter;
|
TrieNode::NextMap::const_iterator kmIter;
|
||||||
Unicode::const_iterator citer= key.begin();
|
TrieNode *ptNode = root_;
|
||||||
TrieNode *ptNode = _base + (*(citer++));
|
for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
|
||||||
for (; citer != key.end(); citer++) {
|
|
||||||
if (NULL == ptNode->next) {
|
if (NULL == ptNode->next) {
|
||||||
ptNode->next = new TrieNode::NextMap;
|
ptNode->next = new TrieNode::NextMap;
|
||||||
}
|
}
|
||||||
@ -139,6 +136,7 @@ class Trie {
|
|||||||
ptNode = kmIter->second;
|
ptNode = kmIter->second;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
assert(ptNode != NULL);
|
||||||
ptNode->ptValue = ptValue;
|
ptNode->ptValue = ptValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -167,8 +165,8 @@ class Trie {
|
|||||||
delete node;
|
delete node;
|
||||||
}
|
}
|
||||||
|
|
||||||
TrieNode _base[BASE_SIZE];
|
TrieNode* root_;
|
||||||
};
|
}; // class Trie
|
||||||
}
|
} // namespace cppjieba
|
||||||
|
|
||||||
#endif
|
#endif // CPPJIEBA_TRIE_HPP
|
||||||
|
@ -3,7 +3,7 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
|
|||||||
|
|
||||||
INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/deps/gtest/include)
|
INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/deps/gtest/include)
|
||||||
|
|
||||||
ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN)
|
ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARNING)
|
||||||
|
|
||||||
ADD_EXECUTABLE(test.run
|
ADD_EXECUTABLE(test.run
|
||||||
gtest_main.cpp
|
gtest_main.cpp
|
||||||
|
Loading…
x
Reference in New Issue
Block a user