This commit is contained in:
xuangong 2015-07-21 00:11:13 +08:00
parent 931db7d1e5
commit cf9cc45c19

View File

@ -5,249 +5,204 @@
#include <vector> #include <vector>
#include <queue> #include <queue>
namespace CppJieba namespace CppJieba {
{ using namespace std;
using namespace std;
struct DictUnit struct DictUnit {
{ Unicode word;
Unicode word; double weight;
double weight; string tag;
string tag; };
};
// for debugging // for debugging
inline ostream & operator << (ostream& os, const DictUnit& unit) inline ostream & operator << (ostream& os, const DictUnit& unit) {
{ string s;
string s; s << unit.word;
s << unit.word; return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); }
typedef LocalVector<std::pair<size_t, const DictUnit*> > DagType;
// Per-character record. UglyTrie::find(begin, end, vector<SegmentChar>&)
// produces one of these for every character of the input range.
struct SegmentChar {
  // The character this record describes.
  uint16_t uniCh;
  // Matches that start at this character, as (index, dictionary entry) pairs.
  // find() always stores a first pair for the character itself (its entry may
  // be NULL), followed by one pair per longer dictionary match, where the
  // index is the position of the match's last character.
  DagType dag;
  // The three fields below are only initialised here; the trie code in this
  // file never writes them. Presumably they are filled in by the segmenters
  // that consume this record -- TODO confirm against the callers.
  const DictUnit* pInfo;
  double weight;
  size_t nextPos;

  SegmentChar()
    : uniCh(0),
      pInfo(NULL),
      weight(0.0),
      nextPos(0) {
  }

  ~SegmentChar() {
  }
};
typedef Unicode::value_type TrieKey;
// A single trie node: an optional table of children and an optional
// dictionary entry for the key that ends at this node.
class TrieNode {
 public:
  // Child table type, keyed by the next character of the key.
  typedef unordered_map<TrieKey, TrieNode*> NextMap;

  // A fresh node has no children and no dictionary entry.
  TrieNode()
    : next(NULL),
      ptValue(NULL) {
  }

  // Children of this node. Stays NULL until UglyTrie::_insertNode needs to
  // hang the first child off this node.
  NextMap* next;
  // Entry of the key ending exactly here; NULL when no key ends at this node.
  const DictUnit* ptValue;
};
class UglyTrie {
public:
static const size_t BASE_SIZE = (1 << (8 * (sizeof(TrieKey))));
public:
// Builds the trie from parallel arrays: keys[i] is inserted with
// valuePointers[i] as its entry (the work is delegated to _createTrie).
// Only the pointers are stored, and the destructor in this file frees trie
// nodes but never the DictUnit objects, so presumably the caller must keep
// them alive for the lifetime of the trie -- confirm against the owner.
UglyTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
_createTrie(keys, valuePointers);
}
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
if (begin == end) {
return NULL;
} }
typedef LocalVector<std::pair<size_t, const DictUnit*> > DagType; const TrieNode* ptNode = _base + (*(begin++));
TrieNode::NextMap::const_iterator citer;
for (Unicode::const_iterator it = begin; it != end; it++) {
if (NULL == ptNode->next) {
return NULL;
}
citer = ptNode->next->find(*it);
if (ptNode->next->end() == citer) {
return NULL;
}
ptNode = citer->second;
}
return ptNode->ptValue;
}
struct SegmentChar void find(
{ Unicode::const_iterator begin,
uint16_t uniCh; Unicode::const_iterator end,
DagType dag; vector<struct SegmentChar>& res
const DictUnit * pInfo; ) const {
double weight; res.resize(end - begin);
size_t nextPos;
SegmentChar() : uniCh(), pInfo(NULL), weight(0.0), nextPos(0){}
~SegmentChar() {}
};
typedef Unicode::value_type TrieKey; const TrieNode *ptNode = NULL;
TrieNode::NextMap::const_iterator citer;
for (size_t i = 0; i < size_t(end - begin); i++) {
Unicode::value_type ch = *(begin + i);
ptNode = _base + ch;
res[i].uniCh = ch;
assert(res[i].dag.empty());
class TrieNode res[i].dag.push_back(DagType::value_type(i, ptNode->ptValue));
{
public :
TrieNode(): next(NULL), ptValue(NULL) {}
public:
typedef unordered_map<TrieKey, TrieNode*> NextMap;
NextMap *next;
const DictUnit *ptValue;
};
class UglyTrie for (size_t j = i + 1; j < size_t(end - begin); j++) {
{ if (ptNode->next == NULL) {
public: break;
static const size_t BASE_SIZE = (1 << (8 * (sizeof(TrieKey)))); }
public: citer = ptNode->next->find(*(begin + j));
UglyTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) if (ptNode->next->end() == citer) {
{ break;
_createTrie(keys, valuePointers); }
} ptNode = citer->second;
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const if (NULL != ptNode->ptValue) {
{ res[i].dag.push_back(DagType::value_type(j, ptNode->ptValue));
if (begin == end) }
{ }
return NULL; }
} }
bool find(
Unicode::const_iterator begin,
Unicode::const_iterator end,
DagType & res,
size_t offset = 0) const {
if (begin == end) {
return !res.empty();
}
const TrieNode* ptNode = _base + (*(begin++)); const TrieNode* ptNode = _base + (*(begin++));
TrieNode::NextMap::const_iterator citer; if (ptNode->ptValue != NULL && res.size() == 1) {
for (Unicode::const_iterator it = begin; it != end; it++) res[0].second = ptNode->ptValue;
{ } else if (ptNode->ptValue != NULL) {
if (NULL == ptNode->next) res.push_back(DagType::value_type(offset, ptNode->ptValue));
{ }
return NULL;
}
citer = ptNode->next->find(*it);
if (ptNode->next->end() == citer)
{
return NULL;
}
ptNode = citer->second;
}
return ptNode->ptValue;
}
void find( TrieNode::NextMap::const_iterator citer;
Unicode::const_iterator begin, for (Unicode::const_iterator itr = begin; itr != end; itr++) {
Unicode::const_iterator end, if (NULL == ptNode->next) {
vector<struct SegmentChar>& res break;
) const }
{ citer = ptNode->next->find(*itr);
res.resize(end - begin); if (citer == ptNode->next->end()) {
break;
}
ptNode = citer->second;
if (NULL != ptNode->ptValue) {
res.push_back(DagType::value_type(itr - begin + offset, ptNode->ptValue));
}
}
return !res.empty();
}
// Releases everything hanging off the root table. The root nodes themselves
// live inside _base and are not deleted; only their child tables and the
// heap-allocated subtrees below them are.
~UglyTrie() {
  for (size_t slot = 0; slot != BASE_SIZE; ++slot) {
    TrieNode& root = _base[slot];
    if (NULL != root.next) {
      TrieNode::NextMap& children = *root.next;
      for (TrieNode::NextMap::iterator child = children.begin(); child != children.end(); ++child) {
        // _deleteNode frees the child and, recursively, its whole subtree.
        _deleteNode(child->second);
        child->second = NULL;
      }
      delete root.next;
      root.next = NULL;
    }
  }
}
const TrieNode *ptNode = NULL; private:
TrieNode::NextMap::const_iterator citer; void _insertNode(const Unicode& key, const DictUnit* ptValue) {
for (size_t i = 0; i < size_t(end - begin); i++) if (key.begin() == key.end()) {
{ return;
Unicode::value_type ch = *(begin + i); }
ptNode = _base + ch;
res[i].uniCh = ch;
assert(res[i].dag.empty());
res[i].dag.push_back(DagType::value_type(i, ptNode->ptValue)); TrieNode::NextMap::const_iterator kmIter;
Unicode::const_iterator citer= key.begin();
for (size_t j = i + 1; j < size_t(end - begin); j++) TrieNode *ptNode = _base + (*(citer++));
{ for (; citer != key.end(); citer++) {
if (ptNode->next == NULL) if (NULL == ptNode->next) {
{ ptNode->next = new TrieNode::NextMap;
break; }
} kmIter = ptNode->next->find(*citer);
citer = ptNode->next->find(*(begin + j)); if (ptNode->next->end() == kmIter) {
if (ptNode->next->end() == citer) TrieNode *nextNode = new TrieNode;
{
break;
}
ptNode = citer->second;
if (NULL != ptNode->ptValue)
{
res[i].dag.push_back(DagType::value_type(j, ptNode->ptValue));
}
}
}
}
bool find(
Unicode::const_iterator begin,
Unicode::const_iterator end,
DagType & res,
size_t offset = 0) const
{
if (begin == end)
{
return !res.empty();
}
const TrieNode* ptNode = _base + (*(begin++)); (*(ptNode->next))[*citer] = nextNode;
if (ptNode->ptValue != NULL && res.size() == 1) ptNode = nextNode;
{ } else {
res[0].second = ptNode->ptValue; ptNode = kmIter->second;
} }
else if (ptNode->ptValue != NULL) }
{ ptNode->ptValue = ptValue;
res.push_back(DagType::value_type(offset, ptNode->ptValue)); }
}
TrieNode::NextMap::const_iterator citer; void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
for (Unicode::const_iterator itr = begin; itr != end; itr++) if (valuePointers.empty() || keys.empty()) {
{ return;
if (NULL == ptNode->next) }
{ assert(keys.size() == valuePointers.size());
break;
}
citer = ptNode->next->find(*itr);
if (citer == ptNode->next->end())
{
break;
}
ptNode = citer->second;
if (NULL != ptNode->ptValue)
{
res.push_back(DagType::value_type(itr - begin + offset, ptNode->ptValue));
}
}
return !res.empty();
}
// Frees every subtree reachable from the root table. Root nodes are stored
// in the _base array itself, so only their child maps and the nodes below
// them are heap-allocated and need deleting.
~UglyTrie()
{
    for (TrieNode* root = _base; root != _base + BASE_SIZE; ++root)
    {
        if (NULL != root->next)
        {
            TrieNode::NextMap::iterator child = root->next->begin();
            while (child != root->next->end())
            {
                // Recursively destroys the child and everything beneath it.
                _deleteNode(child->second);
                child->second = NULL;
                ++child;
            }
            delete root->next;
            root->next = NULL;
        }
    }
}
private: for (size_t i = 0; i < keys.size(); i++) {
void _insertNode(const Unicode& key, const DictUnit* ptValue) _insertNode(keys[i], valuePointers[i]);
{ }
if (key.begin() == key.end()) }
{
return;
}
TrieNode::NextMap::const_iterator kmIter; void _deleteNode(TrieNode* node) {
Unicode::const_iterator citer= key.begin(); if (NULL == node) {
TrieNode *ptNode = _base + (*(citer++)); return;
for (; citer != key.end(); citer++) }
{ if (NULL != node->next) {
if (NULL == ptNode->next) TrieNode::NextMap::iterator it;
{ for (it = node->next->begin(); it != node->next->end(); it++) {
ptNode->next = new TrieNode::NextMap; _deleteNode(it->second);
} }
kmIter = ptNode->next->find(*citer); delete node->next;
if (ptNode->next->end() == kmIter) node->next = NULL;
{ }
TrieNode *nextNode = new TrieNode; delete node;
}
(*(ptNode->next))[*citer] = nextNode; TrieNode _base[BASE_SIZE];
ptNode = nextNode; };
}
else
{
ptNode = kmIter->second;
}
}
ptNode->ptValue = ptValue;
}
// Inserts keys[i] with valuePointers[i] for every index i.
// Does nothing when either vector is empty.
// NOTE(review): the size check runs only after the emptiness early-return,
// so a mismatch where exactly one vector is empty is silently accepted.
void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers)
{
    const size_t total = keys.size();
    if (0 == total || 0 == valuePointers.size())
    {
        return;
    }
    assert(total == valuePointers.size());
    for (size_t idx = 0; idx != total; ++idx)
    {
        _insertNode(keys[idx], valuePointers[idx]);
    }
}
// Recursively destroys `node` and the whole subtree below it: every child
// is deleted first, then the child map, then the node itself.
// A NULL `node` is accepted and ignored.
void _deleteNode(TrieNode* node)
{
    if (!node)
    {
        return;
    }
    TrieNode::NextMap* children = node->next;
    if (children)
    {
        for (TrieNode::NextMap::iterator child = children->begin(); child != children->end(); ++child)
        {
            _deleteNode(child->second);
        }
        delete children;
        node->next = NULL;
    }
    delete node;
}
TrieNode _base[BASE_SIZE];
};
} }
#endif #endif