This commit is contained in:
xuangong 2015-07-21 00:11:13 +08:00
parent 931db7d1e5
commit cf9cc45c19

View File

@ -5,20 +5,17 @@
#include <vector> #include <vector>
#include <queue> #include <queue>
namespace CppJieba namespace CppJieba {
{
using namespace std; using namespace std;
struct DictUnit struct DictUnit {
{
Unicode word; Unicode word;
double weight; double weight;
string tag; string tag;
}; };
// for debugging // for debugging
inline ostream & operator << (ostream& os, const DictUnit& unit) inline ostream & operator << (ostream& os, const DictUnit& unit) {
{
string s; string s;
s << unit.word; s << unit.word;
return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
@ -26,8 +23,7 @@ namespace CppJieba
typedef LocalVector<std::pair<size_t, const DictUnit*> > DagType; typedef LocalVector<std::pair<size_t, const DictUnit*> > DagType;
struct SegmentChar struct SegmentChar {
{
uint16_t uniCh; uint16_t uniCh;
DagType dag; DagType dag;
const DictUnit * pInfo; const DictUnit * pInfo;
@ -39,8 +35,7 @@ namespace CppJieba
typedef Unicode::value_type TrieKey; typedef Unicode::value_type TrieKey;
class TrieNode class TrieNode {
{
public : public :
TrieNode(): next(NULL), ptValue(NULL) {} TrieNode(): next(NULL), ptValue(NULL) {}
public: public:
@ -49,33 +44,26 @@ namespace CppJieba
const DictUnit *ptValue; const DictUnit *ptValue;
}; };
class UglyTrie class UglyTrie {
{
public: public:
static const size_t BASE_SIZE = (1 << (8 * (sizeof(TrieKey)))); static const size_t BASE_SIZE = (1 << (8 * (sizeof(TrieKey))));
public: public:
UglyTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) UglyTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
{
_createTrie(keys, valuePointers); _createTrie(keys, valuePointers);
} }
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
{ if (begin == end) {
if (begin == end)
{
return NULL; return NULL;
} }
const TrieNode* ptNode = _base + (*(begin++)); const TrieNode* ptNode = _base + (*(begin++));
TrieNode::NextMap::const_iterator citer; TrieNode::NextMap::const_iterator citer;
for (Unicode::const_iterator it = begin; it != end; it++) for (Unicode::const_iterator it = begin; it != end; it++) {
{ if (NULL == ptNode->next) {
if (NULL == ptNode->next)
{
return NULL; return NULL;
} }
citer = ptNode->next->find(*it); citer = ptNode->next->find(*it);
if (ptNode->next->end() == citer) if (ptNode->next->end() == citer) {
{
return NULL; return NULL;
} }
ptNode = citer->second; ptNode = citer->second;
@ -87,14 +75,12 @@ namespace CppJieba
Unicode::const_iterator begin, Unicode::const_iterator begin,
Unicode::const_iterator end, Unicode::const_iterator end,
vector<struct SegmentChar>& res vector<struct SegmentChar>& res
) const ) const {
{
res.resize(end - begin); res.resize(end - begin);
const TrieNode *ptNode = NULL; const TrieNode *ptNode = NULL;
TrieNode::NextMap::const_iterator citer; TrieNode::NextMap::const_iterator citer;
for (size_t i = 0; i < size_t(end - begin); i++) for (size_t i = 0; i < size_t(end - begin); i++) {
{
Unicode::value_type ch = *(begin + i); Unicode::value_type ch = *(begin + i);
ptNode = _base + ch; ptNode = _base + ch;
res[i].uniCh = ch; res[i].uniCh = ch;
@ -102,20 +88,16 @@ namespace CppJieba
res[i].dag.push_back(DagType::value_type(i, ptNode->ptValue)); res[i].dag.push_back(DagType::value_type(i, ptNode->ptValue));
for (size_t j = i + 1; j < size_t(end - begin); j++) for (size_t j = i + 1; j < size_t(end - begin); j++) {
{ if (ptNode->next == NULL) {
if (ptNode->next == NULL)
{
break; break;
} }
citer = ptNode->next->find(*(begin + j)); citer = ptNode->next->find(*(begin + j));
if (ptNode->next->end() == citer) if (ptNode->next->end() == citer) {
{
break; break;
} }
ptNode = citer->second; ptNode = citer->second;
if (NULL != ptNode->ptValue) if (NULL != ptNode->ptValue) {
{
res[i].dag.push_back(DagType::value_type(j, ptNode->ptValue)); res[i].dag.push_back(DagType::value_type(j, ptNode->ptValue));
} }
} }
@ -125,53 +107,40 @@ namespace CppJieba
Unicode::const_iterator begin, Unicode::const_iterator begin,
Unicode::const_iterator end, Unicode::const_iterator end,
DagType & res, DagType & res,
size_t offset = 0) const size_t offset = 0) const {
{ if (begin == end) {
if (begin == end)
{
return !res.empty(); return !res.empty();
} }
const TrieNode* ptNode = _base + (*(begin++)); const TrieNode* ptNode = _base + (*(begin++));
if (ptNode->ptValue != NULL && res.size() == 1) if (ptNode->ptValue != NULL && res.size() == 1) {
{
res[0].second = ptNode->ptValue; res[0].second = ptNode->ptValue;
} } else if (ptNode->ptValue != NULL) {
else if (ptNode->ptValue != NULL)
{
res.push_back(DagType::value_type(offset, ptNode->ptValue)); res.push_back(DagType::value_type(offset, ptNode->ptValue));
} }
TrieNode::NextMap::const_iterator citer; TrieNode::NextMap::const_iterator citer;
for (Unicode::const_iterator itr = begin; itr != end; itr++) for (Unicode::const_iterator itr = begin; itr != end; itr++) {
{ if (NULL == ptNode->next) {
if (NULL == ptNode->next)
{
break; break;
} }
citer = ptNode->next->find(*itr); citer = ptNode->next->find(*itr);
if (citer == ptNode->next->end()) if (citer == ptNode->next->end()) {
{
break; break;
} }
ptNode = citer->second; ptNode = citer->second;
if (NULL != ptNode->ptValue) if (NULL != ptNode->ptValue) {
{
res.push_back(DagType::value_type(itr - begin + offset, ptNode->ptValue)); res.push_back(DagType::value_type(itr - begin + offset, ptNode->ptValue));
} }
} }
return !res.empty(); return !res.empty();
} }
~UglyTrie() ~UglyTrie() {
{ for (size_t i = 0; i < BASE_SIZE; i++) {
for (size_t i = 0; i < BASE_SIZE; i++) if (_base[i].next == NULL) {
{
if (_base[i].next == NULL)
{
continue; continue;
} }
for (TrieNode::NextMap::iterator it = _base[i].next->begin(); it != _base[i].next->end(); it++) for (TrieNode::NextMap::iterator it = _base[i].next->begin(); it != _base[i].next->end(); it++) {
{
_deleteNode(it->second); _deleteNode(it->second);
it->second = NULL; it->second = NULL;
} }
@ -181,63 +150,49 @@ namespace CppJieba
} }
private: private:
void _insertNode(const Unicode& key, const DictUnit* ptValue) void _insertNode(const Unicode& key, const DictUnit* ptValue) {
{ if (key.begin() == key.end()) {
if (key.begin() == key.end())
{
return; return;
} }
TrieNode::NextMap::const_iterator kmIter; TrieNode::NextMap::const_iterator kmIter;
Unicode::const_iterator citer= key.begin(); Unicode::const_iterator citer= key.begin();
TrieNode *ptNode = _base + (*(citer++)); TrieNode *ptNode = _base + (*(citer++));
for (; citer != key.end(); citer++) for (; citer != key.end(); citer++) {
{ if (NULL == ptNode->next) {
if (NULL == ptNode->next)
{
ptNode->next = new TrieNode::NextMap; ptNode->next = new TrieNode::NextMap;
} }
kmIter = ptNode->next->find(*citer); kmIter = ptNode->next->find(*citer);
if (ptNode->next->end() == kmIter) if (ptNode->next->end() == kmIter) {
{
TrieNode *nextNode = new TrieNode; TrieNode *nextNode = new TrieNode;
(*(ptNode->next))[*citer] = nextNode; (*(ptNode->next))[*citer] = nextNode;
ptNode = nextNode; ptNode = nextNode;
} } else {
else
{
ptNode = kmIter->second; ptNode = kmIter->second;
} }
} }
ptNode->ptValue = ptValue; ptNode->ptValue = ptValue;
} }
void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
{ if (valuePointers.empty() || keys.empty()) {
if (valuePointers.empty() || keys.empty())
{
return; return;
} }
assert(keys.size() == valuePointers.size()); assert(keys.size() == valuePointers.size());
for (size_t i = 0; i < keys.size(); i++) for (size_t i = 0; i < keys.size(); i++) {
{
_insertNode(keys[i], valuePointers[i]); _insertNode(keys[i], valuePointers[i]);
} }
} }
void _deleteNode(TrieNode* node) void _deleteNode(TrieNode* node) {
{ if (NULL == node) {
if (NULL == node)
{
return; return;
} }
if (NULL != node->next) if (NULL != node->next) {
{
TrieNode::NextMap::iterator it; TrieNode::NextMap::iterator it;
for (it = node->next->begin(); it != node->next->end(); it++) for (it = node->next->begin(); it != node->next->end(); it++) {
{
_deleteNode(it->second); _deleteNode(it->second);
} }
delete node->next; delete node->next;