Prettify Trie.hpp (work in progress)

This commit is contained in:
wyy 2014-03-16 20:42:20 +08:00
parent fe7e3ff807
commit 762495f5f4

View File

@ -30,11 +30,11 @@ namespace CppJieba
{ {
TrieNodeMap hmap; TrieNodeMap hmap;
bool isLeaf; bool isLeaf;
size_t nodeInfoVecPos; size_t nodeInfoPos;
TrieNode() TrieNode()
{ {
isLeaf = false; isLeaf = false;
nodeInfoVecPos = 0; nodeInfoPos = 0;
} }
}; };
@ -64,7 +64,7 @@ namespace CppJieba
private: private:
TrieNode* _root; TrieNode* _root;
vector<TrieNodeInfo> _nodeInfoVec; vector<TrieNodeInfo> _nodeInfos;
int64_t _freqSum; int64_t _freqSum;
double _minLogFreq; double _minLogFreq;
@ -97,11 +97,7 @@ namespace CppJieba
LogError("_trieInsert failed."); LogError("_trieInsert failed.");
return false; return false;
} }
if(!_countWeight()) _countWeight();
{
LogError("_countWeight failed.");
return false;
}
return _setInitFlag(true); return _setInitFlag(true);
} }
@ -123,16 +119,7 @@ namespace CppJieba
} }
if(p->isLeaf) if(p->isLeaf)
{ {
size_t pos = p->nodeInfoVecPos; return &(_nodeInfos[p->nodeInfoPos]);
if(pos < _nodeInfoVec.size())
{
return &(_nodeInfoVec[pos]);
}
else
{
LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
return NULL;
}
} }
return NULL; return NULL;
} }
@ -149,14 +136,14 @@ namespace CppJieba
p = p->hmap[*itr]; p = p->hmap[*itr];
if(p->isLeaf) if(p->isLeaf)
{ {
size_t pos = p->nodeInfoVecPos; size_t pos = p->nodeInfoPos;
if(pos < _nodeInfoVec.size()) if(pos < _nodeInfos.size())
{ {
res.push_back(make_pair(itr-begin, &_nodeInfoVec[pos])); res.push_back(make_pair(itr-begin, &_nodeInfos[pos]));
} }
else else
{ {
LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); LogFatal("node's nodeInfoPos is out of _nodeInfos's range");
return false; return false;
} }
} }
@ -176,14 +163,14 @@ namespace CppJieba
p = p->hmap[*itr]; p = p->hmap[*itr];
if(p->isLeaf) if(p->isLeaf)
{ {
size_t pos = p->nodeInfoVecPos; size_t pos = p->nodeInfoPos;
if(pos < _nodeInfoVec.size()) if(pos < _nodeInfos.size())
{ {
res[itr-begin + offset] = &_nodeInfoVec[pos]; res[itr-begin + offset] = &_nodeInfos[pos];
} }
else else
{ {
LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); LogFatal("node's nodeInfoPos is out of _nodeInfos's range");
return false; return false;
} }
} }
@ -195,16 +182,15 @@ namespace CppJieba
double getMinLogFreq() const {return _minLogFreq;}; double getMinLogFreq() const {return _minLogFreq;};
private: private:
void _insert(const TrieNodeInfo& nodeInfo) void _insert(const TrieNodeInfo& nodeInfo, size_t nodeInfoPos)
{ {
const Unicode& uintVec = nodeInfo.word; const Unicode& uintVec = nodeInfo.word;
TrieNode* p = _root; TrieNode* p = _root;
for(size_t i = 0; i < uintVec.size(); i++) for(size_t i = 0; i < uintVec.size(); i++)
{ {
uint16_t cu = uintVec[i]; uint16_t cu = uintVec[i];
assert(p); assert(p);
if(p->hmap.end() == p->hmap.find(cu)) if(!isIn(p->hmap, cu))
{ {
TrieNode * next = new TrieNode; TrieNode * next = new TrieNode;
assert(next); assert(next);
@ -216,13 +202,9 @@ namespace CppJieba
p = p->hmap[cu]; p = p->hmap[cu];
} }
} }
assert(p);
assert(!p->isLeaf);
p->isLeaf = true; p->isLeaf = true;
_nodeInfoVec.push_back(nodeInfo); p->nodeInfoPos = nodeInfoPos;
p->nodeInfoVecPos = _nodeInfoVec.size() - 1;
} }
private: private:
@ -235,45 +217,45 @@ namespace CppJieba
return false; return false;
} }
string line; string line;
vector<string> vecBuf; vector<string> buf;
TrieNodeInfo nodeInfo; TrieNodeInfo nodeInfo;
for(size_t lineno = 0 ; getline(ifs, line); lineno++) for(size_t lineno = 0 ; getline(ifs, line); lineno++)
{ {
split(line, vecBuf, " "); split(line, buf, " ");
assert(vecBuf.size() == DICT_COLUMN_NUM); assert(buf.size() == DICT_COLUMN_NUM);
if(!TransCode::decode(vecBuf[0], nodeInfo.word)) if(!TransCode::decode(buf[0], nodeInfo.word))
{ {
LogError("line[%u:%s] illegal.", lineno, line.c_str()); LogError("line[%u:%s] illegal.", lineno, line.c_str());
return false; return false;
} }
nodeInfo.freq = atoi(vecBuf[1].c_str()); nodeInfo.freq = atoi(buf[1].c_str());
nodeInfo.tag = vecBuf[2]; nodeInfo.tag = buf[2];
_insert(nodeInfo); _nodeInfos.push_back(nodeInfo);
}
for(size_t i = 0; i < _nodeInfos.size(); i++)
{
_insert(_nodeInfos[i], i);
} }
return true; return true;
} }
bool _countWeight() void _countWeight()
{ {
if(_nodeInfoVec.empty() || 0 != _freqSum)
{
LogError("_nodeInfoVec is empty or _freqSum has been counted already.");
return false;
}
//freq total freq //freq total freq
for(size_t i = 0; i < _nodeInfoVec.size(); i++) _freqSum = 0;
for(size_t i = 0; i < _nodeInfos.size(); i++)
{ {
_freqSum += _nodeInfoVec[i].freq; _freqSum += _nodeInfos[i].freq;
} }
assert(_freqSum); assert(_freqSum);
//normalize //normalize
for(size_t i = 0; i < _nodeInfoVec.size(); i++) for(size_t i = 0; i < _nodeInfos.size(); i++)
{ {
TrieNodeInfo& nodeInfo = _nodeInfoVec[i]; TrieNodeInfo& nodeInfo = _nodeInfos[i];
assert(nodeInfo.freq); assert(nodeInfo.freq);
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum)); nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum));
if(_minLogFreq > nodeInfo.logFreq) if(_minLogFreq > nodeInfo.logFreq)
@ -282,7 +264,6 @@ namespace CppJieba
} }
} }
return true;
} }
void _deleteNode(TrieNode* node) void _deleteNode(TrieNode* node)