split Trie.hpp into (Trie.hpp & DictTrie.hpp)

This commit is contained in:
wyy 2014-04-10 21:05:01 +08:00
parent e6fde86be5
commit f70b654b66
2 changed files with 248 additions and 213 deletions

145
src/DictTrie.hpp Normal file
View File

@ -0,0 +1,145 @@
#ifndef CPPJIEBA_DICT_TRIE_HPP
#define CPPJIEBA_DICT_TRIE_HPP
#include <iostream>
#include <fstream>
#include <map>
#include <cstring>
#include <stdint.h>
#include <cmath>
#include <limits>
#include "Limonp/str_functs.hpp"
#include "Limonp/logger.hpp"
#include "Limonp/InitOnOff.hpp"
#include "TransCode.hpp"
#include "Trie.hpp"
namespace CppJieba
{
using namespace Limonp;
// Sentinel magnitudes for log-frequency bookkeeping. MAX_DOUBLE is the starting
// value of the running minimum in DictTrie; MIN_DOUBLE is not referenced in the
// visible part of this file.
const double MIN_DOUBLE = -3.14e+100;
const double MAX_DOUBLE = 3.14e+100;
// Number of space-separated columns required on every dictionary line
// (word, frequency, tag).
const size_t DICT_COLUMN_NUM = 3;
// One dictionary entry: the decoded word plus its frequency data and tag.
struct DictUnit
{
Unicode word; // first dictionary column, decoded via TransCode::decode
size_t freq; // second dictionary column, parsed with atoi
string tag; // third dictionary column, stored verbatim
double logFreq; // log(freq / sum(freq)); computed only after the whole dictionary is loaded
};
// Ordered map from a size_t key to a dictionary entry.
// NOTE(review): the key is presumably a character offset within the scanned
// range, as in the Trie::find overload this type was used with -- confirm.
typedef map<size_t, const DictUnit*> DagType;
class DictTrie: InitOnOff
{
private:
DictTrieNode* _root;
vector<DictUnit> _nodeInfos;
int64_t _freqSum;
double _minLogFreq;
public:
DictTrie()
{
_root = new DictTrieNode;
_root.ptKeyMap = NULL;
_root.offset = 0;
_freqSum = 0;
_minLogFreq = MAX_DOUBLE;
_setInitFlag(false);
}
DictTrie(const string& filePath)
{
new (this) DictTrie();
_setInitFlag(init(filePath));
}
~DictTrie()
{
_deleteNode(_root);
}
private:
public:
bool init(const string& filePath)
{
assert(!_getInitFlag());
_loadDict(filePath, _nodeInfos);
_createDictTrie(_nodeInfos, _root);
_freqSum = _calculateFreqSum(_nodeInfos);
assert(_freqSum);
_minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum);
return _setInitFlag(true);
}
public:
double getMinLogFreq() const {return _minLogFreq;};
private:
void _loadDict(const string& filePath, vector<DictUnit>& nodeInfos) const
{
ifstream ifs(filePath.c_str());
if(!ifs)
{
LogFatal("open %s failed.", filePath.c_str());
exit(1);
}
string line;
vector<string> buf;
nodeInfos.clear();
DictUnit nodeInfo;
for(size_t lineno = 0 ; getline(ifs, line); lineno++)
{
split(line, buf, " ");
assert(buf.size() == DICT_COLUMN_NUM);
if(!TransCode::decode(buf[0], nodeInfo.word))
{
LogError("line[%u:%s] illegal.", lineno, line.c_str());
continue;
}
nodeInfo.freq = atoi(buf[1].c_str());
nodeInfo.tag = buf[2];
nodeInfos.push_back(nodeInfo);
}
}
size_t _calculateFreqSum(const vector<DictUnit>& nodeInfos) const
{
size_t freqSum = 0;
for(size_t i = 0; i < nodeInfos.size(); i++)
{
freqSum += nodeInfos[i].freq;
}
return freqSum;
}
double _calculateLogFreqAndGetMinValue(vector<DictUnit>& nodeInfos, size_t freqSum) const
{
assert(freqSum);
double minLogFreq = MAX_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++)
{
DictUnit& nodeInfo = nodeInfos[i];
assert(nodeInfo.freq);
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(freqSum));
if(minLogFreq > nodeInfo.logFreq)
{
minLogFreq = nodeInfo.logFreq;
}
}
return minLogFreq;
}
};
}
#endif

View File

@ -1,241 +1,131 @@
/************************************ #ifndef CPPJIEBA_TRIE_HPP
* file enc : ASCII #define CPPJIEBA_TRIE_HPP
* author : wuyanyi09@gmail.com
************************************/
#ifndef CPPJIEBA_TRIE_H
#define CPPJIEBA_TRIE_H
#include <iostream>
#include <fstream>
#include <map>
#include <cstring>
#include <stdint.h>
#include <cmath>
#include <limits>
#include "Limonp/str_functs.hpp"
#include "Limonp/logger.hpp"
#include "Limonp/InitOnOff.hpp"
#include "TransCode.hpp"
#include "Limonp/std_outbound.hpp"
#include <vector>
namespace CppJieba namespace CppJieba
{ {
using namespace Limonp; template <class KeyType, class ValueType>
const double MIN_DOUBLE = -3.14e+100; class TrieNode
const double MAX_DOUBLE = 3.14e+100; {
const size_t DICT_COLUMN_NUM = 3; public:
typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap; typedef unordered_map<KeyType, TrieNode*> TrieNodeMapType;
struct TrieNodeInfo; public:
struct TrieNode TrieNodeMap * ptKeyMap;
{ const ValueType * ptValue;
TrieNodeMap hmap; };
const TrieNodeInfo * ptTrieNodeInfo;
TrieNode(): ptTrieNodeInfo(NULL)
{}
};
struct TrieNodeInfo template <class KeyType, class ValueType>
{ class Trie
Unicode word; {
size_t freq; private:
string tag; TrieNode* _root;
double logFreq; //logFreq = log(freq/sum(freq)); private:
}; public:
Trie(const vector<KeyType>& keys, const vector<ValueType* >& valuePointers)
inline ostream& operator << (ostream& os, const TrieNodeInfo & nodeInfo)
{
return os << nodeInfo.word << ":" << nodeInfo.freq << ":" << nodeInfo.tag << ":" << nodeInfo.logFreq ;
}
typedef map<size_t, const TrieNodeInfo*> DagType;
class Trie: public InitOnOff
{
private:
TrieNode* _root;
vector<TrieNodeInfo> _nodeInfos;
int64_t _freqSum;
double _minLogFreq;
public:
Trie()
{
_root = new TrieNode;
_freqSum = 0;
_minLogFreq = MAX_DOUBLE;
_setInitFlag(false);
}
Trie(const string& filePath)
{
new (this) Trie();
_setInitFlag(init(filePath));
}
~Trie()
{
_deleteNode(_root);
}
private:
public:
bool init(const string& filePath)
{
assert(!_getInitFlag());
_loadDict(filePath, _nodeInfos);
_createTrie(_nodeInfos, _root);
_freqSum = _calculateFreqSum(_nodeInfos);
assert(_freqSum);
_minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum);
return _setInitFlag(true);
}
public:
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const
{
TrieNodeMap::const_iterator citer;
const TrieNode* p = _root;
for(Unicode::const_iterator it = begin; it != end; it++)
{ {
citer = p->hmap.find(*it); _root = new TrieNode;
if(p->hmap.end() == citer) _root->ptKeyMap = NULL;
{ _root->ptValue = NULL;
return NULL;
} _createTrie(keys, valuePointers);
p = citer->second;
} }
return p->ptTrieNodeInfo; ~Trie()
}
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType & res, size_t offset = 0) const
{
const TrieNode* p = _root;
TrieNodeMap::const_iterator citer;
for (Unicode::const_iterator itr = begin; itr != end; itr++)
{ {
citer = p->hmap.find(*itr);
if(p->hmap.end() == citer)
{
break;
}
p = citer->second;
if(p->ptTrieNodeInfo)
{
res[itr - begin + offset] = p->ptTrieNodeInfo;
}
} }
return !res.empty(); public:
} const ValueType* find(KeyType::const_iterator begin; KeyType::const_iterator end) const
public:
double getMinLogFreq() const {return _minLogFreq;};
private:
void _insertNode(const TrieNodeInfo& nodeInfo, TrieNode* ptNode) const
{
const Unicode& unico = nodeInfo.word;
TrieNodeMap::const_iterator citer;
for(size_t i = 0; i < unico.size(); i++)
{ {
uint16_t cu = unico[i]; TrieNodeMapType::const_iterator citer;
assert(ptNode); const TrieNode* ptNode = _root;
citer = ptNode->hmap.find(cu); for(KeyType::const_iterator it = begin; it != end; it++)
if(ptNode->hmap.end() == citer)
{ {
TrieNode * next = new TrieNode; citer = ptNode->ptKeyMap->find(*it);
ptNode->hmap[cu] = next; if(ptNode->ptKeyMap->end() == citer)
ptNode = next; {
return NULL;
}
ptNode= citer->second;
} }
else return ptNode->ptValue;
}
bool find(KeyType::const_iterator begin, KeyType::const_iterator end, map<KeyType::size_type, const ValueType* >& ordererMap) const
{
const TrieNode * ptNode = _root;
TrieNodeMapType::const_iterator citer;
for(KeyType::const_iterator itr = begin; itr != end ; itr++)
{ {
citer = ptNode->ptKeyMap->find(*itr);
if(ptNode->ptKeyMap->end() == citer)
{
break;
}
ptNode = citer->second; ptNode = citer->second;
if(ptNode->ptValue)
{
ordererMap[itr - begin] = ptNode->ptValue;
}
} }
} }
private:
ptNode->ptTrieNodeInfo = &nodeInfo; void _createTrie(const vector<KeyType>& keys, const vector<ValueType*>& valuePointers)
}
private:
void _loadDict(const string& filePath, vector<TrieNodeInfo>& nodeInfos) const
{
ifstream ifs(filePath.c_str());
if(!ifs)
{ {
LogFatal("open %s failed.", filePath.c_str()); if(values.empty() || keys.empty())
exit(1);
}
string line;
vector<string> buf;
nodeInfos.clear();
TrieNodeInfo nodeInfo;
for(size_t lineno = 0 ; getline(ifs, line); lineno++)
{
split(line, buf, " ");
assert(buf.size() == DICT_COLUMN_NUM);
if(!TransCode::decode(buf[0], nodeInfo.word))
{ {
LogError("line[%u:%s] illegal.", lineno, line.c_str()); return;
continue;
} }
nodeInfo.freq = atoi(buf[1].c_str()); assert(keys.size() == valuePointers.size());
nodeInfo.tag = buf[2];
nodeInfos.push_back(nodeInfo); for(size_t i = 0; i < keys.size(); i++)
}
}
bool _createTrie(const vector<TrieNodeInfo>& nodeInfos, TrieNode * ptNode)
{
for(size_t i = 0; i < _nodeInfos.size(); i++)
{
_insertNode(_nodeInfos[i], ptNode);
}
return true;
}
size_t _calculateFreqSum(const vector<TrieNodeInfo>& nodeInfos) const
{
size_t freqSum = 0;
for(size_t i = 0; i < nodeInfos.size(); i++)
{
freqSum += nodeInfos[i].freq;
}
return freqSum;
}
double _calculateLogFreqAndGetMinValue(vector<TrieNodeInfo>& nodeInfos, size_t freqSum) const
{
assert(freqSum);
double minLogFreq = MAX_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++)
{
TrieNodeInfo& nodeInfo = nodeInfos[i];
assert(nodeInfo.freq);
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(freqSum));
if(minLogFreq > nodeInfo.logFreq)
{ {
minLogFreq = nodeInfo.logFreq; _insertNode(keys[i], valuePointers[i]);
} }
} }
return minLogFreq; private:
} void _insertNode(const KeyType& key, const Value* ptValue)
void _deleteNode(TrieNode* node)
{
if(!node)
{ {
return; TrieNode* ptNode = _root;
}
for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++)
{
TrieNode* next = it->second;
_deleteNode(next);
}
delete node;
}
}; TrieNode::KeyMapType::const_iterator kmIter;
for(KeyType::const_iterator citer = key.begin(); citer != key.end(); citer++)
{
if(NULL == ptNode->ptKeyMap)
{
ptNode->ptKeyMap = new TrieNode::KeyMapType;
}
kmIter = ptNode->ptKeyMap->find(*citer);
if(ptNode->ptKeyMap->end() == kmIter)
{
TrieNode * nextNode = new TrieNode;
nextNode->ptKeyMap = NULL;
nextNode->ptValue = NULL;
ptNode->ptKeyMap[*citer] = nextNode;
ptNode = next;
}
else
{
ptNode = kmIter->second;
}
}
ptNode->ptValue = ptValue;
}
// Recursively releases `node`, every node reachable through its child map,
// and the child map itself. A NULL `node` is ignored.
// New nodes start with ptKeyMap == NULL and only receive a map once a child
// is inserted, so the map pointer is checked before it is iterated.
void _deleteNode(TrieNode* node)
{
    if(!node)
    {
        return;
    }
    if(node->ptKeyMap)
    {
        for(TrieNodeMapType::iterator it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++)
        {
            _deleteNode(it->second);
        }
        delete node->ptKeyMap;
    }
    delete node;
}
}
} }
#endif #endif