mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
split Trie.hpp into (Trie.hpp & DictTrie.hpp)
This commit is contained in:
parent
e6fde86be5
commit
f70b654b66
145
src/DictTrie.hpp
Normal file
145
src/DictTrie.hpp
Normal file
@ -0,0 +1,145 @@
|
||||
#ifndef CPPJIEBA_DICT_TRIE_HPP
|
||||
#define CPPJIEBA_DICT_TRIE_HPP
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <cstring>
|
||||
#include <stdint.h>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include "Limonp/str_functs.hpp"
|
||||
#include "Limonp/logger.hpp"
|
||||
#include "Limonp/InitOnOff.hpp"
|
||||
#include "TransCode.hpp"
|
||||
#include "Trie.hpp"
|
||||
|
||||
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
using namespace Limonp;
|
||||
// Sentinel magnitudes used instead of numeric_limits extremes.
// MAX_DOUBLE seeds the "minimum so far" searches in DictTrie.
// NOTE(review): MIN_DOUBLE is not referenced in the visible part of this
// header; presumably it is the matching lower bound for callers.
const double MIN_DOUBLE = -3.14e+100;
const double MAX_DOUBLE = 3.14e+100;

// Number of space-separated fields expected on each dictionary line
// (word, frequency, tag).
const size_t DICT_COLUMN_NUM = 3;
|
||||
|
||||
|
||||
struct DictUnit
|
||||
{
|
||||
Unicode word;
|
||||
size_t freq;
|
||||
string tag;
|
||||
double logFreq; //logFreq = log(freq/sum(freq));
|
||||
};
|
||||
|
||||
// Ordered map from a size_t index to a pointer to a const DictUnit.
// NOTE(review): presumably keyed by a character offset inside the text being
// segmented - confirm against the callers that fill it.
typedef map<size_t, const DictUnit*> DagType;
|
||||
|
||||
class DictTrie: InitOnOff
|
||||
{
|
||||
|
||||
private:
|
||||
DictTrieNode* _root;
|
||||
vector<DictUnit> _nodeInfos;
|
||||
|
||||
int64_t _freqSum;
|
||||
double _minLogFreq;
|
||||
|
||||
public:
|
||||
DictTrie()
|
||||
{
|
||||
_root = new DictTrieNode;
|
||||
_root.ptKeyMap = NULL;
|
||||
_root.offset = 0;
|
||||
_freqSum = 0;
|
||||
_minLogFreq = MAX_DOUBLE;
|
||||
_setInitFlag(false);
|
||||
}
|
||||
DictTrie(const string& filePath)
|
||||
{
|
||||
new (this) DictTrie();
|
||||
_setInitFlag(init(filePath));
|
||||
}
|
||||
~DictTrie()
|
||||
{
|
||||
_deleteNode(_root);
|
||||
}
|
||||
private:
|
||||
|
||||
|
||||
public:
|
||||
bool init(const string& filePath)
|
||||
{
|
||||
assert(!_getInitFlag());
|
||||
_loadDict(filePath, _nodeInfos);
|
||||
_createDictTrie(_nodeInfos, _root);
|
||||
_freqSum = _calculateFreqSum(_nodeInfos);
|
||||
assert(_freqSum);
|
||||
_minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum);
|
||||
return _setInitFlag(true);
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
double getMinLogFreq() const {return _minLogFreq;};
|
||||
|
||||
private:
|
||||
void _loadDict(const string& filePath, vector<DictUnit>& nodeInfos) const
|
||||
{
|
||||
ifstream ifs(filePath.c_str());
|
||||
if(!ifs)
|
||||
{
|
||||
LogFatal("open %s failed.", filePath.c_str());
|
||||
exit(1);
|
||||
}
|
||||
string line;
|
||||
vector<string> buf;
|
||||
|
||||
nodeInfos.clear();
|
||||
DictUnit nodeInfo;
|
||||
for(size_t lineno = 0 ; getline(ifs, line); lineno++)
|
||||
{
|
||||
split(line, buf, " ");
|
||||
assert(buf.size() == DICT_COLUMN_NUM);
|
||||
|
||||
if(!TransCode::decode(buf[0], nodeInfo.word))
|
||||
{
|
||||
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
||||
continue;
|
||||
}
|
||||
nodeInfo.freq = atoi(buf[1].c_str());
|
||||
nodeInfo.tag = buf[2];
|
||||
|
||||
nodeInfos.push_back(nodeInfo);
|
||||
}
|
||||
}
|
||||
size_t _calculateFreqSum(const vector<DictUnit>& nodeInfos) const
|
||||
{
|
||||
size_t freqSum = 0;
|
||||
for(size_t i = 0; i < nodeInfos.size(); i++)
|
||||
{
|
||||
freqSum += nodeInfos[i].freq;
|
||||
}
|
||||
return freqSum;
|
||||
}
|
||||
double _calculateLogFreqAndGetMinValue(vector<DictUnit>& nodeInfos, size_t freqSum) const
|
||||
{
|
||||
assert(freqSum);
|
||||
double minLogFreq = MAX_DOUBLE;
|
||||
for(size_t i = 0; i < nodeInfos.size(); i++)
|
||||
{
|
||||
DictUnit& nodeInfo = nodeInfos[i];
|
||||
assert(nodeInfo.freq);
|
||||
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(freqSum));
|
||||
if(minLogFreq > nodeInfo.logFreq)
|
||||
{
|
||||
minLogFreq = nodeInfo.logFreq;
|
||||
}
|
||||
}
|
||||
return minLogFreq;
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
316
src/Trie.hpp
316
src/Trie.hpp
@ -1,241 +1,131 @@
|
||||
/************************************
|
||||
* file enc : ASCII
|
||||
* author : wuyanyi09@gmail.com
|
||||
************************************/
|
||||
#ifndef CPPJIEBA_TRIE_H
|
||||
#define CPPJIEBA_TRIE_H
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <cstring>
|
||||
#include <stdint.h>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include "Limonp/str_functs.hpp"
|
||||
#include "Limonp/logger.hpp"
|
||||
#include "Limonp/InitOnOff.hpp"
|
||||
#include "TransCode.hpp"
|
||||
|
||||
#ifndef CPPJIEBA_TRIE_HPP
|
||||
#define CPPJIEBA_TRIE_HPP
|
||||
|
||||
#include "Limonp/std_outbound.hpp"
|
||||
#include <vector>
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
using namespace Limonp;
|
||||
const double MIN_DOUBLE = -3.14e+100;
|
||||
const double MAX_DOUBLE = 3.14e+100;
|
||||
const size_t DICT_COLUMN_NUM = 3;
|
||||
typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
|
||||
struct TrieNodeInfo;
|
||||
struct TrieNode
|
||||
{
|
||||
TrieNodeMap hmap;
|
||||
const TrieNodeInfo * ptTrieNodeInfo;
|
||||
TrieNode(): ptTrieNodeInfo(NULL)
|
||||
{}
|
||||
};
|
||||
template <class KeyType, class ValueType>
|
||||
class TrieNode
|
||||
{
|
||||
public:
|
||||
typedef unordered_map<KeyType, TrieNode*> TrieNodeMapType;
|
||||
public:
|
||||
TrieNodeMap * ptKeyMap;
|
||||
const ValueType * ptValue;
|
||||
};
|
||||
|
||||
struct TrieNodeInfo
|
||||
{
|
||||
Unicode word;
|
||||
size_t freq;
|
||||
string tag;
|
||||
double logFreq; //logFreq = log(freq/sum(freq));
|
||||
};
|
||||
|
||||
inline ostream& operator << (ostream& os, const TrieNodeInfo & nodeInfo)
|
||||
{
|
||||
return os << nodeInfo.word << ":" << nodeInfo.freq << ":" << nodeInfo.tag << ":" << nodeInfo.logFreq ;
|
||||
}
|
||||
|
||||
typedef map<size_t, const TrieNodeInfo*> DagType;
|
||||
|
||||
class Trie: public InitOnOff
|
||||
{
|
||||
|
||||
private:
|
||||
TrieNode* _root;
|
||||
vector<TrieNodeInfo> _nodeInfos;
|
||||
|
||||
int64_t _freqSum;
|
||||
double _minLogFreq;
|
||||
|
||||
public:
|
||||
Trie()
|
||||
{
|
||||
_root = new TrieNode;
|
||||
_freqSum = 0;
|
||||
_minLogFreq = MAX_DOUBLE;
|
||||
_setInitFlag(false);
|
||||
}
|
||||
Trie(const string& filePath)
|
||||
{
|
||||
new (this) Trie();
|
||||
_setInitFlag(init(filePath));
|
||||
}
|
||||
~Trie()
|
||||
{
|
||||
_deleteNode(_root);
|
||||
}
|
||||
private:
|
||||
|
||||
|
||||
public:
|
||||
bool init(const string& filePath)
|
||||
{
|
||||
assert(!_getInitFlag());
|
||||
_loadDict(filePath, _nodeInfos);
|
||||
_createTrie(_nodeInfos, _root);
|
||||
_freqSum = _calculateFreqSum(_nodeInfos);
|
||||
assert(_freqSum);
|
||||
_minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum);
|
||||
return _setInitFlag(true);
|
||||
}
|
||||
|
||||
public:
|
||||
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const
|
||||
{
|
||||
TrieNodeMap::const_iterator citer;
|
||||
const TrieNode* p = _root;
|
||||
for(Unicode::const_iterator it = begin; it != end; it++)
|
||||
template <class KeyType, class ValueType>
|
||||
class Trie
|
||||
{
|
||||
private:
|
||||
TrieNode* _root;
|
||||
private:
|
||||
public:
|
||||
Trie(const vector<KeyType>& keys, const vector<ValueType* >& valuePointers)
|
||||
{
|
||||
citer = p->hmap.find(*it);
|
||||
if(p->hmap.end() == citer)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
p = citer->second;
|
||||
_root = new TrieNode;
|
||||
_root->ptKeyMap = NULL;
|
||||
_root->ptValue = NULL;
|
||||
|
||||
_createTrie(keys, valuePointers);
|
||||
}
|
||||
return p->ptTrieNodeInfo;
|
||||
}
|
||||
|
||||
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType & res, size_t offset = 0) const
|
||||
{
|
||||
const TrieNode* p = _root;
|
||||
TrieNodeMap::const_iterator citer;
|
||||
for (Unicode::const_iterator itr = begin; itr != end; itr++)
|
||||
~Trie()
|
||||
{
|
||||
citer = p->hmap.find(*itr);
|
||||
if(p->hmap.end() == citer)
|
||||
{
|
||||
break;
|
||||
}
|
||||
p = citer->second;
|
||||
if(p->ptTrieNodeInfo)
|
||||
{
|
||||
res[itr - begin + offset] = p->ptTrieNodeInfo;
|
||||
}
|
||||
}
|
||||
return !res.empty();
|
||||
}
|
||||
|
||||
public:
|
||||
double getMinLogFreq() const {return _minLogFreq;};
|
||||
|
||||
private:
|
||||
void _insertNode(const TrieNodeInfo& nodeInfo, TrieNode* ptNode) const
|
||||
{
|
||||
const Unicode& unico = nodeInfo.word;
|
||||
TrieNodeMap::const_iterator citer;
|
||||
for(size_t i = 0; i < unico.size(); i++)
|
||||
public:
|
||||
const ValueType* find(KeyType::const_iterator begin; KeyType::const_iterator end) const
|
||||
{
|
||||
uint16_t cu = unico[i];
|
||||
assert(ptNode);
|
||||
citer = ptNode->hmap.find(cu);
|
||||
if(ptNode->hmap.end() == citer)
|
||||
TrieNodeMapType::const_iterator citer;
|
||||
const TrieNode* ptNode = _root;
|
||||
for(KeyType::const_iterator it = begin; it != end; it++)
|
||||
{
|
||||
TrieNode * next = new TrieNode;
|
||||
ptNode->hmap[cu] = next;
|
||||
ptNode = next;
|
||||
citer = ptNode->ptKeyMap->find(*it);
|
||||
if(ptNode->ptKeyMap->end() == citer)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
ptNode= citer->second;
|
||||
}
|
||||
else
|
||||
return ptNode->ptValue;
|
||||
}
|
||||
bool find(KeyType::const_iterator begin, KeyType::const_iterator end, map<KeyType::size_type, const ValueType* >& ordererMap) const
|
||||
{
|
||||
const TrieNode * ptNode = _root;
|
||||
TrieNodeMapType::const_iterator citer;
|
||||
for(KeyType::const_iterator itr = begin; itr != end ; itr++)
|
||||
{
|
||||
citer = ptNode->ptKeyMap->find(*itr);
|
||||
if(ptNode->ptKeyMap->end() == citer)
|
||||
{
|
||||
break;
|
||||
}
|
||||
ptNode = citer->second;
|
||||
if(ptNode->ptValue)
|
||||
{
|
||||
ordererMap[itr - begin] = ptNode->ptValue;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
ptNode->ptTrieNodeInfo = &nodeInfo;
|
||||
}
|
||||
|
||||
private:
|
||||
void _loadDict(const string& filePath, vector<TrieNodeInfo>& nodeInfos) const
|
||||
{
|
||||
ifstream ifs(filePath.c_str());
|
||||
if(!ifs)
|
||||
private:
|
||||
void _createTrie(const vector<KeyType>& keys, const vector<ValueType*>& valuePointers)
|
||||
{
|
||||
LogFatal("open %s failed.", filePath.c_str());
|
||||
exit(1);
|
||||
}
|
||||
string line;
|
||||
vector<string> buf;
|
||||
|
||||
nodeInfos.clear();
|
||||
TrieNodeInfo nodeInfo;
|
||||
for(size_t lineno = 0 ; getline(ifs, line); lineno++)
|
||||
{
|
||||
split(line, buf, " ");
|
||||
assert(buf.size() == DICT_COLUMN_NUM);
|
||||
if(!TransCode::decode(buf[0], nodeInfo.word))
|
||||
if(values.empty() || keys.empty())
|
||||
{
|
||||
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
||||
continue;
|
||||
return;
|
||||
}
|
||||
nodeInfo.freq = atoi(buf[1].c_str());
|
||||
nodeInfo.tag = buf[2];
|
||||
assert(keys.size() == valuePointers.size());
|
||||
|
||||
nodeInfos.push_back(nodeInfo);
|
||||
}
|
||||
}
|
||||
bool _createTrie(const vector<TrieNodeInfo>& nodeInfos, TrieNode * ptNode)
|
||||
{
|
||||
for(size_t i = 0; i < _nodeInfos.size(); i++)
|
||||
{
|
||||
_insertNode(_nodeInfos[i], ptNode);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
size_t _calculateFreqSum(const vector<TrieNodeInfo>& nodeInfos) const
|
||||
{
|
||||
size_t freqSum = 0;
|
||||
for(size_t i = 0; i < nodeInfos.size(); i++)
|
||||
{
|
||||
freqSum += nodeInfos[i].freq;
|
||||
}
|
||||
return freqSum;
|
||||
}
|
||||
double _calculateLogFreqAndGetMinValue(vector<TrieNodeInfo>& nodeInfos, size_t freqSum) const
|
||||
{
|
||||
assert(freqSum);
|
||||
double minLogFreq = MAX_DOUBLE;
|
||||
for(size_t i = 0; i < nodeInfos.size(); i++)
|
||||
{
|
||||
TrieNodeInfo& nodeInfo = nodeInfos[i];
|
||||
assert(nodeInfo.freq);
|
||||
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(freqSum));
|
||||
if(minLogFreq > nodeInfo.logFreq)
|
||||
for(size_t i = 0; i < keys.size(); i++)
|
||||
{
|
||||
minLogFreq = nodeInfo.logFreq;
|
||||
_insertNode(keys[i], valuePointers[i]);
|
||||
}
|
||||
}
|
||||
return minLogFreq;
|
||||
}
|
||||
|
||||
void _deleteNode(TrieNode* node)
|
||||
{
|
||||
if(!node)
|
||||
private:
|
||||
void _insertNode(const KeyType& key, const Value* ptValue)
|
||||
{
|
||||
return;
|
||||
}
|
||||
for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++)
|
||||
{
|
||||
TrieNode* next = it->second;
|
||||
_deleteNode(next);
|
||||
}
|
||||
delete node;
|
||||
}
|
||||
TrieNode* ptNode = _root;
|
||||
|
||||
};
|
||||
TrieNode::KeyMapType::const_iterator kmIter;
|
||||
|
||||
for(KeyType::const_iterator citer = key.begin(); citer != key.end(); citer++)
|
||||
{
|
||||
if(NULL == ptNode->ptKeyMap)
|
||||
{
|
||||
ptNode->ptKeyMap = new TrieNode::KeyMapType;
|
||||
}
|
||||
kmIter = ptNode->ptKeyMap->find(*citer);
|
||||
if(ptNode->ptKeyMap->end() == kmIter)
|
||||
{
|
||||
TrieNode * nextNode = new TrieNode;
|
||||
nextNode->ptKeyMap = NULL;
|
||||
nextNode->ptValue = NULL;
|
||||
|
||||
ptNode->ptKeyMap[*citer] = nextNode;
|
||||
ptNode = next;
|
||||
}
|
||||
else
|
||||
{
|
||||
ptNode = kmIter->second;
|
||||
}
|
||||
}
|
||||
ptNode->ptValue = ptValue;
|
||||
}
|
||||
/**
 * Recursively frees node, its child map and every node reachable from it.
 * Passing NULL is allowed and does nothing.
 */
void _deleteNode(TrieNode* node)
{
    if(!node)
    {
        return;
    }
    // ptKeyMap stays NULL for nodes that never received a child, so it
    // must not be dereferenced unconditionally.
    if(node->ptKeyMap)
    {
        for(typename TrieNodeMapType::iterator it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); ++it)
        {
            _deleteNode(it->second);
        }
        delete node->ptKeyMap;
    }
    delete node;
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user