mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
prettify Trie.hpp ing
This commit is contained in:
parent
582d61e3e8
commit
fe7e3ff807
@ -48,7 +48,7 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
|
|
||||||
//resut of searching in trie tree
|
//resut of searching in trie tree
|
||||||
vector<pair<uint, const TrieNodeInfo*> > tRes;
|
vector<pair<size_t, const TrieNodeInfo*> > tRes;
|
||||||
|
|
||||||
//max index of res's words
|
//max index of res's words
|
||||||
int maxIdx = 0;
|
int maxIdx = 0;
|
||||||
@ -63,7 +63,7 @@ namespace CppJieba
|
|||||||
//find word start from uItr
|
//find word start from uItr
|
||||||
if (_trie.find(uItr, end, tRes))
|
if (_trie.find(uItr, end, tRes))
|
||||||
{
|
{
|
||||||
for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
for (vector<pair<size_t, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||||
{
|
{
|
||||||
wordLen = itr->second->word.size();
|
wordLen = itr->second->word.size();
|
||||||
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx))
|
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx))
|
||||||
|
@ -76,7 +76,7 @@ namespace CppJieba
|
|||||||
LogError("not inited.");
|
LogError("not inited.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
vector<uint> status;
|
vector<size_t> status;
|
||||||
if(!_viterbi(begin, end, status))
|
if(!_viterbi(begin, end, status))
|
||||||
{
|
{
|
||||||
LogError("_viterbi failed.");
|
LogError("_viterbi failed.");
|
||||||
@ -85,7 +85,7 @@ namespace CppJieba
|
|||||||
|
|
||||||
Unicode::const_iterator left = begin;
|
Unicode::const_iterator left = begin;
|
||||||
Unicode::const_iterator right;
|
Unicode::const_iterator right;
|
||||||
for(uint i =0; i< status.size(); i++)
|
for(size_t i =0; i< status.size(); i++)
|
||||||
{
|
{
|
||||||
if(status[i] % 2) //if(E == status[i] || S == status[i])
|
if(status[i] % 2) //if(E == status[i] || S == status[i])
|
||||||
{
|
{
|
||||||
@ -110,7 +110,7 @@ namespace CppJieba
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
string tmp;
|
string tmp;
|
||||||
for(uint i = 0; i < words.size(); i++)
|
for(size_t i = 0; i < words.size(); i++)
|
||||||
{
|
{
|
||||||
if(TransCode::encode(words[i], tmp))
|
if(TransCode::encode(words[i], tmp))
|
||||||
{
|
{
|
||||||
@ -121,7 +121,7 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<uint>& status)const
|
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const
|
||||||
{
|
{
|
||||||
if(begin == end)
|
if(begin == end)
|
||||||
{
|
{
|
||||||
@ -133,7 +133,7 @@ namespace CppJieba
|
|||||||
size_t XYSize = X * Y;
|
size_t XYSize = X * Y;
|
||||||
int * path;
|
int * path;
|
||||||
double * weight;
|
double * weight;
|
||||||
uint now, old, stat;
|
size_t now, old, stat;
|
||||||
double tmp, endE, endS;
|
double tmp, endE, endS;
|
||||||
|
|
||||||
try
|
try
|
||||||
@ -153,21 +153,21 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
|
|
||||||
//start
|
//start
|
||||||
for(uint y = 0; y < Y; y++)
|
for(size_t y = 0; y < Y; y++)
|
||||||
{
|
{
|
||||||
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
|
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
|
||||||
path[0 + y * X] = -1;
|
path[0 + y * X] = -1;
|
||||||
}
|
}
|
||||||
//process
|
//process
|
||||||
//for(; begin != end; begin++)
|
//for(; begin != end; begin++)
|
||||||
for(uint x = 1; x < X; x++)
|
for(size_t x = 1; x < X; x++)
|
||||||
{
|
{
|
||||||
for(uint y = 0; y < Y; y++)
|
for(size_t y = 0; y < Y; y++)
|
||||||
{
|
{
|
||||||
now = x + y*X;
|
now = x + y*X;
|
||||||
weight[now] = MIN_DOUBLE;
|
weight[now] = MIN_DOUBLE;
|
||||||
path[now] = E; // warning
|
path[now] = E; // warning
|
||||||
for(uint preY = 0; preY < Y; preY++)
|
for(size_t preY = 0; preY < Y; preY++)
|
||||||
{
|
{
|
||||||
old = x - 1 + preY * X;
|
old = x - 1 + preY * X;
|
||||||
tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
|
tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
|
||||||
@ -221,14 +221,14 @@ namespace CppJieba
|
|||||||
LogError("start_p illegal");
|
LogError("start_p illegal");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for(uint j = 0; j< tmp.size(); j++)
|
for(size_t j = 0; j< tmp.size(); j++)
|
||||||
{
|
{
|
||||||
_startProb[j] = atof(tmp[j].c_str());
|
_startProb[j] = atof(tmp[j].c_str());
|
||||||
//cout<<_startProb[j]<<endl;
|
//cout<<_startProb[j]<<endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
//load _transProb
|
//load _transProb
|
||||||
for(uint i = 0; i < STATUS_SUM; i++)
|
for(size_t i = 0; i < STATUS_SUM; i++)
|
||||||
{
|
{
|
||||||
if(!_getLine(ifile, line))
|
if(!_getLine(ifile, line))
|
||||||
{
|
{
|
||||||
@ -240,7 +240,7 @@ namespace CppJieba
|
|||||||
LogError("trans_p illegal");
|
LogError("trans_p illegal");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for(uint j =0; j < STATUS_SUM; j++)
|
for(size_t j =0; j < STATUS_SUM; j++)
|
||||||
{
|
{
|
||||||
_transProb[i][j] = atof(tmp[j].c_str());
|
_transProb[i][j] = atof(tmp[j].c_str());
|
||||||
//cout<<_transProb[i][j]<<endl;
|
//cout<<_transProb[i][j]<<endl;
|
||||||
@ -301,7 +301,7 @@ namespace CppJieba
|
|||||||
vector<string> tmp, tmp2;
|
vector<string> tmp, tmp2;
|
||||||
uint16_t unico = 0;
|
uint16_t unico = 0;
|
||||||
split(line, tmp, ",");
|
split(line, tmp, ",");
|
||||||
for(uint i = 0; i < tmp.size(); i++)
|
for(size_t i = 0; i < tmp.size(); i++)
|
||||||
{
|
{
|
||||||
split(tmp[i], tmp2, ":");
|
split(tmp[i], tmp2, ":");
|
||||||
if(2 != tmp2.size())
|
if(2 != tmp2.size())
|
||||||
|
@ -37,7 +37,7 @@ namespace CppJieba
|
|||||||
};
|
};
|
||||||
public:
|
public:
|
||||||
|
|
||||||
bool extract(const string& str, vector<string>& keywords, uint topN) const
|
bool extract(const string& str, vector<string>& keywords, size_t topN) const
|
||||||
{
|
{
|
||||||
assert(_getInitFlag());
|
assert(_getInitFlag());
|
||||||
vector<pair<string, double> > topWords;
|
vector<pair<string, double> > topWords;
|
||||||
@ -45,14 +45,14 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for(uint i = 0; i < topWords.size(); i++)
|
for(size_t i = 0; i < topWords.size(); i++)
|
||||||
{
|
{
|
||||||
keywords.push_back(topWords[i].first);
|
keywords.push_back(topWords[i].first);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool extract(const string& str, vector<pair<string, double> >& keywords, uint topN) const
|
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const
|
||||||
{
|
{
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
if(!_segment.cut(str, words))
|
if(!_segment.cut(str, words))
|
||||||
@ -75,7 +75,7 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
|
|
||||||
map<string, double> wordmap;
|
map<string, double> wordmap;
|
||||||
for(uint i = 0; i < words.size(); i ++)
|
for(size_t i = 0; i < words.size(); i ++)
|
||||||
{
|
{
|
||||||
wordmap[ words[i] ] += 1.0;
|
wordmap[ words[i] ] += 1.0;
|
||||||
}
|
}
|
||||||
|
@ -66,7 +66,7 @@ namespace CppJieba
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
string tmp;
|
string tmp;
|
||||||
for(uint i = 0; i < segWordInfos.size(); i++)
|
for(size_t i = 0; i < segWordInfos.size(); i++)
|
||||||
{
|
{
|
||||||
if(TransCode::encode(segWordInfos[i].word, tmp))
|
if(TransCode::encode(segWordInfos[i].word, tmp))
|
||||||
{
|
{
|
||||||
@ -123,7 +123,7 @@ namespace CppJieba
|
|||||||
for(Unicode::const_iterator it = begin; it != end; it++)
|
for(Unicode::const_iterator it = begin; it != end; it++)
|
||||||
{
|
{
|
||||||
SegmentChar schar(*it);
|
SegmentChar schar(*it);
|
||||||
uint i = it - begin;
|
size_t i = it - begin;
|
||||||
_trie.find(it, end, i, schar.dag);
|
_trie.find(it, end, i, schar.dag);
|
||||||
//DagType::iterator dagIter;
|
//DagType::iterator dagIter;
|
||||||
if(schar.dag.end() == schar.dag.find(i))
|
if(schar.dag.end() == schar.dag.find(i))
|
||||||
@ -148,7 +148,7 @@ namespace CppJieba
|
|||||||
segContext[i].weight = MIN_DOUBLE;
|
segContext[i].weight = MIN_DOUBLE;
|
||||||
for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++)
|
for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++)
|
||||||
{
|
{
|
||||||
uint nextPos = it->first;
|
size_t nextPos = it->first;
|
||||||
const TrieNodeInfo* p = it->second;
|
const TrieNodeInfo* p = it->second;
|
||||||
double val = 0.0;
|
double val = 0.0;
|
||||||
if(nextPos + 1 < segContext.size())
|
if(nextPos + 1 < segContext.size())
|
||||||
@ -176,7 +176,7 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
bool _cut(SegmentContext& segContext, vector<TrieNodeInfo>& res)const
|
bool _cut(SegmentContext& segContext, vector<TrieNodeInfo>& res)const
|
||||||
{
|
{
|
||||||
uint i = 0;
|
size_t i = 0;
|
||||||
while(i < segContext.size())
|
while(i < segContext.size())
|
||||||
{
|
{
|
||||||
const TrieNodeInfo* p = segContext[i].pInfo;
|
const TrieNodeInfo* p = segContext[i].pInfo;
|
||||||
|
@ -59,7 +59,7 @@ namespace CppJieba
|
|||||||
|
|
||||||
vector<Unicode> hmmRes;
|
vector<Unicode> hmmRes;
|
||||||
Unicode piece;
|
Unicode piece;
|
||||||
for (uint i = 0, j = 0; i < infos.size(); i++)
|
for (size_t i = 0, j = 0; i < infos.size(); i++)
|
||||||
{
|
{
|
||||||
//if mp get a word, it's ok, put it into result
|
//if mp get a word, it's ok, put it into result
|
||||||
if (1 != infos[i].word.size())
|
if (1 != infos[i].word.size())
|
||||||
@ -84,7 +84,7 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
|
|
||||||
//put hmm result to return
|
//put hmm result to return
|
||||||
for (uint k = 0; k < hmmRes.size(); k++)
|
for (size_t k = 0; k < hmmRes.size(); k++)
|
||||||
{
|
{
|
||||||
res.push_back(hmmRes[k]);
|
res.push_back(hmmRes[k]);
|
||||||
}
|
}
|
||||||
|
@ -37,15 +37,15 @@ namespace CppJieba
|
|||||||
return cut(unico.begin(), unico.end(), res);
|
return cut(unico.begin(), unico.end(), res);
|
||||||
#else
|
#else
|
||||||
const char * const cstr = str.c_str();
|
const char * const cstr = str.c_str();
|
||||||
uint size = str.size();
|
size_t size = str.size();
|
||||||
uint offset = 0;
|
size_t offset = 0;
|
||||||
string subs;
|
string subs;
|
||||||
int ret;
|
int ret;
|
||||||
uint len;
|
size_t len;
|
||||||
while(offset < size)
|
while(offset < size)
|
||||||
{
|
{
|
||||||
const char * const nstr = cstr + offset;
|
const char * const nstr = cstr + offset;
|
||||||
uint nsize = size - offset;
|
size_t nsize = size - offset;
|
||||||
if(-1 == (ret = filterAscii(nstr, nsize, len)) || 0 == len || len > nsize)
|
if(-1 == (ret = filterAscii(nstr, nsize, len)) || 0 == len || len > nsize)
|
||||||
{
|
{
|
||||||
LogFatal("str[%s] illegal.", cstr);
|
LogFatal("str[%s] illegal.", cstr);
|
||||||
@ -78,7 +78,7 @@ namespace CppJieba
|
|||||||
* else count the nonascii string's length and return 1;
|
* else count the nonascii string's length and return 1;
|
||||||
* if errors, return -1;
|
* if errors, return -1;
|
||||||
* */
|
* */
|
||||||
static int filterAscii(const char* str, uint len, uint& resLen)
|
static int filterAscii(const char* str, size_t len, size_t& resLen)
|
||||||
{
|
{
|
||||||
if(!str || !len)
|
if(!str || !len)
|
||||||
{
|
{
|
||||||
|
145
src/Trie.hpp
145
src/Trie.hpp
@ -24,12 +24,13 @@ namespace CppJieba
|
|||||||
using namespace Limonp;
|
using namespace Limonp;
|
||||||
const double MIN_DOUBLE = -3.14e+100;
|
const double MIN_DOUBLE = -3.14e+100;
|
||||||
const double MAX_DOUBLE = 3.14e+100;
|
const double MAX_DOUBLE = 3.14e+100;
|
||||||
|
const size_t DICT_COLUMN_NUM = 3;
|
||||||
typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
|
typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
|
||||||
struct TrieNode
|
struct TrieNode
|
||||||
{
|
{
|
||||||
TrieNodeMap hmap;
|
TrieNodeMap hmap;
|
||||||
bool isLeaf;
|
bool isLeaf;
|
||||||
uint nodeInfoVecPos;
|
size_t nodeInfoVecPos;
|
||||||
TrieNode()
|
TrieNode()
|
||||||
{
|
{
|
||||||
isLeaf = false;
|
isLeaf = false;
|
||||||
@ -44,18 +45,11 @@ namespace CppJieba
|
|||||||
string tag;
|
string tag;
|
||||||
double logFreq; //logFreq = log(freq/sum(freq));
|
double logFreq; //logFreq = log(freq/sum(freq));
|
||||||
TrieNodeInfo():freq(0),logFreq(0.0)
|
TrieNodeInfo():freq(0),logFreq(0.0)
|
||||||
{
|
{}
|
||||||
}
|
|
||||||
TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
|
TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
|
||||||
{
|
{}
|
||||||
}
|
|
||||||
TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
|
TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
|
||||||
{
|
{}
|
||||||
}
|
|
||||||
bool operator == (const TrieNodeInfo & rhs) const
|
|
||||||
{
|
|
||||||
return word == rhs.word && freq == rhs.freq && tag == rhs.tag && abs(logFreq - rhs.logFreq) < 0.001;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
inline ostream& operator << (ostream& os, const TrieNodeInfo & nodeInfo)
|
inline ostream& operator << (ostream& os, const TrieNodeInfo & nodeInfo)
|
||||||
@ -63,7 +57,7 @@ namespace CppJieba
|
|||||||
return os << nodeInfo.word << ":" << nodeInfo.freq << ":" << nodeInfo.tag << ":" << nodeInfo.logFreq ;
|
return os << nodeInfo.word << ":" << nodeInfo.freq << ":" << nodeInfo.tag << ":" << nodeInfo.logFreq ;
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef map<uint, const TrieNodeInfo*> DagType;
|
typedef map<size_t, const TrieNodeInfo*> DagType;
|
||||||
|
|
||||||
class Trie: public InitOnOff
|
class Trie: public InitOnOff
|
||||||
{
|
{
|
||||||
@ -89,10 +83,6 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
~Trie()
|
~Trie()
|
||||||
{
|
{
|
||||||
if(!_getInitFlag())
|
|
||||||
{
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
_deleteNode(_root);
|
_deleteNode(_root);
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
@ -102,7 +92,7 @@ namespace CppJieba
|
|||||||
|
|
||||||
_root = new TrieNode;
|
_root = new TrieNode;
|
||||||
assert(_root);
|
assert(_root);
|
||||||
if(!_trieInsert(filePath.c_str()))
|
if(!_trieInsert(filePath))
|
||||||
{
|
{
|
||||||
LogError("_trieInsert failed.");
|
LogError("_trieInsert failed.");
|
||||||
return false;
|
return false;
|
||||||
@ -118,16 +108,6 @@ namespace CppJieba
|
|||||||
public:
|
public:
|
||||||
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const
|
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const
|
||||||
{
|
{
|
||||||
|
|
||||||
if(!_getInitFlag())
|
|
||||||
{
|
|
||||||
LogFatal("trie not initted!");
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
if(begin >= end)
|
|
||||||
{
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
TrieNode* p = _root;
|
TrieNode* p = _root;
|
||||||
for(Unicode::const_iterator it = begin; it != end; it++)
|
for(Unicode::const_iterator it = begin; it != end; it++)
|
||||||
{
|
{
|
||||||
@ -143,7 +123,7 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
if(p->isLeaf)
|
if(p->isLeaf)
|
||||||
{
|
{
|
||||||
uint pos = p->nodeInfoVecPos;
|
size_t pos = p->nodeInfoVecPos;
|
||||||
if(pos < _nodeInfoVec.size())
|
if(pos < _nodeInfoVec.size())
|
||||||
{
|
{
|
||||||
return &(_nodeInfoVec[pos]);
|
return &(_nodeInfoVec[pos]);
|
||||||
@ -157,18 +137,8 @@ namespace CppJieba
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, vector<pair<uint, const TrieNodeInfo*> >& res) const
|
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, vector<pair<size_t, const TrieNodeInfo*> >& res) const
|
||||||
{
|
{
|
||||||
if(!_getInitFlag())
|
|
||||||
{
|
|
||||||
LogFatal("trie not initted!");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (begin >= end)
|
|
||||||
{
|
|
||||||
LogFatal("begin >= end");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
TrieNode* p = _root;
|
TrieNode* p = _root;
|
||||||
for (Unicode::const_iterator itr = begin; itr != end; itr++)
|
for (Unicode::const_iterator itr = begin; itr != end; itr++)
|
||||||
{
|
{
|
||||||
@ -179,7 +149,7 @@ namespace CppJieba
|
|||||||
p = p->hmap[*itr];
|
p = p->hmap[*itr];
|
||||||
if(p->isLeaf)
|
if(p->isLeaf)
|
||||||
{
|
{
|
||||||
uint pos = p->nodeInfoVecPos;
|
size_t pos = p->nodeInfoVecPos;
|
||||||
if(pos < _nodeInfoVec.size())
|
if(pos < _nodeInfoVec.size())
|
||||||
{
|
{
|
||||||
res.push_back(make_pair(itr-begin, &_nodeInfoVec[pos]));
|
res.push_back(make_pair(itr-begin, &_nodeInfoVec[pos]));
|
||||||
@ -194,18 +164,8 @@ namespace CppJieba
|
|||||||
return !res.empty();
|
return !res.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, uint offset, DagType & res) const
|
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, size_t offset, DagType & res) const
|
||||||
{
|
{
|
||||||
if(!_getInitFlag())
|
|
||||||
{
|
|
||||||
LogFatal("trie not initted!");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (begin >= end)
|
|
||||||
{
|
|
||||||
LogFatal("begin >= end");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
TrieNode* p = _root;
|
TrieNode* p = _root;
|
||||||
for (Unicode::const_iterator itr = begin; itr != end; itr++)
|
for (Unicode::const_iterator itr = begin; itr != end; itr++)
|
||||||
{
|
{
|
||||||
@ -216,10 +176,9 @@ namespace CppJieba
|
|||||||
p = p->hmap[*itr];
|
p = p->hmap[*itr];
|
||||||
if(p->isLeaf)
|
if(p->isLeaf)
|
||||||
{
|
{
|
||||||
uint pos = p->nodeInfoVecPos;
|
size_t pos = p->nodeInfoVecPos;
|
||||||
if(pos < _nodeInfoVec.size())
|
if(pos < _nodeInfoVec.size())
|
||||||
{
|
{
|
||||||
//res.push_back(make_pair(itr-begin, &_nodeInfoVec[pos]));
|
|
||||||
res[itr-begin + offset] = &_nodeInfoVec[pos];
|
res[itr-begin + offset] = &_nodeInfoVec[pos];
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -233,32 +192,22 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
double getMinLogFreq()const{return _minLogFreq;};
|
double getMinLogFreq() const {return _minLogFreq;};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool _insert(const TrieNodeInfo& nodeInfo)
|
void _insert(const TrieNodeInfo& nodeInfo)
|
||||||
{
|
{
|
||||||
|
|
||||||
const Unicode& uintVec = nodeInfo.word;
|
const Unicode& uintVec = nodeInfo.word;
|
||||||
TrieNode* p = _root;
|
TrieNode* p = _root;
|
||||||
for(uint i = 0; i < uintVec.size(); i++)
|
for(size_t i = 0; i < uintVec.size(); i++)
|
||||||
{
|
{
|
||||||
uint16_t cu = uintVec[i];
|
uint16_t cu = uintVec[i];
|
||||||
if(NULL == p)
|
assert(p);
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if(p->hmap.end() == p->hmap.find(cu))
|
if(p->hmap.end() == p->hmap.find(cu))
|
||||||
{
|
{
|
||||||
TrieNode * next = NULL;
|
TrieNode * next = new TrieNode;
|
||||||
try
|
assert(next);
|
||||||
{
|
|
||||||
next = new TrieNode;
|
|
||||||
}
|
|
||||||
catch(const bad_alloc& e)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
p->hmap[cu] = next;
|
p->hmap[cu] = next;
|
||||||
p = next;
|
p = next;
|
||||||
}
|
}
|
||||||
@ -267,62 +216,41 @@ namespace CppJieba
|
|||||||
p = p->hmap[cu];
|
p = p->hmap[cu];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(NULL == p)
|
assert(p);
|
||||||
{
|
assert(!p->isLeaf);
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if(p->isLeaf)
|
|
||||||
{
|
|
||||||
LogError("this node already _inserted");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
p->isLeaf = true;
|
p->isLeaf = true;
|
||||||
_nodeInfoVec.push_back(nodeInfo);
|
_nodeInfoVec.push_back(nodeInfo);
|
||||||
p->nodeInfoVecPos = _nodeInfoVec.size() - 1;
|
p->nodeInfoVecPos = _nodeInfoVec.size() - 1;
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool _trieInsert(const char * const filePath)
|
bool _trieInsert(const string& filePath)
|
||||||
{
|
{
|
||||||
ifstream ifs(filePath);
|
ifstream ifs(filePath.c_str());
|
||||||
if(!ifs)
|
if(!ifs)
|
||||||
{
|
{
|
||||||
LogError("open %s failed.", filePath);
|
LogError("open %s failed.", filePath.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
string line;
|
string line;
|
||||||
vector<string> vecBuf;
|
vector<string> vecBuf;
|
||||||
|
|
||||||
TrieNodeInfo nodeInfo;
|
TrieNodeInfo nodeInfo;
|
||||||
size_t lineno = 0;
|
for(size_t lineno = 0 ; getline(ifs, line); lineno++)
|
||||||
while(getline(ifs, line))
|
|
||||||
{
|
{
|
||||||
vecBuf.clear();
|
|
||||||
lineno ++;
|
|
||||||
split(line, vecBuf, " ");
|
split(line, vecBuf, " ");
|
||||||
if(3 < vecBuf.size())
|
assert(vecBuf.size() == DICT_COLUMN_NUM);
|
||||||
{
|
|
||||||
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if(!TransCode::decode(vecBuf[0], nodeInfo.word))
|
if(!TransCode::decode(vecBuf[0], nodeInfo.word))
|
||||||
{
|
{
|
||||||
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
nodeInfo.freq = atoi(vecBuf[1].c_str());
|
nodeInfo.freq = atoi(vecBuf[1].c_str());
|
||||||
if(3 == vecBuf.size())
|
nodeInfo.tag = vecBuf[2];
|
||||||
{
|
|
||||||
nodeInfo.tag = vecBuf[2];
|
|
||||||
}
|
|
||||||
|
|
||||||
if(!_insert(nodeInfo))
|
_insert(nodeInfo);
|
||||||
{
|
|
||||||
assert(false);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -340,21 +268,13 @@ namespace CppJieba
|
|||||||
_freqSum += _nodeInfoVec[i].freq;
|
_freqSum += _nodeInfoVec[i].freq;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(0 == _freqSum)
|
assert(_freqSum);
|
||||||
{
|
|
||||||
LogError("_freqSum == 0 .");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
//normalize
|
//normalize
|
||||||
for(uint i = 0; i < _nodeInfoVec.size(); i++)
|
for(size_t i = 0; i < _nodeInfoVec.size(); i++)
|
||||||
{
|
{
|
||||||
TrieNodeInfo& nodeInfo = _nodeInfoVec[i];
|
TrieNodeInfo& nodeInfo = _nodeInfoVec[i];
|
||||||
if(0 == nodeInfo.freq)
|
assert(nodeInfo.freq);
|
||||||
{
|
|
||||||
LogFatal("nodeInfo.freq == 0!");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum));
|
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum));
|
||||||
if(_minLogFreq > nodeInfo.logFreq)
|
if(_minLogFreq > nodeInfo.logFreq)
|
||||||
{
|
{
|
||||||
@ -367,12 +287,15 @@ namespace CppJieba
|
|||||||
|
|
||||||
void _deleteNode(TrieNode* node)
|
void _deleteNode(TrieNode* node)
|
||||||
{
|
{
|
||||||
|
if(!node)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++)
|
for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++)
|
||||||
{
|
{
|
||||||
TrieNode* next = it->second;
|
TrieNode* next = it->second;
|
||||||
_deleteNode(next);
|
_deleteNode(next);
|
||||||
}
|
}
|
||||||
|
|
||||||
delete node;
|
delete node;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -12,11 +12,11 @@ TEST(SegmentBaseTest, Test1)
|
|||||||
buf.push_back("你好");
|
buf.push_back("你好");
|
||||||
buf.push_back("...hh");
|
buf.push_back("...hh");
|
||||||
vector<string> res;
|
vector<string> res;
|
||||||
uint size = strlen(str);
|
size_t size = strlen(str);
|
||||||
uint offset = 0;
|
size_t offset = 0;
|
||||||
while(offset < size)
|
while(offset < size)
|
||||||
{
|
{
|
||||||
uint len = 0;
|
size_t len = 0;
|
||||||
const char* t = str + offset;
|
const char* t = str + offset;
|
||||||
SegmentBase::filterAscii(t, size - offset, len);
|
SegmentBase::filterAscii(t, size - offset, len);
|
||||||
s.assign(t, len);
|
s.assign(t, len);
|
||||||
|
@ -7,6 +7,7 @@ static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8";
|
|||||||
|
|
||||||
TEST(TrieTest, Test1)
|
TEST(TrieTest, Test1)
|
||||||
{
|
{
|
||||||
|
string s1, s2;
|
||||||
Trie trie;
|
Trie trie;
|
||||||
ASSERT_TRUE(trie.init(DICT_FILE));
|
ASSERT_TRUE(trie.init(DICT_FILE));
|
||||||
ASSERT_LT(trie.getMinLogFreq() + 15.6479, 0.001);
|
ASSERT_LT(trie.getMinLogFreq() + 15.6479, 0.001);
|
||||||
@ -18,14 +19,16 @@ TEST(TrieTest, Test1)
|
|||||||
nodeInfo.freq = 8779;
|
nodeInfo.freq = 8779;
|
||||||
nodeInfo.tag = "v";
|
nodeInfo.tag = "v";
|
||||||
nodeInfo.logFreq = -8.87033;
|
nodeInfo.logFreq = -8.87033;
|
||||||
|
s1 << nodeInfo;
|
||||||
|
s2 << (*trie.find(uni.begin(), uni.end()));
|
||||||
|
|
||||||
EXPECT_EQ(nodeInfo, *trie.find(uni.begin(), uni.end()));
|
EXPECT_EQ("[\"26469\", \"21040\"]:8779:v:-8.87033", s2);
|
||||||
word = "清华大学";
|
word = "清华大学";
|
||||||
vector<pair<uint, const TrieNodeInfo*> > res;
|
vector<pair<size_t, const TrieNodeInfo*> > res;
|
||||||
map<uint, const TrieNodeInfo* > resMap;
|
map<size_t, const TrieNodeInfo* > resMap;
|
||||||
map<uint, const TrieNodeInfo* > map;
|
map<size_t, const TrieNodeInfo* > map;
|
||||||
const char * words[] = {"清", "清华", "清华大学"};
|
const char * words[] = {"清", "清华", "清华大学"};
|
||||||
for(uint i = 0; i < sizeof(words)/sizeof(words[0]); i++)
|
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++)
|
||||||
{
|
{
|
||||||
ASSERT_TRUE(TransCode::decode(words[i], uni));
|
ASSERT_TRUE(TransCode::decode(words[i], uni));
|
||||||
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
|
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
|
||||||
@ -34,7 +37,7 @@ TEST(TrieTest, Test1)
|
|||||||
//TrieNodeInfo
|
//TrieNodeInfo
|
||||||
//res.push_back(make_pair(0, ))
|
//res.push_back(make_pair(0, ))
|
||||||
|
|
||||||
vector<pair<uint, const TrieNodeInfo*> > vec;
|
vector<pair<size_t, const TrieNodeInfo*> > vec;
|
||||||
ASSERT_TRUE(TransCode::decode(word, uni));
|
ASSERT_TRUE(TransCode::decode(word, uni));
|
||||||
//print(uni);
|
//print(uni);
|
||||||
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), vec));
|
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), vec));
|
||||||
|
Loading…
x
Reference in New Issue
Block a user