mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Change the algorithm for QuerySegment (now mix + full) | use TrieManager to get the trie for all Segment classes
This commit is contained in:
parent
a0f588a8af
commit
7add684a8a
@ -9,13 +9,14 @@
|
|||||||
#include "ISegment.hpp"
|
#include "ISegment.hpp"
|
||||||
#include "SegmentBase.hpp"
|
#include "SegmentBase.hpp"
|
||||||
#include "TransCode.hpp"
|
#include "TransCode.hpp"
|
||||||
|
#include "TrieManager.hpp"
|
||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba
|
||||||
{
|
{
|
||||||
class FullSegment: public SegmentBase
|
class FullSegment: public SegmentBase
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
Trie _trie;
|
Trie* _trie;
|
||||||
const string _dictPath;
|
const string _dictPath;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -29,18 +30,12 @@ namespace CppJieba
|
|||||||
LogError("already inited before now.");
|
LogError("already inited before now.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if(!_trie.init())
|
_trie = TrieManager::getInstance().getTrie(_dictPath.c_str());
|
||||||
|
if (NULL == _trie)
|
||||||
{
|
{
|
||||||
LogError("_trie.init failed.");
|
LogError("get NULL pointor from getTrie(\"%s\")", _dictPath.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
LogInfo("_trie.loadDict(%s) start...", _dictPath.c_str());
|
|
||||||
if(!_trie.loadDict(_dictPath.c_str()))
|
|
||||||
{
|
|
||||||
LogError("_trie.loadDict faield.");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
LogInfo("_trie.loadDict end.");
|
|
||||||
return _setInitFlag(true);
|
return _setInitFlag(true);
|
||||||
}
|
}
|
||||||
bool dispose()
|
bool dispose()
|
||||||
@ -49,7 +44,6 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
_trie.dispose();
|
|
||||||
_setInitFlag(false);
|
_setInitFlag(false);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -81,7 +75,7 @@ namespace CppJieba
|
|||||||
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
|
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
|
||||||
{
|
{
|
||||||
//find word start from uItr
|
//find word start from uItr
|
||||||
if (_trie.find(uItr, end, tRes))
|
if (_trie->find(uItr, end, tRes))
|
||||||
{
|
{
|
||||||
for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||||
{
|
{
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include "Limonp/logger.hpp"
|
#include "Limonp/logger.hpp"
|
||||||
#include "Trie.hpp"
|
#include "Trie.hpp"
|
||||||
|
#include "TrieManager.hpp"
|
||||||
#include "ISegment.hpp"
|
#include "ISegment.hpp"
|
||||||
#include "SegmentBase.hpp"
|
#include "SegmentBase.hpp"
|
||||||
|
|
||||||
@ -32,7 +33,7 @@ namespace CppJieba
|
|||||||
class MPSegment: public SegmentBase
|
class MPSegment: public SegmentBase
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
Trie _trie;
|
Trie* _trie;
|
||||||
private:
|
private:
|
||||||
const string _dictPath;
|
const string _dictPath;
|
||||||
|
|
||||||
@ -47,18 +48,12 @@ namespace CppJieba
|
|||||||
LogError("already inited before now.");
|
LogError("already inited before now.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if(!_trie.init())
|
_trie = TrieManager::getInstance().getTrie(_dictPath.c_str());
|
||||||
|
if (_trie == NULL)
|
||||||
{
|
{
|
||||||
LogError("_trie.init failed.");
|
LogError("get a NULL pointor form getTrie(\"%s\").", _dictPath.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
LogInfo("_trie.loadDict(%s) start...", _dictPath.c_str());
|
|
||||||
if(!_trie.loadDict(_dictPath.c_str()))
|
|
||||||
{
|
|
||||||
LogError("_trie.loadDict faield.");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
LogInfo("_trie.loadDict end.");
|
|
||||||
return _setInitFlag(true);
|
return _setInitFlag(true);
|
||||||
}
|
}
|
||||||
virtual bool dispose()
|
virtual bool dispose()
|
||||||
@ -67,18 +62,12 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
_trie.dispose();
|
|
||||||
_setInitFlag(false);
|
_setInitFlag(false);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
||||||
{
|
{
|
||||||
//if(!_getInitFlag())
|
|
||||||
//{
|
|
||||||
// LogError("not inited.");
|
|
||||||
// return false;
|
|
||||||
//}
|
|
||||||
assert(_getInitFlag());
|
assert(_getInitFlag());
|
||||||
|
|
||||||
vector<TrieNodeInfo> segWordInfos;
|
vector<TrieNodeInfo> segWordInfos;
|
||||||
@ -145,7 +134,7 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
SegmentChar schar(*it);
|
SegmentChar schar(*it);
|
||||||
uint i = it - begin;
|
uint i = it - begin;
|
||||||
_trie.find(it, end, i, schar.dag);
|
_trie->find(it, end, i, schar.dag);
|
||||||
//DagType::iterator dagIter;
|
//DagType::iterator dagIter;
|
||||||
if(schar.dag.end() == schar.dag.find(i))
|
if(schar.dag.end() == schar.dag.find(i))
|
||||||
{
|
{
|
||||||
@ -183,7 +172,7 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
val += _trie.getMinLogFreq();
|
val += _trie->getMinLogFreq();
|
||||||
}
|
}
|
||||||
if(val > segContext[i].weight)
|
if(val > segContext[i].weight)
|
||||||
{
|
{
|
||||||
@ -211,7 +200,7 @@ namespace CppJieba
|
|||||||
TrieNodeInfo nodeInfo;
|
TrieNodeInfo nodeInfo;
|
||||||
nodeInfo.word.push_back(segContext[i].uniCh);
|
nodeInfo.word.push_back(segContext[i].uniCh);
|
||||||
nodeInfo.freq = 0;
|
nodeInfo.freq = 0;
|
||||||
nodeInfo.logFreq = _trie.getMinLogFreq();
|
nodeInfo.logFreq = _trie->getMinLogFreq();
|
||||||
res.push_back(nodeInfo);
|
res.push_back(nodeInfo);
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
@ -8,21 +8,22 @@
|
|||||||
#include "Trie.hpp"
|
#include "Trie.hpp"
|
||||||
#include "ISegment.hpp"
|
#include "ISegment.hpp"
|
||||||
#include "SegmentBase.hpp"
|
#include "SegmentBase.hpp"
|
||||||
#include "HMMSegment.hpp"
|
|
||||||
#include "FullSegment.hpp"
|
#include "FullSegment.hpp"
|
||||||
|
#include "MixSegment.hpp"
|
||||||
#include "TransCode.hpp"
|
#include "TransCode.hpp"
|
||||||
|
#include "TrieManager.hpp"
|
||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba
|
||||||
{
|
{
|
||||||
class QuerySegment: public SegmentBase
|
class QuerySegment: public SegmentBase
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
HMMSegment _hmmSeg;
|
MixSegment _mixSeg;
|
||||||
FullSegment _fullSeg;
|
FullSegment _fullSeg;
|
||||||
int _maxWordLen;
|
int _maxWordLen;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
QuerySegment(const char* fullSegDict, const char* hmmSegDict, int maxWordLen): _hmmSeg(hmmSegDict), _fullSeg(fullSegDict), _maxWordLen(maxWordLen){};
|
QuerySegment(const char* dict, const char* model, int maxWordLen): _mixSeg(dict, model), _fullSeg(dict), _maxWordLen(maxWordLen){};
|
||||||
virtual ~QuerySegment(){dispose();};
|
virtual ~QuerySegment(){dispose();};
|
||||||
public:
|
public:
|
||||||
bool init()
|
bool init()
|
||||||
@ -31,9 +32,9 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
LogError("inited.");
|
LogError("inited.");
|
||||||
}
|
}
|
||||||
if (!_hmmSeg.init())
|
if (!_mixSeg.init())
|
||||||
{
|
{
|
||||||
LogError("_hmmSeg init");
|
LogError("_mixSeg init");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (!_fullSeg.init())
|
if (!_fullSeg.init())
|
||||||
@ -50,7 +51,7 @@ namespace CppJieba
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
_fullSeg.dispose();
|
_fullSeg.dispose();
|
||||||
_hmmSeg.dispose();
|
_mixSeg.dispose();
|
||||||
_setInitFlag(false);
|
_setInitFlag(false);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -68,22 +69,22 @@ namespace CppJieba
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
//use hmm cut first
|
//use mix cut first
|
||||||
vector<Unicode> hmmRes;
|
vector<Unicode> mixRes;
|
||||||
if (!_hmmSeg.cut(begin, end, hmmRes))
|
if (!_mixSeg.cut(begin, end, mixRes))
|
||||||
{
|
{
|
||||||
LogError("_hmmSeg cut failed.");
|
LogError("_mixSeg cut failed.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<Unicode> fullRes;
|
vector<Unicode> fullRes;
|
||||||
for (vector<Unicode>::const_iterator hmmResItr = hmmRes.begin(); hmmResItr != hmmRes.end(); hmmResItr++)
|
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++)
|
||||||
{
|
{
|
||||||
|
|
||||||
// if it's too long, cut with _fullSeg, put fullRes in res
|
// if it's too long, cut with _fullSeg, put fullRes in res
|
||||||
if (hmmResItr->size() > _maxWordLen)
|
if (mixResItr->size() > _maxWordLen)
|
||||||
{
|
{
|
||||||
if (_fullSeg.cut(hmmResItr->begin(), hmmResItr->end(), fullRes))
|
if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes))
|
||||||
{
|
{
|
||||||
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++)
|
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++)
|
||||||
{
|
{
|
||||||
@ -91,9 +92,9 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else // just use the hmm result
|
else // just use the mix result
|
||||||
{
|
{
|
||||||
res.push_back(*hmmResItr);
|
res.push_back(*mixResItr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user