Change the algorithm for QuerySegment (now mix + full) | use TrieManager to get the trie for all Segment classes

This commit is contained in:
aholic 2013-12-16 14:18:44 +08:00
parent a0f588a8af
commit 7add684a8a
3 changed files with 30 additions and 46 deletions

View File

@ -9,13 +9,14 @@
#include "ISegment.hpp" #include "ISegment.hpp"
#include "SegmentBase.hpp" #include "SegmentBase.hpp"
#include "TransCode.hpp" #include "TransCode.hpp"
#include "TrieManager.hpp"
namespace CppJieba namespace CppJieba
{ {
class FullSegment: public SegmentBase class FullSegment: public SegmentBase
{ {
private: private:
Trie _trie; Trie* _trie;
const string _dictPath; const string _dictPath;
public: public:
@ -29,18 +30,12 @@ namespace CppJieba
LogError("already inited before now."); LogError("already inited before now.");
return false; return false;
} }
if(!_trie.init()) _trie = TrieManager::getInstance().getTrie(_dictPath.c_str());
if (NULL == _trie)
{ {
LogError("_trie.init failed."); LogError("get NULL pointor from getTrie(\"%s\")", _dictPath.c_str());
return false; return false;
} }
LogInfo("_trie.loadDict(%s) start...", _dictPath.c_str());
if(!_trie.loadDict(_dictPath.c_str()))
{
LogError("_trie.loadDict faield.");
return false;
}
LogInfo("_trie.loadDict end.");
return _setInitFlag(true); return _setInitFlag(true);
} }
bool dispose() bool dispose()
@ -49,7 +44,6 @@ namespace CppJieba
{ {
return true; return true;
} }
_trie.dispose();
_setInitFlag(false); _setInitFlag(false);
return true; return true;
} }
@ -81,7 +75,7 @@ namespace CppJieba
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
{ {
//find word start from uItr //find word start from uItr
if (_trie.find(uItr, end, tRes)) if (_trie->find(uItr, end, tRes))
{ {
for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
{ {

View File

@ -10,6 +10,7 @@
#include <cassert> #include <cassert>
#include "Limonp/logger.hpp" #include "Limonp/logger.hpp"
#include "Trie.hpp" #include "Trie.hpp"
#include "TrieManager.hpp"
#include "ISegment.hpp" #include "ISegment.hpp"
#include "SegmentBase.hpp" #include "SegmentBase.hpp"
@ -32,7 +33,7 @@ namespace CppJieba
class MPSegment: public SegmentBase class MPSegment: public SegmentBase
{ {
private: private:
Trie _trie; Trie* _trie;
private: private:
const string _dictPath; const string _dictPath;
@ -47,18 +48,12 @@ namespace CppJieba
LogError("already inited before now."); LogError("already inited before now.");
return false; return false;
} }
if(!_trie.init()) _trie = TrieManager::getInstance().getTrie(_dictPath.c_str());
if (_trie == NULL)
{ {
LogError("_trie.init failed."); LogError("get a NULL pointor form getTrie(\"%s\").", _dictPath.c_str());
return false; return false;
} }
LogInfo("_trie.loadDict(%s) start...", _dictPath.c_str());
if(!_trie.loadDict(_dictPath.c_str()))
{
LogError("_trie.loadDict faield.");
return false;
}
LogInfo("_trie.loadDict end.");
return _setInitFlag(true); return _setInitFlag(true);
} }
virtual bool dispose() virtual bool dispose()
@ -67,18 +62,12 @@ namespace CppJieba
{ {
return true; return true;
} }
_trie.dispose();
_setInitFlag(false); _setInitFlag(false);
return true; return true;
} }
public: public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{ {
//if(!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
assert(_getInitFlag()); assert(_getInitFlag());
vector<TrieNodeInfo> segWordInfos; vector<TrieNodeInfo> segWordInfos;
@ -145,7 +134,7 @@ namespace CppJieba
{ {
SegmentChar schar(*it); SegmentChar schar(*it);
uint i = it - begin; uint i = it - begin;
_trie.find(it, end, i, schar.dag); _trie->find(it, end, i, schar.dag);
//DagType::iterator dagIter; //DagType::iterator dagIter;
if(schar.dag.end() == schar.dag.find(i)) if(schar.dag.end() == schar.dag.find(i))
{ {
@ -183,7 +172,7 @@ namespace CppJieba
} }
else else
{ {
val += _trie.getMinLogFreq(); val += _trie->getMinLogFreq();
} }
if(val > segContext[i].weight) if(val > segContext[i].weight)
{ {
@ -211,7 +200,7 @@ namespace CppJieba
TrieNodeInfo nodeInfo; TrieNodeInfo nodeInfo;
nodeInfo.word.push_back(segContext[i].uniCh); nodeInfo.word.push_back(segContext[i].uniCh);
nodeInfo.freq = 0; nodeInfo.freq = 0;
nodeInfo.logFreq = _trie.getMinLogFreq(); nodeInfo.logFreq = _trie->getMinLogFreq();
res.push_back(nodeInfo); res.push_back(nodeInfo);
i++; i++;
} }

View File

@ -8,21 +8,22 @@
#include "Trie.hpp" #include "Trie.hpp"
#include "ISegment.hpp" #include "ISegment.hpp"
#include "SegmentBase.hpp" #include "SegmentBase.hpp"
#include "HMMSegment.hpp"
#include "FullSegment.hpp" #include "FullSegment.hpp"
#include "MixSegment.hpp"
#include "TransCode.hpp" #include "TransCode.hpp"
#include "TrieManager.hpp"
namespace CppJieba namespace CppJieba
{ {
class QuerySegment: public SegmentBase class QuerySegment: public SegmentBase
{ {
private: private:
HMMSegment _hmmSeg; MixSegment _mixSeg;
FullSegment _fullSeg; FullSegment _fullSeg;
int _maxWordLen; int _maxWordLen;
public: public:
QuerySegment(const char* fullSegDict, const char* hmmSegDict, int maxWordLen): _hmmSeg(hmmSegDict), _fullSeg(fullSegDict), _maxWordLen(maxWordLen){}; QuerySegment(const char* dict, const char* model, int maxWordLen): _mixSeg(dict, model), _fullSeg(dict), _maxWordLen(maxWordLen){};
virtual ~QuerySegment(){dispose();}; virtual ~QuerySegment(){dispose();};
public: public:
bool init() bool init()
@ -31,9 +32,9 @@ namespace CppJieba
{ {
LogError("inited."); LogError("inited.");
} }
if (!_hmmSeg.init()) if (!_mixSeg.init())
{ {
LogError("_hmmSeg init"); LogError("_mixSeg init");
return false; return false;
} }
if (!_fullSeg.init()) if (!_fullSeg.init())
@ -50,7 +51,7 @@ namespace CppJieba
return true; return true;
} }
_fullSeg.dispose(); _fullSeg.dispose();
_hmmSeg.dispose(); _mixSeg.dispose();
_setInitFlag(false); _setInitFlag(false);
return true; return true;
} }
@ -68,22 +69,22 @@ namespace CppJieba
return false; return false;
} }
//use hmm cut first //use mix cut first
vector<Unicode> hmmRes; vector<Unicode> mixRes;
if (!_hmmSeg.cut(begin, end, hmmRes)) if (!_mixSeg.cut(begin, end, mixRes))
{ {
LogError("_hmmSeg cut failed."); LogError("_mixSeg cut failed.");
return false; return false;
} }
vector<Unicode> fullRes; vector<Unicode> fullRes;
for (vector<Unicode>::const_iterator hmmResItr = hmmRes.begin(); hmmResItr != hmmRes.end(); hmmResItr++) for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++)
{ {
// if it's too long, cut with _fullSeg, put fullRes in res // if it's too long, cut with _fullSeg, put fullRes in res
if (hmmResItr->size() > _maxWordLen) if (mixResItr->size() > _maxWordLen)
{ {
if (_fullSeg.cut(hmmResItr->begin(), hmmResItr->end(), fullRes)) if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes))
{ {
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++)
{ {
@ -91,9 +92,9 @@ namespace CppJieba
} }
} }
} }
else // just use the hmm result else // just use the mix result
{ {
res.push_back(*hmmResItr); res.push_back(*mixResItr);
} }
} }