Merge pull request #9 from aholic/master

remove NO_CODING_LOG | make MixSegment look better
This commit is contained in:
Wu Yanyi 2013-12-14 06:27:25 -08:00
commit d47900d65a
3 changed files with 73 additions and 80 deletions

View File

@ -24,13 +24,11 @@ namespace CppJieba
public: public:
bool init() bool init()
{ {
#ifndef NO_CODING_LOG
if(_getInitFlag()) if(_getInitFlag())
{ {
LogError("already inited before now."); LogError("already inited before now.");
return false; return false;
} }
#endif
if(!_trie.init()) if(!_trie.init())
{ {
LogError("_trie.init failed."); LogError("_trie.init failed.");
@ -47,12 +45,10 @@ namespace CppJieba
} }
bool dispose() bool dispose()
{ {
#ifndef NO_CODING_LOG
if(!_getInitFlag()) if(!_getInitFlag())
{ {
return true; return true;
} }
#endif
_trie.dispose(); _trie.dispose();
_setInitFlag(false); _setInitFlag(false);
return true; return true;
@ -65,18 +61,12 @@ namespace CppJieba
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{ {
assert(_getInitFlag()); assert(_getInitFlag());
#ifndef NO_CODING_LOG
//if (!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
if (begin >= end) if (begin >= end)
{ {
LogError("begin >= end"); LogError("begin >= end");
return false; return false;
} }
#endif
//resut of searching in trie tree //resut of searching in trie tree
vector<pair<uint, const TrieNodeInfo*> > tRes; vector<pair<uint, const TrieNodeInfo*> > tRes;
@ -123,26 +113,21 @@ namespace CppJieba
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
{ {
#ifndef NO_CODING_LOG assert(_getInitFlag());
if (!_getInitFlag()) if (begin >= end)
{ {
LogError("not inited."); LogError("begin >= end");
return false; return false;
} }
if (begin > end)
{
LogError("begin > end");
return false;
}
#endif
vector<Unicode> uRes; vector<Unicode> uRes;
if (!cut(begin, end, uRes)) if (!cut(begin, end, uRes))
{ {
LogError("get unicode cut result error."); LogError("get unicode cut result error.");
return false; return false;
} }
string tmp;
string tmp;
for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++)
{ {
if (TransCode::encode(*uItr, tmp)) if (TransCode::encode(*uItr, tmp))

View File

@ -55,72 +55,95 @@ namespace CppJieba
public: public:
using SegmentBase::cut; using SegmentBase::cut;
public: public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{ {
//if(!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
assert(_getInitFlag()); assert(_getInitFlag());
if(begin == end) if(begin >= end)
{ {
LogError("begin >= end");
return false; return false;
} }
vector<TrieNodeInfo> infos; vector<TrieNodeInfo> infos;
if(!_mpSeg.cut(begin, end, infos)) if(!_mpSeg.cut(begin, end, infos))
{ {
LogError("mpSeg cutDAG failed."); LogError("mpSeg cutDAG failed.");
return false; return false;
} }
Unicode unico;
vector<Unicode> hmmRes; vector<Unicode> hmmRes;
string tmp; Unicode piece;
for(uint i= 0; i < infos.size(); i++) for (uint i = 0, j = 0; i < infos.size(); i++)
{ {
TransCode::encode(infos[i].word,tmp); //if mp get a word, it's ok, put it into result
if(1 == infos[i].word.size()) if (1 != infos[i].word.size())
{ {
unico.push_back(infos[i].word[0]); res.push_back(infos[i].word);
continue;
} }
else
// if mp get a single one, collect it in sequence
j = i;
while (j < infos.size() && infos[j].word.size() == 1)
{ {
if(!unico.empty()) piece.push_back(infos[j].word[0]);
{ j++;
hmmRes.clear();
if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes))
{
LogError("_hmmSeg cut failed.");
return false;
}
for(uint j = 0; j < hmmRes.size(); j++)
{
TransCode::encode(hmmRes[j], tmp);
res.push_back(tmp);
}
}
unico.clear();
TransCode::encode(infos[i].word, tmp);
res.push_back(tmp);
} }
}
if(!unico.empty()) // cut the sequence with hmm
{ if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes))
hmmRes.clear();
if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes))
{ {
LogError("_hmmSeg cut failed."); LogError("_hmmSeg cut failed.");
return false; return false;
} }
for(uint j = 0; j < hmmRes.size(); j++)
//put hmm result to return
for (uint k = 0; k < hmmRes.size(); k++)
{ {
TransCode::encode(hmmRes[j], tmp); res.push_back(hmmRes[k]);
res.push_back(tmp);
} }
//clear tmp vars
piece.clear();
hmmRes.clear();
//let i jump over this piece
i = j - 1;
} }
return true; return true;
} }
// Segments the range [begin, end) and appends every resulting word to `res`
// as an encoded string.
//
// The actual segmentation is done by the vector<Unicode> overload of cut();
// this overload only converts that result with TransCode::encode.
//
// Returns false when begin >= end or when the Unicode overload fails.
// A word that fails to encode is logged and skipped; the call still
// returns true in that case.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
    assert(_getInitFlag());
    if(begin >= end)
    {
        LogError("begin >= end");
        return false;
    }

    // Run the Unicode-level segmentation first.
    vector<Unicode> words;
    const bool segmented = cut(begin, end, words);
    if (!segmented)
    {
        LogError("get unicode cut result error.");
        return false;
    }

    // Encode word by word, reusing one output buffer.
    string encoded;
    vector<Unicode>::const_iterator word = words.begin();
    while (word != words.end())
    {
        if (!TransCode::encode(*word, encoded))
        {
            LogError("encode failed.");
        }
        else
        {
            res.push_back(encoded);
        }
        ++word;
    }
    return true;
}
}; };
} }

View File

@ -27,12 +27,10 @@ namespace CppJieba
public: public:
bool init() bool init()
{ {
#ifndef NO_CODING_LOG
if (_getInitFlag()) if (_getInitFlag())
{ {
LogError("inited."); LogError("inited.");
} }
#endif
if (!_hmmSeg.init()) if (!_hmmSeg.init())
{ {
LogError("_hmmSeg init"); LogError("_hmmSeg init");
@ -47,12 +45,10 @@ namespace CppJieba
} }
bool dispose() bool dispose()
{ {
#ifndef NO_CODING_LOG
if(!_getInitFlag()) if(!_getInitFlag())
{ {
return true; return true;
} }
#endif
_fullSeg.dispose(); _fullSeg.dispose();
_hmmSeg.dispose(); _hmmSeg.dispose();
_setInitFlag(false); _setInitFlag(false);
@ -66,18 +62,12 @@ namespace CppJieba
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{ {
assert(_getInitFlag()); assert(_getInitFlag());
#ifndef NO_CODING_LOG
//if (!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
if (begin >= end) if (begin >= end)
{ {
LogError("begin >= end"); LogError("begin >= end");
return false; return false;
} }
#endif
//use hmm cut first //use hmm cut first
vector<Unicode> hmmRes; vector<Unicode> hmmRes;
if (!_hmmSeg.cut(begin, end, hmmRes)) if (!_hmmSeg.cut(begin, end, hmmRes))
@ -113,18 +103,13 @@ namespace CppJieba
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
{ {
#ifndef NO_CODING_LOG assert(_getInitFlag());
if (!_getInitFlag()) if (begin >= end)
{ {
LogError("not inited."); LogError("begin >= end");
return false; return false;
} }
if (begin > end)
{
LogError("begin > end");
return false;
}
#endif
vector<Unicode> uRes; vector<Unicode> uRes;
if (!cut(begin, end, uRes)) if (!cut(begin, end, uRes))
{ {