make MixSegment look better

This commit is contained in:
aholic 2013-11-28 10:49:40 +08:00
parent 12328a3a7e
commit 599c130bd9
3 changed files with 73 additions and 65 deletions

View File

@ -61,16 +61,12 @@ namespace CppJieba
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{ {
assert(_getInitFlag()); assert(_getInitFlag());
//if (!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
if (begin >= end) if (begin >= end)
{ {
LogError("begin >= end"); LogError("begin >= end");
return false; return false;
} }
//result of searching in trie tree //result of searching in trie tree
vector<pair<uint, const TrieNodeInfo*> > tRes; vector<pair<uint, const TrieNodeInfo*> > tRes;
@ -117,24 +113,21 @@ namespace CppJieba
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
{ {
if (!_getInitFlag()) assert(_getInitFlag());
if (begin >= end)
{ {
LogError("not inited."); LogError("begin >= end");
return false;
}
if (begin > end)
{
LogError("begin > end");
return false; return false;
} }
vector<Unicode> uRes; vector<Unicode> uRes;
if (!cut(begin, end, uRes)) if (!cut(begin, end, uRes))
{ {
LogError("get unicode cut result error."); LogError("get unicode cut result error.");
return false; return false;
} }
string tmp;
string tmp;
for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++)
{ {
if (TransCode::encode(*uItr, tmp)) if (TransCode::encode(*uItr, tmp))

View File

@ -55,71 +55,93 @@ namespace CppJieba
public: public:
using SegmentBase::cut; using SegmentBase::cut;
public: public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{ {
//if(!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
assert(_getInitFlag()); assert(_getInitFlag());
if(begin == end) if(begin >= end)
{ {
LogError("begin >= end");
return false; return false;
} }
vector<TrieNodeInfo> infos; vector<TrieNodeInfo> infos;
if(!_mpSeg.cut(begin, end, infos)) if(!_mpSeg.cut(begin, end, infos))
{ {
LogError("mpSeg cutDAG failed."); LogError("mpSeg cutDAG failed.");
return false; return false;
} }
Unicode unico;
vector<Unicode> hmmRes; vector<Unicode> hmmRes;
Unicode piece;
for (uint i = 0, j = 0; i < infos.size(); i++)
{
//if mp gets a word, it's ok, put it into the result
if (1 != infos[i].word.size())
{
res.push_back(infos[i].word);
continue;
}
// if mp gets a single one, collect it in sequence
j = i;
while (j < infos.size() && infos[j].word.size() == 1)
{
piece.push_back(infos[j].word[0]);
j++;
}
// cut the sequence with hmm
if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes))
{
LogError("_hmmSeg cut failed.");
return false;
}
//put hmm result to return
for (uint k = 0; k < hmmRes.size(); k++)
{
res.push_back(hmmRes[k]);
}
//clear tmp vars
piece.clear();
hmmRes.clear();
//let i jump over this piece
i = j - 1;
}
return true;
}
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
assert(_getInitFlag());
if(begin >= end)
{
LogError("begin >= end");
return false;
}
vector<Unicode> uRes;
if (!cut(begin, end, uRes))
{
LogError("get unicode cut result error.");
return false;
}
string tmp; string tmp;
for(uint i= 0; i < infos.size(); i++) for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++)
{ {
TransCode::encode(infos[i].word,tmp); if (TransCode::encode(*uItr, tmp))
if(1 == infos[i].word.size())
{ {
unico.push_back(infos[i].word[0]); res.push_back(tmp);
} }
else else
{ {
if(!unico.empty()) LogError("encode failed.");
{
hmmRes.clear();
if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes))
{
LogError("_hmmSeg cut failed.");
return false;
}
for(uint j = 0; j < hmmRes.size(); j++)
{
TransCode::encode(hmmRes[j], tmp);
res.push_back(tmp);
} }
} }
unico.clear();
TransCode::encode(infos[i].word, tmp);
res.push_back(tmp);
}
}
if(!unico.empty())
{
hmmRes.clear();
if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes))
{
LogError("_hmmSeg cut failed.");
return false;
}
for(uint j = 0; j < hmmRes.size(); j++)
{
TransCode::encode(hmmRes[j], tmp);
res.push_back(tmp);
}
}
return true; return true;
} }
}; };

View File

@ -62,16 +62,12 @@ namespace CppJieba
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{ {
assert(_getInitFlag()); assert(_getInitFlag());
//if (!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
if (begin >= end) if (begin >= end)
{ {
LogError("begin >= end"); LogError("begin >= end");
return false; return false;
} }
//use hmm cut first //use hmm cut first
vector<Unicode> hmmRes; vector<Unicode> hmmRes;
if (!_hmmSeg.cut(begin, end, hmmRes)) if (!_hmmSeg.cut(begin, end, hmmRes))
@ -107,16 +103,13 @@ namespace CppJieba
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
{ {
if (!_getInitFlag()) assert(_getInitFlag());
if (begin >= end)
{ {
LogError("not inited."); LogError("begin >= end");
return false;
}
if (begin > end)
{
LogError("begin > end");
return false; return false;
} }
vector<Unicode> uRes; vector<Unicode> uRes;
if (!cut(begin, end, uRes)) if (!cut(begin, end, uRes))
{ {