diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index d67f2f3..f1182fc 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -24,13 +24,11 @@ namespace CppJieba public: bool init() { -#ifndef NO_CODING_LOG if(_getInitFlag()) { LogError("already inited before now."); return false; } -#endif if(!_trie.init()) { LogError("_trie.init failed."); @@ -47,12 +45,10 @@ namespace CppJieba } bool dispose() { -#ifndef NO_CODING_LOG if(!_getInitFlag()) { return true; } -#endif _trie.dispose(); _setInitFlag(false); return true; @@ -65,18 +61,12 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); -#ifndef NO_CODING_LOG - //if (!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} if (begin >= end) { LogError("begin >= end"); return false; } -#endif + //resut of searching in trie tree vector > tRes; @@ -123,26 +113,21 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { -#ifndef NO_CODING_LOG - if (!_getInitFlag()) + assert(_getInitFlag()); + if (begin >= end) { - LogError("not inited."); + LogError("begin >= end"); return false; } - if (begin > end) - { - LogError("begin > end"); - return false; - } -#endif + vector uRes; if (!cut(begin, end, uRes)) { LogError("get unicode cut result error."); return false; } - string tmp; + string tmp; for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) { if (TransCode::encode(*uItr, tmp)) diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index 5f35031..79c40d1 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -55,72 +55,95 @@ namespace CppJieba public: using SegmentBase::cut; public: - virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { - //if(!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} assert(_getInitFlag()); - if(begin == end) + if(begin >= end) { + LogError("begin >= end"); return false; } + vector infos; if(!_mpSeg.cut(begin, end, infos)) { LogError("mpSeg cutDAG failed."); return false; } - Unicode unico; + vector hmmRes; - string tmp; - for(uint i= 0; i < infos.size(); i++) + Unicode piece; + for (uint i = 0, j = 0; i < infos.size(); i++) { - TransCode::encode(infos[i].word,tmp); - if(1 == infos[i].word.size()) + //if mp get a word, it's ok, put it into result + if (1 != infos[i].word.size()) { - unico.push_back(infos[i].word[0]); + res.push_back(infos[i].word); + continue; } - else + + // if mp get a single one, collect it in sequence + j = i; + while (j < infos.size() && infos[j].word.size() == 1) { - if(!unico.empty()) - { - hmmRes.clear(); - if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) - { - LogError("_hmmSeg cut failed."); - return false; - } - for(uint j = 0; j < hmmRes.size(); j++) - { - TransCode::encode(hmmRes[j], tmp); - res.push_back(tmp); - } - } - unico.clear(); - TransCode::encode(infos[i].word, tmp); - res.push_back(tmp); + piece.push_back(infos[j].word[0]); + j++; } - } - if(!unico.empty()) - { - hmmRes.clear(); - if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) + + // cut the sequence with hmm + if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) { LogError("_hmmSeg cut failed."); return false; } - for(uint j = 0; j < hmmRes.size(); j++) + + //put hmm result to return + for (uint k = 0; k < hmmRes.size(); k++) { - TransCode::encode(hmmRes[j], tmp); - res.push_back(tmp); + res.push_back(hmmRes[k]); } + + //clear tmp vars + piece.clear(); + hmmRes.clear(); + + //let i jump over this piece + i = j - 1; } return true; } + + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + { + assert(_getInitFlag()); + if(begin >= end) + { + LogError("begin >= end"); + return false; + } + + vector uRes; + if (!cut(begin, end, uRes)) + { + LogError("get unicode cut result error."); + return false; + } + + string tmp; + for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) + { + if (TransCode::encode(*uItr, tmp)) + { + res.push_back(tmp); + } + else + { + LogError("encode failed."); + } + } + return true; + } }; } diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index 4487339..ae5987e 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -27,12 +27,10 @@ namespace CppJieba public: bool init() { -#ifndef NO_CODING_LOG if (_getInitFlag()) { LogError("inited."); } -#endif if (!_hmmSeg.init()) { LogError("_hmmSeg init"); @@ -47,12 +45,10 @@ namespace CppJieba } bool dispose() { -#ifndef NO_CODING_LOG if(!_getInitFlag()) { return true; } -#endif _fullSeg.dispose(); _hmmSeg.dispose(); _setInitFlag(false); @@ -66,18 +62,12 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); -#ifndef NO_CODING_LOG - //if (!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} if (begin >= end) { LogError("begin >= end"); return false; } -#endif + //use hmm cut first vector hmmRes; if (!_hmmSeg.cut(begin, end, hmmRes)) @@ -113,18 +103,13 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { -#ifndef NO_CODING_LOG - if (!_getInitFlag()) + assert(_getInitFlag()); + if (begin >= end) { - LogError("not inited."); + LogError("begin >= end"); return false; } - if (begin > end) - { - LogError("begin > end"); - return false; - } -#endif + vector uRes; if (!cut(begin, end, uRes)) {