From 12328a3a7e9c847303343b72d3302e84c1696a08 Mon Sep 17 00:00:00 2001 From: aholic Date: Thu, 28 Nov 2013 09:17:29 +0800 Subject: [PATCH 1/2] remove macro NO_CODING_LOG --- src/FullSegment.hpp | 8 -------- src/MixSegment.hpp | 1 + src/QuerySegment.hpp | 8 -------- 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index d67f2f3..5fc3604 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -24,13 +24,11 @@ namespace CppJieba public: bool init() { -#ifndef NO_CODING_LOG if(_getInitFlag()) { LogError("already inited before now."); return false; } -#endif if(!_trie.init()) { LogError("_trie.init failed."); @@ -47,12 +45,10 @@ namespace CppJieba } bool dispose() { -#ifndef NO_CODING_LOG if(!_getInitFlag()) { return true; } -#endif _trie.dispose(); _setInitFlag(false); return true; @@ -65,7 +61,6 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); -#ifndef NO_CODING_LOG //if (!_getInitFlag()) //{ // LogError("not inited."); @@ -76,7 +71,6 @@ namespace CppJieba LogError("begin >= end"); return false; } -#endif //resut of searching in trie tree vector > tRes; @@ -123,7 +117,6 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { -#ifndef NO_CODING_LOG if (!_getInitFlag()) { LogError("not inited."); @@ -134,7 +127,6 @@ namespace CppJieba LogError("begin > end"); return false; } -#endif vector uRes; if (!cut(begin, end, uRes)) { diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index 5f35031..272a57f 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -55,6 +55,7 @@ namespace CppJieba public: using SegmentBase::cut; public: + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { //if(!_getInitFlag()) diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index 4487339..de268c5 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -27,12 +27,10 @@ namespace CppJieba public: bool init() { -#ifndef NO_CODING_LOG if (_getInitFlag()) { LogError("inited."); } -#endif if (!_hmmSeg.init()) { LogError("_hmmSeg init"); @@ -47,12 +45,10 @@ namespace CppJieba } bool dispose() { -#ifndef NO_CODING_LOG if(!_getInitFlag()) { return true; } -#endif _fullSeg.dispose(); _hmmSeg.dispose(); _setInitFlag(false); @@ -66,7 +62,6 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); -#ifndef NO_CODING_LOG //if (!_getInitFlag()) //{ // LogError("not inited."); @@ -77,7 +72,6 @@ namespace CppJieba LogError("begin >= end"); return false; } -#endif //use hmm cut first vector hmmRes; if (!_hmmSeg.cut(begin, end, hmmRes)) @@ -113,7 +107,6 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { -#ifndef NO_CODING_LOG if (!_getInitFlag()) { LogError("not inited."); @@ -124,7 +117,6 @@ namespace CppJieba LogError("begin > end"); return false; } -#endif vector uRes; if (!cut(begin, end, uRes)) { From 599c130bd9a5790740e575777bda05aecc98b483 Mon Sep 17 00:00:00 2001 From: aholic Date: Thu, 28 Nov 2013 10:49:40 +0800 Subject: [PATCH 2/2] make MixSegment looks better --- src/FullSegment.hpp | 19 +++----- src/MixSegment.hpp | 102 ++++++++++++++++++++++++++----------------- src/QuerySegment.hpp | 17 +++----- 3 files changed, 73 insertions(+), 65 deletions(-) diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index 5fc3604..f1182fc 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -61,16 +61,12 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); - //if (!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} if (begin >= end) { LogError("begin >= end"); return false; } + //resut of searching in trie tree vector > tRes; @@ -117,24 +113,21 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { - if (!_getInitFlag()) + assert(_getInitFlag()); + if (begin >= end) { - LogError("not inited."); - return false; - } - if (begin > end) - { - LogError("begin > end"); + LogError("begin >= end"); return false; } + vector uRes; if (!cut(begin, end, uRes)) { LogError("get unicode cut result error."); return false; } - string tmp; + string tmp; for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) { if (TransCode::encode(*uItr, tmp)) diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index 272a57f..79c40d1 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -55,73 +55,95 @@ namespace CppJieba public: using SegmentBase::cut; public: - - virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { - //if(!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} assert(_getInitFlag()); - if(begin == end) + if(begin >= end) { + LogError("begin >= end"); return false; } + vector infos; if(!_mpSeg.cut(begin, end, infos)) { LogError("mpSeg cutDAG failed."); return false; } - Unicode unico; + vector hmmRes; - string tmp; - for(uint i= 0; i < infos.size(); i++) + Unicode piece; + for (uint i = 0, j = 0; i < infos.size(); i++) { - TransCode::encode(infos[i].word,tmp); - if(1 == infos[i].word.size()) + //if mp get a word, it's ok, put it into result + if (1 != infos[i].word.size()) { - unico.push_back(infos[i].word[0]); + res.push_back(infos[i].word); + continue; } - else + + // if mp get a single one, collect it in sequence + j = i; + while (j < infos.size() && infos[j].word.size() == 1) { - if(!unico.empty()) - { - hmmRes.clear(); - if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) - { - LogError("_hmmSeg cut failed."); - return false; - } - for(uint j = 0; j < hmmRes.size(); j++) - { - TransCode::encode(hmmRes[j], tmp); - res.push_back(tmp); - } - } - unico.clear(); - TransCode::encode(infos[i].word, tmp); - res.push_back(tmp); + piece.push_back(infos[j].word[0]); + j++; } - } - if(!unico.empty()) - { - hmmRes.clear(); - if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) + + // cut the sequence with hmm + if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) { LogError("_hmmSeg cut failed."); return false; } - for(uint j = 0; j < hmmRes.size(); j++) + + //put hmm result to return + for (uint k = 0; k < hmmRes.size(); k++) { - TransCode::encode(hmmRes[j], tmp); - res.push_back(tmp); + res.push_back(hmmRes[k]); } + + //clear tmp vars + piece.clear(); + hmmRes.clear(); + + //let i jump over this piece + i = j - 1; } return true; } + + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + { + assert(_getInitFlag()); + if(begin >= end) + { + LogError("begin >= end"); + return false; + } + + vector uRes; + if (!cut(begin, end, uRes)) + { + LogError("get unicode cut result error."); + return false; + } + + string tmp; + for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) + { + if (TransCode::encode(*uItr, tmp)) + { + res.push_back(tmp); + } + else + { + LogError("encode failed."); + } + } + return true; + } }; } diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index de268c5..ae5987e 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -62,16 +62,12 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); - //if (!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} if (begin >= end) { LogError("begin >= end"); return false; } + //use hmm cut first vector hmmRes; if (!_hmmSeg.cut(begin, end, hmmRes)) @@ -107,16 +103,13 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { - if (!_getInitFlag()) + assert(_getInitFlag()); + if (begin >= end) { - LogError("not inited."); - return false; - } - if (begin > end) - { - LogError("begin > end"); + LogError("begin >= end"); return false; } + vector uRes; if (!cut(begin, end, uRes)) {