From 599c130bd9a5790740e575777bda05aecc98b483 Mon Sep 17 00:00:00 2001 From: aholic Date: Thu, 28 Nov 2013 10:49:40 +0800 Subject: [PATCH] make MixSegment looks better --- src/FullSegment.hpp | 19 +++----- src/MixSegment.hpp | 102 ++++++++++++++++++++++++++----------------- src/QuerySegment.hpp | 17 +++----- 3 files changed, 73 insertions(+), 65 deletions(-) diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index 5fc3604..f1182fc 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -61,16 +61,12 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); - //if (!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} if (begin >= end) { LogError("begin >= end"); return false; } + //resut of searching in trie tree vector > tRes; @@ -117,24 +113,21 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { - if (!_getInitFlag()) + assert(_getInitFlag()); + if (begin >= end) { - LogError("not inited."); - return false; - } - if (begin > end) - { - LogError("begin > end"); + LogError("begin >= end"); return false; } + vector uRes; if (!cut(begin, end, uRes)) { LogError("get unicode cut result error."); return false; } - string tmp; + string tmp; for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) { if (TransCode::encode(*uItr, tmp)) diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index 272a57f..79c40d1 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -55,73 +55,95 @@ namespace CppJieba public: using SegmentBase::cut; public: - - virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { - //if(!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} assert(_getInitFlag()); - if(begin == end) + if(begin >= end) { + LogError("begin >= end"); return false; } + vector infos; if(!_mpSeg.cut(begin, end, infos)) { LogError("mpSeg cutDAG failed."); return false; } - Unicode unico; + vector hmmRes; - string tmp; - for(uint i= 0; i < infos.size(); i++) + Unicode piece; + for (uint i = 0, j = 0; i < infos.size(); i++) { - TransCode::encode(infos[i].word,tmp); - if(1 == infos[i].word.size()) + //if mp get a word, it's ok, put it into result + if (1 != infos[i].word.size()) { - unico.push_back(infos[i].word[0]); + res.push_back(infos[i].word); + continue; } - else + + // if mp get a single one, collect it in sequence + j = i; + while (j < infos.size() && infos[j].word.size() == 1) { - if(!unico.empty()) - { - hmmRes.clear(); - if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) - { - LogError("_hmmSeg cut failed."); - return false; - } - for(uint j = 0; j < hmmRes.size(); j++) - { - TransCode::encode(hmmRes[j], tmp); - res.push_back(tmp); - } - } - unico.clear(); - TransCode::encode(infos[i].word, tmp); - res.push_back(tmp); + piece.push_back(infos[j].word[0]); + j++; } - } - if(!unico.empty()) - { - hmmRes.clear(); - if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) + + // cut the sequence with hmm + if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) { LogError("_hmmSeg cut failed."); return false; } - for(uint j = 0; j < hmmRes.size(); j++) + + //put hmm result to return + for (uint k = 0; k < hmmRes.size(); k++) { - TransCode::encode(hmmRes[j], tmp); - res.push_back(tmp); + res.push_back(hmmRes[k]); } + + //clear tmp vars + piece.clear(); + hmmRes.clear(); + + //let i jump over this piece + i = j - 1; } return true; } + + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + { + assert(_getInitFlag()); + if(begin >= end) + { + LogError("begin >= end"); + return false; + } + + vector uRes; + if (!cut(begin, end, uRes)) + { + LogError("get unicode cut result error."); + return false; + } + + string tmp; + for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) + { + if (TransCode::encode(*uItr, tmp)) + { + res.push_back(tmp); + } + else + { + LogError("encode failed."); + } + } + return true; + } }; } diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index de268c5..ae5987e 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -62,16 +62,12 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { assert(_getInitFlag()); - //if (!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} if (begin >= end) { LogError("begin >= end"); return false; } + //use hmm cut first vector hmmRes; if (!_hmmSeg.cut(begin, end, hmmRes)) @@ -107,16 +103,13 @@ namespace CppJieba bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { - if (!_getInitFlag()) + assert(_getInitFlag()); + if (begin >= end) { - LogError("not inited."); - return false; - } - if (begin > end) - { - LogError("begin > end"); + LogError("begin >= end"); return false; } + vector uRes; if (!cut(begin, end, uRes)) {