diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index 1bc4cf2..91c5fdc 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -1,5 +1,5 @@ #ifndef CPPJIEBA_FULLSEGMENT_H -#define CPPJIEBAi_FULLSEGMENT_H +#define CPPJIEBA_FULLSEGMENT_H #include #include @@ -23,11 +23,13 @@ namespace CppJieba public: bool init() { +#ifndef NO_CODING_LOG if(_getInitFlag()) { LogError("already inited before now."); return false; } +#endif if(!_trie.init()) { LogError("_trie.init failed."); @@ -44,22 +46,35 @@ namespace CppJieba } bool dispose() { +#ifndef NO_CODING_LOG if(!_getInitFlag()) { return true; } +#endif _trie.dispose(); _setInitFlag(false); return true; } - bool cut(const string& str, vector& res) const - { - return SegmentBase::cut(str, res); - } + public: + using SegmentBase::cut; - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const + public: + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { +#ifndef NO_CODING_LOG + if (!_getInitFlag()) + { + LogError("not inited."); + return false; + } + if (begin > end) + { + LogError("begin > end"); + return false; + } +#endif //resut of searching in trie tree vector > tRes; @@ -71,10 +86,9 @@ namespace CppJieba //tmp variables int wordLen = 0; - string tmp; for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) { - //find word start from itr + //find word start from uItr if (_trie.find(uItr, end, tRes)) { for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) @@ -82,32 +96,64 @@ namespace CppJieba wordLen = itr->second->word.size(); if (wordLen >= 2 || tRes.size() == 1 && maxIdx <= uIdx) { - if (TransCode::encode(itr->second->word, tmp)) - res.push_back(tmp); - else - LogError("encode failed."); - tmp.clear(); + res.push_back(itr->second->word); } maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; } tRes.clear(); } - else // not found word start from itr + else // not found word start from uItr { if (maxIdx <= uIdx) // never exist in prev results { //put itr itself in res - Unicode uTmp(1, *uItr); - if (TransCode::encode(uTmp, tmp)) - { - res.push_back(tmp); - } - tmp.clear(); + res.push_back(Unicode(1, *uItr)); + + //mark it exits ++maxIdx; } } ++uIdx; } + + return true; + } + + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const + { +#ifndef NO_CODING_LOG + if (!_getInitFlag()) + { + LogError("not inited."); + return false; + } + if (begin > end) + { + LogError("begin > end"); + return false; + } +#endif + vector uRes; + if (!cut(begin, end, uRes)) + { + LogError("get unicode cut result error."); + return false; + } + string tmp; + + for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) + { + if (TransCode::encode(*uItr, tmp)) + { + res.push_back(tmp); + } + else + { + LogError("encode failed."); + } + } + + return true; } }; } diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp new file mode 100644 index 0000000..58cf1a2 --- /dev/null +++ b/src/QuerySegment.hpp @@ -0,0 +1,151 @@ +#ifndef CPPJIEBA_QUERYSEGMENT_H +#define CPPJIEBA_QUERYSEGMENT_H + +#include +#include +#include "Limonp/logger.hpp" +#include "Trie.hpp" +#include "ISegment.hpp" +#include "SegmentBase.hpp" +#include "HMMSegment.hpp" +#include "FullSegment.hpp" +#include "TransCode.hpp" + +namespace CppJieba +{ + class QuerySegment: public SegmentBase + { + private: + HMMSegment _hmmSeg; + FullSegment _fullSeg; + int _maxWordLen; + + public: + QuerySegment(const char* fullSegDict, const char* hmmSegDict, int maxWordLen): _hmmSeg(hmmSegDict), _fullSeg(fullSegDict), _maxWordLen(maxWordLen){}; + virtual ~QuerySegment(){dispose();}; + public: + bool init() + { +#ifndef NO_CODING_LOG + if (_getInitFlag()) + { + LogError("inited."); + } +#endif + if (!_hmmSeg.init()) + { + LogError("_hmmSeg init"); + return false; + } + if (!_fullSeg.init()) + { + LogError("_fullSeg init"); + return false; + } + return _setInitFlag(true); + } + bool dispose() + { +#ifndef NO_CODING_LOG + if(!_getInitFlag()) + { + return true; + } +#endif + _fullSeg.dispose(); + _hmmSeg.dispose(); + _setInitFlag(false); + return true; + } + + public: + using SegmentBase::cut; + + public: + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const + { +#ifndef NO_CODING_LOG + if (!_getInitFlag()) + { + LogError("not inited."); + return false; + } + if (begin > end) + { + LogError("begin > end"); + return false; + } +#endif + //use hmm cut first + vector hmmRes; + if (!_hmmSeg.cut(begin, end, hmmRes)) + { + LogError("_hmmSeg cut failed."); + return false; + } + + vector fullRes; + for (vector::const_iterator hmmResItr = hmmRes.begin(); hmmResItr != hmmRes.end(); hmmResItr++) + { + + // if it's too long, cut with _fullSeg, put fullRes in res + if (hmmResItr->size() > _maxWordLen) + { + if (_fullSeg.cut(hmmResItr->begin(), hmmResItr->end(), fullRes)) + { + for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) + { + res.push_back(*fullResItr); + } + } + } + else // just use the hmm result + { + res.push_back(*hmmResItr); + } + } + + return true; + } + + + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const + { +#ifndef NO_CODING_LOG + if (!_getInitFlag()) + { + LogError("not inited."); + return false; + } + if (begin > end) + { + LogError("begin > end"); + return false; + } +#endif + vector uRes; + if (!cut(begin, end, uRes)) + { + LogError("get unicode cut result error."); + return false; + } + + string tmp; + for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) + { + if (TransCode::encode(*uItr, tmp)) + { + res.push_back(tmp); + } + else + { + LogError("encode failed."); + } + } + + return true; + } + }; +} + +#endif diff --git a/src/segment.cpp b/src/segment.cpp index d46dc7b..2eb61f4 100644 --- a/src/segment.cpp +++ b/src/segment.cpp @@ -1,10 +1,13 @@ #include #include +#include +#include #include "Limonp/ArgvContext.hpp" #include "MPSegment.hpp" #include "HMMSegment.hpp" #include "MixSegment.hpp" #include "FullSegment.hpp" +#include "QuerySegment.hpp" using namespace CppJieba; @@ -37,13 +40,15 @@ int main(int argc, char ** argv) { cout<<"usage: \n\t"<\n" <<"options:\n" - <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutFull, cutMix] for now. \n\t\t\tIf not specified, the default is cutMix\n" + <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutFull, cutQuery, cutMix] for now. \n\t\t\tIf not specified, the default is cutMix\n" <<"\t--dictpath\tsee example\n" <<"\t--modelpath\tsee example\n" + <<"\t--maxlen\tspecify the granularity of cut used in cutQuery, If not specified, the default is 3\n" <<"example:\n" <<"\t"<