From 26af60d86773ea5f38faf0f73d8baa748d6916a3 Mon Sep 17 00:00:00 2001 From: aholic Date: Wed, 27 Nov 2013 16:16:10 +0800 Subject: [PATCH] add fullSegment --- src/FullSegment.hpp | 119 ++++++++++++++++++++++++++++++++++++++++++++ src/Trie.hpp | 27 ++++++++--- src/segment.cpp | 17 ++++++- 3 files changed, 155 insertions(+), 8 deletions(-) create mode 100644 src/FullSegment.hpp diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp new file mode 100644 index 0000000..9bee642 --- /dev/null +++ b/src/FullSegment.hpp @@ -0,0 +1,119 @@ +#ifndef CPPJIEBA_FULLSEGMENT_H +#define CPPJIEBA_FULLSEGMENT_H + +#include +#include +#include "Limonp/logger.hpp" +#include "Trie.hpp" +#include "ISegment.hpp" +#include "SegmentBase.hpp" +#include "TransCode.hpp" + +namespace CppJieba +{ + // Segmenter that looks up every position of the input in the dictionary trie. + class FullSegment: public SegmentBase + { + private: + Trie _trie; + + public: + FullSegment(){}; + virtual ~FullSegment(){dispose();}; + public: + bool init(const char* const filePath) + { + if(_getInitFlag()) + { + LogError("already inited before now."); + return false; + } + if(!_trie.init()) + { + LogError("_trie.init failed."); + return false; + } + LogInfo("_trie.loadDict(%s) start...", filePath); + if(!_trie.loadDict(filePath)) + { + LogError("_trie.loadDict faield."); + return false; + } + LogInfo("_trie.loadDict end."); + return _setInitFlag(true); + } + bool dispose() + { + if(!_getInitFlag()) + { + return true; + } + _trie.dispose(); + _setInitFlag(false); + return true; + } + + bool cut(const string& str, vector& res) const + { + return SegmentBase::cut(str, res); + } + + // Scans [begin, end): at each position every trie match of 2+ characters is appended to res. + // A single character is appended only when no earlier match already covers its position. + // Tokens go through TransCode::encode (a token whose encode fails is skipped); always returns true. + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const + { + //result of searching in trie tree + vector > tRes; + + //max index of res's words + int maxIdx = 0; + + // always equals to (uItr - begin) + int uIdx = 0; + + //tmp variables + int wordLen = 0; + string tmp; + for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) + { + //find word start from itr + if (_trie.find(uItr, 
end, tRes)) + { + for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) + { + wordLen = itr->second->word.size(); + if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) + { + if (TransCode::encode(itr->second->word, tmp)) + res.push_back(tmp); + else + LogError("encode failed."); + tmp.clear(); + } + maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; + } + tRes.clear(); + } + else // not found word start from itr + { + if (maxIdx <= uIdx) // never exist in prev results + { + //put itr itself in res + Unicode uTmp(1, *uItr); + if (TransCode::encode(uTmp, tmp)) + { + res.push_back(tmp); + } + tmp.clear(); + ++maxIdx; + } + } + ++uIdx; + } + return true; + } + }; +} + +#endif diff --git a/src/Trie.hpp b/src/Trie.hpp index c8ab85a..785de82 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -214,27 +214,33 @@ namespace CppJieba } return NULL; } - bool find(const Unicode& unico, vector >& res)const + + bool find(Unicode::const_iterator begin, Unicode::const_iterator end, vector >& res) const { if(!_getInitFlag()) { LogFatal("trie not initted!"); return false; } - TrieNode* p = _root; - for(uint i = 0; i < unico.size(); i++) + if (begin >= end) { - if(p->hmap.find(unico[i]) == p-> hmap.end()) + LogFatal("begin >= end"); + return false; + } + TrieNode* p = _root; + for (Unicode::const_iterator itr = begin; itr != end; itr++) + { + if(p->hmap.find(*itr) == p-> hmap.end()) { break; } - p = p->hmap[unico[i]]; + p = p->hmap[*itr]; if(p->isLeaf) { uint pos = p->nodeInfoVecPos; if(pos < _nodeInfoVec.size()) { - res.push_back(make_pair(i, &_nodeInfoVec[pos])); + res.push_back(make_pair(itr-begin, &_nodeInfoVec[pos])); } else { @@ -246,6 +252,15 @@ namespace CppJieba return !res.empty(); } + bool find(const Unicode& unico, vector >& res)const + { + if (!unico.empty()) + { + return find(unico.begin(), unico.end(), res); + } + return false; + } + const TrieNodeInfo* findPrefix(const string& str)const { if(!_getInitFlag()) diff --git a/src/segment.cpp b/src/segment.cpp 
index ca49d37..ac14c67 100644 --- a/src/segment.cpp +++ b/src/segment.cpp @@ -4,6 +4,7 @@ #include "MPSegment.hpp" #include "HMMSegment.hpp" #include "MixSegment.hpp" +#include "FullSegment.hpp" using namespace CppJieba; @@ -16,6 +17,7 @@ void cut(const ISegment * seg, const char * const filePath) { if(!line.empty()) { + cout << line << endl; res.clear(); seg->cut(line, res); cout<\n" <<"options:\n" - <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutMix\n" + <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutFull, cutMix] for now. \n\t\t\tIf not specified, the default is cutMix\n" <<"\t--dictpath\tsee example\n" <<"\t--modelpath\tsee example\n" <<"example:\n" @@ -67,7 +69,18 @@ int main(int argc, char ** argv) cut(&seg, arg[1].c_str()); seg.dispose(); } - else + else if ("cutFull" == algorithm) + { + FullSegment seg; + if (!seg.init(dictPath.c_str())) + { + cout << "seg init failed" << endl; + return 1; // non-zero exit status signals the failed init + } + cut(&seg, arg[1].c_str()); + seg.dispose(); + } + else { MixSegment seg; if(!seg.init(dictPath.c_str(), modelPath.c_str()))