/************************************ * file enc : ASCII * author : wuyanyi09@gmail.com ************************************/ #ifndef CPPJIEBA_MPSEGMENT_H #define CPPJIEBA_MPSEGMENT_H #include #include #include #include "Limonp/logger.hpp" #include "DictTrie.hpp" #include "DictTrie.hpp" #include "ISegment.hpp" #include "SegmentBase.hpp" namespace CppJieba { struct SegmentChar { uint16_t uniCh; DagType dag; const DictUnit * pInfo; double weight; size_t nextPos; SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0) {} }; class MPSegment: public SegmentBase { protected: DictTrie _dictTrie; public: MPSegment(){_setInitFlag(false);}; explicit MPSegment(const string& dictPath, const string& userDictPath = "") { _setInitFlag(init(dictPath, userDictPath)); }; virtual ~MPSegment(){}; public: bool init(const string& dictPath, const string& userDictPath = "") { if(_getInitFlag()) { LogError("already inited before now."); return false; } _dictTrie.init(dictPath, userDictPath); assert(_dictTrie); LogInfo("MPSegment init(%s) ok", dictPath.c_str()); return _setInitFlag(true); } public: using SegmentBase::cut; virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { assert(_getInitFlag()); if(begin == end) { return false; } vector words; if(!cut(begin, end, words)) { return false; } string word; for(size_t i = 0; i < words.size(); i++) { if(TransCode::encode(words[i], word)) { res.push_back(word); } else { LogError("encode failed."); } } return true; } bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& res) const { if(!_getInitFlag()) { LogError("not inited."); return false; } vector SegmentChars; //calc DAG if(!_calcDAG(begin, end, SegmentChars)) { LogError("_calcDAG failed."); return false; } if(!_calcDP(SegmentChars)) { LogError("_calcDP failed."); return false; } if(!_cut(SegmentChars, res)) { LogError("_cut failed."); return false; } return true; } private: bool _calcDAG(Unicode::const_iterator begin, Unicode::const_iterator end, vector& SegmentChars) const { SegmentChar schar; size_t offset; for(Unicode::const_iterator it = begin; it != end; it++) { schar.uniCh = *it; offset = it - begin; schar.dag.clear(); _dictTrie.find(it, end, schar.dag, offset); if(!isIn(schar.dag, offset)) { schar.dag[offset] = NULL; } SegmentChars.push_back(schar); } return true; } bool _calcDP(vector& SegmentChars)const { if(SegmentChars.empty()) { LogError("SegmentChars empty"); return false; } size_t nextPos; const DictUnit* p; double val; for(int i = SegmentChars.size() - 1; i >= 0; i--) { SegmentChars[i].pInfo = NULL; SegmentChars[i].weight = MIN_DOUBLE; for(DagType::const_iterator it = SegmentChars[i].dag.begin(); it != SegmentChars[i].dag.end(); it++) { nextPos = it->first; p = it->second; val = 0.0; if(nextPos + 1 < SegmentChars.size()) { val += SegmentChars[nextPos + 1].weight; } if(p) { val += p->weight; } else { val += _dictTrie.getMinWeight(); } if(val > SegmentChars[i].weight) { SegmentChars[i].pInfo = p; SegmentChars[i].weight = val; } } } return true; } bool _cut(vector& SegmentChars, vector& res)const { size_t i = 0; while(i < SegmentChars.size()) { const DictUnit* p = SegmentChars[i].pInfo; if(p) { res.push_back(p->word); i += p->word.size(); } else//single chinese word { res.push_back(Unicode(1, SegmentChars[i].uniCh)); i++; } } return true; } }; } #endif