#ifndef CPPJIEBA_MPSEGMENT_H #define CPPJIEBA_MPSEGMENT_H #include #include #include #include "limonp/Logger.hpp" #include "DictTrie.hpp" #include "SegmentBase.hpp" namespace CppJieba { class MPSegment: public SegmentBase { public: MPSegment(const string& dictPath, const string& userDictPath = "") { dictTrie_ = new DictTrie(dictPath, userDictPath); isNeedDestroy_ = true; LogInfo("MPSegment init(%s) ok", dictPath.c_str()); } MPSegment(const DictTrie* dictTrie) : dictTrie_(dictTrie), isNeedDestroy_(false) { assert(dictTrie_); } ~MPSegment() { if (isNeedDestroy_) { delete dictTrie_; } } void cut(const string& sentence, vector& words, size_t max_word_len = MAX_WORD_LENGTH) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector uwords; uwords.reserve(sentence.size()); while (pre_filter.HasNext()) { range = pre_filter.Next(); cut(range.begin, range.end, uwords, max_word_len); } TransCode::encode(uwords, words); } void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& words, size_t max_word_len = MAX_WORD_LENGTH) const { vector dags; dictTrie_->find(begin, end, dags, max_word_len); CalcDP(dags); CutByDag(dags, words); } const DictTrie* getDictTrie() const { return dictTrie_; } bool isUserDictSingleChineseWord(const Rune & value) const { return dictTrie_->isUserDictSingleChineseWord(value); } private: void CalcDP(vector& dags) const { size_t nextPos; const DictUnit* p; double val; for (vector::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) { rit->pInfo = NULL; rit->weight = MIN_DOUBLE; assert(!rit->nexts.empty()); for (LocalVector >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) { nextPos = it->first; p = it->second; val = 0.0; if (nextPos + 1 < dags.size()) { val += dags[nextPos + 1].weight; } if (p) { val += p->weight; } else { val += dictTrie_->getMinWeight(); } if (val > rit->weight) { rit->pInfo = p; rit->weight = val; } } } } void CutByDag(const vector& dags, vector& words) const { size_t i = 0; while (i < dags.size()) { const DictUnit* p = dags[i].pInfo; if (p) { words.push_back(p->word); i += p->word.size(); } else { //single chinese word words.push_back(Unicode(1, dags[i].rune)); i++; } } } const DictTrie* dictTrie_; bool isNeedDestroy_; }; // class MPSegment } // namespace CppJieba #endif