diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c32ad6f..66803c6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,7 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) -SET(LIBCPPJIEBA_SRC HMMSegment.cpp MixSegment.cpp MPSegment.cpp ) +SET(LIBCPPJIEBA_SRC HMMSegment.cpp MixSegment.cpp) ADD_LIBRARY(cppjieba STATIC ${LIBCPPJIEBA_SRC}) ADD_EXECUTABLE(cjsegment segment.cpp) ADD_EXECUTABLE(cjserver server.cpp) diff --git a/src/MPSegment.cpp b/src/MPSegment.cpp deleted file mode 100644 index cfbd025..0000000 --- a/src/MPSegment.cpp +++ /dev/null @@ -1,265 +0,0 @@ -/************************************ - * file enc : AISCII - * author : wuyanyi09@gmail.com -************************************/ -#include "MPSegment.h" - -namespace CppJieba -{ - - bool MPSegment::init(const char* const filePath) - { - if(_getInitFlag()) - { - LogError("already inited before now."); - return false; - } - if(!_trie.init()) - { - LogError("_trie.init failed."); - return false; - } - LogInfo("_trie.loadDict(%s) start...", filePath); - if(!_trie.loadDict(filePath)) - { - LogError("_trie.loadDict faield."); - return false; - } - LogInfo("_trie.loadDict end."); - return _setInitFlag(true); - } - - bool MPSegment::dispose() - { - if(!_getInitFlag()) - { - return true; - } - _trie.dispose(); - _setInitFlag(false); - return true; - } - - bool MPSegment::cut(const string& str, vector& res)const - { - return SegmentBase::cut(str, res); - } - - bool MPSegment::cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const - { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } - vector segWordInfos; - if(!cut(begin, end, segWordInfos)) - { - return false; - } - string tmp; - for(uint i = 0; i < segWordInfos.size(); i++) - { - if(TransCode::encode(segWordInfos[i].word, tmp)) - { - res.push_back(tmp); - } - else - { - LogError("encode failed."); - } - } - return true; - } - - bool MPSegment::cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& segWordInfos)const - { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } - SegmentContext segContext; - for(Unicode::const_iterator it = begin; it != end; it++) - { - segContext.push_back(SegmentChar(*it)); - } - - //calc DAG - if(!_calcDAG(segContext)) - { - LogError("_calcDAG failed."); - return false; - } - - if(!_calcDP(segContext)) - { - LogError("_calcDP failed."); - return false; - } - - if(!_cut(segContext, segWordInfos)) - { - LogError("_cut failed."); - return false; - } - - return true; - } - - bool MPSegment::cut(const string& str, vector& segWordInfos)const - { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } - if(str.empty()) - { - return false; - } - Unicode sentence; - - if(!TransCode::decode(str, sentence)) - { - LogError("TransCode::decode failed."); - return false; - } - return cut(sentence.begin(), sentence.end(), segWordInfos); - - } - - bool MPSegment::_calcDAG(SegmentContext& segContext)const - { - if(segContext.empty()) - { - LogError("segContext empty."); - return false; - } - - Unicode unicode; - for(uint i = 0; i < segContext.size(); i++) - { - unicode.clear(); - for(uint j = i ; j < segContext.size(); j++) - { - unicode.push_back(segContext[j].uniCh); - } - - vector > vp; - if(_trie.find(unicode, vp)) - { - for(uint j = 0; j < vp.size(); j++) - { - uint nextp = vp[j].first + i; - segContext[i].dag[nextp] = vp[j].second; - //cout<toString()); - } - } - if(segContext[i].dag.end() == segContext[i].dag.find(i)) - { - segContext[i].dag[i] = NULL; - } - } - return true; - } - - bool MPSegment::_calcDP(SegmentContext& segContext)const - { - if(segContext.empty()) - { - LogError("segContext empty"); - return false; - } - - for(int i = segContext.size() - 1; i >= 0; i--) - { - segContext[i].pInfo = NULL; - segContext[i].weight = MIN_DOUBLE; - for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++) - { - uint nextPos = it->first; - const TrieNodeInfo* p = it->second; - double val = 0.0; - if(nextPos + 1 < segContext.size()) - { - val += segContext[nextPos + 1].weight; - } - - if(p) - { - val += p->logFreq; - } - else - { - val += _trie.getMinLogFreq(); - } - if(val > segContext[i].weight) - { - segContext[i].pInfo = p; - segContext[i].weight = val; - } - } - } - return true; - - } - - bool MPSegment::_cut(SegmentContext& segContext, vector& res)const - { - uint i = 0; - while(i < segContext.size()) - { - const TrieNodeInfo* p = segContext[i].pInfo; - if(p) - { - res.push_back(*p); - i += p->word.size(); - } - else//single chinese word - { - TrieNodeInfo nodeInfo; - nodeInfo.word.push_back(segContext[i].uniCh); - nodeInfo.freq = 0; - nodeInfo.logFreq = _trie.getMinLogFreq(); - res.push_back(nodeInfo); - i++; - } - } - return true; - } - -} - - -#ifdef SEGMENT_UT -using namespace CppJieba; - -int main() -{ - MPSegment segment; - segment.init(); - if(!segment._loadSegDict("../dicts/segdict.gbk.v3.0")) - { - cerr<<"1"< res; - string line; - while(getline(ifile, line)) - { - res.clear(); - segment.cut(line, res); - PRINT_VECTOR(res); - getchar(); - } - - segment.dispose(); - return 0; -} - -#endif diff --git a/src/MPSegment.h b/src/MPSegment.h deleted file mode 100644 index 9dd0a7c..0000000 --- a/src/MPSegment.h +++ /dev/null @@ -1,49 +0,0 @@ -/************************************ - * file enc : ASCII - * author : wuyanyi09@gmail.com -************************************/ -#ifndef CPPJIEBA_MPSEGMENT_H -#define CPPJIEBA_MPSEGMENT_H - -#include -#include -#include "Limonp/logger.hpp" -#include "Trie.hpp" -#include "globals.h" -#include "ISegment.hpp" -#include "SegmentBase.hpp" - -namespace CppJieba -{ - - typedef vector SegmentContext; - - class MPSegment: public SegmentBase - { - private: - Trie _trie; - - public: - MPSegment(){}; - virtual ~MPSegment(){dispose();}; - public: - bool init(const char* const filePath); - bool dispose(); - public: - //bool cut(const string& str, vector& segWordInfos)const; - bool cut(const string& str, vector& res)const; - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const; - bool cut(const string& str, vector& segWordInfos)const; - bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& segWordInfos)const; - //virtual bool cut(const string& str, vector& res)const; - - private: - bool _calcDAG(SegmentContext& segContext)const; - bool _calcDP(SegmentContext& segContext)const; - bool _cut(SegmentContext& segContext, vector& res)const; - - - }; -} - -#endif diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp new file mode 100644 index 0000000..f8ce060 --- /dev/null +++ b/src/MPSegment.hpp @@ -0,0 +1,254 @@ +/************************************ + * file enc : ASCII + * author : wuyanyi09@gmail.com + ************************************/ +#ifndef CPPJIEBA_MPSEGMENT_H +#define CPPJIEBA_MPSEGMENT_H + +#include +#include +#include "Limonp/logger.hpp" +#include "Trie.hpp" +#include "globals.h" +#include "ISegment.hpp" +#include "SegmentBase.hpp" + +namespace CppJieba +{ + + typedef vector SegmentContext; + + class MPSegment: public SegmentBase + { + private: + Trie _trie; + + public: + MPSegment(){}; + virtual ~MPSegment(){dispose();}; + public: + bool init(const char* const filePath) + { + if(_getInitFlag()) + { + LogError("already inited before now."); + return false; + } + if(!_trie.init()) + { + LogError("_trie.init failed."); + return false; + } + LogInfo("_trie.loadDict(%s) start...", filePath); + if(!_trie.loadDict(filePath)) + { + LogError("_trie.loadDict faield."); + return false; + } + LogInfo("_trie.loadDict end."); + return _setInitFlag(true); + } + bool dispose() + { + if(!_getInitFlag()) + { + return true; + } + _trie.dispose(); + _setInitFlag(false); + return true; + } + public: + //bool cut(const string& str, vector& segWordInfos)const; + bool cut(const string& str, vector& res)const + { + return SegmentBase::cut(str, res); + } + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + { + if(!_getInitFlag()) + { + LogError("not inited."); + return false; + } + vector segWordInfos; + if(!cut(begin, end, segWordInfos)) + { + return false; + } + string tmp; + for(uint i = 0; i < segWordInfos.size(); i++) + { + if(TransCode::encode(segWordInfos[i].word, tmp)) + { + res.push_back(tmp); + } + else + { + LogError("encode failed."); + } + } + return true; + } + bool cut(const string& str, vector& segWordInfos)const + { + if(!_getInitFlag()) + { + LogError("not inited."); + return false; + } + if(str.empty()) + { + return false; + } + Unicode sentence; + + if(!TransCode::decode(str, sentence)) + { + LogError("TransCode::decode failed."); + return false; + } + return cut(sentence.begin(), sentence.end(), segWordInfos); + + } + bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& segWordInfos)const + { + if(!_getInitFlag()) + { + LogError("not inited."); + return false; + } + SegmentContext segContext; + for(Unicode::const_iterator it = begin; it != end; it++) + { + segContext.push_back(SegmentChar(*it)); + } + + //calc DAG + if(!_calcDAG(segContext)) + { + LogError("_calcDAG failed."); + return false; + } + + if(!_calcDP(segContext)) + { + LogError("_calcDP failed."); + return false; + } + + if(!_cut(segContext, segWordInfos)) + { + LogError("_cut failed."); + return false; + } + + return true; + } + //virtual bool cut(const string& str, vector& res)const; + + private: + bool _calcDAG(SegmentContext& segContext)const + { + if(segContext.empty()) + { + LogError("segContext empty."); + return false; + } + + Unicode unicode; + for(uint i = 0; i < segContext.size(); i++) + { + unicode.clear(); + for(uint j = i ; j < segContext.size(); j++) + { + unicode.push_back(segContext[j].uniCh); + } + + vector > vp; + if(_trie.find(unicode, vp)) + { + for(uint j = 0; j < vp.size(); j++) + { + uint nextp = vp[j].first + i; + segContext[i].dag[nextp] = vp[j].second; + //cout<toString()); + } + } + if(segContext[i].dag.end() == segContext[i].dag.find(i)) + { + segContext[i].dag[i] = NULL; + } + } + return true; + } + bool _calcDP(SegmentContext& segContext)const + { + if(segContext.empty()) + { + LogError("segContext empty"); + return false; + } + + for(int i = segContext.size() - 1; i >= 0; i--) + { + segContext[i].pInfo = NULL; + segContext[i].weight = MIN_DOUBLE; + for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++) + { + uint nextPos = it->first; + const TrieNodeInfo* p = it->second; + double val = 0.0; + if(nextPos + 1 < segContext.size()) + { + val += segContext[nextPos + 1].weight; + } + + if(p) + { + val += p->logFreq; + } + else + { + val += _trie.getMinLogFreq(); + } + if(val > segContext[i].weight) + { + segContext[i].pInfo = p; + segContext[i].weight = val; + } + } + } + return true; + + } + bool _cut(SegmentContext& segContext, vector& res)const + { + uint i = 0; + while(i < segContext.size()) + { + const TrieNodeInfo* p = segContext[i].pInfo; + if(p) + { + res.push_back(*p); + i += p->word.size(); + } + else//single chinese word + { + TrieNodeInfo nodeInfo; + nodeInfo.word.push_back(segContext[i].uniCh); + nodeInfo.freq = 0; + nodeInfo.logFreq = _trie.getMinLogFreq(); + res.push_back(nodeInfo); + i++; + } + } + return true; + } + + + }; +} + +#endif diff --git a/src/MixSegment.h b/src/MixSegment.h index 079db3f..e4009fb 100644 --- a/src/MixSegment.h +++ b/src/MixSegment.h @@ -1,7 +1,7 @@ #ifndef CPPJIEBA_MIXSEGMENT_H #define CPPJIEBA_MIXSEGMENT_H -#include "MPSegment.h" +#include "MPSegment.hpp" #include "HMMSegment.h" #include "Limonp/str_functs.hpp" diff --git a/src/segment.cpp b/src/segment.cpp index e4b3835..296a2c9 100644 --- a/src/segment.cpp +++ b/src/segment.cpp @@ -1,7 +1,7 @@ #include #include #include "Limonp/ArgvContext.hpp" -#include "MPSegment.h" +#include "MPSegment.hpp" #include "HMMSegment.h" #include "MixSegment.h" diff --git a/src/server.cpp b/src/server.cpp index b3c39fb..ae56f9a 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -7,7 +7,7 @@ #include "Limonp/Config.hpp" #include "Husky/Daemon.h" #include "Husky/ServerFrame.h" -#include "MPSegment.h" +#include "MPSegment.hpp" #include "HMMSegment.h" #include "MixSegment.h"