From 58e69783cce9fcbc06078a9f1ecb265aa8d3c5b9 Mon Sep 17 00:00:00 2001 From: wyy Date: Sat, 30 Nov 2013 12:41:31 +0800 Subject: [PATCH] merge MixSegment.h/cpp into hpp --- src/CMakeLists.txt | 12 +- src/HMMSegment.hpp | 528 ++++++++++++++++++++++----------------------- src/MixSegment.cpp | 125 ----------- src/MixSegment.h | 28 --- src/MixSegment.hpp | 128 +++++++++++ src/segment.cpp | 2 +- src/server.cpp | 2 +- 7 files changed, 396 insertions(+), 429 deletions(-) delete mode 100644 src/MixSegment.cpp delete mode 100644 src/MixSegment.h create mode 100644 src/MixSegment.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7d7a317..9ee5277 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,23 +1,15 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) -SET(LIBCPPJIEBA_SRC MixSegment.cpp) -ADD_LIBRARY(cppjieba STATIC ${LIBCPPJIEBA_SRC}) ADD_EXECUTABLE(cjsegment segment.cpp) ADD_EXECUTABLE(cjserver server.cpp) +TARGET_LINK_LIBRARIES(cjserver husky pthread) LINK_DIRECTORIES(Husky) -TARGET_LINK_LIBRARIES(cjsegment cppjieba) -TARGET_LINK_LIBRARIES(cjserver cppjieba husky pthread) - -SET_TARGET_PROPERTIES(cppjieba PROPERTIES VERSION 1.2 SOVERSION 1) - INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin) INSTALL(TARGETS cjserver RUNTIME DESTINATION bin) -INSTALL(TARGETS cppjieba ARCHIVE DESTINATION lib/CppJieba) -INSTALL(FILES ChineseFilter.hpp HMMSegment.h MPSegment.h structs.h Trie.h globals.h ISegment.hpp MixSegment.h SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba) - +INSTALL(FILES ChineseFilter.hpp HMMSegment.hpp MPSegment.hpp structs.h Trie.hpp globals.h ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba) ADD_SUBDIRECTORY(Husky) ADD_SUBDIRECTORY(Limonp) diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index 39696b6..417ba91 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -31,315 +31,315 @@ namespace CppJieba EmitProbMap _emitProbM; EmitProbMap _emitProbS; vector _emitProbVec; - + public: HMMSegment() { - memset(_startProb, 0, sizeof(_startProb)); - memset(_transProb, 0, sizeof(_transProb)); - _statMap[0] = 'B'; - _statMap[1] = 'E'; - _statMap[2] = 'M'; - _statMap[3] = 'S'; - _emitProbVec.push_back(&_emitProbB); - _emitProbVec.push_back(&_emitProbE); - _emitProbVec.push_back(&_emitProbM); - _emitProbVec.push_back(&_emitProbS); - } + memset(_startProb, 0, sizeof(_startProb)); + memset(_transProb, 0, sizeof(_transProb)); + _statMap[0] = 'B'; + _statMap[1] = 'E'; + _statMap[2] = 'M'; + _statMap[3] = 'S'; + _emitProbVec.push_back(&_emitProbB); + _emitProbVec.push_back(&_emitProbE); + _emitProbVec.push_back(&_emitProbM); + _emitProbVec.push_back(&_emitProbS); + } virtual ~HMMSegment() { - dispose(); - } + dispose(); + } public: bool init(const char* const modelPath) { - return _setInitFlag(_loadModel(modelPath)); - } + return _setInitFlag(_loadModel(modelPath)); + } bool dispose() { - _setInitFlag(false); - return true; - } + _setInitFlag(false); + return true; + } public: bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } - vector status; - if(!_viterbi(begin, end, status)) - { - LogError("_viterbi failed."); - return false; - } + if(!_getInitFlag()) + { + LogError("not inited."); + return false; + } + vector status; + if(!_viterbi(begin, end, status)) + { + LogError("_viterbi failed."); + return false; + } - Unicode::const_iterator left = begin; - Unicode::const_iterator right; - for(uint i =0; i< status.size(); i++) - { - if(status[i] % 2) //if(E == status[i] || S == status[i]) - { - right = begin + i + 1; - res.push_back(Unicode(left, right)); - left = right; + Unicode::const_iterator left = begin; + Unicode::const_iterator right; + for(uint i =0; i< status.size(); i++) + { + if(status[i] % 2) //if(E == status[i] || S == status[i]) + { + right = begin + i + 1; + res.push_back(Unicode(left, right)); + left = right; + } + } + return true; } - } - return true; - } bool cut(const string& str, vector& res)const { - return SegmentBase::cut(str, res); - } + return SegmentBase::cut(str, res); + } bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } - if(begin == end) - { - return false; - } - vector words; - if(!cut(begin, end, words)) - { - return false; - } - string tmp; - for(uint i = 0; i < words.size(); i++) - { - if(TransCode::encode(words[i], tmp)) - { - res.push_back(tmp); + if(!_getInitFlag()) + { + LogError("not inited."); + return false; + } + if(begin == end) + { + return false; + } + vector words; + if(!cut(begin, end, words)) + { + return false; + } + string tmp; + for(uint i = 0; i < words.size(); i++) + { + if(TransCode::encode(words[i], tmp)) + { + res.push_back(tmp); + } + } + return true; } - } - return true; - } //virtual bool cut(const string& str, vector& res)const; private: bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector& status)const { - if(begin == end) - { - return false; - } - - size_t Y = STATUS_SUM; - size_t X = end - begin; - size_t XYSize = X * Y; - int * path; - double * weight; - uint now, old, stat; - double tmp, endE, endS; - - try - { - path = new int [XYSize]; - weight = new double [XYSize]; - } - catch(const std::bad_alloc&) - { - LogError("bad_alloc"); - return false; - } - if(NULL == path || NULL == weight) - { - LogError("bad_alloc"); - return false; - } - - //start - for(uint y = 0; y < Y; y++) - { - weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); - path[0 + y * X] = -1; - } - //process - //for(; begin != end; begin++) - for(uint x = 1; x < X; x++) - { - for(uint y = 0; y < Y; y++) - { - now = x + y*X; - weight[now] = MIN_DOUBLE; - path[now] = E; // warning - for(uint preY = 0; preY < Y; preY++) + if(begin == end) { - old = x - 1 + preY * X; - tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); - if(tmp > weight[now]) + return false; + } + + size_t Y = STATUS_SUM; + size_t X = end - begin; + size_t XYSize = X * Y; + int * path; + double * weight; + uint now, old, stat; + double tmp, endE, endS; + + try + { + path = new int [XYSize]; + weight = new double [XYSize]; + } + catch(const std::bad_alloc&) + { + LogError("bad_alloc"); + return false; + } + if(NULL == path || NULL == weight) + { + LogError("bad_alloc"); + return false; + } + + //start + for(uint y = 0; y < Y; y++) + { + weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); + path[0 + y * X] = -1; + } + //process + //for(; begin != end; begin++) + for(uint x = 1; x < X; x++) + { + for(uint y = 0; y < Y; y++) { - weight[now] = tmp; - path[now] = preY; + now = x + y*X; + weight[now] = MIN_DOUBLE; + path[now] = E; // warning + for(uint preY = 0; preY < Y; preY++) + { + old = x - 1 + preY * X; + tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); + if(tmp > weight[now]) + { + weight[now] = tmp; + path[now] = preY; + } + } } } + + endE = weight[X-1+E*X]; + endS = weight[X-1+S*X]; + stat = 0; + if(endE > endS) + { + stat = E; + } + else + { + stat = S; + } + + status.assign(X, 0); + for(int x = X -1 ; x >= 0; x--) + { + status[x] = stat; + stat = path[x + stat*X]; + } + + delete [] path; + delete [] weight; + return true; } - } - - endE = weight[X-1+E*X]; - endS = weight[X-1+S*X]; - stat = 0; - if(endE > endS) - { - stat = E; - } - else - { - stat = S; - } - - status.assign(X, 0); - for(int x = X -1 ; x >= 0; x--) - { - status[x] = stat; - stat = path[x + stat*X]; - } - - delete [] path; - delete [] weight; - return true; - } bool _loadModel(const char* const filePath) { - LogInfo("loadModel [%s] start ...", filePath); - ifstream ifile(filePath); - string line; - vector tmp; - vector tmp2; - //load _startProb - if(!_getLine(ifile, line)) - { - return false; - } - splitStr(line, tmp, " "); - if(tmp.size() != STATUS_SUM) - { - LogError("start_p illegal"); - return false; - } - for(uint j = 0; j< tmp.size(); j++) - { - _startProb[j] = atof(tmp[j].c_str()); - //cout<<_startProb[j]< tmp; + vector tmp2; + //load _startProb + if(!_getLine(ifile, line)) + { + return false; + } + splitStr(line, tmp, " "); + if(tmp.size() != STATUS_SUM) + { + LogError("start_p illegal"); + return false; + } + for(uint j = 0; j< tmp.size(); j++) + { + _startProb[j] = atof(tmp[j].c_str()); + //cout<<_startProb[j]< tmp, tmp2; - uint16_t unico = 0; - splitStr(line, tmp, ","); - for(uint i = 0; i < tmp.size(); i++) - { - splitStr(tmp[i], tmp2, ":"); - if(2 != tmp2.size()) - { - LogError("_emitProb illegal."); - return false; + if(line.empty()) + { + return false; + } + vector tmp, tmp2; + uint16_t unico = 0; + splitStr(line, tmp, ","); + for(uint i = 0; i < tmp.size(); i++) + { + splitStr(tmp[i], tmp2, ":"); + if(2 != tmp2.size()) + { + LogError("_emitProb illegal."); + return false; + } + if(!_decodeOne(tmp2[0], unico)) + { + LogError("TransCode failed."); + return false; + } + mp[unico] = atof(tmp2[1].c_str()); + } + return true; } - if(!_decodeOne(tmp2[0], unico)) - { - LogError("TransCode failed."); - return false; - } - mp[unico] = atof(tmp2[1].c_str()); - } - return true; - } bool _decodeOne(const string& str, uint16_t& res) { - Unicode ui16; - if(!TransCode::decode(str, ui16) || ui16.size() != 1) - { - return false; - } - res = ui16[0]; - return true; - } + Unicode ui16; + if(!TransCode::decode(str, ui16) || ui16.size() != 1) + { + return false; + } + res = ui16[0]; + return true; + } double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const { - EmitProbMap::const_iterator cit = ptMp->find(key); - if(cit == ptMp->end()) - { - return defVal; - } - return cit->second; - - } + EmitProbMap::const_iterator cit = ptMp->find(key); + if(cit == ptMp->end()) + { + return defVal; + } + return cit->second; + + } + - }; } diff --git a/src/MixSegment.cpp b/src/MixSegment.cpp deleted file mode 100644 index 5653388..0000000 --- a/src/MixSegment.cpp +++ /dev/null @@ -1,125 +0,0 @@ -#include "MixSegment.h" - -namespace CppJieba -{ - MixSegment::MixSegment() - { - } - - MixSegment::~MixSegment() - { - dispose(); - } - - bool MixSegment::init(const char* const mpSegDict, const char* const hmmSegDict) - { - if(_getInitFlag()) - { - LogError("inited."); - return false; - } - if(!_mpSeg.init(mpSegDict)) - { - LogError("_mpSeg init"); - return false; - } - if(!_hmmSeg.init(hmmSegDict)) - { - LogError("_hmmSeg init"); - return false; - } - return _setInitFlag(true); - } - - bool MixSegment::dispose() - { - if(!_getInitFlag()) - { - return true; - } - _mpSeg.dispose(); - _hmmSeg.dispose(); - _setInitFlag(false); - return true; - } - bool MixSegment::cut(const string& str, vector& res)const - { - return SegmentBase::cut(str, res); - } - - bool MixSegment::cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const - { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } - if(begin == end) - { - return false; - } - vector infos; - if(!_mpSeg.cut(begin, end, infos)) - { - LogError("mpSeg cutDAG failed."); - return false; - } - Unicode unico; - vector hmmRes; - string tmp; - for(uint i= 0; i < infos.size(); i++) - { - TransCode::encode(infos[i].word,tmp); - if(1 == infos[i].word.size()) - { - unico.push_back(infos[i].word[0]); - } - else - { - if(!unico.empty()) - { - hmmRes.clear(); - if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) - { - LogError("_hmmSeg cut failed."); - return false; - } - for(uint j = 0; j < hmmRes.size(); j++) - { - TransCode::encode(hmmRes[j], tmp); - res.push_back(tmp); - } - } - unico.clear(); - TransCode::encode(infos[i].word, tmp); - res.push_back(tmp); - } - } - if(!unico.empty()) - { - hmmRes.clear(); - if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) - { - LogError("_hmmSeg cut failed."); - return false; - } - for(uint j = 0; j < hmmRes.size(); j++) - { - TransCode::encode(hmmRes[j], tmp); - res.push_back(tmp); - } - } - - return true; - } -} - -#ifdef MIXSEGMENT_UT -using namespace CppJieba; - -int main() -{ - return 0; -} - -#endif diff --git a/src/MixSegment.h b/src/MixSegment.h deleted file mode 100644 index 9df0701..0000000 --- a/src/MixSegment.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef CPPJIEBA_MIXSEGMENT_H -#define CPPJIEBA_MIXSEGMENT_H - -#include "MPSegment.hpp" -#include "HMMSegment.hpp" -#include "Limonp/str_functs.hpp" - -namespace CppJieba -{ - class MixSegment: public SegmentBase - { - private: - MPSegment _mpSeg; - HMMSegment _hmmSeg; - public: - MixSegment(); - virtual ~MixSegment(); - public: - bool init(const char* const _mpSegDict, const char* const _hmmSegDict); - bool dispose(); - public: - //virtual bool cut(const string& str, vector& res) const; - bool cut(const string& str, vector& res)const; - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const; - }; -} - -#endif diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp new file mode 100644 index 0000000..896d7b1 --- /dev/null +++ b/src/MixSegment.hpp @@ -0,0 +1,128 @@ +#ifndef CPPJIEBA_MIXSEGMENT_H +#define CPPJIEBA_MIXSEGMENT_H + +#include "MPSegment.hpp" +#include "HMMSegment.hpp" +#include "Limonp/str_functs.hpp" + +namespace CppJieba +{ + class MixSegment: public SegmentBase + { + private: + MPSegment _mpSeg; + HMMSegment _hmmSeg; + public: + MixSegment() + { + } + virtual ~MixSegment() + { + dispose(); + } + public: + bool init(const char* const mpSegDict, const char* const hmmSegDict) + { + if(_getInitFlag()) + { + LogError("inited."); + return false; + } + if(!_mpSeg.init(mpSegDict)) + { + LogError("_mpSeg init"); + return false; + } + if(!_hmmSeg.init(hmmSegDict)) + { + LogError("_hmmSeg init"); + return false; + } + return _setInitFlag(true); + } + bool dispose() + { + if(!_getInitFlag()) + { + return true; + } + _mpSeg.dispose(); + _hmmSeg.dispose(); + _setInitFlag(false); + return true; + } + public: + //virtual bool cut(const string& str, vector& res) const; + bool cut(const string& str, vector& res)const + { + return SegmentBase::cut(str, res); + } + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + { + if(!_getInitFlag()) + { + LogError("not inited."); + return false; + } + if(begin == end) + { + return false; + } + vector infos; + if(!_mpSeg.cut(begin, end, infos)) + { + LogError("mpSeg cutDAG failed."); + return false; + } + Unicode unico; + vector hmmRes; + string tmp; + for(uint i= 0; i < infos.size(); i++) + { + TransCode::encode(infos[i].word,tmp); + if(1 == infos[i].word.size()) + { + unico.push_back(infos[i].word[0]); + } + else + { + if(!unico.empty()) + { + hmmRes.clear(); + if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) + { + LogError("_hmmSeg cut failed."); + return false; + } + for(uint j = 0; j < hmmRes.size(); j++) + { + TransCode::encode(hmmRes[j], tmp); + res.push_back(tmp); + } + } + unico.clear(); + TransCode::encode(infos[i].word, tmp); + res.push_back(tmp); + } + } + if(!unico.empty()) + { + hmmRes.clear(); + if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) + { + LogError("_hmmSeg cut failed."); + return false; + } + for(uint j = 0; j < hmmRes.size(); j++) + { + TransCode::encode(hmmRes[j], tmp); + res.push_back(tmp); + } + } + + return true; + } + }; +} + +#endif diff --git a/src/segment.cpp b/src/segment.cpp index 3e027a6..ca49d37 100644 --- a/src/segment.cpp +++ b/src/segment.cpp @@ -3,7 +3,7 @@ #include "Limonp/ArgvContext.hpp" #include "MPSegment.hpp" #include "HMMSegment.hpp" -#include "MixSegment.h" +#include "MixSegment.hpp" using namespace CppJieba; diff --git a/src/server.cpp b/src/server.cpp index ef8f2b9..2545248 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -9,7 +9,7 @@ #include "Husky/ServerFrame.h" #include "MPSegment.hpp" #include "HMMSegment.hpp" -#include "MixSegment.h" +#include "MixSegment.hpp" using namespace Husky; using namespace CppJieba;