merge MixSegment.h/cpp into hpp

This commit is contained in:
wyy 2013-11-30 12:41:31 +08:00
parent 55c64e9893
commit 58e69783cc
7 changed files with 396 additions and 429 deletions

View File

@ -1,23 +1,15 @@
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
SET(LIBCPPJIEBA_SRC MixSegment.cpp)
ADD_LIBRARY(cppjieba STATIC ${LIBCPPJIEBA_SRC})
ADD_EXECUTABLE(cjsegment segment.cpp) ADD_EXECUTABLE(cjsegment segment.cpp)
ADD_EXECUTABLE(cjserver server.cpp) ADD_EXECUTABLE(cjserver server.cpp)
TARGET_LINK_LIBRARIES(cjserver husky pthread)
LINK_DIRECTORIES(Husky) LINK_DIRECTORIES(Husky)
TARGET_LINK_LIBRARIES(cjsegment cppjieba)
TARGET_LINK_LIBRARIES(cjserver cppjieba husky pthread)
SET_TARGET_PROPERTIES(cppjieba PROPERTIES VERSION 1.2 SOVERSION 1)
INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin) INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
INSTALL(TARGETS cjserver RUNTIME DESTINATION bin) INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
INSTALL(TARGETS cppjieba ARCHIVE DESTINATION lib/CppJieba) INSTALL(FILES ChineseFilter.hpp HMMSegment.hpp MPSegment.hpp structs.h Trie.hpp globals.h ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba)
INSTALL(FILES ChineseFilter.hpp HMMSegment.h MPSegment.h structs.h Trie.h globals.h ISegment.hpp MixSegment.h SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba)
ADD_SUBDIRECTORY(Husky) ADD_SUBDIRECTORY(Husky)
ADD_SUBDIRECTORY(Limonp) ADD_SUBDIRECTORY(Limonp)

View File

@ -31,315 +31,315 @@ namespace CppJieba
EmitProbMap _emitProbM; EmitProbMap _emitProbM;
EmitProbMap _emitProbS; EmitProbMap _emitProbS;
vector<EmitProbMap* > _emitProbVec; vector<EmitProbMap* > _emitProbVec;
public: public:
HMMSegment() HMMSegment()
{ {
memset(_startProb, 0, sizeof(_startProb)); memset(_startProb, 0, sizeof(_startProb));
memset(_transProb, 0, sizeof(_transProb)); memset(_transProb, 0, sizeof(_transProb));
_statMap[0] = 'B'; _statMap[0] = 'B';
_statMap[1] = 'E'; _statMap[1] = 'E';
_statMap[2] = 'M'; _statMap[2] = 'M';
_statMap[3] = 'S'; _statMap[3] = 'S';
_emitProbVec.push_back(&_emitProbB); _emitProbVec.push_back(&_emitProbB);
_emitProbVec.push_back(&_emitProbE); _emitProbVec.push_back(&_emitProbE);
_emitProbVec.push_back(&_emitProbM); _emitProbVec.push_back(&_emitProbM);
_emitProbVec.push_back(&_emitProbS); _emitProbVec.push_back(&_emitProbS);
} }
virtual ~HMMSegment() virtual ~HMMSegment()
{ {
dispose(); dispose();
} }
public: public:
bool init(const char* const modelPath) bool init(const char* const modelPath)
{ {
return _setInitFlag(_loadModel(modelPath)); return _setInitFlag(_loadModel(modelPath));
} }
bool dispose() bool dispose()
{ {
_setInitFlag(false); _setInitFlag(false);
return true; return true;
} }
public: public:
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const
{ {
if(!_getInitFlag()) if(!_getInitFlag())
{ {
LogError("not inited."); LogError("not inited.");
return false; return false;
} }
vector<uint> status; vector<uint> status;
if(!_viterbi(begin, end, status)) if(!_viterbi(begin, end, status))
{ {
LogError("_viterbi failed."); LogError("_viterbi failed.");
return false; return false;
} }
Unicode::const_iterator left = begin; Unicode::const_iterator left = begin;
Unicode::const_iterator right; Unicode::const_iterator right;
for(uint i =0; i< status.size(); i++) for(uint i =0; i< status.size(); i++)
{ {
if(status[i] % 2) //if(E == status[i] || S == status[i]) if(status[i] % 2) //if(E == status[i] || S == status[i])
{ {
right = begin + i + 1; right = begin + i + 1;
res.push_back(Unicode(left, right)); res.push_back(Unicode(left, right));
left = right; left = right;
}
}
return true;
} }
}
return true;
}
bool cut(const string& str, vector<string>& res)const bool cut(const string& str, vector<string>& res)const
{ {
return SegmentBase::cut(str, res); return SegmentBase::cut(str, res);
} }
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{ {
if(!_getInitFlag()) if(!_getInitFlag())
{ {
LogError("not inited."); LogError("not inited.");
return false; return false;
} }
if(begin == end) if(begin == end)
{ {
return false; return false;
} }
vector<Unicode> words; vector<Unicode> words;
if(!cut(begin, end, words)) if(!cut(begin, end, words))
{ {
return false; return false;
} }
string tmp; string tmp;
for(uint i = 0; i < words.size(); i++) for(uint i = 0; i < words.size(); i++)
{ {
if(TransCode::encode(words[i], tmp)) if(TransCode::encode(words[i], tmp))
{ {
res.push_back(tmp); res.push_back(tmp);
}
}
return true;
} }
}
return true;
}
//virtual bool cut(const string& str, vector<string>& res)const; //virtual bool cut(const string& str, vector<string>& res)const;
private: private:
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<uint>& status)const bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<uint>& status)const
{ {
if(begin == end) if(begin == end)
{
return false;
}
size_t Y = STATUS_SUM;
size_t X = end - begin;
size_t XYSize = X * Y;
int * path;
double * weight;
uint now, old, stat;
double tmp, endE, endS;
try
{
path = new int [XYSize];
weight = new double [XYSize];
}
catch(const std::bad_alloc&)
{
LogError("bad_alloc");
return false;
}
if(NULL == path || NULL == weight)
{
LogError("bad_alloc");
return false;
}
//start
for(uint y = 0; y < Y; y++)
{
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
path[0 + y * X] = -1;
}
//process
//for(; begin != end; begin++)
for(uint x = 1; x < X; x++)
{
for(uint y = 0; y < Y; y++)
{
now = x + y*X;
weight[now] = MIN_DOUBLE;
path[now] = E; // warning
for(uint preY = 0; preY < Y; preY++)
{ {
old = x - 1 + preY * X; return false;
tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); }
if(tmp > weight[now])
size_t Y = STATUS_SUM;
size_t X = end - begin;
size_t XYSize = X * Y;
int * path;
double * weight;
uint now, old, stat;
double tmp, endE, endS;
try
{
path = new int [XYSize];
weight = new double [XYSize];
}
catch(const std::bad_alloc&)
{
LogError("bad_alloc");
return false;
}
if(NULL == path || NULL == weight)
{
LogError("bad_alloc");
return false;
}
//start
for(uint y = 0; y < Y; y++)
{
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
path[0 + y * X] = -1;
}
//process
//for(; begin != end; begin++)
for(uint x = 1; x < X; x++)
{
for(uint y = 0; y < Y; y++)
{ {
weight[now] = tmp; now = x + y*X;
path[now] = preY; weight[now] = MIN_DOUBLE;
path[now] = E; // warning
for(uint preY = 0; preY < Y; preY++)
{
old = x - 1 + preY * X;
tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
if(tmp > weight[now])
{
weight[now] = tmp;
path[now] = preY;
}
}
} }
} }
endE = weight[X-1+E*X];
endS = weight[X-1+S*X];
stat = 0;
if(endE > endS)
{
stat = E;
}
else
{
stat = S;
}
status.assign(X, 0);
for(int x = X -1 ; x >= 0; x--)
{
status[x] = stat;
stat = path[x + stat*X];
}
delete [] path;
delete [] weight;
return true;
} }
}
endE = weight[X-1+E*X];
endS = weight[X-1+S*X];
stat = 0;
if(endE > endS)
{
stat = E;
}
else
{
stat = S;
}
status.assign(X, 0);
for(int x = X -1 ; x >= 0; x--)
{
status[x] = stat;
stat = path[x + stat*X];
}
delete [] path;
delete [] weight;
return true;
}
bool _loadModel(const char* const filePath) bool _loadModel(const char* const filePath)
{ {
LogInfo("loadModel [%s] start ...", filePath); LogInfo("loadModel [%s] start ...", filePath);
ifstream ifile(filePath); ifstream ifile(filePath);
string line; string line;
vector<string> tmp; vector<string> tmp;
vector<string> tmp2; vector<string> tmp2;
//load _startProb //load _startProb
if(!_getLine(ifile, line)) if(!_getLine(ifile, line))
{ {
return false; return false;
} }
splitStr(line, tmp, " "); splitStr(line, tmp, " ");
if(tmp.size() != STATUS_SUM) if(tmp.size() != STATUS_SUM)
{ {
LogError("start_p illegal"); LogError("start_p illegal");
return false; return false;
} }
for(uint j = 0; j< tmp.size(); j++) for(uint j = 0; j< tmp.size(); j++)
{ {
_startProb[j] = atof(tmp[j].c_str()); _startProb[j] = atof(tmp[j].c_str());
//cout<<_startProb[j]<<endl; //cout<<_startProb[j]<<endl;
} }
//load _transProb //load _transProb
for(uint i = 0; i < STATUS_SUM; i++) for(uint i = 0; i < STATUS_SUM; i++)
{ {
if(!_getLine(ifile, line)) if(!_getLine(ifile, line))
{ {
return false; return false;
}
splitStr(line, tmp, " ");
if(tmp.size() != STATUS_SUM)
{
LogError("trans_p illegal");
return false;
}
for(uint j =0; j < STATUS_SUM; j++)
{
_transProb[i][j] = atof(tmp[j].c_str());
//cout<<_transProb[i][j]<<endl;
}
}
//load _emitProbB
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB))
{
return false;
}
//load _emitProbE
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE))
{
return false;
}
//load _emitProbM
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM))
{
return false;
}
//load _emitProbS
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS))
{
return false;
}
LogInfo("loadModel [%s] end.", filePath);
return true;
} }
splitStr(line, tmp, " ");
if(tmp.size() != STATUS_SUM)
{
LogError("trans_p illegal");
return false;
}
for(uint j =0; j < STATUS_SUM; j++)
{
_transProb[i][j] = atof(tmp[j].c_str());
//cout<<_transProb[i][j]<<endl;
}
}
//load _emitProbB
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB))
{
return false;
}
//load _emitProbE
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE))
{
return false;
}
//load _emitProbM
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM))
{
return false;
}
//load _emitProbS
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS))
{
return false;
}
LogInfo("loadModel [%s] end.", filePath);
return true;
}
bool _getLine(ifstream& ifile, string& line) bool _getLine(ifstream& ifile, string& line)
{ {
while(getline(ifile, line)) while(getline(ifile, line))
{ {
trim(line); trim(line);
if(line.empty()) if(line.empty())
{ {
continue; continue;
}
if(strStartsWith(line, "#"))
{
continue;
}
return true;
}
return false;
} }
if(strStartsWith(line, "#"))
{
continue;
}
return true;
}
return false;
}
bool _loadEmitProb(const string& line, EmitProbMap& mp) bool _loadEmitProb(const string& line, EmitProbMap& mp)
{ {
if(line.empty()) if(line.empty())
{ {
return false; return false;
} }
vector<string> tmp, tmp2; vector<string> tmp, tmp2;
uint16_t unico = 0; uint16_t unico = 0;
splitStr(line, tmp, ","); splitStr(line, tmp, ",");
for(uint i = 0; i < tmp.size(); i++) for(uint i = 0; i < tmp.size(); i++)
{ {
splitStr(tmp[i], tmp2, ":"); splitStr(tmp[i], tmp2, ":");
if(2 != tmp2.size()) if(2 != tmp2.size())
{ {
LogError("_emitProb illegal."); LogError("_emitProb illegal.");
return false; return false;
}
if(!_decodeOne(tmp2[0], unico))
{
LogError("TransCode failed.");
return false;
}
mp[unico] = atof(tmp2[1].c_str());
}
return true;
} }
if(!_decodeOne(tmp2[0], unico))
{
LogError("TransCode failed.");
return false;
}
mp[unico] = atof(tmp2[1].c_str());
}
return true;
}
bool _decodeOne(const string& str, uint16_t& res) bool _decodeOne(const string& str, uint16_t& res)
{ {
Unicode ui16; Unicode ui16;
if(!TransCode::decode(str, ui16) || ui16.size() != 1) if(!TransCode::decode(str, ui16) || ui16.size() != 1)
{ {
return false; return false;
} }
res = ui16[0]; res = ui16[0];
return true; return true;
} }
double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const
{ {
EmitProbMap::const_iterator cit = ptMp->find(key); EmitProbMap::const_iterator cit = ptMp->find(key);
if(cit == ptMp->end()) if(cit == ptMp->end())
{ {
return defVal; return defVal;
} }
return cit->second; return cit->second;
} }
}; };
} }

View File

@ -1,125 +0,0 @@
#include "MixSegment.h"
namespace CppJieba
{
// Default constructor: nothing to set up here; the member segmenters are
// default-constructed and loaded later by init().
MixSegment::MixSegment() {}
// Destructor: runs dispose() so both member segmenters are released if this
// object is still marked as initialized.
MixSegment::~MixSegment()
{
    dispose();
}
// Initializes both member segmenters.
//
// mpSegDict  : path handed to _mpSeg.init().
// hmmSegDict : path handed to _hmmSeg.init().
//
// Returns false (and logs) when this object is already initialized or when
// either member fails to initialize; otherwise returns the result of
// _setInitFlag(true).
bool MixSegment::init(const char* const mpSegDict, const char* const hmmSegDict)
{
    if(_getInitFlag())
    {
        LogError("inited.");
        return false;
    }
    if(!_mpSeg.init(mpSegDict))
    {
        LogError("_mpSeg init");
        return false;
    }
    if(!_hmmSeg.init(hmmSegDict))
    {
        LogError("_hmmSeg init");
        // Roll back the half-finished initialization: the init flag stays
        // false, so dispose() would return early and never release _mpSeg,
        // and a later init() retry would hit an already-initialized _mpSeg.
        _mpSeg.dispose();
        return false;
    }
    return _setInitFlag(true);
}
// Releases both member segmenters and clears the init flag.
// Does nothing when the object is not initialized. Always returns true.
bool MixSegment::dispose()
{
    if(_getInitFlag())
    {
        _mpSeg.dispose();
        _hmmSeg.dispose();
        _setInitFlag(false);
    }
    return true;
}
// String overload: forwards unchanged to SegmentBase::cut(str, res) and
// returns its result.
// NOTE(review): SegmentBase::cut presumably decodes `str` and dispatches to
// the iterator overload below - confirm in SegmentBase.hpp.
bool MixSegment::cut(const string& str, vector<string>& res)const
{
    const bool ok = SegmentBase::cut(str, res);
    return ok;
}
bool MixSegment::cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
if(!_getInitFlag())
{
LogError("not inited.");
return false;
}
if(begin == end)
{
return false;
}
vector<TrieNodeInfo> infos;
if(!_mpSeg.cut(begin, end, infos))
{
LogError("mpSeg cutDAG failed.");
return false;
}
Unicode unico;
vector<Unicode> hmmRes;
string tmp;
for(uint i= 0; i < infos.size(); i++)
{
TransCode::encode(infos[i].word,tmp);
if(1 == infos[i].word.size())
{
unico.push_back(infos[i].word[0]);
}
else
{
if(!unico.empty())
{
hmmRes.clear();
if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes))
{
LogError("_hmmSeg cut failed.");
return false;
}
for(uint j = 0; j < hmmRes.size(); j++)
{
TransCode::encode(hmmRes[j], tmp);
res.push_back(tmp);
}
}
unico.clear();
TransCode::encode(infos[i].word, tmp);
res.push_back(tmp);
}
}
if(!unico.empty())
{
hmmRes.clear();
if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes))
{
LogError("_hmmSeg cut failed.");
return false;
}
for(uint j = 0; j < hmmRes.size(); j++)
{
TransCode::encode(hmmRes[j], tmp);
res.push_back(tmp);
}
}
return true;
}
}
#ifdef MIXSEGMENT_UT
using namespace CppJieba;
// Placeholder entry point, compiled only when MIXSEGMENT_UT is defined.
// It runs no checks and simply reports success.
int main() { return 0; }
#endif

View File

@ -1,28 +0,0 @@
#ifndef CPPJIEBA_MIXSEGMENT_H
#define CPPJIEBA_MIXSEGMENT_H
#include "MPSegment.hpp"
#include "HMMSegment.hpp"
#include "Limonp/str_functs.hpp"
namespace CppJieba
{
/**
 * Segmenter built from two member segmenters, an MPSegment and an
 * HMMSegment. This header only declares the interface; the definitions live
 * in MixSegment.cpp.
 */
class MixSegment: public SegmentBase
{
    public:
        MixSegment();
        /** Calls dispose(). */
        virtual ~MixSegment();

        /**
         * Initializes both member segmenters from the given paths.
         * Returns false when already initialized or when either member
         * fails to initialize.
         */
        bool init(const char* const _mpSegDict, const char* const _hmmSegDict);
        /** Disposes both members when initialized; always returns true. */
        bool dispose();

        /** String overload; forwards to SegmentBase::cut(str, res). */
        bool cut(const string& str, vector<string>& res)const;
        /**
         * Cuts [begin, end) and appends the encoded words to res.
         * Returns false when not initialized, when the range is empty, or
         * when a member segmenter fails.
         */
        bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const;

    private:
        MPSegment _mpSeg;
        HMMSegment _hmmSeg;
};
}
#endif

128
src/MixSegment.hpp Normal file
View File

@ -0,0 +1,128 @@
#ifndef CPPJIEBA_MIXSEGMENT_H
#define CPPJIEBA_MIXSEGMENT_H
#include "MPSegment.hpp"
#include "HMMSegment.hpp"
#include "Limonp/str_functs.hpp"
namespace CppJieba
{
/**
 * Header-only segmenter built from two member segmenters.
 *
 * The input is first cut by the MPSegment member. Words of more than one
 * character are emitted as they are; each run of consecutive
 * single-character words is glued together and re-cut by the HMMSegment
 * member before being emitted.
 */
class MixSegment: public SegmentBase
{
    private:
        MPSegment _mpSeg;
        HMMSegment _hmmSeg;
    public:
        MixSegment()
        {
        }
        /** Calls dispose(). */
        virtual ~MixSegment()
        {
            dispose();
        }
    public:
        /**
         * Initializes both member segmenters.
         *
         * @param mpSegDict  path handed to _mpSeg.init().
         * @param hmmSegDict path handed to _hmmSeg.init().
         * @return false (after logging) when this object is already
         *         initialized or when either member fails to initialize;
         *         otherwise the result of _setInitFlag(true).
         */
        bool init(const char* const mpSegDict, const char* const hmmSegDict)
        {
            if(_getInitFlag())
            {
                LogError("inited.");
                return false;
            }
            if(!_mpSeg.init(mpSegDict))
            {
                LogError("_mpSeg init");
                return false;
            }
            if(!_hmmSeg.init(hmmSegDict))
            {
                LogError("_hmmSeg init");
                // Roll back the half-finished initialization: the init flag
                // stays false, so dispose() would return early and never
                // release _mpSeg, and a later init() retry would hit an
                // already-initialized _mpSeg.
                _mpSeg.dispose();
                return false;
            }
            return _setInitFlag(true);
        }
        /**
         * Disposes both member segmenters and clears the init flag.
         * Does nothing when not initialized. Always returns true.
         */
        bool dispose()
        {
            if(!_getInitFlag())
            {
                return true;
            }
            _mpSeg.dispose();
            _hmmSeg.dispose();
            _setInitFlag(false);
            return true;
        }
    public:
        /**
         * String overload; forwards unchanged to SegmentBase::cut(str, res).
         * NOTE(review): SegmentBase::cut presumably decodes `str` and
         * dispatches to the iterator overload below - confirm in
         * SegmentBase.hpp.
         */
        bool cut(const string& str, vector<string>& res)const
        {
            return SegmentBase::cut(str, res);
        }
        /**
         * Segments [begin, end) and appends the encoded words to `res`.
         *
         * @return false when the object is not initialized, when the range
         *         is empty, or when either member segmenter fails. On
         *         failure `res` may already hold the words emitted before
         *         the failing step.
         */
        bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
        {
            if(!_getInitFlag())
            {
                LogError("not inited.");
                return false;
            }
            if(begin == end)
            {
                return false;
            }
            vector<TrieNodeInfo> infos;
            if(!_mpSeg.cut(begin, end, infos))
            {
                LogError("mpSeg cutDAG failed.");
                return false;
            }
            Unicode pending;   // current run of single-character words
            string encoded;
            for(size_t i = 0; i < infos.size(); i++)
            {
                if(1 == infos[i].word.size())
                {
                    pending.push_back(infos[i].word[0]);
                    continue;
                }
                // A multi-character word ends the current run: flush it
                // first so the output keeps the input order.
                if(!_cutPendingByHmm(pending, res))
                {
                    return false;
                }
                if(TransCode::encode(infos[i].word, encoded))
                {
                    res.push_back(encoded);
                }
                else
                {
                    LogError("encode failed.");
                }
            }
            // Flush a run that reaches the end of the input.
            return _cutPendingByHmm(pending, res);
        }
    private:
        /**
         * Re-segments the buffered run of single-character words with
         * _hmmSeg, appends every successfully encoded piece to `res`, then
         * empties `pending`. An empty `pending` is a no-op returning true.
         *
         * @return false (leaving `pending` untouched) when the HMM cut fails.
         */
        bool _cutPendingByHmm(Unicode& pending, vector<string>& res)const
        {
            if(pending.empty())
            {
                return true;
            }
            vector<Unicode> pieces;
            if(!_hmmSeg.cut(pending.begin(), pending.end(), pieces))
            {
                LogError("_hmmSeg cut failed.");
                return false;
            }
            string encoded;
            for(size_t j = 0; j < pieces.size(); j++)
            {
                // Only emit words that encode successfully, as
                // HMMSegment::cut does.
                if(TransCode::encode(pieces[j], encoded))
                {
                    res.push_back(encoded);
                }
            }
            pending.clear();
            return true;
        }
};
}
#endif

View File

@ -3,7 +3,7 @@
#include "Limonp/ArgvContext.hpp" #include "Limonp/ArgvContext.hpp"
#include "MPSegment.hpp" #include "MPSegment.hpp"
#include "HMMSegment.hpp" #include "HMMSegment.hpp"
#include "MixSegment.h" #include "MixSegment.hpp"
using namespace CppJieba; using namespace CppJieba;

View File

@ -9,7 +9,7 @@
#include "Husky/ServerFrame.h" #include "Husky/ServerFrame.h"
#include "MPSegment.hpp" #include "MPSegment.hpp"
#include "HMMSegment.hpp" #include "HMMSegment.hpp"
#include "MixSegment.h" #include "MixSegment.hpp"
using namespace Husky; using namespace Husky;
using namespace CppJieba; using namespace CppJieba;