diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1196cef..e102af9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,23 +1,15 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) -SET(LIBCPPJIEBA_SRC HMMSegment.cpp MixSegment.cpp MPSegment.cpp Trie.cpp) -ADD_LIBRARY(cppjieba STATIC ${LIBCPPJIEBA_SRC}) ADD_EXECUTABLE(cjsegment segment.cpp) ADD_EXECUTABLE(cjserver server.cpp) +TARGET_LINK_LIBRARIES(cjserver husky pthread) LINK_DIRECTORIES(Husky) -TARGET_LINK_LIBRARIES(cjsegment cppjieba) -TARGET_LINK_LIBRARIES(cjserver cppjieba husky pthread) - -SET_TARGET_PROPERTIES(cppjieba PROPERTIES VERSION 1.2 SOVERSION 1) - INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin) INSTALL(TARGETS cjserver RUNTIME DESTINATION bin) -INSTALL(TARGETS cppjieba ARCHIVE DESTINATION lib/CppJieba) -INSTALL(FILES ChineseFilter.hpp HMMSegment.h MPSegment.h structs.h Trie.h globals.h ISegment.hpp MixSegment.h SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba) - +INSTALL(FILES ChineseFilter.hpp HMMSegment.hpp MPSegment.hpp Trie.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba) ADD_SUBDIRECTORY(Husky) ADD_SUBDIRECTORY(Limonp) diff --git a/src/ChineseFilter.hpp b/src/ChineseFilter.hpp index 6016b09..adca186 100644 --- a/src/ChineseFilter.hpp +++ b/src/ChineseFilter.hpp @@ -1,14 +1,14 @@ #ifndef CPPJIEBA_CHINESEFILTER_H #define CPPJIEBA_CHINESEFILTER_H -#include "globals.h" #include "TransCode.hpp" namespace CppJieba { + enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1, OTHERS = 2}; + typedef Unicode::const_iterator UniConIter; class ChineseFilter; - class ChFilterIterator { public: diff --git a/src/HMMSegment.cpp b/src/HMMSegment.cpp deleted file mode 100644 index 37f3f99..0000000 --- a/src/HMMSegment.cpp +++ /dev/null @@ -1,341 +0,0 @@ -#include "HMMSegment.h" - -namespace CppJieba -{ - HMMSegment::HMMSegment() - { - memset(_startProb, 0, sizeof(_startProb)); - memset(_transProb, 0, sizeof(_transProb)); - _statMap[0] = 'B'; - _statMap[1] = 'E'; - _statMap[2] = 'M'; - _statMap[3] = 'S'; - _emitProbVec.push_back(&_emitProbB); - _emitProbVec.push_back(&_emitProbE); - _emitProbVec.push_back(&_emitProbM); - _emitProbVec.push_back(&_emitProbS); - } - - HMMSegment::~HMMSegment() - { - dispose(); - } - - bool HMMSegment::init(const char* const modelPath) - { - return _setInitFlag(_loadModel(modelPath)); - } - - bool HMMSegment::dispose() - { - _setInitFlag(false); - return true; - } - - bool HMMSegment::_loadModel(const char* const filePath) - { - LogInfo("loadModel [%s] start ...", filePath); - ifstream ifile(filePath); - string line; - vector tmp; - vector tmp2; - //load _startProb - if(!_getLine(ifile, line)) - { - return false; - } - splitStr(line, tmp, " "); - if(tmp.size() != STATUS_SUM) - { - LogError("start_p illegal"); - return false; - } - for(uint j = 0; j< tmp.size(); j++) - { - _startProb[j] = atof(tmp[j].c_str()); - //cout<<_startProb[j]<& res)const - { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } - vector status; - if(!_viterbi(begin, end, status)) - { - LogError("_viterbi failed."); - return false; - } - - Unicode::const_iterator left = begin; - Unicode::const_iterator right; - for(uint i =0; i< status.size(); i++) - { - if(status[i] % 2) //if(E == status[i] || S == status[i]) - { - right = begin + i + 1; - res.push_back(Unicode(left, right)); - left = right; - } - } - return true; - } - - bool HMMSegment::cut(const string& str, vector& res)const - { - return SegmentBase::cut(str, res); - } - - bool HMMSegment::cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const - { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } - if(begin == end) - { - return false; - } - vector words; - if(!cut(begin, end, words)) - { - return false; - } - string tmp; - for(uint i = 0; i < words.size(); i++) - { - if(TransCode::encode(words[i], tmp)) - { - res.push_back(tmp); - } - } - return true; - } - - bool HMMSegment::_viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector& status)const - { - if(begin == end) - { - return false; - } - - size_t Y = STATUS_SUM; - size_t X = end - begin; - size_t XYSize = X * Y; - int * path; - double * weight; - uint now, old, stat; - double tmp, endE, endS; - - try - { - path = new int [XYSize]; - weight = new double [XYSize]; - } - catch(const std::bad_alloc&) - { - LogError("bad_alloc"); - return false; - } - if(NULL == path || NULL == weight) - { - LogError("bad_alloc"); - return false; - } - - //start - for(uint y = 0; y < Y; y++) - { - weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); - path[0 + y * X] = -1; - } - //process - //for(; begin != end; begin++) - for(uint x = 1; x < X; x++) - { - for(uint y = 0; y < Y; y++) - { - now = x + y*X; - weight[now] = MIN_DOUBLE; - path[now] = E; // warning - for(uint preY = 0; preY < Y; preY++) - { - old = x - 1 + preY * X; - tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); - if(tmp > weight[now]) - { - weight[now] = tmp; - path[now] = preY; - } - } - } - } - - endE = weight[X-1+E*X]; - endS = weight[X-1+S*X]; - stat = 0; - if(endE > endS) - { - stat = E; - } - else - { - stat = S; - } - - status.assign(X, 0); - for(int x = X -1 ; x >= 0; x--) - { - status[x] = stat; - stat = path[x + stat*X]; - } - - delete [] path; - delete [] weight; - return true; - } - - bool HMMSegment::_getLine(ifstream& ifile, string& line) - { - while(getline(ifile, line)) - { - trim(line); - if(line.empty()) - { - continue; - } - if(strStartsWith(line, "#")) - { - continue; - } - return true; - } - return false; - } - - bool HMMSegment::_loadEmitProb(const string& line, EmitProbMap& mp) - { - if(line.empty()) - { - return false; - } - vector tmp, tmp2; - uint16_t unico = 0; - splitStr(line, tmp, ","); - for(uint i = 0; i < tmp.size(); i++) - { - splitStr(tmp[i], tmp2, ":"); - if(2 != tmp2.size()) - { - LogError("_emitProb illegal."); - return false; - } - if(!_decodeOne(tmp2[0], unico)) - { - LogError("TransCode failed."); - return false; - } - mp[unico] = atof(tmp2[1].c_str()); - } - return true; - } - - bool HMMSegment::_decodeOne(const string& str, uint16_t& res) - { - Unicode ui16; - if(!TransCode::decode(str, ui16) || ui16.size() != 1) - { - return false; - } - res = ui16[0]; - return true; - } - - double HMMSegment::_getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const - { - EmitProbMap::const_iterator cit = ptMp->find(key); - if(cit == ptMp->end()) - { - return defVal; - } - return cit->second; - - } -} - - -#ifdef HMMSEGMENT_UT -using namespace CppJieba; - - -size_t add(size_t a, size_t b) -{ - return a*b; -} -int main() -{ - TransCode::setUtf8Enc(); - HMMSegment hmm; - hmm.loadModel("../dicts/hmm_model.utf8"); - vector res; - hmm.cut("小明硕士毕业于北邮网络研究院。。.", res); - cout< -#include -#include -#include "Limonp/str_functs.hpp" -#include "Limonp/logger.hpp" -#include "globals.h" -#include "TransCode.hpp" -#include "ISegment.hpp" -#include "SegmentBase.hpp" - -namespace CppJieba -{ - using namespace Limonp; - class HMMSegment: public SegmentBase - { - public: - /* - * STATUS: - * 0:B, 1:E, 2:M, 3:S - * */ - enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; - private: - char _statMap[STATUS_SUM]; - double _startProb[STATUS_SUM]; - double _transProb[STATUS_SUM][STATUS_SUM]; - EmitProbMap _emitProbB; - EmitProbMap _emitProbE; - EmitProbMap _emitProbM; - EmitProbMap _emitProbS; - vector _emitProbVec; - - public: - HMMSegment(); - virtual ~HMMSegment(); - public: - bool init(const char* const modelPath); - bool dispose(); - public: - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const ; - bool cut(const string& str, vector& res)const; - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const; - //virtual bool cut(const string& str, vector& res)const; - - private: - bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector& status)const; - bool _loadModel(const char* const filePath); - bool _getLine(ifstream& ifile, string& line); - bool _loadEmitProb(const string& line, EmitProbMap& mp); - bool _decodeOne(const string& str, uint16_t& res); - double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const ; - - - }; -} - -#endif diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp new file mode 100644 index 0000000..f3f36c2 --- /dev/null +++ b/src/HMMSegment.hpp @@ -0,0 +1,346 @@ +#ifndef CPPJIBEA_HMMSEGMENT_H +#define CPPJIBEA_HMMSEGMENT_H + +#include +#include +#include +#include "Limonp/str_functs.hpp" +#include "Limonp/logger.hpp" +#include "TransCode.hpp" +#include "ISegment.hpp" +#include "SegmentBase.hpp" + +namespace CppJieba +{ + using namespace Limonp; + typedef unordered_map EmitProbMap; + class HMMSegment: public SegmentBase + { + public: + /* + * STATUS: + * 0:B, 1:E, 2:M, 3:S + * */ + enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; + private: + char _statMap[STATUS_SUM]; + double _startProb[STATUS_SUM]; + double _transProb[STATUS_SUM][STATUS_SUM]; + EmitProbMap _emitProbB; + EmitProbMap _emitProbE; + EmitProbMap _emitProbM; + EmitProbMap _emitProbS; + vector _emitProbVec; + + public: + HMMSegment() + { + memset(_startProb, 0, sizeof(_startProb)); + memset(_transProb, 0, sizeof(_transProb)); + _statMap[0] = 'B'; + _statMap[1] = 'E'; + _statMap[2] = 'M'; + _statMap[3] = 'S'; + _emitProbVec.push_back(&_emitProbB); + _emitProbVec.push_back(&_emitProbE); + _emitProbVec.push_back(&_emitProbM); + _emitProbVec.push_back(&_emitProbS); + } + virtual ~HMMSegment() + { + dispose(); + } + public: + bool init(const char* const modelPath) + { + return _setInitFlag(_loadModel(modelPath)); + } + bool dispose() + { + _setInitFlag(false); + return true; + } + public: + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + { + if(!_getInitFlag()) + { + LogError("not inited."); + return false; + } + vector status; + if(!_viterbi(begin, end, status)) + { + LogError("_viterbi failed."); + return false; + } + + Unicode::const_iterator left = begin; + Unicode::const_iterator right; + for(uint i =0; i< status.size(); i++) + { + if(status[i] % 2) //if(E == status[i] || S == status[i]) + { + right = begin + i + 1; + res.push_back(Unicode(left, right)); + left = right; + } + } + return true; + } + bool cut(const string& str, vector& res)const + { + return SegmentBase::cut(str, res); + } + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + { + if(!_getInitFlag()) + { + LogError("not inited."); + return false; + } + if(begin == end) + { + return false; + } + vector words; + if(!cut(begin, end, words)) + { + return false; + } + string tmp; + for(uint i = 0; i < words.size(); i++) + { + if(TransCode::encode(words[i], tmp)) + { + res.push_back(tmp); + } + } + return true; + } + //virtual bool cut(const string& str, vector& res)const; + + private: + bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector& status)const + { + if(begin == end) + { + return false; + } + + size_t Y = STATUS_SUM; + size_t X = end - begin; + size_t XYSize = X * Y; + int * path; + double * weight; + uint now, old, stat; + double tmp, endE, endS; + + try + { + path = new int [XYSize]; + weight = new double [XYSize]; + } + catch(const std::bad_alloc&) + { + LogError("bad_alloc"); + return false; + } + if(NULL == path || NULL == weight) + { + LogError("bad_alloc"); + return false; + } + + //start + for(uint y = 0; y < Y; y++) + { + weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); + path[0 + y * X] = -1; + } + //process + //for(; begin != end; begin++) + for(uint x = 1; x < X; x++) + { + for(uint y = 0; y < Y; y++) + { + now = x + y*X; + weight[now] = MIN_DOUBLE; + path[now] = E; // warning + for(uint preY = 0; preY < Y; preY++) + { + old = x - 1 + preY * X; + tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); + if(tmp > weight[now]) + { + weight[now] = tmp; + path[now] = preY; + } + } + } + } + + endE = weight[X-1+E*X]; + endS = weight[X-1+S*X]; + stat = 0; + if(endE > endS) + { + stat = E; + } + else + { + stat = S; + } + + status.assign(X, 0); + for(int x = X -1 ; x >= 0; x--) + { + status[x] = stat; + stat = path[x + stat*X]; + } + + delete [] path; + delete [] weight; + return true; + } + bool _loadModel(const char* const filePath) + { + LogInfo("loadModel [%s] start ...", filePath); + ifstream ifile(filePath); + string line; + vector tmp; + vector tmp2; + //load _startProb + if(!_getLine(ifile, line)) + { + return false; + } + splitStr(line, tmp, " "); + if(tmp.size() != STATUS_SUM) + { + LogError("start_p illegal"); + return false; + } + for(uint j = 0; j< tmp.size(); j++) + { + _startProb[j] = atof(tmp[j].c_str()); + //cout<<_startProb[j]< tmp, tmp2; + uint16_t unico = 0; + splitStr(line, tmp, ","); + for(uint i = 0; i < tmp.size(); i++) + { + splitStr(tmp[i], tmp2, ":"); + if(2 != tmp2.size()) + { + LogError("_emitProb illegal."); + return false; + } + if(!_decodeOne(tmp2[0], unico)) + { + LogError("TransCode failed."); + return false; + } + mp[unico] = atof(tmp2[1].c_str()); + } + return true; + } + bool _decodeOne(const string& str, uint16_t& res) + { + Unicode ui16; + if(!TransCode::decode(str, ui16) || ui16.size() != 1) + { + return false; + } + res = ui16[0]; + return true; + } + double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const + { + EmitProbMap::const_iterator cit = ptMp->find(key); + if(cit == ptMp->end()) + { + return defVal; + } + return cit->second; + + } + + + }; +} + +#endif diff --git a/src/ISegment.hpp b/src/ISegment.hpp index 058236a..48b8cd5 100644 --- a/src/ISegment.hpp +++ b/src/ISegment.hpp @@ -1,7 +1,6 @@ #ifndef CPPJIEBA_SEGMENTINTERFACE_H #define CPPJIEBA_SEGMENTINTERFACE_H -#include "globals.h" namespace CppJieba { diff --git a/src/MPSegment.cpp b/src/MPSegment.cpp deleted file mode 100644 index cfbd025..0000000 --- a/src/MPSegment.cpp +++ /dev/null @@ -1,265 +0,0 @@ -/************************************ - * file enc : AISCII - * author : wuyanyi09@gmail.com -************************************/ -#include "MPSegment.h" - -namespace CppJieba -{ - - bool MPSegment::init(const char* const filePath) - { - if(_getInitFlag()) - { - LogError("already inited before now."); - return false; - } - if(!_trie.init()) - { - LogError("_trie.init failed."); - return false; - } - LogInfo("_trie.loadDict(%s) start...", filePath); - if(!_trie.loadDict(filePath)) - { - LogError("_trie.loadDict faield."); - return false; - } - LogInfo("_trie.loadDict end."); - return _setInitFlag(true); - } - - bool MPSegment::dispose() - { - if(!_getInitFlag()) - { - return true; - } - _trie.dispose(); - _setInitFlag(false); - return true; - } - - bool MPSegment::cut(const string& str, vector& res)const - { - return SegmentBase::cut(str, res); - } - - bool MPSegment::cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const - { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } - vector segWordInfos; - if(!cut(begin, end, segWordInfos)) - { - return false; - } - string tmp; - for(uint i = 0; i < segWordInfos.size(); i++) - { - if(TransCode::encode(segWordInfos[i].word, tmp)) - { - res.push_back(tmp); - } - else - { - LogError("encode failed."); - } - } - return true; - } - - bool MPSegment::cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& segWordInfos)const - { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } - SegmentContext segContext; - for(Unicode::const_iterator it = begin; it != end; it++) - { - segContext.push_back(SegmentChar(*it)); - } - - //calc DAG - if(!_calcDAG(segContext)) - { - LogError("_calcDAG failed."); - return false; - } - - if(!_calcDP(segContext)) - { - LogError("_calcDP failed."); - return false; - } - - if(!_cut(segContext, segWordInfos)) - { - LogError("_cut failed."); - return false; - } - - return true; - } - - bool MPSegment::cut(const string& str, vector& segWordInfos)const - { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } - if(str.empty()) - { - return false; - } - Unicode sentence; - - if(!TransCode::decode(str, sentence)) - { - LogError("TransCode::decode failed."); - return false; - } - return cut(sentence.begin(), sentence.end(), segWordInfos); - - } - - bool MPSegment::_calcDAG(SegmentContext& segContext)const - { - if(segContext.empty()) - { - LogError("segContext empty."); - return false; - } - - Unicode unicode; - for(uint i = 0; i < segContext.size(); i++) - { - unicode.clear(); - for(uint j = i ; j < segContext.size(); j++) - { - unicode.push_back(segContext[j].uniCh); - } - - vector > vp; - if(_trie.find(unicode, vp)) - { - for(uint j = 0; j < vp.size(); j++) - { - uint nextp = vp[j].first + i; - segContext[i].dag[nextp] = vp[j].second; - //cout<toString()); - } - } - if(segContext[i].dag.end() == segContext[i].dag.find(i)) - { - segContext[i].dag[i] = NULL; - } - } - return true; - } - - bool MPSegment::_calcDP(SegmentContext& segContext)const - { - if(segContext.empty()) - { - LogError("segContext empty"); - return false; - } - - for(int i = segContext.size() - 1; i >= 0; i--) - { - segContext[i].pInfo = NULL; - segContext[i].weight = MIN_DOUBLE; - for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++) - { - uint nextPos = it->first; - const TrieNodeInfo* p = it->second; - double val = 0.0; - if(nextPos + 1 < segContext.size()) - { - val += segContext[nextPos + 1].weight; - } - - if(p) - { - val += p->logFreq; - } - else - { - val += _trie.getMinLogFreq(); - } - if(val > segContext[i].weight) - { - segContext[i].pInfo = p; - segContext[i].weight = val; - } - } - } - return true; - - } - - bool MPSegment::_cut(SegmentContext& segContext, vector& res)const - { - uint i = 0; - while(i < segContext.size()) - { - const TrieNodeInfo* p = segContext[i].pInfo; - if(p) - { - res.push_back(*p); - i += p->word.size(); - } - else//single chinese word - { - TrieNodeInfo nodeInfo; - nodeInfo.word.push_back(segContext[i].uniCh); - nodeInfo.freq = 0; - nodeInfo.logFreq = _trie.getMinLogFreq(); - res.push_back(nodeInfo); - i++; - } - } - return true; - } - -} - - -#ifdef SEGMENT_UT -using namespace CppJieba; - -int main() -{ - MPSegment segment; - segment.init(); - if(!segment._loadSegDict("../dicts/segdict.gbk.v3.0")) - { - cerr<<"1"< res; - string line; - while(getline(ifile, line)) - { - res.clear(); - segment.cut(line, res); - PRINT_VECTOR(res); - getchar(); - } - - segment.dispose(); - return 0; -} - -#endif diff --git a/src/MPSegment.h b/src/MPSegment.h deleted file mode 100644 index a3eaae3..0000000 --- a/src/MPSegment.h +++ /dev/null @@ -1,49 +0,0 @@ -/************************************ - * file enc : ASCII - * author : wuyanyi09@gmail.com -************************************/ -#ifndef CPPJIEBA_MPSEGMENT_H -#define CPPJIEBA_MPSEGMENT_H - -#include -#include -#include "Limonp/logger.hpp" -#include "Trie.h" -#include "globals.h" -#include "ISegment.hpp" -#include "SegmentBase.hpp" - -namespace CppJieba -{ - - typedef vector SegmentContext; - - class MPSegment: public SegmentBase - { - private: - Trie _trie; - - public: - MPSegment(){}; - virtual ~MPSegment(){dispose();}; - public: - bool init(const char* const filePath); - bool dispose(); - public: - //bool cut(const string& str, vector& segWordInfos)const; - bool cut(const string& str, vector& res)const; - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const; - bool cut(const string& str, vector& segWordInfos)const; - bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& segWordInfos)const; - //virtual bool cut(const string& str, vector& res)const; - - private: - bool _calcDAG(SegmentContext& segContext)const; - bool _calcDP(SegmentContext& segContext)const; - bool _cut(SegmentContext& segContext, vector& res)const; - - - }; -} - -#endif diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp new file mode 100644 index 0000000..42e0d49 --- /dev/null +++ b/src/MPSegment.hpp @@ -0,0 +1,264 @@ +/************************************ + * file enc : ASCII + * author : wuyanyi09@gmail.com + ************************************/ +#ifndef CPPJIEBA_MPSEGMENT_H +#define CPPJIEBA_MPSEGMENT_H + +#include +#include +#include "Limonp/logger.hpp" +#include "Trie.hpp" +#include "ISegment.hpp" +#include "SegmentBase.hpp" + +namespace CppJieba +{ + + struct SegmentChar + { + uint16_t uniCh; + DagType dag; + const TrieNodeInfo * pInfo; + double weight; + + SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0) + { + } + }; + typedef vector SegmentContext; + + class MPSegment: public SegmentBase + { + private: + Trie _trie; + + public: + MPSegment(){}; + virtual ~MPSegment(){dispose();}; + public: + bool init(const char* const filePath) + { + if(_getInitFlag()) + { + LogError("already inited before now."); + return false; + } + if(!_trie.init()) + { + LogError("_trie.init failed."); + return false; + } + LogInfo("_trie.loadDict(%s) start...", filePath); + if(!_trie.loadDict(filePath)) + { + LogError("_trie.loadDict faield."); + return false; + } + LogInfo("_trie.loadDict end."); + return _setInitFlag(true); + } + bool dispose() + { + if(!_getInitFlag()) + { + return true; + } + _trie.dispose(); + _setInitFlag(false); + return true; + } + public: + //bool cut(const string& str, vector& segWordInfos)const; + bool cut(const string& str, vector& res)const + { + return SegmentBase::cut(str, res); + } + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + { + if(!_getInitFlag()) + { + LogError("not inited."); + return false; + } + vector segWordInfos; + if(!cut(begin, end, segWordInfos)) + { + return false; + } + string tmp; + for(uint i = 0; i < segWordInfos.size(); i++) + { + if(TransCode::encode(segWordInfos[i].word, tmp)) + { + res.push_back(tmp); + } + else + { + LogError("encode failed."); + } + } + return true; + } + bool cut(const string& str, vector& segWordInfos)const + { + if(!_getInitFlag()) + { + LogError("not inited."); + return false; + } + if(str.empty()) + { + return false; + } + Unicode sentence; + + if(!TransCode::decode(str, sentence)) + { + LogError("TransCode::decode failed."); + return false; + } + return cut(sentence.begin(), sentence.end(), segWordInfos); + + } + bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& segWordInfos)const + { + if(!_getInitFlag()) + { + LogError("not inited."); + return false; + } + SegmentContext segContext; + for(Unicode::const_iterator it = begin; it != end; it++) + { + segContext.push_back(SegmentChar(*it)); + } + + //calc DAG + if(!_calcDAG(segContext)) + { + LogError("_calcDAG failed."); + return false; + } + + if(!_calcDP(segContext)) + { + LogError("_calcDP failed."); + return false; + } + + if(!_cut(segContext, segWordInfos)) + { + LogError("_cut failed."); + return false; + } + + return true; + } + //virtual bool cut(const string& str, vector& res)const; + + private: + bool _calcDAG(SegmentContext& segContext)const + { + if(segContext.empty()) + { + LogError("segContext empty."); + return false; + } + + Unicode unicode; + for(uint i = 0; i < segContext.size(); i++) + { + unicode.clear(); + for(uint j = i ; j < segContext.size(); j++) + { + unicode.push_back(segContext[j].uniCh); + } + + vector > vp; + if(_trie.find(unicode, vp)) + { + for(uint j = 0; j < vp.size(); j++) + { + uint nextp = vp[j].first + i; + segContext[i].dag[nextp] = vp[j].second; + //cout<toString()); + } + } + if(segContext[i].dag.end() == segContext[i].dag.find(i)) + { + segContext[i].dag[i] = NULL; + } + } + return true; + } + bool _calcDP(SegmentContext& segContext)const + { + if(segContext.empty()) + { + LogError("segContext empty"); + return false; + } + + for(int i = segContext.size() - 1; i >= 0; i--) + { + segContext[i].pInfo = NULL; + segContext[i].weight = MIN_DOUBLE; + for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++) + { + uint nextPos = it->first; + const TrieNodeInfo* p = it->second; + double val = 0.0; + if(nextPos + 1 < segContext.size()) + { + val += segContext[nextPos + 1].weight; + } + + if(p) + { + val += p->logFreq; + } + else + { + val += _trie.getMinLogFreq(); + } + if(val > segContext[i].weight) + { + segContext[i].pInfo = p; + segContext[i].weight = val; + } + } + } + return true; + + } + bool _cut(SegmentContext& segContext, vector& res)const + { + uint i = 0; + while(i < segContext.size()) + { + const TrieNodeInfo* p = segContext[i].pInfo; + if(p) + { + res.push_back(*p); + i += p->word.size(); + } + else//single chinese word + { + TrieNodeInfo nodeInfo; + nodeInfo.word.push_back(segContext[i].uniCh); + nodeInfo.freq = 0; + nodeInfo.logFreq = _trie.getMinLogFreq(); + res.push_back(nodeInfo); + i++; + } + } + return true; + } + + + }; +} + +#endif diff --git a/src/MixSegment.cpp b/src/MixSegment.cpp deleted file mode 100644 index 5653388..0000000 --- a/src/MixSegment.cpp +++ /dev/null @@ -1,125 +0,0 @@ -#include "MixSegment.h" - -namespace CppJieba -{ - MixSegment::MixSegment() - { - } - - MixSegment::~MixSegment() - { - dispose(); - } - - bool MixSegment::init(const char* const mpSegDict, const char* const hmmSegDict) - { - if(_getInitFlag()) - { - LogError("inited."); - return false; - } - if(!_mpSeg.init(mpSegDict)) - { - LogError("_mpSeg init"); - return false; - } - if(!_hmmSeg.init(hmmSegDict)) - { - LogError("_hmmSeg init"); - return false; - } - return _setInitFlag(true); - } - - bool MixSegment::dispose() - { - if(!_getInitFlag()) - { - return true; - } - _mpSeg.dispose(); - _hmmSeg.dispose(); - _setInitFlag(false); - return true; - } - bool MixSegment::cut(const string& str, vector& res)const - { - return SegmentBase::cut(str, res); - } - - bool MixSegment::cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const - { - if(!_getInitFlag()) - { - LogError("not inited."); - return false; - } - if(begin == end) - { - return false; - } - vector infos; - if(!_mpSeg.cut(begin, end, infos)) - { - LogError("mpSeg cutDAG failed."); - return false; - } - Unicode unico; - vector hmmRes; - string tmp; - for(uint i= 0; i < infos.size(); i++) - { - TransCode::encode(infos[i].word,tmp); - if(1 == infos[i].word.size()) - { - unico.push_back(infos[i].word[0]); - } - else - { - if(!unico.empty()) - { - hmmRes.clear(); - if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) - { - LogError("_hmmSeg cut failed."); - return false; - } - for(uint j = 0; j < hmmRes.size(); j++) - { - TransCode::encode(hmmRes[j], tmp); - res.push_back(tmp); - } - } - unico.clear(); - TransCode::encode(infos[i].word, tmp); - res.push_back(tmp); - } - } - if(!unico.empty()) - { - hmmRes.clear(); - if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) - { - LogError("_hmmSeg cut failed."); - return false; - } - for(uint j = 0; j < hmmRes.size(); j++) - { - TransCode::encode(hmmRes[j], tmp); - res.push_back(tmp); - } - } - - return true; - } -} - -#ifdef MIXSEGMENT_UT -using namespace CppJieba; - -int main() -{ - return 0; -} - -#endif diff --git a/src/MixSegment.h b/src/MixSegment.h deleted file mode 100644 index 079db3f..0000000 --- a/src/MixSegment.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef CPPJIEBA_MIXSEGMENT_H -#define CPPJIEBA_MIXSEGMENT_H - -#include "MPSegment.h" -#include "HMMSegment.h" -#include "Limonp/str_functs.hpp" - -namespace CppJieba -{ - class MixSegment: public SegmentBase - { - private: - MPSegment _mpSeg; - HMMSegment _hmmSeg; - public: - MixSegment(); - virtual ~MixSegment(); - public: - bool init(const char* const _mpSegDict, const char* const _hmmSegDict); - bool dispose(); - public: - //virtual bool cut(const string& str, vector& res) const; - bool cut(const string& str, vector& res)const; - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const; - }; -} - -#endif diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp new file mode 100644 index 0000000..896d7b1 --- /dev/null +++ b/src/MixSegment.hpp @@ -0,0 +1,128 @@ +#ifndef CPPJIEBA_MIXSEGMENT_H +#define CPPJIEBA_MIXSEGMENT_H + +#include "MPSegment.hpp" +#include "HMMSegment.hpp" +#include "Limonp/str_functs.hpp" + +namespace CppJieba +{ + class MixSegment: public SegmentBase + { + private: + MPSegment _mpSeg; + HMMSegment _hmmSeg; + public: + MixSegment() + { + } + virtual ~MixSegment() + { + dispose(); + } + public: + bool init(const char* const mpSegDict, const char* const hmmSegDict) + { + if(_getInitFlag()) + { + LogError("inited."); + return false; + } + if(!_mpSeg.init(mpSegDict)) + { + LogError("_mpSeg init"); + return false; + } + if(!_hmmSeg.init(hmmSegDict)) + { + LogError("_hmmSeg init"); + return false; + } + return _setInitFlag(true); + } + bool dispose() + { + if(!_getInitFlag()) + { + return true; + } + _mpSeg.dispose(); + _hmmSeg.dispose(); + _setInitFlag(false); + return true; + } + public: + //virtual bool cut(const string& str, vector& res) const; + bool cut(const string& str, vector& res)const + { + return SegmentBase::cut(str, res); + } + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const + { + if(!_getInitFlag()) + { + LogError("not inited."); + return false; + } + if(begin == end) + { + return false; + } + vector infos; + if(!_mpSeg.cut(begin, end, infos)) + { + LogError("mpSeg cutDAG failed."); + return false; + } + Unicode unico; + vector hmmRes; + string tmp; + for(uint i= 0; i < infos.size(); i++) + { + TransCode::encode(infos[i].word,tmp); + if(1 == infos[i].word.size()) + { + unico.push_back(infos[i].word[0]); + } + else + { + if(!unico.empty()) + { + hmmRes.clear(); + if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) + { + LogError("_hmmSeg cut failed."); + return false; + } + for(uint j = 0; j < hmmRes.size(); j++) + { + TransCode::encode(hmmRes[j], tmp); + res.push_back(tmp); + } + } + unico.clear(); + TransCode::encode(infos[i].word, tmp); + res.push_back(tmp); + } + } + if(!unico.empty()) + { + hmmRes.clear(); + if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) + { + LogError("_hmmSeg cut failed."); + return false; + } + for(uint j = 0; j < hmmRes.size(); j++) + { + TransCode::encode(hmmRes[j], tmp); + res.push_back(tmp); + } + } + + return true; + } + }; +} + +#endif diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp index b082f56..a187686 100644 --- a/src/SegmentBase.hpp +++ b/src/SegmentBase.hpp @@ -1,7 +1,6 @@ #ifndef CPPJIEBA_SEGMENTBASE_H #define CPPJIEBA_SEGMENTBASE_H -#include "globals.h" #include "ISegment.hpp" #include "ChineseFilter.hpp" #include "Limonp/str_functs.hpp" diff --git a/src/TransCode.hpp b/src/TransCode.hpp index febe3a4..113aceb 100644 --- a/src/TransCode.hpp +++ b/src/TransCode.hpp @@ -6,13 +6,13 @@ #define CPPJIEBA_TRANSCODE_H -#include "globals.h" #include "Limonp/str_functs.hpp" namespace CppJieba { using namespace Limonp; + typedef std::vector Unicode; namespace TransCode { inline bool decode(const string& str, vector& vec) diff --git a/src/Trie.cpp b/src/Trie.cpp deleted file mode 100644 index b72ff3b..0000000 --- a/src/Trie.cpp +++ /dev/null @@ -1,390 +0,0 @@ -/************************************ - * file enc : ASCII - * author : wuyanyi09@gmail.com -************************************/ -#include "Trie.h" - -namespace CppJieba -{ - - Trie::Trie() - { - - _root = NULL; - _freqSum = 0; - _minLogFreq = MAX_DOUBLE; - _initFlag = false; - } - - Trie::~Trie() - { - dispose(); - } - - bool Trie::init() - { - if(_getInitFlag()) - { - LogError("already initted!"); - return false; - } - - try - { - _root = new TrieNode; - } - catch(const bad_alloc& e) - { - return false; - } - if(NULL == _root) - { - return false; - } - _setInitFlag(true); - return true; - } - - bool Trie::loadDict(const char * const filePath) - { - if(!_getInitFlag()) - { - LogError("not initted."); - return false; - } - - if(!checkFileExist(filePath)) - { - LogError("cann't find fiel[%s].",filePath); - return false; - } - bool res = false; - res = _trieInsert(filePath); - if(!res) - { - LogError("_trieInsert failed."); - return false; - } - res = _countWeight(); - if(!res) - { - LogError("_countWeight failed."); - return false; - } - return true; - } - - bool Trie::_trieInsert(const char * const filePath) - { - - ifstream ifile(filePath); - string line; - vector vecBuf; - - TrieNodeInfo nodeInfo; - while(getline(ifile, line)) - { - vecBuf.clear(); - splitStr(line, vecBuf, " "); - if(3 < vecBuf.size()) - { - LogError("line[%s] illegal.", line.c_str()); - return false; - } - if(!TransCode::decode(vecBuf[0], nodeInfo.word)) - { - return false; - } - nodeInfo.freq = atoi(vecBuf[1].c_str()); - if(3 == vecBuf.size()) - { - nodeInfo.tag = vecBuf[2]; - } - - //insert node - if(!insert(nodeInfo)) - { - LogError("insert node failed!"); - } - } - return true; - } - - bool Trie::dispose() - { - if(!_getInitFlag()) - { - return false; - } - bool ret = _deleteNode(_root); - if(!ret) - { - LogFatal("_deleteNode failed!"); - return false; - } - _root = NULL; - _nodeInfoVec.clear(); - - _setInitFlag(false); - return ret; - } - - const TrieNodeInfo* Trie::findPrefix(const string& str)const - { - if(!_getInitFlag()) - { - LogFatal("trie not initted!"); - return NULL; - } - Unicode uintVec; - - if(!TransCode::decode(str, uintVec)) - { - LogError("TransCode::decode failed."); - return NULL; - } - - //find - TrieNode* p = _root; - uint pos = 0; - uint16_t chUni = 0; - const TrieNodeInfo * res = NULL; - for(uint i = 0; i < uintVec.size(); i++) - { - chUni = uintVec[i]; - if(p->isLeaf) - { - pos = p->nodeInfoVecPos; - if(pos >= _nodeInfoVec.size()) - { - LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); - return NULL; - } - res = &(_nodeInfoVec[pos]); - - } - if(p->hmap.find(chUni) == p->hmap.end()) - { - break; - } - else - { - p = p->hmap[chUni]; - } - } - return res; - } - - const TrieNodeInfo* Trie::find(const string& str)const - { - Unicode uintVec; - if(!TransCode::decode(str, uintVec)) - { - return NULL; - } - return find(uintVec); - } - - const TrieNodeInfo* Trie::find(const Unicode& uintVec)const - { - if(uintVec.empty()) - { - return NULL; - } - return find(uintVec.begin(), uintVec.end()); - } - - const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)const - { - - if(!_getInitFlag()) - { - LogFatal("trie not initted!"); - return NULL; - } - if(begin >= end) - { - return NULL; - } - TrieNode* p = _root; - for(Unicode::const_iterator it = begin; it != end; it++) - { - uint16_t chUni = *it; - if(p->hmap.find(chUni) == p-> hmap.end()) - { - return NULL; - } - else - { - p = p->hmap[chUni]; - } - } - if(p->isLeaf) - { - uint pos = p->nodeInfoVecPos; - if(pos < _nodeInfoVec.size()) - { - return &(_nodeInfoVec[pos]); - } - else - { - LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); - return NULL; - } - } - return NULL; - } - - bool Trie::find(const Unicode& unico, vector >& res)const - { - if(!_getInitFlag()) - { - LogFatal("trie not initted!"); - return false; - } - TrieNode* p = _root; - //for(Unicode::const_iterator it = begin; it != end; it++) - for(uint i = 0; i < unico.size(); i++) - { - if(p->hmap.find(unico[i]) == p-> hmap.end()) - { - break; - } - p = p->hmap[unico[i]]; - if(p->isLeaf) - { - uint pos = p->nodeInfoVecPos; - if(pos < _nodeInfoVec.size()) - { - res.push_back(make_pair(i, &_nodeInfoVec[pos])); - } - else - { - LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); - return false; - } - } - } - return !res.empty(); - } - - bool Trie::_deleteNode(TrieNode* node) - { - for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++) - { - TrieNode* next = it->second; - _deleteNode(next); - } - - delete node; - return true; - } - - - bool Trie::insert(const TrieNodeInfo& nodeInfo) - { - if(!_getInitFlag()) - { - LogFatal("not initted!"); - return false; - } - - - const Unicode& uintVec = nodeInfo.word; - TrieNode* p = _root; - for(uint i = 0; i < uintVec.size(); i++) - { - uint16_t cu = uintVec[i]; - if(NULL == p) - { - return false; - } - if(p->hmap.end() == p->hmap.find(cu)) - { - TrieNode * next = NULL; - try - { - next = new TrieNode; - } - catch(const bad_alloc& e) - { - return false; - } - p->hmap[cu] = next; - p = next; - } - else - { - p = p->hmap[cu]; - } - } - if(NULL == p) - { - return false; - } - if(p->isLeaf) - { - LogError("this node already inserted"); - return false; - } - - p->isLeaf = true; - _nodeInfoVec.push_back(nodeInfo); - p->nodeInfoVecPos = _nodeInfoVec.size() - 1; - - return true; - } - - bool Trie::_countWeight() - { - if(_nodeInfoVec.empty() || 0 != _freqSum) - { - LogError("_nodeInfoVec is empty or _freqSum has been counted already."); - return false; - } - - //freq total freq - for(size_t i = 0; i < _nodeInfoVec.size(); i++) - { - _freqSum += _nodeInfoVec[i].freq; - } - - if(0 == _freqSum) - { - LogError("_freqSum == 0 ."); - return false; - } - - //normalize - for(uint i = 0; i < _nodeInfoVec.size(); i++) - { - TrieNodeInfo& nodeInfo = _nodeInfoVec[i]; - if(0 == nodeInfo.freq) - { - LogFatal("nodeInfo.freq == 0!"); - return false; - } - nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum)); - if(_minLogFreq > nodeInfo.logFreq) - { - _minLogFreq = nodeInfo.logFreq; - } - } - - return true; - } -} - -#ifdef TRIE_UT -using namespace CppJieba; -int main() -{ - Trie trie; - trie.init(); - trie.loadDict("../dicts/segdict.gbk.v2.1"); - //trie.loadDict("tmp"); - cout< -#include -#include -#include -#include -#include -#include -#include "Limonp/str_functs.hpp" -#include "Limonp/logger.hpp" -#include "TransCode.hpp" -#include "globals.h" -#include "structs.h" - - -namespace CppJieba -{ - using namespace Limonp; - struct TrieNode - { - TrieNodeMap hmap; - bool isLeaf; - uint nodeInfoVecPos; - TrieNode() - { - isLeaf = false; - nodeInfoVecPos = 0; - } - }; - - class Trie - { - - private: - TrieNode* _root; - vector _nodeInfoVec; - - bool _initFlag; - int64_t _freqSum; - double _minLogFreq; - - public: - Trie(); - ~Trie(); - bool init(); - bool loadDict(const char * const filePath); - bool dispose(); - - private: - void _setInitFlag(bool on){_initFlag = on;}; - bool _getInitFlag()const{return _initFlag;}; - - public: - const TrieNodeInfo* find(const string& str)const; - const TrieNodeInfo* find(const Unicode& uintVec)const; - const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const; - bool find(const Unicode& unico, vector >& res)const; - - const TrieNodeInfo* findPrefix(const string& str)const; - - public: - //double getWeight(const string& str); - //double getWeight(const Unicode& uintVec); - //double getWeight(Unicode::const_iterator begin, Unicode::const_iterator end); - double getMinLogFreq()const{return _minLogFreq;}; - - //int64_t getTotalCount(){return _freqSum;}; - - bool insert(const TrieNodeInfo& nodeInfo); - - private: - bool _trieInsert(const char * const filePath); - bool _countWeight(); - bool _deleteNode(TrieNode* node); - - }; -} - -#endif diff --git a/src/Trie.hpp b/src/Trie.hpp new file mode 100644 index 0000000..c281047 --- /dev/null +++ b/src/Trie.hpp @@ -0,0 +1,441 @@ +/************************************ + * file enc : ASCII + * author : wuyanyi09@gmail.com + ************************************/ +#ifndef CPPJIEBA_TRIE_H +#define CPPJIEBA_TRIE_H + +#include +#include +#include +#include +#include +#include +#include +#include "Limonp/str_functs.hpp" +#include "Limonp/logger.hpp" +#include "TransCode.hpp" + + +namespace CppJieba +{ + using namespace Limonp; + const double MIN_DOUBLE = -3.14e+100; + const double MAX_DOUBLE = 3.14e+100; + typedef unordered_map TrieNodeMap; + struct TrieNode + { + TrieNodeMap hmap; + bool isLeaf; + uint nodeInfoVecPos; + TrieNode() + { + isLeaf = false; + nodeInfoVecPos = 0; + } + }; + + struct TrieNodeInfo + { + Unicode word; + size_t freq; + string tag; + double logFreq; //logFreq = log(freq/sum(freq)); + TrieNodeInfo():freq(0),logFreq(0.0) + { + } + TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq) + { + } + TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE) + { + } + string toString()const + { + string tmp; + TransCode::encode(word, tmp); + return string_format("{word:%s,freq:%d, logFreq:%lf}", tmp.c_str(), freq, logFreq); + } + }; + typedef unordered_map DagType; + + class Trie + { + + private: + TrieNode* _root; + vector _nodeInfoVec; + + bool _initFlag; + int64_t _freqSum; + double _minLogFreq; + + public: + Trie() + { + _root = NULL; + _freqSum = 0; + _minLogFreq = MAX_DOUBLE; + _initFlag = false; + } + ~Trie() + { + dispose(); + } + bool init() + { + if(_getInitFlag()) + { + LogError("already initted!"); + return false; + } + + try + { + _root = new TrieNode; + } + catch(const bad_alloc& e) + { + return false; + } + if(NULL == _root) + { + return false; + } + _setInitFlag(true); + return true; + } + bool dispose() + { + if(!_getInitFlag()) + { + return false; + } + bool ret = _deleteNode(_root); + if(!ret) + { + LogFatal("_deleteNode failed!"); + return false; + } + _root = NULL; + _nodeInfoVec.clear(); + + _setInitFlag(false); + return ret; + } + bool loadDict(const char * const filePath) + { + if(!_getInitFlag()) + { + LogError("not initted."); + return false; + } + + if(!checkFileExist(filePath)) + { + LogError("cann't find fiel[%s].",filePath); + return false; + } + bool res = false; + res = _trieInsert(filePath); + if(!res) + { + LogError("_trieInsert failed."); + return false; + } + res = _countWeight(); + if(!res) + { + LogError("_countWeight failed."); + return false; + } + return true; + } + + private: + void _setInitFlag(bool on){_initFlag = on;}; + bool _getInitFlag()const{return _initFlag;}; + + public: + const TrieNodeInfo* find(const string& str)const + { + Unicode uintVec; + if(!TransCode::decode(str, uintVec)) + { + return NULL; + } + return find(uintVec); + } + const TrieNodeInfo* find(const Unicode& uintVec)const + { + if(uintVec.empty()) + { + return NULL; + } + return find(uintVec.begin(), uintVec.end()); + } + const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const + { + + if(!_getInitFlag()) + { + LogFatal("trie not initted!"); + return NULL; + } + if(begin >= end) + { + return NULL; + } + TrieNode* p = _root; + for(Unicode::const_iterator it = begin; it != end; it++) + { + uint16_t chUni = *it; + if(p->hmap.find(chUni) == p-> hmap.end()) + { + return NULL; + } + else + { + p = p->hmap[chUni]; + } + } + if(p->isLeaf) + { + uint pos = p->nodeInfoVecPos; + if(pos < _nodeInfoVec.size()) + { + return &(_nodeInfoVec[pos]); + } + else + { + LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); + return NULL; + } + } + return NULL; + } + bool find(const Unicode& unico, vector >& res)const + { + if(!_getInitFlag()) + { + LogFatal("trie not initted!"); + return false; + } + TrieNode* p = _root; + for(uint i = 0; i < unico.size(); i++) + { + if(p->hmap.find(unico[i]) == p-> hmap.end()) + { + break; + } + p = p->hmap[unico[i]]; + if(p->isLeaf) + { + uint pos = p->nodeInfoVecPos; + if(pos < _nodeInfoVec.size()) + { + res.push_back(make_pair(i, &_nodeInfoVec[pos])); + } + else + { + LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); + return false; + } + } + } + return !res.empty(); + } + + const TrieNodeInfo* findPrefix(const string& str)const + { + if(!_getInitFlag()) + { + LogFatal("trie not initted!"); + return NULL; + } + Unicode uintVec; + + if(!TransCode::decode(str, uintVec)) + { + LogError("TransCode::decode failed."); + return NULL; + } + + //find + TrieNode* p = _root; + uint pos = 0; + uint16_t chUni = 0; + const TrieNodeInfo * res = NULL; + for(uint i = 0; i < uintVec.size(); i++) + { + chUni = uintVec[i]; + if(p->isLeaf) + { + pos = p->nodeInfoVecPos; + if(pos >= _nodeInfoVec.size()) + { + LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); + return NULL; + } + res = &(_nodeInfoVec[pos]); + + } + if(p->hmap.find(chUni) == p->hmap.end()) + { + break; + } + else + { + p = p->hmap[chUni]; + } + } + return res; + } + + public: + double getMinLogFreq()const{return _minLogFreq;}; + + bool insert(const TrieNodeInfo& nodeInfo) + { + if(!_getInitFlag()) + { + LogFatal("not initted!"); + return false; + } + + + const Unicode& uintVec = nodeInfo.word; + TrieNode* p = _root; + for(uint i = 0; i < uintVec.size(); i++) + { + uint16_t cu = uintVec[i]; + if(NULL == p) + { + return false; + } + if(p->hmap.end() == p->hmap.find(cu)) + { + TrieNode * next = NULL; + try + { + next = new TrieNode; + } + catch(const bad_alloc& e) + { + return false; + } + p->hmap[cu] = next; + p = next; + } + else + { + p = p->hmap[cu]; + } + } + if(NULL == p) + { + return false; + } + if(p->isLeaf) + { + LogError("this node already inserted"); + return false; + } + + p->isLeaf = true; + _nodeInfoVec.push_back(nodeInfo); + p->nodeInfoVecPos = _nodeInfoVec.size() - 1; + + return true; + } + + private: + bool _trieInsert(const char * const filePath) + { + ifstream ifile(filePath); + string line; + vector vecBuf; + + TrieNodeInfo nodeInfo; + while(getline(ifile, line)) + { + vecBuf.clear(); + splitStr(line, vecBuf, " "); + if(3 < vecBuf.size()) + { + LogError("line[%s] illegal.", line.c_str()); + return false; + } + if(!TransCode::decode(vecBuf[0], nodeInfo.word)) + { + return false; + } + nodeInfo.freq = atoi(vecBuf[1].c_str()); + if(3 == vecBuf.size()) + { + nodeInfo.tag = vecBuf[2]; + } + + //insert node + if(!insert(nodeInfo)) + { + LogError("insert node failed!"); + } + } + return true; + } + bool _countWeight() + { + if(_nodeInfoVec.empty() || 0 != _freqSum) + { + LogError("_nodeInfoVec is empty or _freqSum has been counted already."); + return false; + } + + //freq total freq + for(size_t i = 0; i < _nodeInfoVec.size(); i++) + { + _freqSum += _nodeInfoVec[i].freq; + } + + if(0 == _freqSum) + { + LogError("_freqSum == 0 ."); + return false; + } + + //normalize + for(uint i = 0; i < _nodeInfoVec.size(); i++) + { + TrieNodeInfo& nodeInfo = _nodeInfoVec[i]; + if(0 == nodeInfo.freq) + { + LogFatal("nodeInfo.freq == 0!"); + return false; + } + nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum)); + if(_minLogFreq > nodeInfo.logFreq) + { + _minLogFreq = nodeInfo.logFreq; + } + } + + return true; + } + + bool _deleteNode(TrieNode* node) + { + for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++) + { + TrieNode* next = it->second; + _deleteNode(next); + } + + delete node; + return true; + } + + }; +} + +#endif diff --git a/src/globals.h b/src/globals.h deleted file mode 100644 index c12007d..0000000 --- a/src/globals.h +++ /dev/null @@ -1,36 +0,0 @@ -/************************************ - * file enc : ASCII - * author : wuyanyi09@gmail.com -************************************/ -#ifndef CPPJIEBA_GLOBALS_H -#define CPPJIEBA_GLOBALS_H - -#include -#include -#include -#include -#include -//#include -#include -//#include - -namespace CppJieba -{ - - using namespace std; - using std::tr1::unordered_map; - //using __gnu_cxx::hash_map; - //using namespace stdext; - //typedefs - typedef std::vector::iterator VSI; - typedef std::vector Unicode; - typedef Unicode::const_iterator UniConIter; - typedef unordered_map TrieNodeMap; - typedef unordered_map EmitProbMap; - - const double MIN_DOUBLE = -3.14e+100; - const double MAX_DOUBLE = 3.14e+100; - enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1, OTHERS = 2}; -} - -#endif diff --git a/src/segment.cpp b/src/segment.cpp index e4b3835..ca49d37 100644 --- a/src/segment.cpp +++ b/src/segment.cpp @@ -1,9 +1,9 @@ #include #include #include "Limonp/ArgvContext.hpp" -#include "MPSegment.h" -#include "HMMSegment.h" -#include "MixSegment.h" +#include "MPSegment.hpp" +#include "HMMSegment.hpp" +#include "MixSegment.hpp" using namespace CppJieba; diff --git a/src/server.cpp b/src/server.cpp index b3c39fb..2545248 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -7,9 +7,9 @@ #include "Limonp/Config.hpp" #include "Husky/Daemon.h" #include "Husky/ServerFrame.h" -#include "MPSegment.h" -#include "HMMSegment.h" -#include "MixSegment.h" +#include "MPSegment.hpp" +#include "HMMSegment.hpp" +#include "MixSegment.hpp" using namespace Husky; using namespace CppJieba; diff --git a/src/structs.h b/src/structs.h deleted file mode 100644 index 88c5894..0000000 --- a/src/structs.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef CPPJIEBA_STRUCTS_H -#define CPPJIEBA_STRUCTS_H - -#include -#include "globals.h" -#include "Trie.h" -#include "TransCode.hpp" - -namespace CppJieba -{ - - struct TrieNodeInfo - { - //string word; - //size_t wLen;// the word's len , not string.length(), - Unicode word; - size_t freq; - string tag; - double logFreq; //logFreq = log(freq/sum(freq)); - TrieNodeInfo():freq(0),logFreq(0.0) - { - } - TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq) - { - } - TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE) - { - } - string toString()const - { - string tmp; - TransCode::encode(word, tmp); - return string_format("{word:%s,freq:%d, logFreq:%lf}", tmp.c_str(), freq, logFreq); - } - }; - - typedef unordered_map DagType; - struct SegmentChar - { - uint16_t uniCh; - DagType dag; - const TrieNodeInfo * pInfo; - double weight; - - SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0) - { - } - - /*const TrieNodeInfo* pInfo; - double weight; - SegmentChar(uint16_t unich, const TrieNodeInfo* p, double w):uniCh(unich), pInfo(p), weight(w) - { - }*/ - }; - /* - struct SegmentContext - { - vector context; - bool getDA - };*/ - typedef vector SegmentContext; - - - struct KeyWordInfo: public TrieNodeInfo - { - double idf; - double weight;// log(wLen+1)*logFreq; - KeyWordInfo():idf(0.0),weight(0.0) - { - } - KeyWordInfo(const Unicode& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0) - { - } - KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo) - { - } - string toString() const - { - string tmp; - TransCode::encode(word, tmp); - return string_format("{word:%s,weight:%lf, idf:%lf}", tmp.c_str(), weight, idf); - } - KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo) - { - word = trieNodeInfo.word; - freq = trieNodeInfo.freq; - tag = trieNodeInfo.tag; - logFreq = trieNodeInfo.logFreq; - return *this; - } - }; - - inline ostream& operator << (ostream& os, const KeyWordInfo& info) - { - string tmp; - TransCode::encode(info.word, tmp); - return os << "{words:" << tmp << ", weight:" << info.weight << ", idf:" << info.idf << "}"; - } - - //inline string joinWordInfos(const vector& vec) - //{ - // vector tmp; - // for(uint i = 0; i < vec.size(); i++) - // { - // tmp.push_back(vec[i].toString()); - // } - // return joinStr(tmp, ","); - //} -} - -#endif