diff --git a/demo/segment_demo.cpp b/demo/segment_demo.cpp index 9e4d7ef..d3f9a7e 100644 --- a/demo/segment_demo.cpp +++ b/demo/segment_demo.cpp @@ -6,6 +6,7 @@ using namespace CppJieba; MPSegment seg; HMMSegment hmmseg; +MixSegment mixseg; bool init(const char * const dictPath, const char * const modelPath) { if(!seg.init(dictPath)) @@ -20,10 +21,16 @@ bool init(const char * const dictPath, const char * const modelPath) return false; } + if(!mixseg.init(dictPath, modelPath)) + { + cout<<"mixseg init failed."< res; @@ -53,14 +60,16 @@ void cutHMM(const char * const filePath) } } -void cutAll(const char* const filePath) +void cutMix(const char* const filePath) { ifstream ifs(filePath); - vector res; + vector res; string line; while(getline(ifs, line)) { - seg.cut(line, res); + mixseg.cut(line, res); + cout<\n" <<"options:\n" - <<"\t--algorithm\tSupported encoding methods are [cutDAG, cutHMM] for now. \n\t\t\tIf not specified, the default is cutDAG\n" + <<"\t--algorithm\tSupported encoding methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n" <<"\t--dictpath\tIf not specified, the default is "<& res) - { - if(str.empty()) - { - return false; - } - vector unico; + + bool HMMSegment::cut(const Unicode& unico, vector& res) + { vector status; - vector::iterator begin, left, right; - if(!TransCode::decode(str, unico)) - - { - LogError("TransCode failed."); - return false; - } - if(!viterbi(unico, status)) { LogError("viterbi failed."); return false; } - //cout<& res) + { + if(str.empty()) + { + return false; + } + Unicode unico; + if(!TransCode::decode(str, unico)) + + { + LogError("TransCode failed."); + return false; + } + vector words; + if(!cut(unico, words)) + { + return false; + } + res.clear(); + for(uint i = 0; i < words.size(); i++) + { + res.push_back(TransCode::encode(words[i].begin(), words[i].end())); + } return true; } - bool HMMSegment::viterbi(const vector& unico, vector& status) + bool HMMSegment::viterbi(const Unicode& unico, vector& status) { if(unico.empty()) { @@ -193,17 +206,12 @@ namespace CppJieba { old = x - 1 + preY * X; tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], unico[x], MIN_DOUBLE); - //cout< weight[now]) { weight[now] = tmp; path[now] = preY; } } - //cout< ui16; + Unicode ui16; if(!TransCode::decode(str, ui16) || ui16.size() != 1) { return false; diff --git a/src/HMMSegment.h b/src/HMMSegment.h index 28f77f6..a68661b 100644 --- a/src/HMMSegment.h +++ b/src/HMMSegment.h @@ -13,12 +13,13 @@ namespace CppJieba using namespace CPPCOMMON; class HMMSegment { - private: + public: /* * STATUS: * 0:B, 1:E, 2:M, 3:S * */ enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; + private: char _statMap[STATUS_SUM]; double _startProb[STATUS_SUM]; double _transProb[STATUS_SUM][STATUS_SUM]; @@ -35,8 +36,10 @@ namespace CppJieba bool init(const char* const modelPath); bool dispose(); public: + bool cut(const Unicode& unico, vector& res); bool cut(const string& str, vector& res); - bool viterbi(const vector& unico, vector& status); + bool viterbi(const Unicode& unico, vector& status); + private: bool _loadModel(const char* const filePath); bool _getLine(ifstream& ifile, string& line); diff --git a/src/MixSegment.cpp b/src/MixSegment.cpp index 2b73510..39c54af 100644 --- a/src/MixSegment.cpp +++ b/src/MixSegment.cpp @@ -40,10 +40,48 @@ namespace CppJieba LogError("_mpSeg cutDAG failed."); return false; } + res.clear(); + Unicode unico; + vector hmmRes; for(uint i= 0; i < infos.size(); i++) { + if(1 == infos[i].word.size()) + { + unico.push_back(infos[i].word[0]); + } + else + { + if(!unico.empty()) + { + if(!_hmmSeg.cut(unico, hmmRes)) + { + LogError("_hmmSeg cut failed."); + return false; + } + for(uint j = 0; j < hmmRes.size(); j++) + { + res.push_back(TransCode::encode(hmmRes[j])); + } + } + unico.clear(); + + res.push_back(TransCode::encode(infos[i].word)); + } } + if(!unico.empty()) + { + if(!_hmmSeg.cut(unico, hmmRes)) + { + LogError("_hmmSeg cut failed."); + return false; + } + for(uint j = 0; j < hmmRes.size(); j++) + { + res.push_back(TransCode::encode(hmmRes[j])); + } + } + return true; } } diff --git a/src/TransCode.cpp b/src/TransCode.cpp index c7d773f..80ab080 100644 --- a/src/TransCode.cpp +++ b/src/TransCode.cpp @@ -111,10 +111,15 @@ namespace CppJieba } return true; } + + string TransCode::encode(const Unicode& sentence) + { + return encode(sentence.begin(), sentence.end()); + } string TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end) { - if(NULL == _pf_encode) + if(!_pf_encode) { return ""; } diff --git a/src/TransCode.h b/src/TransCode.h index 220bb17..5627941 100644 --- a/src/TransCode.h +++ b/src/TransCode.h @@ -38,6 +38,7 @@ namespace CppJieba public: static bool decode(const string& str, vector& vec); static string encode(Unicode::const_iterator begin, Unicode::const_iterator end); + static string encode(const Unicode& sentence); //static size_t getWordLength(const string& str); public: static bool gbkToVec(const string& str, vector& vec); diff --git a/src/headers.h b/src/headers.h index 1227cac..437052f 100644 --- a/src/headers.h +++ b/src/headers.h @@ -12,5 +12,6 @@ #include "Trie.h" #include "TransCode.h" #include "HMMSegment.h" +#include "MixSegment.h" #endif