Finished mixsegment.cpp/h and modified some functions to adapt them to MixSegment

This commit is contained in:
wyy 2013-09-09 16:34:38 +08:00
parent a10876f808
commit 4ec1ad8031
7 changed files with 113 additions and 38 deletions

View File

@ -6,6 +6,7 @@ using namespace CppJieba;
MPSegment seg; MPSegment seg;
HMMSegment hmmseg; HMMSegment hmmseg;
MixSegment mixseg;
bool init(const char * const dictPath, const char * const modelPath) bool init(const char * const dictPath, const char * const modelPath)
{ {
if(!seg.init(dictPath)) if(!seg.init(dictPath))
@ -20,10 +21,16 @@ bool init(const char * const dictPath, const char * const modelPath)
return false; return false;
} }
if(!mixseg.init(dictPath, modelPath))
{
cout<<"mixseg init failed."<<endl;
return false;
}
return true; return true;
} }
void cut(const char * const filePath) void cutMP(const char * const filePath)
{ {
ifstream ifile(filePath); ifstream ifile(filePath);
vector<string> res; vector<string> res;
@ -53,14 +60,16 @@ void cutHMM(const char * const filePath)
} }
} }
void cutAll(const char* const filePath) void cutMix(const char* const filePath)
{ {
ifstream ifs(filePath); ifstream ifs(filePath);
vector<TrieNodeInfo> res; vector<string> res;
string line; string line;
while(getline(ifs, line)) while(getline(ifs, line))
{ {
seg.cut(line, res); mixseg.cut(line, res);
cout<<line<<endl;
cout<<vecToString(res)<<endl;
} }
} }
@ -76,6 +85,11 @@ bool dispose()
cout<<"seg dispose failed."<<endl; cout<<"seg dispose failed."<<endl;
return false; return false;
} }
if(!mixseg.dispose())
{
cout<<"seg dispose failed."<<endl;
return false;
}
return true; return true;
} }
@ -88,13 +102,14 @@ int main(int argc, char ** argv)
{ {
cout<<"usage: \n\t"<<argv[0]<<"[options] <filename>\n" cout<<"usage: \n\t"<<argv[0]<<"[options] <filename>\n"
<<"options:\n" <<"options:\n"
<<"\t--algorithm\tSupported encoding methods are [cutDAG, cutHMM] for now. \n\t\t\tIf not specified, the default is cutDAG\n" <<"\t--algorithm\tSupported encoding methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
<<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<'\n' <<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<'\n'
<<"\t--modelpath\tIf not specified, the default is "<<DEFAULT_MODELPATH<<'\n' <<"\t--modelpath\tIf not specified, the default is "<<DEFAULT_MODELPATH<<'\n'
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf not specified, the default is utf8.\n" <<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf not specified, the default is utf8.\n"
<<"example:\n" <<"example:\n"
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n" <<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n" <<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n" <<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
<<endl; <<endl;
@ -131,10 +146,14 @@ int main(int argc, char ** argv)
{ {
cutHMM(arg[1].c_str()); cutHMM(arg[1].c_str());
} }
else else if("cutMix" == algorithm)
{ {
cut(arg[1].c_str()); cutMix(arg[1].c_str());
} }
else
{
cutMix(arg[1].c_str());
}
dispose(); dispose();
return 0; return 0;
} }

View File

@ -104,46 +104,59 @@ namespace CppJieba
return true; return true;
} }
bool HMMSegment::cut(const string& str, vector<string>& res)
{ bool HMMSegment::cut(const Unicode& unico, vector<Unicode>& res)
if(str.empty()) {
{
return false;
}
vector<uint16_t> unico;
vector<uint> status; vector<uint> status;
vector<uint16_t>::iterator begin, left, right;
if(!TransCode::decode(str, unico))
{
LogError("TransCode failed.");
return false;
}
if(!viterbi(unico, status)) if(!viterbi(unico, status))
{ {
LogError("viterbi failed."); LogError("viterbi failed.");
return false; return false;
} }
//cout<<encodeing(status)<<endl;
begin = unico.begin(); Unicode::const_iterator begin = unico.begin();
left = begin; Unicode::const_iterator left = begin;
Unicode::const_iterator right;
res.clear(); res.clear();
for(uint i =0; i< status.size(); i++) for(uint i =0; i< status.size(); i++)
{ {
if(status[i] % 2) //if(E == status[i] || S == status[i]) if(status[i] % 2) //if(E == status[i] || S == status[i])
{ {
right = begin + i + 1; right = begin + i + 1;
res.push_back(TransCode::encode(left, right)); res.push_back(Unicode(left, right));
left = right; left = right;
} }
} }
return true;
}
bool HMMSegment::cut(const string& str, vector<string>& res)
{
if(str.empty())
{
return false;
}
Unicode unico;
if(!TransCode::decode(str, unico))
{
LogError("TransCode failed.");
return false;
}
vector<Unicode> words;
if(!cut(unico, words))
{
return false;
}
res.clear();
for(uint i = 0; i < words.size(); i++)
{
res.push_back(TransCode::encode(words[i].begin(), words[i].end()));
}
return true; return true;
} }
bool HMMSegment::viterbi(const vector<uint16_t>& unico, vector<uint>& status) bool HMMSegment::viterbi(const Unicode& unico, vector<uint>& status)
{ {
if(unico.empty()) if(unico.empty())
{ {
@ -193,17 +206,12 @@ namespace CppJieba
{ {
old = x - 1 + preY * X; old = x - 1 + preY * X;
tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], unico[x], MIN_DOUBLE); tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], unico[x], MIN_DOUBLE);
//cout<<MIN_DOUBLE+MIN_DOUBLE+MIN_DOUBLE<<endl;
//cout<<weight[old]<<":"<<_transProb[preY][y]<<":"<<_getEmitProb(_emitProbVec[y], unico[x], MIN_DOUBLE)<<endl;
//cout<<tmp<<endl;
if(tmp > weight[now]) if(tmp > weight[now])
{ {
weight[now] = tmp; weight[now] = tmp;
path[now] = preY; path[now] = preY;
} }
} }
//cout<<x<<":"<<y<<":"<<weight[now]<<endl;
//getchar();
} }
} }
@ -278,7 +286,7 @@ namespace CppJieba
bool HMMSegment::_decodeOne(const string& str, uint16_t& res) bool HMMSegment::_decodeOne(const string& str, uint16_t& res)
{ {
vector<uint16_t> ui16; Unicode ui16;
if(!TransCode::decode(str, ui16) || ui16.size() != 1) if(!TransCode::decode(str, ui16) || ui16.size() != 1)
{ {
return false; return false;

View File

@ -13,12 +13,13 @@ namespace CppJieba
using namespace CPPCOMMON; using namespace CPPCOMMON;
class HMMSegment class HMMSegment
{ {
private: public:
/* /*
* STATUS: * STATUS:
* 0:B, 1:E, 2:M, 3:S * 0:B, 1:E, 2:M, 3:S
* */ * */
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
private:
char _statMap[STATUS_SUM]; char _statMap[STATUS_SUM];
double _startProb[STATUS_SUM]; double _startProb[STATUS_SUM];
double _transProb[STATUS_SUM][STATUS_SUM]; double _transProb[STATUS_SUM][STATUS_SUM];
@ -35,8 +36,10 @@ namespace CppJieba
bool init(const char* const modelPath); bool init(const char* const modelPath);
bool dispose(); bool dispose();
public: public:
bool cut(const Unicode& unico, vector<Unicode>& res);
bool cut(const string& str, vector<string>& res); bool cut(const string& str, vector<string>& res);
bool viterbi(const vector<uint16_t>& unico, vector<uint>& status); bool viterbi(const Unicode& unico, vector<uint>& status);
private: private:
bool _loadModel(const char* const filePath); bool _loadModel(const char* const filePath);
bool _getLine(ifstream& ifile, string& line); bool _getLine(ifstream& ifile, string& line);

View File

@ -40,10 +40,48 @@ namespace CppJieba
LogError("_mpSeg cutDAG failed."); LogError("_mpSeg cutDAG failed.");
return false; return false;
} }
res.clear();
Unicode unico;
vector<Unicode> hmmRes;
for(uint i= 0; i < infos.size(); i++) for(uint i= 0; i < infos.size(); i++)
{ {
if(1 == infos[i].word.size())
{
unico.push_back(infos[i].word[0]);
}
else
{
if(!unico.empty())
{
if(!_hmmSeg.cut(unico, hmmRes))
{
LogError("_hmmSeg cut failed.");
return false;
}
for(uint j = 0; j < hmmRes.size(); j++)
{
res.push_back(TransCode::encode(hmmRes[j]));
}
}
unico.clear();
res.push_back(TransCode::encode(infos[i].word));
}
} }
if(!unico.empty())
{
if(!_hmmSeg.cut(unico, hmmRes))
{
LogError("_hmmSeg cut failed.");
return false;
}
for(uint j = 0; j < hmmRes.size(); j++)
{
res.push_back(TransCode::encode(hmmRes[j]));
}
}
return true; return true;
} }
} }

View File

@ -111,10 +111,15 @@ namespace CppJieba
} }
return true; return true;
} }
// Whole-sequence overload: encodes every code unit of `sentence` by
// delegating to the iterator-range overload of encode().
string TransCode::encode(const Unicode& sentence)
{
    Unicode::const_iterator first = sentence.begin();
    Unicode::const_iterator last = sentence.end();
    return encode(first, last);
}
string TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end) string TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end)
{ {
if(NULL == _pf_encode) if(!_pf_encode)
{ {
return ""; return "";
} }

View File

@ -38,6 +38,7 @@ namespace CppJieba
public: public:
static bool decode(const string& str, vector<uint16_t>& vec); static bool decode(const string& str, vector<uint16_t>& vec);
static string encode(Unicode::const_iterator begin, Unicode::const_iterator end); static string encode(Unicode::const_iterator begin, Unicode::const_iterator end);
static string encode(const Unicode& sentence);
//static size_t getWordLength(const string& str); //static size_t getWordLength(const string& str);
public: public:
static bool gbkToVec(const string& str, vector<uint16_t>& vec); static bool gbkToVec(const string& str, vector<uint16_t>& vec);

View File

@ -12,5 +12,6 @@
#include "Trie.h" #include "Trie.h"
#include "TransCode.h" #include "TransCode.h"
#include "HMMSegment.h" #include "HMMSegment.h"
#include "MixSegment.h"
#endif #endif