Finished MixSegment.cpp/h and modified some functions to adapt them to MixSegment

This commit is contained in:
wyy 2013-09-09 16:34:38 +08:00
parent a10876f808
commit 4ec1ad8031
7 changed files with 113 additions and 38 deletions

View File

@ -6,6 +6,7 @@ using namespace CppJieba;
MPSegment seg;
HMMSegment hmmseg;
MixSegment mixseg;
bool init(const char * const dictPath, const char * const modelPath)
{
if(!seg.init(dictPath))
@ -20,10 +21,16 @@ bool init(const char * const dictPath, const char * const modelPath)
return false;
}
if(!mixseg.init(dictPath, modelPath))
{
cout<<"mixseg init failed."<<endl;
return false;
}
return true;
}
void cut(const char * const filePath)
void cutMP(const char * const filePath)
{
ifstream ifile(filePath);
vector<string> res;
@ -53,14 +60,16 @@ void cutHMM(const char * const filePath)
}
}
void cutAll(const char* const filePath)
void cutMix(const char* const filePath)
{
ifstream ifs(filePath);
vector<TrieNodeInfo> res;
vector<string> res;
string line;
while(getline(ifs, line))
{
seg.cut(line, res);
mixseg.cut(line, res);
cout<<line<<endl;
cout<<vecToString(res)<<endl;
}
}
@ -76,6 +85,11 @@ bool dispose()
cout<<"seg dispose failed."<<endl;
return false;
}
if(!mixseg.dispose())
{
cout<<"seg dispose failed."<<endl;
return false;
}
return true;
}
@ -88,13 +102,14 @@ int main(int argc, char ** argv)
{
cout<<"usage: \n\t"<<argv[0]<<"[options] <filename>\n"
<<"options:\n"
<<"\t--algorithm\tSupported encoding methods are [cutDAG, cutHMM] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
<<"\t--algorithm\tSupported encoding methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
<<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<'\n'
<<"\t--modelpath\tIf not specified, the default is "<<DEFAULT_MODELPATH<<'\n'
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf not specified, the default is utf8.\n"
<<"example:\n"
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
<<endl;
@ -131,9 +146,13 @@ int main(int argc, char ** argv)
{
cutHMM(arg[1].c_str());
}
else if("cutMix" == algorithm)
{
cutMix(arg[1].c_str());
}
else
{
cut(arg[1].c_str());
cutMix(arg[1].c_str());
}
dispose();
return 0;

View File

@ -104,46 +104,59 @@ namespace CppJieba
return true;
}
bool HMMSegment::cut(const string& str, vector<string>& res)
bool HMMSegment::cut(const Unicode& unico, vector<Unicode>& res)
{
if(str.empty())
{
return false;
}
vector<uint16_t> unico;
vector<uint> status;
vector<uint16_t>::iterator begin, left, right;
if(!TransCode::decode(str, unico))
{
LogError("TransCode failed.");
return false;
}
if(!viterbi(unico, status))
{
LogError("viterbi failed.");
return false;
}
//cout<<encodeing(status)<<endl;
begin = unico.begin();
left = begin;
Unicode::const_iterator begin = unico.begin();
Unicode::const_iterator left = begin;
Unicode::const_iterator right;
res.clear();
for(uint i =0; i< status.size(); i++)
{
if(status[i] % 2) //if(E == status[i] || S == status[i])
{
right = begin + i + 1;
res.push_back(TransCode::encode(left, right));
res.push_back(Unicode(left, right));
left = right;
}
}
return true;
}
bool HMMSegment::viterbi(const vector<uint16_t>& unico, vector<uint>& status)
bool HMMSegment::cut(const string& str, vector<string>& res)
{
if(str.empty())
{
return false;
}
Unicode unico;
if(!TransCode::decode(str, unico))
{
LogError("TransCode failed.");
return false;
}
vector<Unicode> words;
if(!cut(unico, words))
{
return false;
}
res.clear();
for(uint i = 0; i < words.size(); i++)
{
res.push_back(TransCode::encode(words[i].begin(), words[i].end()));
}
return true;
}
bool HMMSegment::viterbi(const Unicode& unico, vector<uint>& status)
{
if(unico.empty())
{
@ -193,17 +206,12 @@ namespace CppJieba
{
old = x - 1 + preY * X;
tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], unico[x], MIN_DOUBLE);
//cout<<MIN_DOUBLE+MIN_DOUBLE+MIN_DOUBLE<<endl;
//cout<<weight[old]<<":"<<_transProb[preY][y]<<":"<<_getEmitProb(_emitProbVec[y], unico[x], MIN_DOUBLE)<<endl;
//cout<<tmp<<endl;
if(tmp > weight[now])
{
weight[now] = tmp;
path[now] = preY;
}
}
//cout<<x<<":"<<y<<":"<<weight[now]<<endl;
//getchar();
}
}
@ -278,7 +286,7 @@ namespace CppJieba
bool HMMSegment::_decodeOne(const string& str, uint16_t& res)
{
vector<uint16_t> ui16;
Unicode ui16;
if(!TransCode::decode(str, ui16) || ui16.size() != 1)
{
return false;

View File

@ -13,12 +13,13 @@ namespace CppJieba
using namespace CPPCOMMON;
class HMMSegment
{
private:
public:
/*
* STATUS:
* 0:B, 1:E, 2:M, 3:S
* */
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
private:
char _statMap[STATUS_SUM];
double _startProb[STATUS_SUM];
double _transProb[STATUS_SUM][STATUS_SUM];
@ -35,8 +36,10 @@ namespace CppJieba
bool init(const char* const modelPath);
bool dispose();
public:
bool cut(const Unicode& unico, vector<Unicode>& res);
bool cut(const string& str, vector<string>& res);
bool viterbi(const vector<uint16_t>& unico, vector<uint>& status);
bool viterbi(const Unicode& unico, vector<uint>& status);
private:
bool _loadModel(const char* const filePath);
bool _getLine(ifstream& ifile, string& line);

View File

@ -40,10 +40,48 @@ namespace CppJieba
LogError("_mpSeg cutDAG failed.");
return false;
}
res.clear();
Unicode unico;
vector<Unicode> hmmRes;
for(uint i= 0; i < infos.size(); i++)
{
if(1 == infos[i].word.size())
{
unico.push_back(infos[i].word[0]);
}
else
{
if(!unico.empty())
{
if(!_hmmSeg.cut(unico, hmmRes))
{
LogError("_hmmSeg cut failed.");
return false;
}
for(uint j = 0; j < hmmRes.size(); j++)
{
res.push_back(TransCode::encode(hmmRes[j]));
}
}
unico.clear();
res.push_back(TransCode::encode(infos[i].word));
}
}
if(!unico.empty())
{
if(!_hmmSeg.cut(unico, hmmRes))
{
LogError("_hmmSeg cut failed.");
return false;
}
for(uint j = 0; j < hmmRes.size(); j++)
{
res.push_back(TransCode::encode(hmmRes[j]));
}
}
return true;
}
}

View File

@ -112,9 +112,14 @@ namespace CppJieba
return true;
}
// Convenience overload: encodes the entire sentence by forwarding its
// begin()/end() range to the iterator-based encode() overload and returning
// that overload's result unchanged.
string TransCode::encode(const Unicode& sentence)
{
return encode(sentence.begin(), sentence.end());
}
string TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end)
{
if(NULL == _pf_encode)
if(!_pf_encode)
{
return "";
}

View File

@ -38,6 +38,7 @@ namespace CppJieba
public:
static bool decode(const string& str, vector<uint16_t>& vec);
static string encode(Unicode::const_iterator begin, Unicode::const_iterator end);
static string encode(const Unicode& sentence);
//static size_t getWordLength(const string& str);
public:
static bool gbkToVec(const string& str, vector<uint16_t>& vec);

View File

@ -12,5 +12,6 @@
#include "Trie.h"
#include "TransCode.h"
#include "HMMSegment.h"
#include "MixSegment.h"
#endif