mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Finished MixSegment.cpp/h and modified some functions to adapt them to MixSegment
This commit is contained in:
parent
a10876f808
commit
4ec1ad8031
@ -6,6 +6,7 @@ using namespace CppJieba;
|
|||||||
|
|
||||||
MPSegment seg;
|
MPSegment seg;
|
||||||
HMMSegment hmmseg;
|
HMMSegment hmmseg;
|
||||||
|
MixSegment mixseg;
|
||||||
bool init(const char * const dictPath, const char * const modelPath)
|
bool init(const char * const dictPath, const char * const modelPath)
|
||||||
{
|
{
|
||||||
if(!seg.init(dictPath))
|
if(!seg.init(dictPath))
|
||||||
@ -20,10 +21,16 @@ bool init(const char * const dictPath, const char * const modelPath)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(!mixseg.init(dictPath, modelPath))
|
||||||
|
{
|
||||||
|
cout<<"mixseg init failed."<<endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void cut(const char * const filePath)
|
void cutMP(const char * const filePath)
|
||||||
{
|
{
|
||||||
ifstream ifile(filePath);
|
ifstream ifile(filePath);
|
||||||
vector<string> res;
|
vector<string> res;
|
||||||
@ -53,14 +60,16 @@ void cutHMM(const char * const filePath)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cutAll(const char* const filePath)
|
void cutMix(const char* const filePath)
|
||||||
{
|
{
|
||||||
ifstream ifs(filePath);
|
ifstream ifs(filePath);
|
||||||
vector<TrieNodeInfo> res;
|
vector<string> res;
|
||||||
string line;
|
string line;
|
||||||
while(getline(ifs, line))
|
while(getline(ifs, line))
|
||||||
{
|
{
|
||||||
seg.cut(line, res);
|
mixseg.cut(line, res);
|
||||||
|
cout<<line<<endl;
|
||||||
|
cout<<vecToString(res)<<endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -76,6 +85,11 @@ bool dispose()
|
|||||||
cout<<"seg dispose failed."<<endl;
|
cout<<"seg dispose failed."<<endl;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if(!mixseg.dispose())
|
||||||
|
{
|
||||||
|
cout<<"seg dispose failed."<<endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -88,13 +102,14 @@ int main(int argc, char ** argv)
|
|||||||
{
|
{
|
||||||
cout<<"usage: \n\t"<<argv[0]<<"[options] <filename>\n"
|
cout<<"usage: \n\t"<<argv[0]<<"[options] <filename>\n"
|
||||||
<<"options:\n"
|
<<"options:\n"
|
||||||
<<"\t--algorithm\tSupported encoding methods are [cutDAG, cutHMM] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
|
<<"\t--algorithm\tSupported encoding methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
|
||||||
<<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<'\n'
|
<<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<'\n'
|
||||||
<<"\t--modelpath\tIf not specified, the default is "<<DEFAULT_MODELPATH<<'\n'
|
<<"\t--modelpath\tIf not specified, the default is "<<DEFAULT_MODELPATH<<'\n'
|
||||||
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf not specified, the default is utf8.\n"
|
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf not specified, the default is utf8.\n"
|
||||||
<<"example:\n"
|
<<"example:\n"
|
||||||
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
|
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
|
||||||
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
|
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
|
||||||
|
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
|
||||||
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
|
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
|
||||||
<<endl;
|
<<endl;
|
||||||
|
|
||||||
@ -131,10 +146,14 @@ int main(int argc, char ** argv)
|
|||||||
{
|
{
|
||||||
cutHMM(arg[1].c_str());
|
cutHMM(arg[1].c_str());
|
||||||
}
|
}
|
||||||
else
|
else if("cutMix" == algorithm)
|
||||||
{
|
{
|
||||||
cut(arg[1].c_str());
|
cutMix(arg[1].c_str());
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
cutMix(arg[1].c_str());
|
||||||
|
}
|
||||||
dispose();
|
dispose();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -104,46 +104,59 @@ namespace CppJieba
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool HMMSegment::cut(const string& str, vector<string>& res)
|
|
||||||
{
|
bool HMMSegment::cut(const Unicode& unico, vector<Unicode>& res)
|
||||||
if(str.empty())
|
{
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
vector<uint16_t> unico;
|
|
||||||
vector<uint> status;
|
vector<uint> status;
|
||||||
vector<uint16_t>::iterator begin, left, right;
|
|
||||||
if(!TransCode::decode(str, unico))
|
|
||||||
|
|
||||||
{
|
|
||||||
LogError("TransCode failed.");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(!viterbi(unico, status))
|
if(!viterbi(unico, status))
|
||||||
{
|
{
|
||||||
LogError("viterbi failed.");
|
LogError("viterbi failed.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
//cout<<encodeing(status)<<endl;
|
|
||||||
begin = unico.begin();
|
Unicode::const_iterator begin = unico.begin();
|
||||||
left = begin;
|
Unicode::const_iterator left = begin;
|
||||||
|
Unicode::const_iterator right;
|
||||||
res.clear();
|
res.clear();
|
||||||
for(uint i =0; i< status.size(); i++)
|
for(uint i =0; i< status.size(); i++)
|
||||||
{
|
{
|
||||||
if(status[i] % 2) //if(E == status[i] || S == status[i])
|
if(status[i] % 2) //if(E == status[i] || S == status[i])
|
||||||
{
|
{
|
||||||
right = begin + i + 1;
|
right = begin + i + 1;
|
||||||
res.push_back(TransCode::encode(left, right));
|
res.push_back(Unicode(left, right));
|
||||||
left = right;
|
left = right;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool HMMSegment::cut(const string& str, vector<string>& res)
|
||||||
|
{
|
||||||
|
if(str.empty())
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
Unicode unico;
|
||||||
|
if(!TransCode::decode(str, unico))
|
||||||
|
|
||||||
|
{
|
||||||
|
LogError("TransCode failed.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
vector<Unicode> words;
|
||||||
|
if(!cut(unico, words))
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
res.clear();
|
||||||
|
for(uint i = 0; i < words.size(); i++)
|
||||||
|
{
|
||||||
|
res.push_back(TransCode::encode(words[i].begin(), words[i].end()));
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool HMMSegment::viterbi(const vector<uint16_t>& unico, vector<uint>& status)
|
bool HMMSegment::viterbi(const Unicode& unico, vector<uint>& status)
|
||||||
{
|
{
|
||||||
if(unico.empty())
|
if(unico.empty())
|
||||||
{
|
{
|
||||||
@ -193,17 +206,12 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
old = x - 1 + preY * X;
|
old = x - 1 + preY * X;
|
||||||
tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], unico[x], MIN_DOUBLE);
|
tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], unico[x], MIN_DOUBLE);
|
||||||
//cout<<MIN_DOUBLE+MIN_DOUBLE+MIN_DOUBLE<<endl;
|
|
||||||
//cout<<weight[old]<<":"<<_transProb[preY][y]<<":"<<_getEmitProb(_emitProbVec[y], unico[x], MIN_DOUBLE)<<endl;
|
|
||||||
//cout<<tmp<<endl;
|
|
||||||
if(tmp > weight[now])
|
if(tmp > weight[now])
|
||||||
{
|
{
|
||||||
weight[now] = tmp;
|
weight[now] = tmp;
|
||||||
path[now] = preY;
|
path[now] = preY;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//cout<<x<<":"<<y<<":"<<weight[now]<<endl;
|
|
||||||
//getchar();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -278,7 +286,7 @@ namespace CppJieba
|
|||||||
|
|
||||||
bool HMMSegment::_decodeOne(const string& str, uint16_t& res)
|
bool HMMSegment::_decodeOne(const string& str, uint16_t& res)
|
||||||
{
|
{
|
||||||
vector<uint16_t> ui16;
|
Unicode ui16;
|
||||||
if(!TransCode::decode(str, ui16) || ui16.size() != 1)
|
if(!TransCode::decode(str, ui16) || ui16.size() != 1)
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
|
@ -13,12 +13,13 @@ namespace CppJieba
|
|||||||
using namespace CPPCOMMON;
|
using namespace CPPCOMMON;
|
||||||
class HMMSegment
|
class HMMSegment
|
||||||
{
|
{
|
||||||
private:
|
public:
|
||||||
/*
|
/*
|
||||||
* STATUS:
|
* STATUS:
|
||||||
* 0:B, 1:E, 2:M, 3:S
|
* 0:B, 1:E, 2:M, 3:S
|
||||||
* */
|
* */
|
||||||
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
|
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
|
||||||
|
private:
|
||||||
char _statMap[STATUS_SUM];
|
char _statMap[STATUS_SUM];
|
||||||
double _startProb[STATUS_SUM];
|
double _startProb[STATUS_SUM];
|
||||||
double _transProb[STATUS_SUM][STATUS_SUM];
|
double _transProb[STATUS_SUM][STATUS_SUM];
|
||||||
@ -35,8 +36,10 @@ namespace CppJieba
|
|||||||
bool init(const char* const modelPath);
|
bool init(const char* const modelPath);
|
||||||
bool dispose();
|
bool dispose();
|
||||||
public:
|
public:
|
||||||
|
bool cut(const Unicode& unico, vector<Unicode>& res);
|
||||||
bool cut(const string& str, vector<string>& res);
|
bool cut(const string& str, vector<string>& res);
|
||||||
bool viterbi(const vector<uint16_t>& unico, vector<uint>& status);
|
bool viterbi(const Unicode& unico, vector<uint>& status);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool _loadModel(const char* const filePath);
|
bool _loadModel(const char* const filePath);
|
||||||
bool _getLine(ifstream& ifile, string& line);
|
bool _getLine(ifstream& ifile, string& line);
|
||||||
|
@ -40,10 +40,48 @@ namespace CppJieba
|
|||||||
LogError("_mpSeg cutDAG failed.");
|
LogError("_mpSeg cutDAG failed.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
res.clear();
|
||||||
|
Unicode unico;
|
||||||
|
vector<Unicode> hmmRes;
|
||||||
for(uint i= 0; i < infos.size(); i++)
|
for(uint i= 0; i < infos.size(); i++)
|
||||||
{
|
{
|
||||||
|
if(1 == infos[i].word.size())
|
||||||
|
{
|
||||||
|
unico.push_back(infos[i].word[0]);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if(!unico.empty())
|
||||||
|
{
|
||||||
|
if(!_hmmSeg.cut(unico, hmmRes))
|
||||||
|
{
|
||||||
|
LogError("_hmmSeg cut failed.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
for(uint j = 0; j < hmmRes.size(); j++)
|
||||||
|
{
|
||||||
|
res.push_back(TransCode::encode(hmmRes[j]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
unico.clear();
|
||||||
|
|
||||||
|
res.push_back(TransCode::encode(infos[i].word));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
if(!unico.empty())
|
||||||
|
{
|
||||||
|
if(!_hmmSeg.cut(unico, hmmRes))
|
||||||
|
{
|
||||||
|
LogError("_hmmSeg cut failed.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
for(uint j = 0; j < hmmRes.size(); j++)
|
||||||
|
{
|
||||||
|
res.push_back(TransCode::encode(hmmRes[j]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -111,10 +111,15 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
string TransCode::encode(const Unicode& sentence)
|
||||||
|
{
|
||||||
|
return encode(sentence.begin(), sentence.end());
|
||||||
|
}
|
||||||
|
|
||||||
string TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end)
|
string TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end)
|
||||||
{
|
{
|
||||||
if(NULL == _pf_encode)
|
if(!_pf_encode)
|
||||||
{
|
{
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
@ -38,6 +38,7 @@ namespace CppJieba
|
|||||||
public:
|
public:
|
||||||
static bool decode(const string& str, vector<uint16_t>& vec);
|
static bool decode(const string& str, vector<uint16_t>& vec);
|
||||||
static string encode(Unicode::const_iterator begin, Unicode::const_iterator end);
|
static string encode(Unicode::const_iterator begin, Unicode::const_iterator end);
|
||||||
|
static string encode(const Unicode& sentence);
|
||||||
//static size_t getWordLength(const string& str);
|
//static size_t getWordLength(const string& str);
|
||||||
public:
|
public:
|
||||||
static bool gbkToVec(const string& str, vector<uint16_t>& vec);
|
static bool gbkToVec(const string& str, vector<uint16_t>& vec);
|
||||||
|
@ -12,5 +12,6 @@
|
|||||||
#include "Trie.h"
|
#include "Trie.h"
|
||||||
#include "TransCode.h"
|
#include "TransCode.h"
|
||||||
#include "HMMSegment.h"
|
#include "HMMSegment.h"
|
||||||
|
#include "MixSegment.h"
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user