mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
finished mixsegment.cpp/h and modified some functions to adapt to mixsegment
This commit is contained in:
parent
a10876f808
commit
4ec1ad8031
@ -6,6 +6,7 @@ using namespace CppJieba;
|
||||
|
||||
MPSegment seg;
|
||||
HMMSegment hmmseg;
|
||||
MixSegment mixseg;
|
||||
bool init(const char * const dictPath, const char * const modelPath)
|
||||
{
|
||||
if(!seg.init(dictPath))
|
||||
@ -20,10 +21,16 @@ bool init(const char * const dictPath, const char * const modelPath)
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!mixseg.init(dictPath, modelPath))
|
||||
{
|
||||
cout<<"mixseg init failed."<<endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void cut(const char * const filePath)
|
||||
void cutMP(const char * const filePath)
|
||||
{
|
||||
ifstream ifile(filePath);
|
||||
vector<string> res;
|
||||
@ -53,14 +60,16 @@ void cutHMM(const char * const filePath)
|
||||
}
|
||||
}
|
||||
|
||||
void cutAll(const char* const filePath)
|
||||
void cutMix(const char* const filePath)
|
||||
{
|
||||
ifstream ifs(filePath);
|
||||
vector<TrieNodeInfo> res;
|
||||
vector<string> res;
|
||||
string line;
|
||||
while(getline(ifs, line))
|
||||
{
|
||||
seg.cut(line, res);
|
||||
mixseg.cut(line, res);
|
||||
cout<<line<<endl;
|
||||
cout<<vecToString(res)<<endl;
|
||||
}
|
||||
}
|
||||
|
||||
@ -76,6 +85,11 @@ bool dispose()
|
||||
cout<<"seg dispose failed."<<endl;
|
||||
return false;
|
||||
}
|
||||
if(!mixseg.dispose())
|
||||
{
|
||||
cout<<"seg dispose failed."<<endl;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -88,13 +102,14 @@ int main(int argc, char ** argv)
|
||||
{
|
||||
cout<<"usage: \n\t"<<argv[0]<<"[options] <filename>\n"
|
||||
<<"options:\n"
|
||||
<<"\t--algorithm\tSupported encoding methods are [cutDAG, cutHMM] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
|
||||
<<"\t--algorithm\tSupported encoding methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
|
||||
<<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<'\n'
|
||||
<<"\t--modelpath\tIf not specified, the default is "<<DEFAULT_MODELPATH<<'\n'
|
||||
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf not specified, the default is utf8.\n"
|
||||
<<"example:\n"
|
||||
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
|
||||
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
|
||||
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
|
||||
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
|
||||
<<endl;
|
||||
|
||||
@ -131,10 +146,14 @@ int main(int argc, char ** argv)
|
||||
{
|
||||
cutHMM(arg[1].c_str());
|
||||
}
|
||||
else
|
||||
else if("cutMix" == algorithm)
|
||||
{
|
||||
cut(arg[1].c_str());
|
||||
cutMix(arg[1].c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
cutMix(arg[1].c_str());
|
||||
}
|
||||
dispose();
|
||||
return 0;
|
||||
}
|
||||
|
@ -104,46 +104,59 @@ namespace CppJieba
|
||||
return true;
|
||||
}
|
||||
|
||||
bool HMMSegment::cut(const string& str, vector<string>& res)
|
||||
{
|
||||
if(str.empty())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
vector<uint16_t> unico;
|
||||
|
||||
bool HMMSegment::cut(const Unicode& unico, vector<Unicode>& res)
|
||||
{
|
||||
vector<uint> status;
|
||||
vector<uint16_t>::iterator begin, left, right;
|
||||
if(!TransCode::decode(str, unico))
|
||||
|
||||
{
|
||||
LogError("TransCode failed.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!viterbi(unico, status))
|
||||
{
|
||||
LogError("viterbi failed.");
|
||||
return false;
|
||||
}
|
||||
//cout<<encodeing(status)<<endl;
|
||||
begin = unico.begin();
|
||||
left = begin;
|
||||
|
||||
Unicode::const_iterator begin = unico.begin();
|
||||
Unicode::const_iterator left = begin;
|
||||
Unicode::const_iterator right;
|
||||
res.clear();
|
||||
for(uint i =0; i< status.size(); i++)
|
||||
{
|
||||
if(status[i] % 2) //if(E == status[i] || S == status[i])
|
||||
{
|
||||
right = begin + i + 1;
|
||||
res.push_back(TransCode::encode(left, right));
|
||||
res.push_back(Unicode(left, right));
|
||||
left = right;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Segments an encoded string into a list of encoded words.
//
// The input is decoded into a Unicode sequence with TransCode::decode, the
// sequence is split by calling cut() on it, and every resulting word is
// encoded back with TransCode::encode and appended to `res`.
//
// str: text to segment.
// res: output words; cleared and refilled only after decoding and cutting
//      have both succeeded, otherwise left untouched.
// Returns false if `str` is empty, if decoding fails (an error is logged),
// or if the inner cut() call fails; returns true otherwise.
bool HMMSegment::cut(const string& str, vector<string>& res)
|
||||
{
|
||||
if(str.empty())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
Unicode unico;
|
||||
// Convert the encoded input into its Unicode representation.
if(!TransCode::decode(str, unico))
|
||||
|
||||
{
|
||||
LogError("TransCode failed.");
|
||||
return false;
|
||||
}
|
||||
vector<Unicode> words;
|
||||
// Segment the decoded sequence; each element of `words` is one word.
if(!cut(unico, words))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
res.clear();
|
||||
// Encode every segmented word back into a string for the caller.
for(uint i = 0; i < words.size(); i++)
|
||||
{
|
||||
res.push_back(TransCode::encode(words[i].begin(), words[i].end()));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool HMMSegment::viterbi(const vector<uint16_t>& unico, vector<uint>& status)
|
||||
bool HMMSegment::viterbi(const Unicode& unico, vector<uint>& status)
|
||||
{
|
||||
if(unico.empty())
|
||||
{
|
||||
@ -193,17 +206,12 @@ namespace CppJieba
|
||||
{
|
||||
old = x - 1 + preY * X;
|
||||
tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], unico[x], MIN_DOUBLE);
|
||||
//cout<<MIN_DOUBLE+MIN_DOUBLE+MIN_DOUBLE<<endl;
|
||||
//cout<<weight[old]<<":"<<_transProb[preY][y]<<":"<<_getEmitProb(_emitProbVec[y], unico[x], MIN_DOUBLE)<<endl;
|
||||
//cout<<tmp<<endl;
|
||||
if(tmp > weight[now])
|
||||
{
|
||||
weight[now] = tmp;
|
||||
path[now] = preY;
|
||||
}
|
||||
}
|
||||
//cout<<x<<":"<<y<<":"<<weight[now]<<endl;
|
||||
//getchar();
|
||||
}
|
||||
}
|
||||
|
||||
@ -278,7 +286,7 @@ namespace CppJieba
|
||||
|
||||
bool HMMSegment::_decodeOne(const string& str, uint16_t& res)
|
||||
{
|
||||
vector<uint16_t> ui16;
|
||||
Unicode ui16;
|
||||
if(!TransCode::decode(str, ui16) || ui16.size() != 1)
|
||||
{
|
||||
return false;
|
||||
|
@ -13,12 +13,13 @@ namespace CppJieba
|
||||
using namespace CPPCOMMON;
|
||||
class HMMSegment
|
||||
{
|
||||
private:
|
||||
public:
|
||||
/*
|
||||
* STATUS:
|
||||
* 0:B, 1:E, 2:M, 3:S
|
||||
* */
|
||||
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
|
||||
private:
|
||||
char _statMap[STATUS_SUM];
|
||||
double _startProb[STATUS_SUM];
|
||||
double _transProb[STATUS_SUM][STATUS_SUM];
|
||||
@ -35,8 +36,10 @@ namespace CppJieba
|
||||
bool init(const char* const modelPath);
|
||||
bool dispose();
|
||||
public:
|
||||
bool cut(const Unicode& unico, vector<Unicode>& res);
|
||||
bool cut(const string& str, vector<string>& res);
|
||||
bool viterbi(const vector<uint16_t>& unico, vector<uint>& status);
|
||||
bool viterbi(const Unicode& unico, vector<uint>& status);
|
||||
|
||||
private:
|
||||
bool _loadModel(const char* const filePath);
|
||||
bool _getLine(ifstream& ifile, string& line);
|
||||
|
@ -40,10 +40,48 @@ namespace CppJieba
|
||||
LogError("_mpSeg cutDAG failed.");
|
||||
return false;
|
||||
}
|
||||
res.clear();
|
||||
Unicode unico;
|
||||
vector<Unicode> hmmRes;
|
||||
for(uint i= 0; i < infos.size(); i++)
|
||||
{
|
||||
if(1 == infos[i].word.size())
|
||||
{
|
||||
unico.push_back(infos[i].word[0]);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(!unico.empty())
|
||||
{
|
||||
if(!_hmmSeg.cut(unico, hmmRes))
|
||||
{
|
||||
LogError("_hmmSeg cut failed.");
|
||||
return false;
|
||||
}
|
||||
for(uint j = 0; j < hmmRes.size(); j++)
|
||||
{
|
||||
res.push_back(TransCode::encode(hmmRes[j]));
|
||||
}
|
||||
}
|
||||
unico.clear();
|
||||
|
||||
res.push_back(TransCode::encode(infos[i].word));
|
||||
}
|
||||
|
||||
}
|
||||
if(!unico.empty())
|
||||
{
|
||||
if(!_hmmSeg.cut(unico, hmmRes))
|
||||
{
|
||||
LogError("_hmmSeg cut failed.");
|
||||
return false;
|
||||
}
|
||||
for(uint j = 0; j < hmmRes.size(); j++)
|
||||
{
|
||||
res.push_back(TransCode::encode(hmmRes[j]));
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -111,10 +111,15 @@ namespace CppJieba
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Convenience overload: encodes a whole Unicode sequence by forwarding its
// begin()/end() iterators to the iterator-range encode() and returning that
// result unchanged.
string TransCode::encode(const Unicode& sentence)
|
||||
{
|
||||
return encode(sentence.begin(), sentence.end());
|
||||
}
|
||||
|
||||
string TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end)
|
||||
{
|
||||
if(NULL == _pf_encode)
|
||||
if(!_pf_encode)
|
||||
{
|
||||
return "";
|
||||
}
|
||||
|
@ -38,6 +38,7 @@ namespace CppJieba
|
||||
public:
|
||||
static bool decode(const string& str, vector<uint16_t>& vec);
|
||||
static string encode(Unicode::const_iterator begin, Unicode::const_iterator end);
|
||||
static string encode(const Unicode& sentence);
|
||||
//static size_t getWordLength(const string& str);
|
||||
public:
|
||||
static bool gbkToVec(const string& str, vector<uint16_t>& vec);
|
||||
|
@ -12,5 +12,6 @@
|
||||
#include "Trie.h"
|
||||
#include "TransCode.h"
|
||||
#include "HMMSegment.h"
|
||||
#include "MixSegment.h"
|
||||
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user