Remove the UTF-8 transcoding from TransCode.h and delete the static class TransCode, using a namespace TransCode instead with its functions inlined, so the .cpp file is removed and only TransCode.h is used.

This commit is contained in:
wyy 2013-09-18 10:21:10 +08:00
parent 538c0119ba
commit 961575e339
7 changed files with 89 additions and 265 deletions

View File

@ -1,181 +0,0 @@
#include "TransCode.h"
namespace CppJieba
{
// Out-of-class definitions of TransCode's static data members.
// _encVec is defined here but not referenced by any function in this file.
vector<string> TransCode::_encVec;
// Initialised by calling init(), which installs the GBK codec pair into
// _pf_decode/_pf_encode and returns true.
bool TransCode::_isInitted = TransCode::init();
// NOTE(review): these NULL initialisers are constant initialisation, which the
// language performs before the dynamic init() call above, so they should not
// overwrite the pointers that init() sets -- confirm on the target toolchain.
TransCode::pf_decode_t TransCode::_pf_decode = NULL;
TransCode::pf_encode_t TransCode::_pf_encode = NULL;
bool TransCode::init()
{
_pf_decode = gbkToVec;
_pf_encode = vecToGbk;
return true;
}
// Intentionally empty. The constructor is declared private in TransCode.h and
// every other member of the class is static, so no instance state exists.
TransCode::TransCode()
{
}
// Intentionally empty: the class holds no per-instance resources to release.
TransCode::~TransCode()
{
}
// Points the global codec pair at the GBK implementations, so subsequent
// decode()/encode() calls dispatch to gbkToVec()/vecToGbk().
void TransCode::setGbkEnc()
{
    _pf_encode = vecToGbk;
    _pf_decode = gbkToVec;
}
// Points the global codec pair at the UTF-8 implementations, so subsequent
// decode()/encode() calls dispatch to utf8ToVec()/vecToUtf8().
void TransCode::setUtf8Enc()
{
    _pf_encode = vecToUtf8;
    _pf_decode = utf8ToVec;
}
bool TransCode::utf8ToVec(const string& str, vector<uint16_t>& vec)
{
char ch1, ch2;
if(str.empty())
{
return false;
}
vec.clear();
size_t siz = str.size();
for(uint i = 0;i < siz;)
{
if(!(str[i] & 0x80)) // 0xxxxxxx
{
vec.push_back(str[i]);
i++;
}
else if ((unsigned char)str[i] <= 0xdf && i + 1 < siz) // 110xxxxxx
{
ch1 = (str[i] >> 2) & 0x07;
ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
vec.push_back(twocharToUint16(ch1, ch2));
i += 2;
}
else if((unsigned char)str[i] <= 0xef && i + 2 < siz)
{
ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
vec.push_back(twocharToUint16(ch1, ch2));
i += 3;
}
else
{
return false;
}
}
return true;
}
// Splits the GBK byte string `str` into 16-bit units stored in `vec`.
//
// A byte with the high bit clear becomes one unit on its own; a byte with the
// high bit set is combined with the byte that follows it via
// twocharToUint16(). The second byte is not range-checked.
//
// `vec` is always cleared first. Returns false for empty input or when a
// high-bit byte is the last byte of the string; true otherwise.
bool TransCode::gbkToVec(const string& str, vector<uint16_t>& vec)
{
    vec.clear();
    if(str.empty())
    {
        return false;
    }
    for(uint pos = 0; pos < str.size();)
    {
        const char lead = str[pos];
        if(!(lead & 0x80))
        {
            // Single-byte character.
            vec.push_back(uint16_t(lead));
            ++pos;
            continue;
        }
        if(!(pos + 1 < str.size()))
        {
            // Lead byte of a two-byte character with nothing after it.
            return false;
        }
        vec.push_back(twocharToUint16(lead, str[pos + 1]));
        pos += 2;
    }
    return true;
}
// Encodes the 16-bit units in [begin, end) as UTF-8 into `res`.
//
// Values up to 0x7f produce one byte, values up to 0x7ff two bytes, and all
// remaining values three bytes.
//
// Returns false (leaving `res` untouched) when the range is empty or
// reversed; otherwise clears `res`, writes the encoding and returns true.
bool TransCode::vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
{
    if(begin >= end)
    {
        return false;
    }
    res.clear();
    for(Unicode::const_iterator it = begin; it != end; ++it)
    {
        const uint16_t unit = *it;
        if(unit > 0x7ff)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            res.push_back(char(((unit >> 12) & 0x0f) | 0xe0));
            res.push_back(char(((unit >> 6) & 0x3f) | 0x80));
            res.push_back(char((unit & 0x3f) | 0x80));
        }
        else if(unit > 0x7f)
        {
            // 110xxxxx 10xxxxxx
            res.push_back(char(((unit >> 6) & 0x1f) | 0xc0));
            res.push_back(char((unit & 0x3f) | 0x80));
        }
        else
        {
            // 0xxxxxxx
            res.push_back(char(unit));
        }
    }
    return true;
}
// Serialises the 16-bit units in [begin, end) back into a GBK byte string.
//
// Each unit is split into a byte pair by uint16ToChar2(). When the first byte
// of the pair has its high bit set both bytes are written; otherwise only the
// second byte is written (presumably the single-byte/ASCII case -- confirm
// against uint16ToChar2()).
//
// Returns false (leaving `res` untouched) when the range is empty or
// reversed; otherwise clears `res`, writes the bytes and returns true.
bool TransCode::vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
{
    if(begin >= end)
    {
        return false;
    }
    res.clear();
    for(Unicode::const_iterator it = begin; it != end; ++it)
    {
        const pair<char, char> bytes = uint16ToChar2(*it);
        if(bytes.first & 0x80)
        {
            res += bytes.first;
        }
        // The second byte is emitted in both the one- and two-byte cases.
        res += bytes.second;
    }
    return true;
}
}
#ifdef CPPJIEBA_TRANSCODE_UT
using namespace CPPCOMMON;
using namespace CppJieba;
int main()
{
string a("abd你好世界!a");
vector<uint16_t> vec;
//TransCode::setUtf8Enc();
cout<<TransCode::decode(a, vec)<<endl;
PRINT_VECTOR(vec);
cout<<TransCode::encode(vec.begin(), vec.end())<<endl;
return 0;
}
#endif

View File

@ -1,7 +1,6 @@
/************************************
* file enc : utf-8
* author : wuyanyi09@gmail.com
************************************/
************************************/
#ifndef CPPJIEBA_TRANSCODE_H
#define CPPJIEBA_TRANSCODE_H
@ -12,60 +11,71 @@
namespace CppJieba
{
using namespace CPPCOMMON;
// Static-only facade for converting between encoded byte strings and
// sequences of 16-bit units. The active codec pair is held in two function
// pointers that setGbkEnc()/setUtf8Enc() switch between; decode()/encode()
// dispatch through them. The class is not meant to be instantiated.
class TransCode
{
    public:
        // Signature of a decoder: byte string -> 16-bit units.
        typedef bool (*pf_decode_t)(const string&, vector<uint16_t>&);
        // Signature of an encoder: range of 16-bit units -> byte string.
        typedef bool (*pf_encode_t)(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);

        // Codec selection.
        static bool init();
        static void setGbkEnc();
        static void setUtf8Enc();

        // Entry points that dispatch to the currently selected codec.
        static inline bool decode(const string& str, vector<uint16_t>& vec);
        static inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);
        static inline bool encode(const Unicode& sentence, string& res);

        // Concrete GBK codec.
        static bool gbkToVec(const string& str, vector<uint16_t>& vec);
        static bool vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);

        // Concrete UTF-8 codec.
        static bool utf8ToVec(const string& str, vector<uint16_t>& vec);
        static bool vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);

    private:
        // Private so that no instance can be created from outside.
        TransCode();
        ~TransCode();

        static vector<string> _encVec;
        static bool _isInitted;
        static pf_decode_t _pf_decode;
        static pf_encode_t _pf_encode;
};
namespace TransCode
{
inline bool TransCode::decode(const string& str, vector<uint16_t>& vec)
{
if(NULL == _pf_decode)
inline bool encode(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
{
return false;
if(begin >= end)
{
return false;
}
res.clear();
pair<char, char> pa;
while(begin != end)
{
pa = CPPCOMMON::uint16ToChar2(*begin);
if(pa.first & 0x80)
{
res += pa.first;
res += pa.second;
}
else
{
res += pa.second;
}
begin++;
}
return true;
}
return _pf_decode(str, vec);
}
inline bool TransCode::encode(const Unicode& sentence, string& res)
{
return encode(sentence.begin(), sentence.end(), res);
}
inline bool TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
{
if(!_pf_encode)
inline bool encode(const vector<uint16_t>& sentence, string& res)
{
return false;
return encode(sentence.begin(), sentence.end(), res);
}
return _pf_encode(begin, end, res);
// Splits the GBK byte string `str` into 16-bit units stored in `vec`.
//
// A byte with the high bit clear becomes one unit on its own; a byte with the
// high bit set is combined with the following byte via
// CPPCOMMON::twocharToUint16(). The second byte is not range-checked.
//
// `vec` is always cleared first. Returns false for empty input or when a
// high-bit byte is the last byte of the string; true otherwise.
inline bool decode(const string& str, vector<uint16_t>& vec)
{
    vec.clear();
    if(str.empty())
    {
        return false;
    }
    for(uint pos = 0; pos < str.size();)
    {
        const char lead = str[pos];
        if(!(lead & 0x80))
        {
            // Single-byte character.
            vec.push_back(uint16_t(lead));
            ++pos;
            continue;
        }
        if(!(pos + 1 < str.size()))
        {
            // Lead byte of a two-byte character with nothing after it.
            return false;
        }
        vec.push_back(CPPCOMMON::twocharToUint16(lead, str[pos + 1]));
        pos += 2;
    }
    return true;
}
}
}

View File

@ -9,6 +9,7 @@
namespace CppJieba
{
using namespace CPPCOMMON;
struct TrieNodeInfo
{
//string word;

View File

@ -1,4 +1,3 @@
make && \
./segment_demo testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8 &&\
./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM &&\
./segment_demo testlines.utf8 --algorithm cutMix
./segment_demo testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk
./segment_demo testlines.gbk --modelpath ../dicts/hmm_model.gbk --algorithm cutHMM
./segment_demo testlines.gbk --modelpath ../dicts/hmm_model.gbk --algorithm cutMix

View File

@ -29,7 +29,7 @@ void testKeyWordExt(const char * dictPath, const char * filePath)
ext.dispose();
}
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.gbk";
int main(int argc, char ** argv)
{
@ -38,9 +38,9 @@ int main(int argc, char ** argv)
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
<<"options:\n"
<<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<"\n"
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf not specified, the default is utf-8.\n"
//<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf not specified, the default is utf-8.\n"
<<"examples:\n"
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
//<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
<<endl;
return -1;
@ -53,14 +53,14 @@ int main(int argc, char ** argv)
{
dictPath = DEFAULT_DICTPATH;
}
if("gbk" == encoding)
{
TransCode::setGbkEnc();
}
else
{
TransCode::setUtf8Enc();
}
//if("gbk" == encoding)
//{
// TransCode::setGbkEnc();
//}
//else
//{
// TransCode::setUtf8Enc();
//}
testKeyWordExt(dictPath.c_str(), arg[1].c_str());
return 0;

View File

@ -92,8 +92,8 @@ bool dispose()
return true;
}
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8";
const char * const DEFAULT_MODELPATH = "../dicts/hmm_model.utf8";
const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.gbk";
const char * const DEFAULT_MODELPATH = "../dicts/hmm_model.gbk";
int main(int argc, char ** argv)
{
@ -104,11 +104,10 @@ int main(int argc, char ** argv)
<<"\t--algorithm\tSupported encoding methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
<<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<'\n'
<<"\t--modelpath\tIf not specified, the default is "<<DEFAULT_MODELPATH<<'\n'
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf not specified, the default is utf8.\n"
<<"example:\n"
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
<<"\t"<<argv[0]<<" testlines.gbk --modelpath ../dicts/hmm_model.gbk --algorithm cutHMM\n"
<<"\t"<<argv[0]<<" testlines.gbk --modelpath ../dicts/hmm_model.gbk --algorithm cutMix\n"
<<endl;
return -1;
@ -116,7 +115,7 @@ int main(int argc, char ** argv)
ArgvContext arg(argc, argv);
string dictPath = arg["--dictpath"];
string modelPath = arg["--modelpath"];
string encoding = arg["--encoding"];
//string encoding = arg["--encoding"];
string algorithm = arg["--algorithm"];
if(dictPath.empty())
{
@ -126,14 +125,14 @@ int main(int argc, char ** argv)
{
modelPath = DEFAULT_MODELPATH;
}
if("gbk" == encoding)
{
TransCode::setGbkEnc();
}
else
{
TransCode::setUtf8Enc();
}
//if("gbk" == encoding)
//{
// TransCode::setGbkEnc();
//}
//else
//{
// TransCode::setUtf8Enc();
//}
if(!init(dictPath.c_str(), modelPath.c_str()))
{

View File

@ -1,4 +0,0 @@
我来到北京清华大学
他来到了网易杭研大厦
杭研
小明硕士毕业于中国科学院计算所,后在日本京都大学深造