update cppcomm

This commit is contained in:
gwdwyy 2013-07-22 14:31:59 +08:00
parent 86cf38bfef
commit ca5e5517e7
4 changed files with 72 additions and 86 deletions

View File

@ -37,79 +37,59 @@ namespace CPPCOMMON
_encoding = enc; _encoding = enc;
return true; return true;
} }
// UnicodeEncoding::encode
// NOTE(review): this text is a side-by-side diff; where a line holds two
// copies of a statement, the first appears to be the pre-commit version and
// the second the post-commit version -- confirm against the repository.
//
// Post-commit version: takes a vector of 16-bit code units and returns a byte
// string chosen by _encoding:
//   - empty vector          -> ""
//   - UTF8ENC == _encoding  -> unicodeToUtf8(unicode)
//   - GBKENC == _encoding   -> utf8ToGbk(unicodeToUtf8(unicode))
//   - anything else         -> ""
// Pre-commit version: took a byte string and rejected it up front through
// isUniStrValid(str) instead of the empty() test.
string UnicodeEncoding::encode(const string& str)
string UnicodeEncoding::encode(const vector<uint16_t>& unicode)
{ {
if(!isUniStrValid(str)) if(unicode.empty())
{ {
return ""; return "";
} }
if(UTF8ENC == _encoding) if(UTF8ENC == _encoding)
{ {
return unicodeToUtf8(str); return unicodeToUtf8(unicode);
} }
else if(GBKENC == _encoding) else if(GBKENC == _encoding)
{ {
// GBK output is produced in two steps: code units -> UTF-8 -> GBK.
return utf8ToGbk(unicodeToUtf8(str)); return utf8ToGbk(unicodeToUtf8(unicode));
} }
// _encoding matched neither UTF8ENC nor GBKENC.
return ""; return "";
} }
// UnicodeEncoding::decode
// NOTE(review): this text is a side-by-side diff; where a line holds two
// copies of a statement, the first appears to be the pre-commit version and
// the second the post-commit version -- confirm against the repository.
//
// Post-commit version: fills `unicode` from `str` according to _encoding and
// returns whatever utf8ToUnicode(...) returns; returns false for an empty
// `str` or when _encoding is neither UTF8ENC nor GBKENC. On those two false
// paths this function itself never touches `unicode`.
// Pre-commit version: returned the decoded data as a string, or "" when the
// input was empty, the encoding was unknown, or isUniStrValid(res) failed.
string UnicodeEncoding::decode(const string& str)
bool UnicodeEncoding::decode(const string& str, vector<uint16_t>& unicode)
{ {
if(str.empty()) if(str.empty())
{ {
return ""; return false;
} }
string res;
if(UTF8ENC == _encoding) if(UTF8ENC == _encoding)
{ {
return utf8ToUnicode(str, unicode);
res = utf8ToUnicode(str);
if(isUniStrValid(res))
{
return res;
}
} }
else if(GBKENC == _encoding) else if(GBKENC == _encoding)
{ {
// GBK input is first converted to UTF-8, then decoded to code units.
res = utf8ToUnicode(gbkToUtf8(str)); return utf8ToUnicode(gbkToUtf8(str), unicode);
if(isUniStrValid(res))
{
return res;
}
} }
// Unknown encoding (pre-commit: also reached when validation failed).
return ""; return false;
} }
} }
#ifdef ENCODING_UT #ifdef ENCODING_UT
using namespace CPPCOMMON; using namespace CPPCOMMON;
int main() int main()
{ {
UnicodeEncoding enc; UnicodeEncoding enc(GBKENC);
ifstream ifile("testdata/dict.utf8"); ifstream ifile("testdata/dict.gbk");
vector<uint16_t> unicode;
string line; string line;
//enc.setEncoding(UnicodeEncoding::UTF8ENC);
//enc.setEncoding(UnicodeEncoding::GBKENC);
//while(getline(ifile, line))
//{
// cout<<line<<endl;
// cout<<enc.encode(enc.decode(line))<<endl;
// cout<<enc.decode(enc.encode(line))<<endl;
// cout<<enc.decode(line)<<endl;
// cout<<enc.encode(line)<<endl;
//}
ifile.close();
ifile.open("testdata/dict.gbk");
enc.setEncoding(GBKENC);
while(getline(ifile, line)) while(getline(ifile, line))
{ {
cout<<line<<endl; cout<<line<<endl;
cout<<line.size()<<endl; cout<<line.size()<<endl;
cout<<enc.encode(enc.decode(line))<<endl; enc.decode(line, unicode);
cout<<enc.decode(enc.encode(line))<<endl; printUnicode(unicode);
cout<<enc.decode(line)<<endl; cout<<enc.encode(unicode)<<endl;
cout<<enc.encode(line)<<endl;
} }
ifile.close(); ifile.close();

View File

@ -26,13 +26,8 @@ namespace CPPCOMMON
~UnicodeEncoding(); ~UnicodeEncoding();
public: public:
bool setEncoding(const string& enc); bool setEncoding(const string& enc);
string encode(const string& str); string encode(const vector<uint16_t>& unicode);
string decode(const string& str); bool decode(const string& str, vector<uint16_t>& unicode);
public:
// Reports whether `unistr` can be read as a sequence of two-byte units:
// it must be non-empty and its byte length must be even.
bool isUniStrValid(const string& unistr)
{
    if(unistr.empty())
    {
        return false;
    }
    return unistr.size() % 2 == 0;
}
}; };
} }

View File

@ -213,37 +213,32 @@ namespace CPPCOMMON
return res; return res;
} }
string unicodeToUtf8(const string& uniStr) string unicodeToUtf8(const vector<uint16_t>& unicode)
{ {
size_t len = uniStr.size(); if(unicode.empty())
if(uniStr.empty() || len%2)
{ {
return ""; return "";
} }
uint16_t * uniArr = new uint16_t[(len>>1) + 1]; uint16_t * uniArr = new uint16_t[unicode.size() + 1];
if(NULL == uniArr) if(NULL == uniArr)
{ {
return ""; return "";
} }
char * utfStr = new char[(len<<1) + 1]; char * utfStr = new char[unicode.size() * 4 + 1];
if(NULL == utfStr) if(NULL == utfStr)
{ {
delete [] uniArr; delete [] uniArr;
return ""; return "";
} }
for(int i = 0; i < len; i+=2) for(uint i = 0; i < unicode.size(); i++)
{ {
uniArr[i>>1] = twocharToUint16(uniStr[i], uniStr[i+1]); uniArr[i] = unicode[i];
} }
string res; string res("");
size_t utfLen = unicodeToUtf8(uniArr, len>>1, utfStr); size_t utfLen = unicodeToUtf8(uniArr, unicode.size(), utfStr);
if(0 == utfLen) if(0 != utfLen)
{
res = "";
}
else
{ {
res = utfStr; res = utfStr;
} }
@ -253,7 +248,6 @@ namespace CPPCOMMON
} }
/*from: http://www.cppblog.com/lf426/archive/2008/03/31/45796.html */ /*from: http://www.cppblog.com/lf426/archive/2008/03/31/45796.html */
/* If inutf8 is not valid UTF-8, this function may cause a core dump! */
int utf8ToUnicode(const char* inutf8, int len, uint16_t* unicode) int utf8ToUnicode(const char* inutf8, int len, uint16_t* unicode)
{ {
int length; int length;
@ -289,28 +283,25 @@ namespace CPPCOMMON
return length; return length;
} }
// utf8ToUnicode (string overload)
// NOTE(review): this text is a side-by-side diff; where a line holds two
// copies of a statement, the first appears to be the pre-commit version and
// the second the post-commit version -- confirm against the repository.
//
// Post-commit version: clears `unicode`, decodes `utfStr` through the
// pointer-based utf8ToUnicode overload into a temporary buffer, appends every
// produced code unit to `unicode`, and returns true. Returns false only for
// an empty `utfStr` (or a NULL buffer); a decode that yields zero code units
// still returns true with `unicode` left empty.
// Pre-commit version: returned the code units packed two chars each (via
// uint16ToChar2) in a string, or "" on the early-exit paths.
string utf8ToUnicode(const string& utfStr) bool utf8ToUnicode(const string& utfStr, vector<uint16_t>& unicode)
{ {
unicode.clear();
if(utfStr.empty()) if(utfStr.empty())
{ {
return ""; return false;
} }
// Scratch buffer of utfStr.size() + 1 code units.
// NOTE(review): presumably enough because decoding should not yield more code
// units than input bytes -- confirm against the pointer-based overload.
uint16_t* pUni = new uint16_t[utfStr.size() + 1]; uint16_t* pUni = new uint16_t[utfStr.size() + 1];
// NOTE(review): a plain new[] expression reports failure by throwing
// std::bad_alloc, so this NULL test is presumably never true -- confirm the
// build does not use a non-throwing allocator.
if(NULL == pUni) if(NULL == pUni)
{ {
return ""; return false;
} }
// NOTE(review): the pointer-based overload is declared to return int; if it
// can return a negative value, storing it in size_t would turn it into a very
// large count for the loop below -- confirm.
size_t uniLen = utf8ToUnicode(utfStr.c_str(), utfStr.size(), pUni); size_t uniLen = utf8ToUnicode(utfStr.c_str(), utfStr.size(), pUni);
string res("");
for(uint i = 0; i < uniLen; i++) for(uint i = 0; i < uniLen; i++)
{ {
unicode.push_back(pUni[i]);
pair<char, char> char2= uint16ToChar2(pUni[i]);
res += char2.first;
res += char2.second;
} }
// The scratch buffer is released on the normal path before returning.
delete [] pUni; delete [] pUni;
return res; return true;
} }
//iconv //iconv
@ -384,17 +375,33 @@ namespace CPPCOMMON
return res; return res;
} }
// uniStrToVec (post-commit) shown beside the removed getUtf8WordLen (pre-commit).
// NOTE(review): this text is a side-by-side diff; where a line holds two
// fragments, the first appears to be the pre-commit text and the second the
// post-commit text -- confirm against the repository.
//
// uniStrToVec: clears `vec`, then combines each consecutive pair of bytes of
// `str` into one 16-bit value with twocharToUint16(str[i], str[i + 1]) and
// appends it. Returns false (with `vec` already cleared) when `str` is empty
// or has an odd length; returns true otherwise.
// getUtf8WordLen (removed): returned half the size of utf8ToUnicode(utf), or
// 0 when that result was empty or of odd length.
size_t getUtf8WordLen(const string& utf) //unicode str to vec
bool uniStrToVec(const string& str, vector<uint16_t>& vec)
{ {
string uni = utf8ToUnicode(utf); vec.clear();
if(uni.empty()||uni.size()%2) if(str.empty() || str.size() % 2)
{ {
return 0; return false;
} }
// Post-commit: the even-length check above keeps str[i + 1] inside the string.
else for(uint i = 0; i < str.size(); i+=2)
{ {
return uni.size()/2; vec.push_back(twocharToUint16(str[i], str[i + 1]));
} }
return true;
}
// Unicode vector to string.
// Builds a byte string from `vec` by splitting every 16-bit element into a
// pair of chars with uint16ToChar2 and appending first, then second. The
// result therefore has two bytes per element; an empty vector yields "".
// NOTE(review): uint16ToChar2 is presumably the inverse of twocharToUint16
// (first = high byte, second = low byte) -- its body is not visible here.
string uniVecToStr(const vector<uint16_t>& vec)
{
string res("");
for(uint i = 0; i < vec.size(); i++)
{
pair<char,char> pa = uint16ToChar2(vec[i]);
res += pa.first;
res += pa.second;
}
return res;
} }
} }
@ -444,16 +451,14 @@ int main()
//cout<<string_format("hehe%s11asd%dasf","[here]",2); //cout<<string_format("hehe%s11asd%dasf","[here]",2);
ifstream ifile("testdata/dict.gbk"); ifstream ifile("testdata/dict.gbk");
string line; string line;
vector<uint16_t> unicode;
while(getline(ifile, line)) while(getline(ifile, line))
{ {
cout<<line<<endl; cout<<line<<endl;
string uniStr = utf8ToUnicode(line); utf8ToUnicode(line, unicode);
cout<<utf8ToUnicode(uniStr)<<endl;// this will core dump printUnicode(unicode);
string utfStr = unicodeToUtf8(uniStr); cout<<unicodeToUtf8(unicode)<<endl;;
cout<<utfStr<<endl;
} }
cout<<utf8ToGbk("")<<endl;
cout<<gbkToUtf8("")<<endl;
//vector<string> tmp; //vector<string> tmp;
//tmp.push_back("1"); //tmp.push_back("1");
////tmp.push_back("2"); ////tmp.push_back("2");
@ -465,7 +470,6 @@ int main()
//{ //{
// cout<<line<<endl; // cout<<line<<endl;
// string s = gbkToUtf8(line); // string s = gbkToUtf8(line);
// cout<<getUtf8WordLen(s)<<endl;
// s = utf8ToGbk(s); // s = utf8ToGbk(s);
// cout<<s<<endl; // cout<<s<<endl;
//} //}

View File

@ -38,15 +38,17 @@ namespace CPPCOMMON
//encode //encode
size_t unicodeToUtf8(uint16_t *in, size_t len, char * out); size_t unicodeToUtf8(uint16_t *in, size_t len, char * out);
string unicodeToUtf8(const string& uniStr); string unicodeToUtf8(const vector<uint16_t>& unicode);
int utf8ToUnicode(const char* inutf8, int len, uint16_t* unicode); int utf8ToUnicode(const char* inutf8, int len, uint16_t* unicode);
string utf8ToUnicode(const string& utfStr); bool utf8ToUnicode(const string& utfStr, vector<uint16_t>& unicode);
int code_convert(const char *from_charset,const char *to_charset,char *inbuf,size_t inlen,char *outbuf,size_t outlen); int code_convert(const char *from_charset,const char *to_charset,char *inbuf,size_t inlen,char *outbuf,size_t outlen);
string gbkToUtf8(const string& gbk); string gbkToUtf8(const string& gbk);
string utf8ToGbk(const string& utf); string utf8ToGbk(const string& utf);
size_t getUtf8WordLen(const string& utf); bool uniStrToVec(const string& str, vector<uint16_t>& vec);
string uniVecToStr(const vector<uint16_t>& vec);
inline uint16_t twocharToUint16(char high, char low) inline uint16_t twocharToUint16(char high, char low)
{ {
return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff)); return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff));
@ -60,6 +62,11 @@ namespace CPPCOMMON
return res; return res;
} }
// Debug helper: writes the uniVecToStr() rendering of `unicode` to stdout,
// then ends the line with endl (newline plus flush).
inline void printUnicode(const vector<uint16_t>& unicode)
{
    cout << uniVecToStr(unicode);
    cout << endl;
}
} }
#endif #endif