From ca5e5517e723164c663ed22569752dcb206ff399 Mon Sep 17 00:00:00 2001 From: gwdwyy Date: Mon, 22 Jul 2013 14:31:59 +0800 Subject: [PATCH] update cppcomm --- src/cppcommon/encoding.cpp | 56 +++++++++----------------- src/cppcommon/encoding.h | 9 +---- src/cppcommon/str_functs.cpp | 78 +++++++++++++++++++----------------- src/cppcommon/str_functs.h | 15 +++++-- 4 files changed, 72 insertions(+), 86 deletions(-) diff --git a/src/cppcommon/encoding.cpp b/src/cppcommon/encoding.cpp index b16045f..26c1600 100644 --- a/src/cppcommon/encoding.cpp +++ b/src/cppcommon/encoding.cpp @@ -37,79 +37,59 @@ namespace CPPCOMMON _encoding = enc; return true; } - string UnicodeEncoding::encode(const string& str) + + string UnicodeEncoding::encode(const vector& unicode) { - if(!isUniStrValid(str)) + if(unicode.empty()) { return ""; } if(UTF8ENC == _encoding) { - return unicodeToUtf8(str); + return unicodeToUtf8(unicode); } else if(GBKENC == _encoding) { - return utf8ToGbk(unicodeToUtf8(str)); + return utf8ToGbk(unicodeToUtf8(unicode)); } return ""; } - string UnicodeEncoding::decode(const string& str) + + bool UnicodeEncoding::decode(const string& str, vector& unicode) { if(str.empty()) { - return ""; + return false; } - string res; if(UTF8ENC == _encoding) { - - res = utf8ToUnicode(str); - if(isUniStrValid(res)) - { - return res; - } + return utf8ToUnicode(str, unicode); } else if(GBKENC == _encoding) { - res = utf8ToUnicode(gbkToUtf8(str)); - if(isUniStrValid(res)) - { - return res; - } + return utf8ToUnicode(gbkToUtf8(str), unicode); } - return ""; + return false; } + } #ifdef ENCODING_UT using namespace CPPCOMMON; int main() { - UnicodeEncoding enc; - ifstream ifile("testdata/dict.utf8"); + UnicodeEncoding enc(GBKENC); + ifstream ifile("testdata/dict.gbk"); + vector unicode; string line; - //enc.setEncoding(UnicodeEncoding::UTF8ENC); - //enc.setEncoding(UnicodeEncoding::GBKENC); - //while(getline(ifile, line)) - //{ - // cout<& unicode); + bool decode(const string& str, vector& unicode); }; } diff --git a/src/cppcommon/str_functs.cpp b/src/cppcommon/str_functs.cpp index 4c6456a..cc428f3 100644 --- a/src/cppcommon/str_functs.cpp +++ b/src/cppcommon/str_functs.cpp @@ -213,37 +213,32 @@ namespace CPPCOMMON return res; } - string unicodeToUtf8(const string& uniStr) + string unicodeToUtf8(const vector& unicode) { - size_t len = uniStr.size(); - if(uniStr.empty() || len%2) + if(unicode.empty()) { return ""; } - uint16_t * uniArr = new uint16_t[(len>>1) + 1]; + uint16_t * uniArr = new uint16_t[unicode.size() + 1]; if(NULL == uniArr) { return ""; } - char * utfStr = new char[(len<<1) + 1]; + char * utfStr = new char[unicode.size() * 4 + 1]; if(NULL == utfStr) { delete [] uniArr; return ""; } - for(int i = 0; i < len; i+=2) + for(uint i = 0; i < unicode.size(); i++) { - uniArr[i>>1] = twocharToUint16(uniStr[i], uniStr[i+1]); + uniArr[i] = unicode[i]; } - string res; - size_t utfLen = unicodeToUtf8(uniArr, len>>1, utfStr); - if(0 == utfLen) - { - res = ""; - } - else + string res(""); + size_t utfLen = unicodeToUtf8(uniArr, unicode.size(), utfStr); + if(0 != utfLen) { res = utfStr; } @@ -253,7 +248,6 @@ namespace CPPCOMMON } /*from: http://www.cppblog.com/lf426/archive/2008/03/31/45796.html */ - /*if the inutf8 is not utf8 , this function maybe cause core dump!!!*/ int utf8ToUnicode(const char* inutf8, int len, uint16_t* unicode) { int length; @@ -289,28 +283,25 @@ namespace CPPCOMMON return length; } - string utf8ToUnicode(const string& utfStr) + bool utf8ToUnicode(const string& utfStr, vector& unicode) { + unicode.clear(); if(utfStr.empty()) { - return ""; + return false; } uint16_t* pUni = new uint16_t[utfStr.size() + 1]; if(NULL == pUni) { - return ""; + return false; } size_t uniLen = utf8ToUnicode(utfStr.c_str(), utfStr.size(), pUni); - string res(""); for(uint i = 0; i < uniLen; i++) { - - pair char2= uint16ToChar2(pUni[i]); - res += char2.first; - res += char2.second; + unicode.push_back(pUni[i]); } delete [] pUni; - return res; + return true; } //iconv @@ -384,17 +375,33 @@ namespace CPPCOMMON return res; } - size_t getUtf8WordLen(const string& utf) + //unicode str to vec + bool uniStrToVec(const string& str, vector& vec) { - string uni = utf8ToUnicode(utf); - if(uni.empty()||uni.size()%2) + vec.clear(); + if(str.empty() || str.size() % 2) { - return 0; + return false; } - else + for(uint i = 0; i < str.size(); i+=2) { - return uni.size()/2; + vec.push_back(twocharToUint16(str[i], str[i + 1])); } + + return true; + } + + //unicode vec to str + string uniVecToStr(const vector& vec) + { + string res(""); + for(uint i = 0; i < vec.size(); i++) + { + pair pa = uint16ToChar2(vec[i]); + res += pa.first; + res += pa.second; + } + return res; } } @@ -444,16 +451,14 @@ int main() //cout< unicode; while(getline(ifile, line)) { cout< tmp; //tmp.push_back("1"); ////tmp.push_back("2"); @@ -465,7 +470,6 @@ int main() //{ // cout<& unicode); int utf8ToUnicode(const char* inutf8, int len, uint16_t* unicode); - string utf8ToUnicode(const string& utfStr); + bool utf8ToUnicode(const string& utfStr, vector& unicode); + int code_convert(const char *from_charset,const char *to_charset,char *inbuf,size_t inlen,char *outbuf,size_t outlen); string gbkToUtf8(const string& gbk); string utf8ToGbk(const string& utf); - size_t getUtf8WordLen(const string& utf); - + bool uniStrToVec(const string& str, vector& vec); + string uniVecToStr(const vector& vec); + inline uint16_t twocharToUint16(char high, char low) { return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff)); @@ -60,6 +62,11 @@ namespace CPPCOMMON return res; } + inline void printUnicode(const vector& unicode) + { + cout<