From ca5e5517e723164c663ed22569752dcb206ff399 Mon Sep 17 00:00:00 2001
From: gwdwyy <wuyanyi09@gmail.com>
Date: Mon, 22 Jul 2013 14:31:59 +0800
Subject: [PATCH] update cppcommon: pass Unicode as vector<uint16_t> instead of packed strings

---
 src/cppcommon/encoding.cpp   | 56 +++++++++-----------------
 src/cppcommon/encoding.h     |  9 +----
 src/cppcommon/str_functs.cpp | 78 +++++++++++++++++++-----------------
 src/cppcommon/str_functs.h   | 15 +++++--
 4 files changed, 72 insertions(+), 86 deletions(-)

diff --git a/src/cppcommon/encoding.cpp b/src/cppcommon/encoding.cpp
index b16045f..26c1600 100644
--- a/src/cppcommon/encoding.cpp
+++ b/src/cppcommon/encoding.cpp
@@ -37,79 +37,59 @@ namespace CPPCOMMON
 		_encoding = enc;
 		return true;
 	}
-	string UnicodeEncoding::encode(const string& str)
+	// Encodes unicode into a byte string per _encoding (UTF8ENC or GBKENC); returns "" for empty input or any other _encoding value.
+	string UnicodeEncoding::encode(const vector<uint16_t>& unicode)
 	{
-		if(!isUniStrValid(str))
+		if(unicode.empty()) // nothing to encode
 		{
 			return "";
 		}
 		if(UTF8ENC == _encoding)
 		{
-			return unicodeToUtf8(str);
+			return unicodeToUtf8(unicode);
 		}
 		else if(GBKENC  == _encoding)
 		{
-			return utf8ToGbk(unicodeToUtf8(str));
+			return utf8ToGbk(unicodeToUtf8(unicode)); // GBK output is produced by going through UTF-8 first
 		}
 		return "";
 	}
-	string UnicodeEncoding::decode(const string& str)
+	// Decodes str (read per _encoding) into unicode; returns false for empty input or an unrecognized _encoding.
+	bool UnicodeEncoding::decode(const string& str, vector<uint16_t>& unicode)
 	{
 		if(str.empty())
 		{
-			return "";
+			return false; // unicode is not cleared on this path, so it keeps whatever the caller passed in
 		}
-		string res;
 		if(UTF8ENC == _encoding)
 		{
-			
-			res = utf8ToUnicode(str);
-			if(isUniStrValid(res))
-			{
-				return res;
-			}
+			return utf8ToUnicode(str, unicode);
 		}
 		else if(GBKENC == _encoding)
 		{
-			res = utf8ToUnicode(gbkToUtf8(str));
-			if(isUniStrValid(res))
-			{
-				return res;
-			}
+			return utf8ToUnicode(gbkToUtf8(str), unicode); // GBK input is converted to UTF-8 before decoding
 		}
-		return "";
+		return false; // unknown encoding: unicode is not cleared on this path either
 	}
+
 }
 
 #ifdef ENCODING_UT
 using namespace CPPCOMMON;
 int main()
 {
-	UnicodeEncoding enc;
-	ifstream ifile("testdata/dict.utf8");
+	UnicodeEncoding enc(GBKENC);
+	ifstream ifile("testdata/dict.gbk");
+	vector<uint16_t> unicode;
 	string line;
-	//enc.setEncoding(UnicodeEncoding::UTF8ENC);
-	//enc.setEncoding(UnicodeEncoding::GBKENC);
-	//while(getline(ifile, line))
-	//{
-	//	cout<<line<<endl;
-	//	cout<<enc.encode(enc.decode(line))<<endl;
-	//	cout<<enc.decode(enc.encode(line))<<endl;
-	//	cout<<enc.decode(line)<<endl;
-	//	cout<<enc.encode(line)<<endl;
-	//}
-	ifile.close();
-	ifile.open("testdata/dict.gbk");
-	enc.setEncoding(GBKENC);
 	while(getline(ifile, line))
 	{
 		
 		cout<<line<<endl;
 		cout<<line.size()<<endl;
-		cout<<enc.encode(enc.decode(line))<<endl;
-		cout<<enc.decode(enc.encode(line))<<endl;
-		cout<<enc.decode(line)<<endl;
-		cout<<enc.encode(line)<<endl;
+		enc.decode(line, unicode);
+		printUnicode(unicode);
+		cout<<enc.encode(unicode)<<endl;
 	}
 	ifile.close();
 	
diff --git a/src/cppcommon/encoding.h b/src/cppcommon/encoding.h
index b2506e0..a1ef9eb 100644
--- a/src/cppcommon/encoding.h
+++ b/src/cppcommon/encoding.h
@@ -26,13 +26,8 @@ namespace CPPCOMMON
 			~UnicodeEncoding();
 		public:
 			bool setEncoding(const string& enc);
-			string encode(const string& str);
-			string decode(const string& str);
-		public:
-			bool isUniStrValid(const string& unistr)
-			{
-				return !(unistr.empty() || unistr.size() % 2);
-			}
+			string encode(const vector<uint16_t>& unicode); // unicode code units -> byte string in the current encoding; "" for empty input
+			bool decode(const string& str, vector<uint16_t>& unicode); // byte string in the current encoding -> unicode code units; false for empty input or unknown encoding
 			
 	};
 }
diff --git a/src/cppcommon/str_functs.cpp b/src/cppcommon/str_functs.cpp
index 4c6456a..cc428f3 100644
--- a/src/cppcommon/str_functs.cpp
+++ b/src/cppcommon/str_functs.cpp
@@ -213,37 +213,32 @@ namespace CPPCOMMON
         return res;
     }
 
-	string unicodeToUtf8(const string& uniStr)
+	string unicodeToUtf8(const vector<uint16_t>& unicode)
 	{
-		size_t len = uniStr.size();
-		if(uniStr.empty() || len%2)
+		if(unicode.empty())
 		{
 			return "";
 		}
 
-		uint16_t * uniArr = new uint16_t[(len>>1) + 1];
+		uint16_t * uniArr = new uint16_t[unicode.size() + 1];
 		if(NULL == uniArr)
 		{
 			return "";
 		}
-		char * utfStr = new char[(len<<1) + 1];
+		char * utfStr = new char[unicode.size() * 4 + 1];
 		if(NULL == utfStr)
 		{
 			delete [] uniArr;
 			return "";
 		}
-		for(int i = 0; i < len; i+=2)
+		for(uint i = 0; i < unicode.size(); i++)
 		{
-			uniArr[i>>1] = twocharToUint16(uniStr[i], uniStr[i+1]);
+			uniArr[i] = unicode[i];
 		}
 		
-		string res;
-		size_t utfLen = unicodeToUtf8(uniArr, len>>1, utfStr);
-		if(0 == utfLen)
-		{
-			res = "";
-		}
-		else
+		string res("");
+		size_t utfLen = unicodeToUtf8(uniArr, unicode.size(), utfStr);
+		if(0 != utfLen)
 		{
 			res = utfStr;
 		}
@@ -253,7 +248,6 @@ namespace CPPCOMMON
 	}
 
     /*from: http://www.cppblog.com/lf426/archive/2008/03/31/45796.html */
-	/*if the inutf8 is not utf8 , this function maybe cause core dump!!!*/
     int utf8ToUnicode(const char* inutf8, int len, uint16_t* unicode)
     {
         int length;
@@ -289,28 +283,25 @@ namespace CPPCOMMON
         return length;
     }
 
-	string utf8ToUnicode(const string& utfStr)
+	bool utf8ToUnicode(const string& utfStr, vector<uint16_t>& unicode) // fills unicode with the units the raw converter produces for utfStr
 	{
+		unicode.clear(); // output is reset before any early return
 		if(utfStr.empty())
 		{
-			return "";
+			return false;
 		}
 		uint16_t* pUni = new uint16_t[utfStr.size() + 1];
 		if(NULL == pUni)
 		{
-			return "";
+			return false;
 		}
 		size_t uniLen = utf8ToUnicode(utfStr.c_str(), utfStr.size(), pUni);
-		string res("");
 		for(uint i = 0; i < uniLen; i++)
 		{
-			
-			pair<char, char> char2= uint16ToChar2(pUni[i]);
-			res += char2.first;
-			res += char2.second;
+			unicode.push_back(pUni[i]);
 		}
 		delete [] pUni;
-		return res;
+		return true; // NOTE(review): returned regardless of uniLen; the raw converter's int result is stored in a size_t, so a negative value would wrap - TODO confirm it cannot return one
 	}
 
 	//iconv
@@ -384,17 +375,33 @@ namespace CPPCOMMON
 		return res;
 	}
 
-	size_t getUtf8WordLen(const string& utf)
+	//unicode str to vec: packs each consecutive byte pair of str (first byte high, second byte low) into one uint16_t; returns false with vec empty if str is empty or has odd length
+	bool uniStrToVec(const string& str, vector<uint16_t>& vec)
 	{
-		string uni = utf8ToUnicode(utf);
-		if(uni.empty()||uni.size()%2)
+		vec.clear();
+		if(str.empty() || str.size() % 2)
 		{
-			return 0;
+			return false;
 		}
-		else
+		for(uint i = 0; i < str.size(); i+=2)
 		{
-			return uni.size()/2;
+			vec.push_back(twocharToUint16(str[i], str[i + 1]));
 		}
+
+		return true;
+	}
+
+	//unicode vec to str: for each element appends the pair of chars returned by uint16ToChar2 (first, then second), so the result holds two chars per element
+	string uniVecToStr(const vector<uint16_t>& vec)
+	{
+		string res("");
+		for(uint i = 0; i < vec.size(); i++)
+		{
+			pair<char,char> pa = uint16ToChar2(vec[i]);
+			res += pa.first;
+			res += pa.second;
+		}
+		return res;
 	}
 
 }
@@ -444,16 +451,14 @@ int main()
 	//cout<<string_format("hehe%s11asd%dasf","[here]",2);
 	ifstream ifile("testdata/dict.gbk");
 	string line;
+	vector<uint16_t> unicode;
 	while(getline(ifile, line))
 	{
 		cout<<line<<endl;
-		string uniStr = utf8ToUnicode(line);
-		cout<<utf8ToUnicode(uniStr)<<endl;// this will core dump
-		string utfStr = unicodeToUtf8(uniStr);
-		cout<<utfStr<<endl;
+		utf8ToUnicode(line, unicode);
+		printUnicode(unicode);
+		cout<<unicodeToUtf8(unicode)<<endl;;
 	}
-	cout<<utf8ToGbk("")<<endl;
-	cout<<gbkToUtf8("")<<endl;
 	//vector<string> tmp;
 	//tmp.push_back("1");
 	////tmp.push_back("2");
@@ -465,7 +470,6 @@ int main()
 	//{
 	//	cout<<line<<endl;
 	//	string s = gbkToUtf8(line);
-	//	cout<<getUtf8WordLen(s)<<endl;
 	//	s = utf8ToGbk(s);
 	//	cout<<s<<endl;
 	//}
diff --git a/src/cppcommon/str_functs.h b/src/cppcommon/str_functs.h
index d9c9581..6fc07e5 100644
--- a/src/cppcommon/str_functs.h
+++ b/src/cppcommon/str_functs.h
@@ -38,15 +38,17 @@ namespace CPPCOMMON
 
     //encode
     size_t unicodeToUtf8(uint16_t *in, size_t len, char * out);
-	string unicodeToUtf8(const string& uniStr);
+	string unicodeToUtf8(const vector<uint16_t>& unicode); // returns "" when unicode is empty or the raw conversion reports length 0
     int utf8ToUnicode(const char* inutf8, int len, uint16_t* unicode);
-	string utf8ToUnicode(const string& utfStr);
+	bool utf8ToUnicode(const string& utfStr, vector<uint16_t>& unicode); // clears unicode first; returns false when utfStr is empty
+
 	int code_convert(const char *from_charset,const char *to_charset,char *inbuf,size_t inlen,char *outbuf,size_t outlen);
 	string gbkToUtf8(const string& gbk);
 	string utf8ToGbk(const string& utf);
 
-	size_t getUtf8WordLen(const string& utf);
-	
+	bool uniStrToVec(const string& str, vector<uint16_t>& vec); // two bytes of str per element; false when str is empty or has odd length
+	string uniVecToStr(const vector<uint16_t>& vec); // two chars of output per element
+
 	inline uint16_t twocharToUint16(char high, char low)
 	{
 		return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff));
@@ -60,6 +62,11 @@ namespace CPPCOMMON
 		return res;
 	}
 
+	inline void printUnicode(const vector<uint16_t>& unicode) // writes the two-chars-per-element form of unicode to cout
+	{
+		cout<<uniVecToStr(unicode)<<endl; // the chars from uniVecToStr are streamed as-is, then endl
+	}
+
 	
 }
 #endif