#include "str_functs.h"

// NOTE(review): the reviewed copy of this file had lost its template
// arguments (e.g. "vector" instead of "vector<string>"). They are restored
// here from how the values are used; confirm against str_functs.h.

namespace CPPCOMMON
{
    // Upper-cases one byte. The value is routed through unsigned char because
    // calling toupper() with a negative char value is undefined behaviour.
    static char toUpperByte(char c)
    {
        return static_cast<char>(toupper(static_cast<unsigned char>(c)));
    }

    // Lower-cases one byte; see toUpperByte() for the unsigned char detour.
    static char toLowerByte(char c)
    {
        return static_cast<char>(tolower(static_cast<unsigned char>(c)));
    }

    // Strips every element of src with stripStr() (default strip set) and
    // concatenates the results, inserting connectorStr between neighbours.
    // An empty src yields an empty string.
    string joinStr(const vector<string>& src, const string& connectorStr)
    {
        string res;
        // Index from 0 and emit the connector *before* every element but the
        // first; the former "i < len - 1" bound wrapped around for len == 0.
        for(size_t i = 0; i < src.size(); i++)
        {
            if(i > 0)
            {
                res += connectorStr;
            }
            res += stripStr(src[i]);
        }
        return res;
    }

    // Convenience overload: splits source on pattern and returns the tokens.
    // See the three-argument overload for the exact rules.
    vector<string> splitStr(const string& source, const string& pattern)
    {
        vector<string> res;
        splitStr(source, res, pattern);
        return res;
    }

    // Splits source on every occurrence of pattern. Each piece is passed
    // through stripStr() (default strip set) and appended to out_vec unless
    // the stripped piece is empty. out_vec is not cleared first.
    // An empty pattern is a no-op.
    void splitStr(const string& source, vector<string>& out_vec, const string& pattern)
    {
        if(pattern.empty())
        {
            return;
        }
        string::size_type start = 0;
        while(start <= source.size())
        {
            string::size_type pos = source.find(pattern, start);
            if(string::npos == pos)
            {
                // No further separator: the rest of the string is the last piece.
                pos = source.size();
            }
            const string tmp = stripStr(source.substr(start, pos - start));
            if(!tmp.empty())
            {
                out_vec.push_back(tmp);
            }
            start = pos + pattern.size();
        }
    }

    // Returns str without any leading or trailing characters that occur in
    // patternStr. A string made up only of such characters (or an empty
    // string) yields an empty string.
    string stripStr(const string& str, const string& patternStr)
    {
        const string::size_type posL = str.find_first_not_of(patternStr);
        if(string::npos == posL)
        {
            // Nothing survives stripping.
            return string();
        }
        const string::size_type posR = str.find_last_not_of(patternStr);
        return str.substr(posL, posR - posL + 1);
    }

    // Splits strSrc on any of the given patterns.
    // Every pattern is first replaced by a run of '#' characters that does
    // not occur in strSrc, then the result is split on that run with
    // splitStr(). Returns false (leaving outVec untouched) when strSrc
    // already contains every '#' run of length 1..10, so no free separator
    // could be found; returns true otherwise.
    bool splitStrMultiPatterns(
                const string& strSrc,
                vector<string>& outVec,
                const vector<string>& patterns
                )
    {
        const char transChar = '#';
        const size_t transLenThreshold = 10;
        string transStr(1, transChar);
        while(strSrc.find(transStr) != string::npos)
        {
            transStr += transChar;
            if(transStr.size() > transLenThreshold)
            {
                return false;
            }
        }
        string strSrcMutable = strSrc;
        for(size_t i = 0; i < patterns.size(); i++)
        {
            strSrcMutable = replaceStr(strSrcMutable, patterns[i], transStr);
        }
        splitStr(strSrcMutable, outVec, transStr);
        return true;
    }

    // Returns a copy of strIn with every byte upper-cased via toupper().
    string upperStr(const string& strIn)
    {
        string str = strIn;
        transform(str.begin(), str.end(), str.begin(), toUpperByte);
        return str;
    }

    // Returns a copy of strIn with every byte lower-cased via tolower().
    string lowerStr(const string& strIn)
    {
        string str = strIn;
        transform(str.begin(), str.end(), str.begin(), toLowerByte);
        return str;
    }

    // Returns a copy of strSrc in which occurrences of oldStr are replaced by
    // newStr, scanning left to right and never re-scanning inserted text.
    //   count > 0  : replace at most count occurrences.
    //   count == 0 : replace nothing.
    //   count < 0  : replace every occurrence (-1 is the historical sentinel).
    // An empty oldStr never matches, so strSrc is returned unchanged.
    string replaceStr(const string& strSrc, const string& oldStr, const string& newStr, int count)
    {
        string strRet = strSrc;
        if(oldStr.empty() || 0 == count)
        {
            return strRet;
        }
        size_t pos = 0;
        int l_count = 0;
        while((pos = strRet.find(oldStr, pos)) != string::npos)
        {
            strRet.replace(pos, oldStr.size(), newStr);
            // Skip the inserted text so a newStr containing oldStr cannot loop.
            pos += newStr.size();
            if(count > 0 && ++l_count >= count)
            {
                break;
            }
        }
        return strRet;
    }

    // Position-wise distance between A and B: the number of positions in the
    // common prefix length where the bytes differ, plus the length difference.
    unsigned int countStrDistance(const string& A, const string& B)
    {
        const size_t lenA = A.size();
        const size_t lenB = B.size();
        const size_t common = (lenA < lenB ? lenA : lenB);
        size_t res = (lenA - common) + (lenB - common);
        for(size_t i = 0; i < common; i++)
        {
            if(A[i] != B[i])
            {
                res++;
            }
        }
        return static_cast<unsigned int>(res);
    }

    // Number of positions (within the shorter length) where A and B hold the
    // same byte.
    unsigned int countStrSimilarity(const string& A, const string& B)
    {
        const size_t lenA = A.size();
        const size_t lenB = B.size();
        const size_t common = (lenA < lenB ? lenA : lenB);
        size_t res = 0;
        for(size_t i = 0; i < common; i++)
        {
            if(A[i] == B[i])
            {
                res++;
            }
        }
        return static_cast<unsigned int>(res);
    }

    //unicode utf8 transform

    // Encodes len 16-bit code units from in as UTF-8 into out and appends a
    // terminating '\0'. Each unit becomes 1, 2 or 3 bytes, so out must have
    // room for 3 * len + 1 bytes. Returns the number of bytes written, not
    // counting the terminator. Units are encoded individually; surrogate
    // pairs are not combined.
    size_t unicodeToUtf8(uint16_t *in, size_t len, char * out)
    {
        size_t res = 0;
        for(size_t i = 0; i < len; i++)
        {
            const uint16_t unicode = in[i];
            if(unicode <= 0x007f)
            {
                *out++ = static_cast<char>(unicode);
                res += 1;
            }
            else if(unicode <= 0x07ff)
            {
                *out++ = static_cast<char>(0xc0 | (unicode >> 6));
                *out++ = static_cast<char>(0x80 | (unicode & 0x3f));
                res += 2;
            }
            else
            {
                *out++ = static_cast<char>(0xe0 | (unicode >> 12));
                *out++ = static_cast<char>(0x80 | ((unicode >> 6) & 0x3f));
                *out++ = static_cast<char>(0x80 | (unicode & 0x3f));
                res += 3;
            }
        }
        *out = '\0';
        return res;
    }

    // Converts a byte string holding 16-bit code units (two bytes per unit,
    // combined by twocharToUint16()) into UTF-8. Returns an empty string when
    // uniStr is empty or has an odd length.
    string unicodeToUtf8(const string& uniStr)
    {
        const size_t len = uniStr.size();
        if(0 == len || len % 2)
        {
            return "";
        }
        const size_t unitCount = len / 2;
        vector<uint16_t> uniArr(unitCount);
        for(size_t i = 0; i < unitCount; i++)
        {
            uniArr[i] = twocharToUint16(uniStr[2 * i], uniStr[2 * i + 1]);
        }
        // Worst case is three bytes per unit plus the terminator written by
        // the low-level encoder.
        vector<char> utfBuf(unitCount * 3 + 1);
        const size_t utfLen = unicodeToUtf8(&uniArr[0], unitCount, &utfBuf[0]);
        // Build from (pointer, length) so a zero code unit does not cut the
        // result short.
        return string(&utfBuf[0], utfLen);
    }

    /*from: http://www.cppblog.com/lf426/archive/2008/03/31/45796.html */
    // Decodes len bytes of UTF-8 from inutf8 into 16-bit code units and
    // appends a terminating 0 unit; unicode must have room for len + 1 units.
    // Returns the number of units written, not counting the terminator.
    // Lead bytes <= 0x7f are read as 1-byte sequences, <= 0xdf as 2-byte
    // sequences, and everything else as 3-byte sequences; continuation bytes
    // are not validated. Decoding stops at a trailing sequence that is cut
    // off by len instead of reading past the end of the input.
    int utf8ToUnicode(const char* inutf8, int len, uint16_t* unicode)
    {
        const unsigned char* utf8 = reinterpret_cast<const unsigned char*>(inutf8);
        const unsigned char* const end = utf8 + (len > 0 ? len : 0);
        int length = 0;
        while(utf8 < end)
        {
            const unsigned char lead = *utf8;
            const int seqLen = (lead <= 0x7f) ? 1 : ((lead <= 0xdf) ? 2 : 3);
            if(end - utf8 < seqLen)
            {
                // Truncated final sequence: do not read beyond the buffer.
                break;
            }
            if(1 == seqLen)
            {
                //expand with 0s.
                *unicode = lead;
            }
            else if(2 == seqLen)
            {
                *unicode = static_cast<uint16_t>(((lead & 0x1f) << 6) + (utf8[1] & 0x3f));
            }
            else
            {
                //3 byte.Chinese may use 3 byte.
                *unicode = static_cast<uint16_t>(((lead & 0x0f) << 12)
                            + ((utf8[1] & 0x3f) << 6)
                            + (utf8[2] & 0x3f));
            }
            ++unicode;
            utf8 += seqLen;
            ++length;
        }
        *unicode = 0;
        return length;
    }

    // Decodes a UTF-8 string into 16-bit code units and returns them as a
    // byte string, two bytes per unit as produced by uint16ToChar2()
    // (first, then second). Returns an empty string when nothing is decoded.
    string utf8ToUnicode(const string& utfStr)
    {
        // One unit per input byte at most, plus the terminator the decoder
        // always writes.
        vector<uint16_t> uniBuf(utfStr.size() + 1);
        const int uniLen = utf8ToUnicode(utfStr.c_str(), static_cast<int>(utfStr.size()), &uniBuf[0]);
        string res;
        for(int i = 0; i < uniLen; i++)
        {
            // NOTE(review): pair<char, char> is assumed for the helper's
            // return type (template arguments were lost in the reviewed
            // source) - confirm against str_functs.h.
            const pair<char, char> char2 = uint16ToChar2(uniBuf[i]);
            res += char2.first;
            res += char2.second;
        }
        return res;
    }

}

// NOTE(review): the test driver below is cut off in the reviewed source and
// is reproduced only up to that point; the header name on the #include line
// was also lost and is presumed to be <iostream> - confirm both.
#ifdef TEST_STR_FUNCTS
#include <iostream>
using namespace CPPCOMMON;
using namespace std;
int main()
{
    //string s = " \t\n1 a h \n";
    //cout<<"["< vec;
    //splitStr("1 3 4", vec);
    //char * a[] = {"3","jaj","ads"};
    //vector pats(a,a+3);
    //vec.clear();
    //splitStrMultiPattern("1 #3 jajs5 asdf3ads 4", vec, pats);
    //for(int i=0;i