cppjieba/cppcommon/str_functs.cpp
2013-07-09 14:01:26 +08:00

345 lines
7.8 KiB
C++

#include "str_functs.h"
namespace CPPCOMMON
{
string joinStr(const vector<string>& src, const string& connectorStr)
{
string res;
string tmpStr;
size_t len = src.size();
for(size_t i = 0; i < len - 1; i++)
{
res += stripStr(src[i]);
res += connectorStr;
}
if(0 < len)
{
res += stripStr(src[len-1]);
}
return res;
}
/*
 * Convenience overload of splitStr(): splits `source` on `pattern` and hands
 * the pieces back by value instead of appending them to a caller-supplied
 * vector. All splitting work is delegated to the three-argument overload.
 *
 * @param source   text to split.
 * @param pattern  separator to split on.
 * @return the pieces produced by the three-argument overload.
 */
vector<string> splitStr(const string& source, const string& pattern)
{
    vector<string> pieces;
    splitStr(source, pieces, pattern);
    return pieces;
}
/*
 * Splits `source` on every occurrence of `pattern` and appends the pieces to
 * `out_vec` (existing contents of `out_vec` are kept).
 *
 * Each piece is passed through stripStr() (default pattern argument, declared
 * outside this file); pieces that are empty after stripping are skipped.
 *
 * The search runs directly on `source`. Matching against `source + pattern`
 * instead would let a self-overlapping separator such as ",," or "##" match
 * across the seam between the input and the appended copy, silently dropping
 * the tail of the input.
 *
 * @param source   text to split.
 * @param out_vec  receives the non-empty, stripped pieces in order.
 * @param pattern  separator; when empty the function does nothing.
 */
void splitStr(const string& source, vector<string>& out_vec, const string& pattern)
{
    if(pattern.empty())
    {
        return;
    }
    const string::size_type srcLen = source.size();
    string::size_type begin = 0;
    // `begin == srcLen` is still a valid (empty) final piece; anything larger
    // means the previous piece ended at the end of the input.
    while(begin <= srcLen)
    {
        string::size_type end = source.find(pattern, begin);
        if(string::npos == end)
        {
            // No further separator: the rest of the input is the last piece.
            end = srcLen;
        }
        const string piece = stripStr(source.substr(begin, end - begin));
        if(!piece.empty())
        {
            out_vec.push_back(piece);
        }
        begin = end + pattern.size();
    }
}
/*
 * Returns a copy of `str` with every leading and trailing character that
 * appears in `patternStr` removed. Characters in the middle are untouched.
 *
 * @param str         text to strip.
 * @param patternStr  set of characters to remove from both ends.
 * @return the stripped copy; an empty string when `str` is empty or consists
 *         only of characters from `patternStr`.
 */
string stripStr(const string& str, const string& patternStr)
{
    const string::size_type first = str.find_first_not_of(patternStr);
    if(string::npos == first)
    {
        // Nothing survives stripping (this also covers the empty input).
        return string();
    }
    // A surviving character exists, so find_last_not_of cannot return npos.
    const string::size_type last = str.find_last_not_of(patternStr);
    return str.substr(first, last - first + 1);
}
/*
 * Splits `strSrc` on any of several separators.
 *
 * Works by picking a sentinel made of '#' characters that does not occur in
 * `strSrc` (trying "#", "##", ... up to ten characters), rewriting every
 * pattern to that sentinel with replaceStr(), and then splitting once on the
 * sentinel with splitStr().
 *
 * @param strSrc    text to split.
 * @param outVec    receives the pieces produced by splitStr().
 * @param patterns  separators, applied to the working copy in order.
 * @return false (leaving `outVec` untouched) when no sentinel of at most ten
 *         '#' characters is absent from `strSrc`; true otherwise.
 */
bool splitStrMultiPatterns(
            const string& strSrc,
            vector<string>& outVec,
            const vector<string>& patterns
            )
{
    const char fillChar = '#';
    const string::size_type maxSentinelLen = 10;

    // Grow the sentinel one '#' at a time until the input no longer contains it.
    string sentinel(1, fillChar);
    while(string::npos != strSrc.find(sentinel))
    {
        sentinel.push_back(fillChar);
        if(maxSentinelLen < sentinel.size())
        {
            return false;
        }
    }

    // Rewrite each separator to the sentinel, one pattern after another.
    string work(strSrc);
    for(vector<string>::const_iterator it = patterns.begin(); it != patterns.end(); ++it)
    {
        work = replaceStr(work, *it, sentinel);
    }

    splitStr(work, outVec, sentinel);
    return true;
}
/*
 * Returns a copy of `strIn` with every byte mapped through toupper().
 *
 * The conversion is byte-wise, so only single-byte characters known to the
 * current C locale change; other bytes are passed through toupper() and kept
 * as whatever it returns for them.
 *
 * @param strIn  text to convert.
 * @return the converted copy.
 */
string upperStr(const string& strIn)
{
    string str = strIn;
    for(string::size_type i = 0; i < str.size(); i++)
    {
        // toupper() requires a value representable as unsigned char (or EOF).
        // Plain char may be signed, so bytes >= 0x80 must be widened through
        // unsigned char first to stay within the defined domain.
        str[i] = static_cast<char>(toupper(static_cast<unsigned char>(str[i])));
    }
    return str;
}
/*
 * Returns a copy of `strIn` with every byte mapped through tolower().
 *
 * The conversion is byte-wise, so only single-byte characters known to the
 * current C locale change; other bytes are passed through tolower() and kept
 * as whatever it returns for them.
 *
 * @param strIn  text to convert.
 * @return the converted copy.
 */
string lowerStr(const string& strIn)
{
    string str = strIn;
    for(string::size_type i = 0; i < str.size(); i++)
    {
        // tolower() requires a value representable as unsigned char (or EOF).
        // Plain char may be signed, so bytes >= 0x80 must be widened through
        // unsigned char first to stay within the defined domain.
        str[i] = static_cast<char>(tolower(static_cast<unsigned char>(str[i])));
    }
    return str;
}
/*
 * Returns a copy of `strSrc` in which occurrences of `oldStr` are replaced by
 * `newStr`, scanning left to right. Text inserted by a replacement is never
 * rescanned, so `newStr` may safely contain `oldStr`.
 *
 * @param strSrc  text to search.
 * @param oldStr  substring to look for; when empty nothing is replaced
 *                (an empty needle matches at every position and would only
 *                inject `newStr` at arbitrary places).
 * @param newStr  replacement text.
 * @param count   maximum number of replacements; a negative value (the
 *                conventional -1) means "replace all", 0 means "replace none".
 * @return the resulting string.
 */
string replaceStr(const string& strSrc, const string& oldStr, const string& newStr, int count)
{
    string strRet = strSrc;
    if(oldStr.empty() || 0 == count)
    {
        return strRet;
    }
    const bool unlimited = (count < 0);
    int replaced = 0;
    string::size_type pos = 0;
    while((pos = strRet.find(oldStr, pos)) != string::npos)
    {
        strRet.replace(pos, oldStr.size(), newStr);
        // Resume after the inserted text so it is not matched again.
        pos += newStr.size();
        if(!unlimited && ++replaced >= count)
        {
            break;
        }
    }
    return strRet;
}
/*
 * Position-wise distance between two strings: the number of indices within
 * the shorter length at which the bytes differ, plus the difference between
 * the two lengths.
 *
 * @param A  first string.
 * @param B  second string.
 * @return the distance; 0 only when both strings are equal.
 */
unsigned int countStrDistance(const string& A, const string& B)
{
    const unsigned int sizeA = A.size();
    const unsigned int sizeB = B.size();
    const unsigned int common = (sizeB <= sizeA) ? sizeB : sizeA;

    // Start with the length mismatch, then add one per differing position.
    unsigned int distance = (sizeA - common) + (sizeB - common);
    for(unsigned int k = 0; k != common; ++k)
    {
        distance += (A[k] == B[k]) ? 0u : 1u;
    }
    return distance;
}
/*
 * Position-wise similarity between two strings: the number of indices within
 * the shorter length at which both strings hold the same byte.
 *
 * @param A  first string.
 * @param B  second string.
 * @return the number of matching positions.
 */
unsigned int countStrSimilarity(const string& A, const string& B)
{
    const unsigned int sizeA = A.size();
    const unsigned int sizeB = B.size();
    const unsigned int common = (sizeB <= sizeA) ? sizeB : sizeA;

    unsigned int matches = 0;
    for(unsigned int k = 0; k != common; ++k)
    {
        matches += (A[k] == B[k]) ? 1u : 0u;
    }
    return matches;
}
//unicode utf8 transform
/*
 * Encodes `len` 16-bit code units from `in` as UTF-8 into `out` and writes a
 * terminating '\0' after the last encoded byte.
 *
 * Each unit is encoded on its own: values below 0x80 take one byte, values
 * below 0x800 take two, everything else takes three. The caller must
 * therefore provide room for up to 3 * len + 1 bytes in `out`.
 *
 * @param in   input code units.
 * @param len  number of units to read from `in`.
 * @param out  destination buffer.
 * @return number of bytes written, not counting the terminator.
 */
size_t unicodeToUtf8(uint16_t *in, size_t len, char * out)
{
    char * cursor = out;
    for(size_t idx = 0; idx < len; ++idx)
    {
        const uint16_t unit = in[idx];
        if(unit < 0x0080)
        {
            // 0xxxxxxx
            *cursor++ = static_cast<char>(unit);
        }
        else if(unit < 0x0800)
        {
            // 110xxxxx 10xxxxxx
            *cursor++ = static_cast<char>(0xc0 | (unit >> 6));
            *cursor++ = static_cast<char>(0x80 | (unit & 0x3f));
        }
        else
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            *cursor++ = static_cast<char>(0xe0 | (unit >> 12));
            *cursor++ = static_cast<char>(0x80 | ((unit >> 6) & 0x3f));
            *cursor++ = static_cast<char>(0x80 | (unit & 0x3f));
        }
    }
    *cursor = '\0';
    return static_cast<size_t>(cursor - out);
}
string unicodeToUtf8(const string& uniStr)
{
size_t len = uniStr.size();
if(len%2)
{
return "";
}
uint16_t * uniArr = new uint16_t[len>>1];
char * utfStr = new char[len<<1];
for(int i = 0; i < len; i+=2)
{
//uint16_t tmp1 = uniStr[i];
//tmp1 <<= 8;
//tmp1 &= 0xff00;
//uint16_t tmp2 = uniStr[i+1];
//tmp2 &= 0x00ff;
//uniArr[i>>1] = tmp1 | tmp2;
uniArr[i>>1] = twocharToUint16(uniStr[i], uniStr[i+1]);
}
string res;
size_t utfLen = unicodeToUtf8(uniArr, len>>1, utfStr);
if(0 == utfLen)
{
res = "";
}
else
{
res = utfStr;
}
delete [] uniArr;
delete [] utfStr;
return res;
}
/*from: http://www.cppblog.com/lf426/archive/2008/03/31/45796.html */
/*
 * Decodes `len` bytes of UTF-8 from `inutf8` into 16-bit code units and
 * writes a terminating 0 unit after the last decoded one.
 *
 * Only one-, two- and three-byte sequences are understood. The lead byte
 * alone selects the sequence length (<= 0x7f: one byte, <= 0xdf: two bytes,
 * anything else: three bytes); continuation bytes are not validated, and
 * four-byte sequences cannot be represented in a 16-bit unit.
 *
 * If the input ends in the middle of a multi-byte sequence, decoding stops at
 * that point instead of reading past `inutf8 + len`.
 *
 * The caller must provide room for up to len + 1 units in `unicode`.
 *
 * @param inutf8   input bytes.
 * @param len      number of bytes to read; values <= 0 decode nothing.
 * @param unicode  destination buffer.
 * @return number of code units written, not counting the terminator.
 */
int utf8ToUnicode(const char* inutf8, int len, uint16_t* unicode)
{
    const unsigned char* cur = reinterpret_cast<const unsigned char*>(inutf8);
    const unsigned char* const end = cur + (len > 0 ? len : 0);
    int length = 0;
    while(cur < end)
    {
        const unsigned char lead = *cur;
        if(lead <= 0x7f)
        {
            // Single byte: the unit is the byte itself, zero-extended.
            *unicode++ = lead;
            cur += 1;
        }
        else if(lead <= 0xdf)
        {
            // Two-byte sequence.
            if(end - cur < 2)
            {
                break; // truncated input: do not read beyond the buffer
            }
            *unicode++ = static_cast<uint16_t>(((lead & 0x1f) << 6) + (cur[1] & 0x3f));
            cur += 2;
        }
        else
        {
            // Three-byte sequence (covers most CJK text).
            if(end - cur < 3)
            {
                break; // truncated input: do not read beyond the buffer
            }
            *unicode++ = static_cast<uint16_t>(((lead & 0x0f) << 12) +
                                               ((cur[1] & 0x3f) << 6) +
                                               (cur[2] & 0x3f));
            cur += 3;
        }
        length++;
    }
    *unicode = 0;
    return length;
}
string utf8ToUnicode(const string& utfStr)
{
uint16_t* pUni = new uint16_t[utfStr.size()];
size_t uniLen = utf8ToUnicode(utfStr.c_str(), utfStr.size(), pUni);
string res;
if(uniLen ==0 )
{
res = "";
}
else
{
for(uint i = 0; i < uniLen; i++)
{
//char c = 0;
//c = ((pUni[i]>>8) & 0x00ff);
//res += c;
//c = (pUni[i] & 0x00ff);
//res += c;
pair<char, char> char2= uint16ToChar2(pUni[i]);
res += char2.first;
res += char2.second;
}
}
delete [] pUni;
return res;
}
}
#ifdef TEST_STR_FUNCTS
#include <iostream>
using namespace CPPCOMMON;
using namespace std;
// Manual test driver, compiled only when TEST_STR_FUNCTS is defined.
//
// The live code reads "testdata/dict.txt" line by line, prints each line,
// converts it with utf8ToUnicode() and back with unicodeToUtf8(), and prints
// the round-tripped text so the two can be compared by eye. The commented-out
// sections below are earlier ad-hoc experiments with the other helpers in
// this file, kept for reference.
//
// NOTE(review): ifstream and getchar are used here, but <fstream> and
// <cstdio> are not included in the visible part of this file; presumably they
// come in through str_functs.h -- confirm.
int main()
{
// --- stripStr / countStrDistance / joinStr / splitStr experiments ---
//string s = " \t\n1 a h \n";
//cout<<"["<<stripStr(s)<<"]"<<endl;
//cout<<countStrDistance("Aheheh","heheh1212")<<endl;
//cout<<joinStr(splitStr(s), ",")<<endl;
//vector<string> vec;
//splitStr("1 3 4", vec);
// --- multi-pattern split experiment ---
//char * a[] = {"3","jaj","ads"};
//vector<string> pats(a,a+3);
//vec.clear();
//splitStrMultiPattern("1 #3 jajs5 asdf3ads 4", vec, pats);
//for(int i=0;i<vec.size();i++)
//{
// cout<<vec[i]<<endl;
//}
// --- upperStr / replaceStr experiments ---
//string s = "1111aaafasfa,asdj.sadhashfhaha";
//upperStr(s);
//cout<<s<<endl;
//
//s = "ab1ba2ab3";
//cout<<replaceStr(s,"ab","###")<<endl;
// --- round trip through the pointer-based overloads with fixed buffers ---
//ifstream ifile("testdata/dict.txt");
//string line;
//while(getline(ifile, line))
//{
// uint16_t strbuf[1024];
// size_t unilen = utf8ToUnicode(line.c_str(), line.size(), strbuf);
// for(int i = 0; i < unilen; i++)
// {
// // printf("%x\n", strbuf[i]);
// }
// char utf8str[512]={0};
// unicodeToUtf8(strbuf, unilen, utf8str);
// //cout<<strlen(utf8str);
// cout<<utf8str<<endl;
//}
// Live test: round trip every dictionary line through the string overloads.
ifstream ifile("testdata/dict.txt");
string line;
while(getline(ifile, line))
{
// Original line first, then the UTF-8 -> code units -> UTF-8 result.
cout<<line<<endl;
string uniStr = utf8ToUnicode(line);
//cout<<uniStr<<endl;
string utfStr = unicodeToUtf8(uniStr);
cout<<utfStr<<endl;
}
// Reads one character from stdin before returning.
getchar();
return 0;
}
#endif