Move Unicode <=> UTF-8 conversion from transcode.hpp into Limonp/str_functs.hpp

This commit is contained in:
wyy 2013-12-04 07:13:31 -08:00
parent fc55fb4ccc
commit 35ba8f058e
3 changed files with 82 additions and 100 deletions

View File

@ -1,6 +1,7 @@
PROJECT(CPPJIEBA) PROJECT(CPPJIEBA)
SET(CMAKE_INSTALL_PREFIX /usr) SET(CMAKE_INSTALL_PREFIX /usr)
ADD_DEFINITIONS(-std=c++0x -O3) ADD_DEFINITIONS(-std=c++0x -O3)
#ADD_DEFINITIONS(-DCPPJIEBA_GBK)
ADD_SUBDIRECTORY(src) ADD_SUBDIRECTORY(src)
ADD_SUBDIRECTORY(dicts) ADD_SUBDIRECTORY(dicts)
ADD_SUBDIRECTORY(scripts) ADD_SUBDIRECTORY(scripts)

View File

@ -120,7 +120,7 @@ namespace Limonp
return res; return res;
} }
inline bool splitStr(const string& src, vector<string>& res, const string& pattern) inline bool splitStr(const string& src, vector<string>& res, const string& pattern)
{ {
@ -218,41 +218,73 @@ namespace Limonp
return str.find(ch) != string::npos; return str.find(ch) != string::npos;
} }
// Decode a UTF-8 byte string into 16-bit code points (U+0000..U+FFFF only).
//
// str : UTF-8 encoded input.
// vec : receives one uint16_t per decoded character. It is cleared before
//       decoding starts, but is left untouched when str is empty.
//
// Returns true when the whole of str was decoded. Returns false when
//   - str is empty,
//   - a multi-byte sequence is truncated,
//   - a lead byte is not a valid 1-, 2- or 3-byte lead (this covers stray
//     continuation bytes and 4-byte sequences, whose code points do not fit
//     in a uint16_t),
//   - a continuation byte does not have the form 10xxxxxx,
//   - a sequence is an overlong encoding of a smaller code point.
// On a false return after decoding has started, vec holds the characters
// that were decoded before the error.
inline bool utf8ToUnicode(const string& str, vector<uint16_t>& vec)
{
    if(str.empty())
    {
        return false;
    }
    vec.clear();
    const size_t siz = str.size();
    for(size_t i = 0; i < siz;)
    {
        // Work on unsigned bytes so that shifts and comparisons do not depend
        // on whether plain char is signed on this platform.
        const unsigned char b0 = (unsigned char)str[i];
        if(b0 < 0x80) // 0xxxxxxx
        {
            vec.push_back(b0);
            i++;
        }
        else if((b0 & 0xe0) == 0xc0) // 110xxxxx 10xxxxxx
        {
            if(i + 1 >= siz)
            {
                return false; // truncated sequence
            }
            const unsigned char b1 = (unsigned char)str[i+1];
            if((b1 & 0xc0) != 0x80)
            {
                return false; // not a continuation byte
            }
            const uint16_t cp = uint16_t(((b0 & 0x1f) << 6) | (b1 & 0x3f));
            if(cp < 0x80)
            {
                return false; // overlong: would fit in one byte
            }
            vec.push_back(cp);
            i += 2;
        }
        else if((b0 & 0xf0) == 0xe0) // 1110xxxx 10xxxxxx 10xxxxxx
        {
            if(i + 2 >= siz)
            {
                return false; // truncated sequence
            }
            const unsigned char b1 = (unsigned char)str[i+1];
            const unsigned char b2 = (unsigned char)str[i+2];
            if((b1 & 0xc0) != 0x80 || (b2 & 0xc0) != 0x80)
            {
                return false; // not continuation bytes
            }
            const uint16_t cp = uint16_t(((b0 & 0x0f) << 12) | ((b1 & 0x3f) << 6) | (b2 & 0x3f));
            if(cp < 0x800)
            {
                return false; // overlong: would fit in two bytes
            }
            vec.push_back(cp);
            i += 3;
        }
        else
        {
            // Stray continuation byte (10xxxxxx), 4-byte lead (11110xxx),
            // or a byte that never appears in UTF-8.
            return false;
        }
    }
    return true;
}
// Encode the 16-bit values in [begin, end) as UTF-8 and store the bytes in res.
//
// Returns false, leaving res untouched, when the range is empty (begin >= end).
// Otherwise res is replaced by the encoded text and true is returned.
// Each value becomes one byte (<= 0x7f), two bytes (<= 0x7ff) or three bytes.
inline bool unicodeToUtf8(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
{
    if(!(begin < end))
    {
        return false;
    }
    res.clear();
    for(vector<uint16_t>::const_iterator it = begin; it != end; ++it)
    {
        const uint16_t unit = *it;
        if(unit > 0x7ff)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            res.push_back(char(0xe0 | ((unit >> 12) & 0x0f)));
            res.push_back(char(0x80 | ((unit >> 6) & 0x3f)));
            res.push_back(char(0x80 | (unit & 0x3f)));
        }
        else if(unit > 0x7f)
        {
            // 110xxxxx 10xxxxxx
            res.push_back(char(0xc0 | ((unit >> 6) & 0x1f)));
            res.push_back(char(0x80 | (unit & 0x3f)));
        }
        else
        {
            // 0xxxxxxx
            res.push_back(char(unit));
        }
    }
    return true;
}
} }
#endif #endif

View File

@ -17,76 +17,25 @@ namespace CppJieba
{ {
inline bool decode(const string& str, vector<uint16_t>& vec) inline bool decode(const string& str, vector<uint16_t>& vec)
{ {
char ch1, ch2; #ifdef CPPJIEBA_GBK
if(str.empty()) return false;
{ #else
return false; return utf8ToUnicode(str, vec);
} #endif
vec.clear();
size_t siz = str.size();
for(uint i = 0;i < siz;)
{
if(!(str[i] & 0x80)) // 0xxxxxxx
{
vec.push_back(str[i]);
i++;
}
else if ((unsigned char)str[i] <= 0xdf && i + 1 < siz) // 110xxxxxx
{
ch1 = (str[i] >> 2) & 0x07;
ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
vec.push_back(twocharToUint16(ch1, ch2));
i += 2;
}
else if((unsigned char)str[i] <= 0xef && i + 2 < siz)
{
ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
vec.push_back(twocharToUint16(ch1, ch2));
i += 3;
}
else
{
return false;
}
}
return true;
} }
inline bool encode(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res) inline bool encode(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
{ {
if(begin >= end) #ifdef CPPJIEBA_GBK
{ return false;
return false; #else
} return unicodeToUtf8(begin, end, res);
res.clear(); #endif
uint16_t ui;
while(begin != end)
{
ui = *begin;
if(ui <= 0x7f)
{
res += char(ui);
}
else if(ui <= 0x7ff)
{
res += char(((ui>>6) & 0x1f) | 0xc0);
res += char((ui & 0x3f) | 0x80);
}
else
{
res += char(((ui >> 12) & 0x0f )| 0xe0);
res += char(((ui>>6) & 0x3f )| 0x80 );
res += char((ui & 0x3f) | 0x80);
}
begin ++;
}
return true;
} }
inline bool encode(const vector<uint16_t>& sentence, string& res)
inline bool encode(const vector<uint16_t>& uni, string& res)
{ {
return encode(sentence.begin(), sentence.end(), res); return encode(uni.begin(), uni.end(), res);
} }
} }
} }