From 35ba8f058ea436bd1997c77b0f5111bfcb9e4f88 Mon Sep 17 00:00:00 2001
From: wyy
Date: Wed, 4 Dec 2013 07:13:31 -0800
Subject: [PATCH] mv unicode <=> utf8 from transcode.hpp into Limonp/str_functs.hpp

---
Review notes (ignored by git am, not part of the commit message):
- Line structure was rebuilt from a copy whose newlines and indentation had
  been collapsed; indentation and the position of blank context lines are
  best-effort.
- Template arguments (<string>, <uint16_t>) were restored where they had been
  stripped. The author e-mail is still missing from the From: header.
- utf8ToUnicode now rejects stray continuation bytes and malformed trailing
  bytes, and both new helpers gained doc comments. The str_functs.hpp
  post-image blob id in the index line therefore no longer matches.

 CMakeLists.txt            |   1 +
 src/Limonp/str_functs.hpp | 133 +++++++++++++++++++++++++++-----------
 src/TransCode.hpp         |  77 ++++------------------
 3 files changed, 111 insertions(+), 100 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2e00c1a..dd751d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
 PROJECT(CPPJIEBA)
 SET(CMAKE_INSTALL_PREFIX /usr)
 ADD_DEFINITIONS(-std=c++0x -O3)
+#ADD_DEFINITIONS(-DCPPJIEBA_GBK)
 ADD_SUBDIRECTORY(src)
 ADD_SUBDIRECTORY(dicts)
 ADD_SUBDIRECTORY(scripts)
diff --git a/src/Limonp/str_functs.hpp b/src/Limonp/str_functs.hpp
index 8ad62a3..f572ee9 100644
--- a/src/Limonp/str_functs.hpp
+++ b/src/Limonp/str_functs.hpp
@@ -120,7 +120,7 @@ namespace Limonp
         return res;
     }
 
-    
+
 
     inline bool splitStr(const string& src, vector<string>& res, const string& pattern)
     {
@@ -218,41 +218,102 @@ namespace Limonp
         return str.find(ch) != string::npos;
     }
 
-    //inline void extractWords(const string& sentence, vector<string>& words)
-    //{
-    //    bool flag = false;
-    //    uint lhs = 0, len = 0;
-    //    for(uint i = 0; i < sentence.size(); i++)
-    //    {
-    //        char x = sentence[i];
-    //        if((0x0030 <= x && x<= 0x0039) || (0x0041 <= x && x <= 0x005a ) || (0x0061 <= x && x <= 0x007a))
-    //        {
-    //            if(flag)
-    //            {
-    //                len ++;
-    //            }
-    //            else
-    //            {
-    //                lhs = i;
-    //                len = 1;
-    //            }
-    //            flag = true;
-    //        }
-    //        else
-    //        {
-    //            if(flag)
-    //            {
-    //                words.push_back(string(sentence, lhs, len));
-    //            }
-    //            flag = false;
-    //        }
-    //    }
-    //    if(flag)
-    //    {
-    //        words.push_back(string(sentence, lhs, len));
-    //    }
-    //}
-
+    /**
+     * Decode UTF-8 text from str into 16-bit units stored in vec.
+     *
+     * Only 1-, 2- and 3-byte sequences are accepted. Returns false and
+     * leaves vec untouched when str is empty. Otherwise vec is cleared
+     * first, and false is returned as soon as a lead byte is not
+     * 0xxxxxxx / 110xxxxx / 1110xxxx, a sequence runs past the end of
+     * str, or a trailing byte is not 10xxxxxx; vec then keeps the units
+     * decoded so far.
+     */
+    inline bool utf8ToUnicode(const string& str, vector<uint16_t>& vec)
+    {
+        char ch1, ch2;
+        if(str.empty())
+        {
+            return false;
+        }
+        vec.clear();
+        size_t siz = str.size();
+        for(size_t i = 0;i < siz;)
+        {
+            unsigned char lead = (unsigned char)str[i];
+            if(!(lead & 0x80)) // 0xxxxxxx
+            {
+                vec.push_back(str[i]);
+                i++;
+            }
+            else if (lead >= 0xc0 && lead <= 0xdf && i + 1 < siz) // 110xxxxx 10xxxxxx
+            {
+                // A trailing byte must be 10xxxxxx, otherwise the input is malformed.
+                if(((unsigned char)str[i+1] & 0xc0) != 0x80)
+                {
+                    return false;
+                }
+                ch1 = (str[i] >> 2) & 0x07;
+                ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
+                vec.push_back(twocharToUint16(ch1, ch2));
+                i += 2;
+            }
+            else if(lead >= 0xe0 && lead <= 0xef && i + 2 < siz) // 1110xxxx 10xxxxxx 10xxxxxx
+            {
+                if(((unsigned char)str[i+1] & 0xc0) != 0x80 || ((unsigned char)str[i+2] & 0xc0) != 0x80)
+                {
+                    return false;
+                }
+                ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
+                ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
+                vec.push_back(twocharToUint16(ch1, ch2));
+                i += 3;
+            }
+            else
+            {
+                // Stray continuation byte (10xxxxxx), a 4-byte lead or a truncated sequence.
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Encode the 16-bit units in [begin, end) as UTF-8 into res.
+     *
+     * Returns false without touching res when the range is empty;
+     * otherwise res is cleared, filled with 1 to 3 bytes per unit, and
+     * true is returned.
+     */
+    inline bool unicodeToUtf8(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
+    {
+        if(begin >= end)
+        {
+            return false;
+        }
+        res.clear();
+        uint16_t ui;
+        while(begin != end)
+        {
+            ui = *begin;
+            if(ui <= 0x7f)
+            {
+                res += char(ui);
+            }
+            else if(ui <= 0x7ff)
+            {
+                res += char(((ui>>6) & 0x1f) | 0xc0);
+                res += char((ui & 0x3f) | 0x80);
+            }
+            else
+            {
+                res += char(((ui >> 12) & 0x0f )| 0xe0);
+                res += char(((ui>>6) & 0x3f )| 0x80 );
+                res += char((ui & 0x3f) | 0x80);
+            }
+            begin ++;
+        }
+        return true;
+    }
 }
 
 #endif
diff --git a/src/TransCode.hpp b/src/TransCode.hpp
index 113aceb..4e9c926 100644
--- a/src/TransCode.hpp
+++ b/src/TransCode.hpp
@@ -17,76 +17,25 @@ namespace CppJieba
     {
         inline bool decode(const string& str, vector<uint16_t>& vec)
         {
-            char ch1, ch2;
-            if(str.empty())
-            {
-                return false;
-            }
-            vec.clear();
-            size_t siz = str.size();
-            for(uint i = 0;i < siz;)
-            {
-                if(!(str[i] & 0x80)) // 0xxxxxxx
-                {
-                    vec.push_back(str[i]);
-                    i++;
-                }
-                else if ((unsigned char)str[i] <= 0xdf && i + 1 < siz) // 110xxxxxx
-                {
-                    ch1 = (str[i] >> 2) & 0x07;
-                    ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
-                    vec.push_back(twocharToUint16(ch1, ch2));
-                    i += 2;
-                }
-                else if((unsigned char)str[i] <= 0xef && i + 2 < siz)
-                {
-                    ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
-                    ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
-                    vec.push_back(twocharToUint16(ch1, ch2));
-                    i += 3;
-                }
-                else
-                {
-                    return false;
-                }
-            }
-            return true;
+#ifdef CPPJIEBA_GBK
+            return false;
+#else
+            return utf8ToUnicode(str, vec);
+#endif
         }
 
-        
         inline bool encode(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
         {
-            if(begin >= end)
-            {
-                return false;
-            }
-            res.clear();
-            uint16_t ui;
-            while(begin != end)
-            {
-                ui = *begin;
-                if(ui <= 0x7f)
-                {
-                    res += char(ui);
-                }
-                else if(ui <= 0x7ff)
-                {
-                    res += char(((ui>>6) & 0x1f) | 0xc0);
-                    res += char((ui & 0x3f) | 0x80);
-                }
-                else
-                {
-                    res += char(((ui >> 12) & 0x0f )| 0xe0);
-                    res += char(((ui>>6) & 0x3f )| 0x80 );
-                    res += char((ui & 0x3f) | 0x80);
-                }
-                begin ++;
-            }
-            return true;
+#ifdef CPPJIEBA_GBK
+            return false;
+#else
+            return unicodeToUtf8(begin, end, res);
+#endif
         }
-        inline bool encode(const vector<uint16_t>& sentence, string& res)
+
+        inline bool encode(const vector<uint16_t>& uni, string& res)
         {
-            return encode(sentence.begin(), sentence.end(), res);
+            return encode(uni.begin(), uni.end(), res);
         }
     }
 }