From 62efd1fca4adba34c8b50d20805f8966d0c7eade Mon Sep 17 00:00:00 2001 From: gwdwyy Date: Wed, 21 Aug 2013 13:00:48 +0800 Subject: [PATCH] little change --- .gitignore | 5 +++++ demo/testlines.utf8 | 3 +++ src/TransCode.cpp | 22 +++++++++------------- 3 files changed, 17 insertions(+), 13 deletions(-) create mode 100644 demo/testlines.utf8 diff --git a/.gitignore b/.gitignore index f047bef..94c2ec8 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,8 @@ tags log main lib*.a +*_demo +segdict* +prior.gbk +tmp +t.* diff --git a/demo/testlines.utf8 b/demo/testlines.utf8 new file mode 100644 index 0000000..3419fda --- /dev/null +++ b/demo/testlines.utf8 @@ -0,0 +1,3 @@ +我来到北京清华大学 +他来到了网易杭研大厦 +小明硕士毕业于中国科学院计算所,后在日本京都大学深造 diff --git a/src/TransCode.cpp b/src/TransCode.cpp index 663899d..b4d8601 100644 --- a/src/TransCode.cpp +++ b/src/TransCode.cpp @@ -51,23 +51,24 @@ namespace CppJieba return false; } vec.clear(); - for(uint i = 0;i < str.size();) + size_t siz = str.size(); + for(uint i = 0;i < siz;) { - if((unsigned char)str[i] <= 0x7f) // 0xxxxxxx + if(!(str[i] & 0x80)) // 0xxxxxxx { vec.push_back(str[i]); i++; } - else if ((unsigned char)str[i] <= 0xdf && i + 1 < str.size()) // 110xxxxxx + else if ((unsigned char)str[i] <= 0xdf && i + 1 < siz) // 110xxxxxx { - ch1 = ((unsigned char)str[i] >> 2) & 0x07; + ch1 = (str[i] >> 2) & 0x07; ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 ); vec.push_back(twocharToUint16(ch1, ch2)); i += 2; } - else if((unsigned char)str[i] <= 0xef && i + 2 < str.size()) + else if((unsigned char)str[i] <= 0xef && i + 2 < siz) { - ch1 = ((unsigned char)str[i] << 4) | (((unsigned char)str[i+1] >> 2) & 0x0f ); + ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f ); ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f); vec.push_back(twocharToUint16(ch1, ch2)); i += 3; @@ -178,13 +179,8 @@ namespace CppJieba size_t TransCode::getWordLength(const string& str) { - if(NULL == _pf_strToVec) - { - return 0; - } vector vec; - bool ret = strToVec(str, vec); - if(!ret) + if(!strToVec(str, vec)) { return 0; } @@ -222,7 +218,7 @@ int main() //vector vec; //tmp("1",vec); - string a("严"); + string a("abd你好世界!a"); vector vec; //TransCode::setUtf8Enc(); cout<