From 961575e33968dac048f5ff01ccaf1cb20c05eb53 Mon Sep 17 00:00:00 2001 From: wyy Date: Wed, 18 Sep 2013 10:21:10 +0800 Subject: [PATCH] remve the utf8 trans out of Transcode.h and delete TransCode this static class ,using namespace TransCode instead, and inlining funct in it , so remove the cpp , only use transcode.h --- cppjieba/TransCode.cpp | 181 --------------------------------------- cppjieba/TransCode.h | 110 +++++++++++++----------- cppjieba/structs.h | 1 + demo/example.sh | 7 +- demo/keywordext_demo.cpp | 22 ++--- demo/segment_demo.cpp | 29 +++---- demo/testlines.utf8 | 4 - 7 files changed, 89 insertions(+), 265 deletions(-) delete mode 100644 cppjieba/TransCode.cpp delete mode 100644 demo/testlines.utf8 diff --git a/cppjieba/TransCode.cpp b/cppjieba/TransCode.cpp deleted file mode 100644 index d6e9bd2..0000000 --- a/cppjieba/TransCode.cpp +++ /dev/null @@ -1,181 +0,0 @@ -#include "TransCode.h" - -namespace CppJieba -{ - vector TransCode::_encVec; - bool TransCode::_isInitted = TransCode::init(); - TransCode::pf_decode_t TransCode::_pf_decode = NULL; - TransCode::pf_encode_t TransCode::_pf_encode = NULL; - - bool TransCode::init() - { - _pf_decode = gbkToVec; - _pf_encode = vecToGbk; - return true; - } - - TransCode::TransCode() - { - - } - TransCode::~TransCode() - { - } - - void TransCode::setGbkEnc() - { - _pf_decode = gbkToVec; - _pf_encode = vecToGbk; - } - - void TransCode::setUtf8Enc() - { - _pf_decode = utf8ToVec; - _pf_encode = vecToUtf8; - } - - - bool TransCode::utf8ToVec(const string& str, vector& vec) - { - char ch1, ch2; - if(str.empty()) - { - return false; - } - vec.clear(); - size_t siz = str.size(); - for(uint i = 0;i < siz;) - { - if(!(str[i] & 0x80)) // 0xxxxxxx - { - vec.push_back(str[i]); - i++; - } - else if ((unsigned char)str[i] <= 0xdf && i + 1 < siz) // 110xxxxxx - { - ch1 = (str[i] >> 2) & 0x07; - ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 ); - vec.push_back(twocharToUint16(ch1, ch2)); - i += 2; - } - else if((unsigned char)str[i] <= 0xef && i + 2 < siz) - { - ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f ); - ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f); - vec.push_back(twocharToUint16(ch1, ch2)); - i += 3; - } - else - { - return false; - } - } - return true; - } - - bool TransCode::gbkToVec(const string& str, vector& vec) - { - vec.clear(); - if(str.empty()) - { - return false; - } - uint i = 0; - while(i < str.size()) - { - if(0 == (str[i] & 0x80)) - { - vec.push_back(uint16_t(str[i])); - i++; - } - else - { - if(i + 1 < str.size()) //&& (str[i+1] & 0x80)) - { - vec.push_back(twocharToUint16(str[i], str[i + 1])); - i += 2; - } - else - { - return false; - } - } - } - return true; - } - - - bool TransCode::vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) - { - if(begin >= end) - { - return false; - } - res.clear(); - uint16_t ui; - while(begin != end) - { - ui = *begin; - if(ui <= 0x7f) - { - res += char(ui); - } - else if(ui <= 0x7ff) - { - res += char(((ui>>6) & 0x1f) | 0xc0); - res += char((ui & 0x3f) | 0x80); - } - else - { - res += char(((ui >> 12) & 0x0f )| 0xe0); - res += char(((ui>>6) & 0x3f )| 0x80 ); - res += char((ui & 0x3f) | 0x80); - } - begin ++; - } - return true; - } - - bool TransCode::vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) - { - if(begin >= end) - { - return false; - } - res.clear(); - pair pa; - while(begin != end) - { - pa = uint16ToChar2(*begin); - if(pa.first & 0x80) - { - res += pa.first; - res += pa.second; - } - else - { - res += pa.second; - } - begin++; - } - return true; - } -} - - -#ifdef CPPJIEBA_TRANSCODE_UT -using namespace CPPCOMMON; -using namespace CppJieba; -int main() -{ - string a("abd你好世界!a"); - vector vec; - //TransCode::setUtf8Enc(); - cout<&); - typedef bool (*pf_encode_t)(Unicode::const_iterator begin, Unicode::const_iterator end, string& res); - private: - static vector _encVec; - static bool _isInitted; - static pf_decode_t _pf_decode; - static pf_encode_t _pf_encode; - - public: - static void setGbkEnc(); - static void setUtf8Enc(); - - private: - TransCode(); - ~TransCode(); - public: - static bool init(); - public: - static inline bool decode(const string& str, vector& vec); - static inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res); - static inline bool encode(const Unicode& sentence, string& res); - public: - static bool gbkToVec(const string& str, vector& vec); - static bool vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end, string& res); - public: - static bool utf8ToVec(const string& str, vector& vec); - static bool vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end, string& res); - }; + namespace TransCode + { - inline bool TransCode::decode(const string& str, vector& vec) - { - if(NULL == _pf_decode) + inline bool encode(vector::const_iterator begin, vector::const_iterator end, string& res) { - return false; + if(begin >= end) + { + return false; + } + res.clear(); + pair pa; + while(begin != end) + { + pa = CPPCOMMON::uint16ToChar2(*begin); + if(pa.first & 0x80) + { + res += pa.first; + res += pa.second; + } + else + { + res += pa.second; + } + begin++; + } + return true; } - return _pf_decode(str, vec); - } - inline bool TransCode::encode(const Unicode& sentence, string& res) - { - return encode(sentence.begin(), sentence.end(), res); - } - - inline bool TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) - { - if(!_pf_encode) + + inline bool encode(const vector& sentence, string& res) { - return false; + return encode(sentence.begin(), sentence.end(), res); } - return _pf_encode(begin, end, res); + + inline bool decode(const string& str, vector& vec) + { + vec.clear(); + if(str.empty()) + { + return false; + } + uint i = 0; + while(i < str.size()) + { + if(0 == (str[i] & 0x80)) + { + vec.push_back(uint16_t(str[i])); + i++; + } + else + { + if(i + 1 < str.size()) //&& (str[i+1] & 0x80)) + { + vec.push_back(CPPCOMMON::twocharToUint16(str[i], str[i + 1])); + i += 2; + } + else + { + return false; + } + } + } + return true; + } + } } diff --git a/cppjieba/structs.h b/cppjieba/structs.h index 2c01cef..4b60e55 100644 --- a/cppjieba/structs.h +++ b/cppjieba/structs.h @@ -9,6 +9,7 @@ namespace CppJieba { + using namespace CPPCOMMON; struct TrieNodeInfo { //string word; diff --git a/demo/example.sh b/demo/example.sh index b856869..b19ec1c 100755 --- a/demo/example.sh +++ b/demo/example.sh @@ -1,4 +1,3 @@ -make && \ -./segment_demo testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8 &&\ -./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM &&\ -./segment_demo testlines.utf8 --algorithm cutMix +./segment_demo testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk +./segment_demo testlines.gbk --modelpath ../dicts/hmm_model.gbk --algorithm cutHMM +./segment_demo testlines.gbk --modelpath ../dicts/hmm_model.gbk --algorithm cutMix diff --git a/demo/keywordext_demo.cpp b/demo/keywordext_demo.cpp index 1d42500..777f6cc 100644 --- a/demo/keywordext_demo.cpp +++ b/demo/keywordext_demo.cpp @@ -29,7 +29,7 @@ void testKeyWordExt(const char * dictPath, const char * filePath) ext.dispose(); } -const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.utf8"; +const char * const DEFAULT_DICTPATH = "../dicts/jieba.dict.gbk"; int main(int argc, char ** argv) { @@ -38,9 +38,9 @@ int main(int argc, char ** argv) cout<<"usage: \n\t"<\n" <<"options:\n" <<"\t--dictpath\tIf not specified, the default is "<