From 4a11d95cf6a1af121089a304f4bb7cd0a16a0283 Mon Sep 17 00:00:00 2001 From: wyy Date: Wed, 18 Sep 2013 10:57:41 +0800 Subject: [PATCH] remve the gbk trans out of Transcode.h and delete TransCode this static class ,using namespace TransCode instead, and inlining funct in it , so remove the cpp , only use transcode.h --- README.md | 13 +-- cppjieba/TransCode.cpp | 181 --------------------------------------- cppjieba/TransCode.h | 121 +++++++++++++++----------- demo/example.sh | 5 +- demo/keywordext_demo.cpp | 12 --- demo/segment_demo.cpp | 14 +-- demo/testlines.gbk | 3 - 7 files changed, 84 insertions(+), 265 deletions(-) delete mode 100644 cppjieba/TransCode.cpp delete mode 100644 demo/testlines.gbk diff --git a/README.md b/README.md index 4dedf93..a27287b 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,12 @@ #CppJieba是"结巴"中文分词的C++库 ## 中文编码 -* 现在支持utf8,gbk编码的分词。默认编码是utf8。 + +现在支持utf8,gbk编码的分词。 + +- `master`分支支持utf8编码 +- `gbk`分支支持gbk编码 + ## 模块详解 @@ -120,17 +125,15 @@ make 之后产生libcppjieb.a usage: ./segment_demo[options] options: - --algorithm Supported encoding methods are [cutDAG, cutHMM, cutMix] for now. + --algorithm Supported methods are [cutDAG, cutHMM, cutMix] for now. If not specified, the default is cutDAG --dictpath If not specified, the default is ../dicts/jieba.dict.utf8 --modelpath If not specified, the default is ../dicts/hmm_model.utf8 - --encoding Supported encoding methods are [gbk, utf-8] for now. If not specified, the default is utf8. example: - ./segment_demo testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8 + ./segment_demo testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 ./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM ./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix - ./segment_demo testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk ``` diff --git a/cppjieba/TransCode.cpp b/cppjieba/TransCode.cpp deleted file mode 100644 index d6e9bd2..0000000 --- a/cppjieba/TransCode.cpp +++ /dev/null @@ -1,181 +0,0 @@ -#include "TransCode.h" - -namespace CppJieba -{ - vector TransCode::_encVec; - bool TransCode::_isInitted = TransCode::init(); - TransCode::pf_decode_t TransCode::_pf_decode = NULL; - TransCode::pf_encode_t TransCode::_pf_encode = NULL; - - bool TransCode::init() - { - _pf_decode = gbkToVec; - _pf_encode = vecToGbk; - return true; - } - - TransCode::TransCode() - { - - } - TransCode::~TransCode() - { - } - - void TransCode::setGbkEnc() - { - _pf_decode = gbkToVec; - _pf_encode = vecToGbk; - } - - void TransCode::setUtf8Enc() - { - _pf_decode = utf8ToVec; - _pf_encode = vecToUtf8; - } - - - bool TransCode::utf8ToVec(const string& str, vector& vec) - { - char ch1, ch2; - if(str.empty()) - { - return false; - } - vec.clear(); - size_t siz = str.size(); - for(uint i = 0;i < siz;) - { - if(!(str[i] & 0x80)) // 0xxxxxxx - { - vec.push_back(str[i]); - i++; - } - else if ((unsigned char)str[i] <= 0xdf && i + 1 < siz) // 110xxxxxx - { - ch1 = (str[i] >> 2) & 0x07; - ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 ); - vec.push_back(twocharToUint16(ch1, ch2)); - i += 2; - } - else if((unsigned char)str[i] <= 0xef && i + 2 < siz) - { - ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f ); - ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f); - vec.push_back(twocharToUint16(ch1, ch2)); - i += 3; - } - else - { - return false; - } - } - return true; - } - - bool TransCode::gbkToVec(const string& str, vector& vec) - { - vec.clear(); - if(str.empty()) - { - return false; - } - uint i = 0; - while(i < str.size()) - { - if(0 == (str[i] & 0x80)) - { - vec.push_back(uint16_t(str[i])); - i++; - } - else - { - if(i + 1 < str.size()) //&& (str[i+1] & 0x80)) - { - vec.push_back(twocharToUint16(str[i], str[i + 1])); - i += 2; - } - else - { - return false; - } - } - } - return true; - } - - - bool TransCode::vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) - { - if(begin >= end) - { - return false; - } - res.clear(); - uint16_t ui; - while(begin != end) - { - ui = *begin; - if(ui <= 0x7f) - { - res += char(ui); - } - else if(ui <= 0x7ff) - { - res += char(((ui>>6) & 0x1f) | 0xc0); - res += char((ui & 0x3f) | 0x80); - } - else - { - res += char(((ui >> 12) & 0x0f )| 0xe0); - res += char(((ui>>6) & 0x3f )| 0x80 ); - res += char((ui & 0x3f) | 0x80); - } - begin ++; - } - return true; - } - - bool TransCode::vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) - { - if(begin >= end) - { - return false; - } - res.clear(); - pair pa; - while(begin != end) - { - pa = uint16ToChar2(*begin); - if(pa.first & 0x80) - { - res += pa.first; - res += pa.second; - } - else - { - res += pa.second; - } - begin++; - } - return true; - } -} - - -#ifdef CPPJIEBA_TRANSCODE_UT -using namespace CPPCOMMON; -using namespace CppJieba; -int main() -{ - string a("abd你好世界!a"); - vector vec; - //TransCode::setUtf8Enc(); - cout<&); - typedef bool (*pf_encode_t)(Unicode::const_iterator begin, Unicode::const_iterator end, string& res); - private: - static vector _encVec; - static bool _isInitted; - static pf_decode_t _pf_decode; - static pf_encode_t _pf_encode; - - public: - static void setGbkEnc(); - static void setUtf8Enc(); - - private: - TransCode(); - ~TransCode(); - public: - static bool init(); - public: - static inline bool decode(const string& str, vector& vec); - static inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res); - static inline bool encode(const Unicode& sentence, string& res); - - public: - static bool gbkToVec(const string& str, vector& vec); - static bool vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end, string& res); - public: - static bool utf8ToVec(const string& str, vector& vec); - static bool vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end, string& res); - }; - - inline bool TransCode::decode(const string& str, vector& vec) - { - if(NULL == _pf_decode) + inline bool decode(const string& str, vector& vec) { - return false; + char ch1, ch2; + if(str.empty()) + { + return false; + } + vec.clear(); + size_t siz = str.size(); + for(uint i = 0;i < siz;) + { + if(!(str[i] & 0x80)) // 0xxxxxxx + { + vec.push_back(str[i]); + i++; + } + else if ((unsigned char)str[i] <= 0xdf && i + 1 < siz) // 110xxxxxx + { + ch1 = (str[i] >> 2) & 0x07; + ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 ); + vec.push_back(twocharToUint16(ch1, ch2)); + i += 2; + } + else if((unsigned char)str[i] <= 0xef && i + 2 < siz) + { + ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f ); + ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f); + vec.push_back(twocharToUint16(ch1, ch2)); + i += 3; + } + else + { + return false; + } + } + return true; } - return _pf_decode(str, vec); - } - inline bool TransCode::encode(const Unicode& sentence, string& res) - { - return encode(sentence.begin(), sentence.end(), res); - } - - inline bool TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) - { - if(!_pf_encode) + + + inline bool encode(vector::const_iterator begin, vector::const_iterator end, string& res) { - return false; + if(begin >= end) + { + return false; + } + res.clear(); + uint16_t ui; + while(begin != end) + { + ui = *begin; + if(ui <= 0x7f) + { + res += char(ui); + } + else if(ui <= 0x7ff) + { + res += char(((ui>>6) & 0x1f) | 0xc0); + res += char((ui & 0x3f) | 0x80); + } + else + { + res += char(((ui >> 12) & 0x0f )| 0xe0); + res += char(((ui>>6) & 0x3f )| 0x80 ); + res += char((ui & 0x3f) | 0x80); + } + begin ++; + } + return true; + } + inline bool encode(const vector& sentence, string& res) + { + return encode(sentence.begin(), sentence.end(), res); } - return _pf_encode(begin, end, res); } } diff --git a/demo/example.sh b/demo/example.sh index b856869..2fae9dd 100755 --- a/demo/example.sh +++ b/demo/example.sh @@ -1,4 +1,3 @@ -make && \ -./segment_demo testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8 &&\ -./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM &&\ +./segment_demo testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 +./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM ./segment_demo testlines.utf8 --algorithm cutMix diff --git a/demo/keywordext_demo.cpp b/demo/keywordext_demo.cpp index 1d42500..cc4db47 100644 --- a/demo/keywordext_demo.cpp +++ b/demo/keywordext_demo.cpp @@ -38,30 +38,18 @@ int main(int argc, char ** argv) cout<<"usage: \n\t"<\n" <<"options:\n" <<"\t--dictpath\tIf not specified, the default is "<\n" <<"options:\n" - <<"\t--algorithm\tSupported encoding methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n" + <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n" <<"\t--dictpath\tIf not specified, the default is "<