From 4a11d95cf6a1af121089a304f4bb7cd0a16a0283 Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Wed, 18 Sep 2013 10:57:41 +0800
Subject: [PATCH] remove the gbk transcoding from TransCode.h and delete the
 static class TransCode, using namespace TransCode instead, and inline the
 functions in it, so remove the cpp and only use TransCode.h

---
 README.md                |  13 +--
 cppjieba/TransCode.cpp   | 181 ---------------------------------------
 cppjieba/TransCode.h     | 121 +++++++++++++++-----------
 demo/example.sh          |   5 +-
 demo/keywordext_demo.cpp |  12 ---
 demo/segment_demo.cpp    |  14 +--
 demo/testlines.gbk       |   3 -
 7 files changed, 84 insertions(+), 265 deletions(-)
 delete mode 100644 cppjieba/TransCode.cpp
 delete mode 100644 demo/testlines.gbk
diff --git a/README.md b/README.md
index 4dedf93..a27287b 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,12 @@
 #CppJiebaæ˜¯"ç»“å·´"ä¸­æ–‡åˆ†è¯çš„C++åº“
 
 ## ä¸­æ–‡ç¼–ç 
-* çŽ°åœ¨æ”¯æŒutf8,gbkç¼–ç çš„åˆ†è¯ã€‚é»˜è®¤ç¼–ç æ˜¯utf8ã€‚  
+
+çŽ°åœ¨æ”¯æŒutf8,gbkç¼–ç çš„åˆ†è¯ã€‚   
+
+- `master`åˆ†æ”¯æ”¯æŒutf8ç¼–ç    
+- `gbk`åˆ†æ”¯æ”¯æŒgbkç¼–ç 
+
 
 ## 模块详解
 
@@ -120,17 +125,15 @@ make ä¹‹åŽäº§ç”Ÿlibcppjieb.a
 usage:
         ./segment_demo[options] <filename>
 options:
-        --algorithm     Supported encoding methods are [cutDAG, cutHMM, cutMix] for now.
+        --algorithm     Supported methods are [cutDAG, cutHMM, cutMix] for now.
                         If not specified, the default is cutDAG
         --dictpath      If not specified, the default is ../dicts/jieba.dict.utf8
         --modelpath     If not specified, the default is ../dicts/hmm_model.utf8
-        --encoding      Supported encoding methods are [gbk, utf-8] for now.
                         If not specified, the default is utf8.
 example:
-        ./segment_demo testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8
+        ./segment_demo testlines.utf8 --dictpath ../dicts/jieba.dict.utf8
         ./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM
         ./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix
-        ./segment_demo testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk
 
 ```
 
diff --git a/cppjieba/TransCode.cpp b/cppjieba/TransCode.cpp
deleted file mode 100644
index d6e9bd2..0000000
--- a/cppjieba/TransCode.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-#include "TransCode.h"
-
-namespace CppJieba
-{
-    vector<string> TransCode::_encVec;
-    bool TransCode::_isInitted = TransCode::init();
-    TransCode::pf_decode_t TransCode::_pf_decode = NULL;
-    TransCode::pf_encode_t TransCode::_pf_encode = NULL;
-
-    bool TransCode::init()
-    {
-        _pf_decode = gbkToVec;
-        _pf_encode = vecToGbk;
-        return true;
-    }
-
-    TransCode::TransCode()
-    {
-        
-    }
-    TransCode::~TransCode()
-    {
-    }
-
-    void TransCode::setGbkEnc()
-    {
-        _pf_decode = gbkToVec;
-        _pf_encode = vecToGbk;
-    }
-
-    void TransCode::setUtf8Enc()
-    {
-        _pf_decode = utf8ToVec;
-        _pf_encode = vecToUtf8;
-    }
-    
-
-    bool TransCode::utf8ToVec(const string& str, vector<uint16_t>& vec)
-    {
-        char ch1, ch2;
-        if(str.empty())
-        {
-            return false;
-        }
-        vec.clear();
-        size_t siz = str.size();
-        for(uint i = 0;i < siz;)
-        {
-            if(!(str[i] & 0x80)) // 0xxxxxxx
-            {
-                vec.push_back(str[i]);
-                i++;
-            }
-            else if ((unsigned char)str[i] <= 0xdf && i + 1 < siz) // 110xxxxxx
-            {
-                ch1 = (str[i] >> 2) & 0x07;
-                ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
-                vec.push_back(twocharToUint16(ch1, ch2));
-                i += 2;
-            }
-            else if((unsigned char)str[i] <= 0xef && i + 2 < siz)
-            {
-                ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
-                ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f); 
-                vec.push_back(twocharToUint16(ch1, ch2));
-                i += 3;
-            }
-            else
-            {
-                return false;
-            }
-        }
-        return true;
-    }
-
-    bool TransCode::gbkToVec(const string& str, vector<uint16_t>& vec)
-    {
-        vec.clear();
-        if(str.empty())
-        {
-            return false;
-        }
-        uint i = 0;
-        while(i < str.size())
-        {
-            if(0 == (str[i] & 0x80))
-            {
-                vec.push_back(uint16_t(str[i]));
-                i++;
-            }
-            else
-            {
-                if(i + 1 < str.size()) //&& (str[i+1] & 0x80))
-                {
-                    vec.push_back(twocharToUint16(str[i], str[i + 1]));
-                    i += 2;
-                }
-                else
-                {
-                    return false;
-                }
-            }
-        }
-        return true;
-    }
-    
-
-    bool TransCode::vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
-    {
-        if(begin >= end)
-        {
-            return false;
-        }
-        res.clear();
-        uint16_t ui;
-        while(begin != end)
-        {
-            ui = *begin;
-            if(ui <= 0x7f)
-            {
-                res += char(ui);
-            }
-            else if(ui <= 0x7ff)
-            {
-                res += char(((ui>>6) & 0x1f) | 0xc0);
-                res += char((ui & 0x3f) | 0x80);
-            }
-            else
-            {
-                res += char(((ui >> 12) & 0x0f )| 0xe0);
-                res += char(((ui>>6) & 0x3f )| 0x80 );
-                res += char((ui & 0x3f) | 0x80);
-            }
-            begin ++;
-        }
-        return true;
-    }
-
-    bool TransCode::vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
-    {
-        if(begin >= end)
-        {
-            return false;
-        }
-        res.clear();
-        pair<char, char> pa;
-        while(begin != end)
-        {
-            pa = uint16ToChar2(*begin);
-            if(pa.first & 0x80)
-            {
-                res += pa.first;
-                res += pa.second;
-            }
-            else
-            {
-                res += pa.second;
-            }
-            begin++;
-        }
-        return true;
-    }
-}
-
-
-#ifdef CPPJIEBA_TRANSCODE_UT
-using namespace CPPCOMMON;
-using namespace CppJieba;
-int main()
-{
-    string a("abdä½ å¥½ä¸–ç•Œ!a");
-    vector<uint16_t> vec;
-    //TransCode::setUtf8Enc();
-    cout<<TransCode::decode(a, vec)<<endl;
-    PRINT_VECTOR(vec);
-
-    cout<<TransCode::encode(vec.begin(), vec.end())<<endl;
-    
-    return 0;
-}
-#endif
diff --git a/cppjieba/TransCode.h b/cppjieba/TransCode.h
index ebae6f3..bf6915e 100644
--- a/cppjieba/TransCode.h
+++ b/cppjieba/TransCode.h
@@ -1,7 +1,7 @@
 /************************************
  * file enc : utf-8
  * author   : wuyanyi09@gmail.com
-************************************/
+ ************************************/
 #ifndef CPPJIEBA_TRANSCODE_H
 #define CPPJIEBA_TRANSCODE_H
 
@@ -12,60 +12,83 @@
 
 namespace CppJieba
 {
+
     using namespace CPPCOMMON;
-    class TransCode
+    namespace TransCode
     {
-        public:
-            typedef bool (*pf_decode_t)(const string&, vector<uint16_t>&);
-            typedef bool (*pf_encode_t)(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);
-        private:
-            static vector<string> _encVec;
-            static bool _isInitted;
-            static pf_decode_t _pf_decode;
-            static pf_encode_t _pf_encode;
-            
-        public:
-            static void setGbkEnc();
-            static void setUtf8Enc();
-            
-        private:
-            TransCode();
-            ~TransCode();
-        public:
-            static bool init();
-        public:
-            static inline bool decode(const string& str, vector<uint16_t>& vec);
-            static inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);
-            static inline bool encode(const Unicode& sentence, string& res);
-
-        public:
-            static bool gbkToVec(const string& str, vector<uint16_t>& vec);
-            static bool vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);
-        public:
-            static bool utf8ToVec(const string& str, vector<uint16_t>& vec);
-            static bool vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);
-    };
-
-    inline bool TransCode::decode(const string& str, vector<uint16_t>& vec)
-    {
-        if(NULL == _pf_decode)
+        inline bool decode(const string& str, vector<uint16_t>& vec) // Decode UTF-8 `str` into 16-bit values stored in `vec`; returns false for empty or undecodable input.
         {
-            return false;
+            char ch1, ch2; // high byte / low byte of the 16-bit value being assembled
+            if(str.empty())
+            {
+                return false; // empty input is reported as failure; vec is not cleared on this path
+            }
+            vec.clear();
+            size_t siz = str.size();
+            for(uint i = 0;i < siz;) // i is advanced inside each branch by the length of the sequence consumed
+            {
+                if(!(str[i] & 0x80)) // 0xxxxxxx: single-byte (ASCII) character, stored as-is
+                {
+                    vec.push_back(str[i]);
+                    i++;
+                }
+                else if ((unsigned char)str[i] <= 0xdf && i + 1 < siz) // 110xxxxx 10xxxxxx: two-byte sequence. NOTE(review): there is no lower bound, so stray 0x80-0xbf bytes also take this branch
+                {
+                    ch1 = (str[i] >> 2) & 0x07; // top 3 bits of the 11-bit value
+                    ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 ); // low 8 bits of the 11-bit value
+                    vec.push_back(twocharToUint16(ch1, ch2)); // twocharToUint16 is defined elsewhere; presumably packs (high, low) into one uint16_t - TODO confirm
+                    i += 2;
+                }
+                else if((unsigned char)str[i] <= 0xef && i + 2 < siz) // 1110xxxx 10xxxxxx 10xxxxxx: three-byte sequence
+                {
+                    ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f ); // top 8 bits of the 16-bit value
+                    ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f); // low 8 bits of the 16-bit value
+                    vec.push_back(twocharToUint16(ch1, ch2));
+                    i += 3;
+                }
+                else // lead byte above 0xef, or not enough bytes left in str to complete the sequence
+                {
+                    return false; // vec keeps whatever was decoded before the failure
+                }
+            }
+            return true;
         }
-        return _pf_decode(str, vec);
-    }
-    inline bool TransCode::encode(const Unicode& sentence, string& res)
-    {
-        return encode(sentence.begin(), sentence.end(), res);
-    }
-    
-    inline bool TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
-    {
-        if(!_pf_encode)
+
+
+        inline bool encode(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res) // Encode the 16-bit values in [begin, end) as UTF-8 into `res`; returns false if the range is empty.
         {
-            return false;
+            if(begin >= end)
+            {
+                return false; // empty (or inverted) range is reported as failure; res is left unmodified
+            }
+            res.clear();
+            uint16_t ui;
+            while(begin != end)
+            {
+                ui = *begin;
+                if(ui <= 0x7f) // fits in 7 bits: one byte 0xxxxxxx
+                {
+                    res += char(ui);
+                }
+                else if(ui <= 0x7ff) // fits in 11 bits: two bytes 110xxxxx 10xxxxxx
+                {
+                    res += char(((ui>>6) & 0x1f) | 0xc0);
+                    res += char((ui & 0x3f) | 0x80);
+                }
+                else // any larger 16-bit value: three bytes 1110xxxx 10xxxxxx 10xxxxxx
+                {
+                    res += char(((ui >> 12) & 0x0f )| 0xe0);
+                    res += char(((ui>>6) & 0x3f )| 0x80 );
+                    res += char((ui & 0x3f) | 0x80);
+                }
+                begin ++;
+            }
+            return true;
+        }
+        inline bool encode(const vector<uint16_t>& sentence, string& res) // Convenience overload: encodes the whole vector by forwarding to the iterator-range encode().
+        {
+            return encode(sentence.begin(), sentence.end(), res); // result (including false for an empty vector) comes straight from the range overload
         }
-        return _pf_encode(begin, end, res);
     }
 }
 
diff --git a/demo/example.sh b/demo/example.sh
index b856869..2fae9dd 100755
--- a/demo/example.sh
+++ b/demo/example.sh
@@ -1,4 +1,3 @@
-make && \
-./segment_demo testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8 &&\
-./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM &&\
+./segment_demo testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 
+./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM 
 ./segment_demo testlines.utf8 --algorithm cutMix
diff --git a/demo/keywordext_demo.cpp b/demo/keywordext_demo.cpp
index 1d42500..cc4db47 100644
--- a/demo/keywordext_demo.cpp
+++ b/demo/keywordext_demo.cpp
@@ -38,30 +38,18 @@ int main(int argc, char ** argv)
         cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
             <<"options:\n"
             <<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<"\n"
-            <<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf not specified, the default is utf-8.\n"
             <<"examples:\n"
             <<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
-            <<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
             <<endl;
         return -1;
     }
 
     ArgvContext arg(argc, argv);
     string dictPath = arg["--dictpath"];
-    string encoding = arg["--encoding"];
     if("" == dictPath)
     {
         dictPath = DEFAULT_DICTPATH;
     }
-    if("gbk" == encoding)
-    {
-        TransCode::setGbkEnc();
-    }
-    else
-    {
-        TransCode::setUtf8Enc();
-    }
-    
     testKeyWordExt(dictPath.c_str(), arg[1].c_str());
     return 0;
 }
diff --git a/demo/segment_demo.cpp b/demo/segment_demo.cpp
index 1bd660c..196df15 100644
--- a/demo/segment_demo.cpp
+++ b/demo/segment_demo.cpp
@@ -101,12 +101,11 @@ int main(int argc, char ** argv)
     {
         cout<<"usage: \n\t"<<argv[0]<<"[options] <filename>\n"
             <<"options:\n"
-            <<"\t--algorithm\tSupported encoding methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
+            <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
             <<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<'\n'
             <<"\t--modelpath\tIf not specified, the default is "<<DEFAULT_MODELPATH<<'\n'
-            <<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf not specified, the default is utf8.\n"
             <<"example:\n"
-            <<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
+            <<"\t"<<argv[0]<<" testlines.utf8 --dictpath ../dicts/jieba.dict.utf8\n"
             <<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
             <<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
             <<endl;
@@ -116,7 +115,6 @@ int main(int argc, char ** argv)
     ArgvContext arg(argc, argv);
     string dictPath = arg["--dictpath"];
     string modelPath = arg["--modelpath"];
-    string encoding = arg["--encoding"];
     string algorithm = arg["--algorithm"];
     if(dictPath.empty())
     {
@@ -126,14 +124,6 @@ int main(int argc, char ** argv)
     {
         modelPath = DEFAULT_MODELPATH;
     }
-    if("gbk" == encoding)
-    {
-        TransCode::setGbkEnc();
-    }
-    else
-    {
-        TransCode::setUtf8Enc();
-    }
 
     if(!init(dictPath.c_str(), modelPath.c_str()))
     {
diff --git a/demo/testlines.gbk b/demo/testlines.gbk
deleted file mode 100644
index 119a3c2..0000000
--- a/demo/testlines.gbk
+++ /dev/null
@@ -1,3 +0,0 @@
-ÎÒÀ´µ½±±¾©Çå»ª´óÑ§
-ËûÀ´µ½ÁËÍøÒ×º¼ÑÐ´óÏÃ
-Ð¡Ã÷Ë¶Ê¿±ÏÒµÓÚÖÐ¹ú¿ÆÑ§Ôº¼ÆËãËù£¬ºóÔÚÈÕ±¾¾©¶¼´óÑ§ÉîÔì