mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Remove the GBK transcoding from TransCode.h and delete the static class TransCode, using a namespace TransCode instead with its functions inlined; this removes the .cpp file, so only TransCode.h is used.
This commit is contained in:
parent
538c0119ba
commit
4a11d95cf6
13
README.md
13
README.md
@ -1,7 +1,12 @@
|
|||||||
#CppJieba是"结巴"中文分词的C++库
|
#CppJieba是"结巴"中文分词的C++库
|
||||||
|
|
||||||
## 中文编码
|
## 中文编码
|
||||||
* 现在支持utf8,gbk编码的分词。默认编码是utf8。
|
|
||||||
|
现在支持utf8,gbk编码的分词。
|
||||||
|
|
||||||
|
- `master`分支支持utf8编码
|
||||||
|
- `gbk`分支支持gbk编码
|
||||||
|
|
||||||
|
|
||||||
## 模块详解
|
## 模块详解
|
||||||
|
|
||||||
@ -120,17 +125,15 @@ make 之后产生libcppjieb.a
|
|||||||
usage:
|
usage:
|
||||||
./segment_demo[options] <filename>
|
./segment_demo[options] <filename>
|
||||||
options:
|
options:
|
||||||
--algorithm Supported encoding methods are [cutDAG, cutHMM, cutMix] for now.
|
--algorithm Supported methods are [cutDAG, cutHMM, cutMix] for now.
|
||||||
If not specified, the default is cutDAG
|
If not specified, the default is cutDAG
|
||||||
--dictpath If not specified, the default is ../dicts/jieba.dict.utf8
|
--dictpath If not specified, the default is ../dicts/jieba.dict.utf8
|
||||||
--modelpath If not specified, the default is ../dicts/hmm_model.utf8
|
--modelpath If not specified, the default is ../dicts/hmm_model.utf8
|
||||||
--encoding Supported encoding methods are [gbk, utf-8] for now.
|
|
||||||
If not specified, the default is utf8.
|
If not specified, the default is utf8.
|
||||||
example:
|
example:
|
||||||
./segment_demo testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8
|
./segment_demo testlines.utf8 --dictpath ../dicts/jieba.dict.utf8
|
||||||
./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM
|
./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM
|
||||||
./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix
|
./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix
|
||||||
./segment_demo testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk
|
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -1,181 +0,0 @@
|
|||||||
#include "TransCode.h"
|
|
||||||
|
|
||||||
namespace CppJieba
|
|
||||||
{
|
|
||||||
vector<string> TransCode::_encVec;
|
|
||||||
bool TransCode::_isInitted = TransCode::init();
|
|
||||||
TransCode::pf_decode_t TransCode::_pf_decode = NULL;
|
|
||||||
TransCode::pf_encode_t TransCode::_pf_encode = NULL;
|
|
||||||
|
|
||||||
bool TransCode::init()
|
|
||||||
{
|
|
||||||
_pf_decode = gbkToVec;
|
|
||||||
_pf_encode = vecToGbk;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
TransCode::TransCode()
|
|
||||||
{
|
|
||||||
|
|
||||||
}
|
|
||||||
TransCode::~TransCode()
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
void TransCode::setGbkEnc()
|
|
||||||
{
|
|
||||||
_pf_decode = gbkToVec;
|
|
||||||
_pf_encode = vecToGbk;
|
|
||||||
}
|
|
||||||
|
|
||||||
void TransCode::setUtf8Enc()
|
|
||||||
{
|
|
||||||
_pf_decode = utf8ToVec;
|
|
||||||
_pf_encode = vecToUtf8;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
bool TransCode::utf8ToVec(const string& str, vector<uint16_t>& vec)
|
|
||||||
{
|
|
||||||
char ch1, ch2;
|
|
||||||
if(str.empty())
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
vec.clear();
|
|
||||||
size_t siz = str.size();
|
|
||||||
for(uint i = 0;i < siz;)
|
|
||||||
{
|
|
||||||
if(!(str[i] & 0x80)) // 0xxxxxxx
|
|
||||||
{
|
|
||||||
vec.push_back(str[i]);
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
else if ((unsigned char)str[i] <= 0xdf && i + 1 < siz) // 110xxxxxx
|
|
||||||
{
|
|
||||||
ch1 = (str[i] >> 2) & 0x07;
|
|
||||||
ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
|
|
||||||
vec.push_back(twocharToUint16(ch1, ch2));
|
|
||||||
i += 2;
|
|
||||||
}
|
|
||||||
else if((unsigned char)str[i] <= 0xef && i + 2 < siz)
|
|
||||||
{
|
|
||||||
ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
|
|
||||||
ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
|
|
||||||
vec.push_back(twocharToUint16(ch1, ch2));
|
|
||||||
i += 3;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool TransCode::gbkToVec(const string& str, vector<uint16_t>& vec)
|
|
||||||
{
|
|
||||||
vec.clear();
|
|
||||||
if(str.empty())
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
uint i = 0;
|
|
||||||
while(i < str.size())
|
|
||||||
{
|
|
||||||
if(0 == (str[i] & 0x80))
|
|
||||||
{
|
|
||||||
vec.push_back(uint16_t(str[i]));
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if(i + 1 < str.size()) //&& (str[i+1] & 0x80))
|
|
||||||
{
|
|
||||||
vec.push_back(twocharToUint16(str[i], str[i + 1]));
|
|
||||||
i += 2;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
bool TransCode::vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
|
|
||||||
{
|
|
||||||
if(begin >= end)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
res.clear();
|
|
||||||
uint16_t ui;
|
|
||||||
while(begin != end)
|
|
||||||
{
|
|
||||||
ui = *begin;
|
|
||||||
if(ui <= 0x7f)
|
|
||||||
{
|
|
||||||
res += char(ui);
|
|
||||||
}
|
|
||||||
else if(ui <= 0x7ff)
|
|
||||||
{
|
|
||||||
res += char(((ui>>6) & 0x1f) | 0xc0);
|
|
||||||
res += char((ui & 0x3f) | 0x80);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
res += char(((ui >> 12) & 0x0f )| 0xe0);
|
|
||||||
res += char(((ui>>6) & 0x3f )| 0x80 );
|
|
||||||
res += char((ui & 0x3f) | 0x80);
|
|
||||||
}
|
|
||||||
begin ++;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool TransCode::vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
|
|
||||||
{
|
|
||||||
if(begin >= end)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
res.clear();
|
|
||||||
pair<char, char> pa;
|
|
||||||
while(begin != end)
|
|
||||||
{
|
|
||||||
pa = uint16ToChar2(*begin);
|
|
||||||
if(pa.first & 0x80)
|
|
||||||
{
|
|
||||||
res += pa.first;
|
|
||||||
res += pa.second;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
res += pa.second;
|
|
||||||
}
|
|
||||||
begin++;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef CPPJIEBA_TRANSCODE_UT
|
|
||||||
using namespace CPPCOMMON;
|
|
||||||
using namespace CppJieba;
|
|
||||||
int main()
|
|
||||||
{
|
|
||||||
string a("abd你好世界!a");
|
|
||||||
vector<uint16_t> vec;
|
|
||||||
//TransCode::setUtf8Enc();
|
|
||||||
cout<<TransCode::decode(a, vec)<<endl;
|
|
||||||
PRINT_VECTOR(vec);
|
|
||||||
|
|
||||||
cout<<TransCode::encode(vec.begin(), vec.end())<<endl;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -12,60 +12,83 @@
|
|||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba
|
||||||
{
|
{
|
||||||
|
|
||||||
using namespace CPPCOMMON;
|
using namespace CPPCOMMON;
|
||||||
class TransCode
|
namespace TransCode
|
||||||
{
|
{
|
||||||
public:
|
inline bool decode(const string& str, vector<uint16_t>& vec)
|
||||||
typedef bool (*pf_decode_t)(const string&, vector<uint16_t>&);
|
|
||||||
typedef bool (*pf_encode_t)(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);
|
|
||||||
private:
|
|
||||||
static vector<string> _encVec;
|
|
||||||
static bool _isInitted;
|
|
||||||
static pf_decode_t _pf_decode;
|
|
||||||
static pf_encode_t _pf_encode;
|
|
||||||
|
|
||||||
public:
|
|
||||||
static void setGbkEnc();
|
|
||||||
static void setUtf8Enc();
|
|
||||||
|
|
||||||
private:
|
|
||||||
TransCode();
|
|
||||||
~TransCode();
|
|
||||||
public:
|
|
||||||
static bool init();
|
|
||||||
public:
|
|
||||||
static inline bool decode(const string& str, vector<uint16_t>& vec);
|
|
||||||
static inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);
|
|
||||||
static inline bool encode(const Unicode& sentence, string& res);
|
|
||||||
|
|
||||||
public:
|
|
||||||
static bool gbkToVec(const string& str, vector<uint16_t>& vec);
|
|
||||||
static bool vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);
|
|
||||||
public:
|
|
||||||
static bool utf8ToVec(const string& str, vector<uint16_t>& vec);
|
|
||||||
static bool vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);
|
|
||||||
};
|
|
||||||
|
|
||||||
inline bool TransCode::decode(const string& str, vector<uint16_t>& vec)
|
|
||||||
{
|
{
|
||||||
if(NULL == _pf_decode)
|
char ch1, ch2;
|
||||||
|
if(str.empty())
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return _pf_decode(str, vec);
|
vec.clear();
|
||||||
|
size_t siz = str.size();
|
||||||
|
for(uint i = 0;i < siz;)
|
||||||
|
{
|
||||||
|
if(!(str[i] & 0x80)) // 0xxxxxxx
|
||||||
|
{
|
||||||
|
vec.push_back(str[i]);
|
||||||
|
i++;
|
||||||
}
|
}
|
||||||
inline bool TransCode::encode(const Unicode& sentence, string& res)
|
else if ((unsigned char)str[i] <= 0xdf && i + 1 < siz) // 110xxxxxx
|
||||||
|
{
|
||||||
|
ch1 = (str[i] >> 2) & 0x07;
|
||||||
|
ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
|
||||||
|
vec.push_back(twocharToUint16(ch1, ch2));
|
||||||
|
i += 2;
|
||||||
|
}
|
||||||
|
else if((unsigned char)str[i] <= 0xef && i + 2 < siz)
|
||||||
|
{
|
||||||
|
ch1 = (str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
|
||||||
|
ch2 = ((str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
|
||||||
|
vec.push_back(twocharToUint16(ch1, ch2));
|
||||||
|
i += 3;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline bool encode(vector<uint16_t>::const_iterator begin, vector<uint16_t>::const_iterator end, string& res)
|
||||||
|
{
|
||||||
|
if(begin >= end)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
res.clear();
|
||||||
|
uint16_t ui;
|
||||||
|
while(begin != end)
|
||||||
|
{
|
||||||
|
ui = *begin;
|
||||||
|
if(ui <= 0x7f)
|
||||||
|
{
|
||||||
|
res += char(ui);
|
||||||
|
}
|
||||||
|
else if(ui <= 0x7ff)
|
||||||
|
{
|
||||||
|
res += char(((ui>>6) & 0x1f) | 0xc0);
|
||||||
|
res += char((ui & 0x3f) | 0x80);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
res += char(((ui >> 12) & 0x0f )| 0xe0);
|
||||||
|
res += char(((ui>>6) & 0x3f )| 0x80 );
|
||||||
|
res += char((ui & 0x3f) | 0x80);
|
||||||
|
}
|
||||||
|
begin ++;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
inline bool encode(const vector<uint16_t>& sentence, string& res)
|
||||||
{
|
{
|
||||||
return encode(sentence.begin(), sentence.end(), res);
|
return encode(sentence.begin(), sentence.end(), res);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
|
|
||||||
{
|
|
||||||
if(!_pf_encode)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return _pf_encode(begin, end, res);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
make && \
|
./segment_demo testlines.utf8 --dictpath ../dicts/jieba.dict.utf8
|
||||||
./segment_demo testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8 &&\
|
./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM
|
||||||
./segment_demo testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM &&\
|
|
||||||
./segment_demo testlines.utf8 --algorithm cutMix
|
./segment_demo testlines.utf8 --algorithm cutMix
|
||||||
|
@ -38,30 +38,18 @@ int main(int argc, char ** argv)
|
|||||||
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
|
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
|
||||||
<<"options:\n"
|
<<"options:\n"
|
||||||
<<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<"\n"
|
<<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<"\n"
|
||||||
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf not specified, the default is utf-8.\n"
|
|
||||||
<<"examples:\n"
|
<<"examples:\n"
|
||||||
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
|
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
|
||||||
<<"\t"<<argv[0]<<" testlines.gbk --encoding gbk --dictpath ../dicts/jieba.dict.gbk\n"
|
|
||||||
<<endl;
|
<<endl;
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
ArgvContext arg(argc, argv);
|
ArgvContext arg(argc, argv);
|
||||||
string dictPath = arg["--dictpath"];
|
string dictPath = arg["--dictpath"];
|
||||||
string encoding = arg["--encoding"];
|
|
||||||
if("" == dictPath)
|
if("" == dictPath)
|
||||||
{
|
{
|
||||||
dictPath = DEFAULT_DICTPATH;
|
dictPath = DEFAULT_DICTPATH;
|
||||||
}
|
}
|
||||||
if("gbk" == encoding)
|
|
||||||
{
|
|
||||||
TransCode::setGbkEnc();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
TransCode::setUtf8Enc();
|
|
||||||
}
|
|
||||||
|
|
||||||
testKeyWordExt(dictPath.c_str(), arg[1].c_str());
|
testKeyWordExt(dictPath.c_str(), arg[1].c_str());
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -101,12 +101,11 @@ int main(int argc, char ** argv)
|
|||||||
{
|
{
|
||||||
cout<<"usage: \n\t"<<argv[0]<<"[options] <filename>\n"
|
cout<<"usage: \n\t"<<argv[0]<<"[options] <filename>\n"
|
||||||
<<"options:\n"
|
<<"options:\n"
|
||||||
<<"\t--algorithm\tSupported encoding methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
|
<<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n"
|
||||||
<<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<'\n'
|
<<"\t--dictpath\tIf not specified, the default is "<<DEFAULT_DICTPATH<<'\n'
|
||||||
<<"\t--modelpath\tIf not specified, the default is "<<DEFAULT_MODELPATH<<'\n'
|
<<"\t--modelpath\tIf not specified, the default is "<<DEFAULT_MODELPATH<<'\n'
|
||||||
<<"\t--encoding\tSupported encoding methods are [gbk, utf-8] for now. \n\t\t\tIf not specified, the default is utf8.\n"
|
|
||||||
<<"example:\n"
|
<<"example:\n"
|
||||||
<<"\t"<<argv[0]<<" testlines.utf8 --encoding utf-8 --dictpath ../dicts/jieba.dict.utf8\n"
|
<<"\t"<<argv[0]<<" testlines.utf8 --dictpath ../dicts/jieba.dict.utf8\n"
|
||||||
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
|
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
|
||||||
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
|
<<"\t"<<argv[0]<<" testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
|
||||||
<<endl;
|
<<endl;
|
||||||
@ -116,7 +115,6 @@ int main(int argc, char ** argv)
|
|||||||
ArgvContext arg(argc, argv);
|
ArgvContext arg(argc, argv);
|
||||||
string dictPath = arg["--dictpath"];
|
string dictPath = arg["--dictpath"];
|
||||||
string modelPath = arg["--modelpath"];
|
string modelPath = arg["--modelpath"];
|
||||||
string encoding = arg["--encoding"];
|
|
||||||
string algorithm = arg["--algorithm"];
|
string algorithm = arg["--algorithm"];
|
||||||
if(dictPath.empty())
|
if(dictPath.empty())
|
||||||
{
|
{
|
||||||
@ -126,14 +124,6 @@ int main(int argc, char ** argv)
|
|||||||
{
|
{
|
||||||
modelPath = DEFAULT_MODELPATH;
|
modelPath = DEFAULT_MODELPATH;
|
||||||
}
|
}
|
||||||
if("gbk" == encoding)
|
|
||||||
{
|
|
||||||
TransCode::setGbkEnc();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
TransCode::setUtf8Enc();
|
|
||||||
}
|
|
||||||
|
|
||||||
if(!init(dictPath.c_str(), modelPath.c_str()))
|
if(!init(dictPath.c_str(), modelPath.c_str()))
|
||||||
{
|
{
|
||||||
|
@ -1,3 +0,0 @@
|
|||||||
我来到北京清华大学
|
|
||||||
他来到了网易杭研大厦
|
|
||||||
小明硕士毕业于中国科学院计算所,后在日本京都大学深造
|
|
Loading…
x
Reference in New Issue
Block a user