mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Rewrite TransCode to reduce the time consumed, by moving some functions' return values into output arguments of those functions
This commit is contained in:
parent
d5b9bb21dd
commit
aefbbdf49d
@ -149,9 +149,13 @@ namespace CppJieba
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
res.clear();
|
res.clear();
|
||||||
|
string tmp;
|
||||||
for(uint i = 0; i < words.size(); i++)
|
for(uint i = 0; i < words.size(); i++)
|
||||||
{
|
{
|
||||||
res.push_back(TransCode::encode(words[i].begin(), words[i].end()));
|
if(TransCode::encode(words[i], tmp))
|
||||||
|
{
|
||||||
|
res.push_back(tmp);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -27,30 +27,6 @@ namespace CppJieba
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
//bool KeyWordExt::_loadPriorSubWords(const char * const filePath)
|
|
||||||
//{
|
|
||||||
// LogInfo(string_format("_loadPriorSubWords(%s) start", filePath));
|
|
||||||
// if(!checkFileExist(filePath))
|
|
||||||
// {
|
|
||||||
// LogError(string_format("cann't find file[%s].",filePath));
|
|
||||||
// return false;
|
|
||||||
// }
|
|
||||||
// if(!_priorSubWords.empty())
|
|
||||||
// {
|
|
||||||
// LogError("_priorSubWords has been initted before");
|
|
||||||
// return false;
|
|
||||||
// }
|
|
||||||
// ifstream infile(filePath);
|
|
||||||
// string subword;
|
|
||||||
// while(getline(infile, subword))
|
|
||||||
// {
|
|
||||||
// _priorSubWords.push_back(subword);
|
|
||||||
// }
|
|
||||||
// LogInfo(string_format("_loadPriorSubWords(%s) end", filePath));
|
|
||||||
// infile.close();
|
|
||||||
// return true;
|
|
||||||
//}
|
|
||||||
|
|
||||||
bool KeyWordExt::loadStopWords(const char * const filePath)
|
bool KeyWordExt::loadStopWords(const char * const filePath)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
@ -44,9 +44,17 @@ namespace CppJieba
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
res.clear();
|
res.clear();
|
||||||
|
string tmp;
|
||||||
for(uint i = 0; i < segWordInfos.size(); i++)
|
for(uint i = 0; i < segWordInfos.size(); i++)
|
||||||
{
|
{
|
||||||
res.push_back(TransCode::encode(segWordInfos[i].word.begin(), segWordInfos[i].word.end()));
|
if(TransCode::encode(segWordInfos[i].word, tmp))
|
||||||
|
{
|
||||||
|
res.push_back(tmp);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
LogError("encode failed.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -43,6 +43,7 @@ namespace CppJieba
|
|||||||
res.clear();
|
res.clear();
|
||||||
Unicode unico;
|
Unicode unico;
|
||||||
vector<Unicode> hmmRes;
|
vector<Unicode> hmmRes;
|
||||||
|
string tmp;
|
||||||
for(uint i= 0; i < infos.size(); i++)
|
for(uint i= 0; i < infos.size(); i++)
|
||||||
{
|
{
|
||||||
if(1 == infos[i].word.size())
|
if(1 == infos[i].word.size())
|
||||||
@ -60,12 +61,14 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
for(uint j = 0; j < hmmRes.size(); j++)
|
for(uint j = 0; j < hmmRes.size(); j++)
|
||||||
{
|
{
|
||||||
res.push_back(TransCode::encode(hmmRes[j]));
|
TransCode::encode(hmmRes[j], tmp);
|
||||||
|
res.push_back(tmp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
unico.clear();
|
unico.clear();
|
||||||
|
|
||||||
res.push_back(TransCode::encode(infos[i].word));
|
TransCode::encode(infos[i].word, tmp);
|
||||||
|
res.push_back(tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -78,7 +81,8 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
for(uint j = 0; j < hmmRes.size(); j++)
|
for(uint j = 0; j < hmmRes.size(); j++)
|
||||||
{
|
{
|
||||||
res.push_back(TransCode::encode(hmmRes[j]));
|
TransCode::encode(hmmRes[j], tmp);
|
||||||
|
res.push_back(tmp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -34,14 +34,6 @@ namespace CppJieba
|
|||||||
_pf_encode = vecToUtf8;
|
_pf_encode = vecToUtf8;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool TransCode::decode(const string& str, vector<uint16_t>& vec)
|
|
||||||
{
|
|
||||||
if(NULL == _pf_decode)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return _pf_decode(str, vec);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool TransCode::utf8ToVec(const string& str, vector<uint16_t>& vec)
|
bool TransCode::utf8ToVec(const string& str, vector<uint16_t>& vec)
|
||||||
{
|
{
|
||||||
@ -112,27 +104,14 @@ namespace CppJieba
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
string TransCode::encode(const Unicode& sentence)
|
|
||||||
{
|
|
||||||
return encode(sentence.begin(), sentence.end());
|
|
||||||
}
|
|
||||||
|
|
||||||
string TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end)
|
bool TransCode::vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
|
||||||
{
|
|
||||||
if(!_pf_encode)
|
|
||||||
{
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
return _pf_encode(begin, end);
|
|
||||||
}
|
|
||||||
|
|
||||||
string TransCode::vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end)
|
|
||||||
{
|
{
|
||||||
if(begin >= end)
|
if(begin >= end)
|
||||||
{
|
{
|
||||||
return "";
|
return false;
|
||||||
}
|
}
|
||||||
string res;
|
res.clear();
|
||||||
uint16_t ui;
|
uint16_t ui;
|
||||||
while(begin != end)
|
while(begin != end)
|
||||||
{
|
{
|
||||||
@ -154,17 +133,17 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
begin ++;
|
begin ++;
|
||||||
}
|
}
|
||||||
return res;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
string TransCode::vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end)
|
bool TransCode::vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
|
||||||
{
|
{
|
||||||
if(begin >= end)
|
if(begin >= end)
|
||||||
{
|
{
|
||||||
return "";
|
return false;
|
||||||
}
|
}
|
||||||
|
res.clear();
|
||||||
pair<char, char> pa;
|
pair<char, char> pa;
|
||||||
string res;
|
|
||||||
while(begin != end)
|
while(begin != end)
|
||||||
{
|
{
|
||||||
pa = uint16ToChar2(*begin);
|
pa = uint16ToChar2(*begin);
|
||||||
@ -179,21 +158,8 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
begin++;
|
begin++;
|
||||||
}
|
}
|
||||||
return res;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
//size_t TransCode::getWordLength(const string& str)
|
|
||||||
//{
|
|
||||||
// vector<uint16_t> vec;
|
|
||||||
// if(!decode(str, vec))
|
|
||||||
// {
|
|
||||||
// return 0;
|
|
||||||
// }
|
|
||||||
// else
|
|
||||||
// {
|
|
||||||
// return vec.size();
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -202,27 +168,6 @@ using namespace CPPCOMMON;
|
|||||||
using namespace CppJieba;
|
using namespace CppJieba;
|
||||||
int main()
|
int main()
|
||||||
{
|
{
|
||||||
//ifstream ifile("/home/wuyanyi/code/SevKeyword/log.2.txt");
|
|
||||||
//string line;
|
|
||||||
//Unicode vec;
|
|
||||||
//while(getline(ifile, line))
|
|
||||||
//{
|
|
||||||
//
|
|
||||||
// cout<<line<<endl;
|
|
||||||
// cout<<line.size()<<endl;
|
|
||||||
// if(!TransCode::decode(line, vec))
|
|
||||||
// {
|
|
||||||
// cout<<"error"<<endl;
|
|
||||||
// }
|
|
||||||
// PRINT_VECTOR(vec);
|
|
||||||
// cout<<TransCode::encode(vec)<<endl;
|
|
||||||
//}
|
|
||||||
//ifile.close();
|
|
||||||
//typedef bool (* pf)(const string& , vector<uint16_t>&);
|
|
||||||
//pf tmp = TransCode::a;
|
|
||||||
//vector<uint16_t> vec;
|
|
||||||
//tmp("1",vec);
|
|
||||||
|
|
||||||
string a("abd你好世界!a");
|
string a("abd你好世界!a");
|
||||||
vector<uint16_t> vec;
|
vector<uint16_t> vec;
|
||||||
//TransCode::setUtf8Enc();
|
//TransCode::setUtf8Enc();
|
||||||
|
@ -17,38 +17,56 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
typedef bool (*pf_decode_t)(const string&, vector<uint16_t>&);
|
typedef bool (*pf_decode_t)(const string&, vector<uint16_t>&);
|
||||||
typedef string (*pf_encode_t)(Unicode::const_iterator begin, Unicode::const_iterator end);
|
typedef bool (*pf_encode_t)(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);
|
||||||
typedef size_t (*pf_getWordLength_t)(const string& str);
|
|
||||||
private:
|
private:
|
||||||
static vector<string> _encVec;
|
static vector<string> _encVec;
|
||||||
static bool _isInitted;
|
static bool _isInitted;
|
||||||
static pf_decode_t _pf_decode;
|
static pf_decode_t _pf_decode;
|
||||||
static pf_encode_t _pf_encode;
|
static pf_encode_t _pf_encode;
|
||||||
static pf_getWordLength_t _pf_getWordLength;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
static void setGbkEnc();
|
static void setGbkEnc();
|
||||||
static void setUtf8Enc();
|
static void setUtf8Enc();
|
||||||
|
|
||||||
public:
|
private:
|
||||||
TransCode();
|
TransCode();
|
||||||
~TransCode();
|
~TransCode();
|
||||||
public:
|
public:
|
||||||
static bool init();
|
static bool init();
|
||||||
public:
|
public:
|
||||||
static bool decode(const string& str, vector<uint16_t>& vec);
|
static inline bool decode(const string& str, vector<uint16_t>& vec);
|
||||||
static string encode(Unicode::const_iterator begin, Unicode::const_iterator end);
|
static inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);
|
||||||
static string encode(const Unicode& sentence);
|
static inline bool encode(const Unicode& sentence, string& res);
|
||||||
//static size_t getWordLength(const string& str);
|
|
||||||
public:
|
public:
|
||||||
static bool gbkToVec(const string& str, vector<uint16_t>& vec);
|
static bool gbkToVec(const string& str, vector<uint16_t>& vec);
|
||||||
static string vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end);
|
static bool vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);
|
||||||
//static size_t getGbkLength(const string& str);
|
|
||||||
public:
|
public:
|
||||||
static bool utf8ToVec(const string& str, vector<uint16_t>& vec);
|
static bool utf8ToVec(const string& str, vector<uint16_t>& vec);
|
||||||
static string vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end);
|
static bool vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end, string& res);
|
||||||
//static size_t getUtf8Length(const string& str);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
inline bool TransCode::decode(const string& str, vector<uint16_t>& vec)
|
||||||
|
{
|
||||||
|
if(NULL == _pf_decode)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return _pf_decode(str, vec);
|
||||||
|
}
|
||||||
|
inline bool TransCode::encode(const Unicode& sentence, string& res)
|
||||||
|
{
|
||||||
|
return encode(sentence.begin(), sentence.end(), res);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
|
||||||
|
{
|
||||||
|
if(!_pf_encode)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return _pf_encode(begin, end, res);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -82,9 +82,11 @@ namespace CppJieba
|
|||||||
KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo)
|
KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
string toString() const
|
inline string toString() const
|
||||||
{
|
{
|
||||||
return string_format("{word:%s,weight:%lf, idf:%lf}", TransCode::encode(word.begin(), word.end()).c_str(), weight, idf);
|
string tmp;
|
||||||
|
TransCode::encode(word, tmp);
|
||||||
|
return string_format("{word:%s,weight:%lf, idf:%lf}", tmp.c_str(), weight, idf);
|
||||||
}
|
}
|
||||||
KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo)
|
KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo)
|
||||||
{
|
{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user