diff --git a/src/HMMSegment.cpp b/src/HMMSegment.cpp index 04f813a..dcede07 100644 --- a/src/HMMSegment.cpp +++ b/src/HMMSegment.cpp @@ -149,9 +149,13 @@ namespace CppJieba return false; } res.clear(); + string tmp; for(uint i = 0; i < words.size(); i++) { - res.push_back(TransCode::encode(words[i].begin(), words[i].end())); + if(TransCode::encode(words[i], tmp)) + { + res.push_back(tmp); + } } return true; } diff --git a/src/KeyWordExt.cpp b/src/KeyWordExt.cpp index 1b7c2b1..979f9e1 100644 --- a/src/KeyWordExt.cpp +++ b/src/KeyWordExt.cpp @@ -27,30 +27,6 @@ namespace CppJieba return true; } - //bool KeyWordExt::_loadPriorSubWords(const char * const filePath) - //{ - // LogInfo(string_format("_loadPriorSubWords(%s) start", filePath)); - // if(!checkFileExist(filePath)) - // { - // LogError(string_format("cann't find file[%s].",filePath)); - // return false; - // } - // if(!_priorSubWords.empty()) - // { - // LogError("_priorSubWords has been initted before"); - // return false; - // } - // ifstream infile(filePath); - // string subword; - // while(getline(infile, subword)) - // { - // _priorSubWords.push_back(subword); - // } - // LogInfo(string_format("_loadPriorSubWords(%s) end", filePath)); - // infile.close(); - // return true; - //} - bool KeyWordExt::loadStopWords(const char * const filePath) { diff --git a/src/MPSegment.cpp b/src/MPSegment.cpp index 643bd8b..4db524b 100644 --- a/src/MPSegment.cpp +++ b/src/MPSegment.cpp @@ -44,9 +44,17 @@ namespace CppJieba return false; } res.clear(); + string tmp; for(uint i = 0; i < segWordInfos.size(); i++) { - res.push_back(TransCode::encode(segWordInfos[i].word.begin(), segWordInfos[i].word.end())); + if(TransCode::encode(segWordInfos[i].word, tmp)) + { + res.push_back(tmp); + } + else + { + LogError("encode failed."); + } } return true; } diff --git a/src/MixSegment.cpp b/src/MixSegment.cpp index 39c54af..d9a763e 100644 --- a/src/MixSegment.cpp +++ b/src/MixSegment.cpp @@ -43,6 +43,7 @@ namespace CppJieba res.clear(); Unicode unico; vector hmmRes; + string tmp; for(uint i= 0; i < infos.size(); i++) { if(1 == infos[i].word.size()) @@ -60,12 +61,14 @@ namespace CppJieba } for(uint j = 0; j < hmmRes.size(); j++) { - res.push_back(TransCode::encode(hmmRes[j])); + TransCode::encode(hmmRes[j], tmp); + res.push_back(tmp); } } unico.clear(); - res.push_back(TransCode::encode(infos[i].word)); + TransCode::encode(infos[i].word, tmp); + res.push_back(tmp); } } @@ -78,7 +81,8 @@ namespace CppJieba } for(uint j = 0; j < hmmRes.size(); j++) { - res.push_back(TransCode::encode(hmmRes[j])); + TransCode::encode(hmmRes[j], tmp); + res.push_back(tmp); } } diff --git a/src/TransCode.cpp b/src/TransCode.cpp index 80ab080..8ced090 100644 --- a/src/TransCode.cpp +++ b/src/TransCode.cpp @@ -34,14 +34,6 @@ namespace CppJieba _pf_encode = vecToUtf8; } - bool TransCode::decode(const string& str, vector& vec) - { - if(NULL == _pf_decode) - { - return false; - } - return _pf_decode(str, vec); - } bool TransCode::utf8ToVec(const string& str, vector& vec) { @@ -112,27 +104,14 @@ namespace CppJieba return true; } - string TransCode::encode(const Unicode& sentence) - { - return encode(sentence.begin(), sentence.end()); - } - - string TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end) - { - if(!_pf_encode) - { - return ""; - } - return _pf_encode(begin, end); - } - string TransCode::vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end) + bool TransCode::vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) { if(begin >= end) { - return ""; + return false; } - string res; + res.clear(); uint16_t ui; while(begin != end) { @@ -154,17 +133,17 @@ namespace CppJieba } begin ++; } - return res; + return true; } - string TransCode::vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end) + bool TransCode::vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) { if(begin >= end) { - return ""; + return false; } + res.clear(); pair pa; - string res; while(begin != end) { pa = uint16ToChar2(*begin); @@ -179,21 +158,8 @@ namespace CppJieba } begin++; } - return res; + return true; } - - //size_t TransCode::getWordLength(const string& str) - //{ - // vector vec; - // if(!decode(str, vec)) - // { - // return 0; - // } - // else - // { - // return vec.size(); - // } - //} } @@ -202,27 +168,6 @@ using namespace CPPCOMMON; using namespace CppJieba; int main() { - //ifstream ifile("/home/wuyanyi/code/SevKeyword/log.2.txt"); - //string line; - //Unicode vec; - //while(getline(ifile, line)) - //{ - // - // cout<&); - //pf tmp = TransCode::a; - //vector vec; - //tmp("1",vec); - string a("abd你好世界!a"); vector vec; //TransCode::setUtf8Enc(); diff --git a/src/TransCode.h b/src/TransCode.h index 5627941..817aa79 100644 --- a/src/TransCode.h +++ b/src/TransCode.h @@ -17,38 +17,56 @@ namespace CppJieba { public: typedef bool (*pf_decode_t)(const string&, vector&); - typedef string (*pf_encode_t)(Unicode::const_iterator begin, Unicode::const_iterator end); - typedef size_t (*pf_getWordLength_t)(const string& str); + typedef bool (*pf_encode_t)(Unicode::const_iterator begin, Unicode::const_iterator end, string& res); private: static vector _encVec; static bool _isInitted; static pf_decode_t _pf_decode; static pf_encode_t _pf_encode; - static pf_getWordLength_t _pf_getWordLength; public: static void setGbkEnc(); static void setUtf8Enc(); - public: + private: TransCode(); ~TransCode(); public: static bool init(); public: - static bool decode(const string& str, vector& vec); - static string encode(Unicode::const_iterator begin, Unicode::const_iterator end); - static string encode(const Unicode& sentence); - //static size_t getWordLength(const string& str); + static inline bool decode(const string& str, vector& vec); + static inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res); + static inline bool encode(const Unicode& sentence, string& res); + public: static bool gbkToVec(const string& str, vector& vec); - static string vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end); - //static size_t getGbkLength(const string& str); + static bool vecToGbk(Unicode::const_iterator begin, Unicode::const_iterator end, string& res); public: static bool utf8ToVec(const string& str, vector& vec); - static string vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end); - //static size_t getUtf8Length(const string& str); + static bool vecToUtf8(Unicode::const_iterator begin, Unicode::const_iterator end, string& res); }; + + inline bool TransCode::decode(const string& str, vector& vec) + { + if(NULL == _pf_decode) + { + return false; + } + return _pf_decode(str, vec); + } + inline bool TransCode::encode(const Unicode& sentence, string& res) + { + return encode(sentence.begin(), sentence.end(), res); + } + + inline bool TransCode::encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) + { + if(!_pf_encode) + { + return false; + } + return _pf_encode(begin, end, res); + } } #endif diff --git a/src/structs.h b/src/structs.h index 13e4d9c..6cb4ed0 100644 --- a/src/structs.h +++ b/src/structs.h @@ -82,9 +82,11 @@ namespace CppJieba KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo) { } - string toString() const + inline string toString() const { - return string_format("{word:%s,weight:%lf, idf:%lf}", TransCode::encode(word.begin(), word.end()).c_str(), weight, idf); + string tmp; + TransCode::encode(word, tmp); + return string_format("{word:%s,weight:%lf, idf:%lf}", tmp.c_str(), weight, idf); } KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo) {