diff --git a/src/Segment.cpp b/src/Segment.cpp index 0baf1d9..718220f 100644 --- a/src/Segment.cpp +++ b/src/Segment.cpp @@ -41,10 +41,12 @@ namespace CppJieba bool Segment::cutDAG(const string& str, vector& res) { - bool retFlag; res.clear(); - string uniStr = gEncoding.decode(str); - if(uniStr.empty()) + + bool retFlag; + Unicode unicode; + retFlag = gEncoding.decode(str, unicode); + if(!retFlag) { LogError("gEncoding.decode failed."); return false; @@ -52,7 +54,7 @@ namespace CppJieba //calc DAG vector > dag; - retFlag = _calcDAG(uniStr, dag); + retFlag = _calcDAG(unicode, dag); if(!retFlag) { LogError("_calcDAG failed."); @@ -60,14 +62,14 @@ namespace CppJieba } vector > dp; - retFlag = _calcDP(uniStr, dag, dp); + retFlag = _calcDP(unicode, dag, dp); if(!retFlag) { LogError("_calcDP failed."); return false; } - retFlag = _cutDAG(uniStr, dp, res); + retFlag = _cutDAG(unicode, dp, res); if(!retFlag) { LogError("_cutDAG failed."); @@ -77,23 +79,24 @@ namespace CppJieba return true; } - double Segment::getWordWeight(const string& word) + bool Segment::_calcDAG(const Unicode& unicode, vector >& dag) { - return _trie.getWeight(word); - } - - bool Segment::_calcDAG(const string& uniStr, vector >& dag) - { - for(uint i = 0; i < uniStr.size(); i+=2) + if(unicode.empty()) + { + return false; + } + typedef UnicodeConstIterator UCI; + UCI beginIter = unicode.begin(); + for(UCI iterI = unicode.begin(); iterI != unicode.end(); iterI++) { vector vec; - vec.push_back(i/2); - for(uint j = i + 4; j <= uniStr.size(); j+=2) + vec.push_back(iterI - beginIter); + for(UCI iterJ = iterI + 1; iterJ != unicode.end(); iterJ++) { - //cout< >& dag, vector >& res) + bool Segment::_calcDP(const Unicode& unicode, const vector >& dag, vector >& res) { - if(uniStr.size() / 2 != dag.size()) + if(unicode.empty()) + { + LogError("unicode illegal"); + return false; + } + + if(unicode.size() != dag.size()) { LogError("dag is illegal!"); return false; } - if(uniStr.size() < 2) - { - LogError("uniStr illegal"); - return false; - } res.clear(); - res.assign(uniStr.size()/2 + 1, pair(-1, 0.0)); - res[uniStr.size()/2].first = -1; - res[uniStr.size()/2].second = 0.0; - for(int i = uniStr.size() - 2; i >= 0; i-=2) + res.assign(unicode.size() + 1, pair(-1, 0.0)); + res[unicode.size()].first = -1; + res[unicode.size()].second = 0.0; + + UnicodeConstIterator iterBegin = unicode.begin(); + + for(int i = unicode.size() - 1; i >= 0; i--) { // calc max - res[i/2].first = -1; - res[i/2].second = -(numeric_limits::max()); - for(int j = 0; j < dag[i/2].size(); j++) + res[i].first = -1; + res[i].second = -(numeric_limits::max()); + for(int j = 0; j < dag[i].size(); j++) { //cout<<(i/2)<<","< res[i/2].second) + if(val > res[i].second) { - res[i/2].first = pos; - res[i/2].second = val; + res[i].first = pos; + res[i].second = val; } } } res.pop_back(); return true; } - bool Segment::_cutDAG(const string& uniStr, const vector >& dp, vector& res) + bool Segment::_cutDAG(const Unicode& unicode, const vector >& dp, vector& res) { - if(dp.size() != uniStr.size()/2) + if(dp.size() != unicode.size()) { - LogError("dp or uniStr illegal!"); + LogError("dp or unicode illegal!"); return false; } res.clear(); uint begin = 0; + UnicodeConstIterator iterBegin = unicode.begin(); for(uint i = 0; i < dp.size(); i++) { //cout<& res); - double getWordWeight(const string& word); private: - bool _calcDAG(const string& uniStr, vector >& dag); - bool _calcDP(const string& uniStr, const vector >& dag, vector >& res); - bool _cutDAG(const string& uniStr, const vector >& dp, vector& res); + bool _calcDAG(const Unicode& unicode, vector >& dag); + bool _calcDP(const Unicode& unicode, const vector >& dag, vector >& res); + bool _cutDAG(const Unicode& unicode, const vector >& dp, vector& res); }; } diff --git a/src/Trie.cpp b/src/Trie.cpp index c913d7f..2fc1a7b 100644 --- a/src/Trie.cpp +++ b/src/Trie.cpp @@ -156,7 +156,7 @@ namespace CppJieba LogFatal("trie not initted!"); return NULL; } - vector unicode; + Unicode unicode; bool retFlag = gEncoding.decode(str, unicode); if(retFlag) @@ -199,7 +199,7 @@ namespace CppJieba const TrieNodeInfo* Trie::find(const string& str) { - vector unicode; + Unicode unicode; bool retFlag = gEncoding.decode(str, unicode); if(!retFlag) { @@ -208,7 +208,16 @@ namespace CppJieba return find(unicode); } - const TrieNodeInfo* Trie::find(const vector& unicode) + const TrieNodeInfo* Trie::find(const Unicode& unicode) + { + if(unicode.empty()) + { + return NULL; + } + return find(unicode.begin(), unicode.end()); + } + + const TrieNodeInfo* Trie::find(UnicodeConstIterator begin, UnicodeConstIterator end) { if(!_getInitFlag()) @@ -216,15 +225,14 @@ namespace CppJieba LogFatal("trie not initted!"); return NULL; } - if(unicode.empty()) + if(begin >= end) { - LogError("unicode empty"); return NULL; } TrieNode* p = _root; - for(uint i = 0; i < unicode.size(); i++) + for(UnicodeConstIterator it = begin; it != end; it++) { - uint16_t chUni = unicode[i]; + uint16_t chUni = *it; if(p->hmap.find(chUni) == p-> hmap.end()) { return NULL; @@ -253,8 +261,17 @@ namespace CppJieba double Trie::getWeight(const string& str) { - vector unicode; + Unicode unicode; gEncoding.decode(str, unicode); + return getWeight(unicode); + } + + double Trie::getWeight(const Unicode& unicode) + { + if(unicode.empty()) + { + return getMinWeight(); + } const TrieNodeInfo * p = find(unicode); if(NULL != p) { @@ -264,6 +281,20 @@ namespace CppJieba { return getMinWeight(); } + + } + + double Trie::getWeight(UnicodeConstIterator begin, UnicodeConstIterator end) + { + const TrieNodeInfo * p = find(begin, end); + if(NULL != p) + { + return p->weight; + } + else + { + return getMinWeight(); + } } double Trie::getMinWeight() @@ -299,7 +330,7 @@ namespace CppJieba const string& word = nodeInfo.word; - vector unicode; + Unicode unicode; bool retFlag = gEncoding.decode(word, unicode); if(!retFlag) { diff --git a/src/Trie.h b/src/Trie.h index 8163f33..cd0b0d5 100644 --- a/src/Trie.h +++ b/src/Trie.h @@ -88,11 +88,14 @@ namespace CppJieba public: const TrieNodeInfo* find(const string& str); - const TrieNodeInfo* find(const vector& unicode); + const TrieNodeInfo* find(const Unicode& unicode); + const TrieNodeInfo* find(UnicodeConstIterator begin, UnicodeConstIterator end); const TrieNodeInfo* findPrefix(const string& str); public: double getWeight(const string& str); + double getWeight(const Unicode& unicode); + double getWeight(UnicodeConstIterator begin, UnicodeConstIterator end); double getMinWeight(); int64_t getTotalCount(); diff --git a/src/cppcommon/encoding.cpp b/src/cppcommon/encoding.cpp index 26c1600..ab1e7a1 100644 --- a/src/cppcommon/encoding.cpp +++ b/src/cppcommon/encoding.cpp @@ -38,7 +38,17 @@ namespace CPPCOMMON return true; } - string UnicodeEncoding::encode(const vector& unicode) + string UnicodeEncoding::encode(UnicodeConstIterator begin, UnicodeConstIterator end) + { + if(begin >= end) + { + return ""; + } + Unicode unicode(begin, end); + return encode(unicode); + } + + string UnicodeEncoding::encode(const Unicode& unicode) { if(unicode.empty()) { @@ -55,7 +65,7 @@ namespace CPPCOMMON return ""; } - bool UnicodeEncoding::decode(const string& str, vector& unicode) + bool UnicodeEncoding::decode(const string& str, Unicode& unicode) { if(str.empty()) { @@ -80,7 +90,7 @@ int main() { UnicodeEncoding enc(GBKENC); ifstream ifile("testdata/dict.gbk"); - vector unicode; + Unicode unicode; string line; while(getline(ifile, line)) { diff --git a/src/cppcommon/encoding.h b/src/cppcommon/encoding.h index a1ef9eb..e77ca19 100644 --- a/src/cppcommon/encoding.h +++ b/src/cppcommon/encoding.h @@ -26,8 +26,9 @@ namespace CPPCOMMON ~UnicodeEncoding(); public: bool setEncoding(const string& enc); - string encode(const vector& unicode); - bool decode(const string& str, vector& unicode); + string encode(const Unicode& unicode); + string encode(UnicodeConstIterator begin, UnicodeConstIterator end); + bool decode(const string& str, Unicode& unicode); }; } diff --git a/src/cppcommon/str_functs.cpp b/src/cppcommon/str_functs.cpp index cc428f3..3349500 100644 --- a/src/cppcommon/str_functs.cpp +++ b/src/cppcommon/str_functs.cpp @@ -213,7 +213,7 @@ namespace CPPCOMMON return res; } - string unicodeToUtf8(const vector& unicode) + string unicodeToUtf8(const Unicode& unicode) { if(unicode.empty()) { @@ -283,7 +283,7 @@ namespace CPPCOMMON return length; } - bool utf8ToUnicode(const string& utfStr, vector& unicode) + bool utf8ToUnicode(const string& utfStr, Unicode& unicode) { unicode.clear(); if(utfStr.empty()) @@ -376,7 +376,7 @@ namespace CPPCOMMON } //unicode str to vec - bool uniStrToVec(const string& str, vector& vec) + bool uniStrToVec(const string& str, Unicode& vec) { vec.clear(); if(str.empty() || str.size() % 2) @@ -392,7 +392,7 @@ namespace CPPCOMMON } //unicode vec to str - string uniVecToStr(const vector& vec) + string uniVecToStr(const Unicode& vec) { string res(""); for(uint i = 0; i < vec.size(); i++) @@ -451,7 +451,7 @@ int main() //cout< unicode; + Unicode unicode; while(getline(ifile, line)) { cout<& unicode); + string unicodeToUtf8(const Unicode& unicode); int utf8ToUnicode(const char* inutf8, int len, uint16_t* unicode); - bool utf8ToUnicode(const string& utfStr, vector& unicode); + bool utf8ToUnicode(const string& utfStr, Unicode& unicode); int code_convert(const char *from_charset,const char *to_charset,char *inbuf,size_t inlen,char *outbuf,size_t outlen); string gbkToUtf8(const string& gbk); string utf8ToGbk(const string& utf); - bool uniStrToVec(const string& str, vector& vec); - string uniVecToStr(const vector& vec); + bool uniStrToVec(const string& str, Unicode& vec); + string uniVecToStr(const Unicode& vec); inline uint16_t twocharToUint16(char high, char low) { @@ -62,7 +62,7 @@ namespace CPPCOMMON return res; } - inline void printUnicode(const vector& unicode) + inline void printUnicode(const Unicode& unicode) { cout< Unicode; + typedef std::vector::const_iterator UnicodeConstIterator; }