diff --git a/Segment.cpp b/Segment.cpp index 3359ba2..3fee481 100644 --- a/Segment.cpp +++ b/Segment.cpp @@ -23,11 +23,8 @@ namespace CppJieba bool Segment::cutDAG(const string& chStr, vector& res) { res.clear(); - char utfBuf[bufSize]; - ChUnicode uniStr[bufSize]; - memset(uniStr, 0, sizeof(uniStr)); - size_t len = _utf8ToUni(chStr, uniStr, bufSize); - if(0 == len) + string uniStr = _utf8ToUni(chStr); + if(uniStr.empty()) { LogError("_utf8ToUni failed."); return false; @@ -35,15 +32,15 @@ namespace CppJieba //calc DAG vector > dag; - for(uint i = 0; i < len; i++) + for(uint i = 0; i < uniStr.size(); i+=2) { vector vec; - vec.push_back(i); - for(uint j = i + 2; j <= len; j++) + vec.push_back(i/2); + for(uint j = i + 4; j <= uniStr.size(); j+=2) { - if(NULL != _trie.find(uniStr + i, j - i)) + if(NULL != _trie.find(uniStr.substr(i, j - i))) { - vec.push_back(j - 1); + vec.push_back((j - 2)/2); } } dag.push_back(vec); @@ -54,7 +51,7 @@ namespace CppJieba getchar(); vector > dp; - _calcDP(uniStr, len, dag, dp); + _calcDP(uniStr, dag, dp); cout<<__FILE__<<__LINE__<& res) { res.clear(); - char utfBuf[bufSize]; - ChUnicode uniStr[bufSize]; - memset(uniStr, 0, sizeof(uniStr)); - size_t len = _utf8ToUni(chStr, uniStr, bufSize); - if(0 == len) + string uniStr = _utf8ToUni(chStr); + if(uniStr.empty()) { LogError("_utf8ToUni failed."); return false; @@ -111,15 +108,15 @@ namespace CppJieba } return true; } + */ + /* bool Segment::cutRMM(const string& chStr, vector& res) { res.clear(); char utfBuf[bufSize]; - ChUnicode uniStr[bufSize]; - memset(uniStr, 0, sizeof(uniStr)); - size_t len = _utf8ToUni(chStr, uniStr, bufSize); + string uniStr = _utf8ToUni(chStr); if(0 == len) { LogError("_utf8ToUni failed."); @@ -155,55 +152,68 @@ namespace CppJieba } return true; } + */ - size_t Segment::_utf8ToUni(const string& chStr, ChUnicode* uniStr, size_t size) + string Segment::_utf8ToUni(const string& utfStr) { char logBuf[bufSize]; - size_t len = utf8ToUnicode(chStr.c_str(), chStr.size(), uniStr); + string uniStr = utf8ToUnicode(utfStr); - if(0 == len) + if(uniStr.empty()) { - sprintf(logBuf, "utf8ToUnicode [%s] failed!", chStr.c_str()); + sprintf(logBuf, "utf8ToUnicode [%s] failed!", utfStr.c_str()); LogError(logBuf); - return 0; + return ""; } - - if(size - len <= 5) - { - sprintf(logBuf, "%s too long!", chStr.c_str()); - LogError(logBuf); - return 0; - } - return len; - + return uniStr; } - bool Segment::_calcDP(const ChUnicode* uniStr, size_t len, const vector >& dag, vector >& res) + bool Segment::_calcDP(const string& uniStr, const vector >& dag, vector >& res) { - if(len != dag.size()) + /* + for(int i =0;i(-1, 0.0)); - res[len].first = -1; - res[len].second = 0.0; - for(int i = len - 1; i >= 0; i--) + res.assign(uniStr.size()/2 + 1, pair(-1, 0.0)); + res[uniStr.size()/2].first = -1; + res[uniStr.size()/2].second = 0.0; + for(int i = uniStr.size() - 2; i >= 0; i-=2) { // calc max - res[i].first = -1; - res[i].second = -(numeric_limits::max()); - for(int j = 0; j < dag[i].size(); j++) + res[i/2].first = -1; + res[i/2].second = -(numeric_limits::max()); + for(int j = 0; j < dag[i/2].size(); j++) { - int pos = dag[i][j]; - double val = _trie.getWeight(uniStr + i, pos - i + 1) + res[pos+1].second; - //cout<<__LINE__<<"," - // < res[i].second) + //cout<<(i/2)<<","< res[i/2].second) { - res[i].first = pos; - res[i].second = val; + res[i/2].first = pos; + res[i/2].second = val; } } } @@ -223,13 +233,15 @@ int main() vector res; string title = "我来到北京清华大学"; - bool flag = segment.cutDAG(title, res); - if(flag) + /*segment.cutMM(title, res); + for(int i = 0; i < res.size(); i++) { - for(int i = 0; i < res.size(); i++) - { - cout<& res); - bool cutMM(const string& chStr, vector& res); - bool cutRMM(const string& chStr, vector& res); + //bool cutMM(const string& chStr, vector& res); + //bool cutRMM(const string& chStr, vector& res); private: - size_t _utf8ToUni(const string& chStr, ChUnicode* uniStr, size_t size); - bool _calcDP(const ChUnicode* uniStr, size_t len, const vector >& dag, vector >& res); + string _utf8ToUni(const string& chStr); + bool _calcDP(const string& uniStr, const vector >& dag, vector >& res); private: enum {bufSize = 1024}; diff --git a/Trie.cpp b/Trie.cpp index 42ac41d..e7f2484 100644 --- a/Trie.cpp +++ b/Trie.cpp @@ -233,6 +233,19 @@ namespace CppJieba } } + double Trie::getWeight(const string& uniStr) + { + const TrieNodeInfo * p = find(uniStr); + if(NULL != p) + { + return p->weight; + } + else + { + return _minWeight; + } + } + /* bool Trie::cut(const ChUnicode* chUniStr, size_t len, vector< vector >& res) {