#include "Segment.h" namespace CppJieba { Segment::Segment():_trie() { } Segment::~Segment() { } bool Segment::init(const char* const dictFilePath) { bool retFlag; LogInfo(string_format("_trie.init(%s) start...", dictFilePath)); retFlag = _trie.init(dictFilePath); LogInfo("_trie.init end."); return retFlag; } bool Segment::destroy() { return _trie.destroy(); } bool Segment::cutDAG(const string& chStr, vector& res) { bool retFlag; res.clear(); string uniStr = _utf8ToUni(chStr); if(uniStr.empty()) { LogError("_utf8ToUni failed."); return false; } //calc DAG vector > dag; retFlag = _calcDAG(uniStr, dag); if(!retFlag) { LogError("_calcDAG failed."); return false; } //LogDebug("_calcDAG finished."); vector > dp; retFlag = _calcDP(uniStr, dag, dp); if(!retFlag) { LogError("_calcDP failed."); return false; } //LogDebug("_calcDP finished."); retFlag = _cutDAG(uniStr, dp, res); if(!retFlag) { LogError("_cutDAG failed."); return false; } //LogDebug("_cutDAG finished."); return true; } double Segment::getUtf8WordWeight(const string& word) { return _trie.getWeight(utf8ToUnicode(word)); } double Segment::getUniWordWeight(const string& word) { return _trie.getWeight(word); } string Segment::_utf8ToUni(const string& utfStr) { string uniStr = utf8ToUnicode(utfStr); if(uniStr.empty()) { LogError(string_format("utf8ToUnicode [%s] failed!", utfStr.c_str())); return ""; } return uniStr; } bool Segment::_calcDAG(const string& uniStr, vector >& dag) { for(uint i = 0; i < uniStr.size(); i+=2) { vector vec; vec.push_back(i/2); for(uint j = i + 4; j <= uniStr.size(); j+=2) { if(NULL != _trie.find(uniStr.substr(i, j - i))) { vec.push_back((j - 2)/2); } } dag.push_back(vec); } return true; } bool Segment::_calcDP(const string& uniStr, const vector >& dag, vector >& res) { if(uniStr.size() / 2 != dag.size()) { LogError("dag is illegal!"); return false; } if(uniStr.size() < 2) { LogError("uniStr illegal"); return false; } res.clear(); res.assign(uniStr.size()/2 + 1, pair(-1, 0.0)); res[uniStr.size()/2].first = -1; res[uniStr.size()/2].second = 0.0; for(int i = uniStr.size() - 2; i >= 0; i-=2) { // calc max res[i/2].first = -1; res[i/2].second = -(numeric_limits::max()); for(int j = 0; j < dag[i/2].size(); j++) { //cout<<(i/2)<<","< res[i/2].second) { res[i/2].first = pos; res[i/2].second = val; } } } res.pop_back(); return true; } bool Segment::_cutDAG(const string& uniStr, const vector >& dp, vector& res) { if(dp.size() != uniStr.size()/2) { LogError("dp or uniStr illegal!"); return false; } res.clear(); uint begin = 0; for(uint i = 0; i < dp.size(); i++) { //cout< res; string title; title = "我来到北京清华大学"; res.clear(); segment.cutDAG(title, res); PRINT_VECTOR(res); title = "特价!camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏"; res.clear(); segment.cutDAG(title, res); PRINT_VECTOR(res); title = "包邮拉菲草18cm大檐进口草帽子超强遮阳防晒欧美日韩新款夏天 女"; res.clear(); segment.cutDAG(title, res); PRINT_VECTOR(res); title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女"; res.clear(); segment.cutDAG(title, res); PRINT_VECTOR(res); segment.destroy(); return 0; } #endif