// Segment.cpp -- member function definitions for CppJieba::Segment.
//
// NOTE(review): this text looks like it went through a lossy extraction step.
// The whole file is collapsed onto three physical lines, every `<...>` span is
// missing (template arguments such as the element types of `vector`, `pair`,
// `set` and `numeric_limits`), and the inner loop of _calcDP, the loop body of
// _cutDAG and the head of _filter are gone. Because of the collapse, each
// original `//` comment now also swallows the rest of its line. Restore the
// file from version control before compiling or changing it; the summary below
// only describes what is still readable.
//
// Public entry points:
//   init(dictFilePath)      Logs, then calls _trie.init(dictFilePath).
//                           NOTE(review): declared `bool` but no return
//                           statement is visible; flowing off the end of a
//                           non-void function is undefined behaviour.
//                           Presumably it should return the result of
//                           _trie.init -- confirm against Trie's interface.
//   destroy()               Returns _trie.destroy().
//   cutDAG(chStr, res)      Clears res, converts chStr with _utf8ToUni, then
//                           runs _calcDAG -> _calcDP -> _cutDAG. Logs an error
//                           and returns false as soon as any stage fails
//                           (an empty conversion result counts as failure).
//   extract(utf8Str, keywords)
//                           Runs cutDAG -> _filter -> _extractTopN with a
//                           hard-coded topN of 5; returns false on the first
//                           failing stage.
//   getUtf8WordWeight(word) Returns _trie.getWeight(utf8ToUnicode(word)).
//   getUniWordWeight(word)  Returns _trie.getWeight(word).
//
// Private helpers:
//   _pair_compare(a, b)     Returns a.second < b.second (ascending order).
//   _extractTopN(words, keywords, topN)
//                           Pairs every word with getUtf8WordWeight, sorts
//                           with _pair_compare, logs the sorted list and
//                           copies the first min(topN, count) words into
//                           keywords.
//                           NOTE(review): ascending order means the words with
//                           the SMALLEST weights are kept -- confirm that this
//                           is the intended ranking.
//   _utf8ToUni(utfStr)      Returns utf8ToUnicode(utfStr); logs an error and
//                           returns "" when that result is empty.
//   _calcDAG(uniStr, dag)   Walks uniStr in steps of two bytes. For each start
//                           offset i it builds a list beginning with i/2 and
//                           adds (j-2)/2 for every j = i+4, i+6, ... <=
//                           uniStr.size() for which
//                           _trie.find(uniStr.substr(i, j-i)) is non-NULL,
//                           then appends that list to dag.
//                           NOTE(review): dag is appended to without being
//                           cleared first; cutDAG passes a fresh vector.
//                           The two-byte stride suggests each character
//                           occupies two bytes in the converted string --
//                           TODO confirm against utf8ToUnicode.
//   _calcDP(uniStr, dag, res)
//                           Rejects input when uniStr.size()/2 != dag.size()
//                           or uniStr.size() < 2. Sizes res to
//                           uniStr.size()/2 + 1 entries of (-1, 0.0), fills
//                           the entries from the last character back to the
//                           first, keeping for each position the candidate
//                           `pos` with the largest `val`, and finally drops
//                           the extra trailing entry. The statements that
//                           compute `pos` and `val` are missing from this
//                           text.
//                           NOTE(review): the inner loop compares an `int`
//                           index against an unsigned size().
//   _cutDAG(uniStr, dp, res)
//                           Rejects input when dp.size() != uniStr.size()/2
//                           and clears res; the loop body is missing from
//                           this text.
//   _filter(utf8Strs)       Runs _filterSingleWord and then _filterSubstr,
//                           returning false if either fails.
//   _filterSingleWord(utf8Strs)
//                           Erases every entry whose utf8ToUnicode result has
//                           size 2; returns false when a result is empty or
//                           has odd size.
//   _filterSubstr(utf8Strs) Erases every entry that is found inside a
//                           different entry of the same list (compared
//                           against a copy taken before any erasing).
//
// When SEGMENT_UT is defined, a main() is compiled that initialises a Segment
// from "dicts/segdict.utf8.v2.1", calls extract on four sample titles and then
// calls destroy.
#include "Segment.h" namespace CppJieba { Segment::Segment():_trie() { } Segment::~Segment() { } bool Segment::init(const char* const dictFilePath) { LogInfo(string_format("_trie.init(%s) start...", dictFilePath)); _trie.init(dictFilePath); LogInfo("_trie.init end."); } bool Segment::destroy() { return _trie.destroy(); } bool Segment::cutDAG(const string& chStr, vector& res) { bool retFlag; res.clear(); string uniStr = _utf8ToUni(chStr); if(uniStr.empty()) { LogError("_utf8ToUni failed."); return false; } //calc DAG vector > dag; retFlag = _calcDAG(uniStr, dag); if(!retFlag) { LogError("_calcDAG failed."); return false; } LogDebug("_calcDAG finished."); vector > dp; retFlag = _calcDP(uniStr, dag, dp); if(!retFlag) { LogError("_calcDP failed."); return false; } LogDebug("_calcDP finished."); retFlag = _cutDAG(uniStr, dp, res); if(!retFlag) { LogError("_cutDAG failed."); return false; } LogDebug("_cutDAG finished."); return true; } bool Segment::extract(const string& utf8Str, vector& keywords) { LogInfo(utf8Str); bool retFlag; vector tmp; retFlag = cutDAG(utf8Str, tmp); if(!retFlag) { LogError(string_format("cutDAG(%s) failed.", utf8Str.c_str())); return false; } // like str.join([]) in python LogDebug(string_format("cutDAG result:[%s]", joinStr(tmp, ",").c_str())); retFlag = _filter(tmp); if(!retFlag) { LogError("_filter failed."); return false; } LogDebug(string_format("_filter res:[%s]", joinStr(tmp, ",").c_str())); retFlag = _extractTopN(tmp, keywords, 5); if(!retFlag) { LogError("_extractTopN failed."); return false; } LogDebug("_extractTopN finished."); LogInfo(string_format("ext res:[%s]", joinStr(keywords, ",").c_str())); return true; } double Segment::getUtf8WordWeight(const string& word) { return _trie.getWeight(utf8ToUnicode(word)); } double Segment::getUniWordWeight(const string& word) { return _trie.getWeight(word); } bool Segment::_pair_compare(const pair& a, const pair& b) { return a.second < b.second; } bool Segment::_extractTopN(const vector& words, 
vector& keywords, uint topN) { keywords.clear(); vector > tmp; for(uint i = 0; i < words.size(); i++) { double w = getUtf8WordWeight(words[i]); tmp.push_back(pair(words[i], w)); } sort(tmp.begin(), tmp.end(), _pair_compare); //logging result vector logBuf;//for LogDebug for(uint i = 0; i < tmp.size(); i++) { logBuf.push_back(string_format("(%s,%lf)", tmp[i].first.c_str(), tmp[i].second)); } LogDebug(string_format("calc weight:%s",joinStr(logBuf, ",").c_str())); //extract TopN for(uint i = 0; i < topN && i < tmp.size(); i++) { keywords.push_back(tmp[i].first); } return true; } string Segment::_utf8ToUni(const string& utfStr) { string uniStr = utf8ToUnicode(utfStr); if(uniStr.empty()) { LogError(string_format("utf8ToUnicode [%s] failed!", utfStr.c_str())); return ""; } return uniStr; } bool Segment::_calcDAG(const string& uniStr, vector >& dag) { for(uint i = 0; i < uniStr.size(); i+=2) { vector vec; vec.push_back(i/2); for(uint j = i + 4; j <= uniStr.size(); j+=2) { if(NULL != _trie.find(uniStr.substr(i, j - i))) { vec.push_back((j - 2)/2); } } dag.push_back(vec); } return true; } bool Segment::_calcDP(const string& uniStr, const vector >& dag, vector >& res) { if(uniStr.size() / 2 != dag.size()) { LogError("dag is illegal!"); return false; } if(uniStr.size() < 2) { LogError("uniStr illegal"); return false; } res.clear(); res.assign(uniStr.size()/2 + 1, pair(-1, 0.0)); res[uniStr.size()/2].first = -1; res[uniStr.size()/2].second = 0.0; for(int i = uniStr.size() - 2; i >= 0; i-=2) { // calc max res[i/2].first = -1; res[i/2].second = -(numeric_limits::max()); for(int j = 0; j < dag[i/2].size(); j++) { //cout<<(i/2)<<","< res[i/2].second) { res[i/2].first = pos; res[i/2].second = val; } } } res.pop_back(); return true; } bool Segment::_cutDAG(const string& uniStr, const vector >& dp, vector& res) { if(dp.size() != uniStr.size()/2) { LogError("dp or uniStr illegal!"); return false; } res.clear(); uint begin = 0; for(uint i = 0; i < dp.size(); i++) { //cout<& utf8Strs) { 
bool retFlag; retFlag = _filterSingleWord(utf8Strs); if(!retFlag) { LogError("_filterSingleWord failed."); return false; } LogDebug(string_format("_filterSingleWord res:[%s]", joinStr(utf8Strs, ",").c_str())); retFlag = _filterSubstr(utf8Strs); if(!retFlag) { LogError("_filterSubstr failed."); return false; } LogDebug(string_format("_filterSubstr res:[%s]", joinStr(utf8Strs, ",").c_str())); return true; } bool Segment::_filterSingleWord(vector& utf8Strs) { for(vector::iterator it = utf8Strs.begin(); it != utf8Strs.end();) { string uniStr = utf8ToUnicode(*it); if(uniStr.empty() || uniStr.size()%2) { LogError("utf8ToUnicode error"); return false; } // filter single word if(uniStr.size() == 2) { it = utf8Strs.erase(it); } else { it++; } } return true; } bool Segment::_filterSubstr(vector& utf8Strs) { vector tmp = utf8Strs; set subs; for(VSI it = utf8Strs.begin(); it != utf8Strs.end(); it ++) { for(uint j = 0; j < tmp.size(); j++) { if(*it != tmp[j] && string::npos != tmp[j].find(*it, 0)) { subs.insert(*it); } } } //erase subs from utf8Strs for(VSI it = utf8Strs.begin(); it != utf8Strs.end(); ) { if(subs.end() != subs.find(*it)) { LogDebug(string_format("_filterSubstr filter [%s].", it->c_str())); it = utf8Strs.erase(it); } else { it ++; } } return true; } } #ifdef SEGMENT_UT using namespace CppJieba; int main() { Segment segment; segment.init("dicts/segdict.utf8.v2.1"); //segment.init("dicts/jieba.dict.utf8"); vector res; string title; title = "我来到北京清华大学"; res.clear(); segment.extract(title, res); title = "特价!camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏"; res.clear(); segment.extract(title, res); title = "包邮拉菲草18cm大檐进口草帽子超强遮阳防晒欧美日韩新款夏天 女"; res.clear(); segment.extract(title, res); title = "2013新款19CM超大檐帽 遮阳草帽子 沙滩帽防晒大檐欧美新款夏天女"; res.clear(); segment.extract(title, res); segment.destroy(); return 0; } #endif