From a0866f99f845b3fb5c39cdb137b928f79793df21 Mon Sep 17 00:00:00 2001 From: gwdwyy Date: Thu, 11 Jul 2013 01:38:51 +0800 Subject: [PATCH] finished extract in Segment.cpp/g --- Segment.cpp | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++-- Segment.h | 9 ++++-- 2 files changed, 95 insertions(+), 4 deletions(-) diff --git a/Segment.cpp b/Segment.cpp index 08a6e7a..ff42e61 100644 --- a/Segment.cpp +++ b/Segment.cpp @@ -58,9 +58,70 @@ namespace CppJieba LogError("_cutDAG failed."); return false; } + + retFlag = _filter(res); + if(!retFlag) + { + LogError("_cutDAG failed."); + return false; + } return true; } + bool Segment::extract(const string& utf8Str, vector& keywords) + { + bool retFlag; + vector tmp; + retFlag = cutDAG(utf8Str, tmp); + if(!retFlag) + { + LogError(string_format("cutDAG(%s) failed.", utf8Str.c_str())); + return false; + } + + retFlag = _extract(tmp, keywords, 5); + if(!retFlag) + { + LogError("_extract failed."); + return false; + } + + return true; + } + + double Segment::getUtf8WordWeight(const string& word) + { + return _trie.getWeight(utf8ToUnicode(word)); + } + + double Segment::getUniWordWeight(const string& word) + { + return _trie.getWeight(word); + } + + bool Segment::_pair_compare(const pair& a, const pair& b) + { + return a.second < b.second; + } + + bool Segment::_extract(const vector& words, vector& keywords, uint topN) + { + keywords.clear(); + vector > tmp; + for(uint i = 0; i < words.size(); i++) + { + double w = getUtf8WordWeight(words[i]); + tmp.push_back(pair(words[i], w)); + LogDebug(string_format("(%s,%lf)", words[i].c_str(), w)); + } + + sort(tmp.begin(), tmp.end(), _pair_compare); + for(uint i = 0; i < topN && i < tmp.size(); i++) + { + keywords.push_back(tmp[i].first); + } + return true; + } string Segment::_utf8ToUni(const string& utfStr) { @@ -119,7 +180,7 @@ namespace CppJieba //cout<<(i/2)<<","< res[i/2].second) { @@ -163,6 +224,30 @@ namespace CppJieba return true; } + bool Segment::_filter(vector& utf8Strs) + { + for(vector::iterator it = utf8Strs.begin(); it != utf8Strs.end();) + { + string uniStr = utf8ToUnicode(*it); + if(uniStr.empty() || uniStr.size()%2) + { + LogError("utf8ToUnicode error"); + return false; + } + + // filter single word + if(uniStr.size() == 2) + { + it = utf8Strs.erase(it); + } + else + { + it++; + } + } + return true; + } + } @@ -179,7 +264,8 @@ int main() //string title = "我来到北京清华大学"; string title = "特价!camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏"; cout< #include "Trie.h" namespace CppJieba @@ -17,14 +18,18 @@ namespace CppJieba bool destroy(); public: bool cutDAG(const string& chStr, vector& res); - //bool cutMM(const string& chStr, vector& res); - //bool cutRMM(const string& chStr, vector& res); + bool extract(const string& utf8Str, vector& keywords); + double getUtf8WordWeight(const string& word); + double getUniWordWeight(const string& word); private: string _utf8ToUni(const string& chStr); bool _calcDAG(const string& uniStr, vector >& dag); bool _calcDP(const string& uniStr, const vector >& dag, vector >& res); bool _cutDAG(const string& uniStr, const vector >& dp, vector& res); + bool _filter(vector& utf8Strs); + static bool _pair_compare(const pair& a, const pair& b); + bool _extract(const vector& words, vector& keywords, uint topN); private: enum {bufSize = 1024};