Finished extract in Segment.cpp/h

This commit is contained in:
gwdwyy 2013-07-11 01:38:51 +08:00
parent 30488d0473
commit a0866f99f8
2 changed files with 95 additions and 4 deletions

View File

@ -59,6 +59,67 @@ namespace CppJieba
return false; return false;
} }
retFlag = _filter(res);
if(!retFlag)
{
LogError("_cutDAG failed.");
return false;
}
return true;
}
bool Segment::extract(const string& utf8Str, vector<string>& keywords)
{
bool retFlag;
vector<string> tmp;
retFlag = cutDAG(utf8Str, tmp);
if(!retFlag)
{
LogError(string_format("cutDAG(%s) failed.", utf8Str.c_str()));
return false;
}
retFlag = _extract(tmp, keywords, 5);
if(!retFlag)
{
LogError("_extract failed.");
return false;
}
return true;
}
double Segment::getUtf8WordWeight(const string& word)
{
return _trie.getWeight(utf8ToUnicode(word));
}
// Looks up the trie weight of a word; the argument is forwarded to
// _trie.getWeight unchanged (no encoding conversion is performed here).
double Segment::getUniWordWeight(const string& word)
{
    const double weight = _trie.getWeight(word);
    return weight;
}
// Ordering predicate for (word, weight) pairs: true when lhs has the strictly
// smaller weight, so sorting with it yields ascending weights.
bool Segment::_pair_compare(const pair<string, double>& a, const pair<string, double>& b)
{
    const double lhsWeight = a.second;
    const double rhsWeight = b.second;
    return rhsWeight > lhsWeight;
}
// Selects up to topN keywords from words, taking the entries with the smallest weights.
// keywords is cleared first and then filled in ascending-weight order.
// Always returns true.
bool Segment::_extract(const vector<string>& words, vector<string>& keywords, uint topN)
{
keywords.clear();
// Pair every word with the weight reported by getUtf8WordWeight.
vector<pair<string, double> > tmp;
for(uint i = 0; i < words.size(); i++)
{
double w = getUtf8WordWeight(words[i]);
tmp.push_back(pair<string, double>(words[i], w));
LogDebug(string_format("(%s,%lf)", words[i].c_str(), w));
}
// _pair_compare orders by weight ascending, so the lowest-weight words come first.
// NOTE(review): presumably a lower weight marks a rarer, more distinctive word -- confirm
// against how the trie defines its weights.
sort(tmp.begin(), tmp.end(), _pair_compare);
// Keep at most topN entries; fewer when there are not enough words.
for(uint i = 0; i < topN && i < tmp.size(); i++)
{
keywords.push_back(tmp[i].first);
}
// NOTE(review): the doubled tokens on the next two lines look like side-by-side diff
// rendering (old and new column) rather than real source -- verify against the repository.
return true; return true;
} }
@ -119,7 +180,7 @@ namespace CppJieba
//cout<<(i/2)<<","<<dag[i/2].size()<<","<<j<<endl; //cout<<(i/2)<<","<<dag[i/2].size()<<","<<j<<endl;
//getchar(); //getchar();
int pos = dag[i/2][j]; int pos = dag[i/2][j];
double val = _trie.getWeight(uniStr.substr(i, pos * 2 - i + 2)) + res[pos + 1].second; double val = getUniWordWeight(uniStr.substr(i, pos * 2 - i + 2)) + res[pos + 1].second;
//cout<<pos<<","<<pos * 2 - i + 2<<","<<val<<endl; //cout<<pos<<","<<pos * 2 - i + 2<<","<<val<<endl;
if(val > res[i/2].second) if(val > res[i/2].second)
{ {
@ -163,6 +224,30 @@ namespace CppJieba
return true; return true;
} }
// Removes single-character entries from utf8Strs in place.
// Each entry is converted with utf8ToUnicode; a converted length of exactly 2
// is treated as a single word and erased (presumably two bytes per character
// in the converted form -- confirm against utf8ToUnicode).
// Returns false as soon as a conversion yields an empty or odd-length result;
// entries erased before that point stay erased. Returns true otherwise.
bool Segment::_filter(vector<string>& utf8Strs)
{
    vector<string>::iterator cur = utf8Strs.begin();
    while(cur != utf8Strs.end())
    {
        const string converted = utf8ToUnicode(*cur);
        const size_t convertedLen = converted.size();

        if(convertedLen == 0 || convertedLen % 2 != 0)
        {
            LogError("utf8ToUnicode error");
            return false;
        }

        if(convertedLen == 2)
        {
            // erase returns the iterator to the element that followed the removed one
            cur = utf8Strs.erase(cur);
            continue;
        }

        ++cur;
    }
    return true;
}
} }
@ -179,7 +264,8 @@ int main()
//string title = "我来到北京清华大学"; //string title = "我来到北京清华大学";
string title = "特价camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏"; string title = "特价camel骆驼 柔软舒适头层牛皮平底凉鞋女 休闲平跟妈妈鞋夏";
cout<<title<<endl; cout<<title<<endl;
segment.cutDAG(title, res); //segment.cutDAG(title, res);
segment.extract(title, res);
for(int i = 0; i < res.size(); i++) for(int i = 0; i < res.size(); i++)
{ {
cout<<res[i]<<endl; cout<<res[i]<<endl;

View File

@ -1,6 +1,7 @@
#ifndef SEGMENT_H #ifndef SEGMENT_H
#define SEGMENT_H #define SEGMENT_H
#include <algorithm>
#include "Trie.h" #include "Trie.h"
namespace CppJieba namespace CppJieba
@ -17,14 +18,18 @@ namespace CppJieba
bool destroy(); bool destroy();
public: public:
bool cutDAG(const string& chStr, vector<string>& res); bool cutDAG(const string& chStr, vector<string>& res);
//bool cutMM(const string& chStr, vector<string>& res); bool extract(const string& utf8Str, vector<string>& keywords);
//bool cutRMM(const string& chStr, vector<string>& res); double getUtf8WordWeight(const string& word);
double getUniWordWeight(const string& word);
private: private:
string _utf8ToUni(const string& chStr); string _utf8ToUni(const string& chStr);
bool _calcDAG(const string& uniStr, vector<vector<uint> >& dag); bool _calcDAG(const string& uniStr, vector<vector<uint> >& dag);
bool _calcDP(const string& uniStr, const vector<vector<uint> >& dag, vector<pair<int, double> >& res); bool _calcDP(const string& uniStr, const vector<vector<uint> >& dag, vector<pair<int, double> >& res);
bool _cutDAG(const string& uniStr, const vector<pair<int, double> >& dp, vector<string>& res); bool _cutDAG(const string& uniStr, const vector<pair<int, double> >& dp, vector<string>& res);
bool _filter(vector<string>& utf8Strs);
static bool _pair_compare(const pair<string, double>& a, const pair<string, double>& b);
bool _extract(const vector<string>& words, vector<string>& keywords, uint topN);
private: private:
enum {bufSize = 1024}; enum {bufSize = 1024};