code layout change: tab -> space

This commit is contained in:
mayunyun 2016-05-03 09:03:16 +08:00
parent 1aa0a32d90
commit f2de41c15e

View File

@ -5,162 +5,162 @@
#include "Jieba.hpp" #include "Jieba.hpp"
namespace cppjieba { namespace cppjieba {
using namespace limonp; using namespace limonp;
using namespace std; using namespace std;
class TextRankExtractor { class TextRankExtractor {
public: public:
typedef struct _Word {string word;vector<size_t> offsets;double weight;} Word; // struct Word typedef struct _Word {string word;vector<size_t> offsets;double weight;} Word; // struct Word
private: private:
typedef std::unordered_map<string,Word> WordMap; typedef std::unordered_map<string,Word> WordMap;
class WordGraph{ class WordGraph{
private: private:
typedef double Score; typedef double Score;
typedef string Node; typedef string Node;
typedef std::unordered_set<Node> NodeSet; typedef std::unordered_set<Node> NodeSet;
typedef std::unordered_map<Node,double> Edges; typedef std::unordered_map<Node,double> Edges;
typedef std::unordered_map<Node,Edges> Graph; typedef std::unordered_map<Node,Edges> Graph;
double d; double d;
Graph graph; Graph graph;
NodeSet nodeSet; NodeSet nodeSet;
public: public:
WordGraph(): d(0.85) {}; WordGraph(): d(0.85) {};
WordGraph(double in_d): d(in_d) {}; WordGraph(double in_d): d(in_d) {};
void addEdge(Node start,Node end,double weight){ void addEdge(Node start,Node end,double weight){
Edges temp; Edges temp;
Edges::iterator gotEdges; Edges::iterator gotEdges;
nodeSet.insert(start); nodeSet.insert(start);
nodeSet.insert(end); nodeSet.insert(end);
graph[start][end]+=weight; graph[start][end]+=weight;
graph[end][start]+=weight; graph[end][start]+=weight;
} }
void rank(WordMap &ws,size_t rankTime=10){ void rank(WordMap &ws,size_t rankTime=10){
WordMap outSum; WordMap outSum;
Score wsdef, min_rank, max_rank; Score wsdef, min_rank, max_rank;
if( graph.size() == 0) if( graph.size() == 0)
return; return;
wsdef = 1.0 / graph.size(); wsdef = 1.0 / graph.size();
for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){ for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){
// edges->first start节点edge->first end节点edge->second 权重 // edges->first start节点edge->first end节点edge->second 权重
ws[edges->first].word=edges->first; ws[edges->first].word=edges->first;
ws[edges->first].weight=wsdef; ws[edges->first].weight=wsdef;
outSum[edges->first].weight=0; outSum[edges->first].weight=0;
for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){ for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){
outSum[edges->first].weight+=edge->second; outSum[edges->first].weight+=edge->second;
} }
} }
//sort(nodeSet.begin(),nodeSet.end()); 是否需要排序? //sort(nodeSet.begin(),nodeSet.end()); 是否需要排序?
for( size_t i=0; i<rankTime; i++ ){ for( size_t i=0; i<rankTime; i++ ){
for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++ ){ for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++ ){
double s = 0; double s = 0;
for( Edges::iterator edge= graph[*node].begin(); edge != graph[*node].end(); edge++ ) for( Edges::iterator edge= graph[*node].begin(); edge != graph[*node].end(); edge++ )
// edge->first end节点edge->second 权重 // edge->first end节点edge->second 权重
s += edge->second / outSum[edge->first].weight * ws[edge->first].weight; s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
ws[*node].weight = (1 - d) + d * s; ws[*node].weight = (1 - d) + d * s;
} }
} }
min_rank=max_rank=ws.begin()->second.weight; min_rank=max_rank=ws.begin()->second.weight;
for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
if( i->second.weight < min_rank ){ if( i->second.weight < min_rank ){
min_rank = i->second.weight; min_rank = i->second.weight;
} }
if( i->second.weight > max_rank ){ if( i->second.weight > max_rank ){
max_rank = i->second.weight; max_rank = i->second.weight;
} }
} }
for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0); ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
} }
} }
}; };
public: public:
TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) { TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
LoadStopWordDict(stopWordPath); LoadStopWordDict(stopWordPath);
} }
~TextRankExtractor() { ~TextRankExtractor() {
} }
void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const { void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const {
vector<string> words; vector<string> words;
segment_.Cut(sentence, words); segment_.Cut(sentence, words);
TextRankExtractor::WordGraph graph; TextRankExtractor::WordGraph graph;
WordMap wordmap; WordMap wordmap;
size_t offset = 0; size_t offset = 0;
for(size_t i=0; i < words.size(); i++){ for(size_t i=0; i < words.size(); i++){
size_t t = offset; size_t t = offset;
offset += words[i].size(); offset += words[i].size();
if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
continue; continue;
} }
for(size_t j=i+1;j<i+span && j<words.size();j++){ for(size_t j=i+1;j<i+span && j<words.size();j++){
if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) { if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
continue; continue;
} }
graph.addEdge(words[i],words[j],1); graph.addEdge(words[i],words[j],1);
} }
wordmap[words[i]].offsets.push_back(t); wordmap[words[i]].offsets.push_back(t);
} }
if (offset != sentence.size()) { if (offset != sentence.size()) {
XLOG(ERROR) << "words illegal"; XLOG(ERROR) << "words illegal";
return; return;
} }
graph.rank(wordmap,rankTime); graph.rank(wordmap,rankTime);
keywords.clear(); keywords.clear();
keywords.reserve(wordmap.size()); keywords.reserve(wordmap.size());
for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) { for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
keywords.push_back(itr->second); keywords.push_back(itr->second);
} }
topN = min(topN, keywords.size()); topN = min(topN, keywords.size());
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
keywords.resize(topN); keywords.resize(topN);
} }
private: private:
void LoadStopWordDict(const string& filePath) { void LoadStopWordDict(const string& filePath) {
ifstream ifs(filePath.c_str()); ifstream ifs(filePath.c_str());
XCHECK(ifs.is_open()) << "open " << filePath << " failed"; XCHECK(ifs.is_open()) << "open " << filePath << " failed";
string line ; string line ;
while (getline(ifs, line)) { while (getline(ifs, line)) {
stopWords_.insert(line); stopWords_.insert(line);
} }
assert(stopWords_.size()); assert(stopWords_.size());
} }
bool IsSingleWord(const string& str) const { bool IsSingleWord(const string& str) const {
Unicode unicode; Unicode unicode;
TransCode::Decode(str, unicode); TransCode::Decode(str, unicode);
if (unicode.size() == 1) if (unicode.size() == 1)
return true; return true;
return false; return false;
} }
static void sortMapValue(WordMap &map,vector<Word>& result,size_t topN){ static void sortMapValue(WordMap &map,vector<Word>& result,size_t topN){
for(auto i=map.begin();i!=map.end();i++){ for(auto i=map.begin();i!=map.end();i++){
result.push_back(i->second); result.push_back(i->second);
} }
partial_sort(result.begin(),result.begin()+topN,result.end(),Compare); partial_sort(result.begin(),result.begin()+topN,result.end(),Compare);
} }
static bool Compare(const Word &x,const Word &y){ static bool Compare(const Word &x,const Word &y){
return x.weight > y.weight; return x.weight > y.weight;
} }
MixSegment segment_; MixSegment segment_;
unordered_set<string> stopWords_; unordered_set<string> stopWords_;
}; };
} // namespace cppjieba } // namespace cppjieba
#endif #endif