Merge pull request #65 from questionfish/master

Added TextRank keyword extraction
Yanyi Wu 2016-05-04 20:02:07 +08:00
commit 02df433f73
4 changed files with 278 additions and 1 deletion
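For reviewers: a minimal usage sketch of the new extractor, assuming the stock dictionary files shipped under dict/ (adjust paths to your checkout):

#include <iostream>
#include <vector>
#include "cppjieba/TextRankExtractor.hpp"

int main() {
  // Dictionary paths are assumptions; point them at your checkout's dict/ dir.
  cppjieba::TextRankExtractor extractor(
      "dict/jieba.dict.utf8",
      "dict/hmm_model.utf8",
      "dict/stop_words.utf8");
  std::vector<cppjieba::TextRankExtractor::Word> words;
  extractor.Extract("你好世界，世界你好", words, 5);
  for (size_t i = 0; i < words.size(); i++) {
    std::cout << words[i] << std::endl; // operator<< for Word is added in this PR
  }
  return 0;
}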

View File: include/cppjieba/KeywordExtractor.hpp

@@ -145,7 +145,7 @@ class KeywordExtractor {
double idfAverage_;
unordered_set<string> stopWords_;
-}; // class Jieba
+}; // class KeywordExtractor
inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";

View File: include/cppjieba/TextRankExtractor.hpp

@@ -0,0 +1,190 @@
#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H
#define CPPJIEBA_TEXTRANK_EXTRACTOR_H
#include <cmath>
#include <set>
#include <map>
#include <cassert>
#include "Jieba.hpp"
namespace cppjieba {
using namespace limonp;
using namespace std;
class TextRankExtractor {
public:
typedef struct _Word {
  string word;
  vector<size_t> offsets;
  double weight;
} Word; // struct Word
private:
typedef std::map<string,Word> WordMap;
class WordGraph{
private:
typedef double Score;
typedef string Node;
typedef std::set<Node> NodeSet;
typedef std::map<Node,double> Edges;
typedef std::map<Node,Edges> Graph;
//typedef std::unordered_map<Node,double> Edges;
//typedef std::unordered_map<Node,Edges> Graph;
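// d is the damping factor (0.85 by default, as in the TextRank paper);
// graph holds weighted adjacency lists; nodeSet tracks every vertex.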
double d;
Graph graph;
NodeSet nodeSet;
public:
WordGraph(): d(0.85) {}
WordGraph(double in_d): d(in_d) {}
void addEdge(Node start, Node end, double weight) {
nodeSet.insert(start);
nodeSet.insert(end);
graph[start][end]+=weight;
graph[end][start]+=weight;
}
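// rank: run PageRank-style iterations over the co-occurrence graph,
// then rescale scores so the highest-ranked word has weight 1.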
void rank(WordMap &ws,size_t rankTime=10){
WordMap outSum;
Score wsdef, min_rank, max_rank;
if( graph.size() == 0)
return;
wsdef = 1.0 / graph.size();
for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){
// edges->first is the start node; edge->first is the end node; edge->second is the edge weight
ws[edges->first].word=edges->first;
ws[edges->first].weight=wsdef;
outSum[edges->first].weight=0;
for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){
outSum[edges->first].weight+=edge->second;
}
}
//sort(nodeSet.begin(),nodeSet.end()); is sorting needed? (std::set already iterates in sorted order)
for( size_t i=0; i<rankTime; i++ ){
for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++ ){
double s = 0;
for( Edges::iterator edge= graph[*node].begin(); edge != graph[*node].end(); edge++ )
// edge->first is the end node; edge->second is the edge weight
s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
ws[*node].weight = (1 - d) + d * s;
}
}
min_rank=max_rank=ws.begin()->second.weight;
for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
if( i->second.weight < min_rank ){
min_rank = i->second.weight;
}
if( i->second.weight > max_rank ){
max_rank = i->second.weight;
}
}
for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
}
}
};
public:
TextRankExtractor(const string& dictPath,
const string& hmmFilePath,
const string& stopWordPath,
const string& userDict = "")
: segment_(dictPath, hmmFilePath, userDict) {
LoadStopWordDict(stopWordPath);
}
TextRankExtractor(const DictTrie* dictTrie,
const HMMModel* model,
const string& stopWordPath)
: segment_(dictTrie, model) {
LoadStopWordDict(stopWordPath);
}
TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
LoadStopWordDict(stopWordPath);
}
~TextRankExtractor() {
}
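// Extract() overloads below return, respectively: keywords only,
// (keyword, weight) pairs, or full Word records with byte offsets.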
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
vector<Word> topWords;
Extract(sentence, topWords, topN);
for (size_t i = 0; i < topWords.size(); i++) {
keywords.push_back(topWords[i].word);
}
}
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
vector<Word> topWords;
Extract(sentence, topWords, topN);
for (size_t i = 0; i < topWords.size(); i++) {
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
}
}
void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const {
vector<string> words;
segment_.Cut(sentence, words);
TextRankExtractor::WordGraph graph;
WordMap wordmap;
size_t offset = 0;
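// Build the co-occurrence graph: connect each candidate word to the
// candidates that follow it within a window of `span` words; stop words
// and single-character tokens are skipped and widen the window instead.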
for(size_t i=0; i < words.size(); i++){
size_t t = offset;
offset += words[i].size();
if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
continue;
}
for(size_t j=i+1,skip=0;j<i+span+skip && j<words.size();j++){
if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
skip++;
continue;
}
graph.addEdge(words[i],words[j],1);
}
wordmap[words[i]].offsets.push_back(t);
}
if (offset != sentence.size()) {
XLOG(ERROR) << "words illegal";
return;
}
graph.rank(wordmap,rankTime);
keywords.clear();
keywords.reserve(wordmap.size());
for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
keywords.push_back(itr->second);
}
topN = min(topN, keywords.size());
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
keywords.resize(topN);
}
private:
void LoadStopWordDict(const string& filePath) {
ifstream ifs(filePath.c_str());
XCHECK(ifs.is_open()) << "open " << filePath << " failed";
string line;
while (getline(ifs, line)) {
stopWords_.insert(line);
}
assert(stopWords_.size());
}
static bool Compare(const Word &x,const Word &y){
return x.weight > y.weight;
}
MixSegment segment_;
unordered_set<string> stopWords_;
}; // class TextRankExtractor
inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
}
} // namespace cppjieba
#endif
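For reference, the rank() loop above implements the standard TextRank update (Mihalcea & Tarau, 2004), where d is the damping factor and w_{jk} the co-occurrence weight:

WS(V_i) = (1 - d) + d \sum_{V_j \in In(V_i)} \frac{w_{ji}}{\sum_{V_k \in Out(V_j)} w_{jk}} WS(V_j)

The final pass rescales scores with (w - min/10) / (max - min/10), matching the normalization in Python jieba's TextRank, so the top-ranked word gets weight 1.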

View File: test/unittest/CMakeLists.txt

@@ -14,6 +14,7 @@ ADD_EXECUTABLE(test.run
jieba_test.cpp
pre_filter_test.cpp
unicode_test.cpp
+textrank_test.cpp
)
if(MSVC)

View File: test/unittest/textrank_test.cpp

@@ -0,0 +1,86 @@
#include "cppjieba/TextRankExtractor.hpp"
#include "gtest/gtest.h"
using namespace cppjieba;
TEST(TextRankExtractorTest, Test1) {
TextRankExtractor Extractor(
"../test/testdata/extra_dict/jieba.dict.small.utf8",
"../dict/hmm_model.utf8",
"../dict/stop_words.utf8");
{
string s("你好世界世界而且而且");
string res;
size_t topN = 5;
{
vector<string> words;
Extractor.Extract(s, words, topN);
res << words;
ASSERT_EQ(res, "[\"世界\", \"你好\"]");
}
{
vector<pair<string, double> > words;
Extractor.Extract(s, words, topN);
res << words;
ASSERT_EQ(res, "[世界:1, 你好:0.519787]");
}
{
vector<TextRankExtractor::Word> words;
Extractor.Extract(s, words, topN);
res << words;
ASSERT_EQ(res, "[{\"word\": \"世界\", \"offset\": [6, 12], \"weight\": 1}, {\"word\": \"你好\", \"offset\": [0], \"weight\": 0.519787}]");
}
}
{
string s("\xe6\x88\x91\xe6\x98\xaf\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe5\xad\xa6\xe9\x99\xa2\xe6\x89\x8b\xe6\x89\xb6\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe4\xb8\x93\xe4\xb8\x9a\xe7\x9a\x84\xe3\x80\x82\xe4\xb8\x8d\xe7\x94\xa8\xe5\xa4\x9a\xe4\xb9\x85\xef\xbc\x8c\xe6\x88\x91\xe5\xb0\xb1\xe4\xbc\x9a\xe5\x8d\x87\xe8\x81\x8c\xe5\x8a\xa0\xe8\x96\xaa\xef\xbc\x8c\xe5\xbd\x93\xe4\xb8\x8a CEO\xef\xbc\x8c\xe8\xb5\xb0\xe4\xb8\x8a\xe4\xba\xba\xe7\x94\x9f\xe5\xb7\x85\xe5\xb3\xb0");
string res;
vector<TextRankExtractor::Word> wordweights;
size_t topN = 5;
Extractor.Extract(s, wordweights, topN);
res << wordweights;
ASSERT_EQ(res, "[{\"word\": \"当上\", \"offset\": [87], \"weight\": 1}, {\"word\": \"不用\", \"offset\": [48], \"weight\": 0.989848}, {\"word\": \"多久\", \"offset\": [54], \"weight\": 0.985126}, {\"word\": \"加薪\", \"offset\": [78], \"weight\": 0.983046}, {\"word\": \"升职\", \"offset\": [72], \"weight\": 0.980278}]");
//ASSERT_EQ(res, "[{\"word\": \"专业\", \"offset\": [36], \"weight\": 1}, {\"word\": \"CEO\", \"offset\": [94], \"weight\": 0.95375}, {\"word\": \"手扶拖拉机\", \"offset\": [21], \"weight\": 0.801701}, {\"word\": \"当上\", \"offset\": [87], \"weight\": 0.798968}, {\"word\": \"走上\", \"offset\": [100], \"weight\": 0.775505}]");
}
{
string s("一部iPhone6");
string res;
vector<TextRankExtractor::Word> wordweights;
size_t topN = 5;
Extractor.Extract(s, wordweights, topN);
res << wordweights;
ASSERT_EQ(res, "[{\"word\": \"一部\", \"offset\": [0], \"weight\": 1}, {\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 0.996126}]");
}
}
TEST(TextRankExtractorTest, Test2) {
TextRankExtractor Extractor(
"../test/testdata/extra_dict/jieba.dict.small.utf8",
"../dict/hmm_model.utf8",
"../dict/stop_words.utf8",
"../test/testdata/userdict.utf8");
{
string s("\xe8\x93\x9d\xe7\xbf\x94\xe4\xbc\x98\xe7\xa7\x80\xe6\xaf\x95\xe4\xb8\x9a\xe7\x94\x9f");
string res;
vector<TextRankExtractor::Word> wordweights;
size_t topN = 5;
Extractor.Extract(s, wordweights, topN);
res << wordweights;
ASSERT_EQ(res, "[{\"word\": \"蓝翔\", \"offset\": [0], \"weight\": 1}, {\"word\": \"毕业生\", \"offset\": [12], \"weight\": 0.996685}, {\"word\": \"优秀\", \"offset\": [6], \"weight\": 0.992994}]");
}
{
string s("一部iPhone6");
string res;
vector<TextRankExtractor::Word> wordweights;
size_t topN = 5;
Extractor.Extract(s, wordweights, topN);
res << wordweights;
ASSERT_EQ(res, "[{\"word\": \"一部\", \"offset\": [0], \"weight\": 1}, {\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 0.996126}]");
}
}