create keyword_extract in Jieba

2025-07-18 00:00:12 +08:00 · 2016-09-11 21:42:36 +08:00 · 2016-09-11 21:42:36 +08:00 · 74c70c70cd
commit 74c70c70cd
parent 4a755dff6a
5 changed files with 44 additions and 27 deletions
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,5 +1,9 @@
 # CppJieba ChangeLog

+## next version
+
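+ Notice (**API changed**): the `Jieba` constructor now takes 5 arguments instead of 3 (the IDF dictionary path and the stop-word dictionary path were added), and `Jieba` now contains a `KeywordExtractor`, exposed as the public member `extractor`.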
+
 ## v4.8.1

 + add TextRankExtractor by [@questionfish] in [pull request 65](https://github.com/yanyiwu/cppjieba/pull/65)
--- a/include/cppjieba/Jieba.hpp
+++ b/include/cppjieba/Jieba.hpp
@@ -2,22 +2,25 @@
 #define CPPJIEAB_JIEBA_H

 #include "QuerySegment.hpp"
-//#include "LevelSegment.hpp"
+#include "KeywordExtractor.hpp"

 namespace cppjieba {

 class Jieba {
 public:
-  Jieba(const string& dict_path, const string& model_path, const string& user_dict_path) 
+  Jieba(const string& dict_path, 
+        const string& model_path,
+        const string& user_dict_path, 
+        const string& idfPath, 
+        const string& stopWordPath) 
    : dict_trie_(dict_path, user_dict_path),
      model_(model_path),
      mp_seg_(&dict_trie_),
      hmm_seg_(&model_),
      mix_seg_(&dict_trie_, &model_),
      full_seg_(&dict_trie_),
-      query_seg_(&dict_trie_, &model_)
-      //level_seg_(&dict_trie_),
-      {
+      query_seg_(&dict_trie_, &model_),
+      extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
  }
  ~Jieba() {
  }
@@ -95,8 +98,9 @@ class Jieba {
  MixSegment mix_seg_;
  FullSegment full_seg_;
  QuerySegment query_seg_;
-  //LevelSegment level_seg_;

+ public:
+  KeywordExtractor extractor;
 }; // class Jieba

 } // namespace cppjieba
--- a/include/cppjieba/KeywordExtractor.hpp
+++ b/include/cppjieba/KeywordExtractor.hpp
@@ -3,10 +3,12 @@

 #include <cmath>
 #include <set>
-#include "Jieba.hpp"
+#include "MixSegment.hpp"

 namespace cppjieba {
+
 using namespace limonp;
+using namespace std;

 /*utf8*/
 class KeywordExtractor {
@@ -34,13 +36,6 @@ class KeywordExtractor {
    LoadIdfDict(idfPath);
    LoadStopWordDict(stopWordPath);
  }
-  KeywordExtractor(const Jieba& jieba, 
-        const string& idfPath, 
-        const string& stopWordPath) 
-    : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
-    LoadIdfDict(idfPath);
-    LoadStopWordDict(stopWordPath);
-  }
  ~KeywordExtractor() {
  }

--- a/test/demo.cpp
+++ b/test/demo.cpp
@@ -1,5 +1,4 @@
 #include "cppjieba/Jieba.hpp"
-#include "cppjieba/KeywordExtractor.hpp"

 using namespace std;

@@ -12,7 +11,9 @@ const char* const STOP_WORD_PATH = "../dict/stop_words.utf8";
 int main(int argc, char** argv) {
  cppjieba::Jieba jieba(DICT_PATH,
        HMM_PATH,
-        USER_DICT_PATH);
+        USER_DICT_PATH,
+        IDF_PATH,
+        STOP_WORD_PATH);
  vector<string> words;
  vector<cppjieba::Word> jiebawords;
  string s;
@@ -69,13 +70,10 @@ int main(int argc, char** argv) {
  cout << s << endl;
  cout << tagres << endl;;

-  cppjieba::KeywordExtractor extractor(jieba,
-        IDF_PATH,
-        STOP_WORD_PATH);
  cout << "[demo] Keyword Extraction" << endl;
  const size_t topk = 5;
  vector<cppjieba::KeywordExtractor::Word> keywordres;
-  extractor.Extract(s, keywordres, topk);
+  jieba.extractor.Extract(s, keywordres, topk);
  cout << s << endl;
  cout << keywordres << endl;
  return EXIT_SUCCESS;
--- a/test/unittest/jieba_test.cpp
+++ b/test/unittest/jieba_test.cpp
@@ -6,7 +6,9 @@ using namespace cppjieba;
 TEST(JiebaTest, Test1) {
  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
                        "../dict/hmm_model.utf8",
-                            "../dict/user.dict.utf8");
+                        "../dict/user.dict.utf8",
+                        "../dict/idf.utf8",
+                        "../dict/stop_words.utf8");
  vector<string> words;
  string result;

@@ -41,7 +43,9 @@ TEST(JiebaTest, Test1) {
 TEST(JiebaTest, WordTest) {
  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
                        "../dict/hmm_model.utf8",
-                            "../dict/user.dict.utf8");
+                        "../dict/user.dict.utf8",
+                        "../dict/idf.utf8",
+                        "../dict/stop_words.utf8");
  vector<Word> words;
  string result;

@@ -81,7 +85,9 @@ TEST(JiebaTest, WordTest) {
 TEST(JiebaTest, InsertUserWord) {
  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
                        "../dict/hmm_model.utf8",
-                            "../dict/user.dict.utf8");
+                        "../dict/user.dict.utf8",
+                        "../dict/idf.utf8",
+                        "../dict/stop_words.utf8");
  vector<string> words;
  string result;

@@ -114,4 +120,14 @@ TEST(JiebaTest, InsertUserWord) {
  jieba.Cut("同一个世界，同一个梦想", words);
  result = Join(words.begin(), words.end(), "/");
  ASSERT_EQ(result, "同一个世界，同一个梦想");
+
+  {
+    string s("一部iPhone6");
+    string res;
+    vector<KeywordExtractor::Word> wordweights;
+    size_t topN = 5;
+    jieba.extractor.Extract(s, wordweights, topN);
+    res << wordweights;
+    ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]");
+  }
 }