create keyword_extract in Jieba

This commit is contained in:
yanyiwu 2016-09-11 21:42:36 +08:00
parent 4a755dff6a
commit 74c70c70cd
5 changed files with 44 additions and 27 deletions

View File

@ -1,5 +1,9 @@
# CppJieba ChangeLog # CppJieba ChangeLog
## next version
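+ Notice (**API changed**): the `Jieba` constructor now takes 5 arguments instead of 3 (added `idfPath` and `stopWordPath`), and a `KeywordExtractor` is now built into `Jieba` as the public member `extractor`.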
## v4.8.1 ## v4.8.1
+ add TextRankExtractor by [@questionfish] in [pull request 65](https://github.com/yanyiwu/cppjieba/pull/65) + add TextRankExtractor by [@questionfish] in [pull request 65](https://github.com/yanyiwu/cppjieba/pull/65)

View File

@ -2,22 +2,25 @@
#define CPPJIEAB_JIEBA_H #define CPPJIEAB_JIEBA_H
#include "QuerySegment.hpp" #include "QuerySegment.hpp"
//#include "LevelSegment.hpp" #include "KeywordExtractor.hpp"
namespace cppjieba { namespace cppjieba {
class Jieba { class Jieba {
public: public:
Jieba(const string& dict_path, const string& model_path, const string& user_dict_path) Jieba(const string& dict_path,
const string& model_path,
const string& user_dict_path,
const string& idfPath,
const string& stopWordPath)
: dict_trie_(dict_path, user_dict_path), : dict_trie_(dict_path, user_dict_path),
model_(model_path), model_(model_path),
mp_seg_(&dict_trie_), mp_seg_(&dict_trie_),
hmm_seg_(&model_), hmm_seg_(&model_),
mix_seg_(&dict_trie_, &model_), mix_seg_(&dict_trie_, &model_),
full_seg_(&dict_trie_), full_seg_(&dict_trie_),
query_seg_(&dict_trie_, &model_) query_seg_(&dict_trie_, &model_),
//level_seg_(&dict_trie_), extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
{
} }
~Jieba() { ~Jieba() {
} }
@ -84,7 +87,7 @@ class Jieba {
const HMMModel* GetHMMModel() const { const HMMModel* GetHMMModel() const {
return &model_; return &model_;
} }
private: private:
DictTrie dict_trie_; DictTrie dict_trie_;
HMMModel model_; HMMModel model_;
@ -95,8 +98,9 @@ class Jieba {
MixSegment mix_seg_; MixSegment mix_seg_;
FullSegment full_seg_; FullSegment full_seg_;
QuerySegment query_seg_; QuerySegment query_seg_;
//LevelSegment level_seg_;
public:
KeywordExtractor extractor;
}; // class Jieba }; // class Jieba
} // namespace cppjieba } // namespace cppjieba

View File

@ -3,10 +3,12 @@
#include <cmath> #include <cmath>
#include <set> #include <set>
#include "Jieba.hpp" #include "MixSegment.hpp"
namespace cppjieba { namespace cppjieba {
using namespace limonp; using namespace limonp;
using namespace std;
/*utf8*/ /*utf8*/
class KeywordExtractor { class KeywordExtractor {
@ -34,13 +36,6 @@ class KeywordExtractor {
LoadIdfDict(idfPath); LoadIdfDict(idfPath);
LoadStopWordDict(stopWordPath); LoadStopWordDict(stopWordPath);
} }
KeywordExtractor(const Jieba& jieba,
const string& idfPath,
const string& stopWordPath)
: segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
LoadIdfDict(idfPath);
LoadStopWordDict(stopWordPath);
}
~KeywordExtractor() { ~KeywordExtractor() {
} }

View File

@ -1,5 +1,4 @@
#include "cppjieba/Jieba.hpp" #include "cppjieba/Jieba.hpp"
#include "cppjieba/KeywordExtractor.hpp"
using namespace std; using namespace std;
@ -12,7 +11,9 @@ const char* const STOP_WORD_PATH = "../dict/stop_words.utf8";
int main(int argc, char** argv) { int main(int argc, char** argv) {
cppjieba::Jieba jieba(DICT_PATH, cppjieba::Jieba jieba(DICT_PATH,
HMM_PATH, HMM_PATH,
USER_DICT_PATH); USER_DICT_PATH,
IDF_PATH,
STOP_WORD_PATH);
vector<string> words; vector<string> words;
vector<cppjieba::Word> jiebawords; vector<cppjieba::Word> jiebawords;
string s; string s;
@ -69,13 +70,10 @@ int main(int argc, char** argv) {
cout << s << endl; cout << s << endl;
cout << tagres << endl;; cout << tagres << endl;;
cppjieba::KeywordExtractor extractor(jieba,
IDF_PATH,
STOP_WORD_PATH);
cout << "[demo] Keyword Extraction" << endl; cout << "[demo] Keyword Extraction" << endl;
const size_t topk = 5; const size_t topk = 5;
vector<cppjieba::KeywordExtractor::Word> keywordres; vector<cppjieba::KeywordExtractor::Word> keywordres;
extractor.Extract(s, keywordres, topk); jieba.extractor.Extract(s, keywordres, topk);
cout << s << endl; cout << s << endl;
cout << keywordres << endl; cout << keywordres << endl;
return EXIT_SUCCESS; return EXIT_SUCCESS;

View File

@ -5,8 +5,10 @@ using namespace cppjieba;
TEST(JiebaTest, Test1) { TEST(JiebaTest, Test1) {
cppjieba::Jieba jieba("../dict/jieba.dict.utf8", cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
"../dict/hmm_model.utf8", "../dict/hmm_model.utf8",
"../dict/user.dict.utf8"); "../dict/user.dict.utf8",
"../dict/idf.utf8",
"../dict/stop_words.utf8");
vector<string> words; vector<string> words;
string result; string result;
@ -40,8 +42,10 @@ TEST(JiebaTest, Test1) {
} }
TEST(JiebaTest, WordTest) { TEST(JiebaTest, WordTest) {
cppjieba::Jieba jieba("../dict/jieba.dict.utf8", cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
"../dict/hmm_model.utf8", "../dict/hmm_model.utf8",
"../dict/user.dict.utf8"); "../dict/user.dict.utf8",
"../dict/idf.utf8",
"../dict/stop_words.utf8");
vector<Word> words; vector<Word> words;
string result; string result;
@ -80,8 +84,10 @@ TEST(JiebaTest, WordTest) {
TEST(JiebaTest, InsertUserWord) { TEST(JiebaTest, InsertUserWord) {
cppjieba::Jieba jieba("../dict/jieba.dict.utf8", cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
"../dict/hmm_model.utf8", "../dict/hmm_model.utf8",
"../dict/user.dict.utf8"); "../dict/user.dict.utf8",
"../dict/idf.utf8",
"../dict/stop_words.utf8");
vector<string> words; vector<string> words;
string result; string result;
@ -114,4 +120,14 @@ TEST(JiebaTest, InsertUserWord) {
jieba.Cut("同一个世界,同一个梦想", words); jieba.Cut("同一个世界,同一个梦想", words);
result = Join(words.begin(), words.end(), "/"); result = Join(words.begin(), words.end(), "/");
ASSERT_EQ(result, "同一个世界,同一个梦想"); ASSERT_EQ(result, "同一个世界,同一个梦想");
{
string s("一部iPhone6");
string res;
vector<KeywordExtractor::Word> wordweights;
size_t topN = 5;
jieba.extractor.Extract(s, wordweights, topN);
res << wordweights;
ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]");
}
} }