mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
create keyword_extract in Jieba
This commit is contained in:
parent
4a755dff6a
commit
74c70c70cd
@ -1,5 +1,9 @@
|
|||||||
# CppJieba ChangeLog
|
# CppJieba ChangeLog
|
||||||
|
|
||||||
|
## next version
|
||||||
|
|
||||||
|
+ Notice (**API changed**): the Jieba class constructor now takes 5 arguments instead of 3 (adding the IDF and stop-word dictionary paths), and Jieba now contains a KeywordExtractor
|
||||||
|
|
||||||
## v4.8.1
|
## v4.8.1
|
||||||
|
|
||||||
+ add TextRankExtractor by [@questionfish] in [pull request 65](https://github.com/yanyiwu/cppjieba/pull/65)
|
+ add TextRankExtractor by [@questionfish] in [pull request 65](https://github.com/yanyiwu/cppjieba/pull/65)
|
||||||
|
@ -2,22 +2,25 @@
|
|||||||
#define CPPJIEAB_JIEBA_H
|
#define CPPJIEAB_JIEBA_H
|
||||||
|
|
||||||
#include "QuerySegment.hpp"
|
#include "QuerySegment.hpp"
|
||||||
//#include "LevelSegment.hpp"
|
#include "KeywordExtractor.hpp"
|
||||||
|
|
||||||
namespace cppjieba {
|
namespace cppjieba {
|
||||||
|
|
||||||
class Jieba {
|
class Jieba {
|
||||||
public:
|
public:
|
||||||
Jieba(const string& dict_path, const string& model_path, const string& user_dict_path)
|
Jieba(const string& dict_path,
|
||||||
|
const string& model_path,
|
||||||
|
const string& user_dict_path,
|
||||||
|
const string& idfPath,
|
||||||
|
const string& stopWordPath)
|
||||||
: dict_trie_(dict_path, user_dict_path),
|
: dict_trie_(dict_path, user_dict_path),
|
||||||
model_(model_path),
|
model_(model_path),
|
||||||
mp_seg_(&dict_trie_),
|
mp_seg_(&dict_trie_),
|
||||||
hmm_seg_(&model_),
|
hmm_seg_(&model_),
|
||||||
mix_seg_(&dict_trie_, &model_),
|
mix_seg_(&dict_trie_, &model_),
|
||||||
full_seg_(&dict_trie_),
|
full_seg_(&dict_trie_),
|
||||||
query_seg_(&dict_trie_, &model_)
|
query_seg_(&dict_trie_, &model_),
|
||||||
//level_seg_(&dict_trie_),
|
extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
|
||||||
{
|
|
||||||
}
|
}
|
||||||
~Jieba() {
|
~Jieba() {
|
||||||
}
|
}
|
||||||
@ -84,7 +87,7 @@ class Jieba {
|
|||||||
const HMMModel* GetHMMModel() const {
|
const HMMModel* GetHMMModel() const {
|
||||||
return &model_;
|
return &model_;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DictTrie dict_trie_;
|
DictTrie dict_trie_;
|
||||||
HMMModel model_;
|
HMMModel model_;
|
||||||
@ -95,8 +98,9 @@ class Jieba {
|
|||||||
MixSegment mix_seg_;
|
MixSegment mix_seg_;
|
||||||
FullSegment full_seg_;
|
FullSegment full_seg_;
|
||||||
QuerySegment query_seg_;
|
QuerySegment query_seg_;
|
||||||
//LevelSegment level_seg_;
|
|
||||||
|
|
||||||
|
public:
|
||||||
|
KeywordExtractor extractor;
|
||||||
}; // class Jieba
|
}; // class Jieba
|
||||||
|
|
||||||
} // namespace cppjieba
|
} // namespace cppjieba
|
||||||
|
@ -3,10 +3,12 @@
|
|||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <set>
|
#include <set>
|
||||||
#include "Jieba.hpp"
|
#include "MixSegment.hpp"
|
||||||
|
|
||||||
namespace cppjieba {
|
namespace cppjieba {
|
||||||
|
|
||||||
using namespace limonp;
|
using namespace limonp;
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
/*utf8*/
|
/*utf8*/
|
||||||
class KeywordExtractor {
|
class KeywordExtractor {
|
||||||
@ -34,13 +36,6 @@ class KeywordExtractor {
|
|||||||
LoadIdfDict(idfPath);
|
LoadIdfDict(idfPath);
|
||||||
LoadStopWordDict(stopWordPath);
|
LoadStopWordDict(stopWordPath);
|
||||||
}
|
}
|
||||||
KeywordExtractor(const Jieba& jieba,
|
|
||||||
const string& idfPath,
|
|
||||||
const string& stopWordPath)
|
|
||||||
: segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
|
|
||||||
LoadIdfDict(idfPath);
|
|
||||||
LoadStopWordDict(stopWordPath);
|
|
||||||
}
|
|
||||||
~KeywordExtractor() {
|
~KeywordExtractor() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
#include "cppjieba/Jieba.hpp"
|
#include "cppjieba/Jieba.hpp"
|
||||||
#include "cppjieba/KeywordExtractor.hpp"
|
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
@ -12,7 +11,9 @@ const char* const STOP_WORD_PATH = "../dict/stop_words.utf8";
|
|||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
cppjieba::Jieba jieba(DICT_PATH,
|
cppjieba::Jieba jieba(DICT_PATH,
|
||||||
HMM_PATH,
|
HMM_PATH,
|
||||||
USER_DICT_PATH);
|
USER_DICT_PATH,
|
||||||
|
IDF_PATH,
|
||||||
|
STOP_WORD_PATH);
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
vector<cppjieba::Word> jiebawords;
|
vector<cppjieba::Word> jiebawords;
|
||||||
string s;
|
string s;
|
||||||
@ -69,13 +70,10 @@ int main(int argc, char** argv) {
|
|||||||
cout << s << endl;
|
cout << s << endl;
|
||||||
cout << tagres << endl;;
|
cout << tagres << endl;;
|
||||||
|
|
||||||
cppjieba::KeywordExtractor extractor(jieba,
|
|
||||||
IDF_PATH,
|
|
||||||
STOP_WORD_PATH);
|
|
||||||
cout << "[demo] Keyword Extraction" << endl;
|
cout << "[demo] Keyword Extraction" << endl;
|
||||||
const size_t topk = 5;
|
const size_t topk = 5;
|
||||||
vector<cppjieba::KeywordExtractor::Word> keywordres;
|
vector<cppjieba::KeywordExtractor::Word> keywordres;
|
||||||
extractor.Extract(s, keywordres, topk);
|
jieba.extractor.Extract(s, keywordres, topk);
|
||||||
cout << s << endl;
|
cout << s << endl;
|
||||||
cout << keywordres << endl;
|
cout << keywordres << endl;
|
||||||
return EXIT_SUCCESS;
|
return EXIT_SUCCESS;
|
||||||
|
@ -5,8 +5,10 @@ using namespace cppjieba;
|
|||||||
|
|
||||||
TEST(JiebaTest, Test1) {
|
TEST(JiebaTest, Test1) {
|
||||||
cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
|
cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
|
||||||
"../dict/hmm_model.utf8",
|
"../dict/hmm_model.utf8",
|
||||||
"../dict/user.dict.utf8");
|
"../dict/user.dict.utf8",
|
||||||
|
"../dict/idf.utf8",
|
||||||
|
"../dict/stop_words.utf8");
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
string result;
|
string result;
|
||||||
|
|
||||||
@ -40,8 +42,10 @@ TEST(JiebaTest, Test1) {
|
|||||||
}
|
}
|
||||||
TEST(JiebaTest, WordTest) {
|
TEST(JiebaTest, WordTest) {
|
||||||
cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
|
cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
|
||||||
"../dict/hmm_model.utf8",
|
"../dict/hmm_model.utf8",
|
||||||
"../dict/user.dict.utf8");
|
"../dict/user.dict.utf8",
|
||||||
|
"../dict/idf.utf8",
|
||||||
|
"../dict/stop_words.utf8");
|
||||||
vector<Word> words;
|
vector<Word> words;
|
||||||
string result;
|
string result;
|
||||||
|
|
||||||
@ -80,8 +84,10 @@ TEST(JiebaTest, WordTest) {
|
|||||||
|
|
||||||
TEST(JiebaTest, InsertUserWord) {
|
TEST(JiebaTest, InsertUserWord) {
|
||||||
cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
|
cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
|
||||||
"../dict/hmm_model.utf8",
|
"../dict/hmm_model.utf8",
|
||||||
"../dict/user.dict.utf8");
|
"../dict/user.dict.utf8",
|
||||||
|
"../dict/idf.utf8",
|
||||||
|
"../dict/stop_words.utf8");
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
string result;
|
string result;
|
||||||
|
|
||||||
@ -114,4 +120,14 @@ TEST(JiebaTest, InsertUserWord) {
|
|||||||
jieba.Cut("同一个世界,同一个梦想", words);
|
jieba.Cut("同一个世界,同一个梦想", words);
|
||||||
result = Join(words.begin(), words.end(), "/");
|
result = Join(words.begin(), words.end(), "/");
|
||||||
ASSERT_EQ(result, "同一个世界,同一个梦想");
|
ASSERT_EQ(result, "同一个世界,同一个梦想");
|
||||||
|
|
||||||
|
{
|
||||||
|
string s("一部iPhone6");
|
||||||
|
string res;
|
||||||
|
vector<KeywordExtractor::Word> wordweights;
|
||||||
|
size_t topN = 5;
|
||||||
|
jieba.extractor.Extract(s, wordweights, topN);
|
||||||
|
res << wordweights;
|
||||||
|
ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user