mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
create keyword_extract in Jieba
This commit is contained in:
parent
4a755dff6a
commit
74c70c70cd
@ -1,5 +1,9 @@
|
||||
# CppJieba ChangeLog
|
||||
|
||||
## next version
|
||||
|
||||
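+ Notice(**api changed**) : the Jieba class constructor now takes 5 arguments instead of 3 (adding the IDF dictionary path and the stop-word dictionary path), and Jieba now owns a KeywordExtractor, exposed as the public member `extractor`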
|
||||
|
||||
## v4.8.1
|
||||
|
||||
+ add TextRankExtractor by [@questionfish] in [pull request 65](https://github.com/yanyiwu/cppjieba/pull/65)
|
||||
|
@ -2,22 +2,25 @@
|
||||
#define CPPJIEAB_JIEBA_H
|
||||
|
||||
#include "QuerySegment.hpp"
|
||||
//#include "LevelSegment.hpp"
|
||||
#include "KeywordExtractor.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
class Jieba {
|
||||
public:
|
||||
Jieba(const string& dict_path, const string& model_path, const string& user_dict_path)
|
||||
Jieba(const string& dict_path,
|
||||
const string& model_path,
|
||||
const string& user_dict_path,
|
||||
const string& idfPath,
|
||||
const string& stopWordPath)
|
||||
: dict_trie_(dict_path, user_dict_path),
|
||||
model_(model_path),
|
||||
mp_seg_(&dict_trie_),
|
||||
hmm_seg_(&model_),
|
||||
mix_seg_(&dict_trie_, &model_),
|
||||
full_seg_(&dict_trie_),
|
||||
query_seg_(&dict_trie_, &model_)
|
||||
//level_seg_(&dict_trie_),
|
||||
{
|
||||
query_seg_(&dict_trie_, &model_),
|
||||
extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
|
||||
}
|
||||
~Jieba() {
|
||||
}
|
||||
@ -95,8 +98,9 @@ class Jieba {
|
||||
MixSegment mix_seg_;
|
||||
FullSegment full_seg_;
|
||||
QuerySegment query_seg_;
|
||||
//LevelSegment level_seg_;
|
||||
|
||||
public:
|
||||
KeywordExtractor extractor;
|
||||
}; // class Jieba
|
||||
|
||||
} // namespace cppjieba
|
||||
|
@ -3,10 +3,12 @@
|
||||
|
||||
#include <cmath>
|
||||
#include <set>
|
||||
#include "Jieba.hpp"
|
||||
#include "MixSegment.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
using namespace limonp;
|
||||
using namespace std;
|
||||
|
||||
/*utf8*/
|
||||
class KeywordExtractor {
|
||||
@ -34,13 +36,6 @@ class KeywordExtractor {
|
||||
LoadIdfDict(idfPath);
|
||||
LoadStopWordDict(stopWordPath);
|
||||
}
|
||||
KeywordExtractor(const Jieba& jieba,
|
||||
const string& idfPath,
|
||||
const string& stopWordPath)
|
||||
: segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
|
||||
LoadIdfDict(idfPath);
|
||||
LoadStopWordDict(stopWordPath);
|
||||
}
|
||||
~KeywordExtractor() {
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
#include "cppjieba/Jieba.hpp"
|
||||
#include "cppjieba/KeywordExtractor.hpp"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -12,7 +11,9 @@ const char* const STOP_WORD_PATH = "../dict/stop_words.utf8";
|
||||
int main(int argc, char** argv) {
|
||||
cppjieba::Jieba jieba(DICT_PATH,
|
||||
HMM_PATH,
|
||||
USER_DICT_PATH);
|
||||
USER_DICT_PATH,
|
||||
IDF_PATH,
|
||||
STOP_WORD_PATH);
|
||||
vector<string> words;
|
||||
vector<cppjieba::Word> jiebawords;
|
||||
string s;
|
||||
@ -69,13 +70,10 @@ int main(int argc, char** argv) {
|
||||
cout << s << endl;
|
||||
cout << tagres << endl;;
|
||||
|
||||
cppjieba::KeywordExtractor extractor(jieba,
|
||||
IDF_PATH,
|
||||
STOP_WORD_PATH);
|
||||
cout << "[demo] Keyword Extraction" << endl;
|
||||
const size_t topk = 5;
|
||||
vector<cppjieba::KeywordExtractor::Word> keywordres;
|
||||
extractor.Extract(s, keywordres, topk);
|
||||
jieba.extractor.Extract(s, keywordres, topk);
|
||||
cout << s << endl;
|
||||
cout << keywordres << endl;
|
||||
return EXIT_SUCCESS;
|
||||
|
@ -6,7 +6,9 @@ using namespace cppjieba;
|
||||
TEST(JiebaTest, Test1) {
|
||||
cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
|
||||
"../dict/hmm_model.utf8",
|
||||
"../dict/user.dict.utf8");
|
||||
"../dict/user.dict.utf8",
|
||||
"../dict/idf.utf8",
|
||||
"../dict/stop_words.utf8");
|
||||
vector<string> words;
|
||||
string result;
|
||||
|
||||
@ -41,7 +43,9 @@ TEST(JiebaTest, Test1) {
|
||||
TEST(JiebaTest, WordTest) {
|
||||
cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
|
||||
"../dict/hmm_model.utf8",
|
||||
"../dict/user.dict.utf8");
|
||||
"../dict/user.dict.utf8",
|
||||
"../dict/idf.utf8",
|
||||
"../dict/stop_words.utf8");
|
||||
vector<Word> words;
|
||||
string result;
|
||||
|
||||
@ -81,7 +85,9 @@ TEST(JiebaTest, WordTest) {
|
||||
TEST(JiebaTest, InsertUserWord) {
|
||||
cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
|
||||
"../dict/hmm_model.utf8",
|
||||
"../dict/user.dict.utf8");
|
||||
"../dict/user.dict.utf8",
|
||||
"../dict/idf.utf8",
|
||||
"../dict/stop_words.utf8");
|
||||
vector<string> words;
|
||||
string result;
|
||||
|
||||
@ -114,4 +120,14 @@ TEST(JiebaTest, InsertUserWord) {
|
||||
jieba.Cut("同一个世界,同一个梦想", words);
|
||||
result = Join(words.begin(), words.end(), "/");
|
||||
ASSERT_EQ(result, "同一个世界,同一个梦想");
|
||||
|
||||
{
|
||||
string s("一部iPhone6");
|
||||
string res;
|
||||
vector<KeywordExtractor::Word> wordweights;
|
||||
size_t topN = 5;
|
||||
jieba.extractor.Extract(s, wordweights, topN);
|
||||
res << wordweights;
|
||||
ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]");
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user