mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add Jieba.hpp
This commit is contained in:
parent
63ca914176
commit
710ddacd38
@ -5,6 +5,7 @@
|
|||||||
1. 使用工程上比较 tricky 的 Trie树优化办法。废弃了之前的 `Aho-Corasick-Automation` 实现,可读性更好,性能更高。
|
1. 使用工程上比较 tricky 的 Trie树优化办法。废弃了之前的 `Aho-Corasick-Automation` 实现,可读性更好,性能更高。
|
||||||
2. 新增层次分词器: LevelSegment 。
|
2. 新增层次分词器: LevelSegment 。
|
||||||
3. 增加MPSegment的细粒度分词功能。
|
3. 增加MPSegment的细粒度分词功能。
|
||||||
|
4. 增加 class Jieba ,提供可读性更好的接口。
|
||||||
|
|
||||||
## v3.1.0
|
## v3.1.0
|
||||||
|
|
||||||
|
@ -1,9 +1,8 @@
|
|||||||
#ifndef CPPJIEBA_APPLICATION_H
|
#ifndef CPPJIEBA_APPLICATION_H
|
||||||
#define CPPJIEBA_APPLICATION_H
|
#define CPPJIEBA_APPLICATION_H
|
||||||
|
|
||||||
#include "QuerySegment.hpp"
|
#include "Jieba.hpp"
|
||||||
#include "PosTagger.hpp"
|
#include "PosTagger.hpp"
|
||||||
#include "LevelSegment.hpp"
|
|
||||||
#include "KeywordExtractor.hpp"
|
#include "KeywordExtractor.hpp"
|
||||||
|
|
||||||
namespace CppJieba {
|
namespace CppJieba {
|
||||||
@ -19,45 +18,38 @@ enum CutMethod {
|
|||||||
|
|
||||||
class Application {
|
class Application {
|
||||||
public:
|
public:
|
||||||
Application(const string& dictPath,
|
Application(const string& dict_path,
|
||||||
const string& modelPath,
|
const string& model_path,
|
||||||
const string& userDictPath,
|
const string& user_dict_path,
|
||||||
const string& idfPath,
|
const string& idf_path,
|
||||||
const string& stopWordsPath)
|
const string& stopWords_path)
|
||||||
: dictTrie_(dictPath, userDictPath),
|
: jieba_(dict_path, model_path, user_dict_path),
|
||||||
model_(modelPath),
|
tagger_(jieba_.GetDictTrie(), jieba_.GetHMMModel()),
|
||||||
mpSeg_(&dictTrie_),
|
extractor_(jieba_.GetDictTrie(),
|
||||||
hmmSeg_(&model_),
|
jieba_.GetHMMModel(),
|
||||||
mixSeg_(&dictTrie_, &model_),
|
idf_path,
|
||||||
fullSeg_(&dictTrie_),
|
stopWords_path) {
|
||||||
querySeg_(&dictTrie_, &model_),
|
|
||||||
levelSeg_(&dictTrie_),
|
|
||||||
tagger_(&dictTrie_, &model_),
|
|
||||||
extractor_(&dictTrie_,
|
|
||||||
&model_,
|
|
||||||
idfPath,
|
|
||||||
stopWordsPath) {
|
|
||||||
}
|
}
|
||||||
void cut(const string& sentence, vector<string>& words,
|
void cut(const string& sentence, vector<string>& words,
|
||||||
CutMethod method = METHOD_MIX) const {
|
CutMethod method = METHOD_MIX) const {
|
||||||
switch(method) {
|
switch(method) {
|
||||||
case METHOD_MP:
|
case METHOD_MP:
|
||||||
mpSeg_.cut(sentence, words);
|
jieba_.Cut(sentence, false, words);
|
||||||
break;
|
break;
|
||||||
case METHOD_HMM:
|
case METHOD_HMM:
|
||||||
hmmSeg_.cut(sentence, words);
|
jieba_.CutHMM(sentence, words);
|
||||||
break;
|
break;
|
||||||
case METHOD_MIX:
|
case METHOD_MIX:
|
||||||
mixSeg_.cut(sentence, words);
|
jieba_.Cut(sentence, true, words);
|
||||||
break;
|
break;
|
||||||
case METHOD_FULL:
|
case METHOD_FULL:
|
||||||
fullSeg_.cut(sentence, words);
|
jieba_.CutAll(sentence, words);
|
||||||
break;
|
break;
|
||||||
case METHOD_QUERY:
|
case METHOD_QUERY:
|
||||||
querySeg_.cut(sentence, words);
|
jieba_.CutForSearch(sentence, words);
|
||||||
break;
|
break;
|
||||||
case METHOD_LEVEL:
|
case METHOD_LEVEL:
|
||||||
levelSeg_.cut(sentence, words);
|
jieba_.CutLevel(sentence, words);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
LogError("argument method is illegal.");
|
LogError("argument method is illegal.");
|
||||||
@ -65,14 +57,14 @@ class Application {
|
|||||||
}
|
}
|
||||||
void cut(const string& sentence,
|
void cut(const string& sentence,
|
||||||
vector<pair<string, size_t> >& words) const {
|
vector<pair<string, size_t> >& words) const {
|
||||||
levelSeg_.cut(sentence, words);
|
jieba_.CutLevel(sentence, words);
|
||||||
}
|
}
|
||||||
void cut(const string& sentence,
|
void cut(const string& sentence,
|
||||||
vector<string>& words, size_t max_word_len) const {
|
vector<string>& words, size_t max_word_len) const {
|
||||||
mpSeg_.cut(sentence, words, max_word_len);
|
jieba_.CutSmall(sentence, words, max_word_len);
|
||||||
}
|
}
|
||||||
bool insertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
bool insertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
||||||
return dictTrie_.insertUserWord(word, tag);
|
return jieba_.InsertUserWord(word, tag);
|
||||||
}
|
}
|
||||||
void tag(const string& str, vector<pair<string, string> >& res) const {
|
void tag(const string& str, vector<pair<string, string> >& res) const {
|
||||||
tagger_.tag(str, res);
|
tagger_.tag(str, res);
|
||||||
@ -85,17 +77,7 @@ class Application {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DictTrie dictTrie_;
|
Jieba jieba_;
|
||||||
HMMModel model_;
|
|
||||||
|
|
||||||
// They share the same dict trie and model
|
|
||||||
MPSegment mpSeg_;
|
|
||||||
HMMSegment hmmSeg_;
|
|
||||||
MixSegment mixSeg_;
|
|
||||||
FullSegment fullSeg_;
|
|
||||||
QuerySegment querySeg_;
|
|
||||||
LevelSegment levelSeg_;
|
|
||||||
|
|
||||||
PosTagger tagger_;
|
PosTagger tagger_;
|
||||||
KeywordExtractor extractor_;
|
KeywordExtractor extractor_;
|
||||||
}; // class Application
|
}; // class Application
|
||||||
|
74
src/Jieba.hpp
Normal file
74
src/Jieba.hpp
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
#define CPPJIEAB_JIEBA_H
|
||||||
|
|
||||||
|
#include "QuerySegment.hpp"
|
||||||
|
#include "PosTagger.hpp"
|
||||||
|
#include "LevelSegment.hpp"
|
||||||
|
|
||||||
|
namespace CppJieba {
|
||||||
|
|
||||||
|
class Jieba {
|
||||||
|
public:
|
||||||
|
Jieba(const string& dict_path, const string& model_path, const string& user_dict_path)
|
||||||
|
: dict_trie_(dict_path, user_dict_path),
|
||||||
|
model_(model_path),
|
||||||
|
mp_seg_(&dict_trie_),
|
||||||
|
hmm_seg_(&model_),
|
||||||
|
mix_seg_(&dict_trie_, &model_),
|
||||||
|
full_seg_(&dict_trie_),
|
||||||
|
query_seg_(&dict_trie_, &model_),
|
||||||
|
level_seg_(&dict_trie_) {
|
||||||
|
}
|
||||||
|
~Jieba() {
|
||||||
|
}
|
||||||
|
|
||||||
|
void Cut(const string& sentence, bool hmm, vector<string>& words) const {
|
||||||
|
if (hmm) {
|
||||||
|
mix_seg_.cut(sentence, words);
|
||||||
|
} else {
|
||||||
|
mp_seg_.cut(sentence, words);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void CutAll(const string& sentence, vector<string>& words) const {
|
||||||
|
full_seg_.cut(sentence, words);
|
||||||
|
}
|
||||||
|
void CutForSearch(const string& sentence, vector<string>& words) const {
|
||||||
|
query_seg_.cut(sentence, words);
|
||||||
|
}
|
||||||
|
void CutHMM(const string& sentence, vector<string>& words) const {
|
||||||
|
hmm_seg_.cut(sentence, words);
|
||||||
|
}
|
||||||
|
void CutLevel(const string& sentence, vector<string>& words) const {
|
||||||
|
level_seg_.cut(sentence, words);
|
||||||
|
}
|
||||||
|
void CutLevel(const string& sentence, vector<pair<string, size_t> >& words) const {
|
||||||
|
level_seg_.cut(sentence, words);
|
||||||
|
}
|
||||||
|
void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
|
||||||
|
mp_seg_.cut(sentence, words, max_word_len);
|
||||||
|
}
|
||||||
|
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
||||||
|
return dict_trie_.insertUserWord(word, tag);
|
||||||
|
}
|
||||||
|
|
||||||
|
const DictTrie* GetDictTrie() const {
|
||||||
|
return &dict_trie_;
|
||||||
|
}
|
||||||
|
const HMMModel* GetHMMModel() const {
|
||||||
|
return &model_;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
DictTrie dict_trie_;
|
||||||
|
HMMModel model_;
|
||||||
|
|
||||||
|
// They share the same dict trie and model
|
||||||
|
MPSegment mp_seg_;
|
||||||
|
HMMSegment hmm_seg_;
|
||||||
|
MixSegment mix_seg_;
|
||||||
|
FullSegment full_seg_;
|
||||||
|
QuerySegment query_seg_;
|
||||||
|
LevelSegment level_seg_;
|
||||||
|
|
||||||
|
}; // class
|
||||||
|
|
||||||
|
} // namespace CppJieba
|
Loading…
x
Reference in New Issue
Block a user