diff --git a/ChangeLog.md b/ChangeLog.md
index 1ef5413..ad73af5 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -5,6 +5,7 @@
 1. 使用工程上比较 tricky 的 Trie树优化办法。废弃了之前的 `Aho-Corasick-Automation` 实现,可读性更好,性能更高。
 2. 新增层次分词器: LevelSegment 。
 3. 增加MPSegment的细粒度分词功能。
+4. 增加 class Jieba ,提供可读性更好的接口。
 
 ## v3.1.0
 
diff --git a/src/Application.hpp b/src/Application.hpp
index e48e931..0247d9c 100644
--- a/src/Application.hpp
+++ b/src/Application.hpp
@@ -1,9 +1,8 @@
 #ifndef CPPJIEBA_APPLICATION_H
 #define CPPJIEBA_APPLICATION_H
 
-#include "QuerySegment.hpp"
+#include "Jieba.hpp"
 #include "PosTagger.hpp"
-#include "LevelSegment.hpp"
 #include "KeywordExtractor.hpp"
 
 namespace CppJieba {
@@ -19,45 +18,38 @@ enum CutMethod {
 
 class Application {
  public:
-  Application(const string& dictPath,
-              const string& modelPath,
-              const string& userDictPath,
-              const string& idfPath,
-              const string& stopWordsPath)
-    : dictTrie_(dictPath, userDictPath),
-      model_(modelPath),
-      mpSeg_(&dictTrie_),
-      hmmSeg_(&model_),
-      mixSeg_(&dictTrie_, &model_),
-      fullSeg_(&dictTrie_),
-      querySeg_(&dictTrie_, &model_),
-      levelSeg_(&dictTrie_),
-      tagger_(&dictTrie_, &model_),
-      extractor_(&dictTrie_,
-                 &model_,
-                 idfPath,
-                 stopWordsPath) {
+  Application(const string& dict_path,
+              const string& model_path,
+              const string& user_dict_path,
+              const string& idf_path,
+              const string& stopWords_path)
+    : jieba_(dict_path, model_path, user_dict_path),
+      tagger_(jieba_.GetDictTrie(), jieba_.GetHMMModel()),
+      extractor_(jieba_.GetDictTrie(),
+                 jieba_.GetHMMModel(),
+                 idf_path,
+                 stopWords_path) {
   }
   void cut(const string& sentence, vector<string>& words,
         CutMethod method = METHOD_MIX) const {
     switch(method) {
       case METHOD_MP:
-        mpSeg_.cut(sentence, words);
+        jieba_.Cut(sentence, false, words);
         break;
       case METHOD_HMM:
-        hmmSeg_.cut(sentence, words);
+        jieba_.CutHMM(sentence, words);
         break;
       case METHOD_MIX:
-        mixSeg_.cut(sentence, words);
+        jieba_.Cut(sentence, true, words);
         break;
       case METHOD_FULL:
-        fullSeg_.cut(sentence, words);
+        jieba_.CutAll(sentence, words);
         break;
       case METHOD_QUERY:
-        querySeg_.cut(sentence, words);
+        jieba_.CutForSearch(sentence, words);
         break;
       case METHOD_LEVEL:
-        levelSeg_.cut(sentence, words);
+        jieba_.CutLevel(sentence, words);
         break;
       default:
         LogError("argument method is illegal.");
@@ -65,14 +57,14 @@ class Application {
   }
   void cut(const string& sentence,
         vector<pair<string, size_t> >& words) const {
-    levelSeg_.cut(sentence, words);
+    jieba_.CutLevel(sentence, words);
   }
   void cut(const string& sentence,
         vector<string>& words, size_t max_word_len) const {
-    mpSeg_.cut(sentence, words, max_word_len);
+    jieba_.CutSmall(sentence, words, max_word_len);
   }
   bool insertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
-    return dictTrie_.insertUserWord(word, tag);
+    return jieba_.InsertUserWord(word, tag);
   }
   void tag(const string& str, vector<pair<string, string> >& res) const {
     tagger_.tag(str, res);
@@ -85,17 +77,9 @@ class Application {
   }
 
  private:
-  DictTrie dictTrie_;
-  HMMModel model_;
-
-  // They share the same dict trie and model
-  MPSegment mpSeg_;
-  HMMSegment hmmSeg_;
-  MixSegment mixSeg_;
-  FullSegment fullSeg_;
-  QuerySegment querySeg_;
-  LevelSegment levelSeg_;
-
+  // jieba_ is declared first: tagger_ and extractor_ are constructed from
+  // pointers it hands out, and members initialize in declaration order.
+  Jieba jieba_;
   PosTagger tagger_;
   KeywordExtractor extractor_;
 }; // class Application
diff --git a/src/Jieba.hpp b/src/Jieba.hpp
new file mode 100644
index 0000000..e1e8f7e
--- /dev/null
+++ b/src/Jieba.hpp
@@ -0,0 +1,93 @@
+#ifndef CPPJIEBA_JIEBA_H
+#define CPPJIEBA_JIEBA_H
+
+#include "QuerySegment.hpp"
+#include "PosTagger.hpp"
+#include "LevelSegment.hpp"
+
+namespace CppJieba {
+
+// Owns one DictTrie and one HMMModel and exposes the segmenters built on
+// them through a single object.
+class Jieba {
+ public:
+  // Builds the DictTrie from dict_path/user_dict_path and the HMMModel from
+  // model_path, then hands pointers to them to each segmenter.
+  Jieba(const string& dict_path, const string& model_path, const string& user_dict_path)
+    : dict_trie_(dict_path, user_dict_path),
+      model_(model_path),
+      mp_seg_(&dict_trie_),
+      hmm_seg_(&model_),
+      mix_seg_(&dict_trie_, &model_),
+      full_seg_(&dict_trie_),
+      query_seg_(&dict_trie_, &model_),
+      level_seg_(&dict_trie_) {
+  }
+  ~Jieba() {
+  }
+
+  // Cuts with MixSegment when hmm is true, otherwise with MPSegment.
+  void Cut(const string& sentence, bool hmm, vector<string>& words) const {
+    if (hmm) {
+      mix_seg_.cut(sentence, words);
+    } else {
+      mp_seg_.cut(sentence, words);
+    }
+  }
+  // Cuts with FullSegment.
+  void CutAll(const string& sentence, vector<string>& words) const {
+    full_seg_.cut(sentence, words);
+  }
+  // Cuts with QuerySegment.
+  void CutForSearch(const string& sentence, vector<string>& words) const {
+    query_seg_.cut(sentence, words);
+  }
+  // Cuts with HMMSegment.
+  void CutHMM(const string& sentence, vector<string>& words) const {
+    hmm_seg_.cut(sentence, words);
+  }
+  // Cuts with LevelSegment.
+  void CutLevel(const string& sentence, vector<string>& words) const {
+    level_seg_.cut(sentence, words);
+  }
+  // Cuts with LevelSegment; each result pairs a word with a size_t.
+  void CutLevel(const string& sentence, vector<pair<string, size_t> >& words) const {
+    level_seg_.cut(sentence, words);
+  }
+  // Cuts with MPSegment, forwarding max_word_len to it.
+  void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
+    mp_seg_.cut(sentence, words, max_word_len);
+  }
+  // Forwards to DictTrie::insertUserWord on the owned trie.
+  bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
+    return dict_trie_.insertUserWord(word, tag);
+  }
+
+  // Non-owning pointers to members of this object; they are valid only
+  // while this Jieba instance is alive.
+  const DictTrie* GetDictTrie() const {
+    return &dict_trie_;
+  }
+  const HMMModel* GetHMMModel() const {
+    return &model_;
+  }
+
+ private:
+  // Declared first: the segmenters below are constructed from pointers to
+  // these two members.
+  DictTrie dict_trie_;
+  HMMModel model_;
+
+  // They share the same dict trie and model
+  MPSegment mp_seg_;
+  HMMSegment hmm_seg_;
+  MixSegment mix_seg_;
+  FullSegment full_seg_;
+  QuerySegment query_seg_;
+  LevelSegment level_seg_;
+
+}; // class
+
+} // namespace
+
+#endif // CPPJIEBA_JIEBA_H