add Jieba.hpp

This commit is contained in:
yanyiwu 2015-09-13 00:28:40 +08:00
parent 63ca914176
commit 710ddacd38
3 changed files with 97 additions and 40 deletions

View File

@ -5,6 +5,7 @@
1. 使用工程上比较 tricky 的 Trie树优化办法。废弃了之前的 `Aho-Corasick-Automation` 实现,可读性更好,性能更高。 1. 使用工程上比较 tricky 的 Trie树优化办法。废弃了之前的 `Aho-Corasick-Automation` 实现,可读性更好,性能更高。
2. 新增层次分词器: LevelSegment 。 2. 新增层次分词器: LevelSegment 。
3. 增加MPSegment的细粒度分词功能。 3. 增加MPSegment的细粒度分词功能。
4. 增加 class Jieba ,提供可读性更好的接口。
## v3.1.0 ## v3.1.0

View File

@ -1,9 +1,8 @@
#ifndef CPPJIEBA_APPLICATION_H #ifndef CPPJIEBA_APPLICATION_H
#define CPPJIEBA_APPLICATION_H #define CPPJIEBA_APPLICATION_H
#include "QuerySegment.hpp" #include "Jieba.hpp"
#include "PosTagger.hpp" #include "PosTagger.hpp"
#include "LevelSegment.hpp"
#include "KeywordExtractor.hpp" #include "KeywordExtractor.hpp"
namespace CppJieba { namespace CppJieba {
@ -19,45 +18,38 @@ enum CutMethod {
class Application { class Application {
public: public:
Application(const string& dictPath, Application(const string& dict_path,
const string& modelPath, const string& model_path,
const string& userDictPath, const string& user_dict_path,
const string& idfPath, const string& idf_path,
const string& stopWordsPath) const string& stopWords_path)
: dictTrie_(dictPath, userDictPath), : jieba_(dict_path, model_path, user_dict_path),
model_(modelPath), tagger_(jieba_.GetDictTrie(), jieba_.GetHMMModel()),
mpSeg_(&dictTrie_), extractor_(jieba_.GetDictTrie(),
hmmSeg_(&model_), jieba_.GetHMMModel(),
mixSeg_(&dictTrie_, &model_), idf_path,
fullSeg_(&dictTrie_), stopWords_path) {
querySeg_(&dictTrie_, &model_),
levelSeg_(&dictTrie_),
tagger_(&dictTrie_, &model_),
extractor_(&dictTrie_,
&model_,
idfPath,
stopWordsPath) {
} }
void cut(const string& sentence, vector<string>& words, void cut(const string& sentence, vector<string>& words,
CutMethod method = METHOD_MIX) const { CutMethod method = METHOD_MIX) const {
switch(method) { switch(method) {
case METHOD_MP: case METHOD_MP:
mpSeg_.cut(sentence, words); jieba_.Cut(sentence, false, words);
break; break;
case METHOD_HMM: case METHOD_HMM:
hmmSeg_.cut(sentence, words); jieba_.CutHMM(sentence, words);
break; break;
case METHOD_MIX: case METHOD_MIX:
mixSeg_.cut(sentence, words); jieba_.Cut(sentence, true, words);
break; break;
case METHOD_FULL: case METHOD_FULL:
fullSeg_.cut(sentence, words); jieba_.CutAll(sentence, words);
break; break;
case METHOD_QUERY: case METHOD_QUERY:
querySeg_.cut(sentence, words); jieba_.CutForSearch(sentence, words);
break; break;
case METHOD_LEVEL: case METHOD_LEVEL:
levelSeg_.cut(sentence, words); jieba_.CutLevel(sentence, words);
break; break;
default: default:
LogError("argument method is illegal."); LogError("argument method is illegal.");
@ -65,14 +57,14 @@ class Application {
} }
void cut(const string& sentence, void cut(const string& sentence,
vector<pair<string, size_t> >& words) const { vector<pair<string, size_t> >& words) const {
levelSeg_.cut(sentence, words); jieba_.CutLevel(sentence, words);
} }
void cut(const string& sentence, void cut(const string& sentence,
vector<string>& words, size_t max_word_len) const { vector<string>& words, size_t max_word_len) const {
mpSeg_.cut(sentence, words, max_word_len); jieba_.CutSmall(sentence, words, max_word_len);
} }
bool insertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { bool insertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
return dictTrie_.insertUserWord(word, tag); return jieba_.InsertUserWord(word, tag);
} }
void tag(const string& str, vector<pair<string, string> >& res) const { void tag(const string& str, vector<pair<string, string> >& res) const {
tagger_.tag(str, res); tagger_.tag(str, res);
@ -85,17 +77,7 @@ class Application {
} }
private: private:
DictTrie dictTrie_; Jieba jieba_;
HMMModel model_;
// They share the same dict trie and model
MPSegment mpSeg_;
HMMSegment hmmSeg_;
MixSegment mixSeg_;
FullSegment fullSeg_;
QuerySegment querySeg_;
LevelSegment levelSeg_;
PosTagger tagger_; PosTagger tagger_;
KeywordExtractor extractor_; KeywordExtractor extractor_;
}; // class Application }; // class Application

74
src/Jieba.hpp Normal file
View File

@ -0,0 +1,74 @@
#define CPPJIEAB_JIEBA_H
#include "QuerySegment.hpp"
#include "PosTagger.hpp"
#include "LevelSegment.hpp"
namespace CppJieba {
// Facade that bundles one dictionary trie, one HMM model and every segmenter
// built on top of them behind a single set of Cut* entry points.
//
// NOTE(review): each segmenter is constructed from the address of dict_trie_
// and/or model_. If the segmenters retain those pointers, a copied Jieba would
// have segmenters that still refer to the source object's members -- confirm
// before copying instances.
class Jieba {
 public:
  // Loads the dictionary (dict_path plus user_dict_path) and the HMM model
  // (model_path), then wires every segmenter to those two shared objects.
  Jieba(const string& dict_path, const string& model_path, const string& user_dict_path)
    : dict_trie_(dict_path, user_dict_path),
      model_(model_path),
      mp_seg_(&dict_trie_),
      hmm_seg_(&model_),
      mix_seg_(&dict_trie_, &model_),
      full_seg_(&dict_trie_),
      query_seg_(&dict_trie_, &model_),
      level_seg_(&dict_trie_) {
  }

  ~Jieba() {
  }

  // Segments `sentence` into `words`. With hmm == false the MPSegment is
  // used; with hmm == true the MixSegment is used.
  void Cut(const string& sentence, bool hmm, vector<string>& words) const {
    if (!hmm) {
      mp_seg_.cut(sentence, words);
      return;
    }
    mix_seg_.cut(sentence, words);
  }

  // Segments `sentence` with the FullSegment.
  void CutAll(const string& sentence, vector<string>& words) const {
    full_seg_.cut(sentence, words);
  }

  // Segments `sentence` with the QuerySegment.
  void CutForSearch(const string& sentence, vector<string>& words) const {
    query_seg_.cut(sentence, words);
  }

  // Segments `sentence` with the HMMSegment only.
  void CutHMM(const string& sentence, vector<string>& words) const {
    hmm_seg_.cut(sentence, words);
  }

  // Segments `sentence` with the LevelSegment, producing plain words.
  void CutLevel(const string& sentence, vector<string>& words) const {
    level_seg_.cut(sentence, words);
  }

  // Segments `sentence` with the LevelSegment, producing (string, size_t)
  // pairs as filled in by LevelSegment::cut.
  void CutLevel(const string& sentence, vector<pair<string, size_t> >& words) const {
    level_seg_.cut(sentence, words);
  }

  // Segments `sentence` with the MPSegment, forwarding `max_word_len` to it.
  void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
    mp_seg_.cut(sentence, words, max_word_len);
  }

  // Forwards `word` and `tag` to DictTrie::insertUserWord on the shared
  // dictionary and returns its result.
  bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
    return dict_trie_.insertUserWord(word, tag);
  }

  // Returns the address of the dictionary trie owned by this object.
  const DictTrie* GetDictTrie() const {
    return &dict_trie_;
  }

  // Returns the address of the HMM model owned by this object.
  const HMMModel* GetHMMModel() const {
    return &model_;
  }

 private:
  // Declared first so they are constructed before the segmenters below,
  // which receive their addresses in the constructor.
  DictTrie dict_trie_;
  HMMModel model_;

  // All segmenters share the dictionary trie and model above.
  MPSegment mp_seg_;
  HMMSegment hmm_seg_;
  MixSegment mix_seg_;
  FullSegment full_seg_;
  QuerySegment query_seg_;
  LevelSegment level_seg_;
}; // class Jieba
} // namespace