add Jieba.hpp

This commit is contained in:
yanyiwu 2015-09-13 00:28:40 +08:00
parent 63ca914176
commit 710ddacd38
3 changed files with 97 additions and 40 deletions

View File

@ -5,6 +5,7 @@
1. 使用工程上比较 tricky 的 Trie树优化办法。废弃了之前的 `Aho-Corasick-Automation` 实现,可读性更好,性能更高。
2. 新增层次分词器: LevelSegment 。
3. 增加MPSegment的细粒度分词功能。
4. 增加 class Jieba ,提供可读性更好的接口。
## v3.1.0

View File

@ -1,9 +1,8 @@
#ifndef CPPJIEBA_APPLICATION_H
#define CPPJIEBA_APPLICATION_H
#include "QuerySegment.hpp"
#include "Jieba.hpp"
#include "PosTagger.hpp"
#include "LevelSegment.hpp"
#include "KeywordExtractor.hpp"
namespace CppJieba {
@ -19,45 +18,38 @@ enum CutMethod {
class Application {
public:
Application(const string& dictPath,
const string& modelPath,
const string& userDictPath,
const string& idfPath,
const string& stopWordsPath)
: dictTrie_(dictPath, userDictPath),
model_(modelPath),
mpSeg_(&dictTrie_),
hmmSeg_(&model_),
mixSeg_(&dictTrie_, &model_),
fullSeg_(&dictTrie_),
querySeg_(&dictTrie_, &model_),
levelSeg_(&dictTrie_),
tagger_(&dictTrie_, &model_),
extractor_(&dictTrie_,
&model_,
idfPath,
stopWordsPath) {
Application(const string& dict_path,
const string& model_path,
const string& user_dict_path,
const string& idf_path,
const string& stopWords_path)
: jieba_(dict_path, model_path, user_dict_path),
tagger_(jieba_.GetDictTrie(), jieba_.GetHMMModel()),
extractor_(jieba_.GetDictTrie(),
jieba_.GetHMMModel(),
idf_path,
stopWords_path) {
}
void cut(const string& sentence, vector<string>& words,
CutMethod method = METHOD_MIX) const {
switch(method) {
case METHOD_MP:
mpSeg_.cut(sentence, words);
jieba_.Cut(sentence, false, words);
break;
case METHOD_HMM:
hmmSeg_.cut(sentence, words);
jieba_.CutHMM(sentence, words);
break;
case METHOD_MIX:
mixSeg_.cut(sentence, words);
jieba_.Cut(sentence, true, words);
break;
case METHOD_FULL:
fullSeg_.cut(sentence, words);
jieba_.CutAll(sentence, words);
break;
case METHOD_QUERY:
querySeg_.cut(sentence, words);
jieba_.CutForSearch(sentence, words);
break;
case METHOD_LEVEL:
levelSeg_.cut(sentence, words);
jieba_.CutLevel(sentence, words);
break;
default:
LogError("argument method is illegal.");
@ -65,14 +57,14 @@ class Application {
}
void cut(const string& sentence,
vector<pair<string, size_t> >& words) const {
levelSeg_.cut(sentence, words);
jieba_.CutLevel(sentence, words);
}
void cut(const string& sentence,
vector<string>& words, size_t max_word_len) const {
mpSeg_.cut(sentence, words, max_word_len);
jieba_.CutSmall(sentence, words, max_word_len);
}
bool insertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
return dictTrie_.insertUserWord(word, tag);
return jieba_.InsertUserWord(word, tag);
}
void tag(const string& str, vector<pair<string, string> >& res) const {
tagger_.tag(str, res);
@ -85,17 +77,7 @@ class Application {
}
private:
DictTrie dictTrie_;
HMMModel model_;
// They share the same dict trie and model
MPSegment mpSeg_;
HMMSegment hmmSeg_;
MixSegment mixSeg_;
FullSegment fullSeg_;
QuerySegment querySeg_;
LevelSegment levelSeg_;
Jieba jieba_;
PosTagger tagger_;
KeywordExtractor extractor_;
}; // class Application

74
src/Jieba.hpp Normal file
View File

@ -0,0 +1,74 @@
#define CPPJIEAB_JIEBA_H
#include "QuerySegment.hpp"
#include "PosTagger.hpp"
#include "LevelSegment.hpp"
namespace CppJieba {
class Jieba {
public:
Jieba(const string& dict_path, const string& model_path, const string& user_dict_path)
: dict_trie_(dict_path, user_dict_path),
model_(model_path),
mp_seg_(&dict_trie_),
hmm_seg_(&model_),
mix_seg_(&dict_trie_, &model_),
full_seg_(&dict_trie_),
query_seg_(&dict_trie_, &model_),
level_seg_(&dict_trie_) {
}
~Jieba() {
}
void Cut(const string& sentence, bool hmm, vector<string>& words) const {
if (hmm) {
mix_seg_.cut(sentence, words);
} else {
mp_seg_.cut(sentence, words);
}
}
void CutAll(const string& sentence, vector<string>& words) const {
full_seg_.cut(sentence, words);
}
void CutForSearch(const string& sentence, vector<string>& words) const {
query_seg_.cut(sentence, words);
}
void CutHMM(const string& sentence, vector<string>& words) const {
hmm_seg_.cut(sentence, words);
}
void CutLevel(const string& sentence, vector<string>& words) const {
level_seg_.cut(sentence, words);
}
void CutLevel(const string& sentence, vector<pair<string, size_t> >& words) const {
level_seg_.cut(sentence, words);
}
void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
mp_seg_.cut(sentence, words, max_word_len);
}
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
return dict_trie_.insertUserWord(word, tag);
}
const DictTrie* GetDictTrie() const {
return &dict_trie_;
}
const HMMModel* GetHMMModel() const {
return &model_;
}
private:
DictTrie dict_trie_;
HMMModel model_;
// They share the same dict trie and model
MPSegment mp_seg_;
HMMSegment hmm_seg_;
MixSegment mix_seg_;
FullSegment full_seg_;
QuerySegment query_seg_;
LevelSegment level_seg_;
}; // class
} // namespace