diff --git a/include/cppjieba/Jieba.hpp b/include/cppjieba/Jieba.hpp index 6cd1d95..76f12b6 100644 --- a/include/cppjieba/Jieba.hpp +++ b/include/cppjieba/Jieba.hpp @@ -2,7 +2,6 @@ #define CPPJIEAB_JIEBA_H #include "QuerySegment.hpp" -#include "PosTagger.hpp" //#include "LevelSegment.hpp" namespace cppjieba { @@ -16,9 +15,9 @@ class Jieba { hmm_seg_(&model_), mix_seg_(&dict_trie_, &model_), full_seg_(&dict_trie_), - query_seg_(&dict_trie_, &model_), + query_seg_(&dict_trie_, &model_) //level_seg_(&dict_trie_), - pos_tagger_(&dict_trie_, &model_) { + { } ~Jieba() { } @@ -61,7 +60,7 @@ class Jieba { } void Tag(const string& sentence, vector >& words) const { - pos_tagger_.Tag(sentence, words); + mix_seg_.Tag(sentence, words); } bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { return dict_trie_.InsertUserWord(word, tag); @@ -94,9 +93,7 @@ class Jieba { FullSegment full_seg_; QuerySegment query_seg_; //LevelSegment level_seg_; - - PosTagger pos_tagger_; - + }; // class Jieba } // namespace cppjieba diff --git a/include/cppjieba/MPSegment.hpp b/include/cppjieba/MPSegment.hpp index 07e1223..cfd07e3 100644 --- a/include/cppjieba/MPSegment.hpp +++ b/include/cppjieba/MPSegment.hpp @@ -6,11 +6,12 @@ #include #include "limonp/Logging.hpp" #include "DictTrie.hpp" -#include "SegmentBase.hpp" +#include "SegmentTagged.hpp" +#include "PosTagger.hpp" namespace cppjieba { -class MPSegment: public SegmentBase { +class MPSegment: public SegmentTagged { public: MPSegment(const string& dictPath, const string& userDictPath = "") : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) { @@ -25,9 +26,13 @@ class MPSegment: public SegmentBase { } } - void Cut(const string& sentence, - vector& words, - size_t max_word_len = MAX_WORD_LENGTH) const { + void Cut(const string& sentence, vector& words) const { + Cut(sentence, words, MAX_WORD_LENGTH); + } + + void Cut(const string& sentence, + vector& words, + size_t max_word_len) const { vector tmp; Cut(sentence, tmp, max_word_len); GetStringsFromWords(tmp, words); @@ -64,6 +69,10 @@ class MPSegment: public SegmentBase { return dictTrie_; } + bool Tag(const string& src, vector >& res) const { + return tagger_.Tag(src, res, *this); + } + bool IsUserDictSingleChineseWord(const Rune& value) const { return dictTrie_->IsUserDictSingleChineseWord(value); } @@ -119,6 +128,8 @@ class MPSegment: public SegmentBase { const DictTrie* dictTrie_; bool isNeedDestroy_; + PosTagger tagger_; + }; // class MPSegment } // namespace cppjieba diff --git a/include/cppjieba/MixSegment.hpp b/include/cppjieba/MixSegment.hpp index ced8849..3e18b73 100644 --- a/include/cppjieba/MixSegment.hpp +++ b/include/cppjieba/MixSegment.hpp @@ -5,9 +5,10 @@ #include "MPSegment.hpp" #include "HMMSegment.hpp" #include "limonp/StringUtil.hpp" +#include "PosTagger.hpp" namespace cppjieba { -class MixSegment: public SegmentBase { +class MixSegment: public SegmentTagged { public: MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") @@ -20,7 +21,10 @@ class MixSegment: public SegmentBase { ~MixSegment() { } - void Cut(const string& sentence, vector& words, bool hmm = true) const { + void Cut(const string& sentence, vector& words) const { + Cut(sentence, words, true); + } + void Cut(const string& sentence, vector& words, bool hmm) const { vector tmp; Cut(sentence, tmp, hmm); GetStringsFromWords(tmp, words); @@ -84,9 +88,15 @@ class MixSegment: public SegmentBase { const DictTrie* GetDictTrie() const { return mpSeg_.GetDictTrie(); } + + bool Tag(const string& src, vector >& res) const { + return tagger_.Tag(src, res, *this); + } + private: MPSegment mpSeg_; HMMSegment hmmSeg_; + PosTagger tagger_; }; // class MixSegment diff --git a/include/cppjieba/PosTagger.hpp b/include/cppjieba/PosTagger.hpp index 863c07b..7113297 100644 --- a/include/cppjieba/PosTagger.hpp +++ b/include/cppjieba/PosTagger.hpp @@ -1,8 +1,8 @@ #ifndef CPPJIEBA_POS_TAGGING_H #define CPPJIEBA_POS_TAGGING_H -#include "MixSegment.hpp" #include "limonp/StringUtil.hpp" +#include "SegmentTagged.hpp" #include "DictTrie.hpp" namespace cppjieba { @@ -14,24 +14,18 @@ static const char* const POS_X = "x"; class PosTagger { public: - PosTagger(const string& dictPath, - const string& hmmFilePath, - const string& userDictPath = "") - : segment_(dictPath, hmmFilePath, userDictPath) { - } - PosTagger(const DictTrie* dictTrie, const HMMModel* model) - : segment_(dictTrie, model) { + PosTagger() { } ~PosTagger() { } - bool Tag(const string& src, vector >& res) const { + bool Tag(const string& src, vector >& res, const SegmentTagged& segment) const { vector CutRes; - segment_.Cut(src, CutRes); + segment.Cut(src, CutRes); const DictUnit *tmp = NULL; RuneStrArray runes; - const DictTrie * dict = segment_.GetDictTrie(); + const DictTrie * dict = segment.GetDictTrie(); assert(dict != NULL); for (vector::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) { if (!DecodeRunesInString(*itr, runes)) { @@ -71,7 +65,6 @@ class PosTagger { return POS_ENG; } - MixSegment segment_; }; // class PosTagger } // namespace cppjieba diff --git a/include/cppjieba/QuerySegment.hpp b/include/cppjieba/QuerySegment.hpp index 40f8b6d..7153c7a 100644 --- a/include/cppjieba/QuerySegment.hpp +++ b/include/cppjieba/QuerySegment.hpp @@ -24,7 +24,11 @@ class QuerySegment: public SegmentBase { } ~QuerySegment() { } - void Cut(const string& sentence, vector& words, bool hmm = true) const { + + void Cut(const string& sentence, vector& words) const { + Cut(sentence, words, true); + } + void Cut(const string& sentence, vector& words, bool hmm) const { vector tmp; Cut(sentence, tmp, hmm); GetStringsFromWords(tmp, words); diff --git a/include/cppjieba/SegmentBase.hpp b/include/cppjieba/SegmentBase.hpp index 3f81404..82c9ccd 100644 --- a/include/cppjieba/SegmentBase.hpp +++ b/include/cppjieba/SegmentBase.hpp @@ -17,9 +17,11 @@ class SegmentBase { SegmentBase() { XCHECK(ResetSeparators(SPECIAL_SEPARATORS)); } - ~SegmentBase() { + virtual ~SegmentBase() { } + virtual void Cut(const string& sentence, vector& words) const = 0; + bool ResetSeparators(const string& s) { symbols_.clear(); RuneStrArray runes; diff --git a/include/cppjieba/SegmentTagged.hpp b/include/cppjieba/SegmentTagged.hpp new file mode 100644 index 0000000..685d174 --- /dev/null +++ b/include/cppjieba/SegmentTagged.hpp @@ -0,0 +1,25 @@ +#ifndef CPPJIEBA_SEGMENTTAGGED_H +#define CPPJIEBA_SEGMENTTAGGED_H + +#include "SegmentBase.hpp" + +namespace cppjieba { + +struct DictTrie; + +class SegmentTagged : public SegmentBase{ + public: + SegmentTagged() { + } + virtual ~SegmentTagged() { + } + + virtual bool Tag(const string& src, vector >& res) const = 0; + + virtual const struct DictTrie* GetDictTrie() const = 0; + +}; // class SegmentTagged + +} // cppjieba + +#endif diff --git a/test/unittest/pos_tagger_test.cpp b/test/unittest/pos_tagger_test.cpp index ec1ccc8..745c1dd 100644 --- a/test/unittest/pos_tagger_test.cpp +++ b/test/unittest/pos_tagger_test.cpp @@ -1,4 +1,4 @@ -#include "cppjieba/PosTagger.hpp" +#include "cppjieba/MixSegment.hpp" #include "gtest/gtest.h" using namespace cppjieba; @@ -13,7 +13,7 @@ static const char * const ANS_TEST3 = "[iPhone6:eng, 手机:n, 的:uj, 最大:a, //static const char * const ANS_TEST3 = ""; TEST(PosTaggerTest, Test) { - PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8"); + MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8"); { vector > res; tagger.Tag(QUERY_TEST1, res); @@ -23,7 +23,7 @@ TEST(PosTaggerTest, Test) { } } TEST(PosTagger, TestUserDict) { - PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8"); + MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8"); { vector > res; tagger.Tag(QUERY_TEST2, res);