Merge pull request #71 from jaiminpan/master

Add tag capability for each segment
This commit is contained in:
Yanyi Wu 2016-07-03 20:10:49 +08:00 committed by GitHub
commit 667acdeb7b
8 changed files with 73 additions and 31 deletions

View File

@ -2,7 +2,6 @@
#define CPPJIEAB_JIEBA_H #define CPPJIEAB_JIEBA_H
#include "QuerySegment.hpp" #include "QuerySegment.hpp"
#include "PosTagger.hpp"
//#include "LevelSegment.hpp" //#include "LevelSegment.hpp"
namespace cppjieba { namespace cppjieba {
@ -16,9 +15,9 @@ class Jieba {
hmm_seg_(&model_), hmm_seg_(&model_),
mix_seg_(&dict_trie_, &model_), mix_seg_(&dict_trie_, &model_),
full_seg_(&dict_trie_), full_seg_(&dict_trie_),
query_seg_(&dict_trie_, &model_), query_seg_(&dict_trie_, &model_)
//level_seg_(&dict_trie_), //level_seg_(&dict_trie_),
pos_tagger_(&dict_trie_, &model_) { {
} }
~Jieba() { ~Jieba() {
} }
@ -61,7 +60,7 @@ class Jieba {
} }
void Tag(const string& sentence, vector<pair<string, string> >& words) const { void Tag(const string& sentence, vector<pair<string, string> >& words) const {
pos_tagger_.Tag(sentence, words); mix_seg_.Tag(sentence, words);
} }
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
return dict_trie_.InsertUserWord(word, tag); return dict_trie_.InsertUserWord(word, tag);
@ -95,8 +94,6 @@ class Jieba {
QuerySegment query_seg_; QuerySegment query_seg_;
//LevelSegment level_seg_; //LevelSegment level_seg_;
PosTagger pos_tagger_;
}; // class Jieba }; // class Jieba
} // namespace cppjieba } // namespace cppjieba

View File

@ -6,11 +6,12 @@
#include <cassert> #include <cassert>
#include "limonp/Logging.hpp" #include "limonp/Logging.hpp"
#include "DictTrie.hpp" #include "DictTrie.hpp"
#include "SegmentBase.hpp" #include "SegmentTagged.hpp"
#include "PosTagger.hpp"
namespace cppjieba { namespace cppjieba {
class MPSegment: public SegmentBase { class MPSegment: public SegmentTagged {
public: public:
MPSegment(const string& dictPath, const string& userDictPath = "") MPSegment(const string& dictPath, const string& userDictPath = "")
: dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) { : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
@ -25,9 +26,13 @@ class MPSegment: public SegmentBase {
} }
} }
void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, MAX_WORD_LENGTH);
}
void Cut(const string& sentence, void Cut(const string& sentence,
vector<string>& words, vector<string>& words,
size_t max_word_len = MAX_WORD_LENGTH) const { size_t max_word_len) const {
vector<Word> tmp; vector<Word> tmp;
Cut(sentence, tmp, max_word_len); Cut(sentence, tmp, max_word_len);
GetStringsFromWords(tmp, words); GetStringsFromWords(tmp, words);
@ -64,6 +69,10 @@ class MPSegment: public SegmentBase {
return dictTrie_; return dictTrie_;
} }
bool Tag(const string& src, vector<pair<string, string> >& res) const {
return tagger_.Tag(src, res, *this);
}
bool IsUserDictSingleChineseWord(const Rune& value) const { bool IsUserDictSingleChineseWord(const Rune& value) const {
return dictTrie_->IsUserDictSingleChineseWord(value); return dictTrie_->IsUserDictSingleChineseWord(value);
} }
@ -119,6 +128,8 @@ class MPSegment: public SegmentBase {
const DictTrie* dictTrie_; const DictTrie* dictTrie_;
bool isNeedDestroy_; bool isNeedDestroy_;
PosTagger tagger_;
}; // class MPSegment }; // class MPSegment
} // namespace cppjieba } // namespace cppjieba

View File

@ -5,9 +5,10 @@
#include "MPSegment.hpp" #include "MPSegment.hpp"
#include "HMMSegment.hpp" #include "HMMSegment.hpp"
#include "limonp/StringUtil.hpp" #include "limonp/StringUtil.hpp"
#include "PosTagger.hpp"
namespace cppjieba { namespace cppjieba {
class MixSegment: public SegmentBase { class MixSegment: public SegmentTagged {
public: public:
MixSegment(const string& mpSegDict, const string& hmmSegDict, MixSegment(const string& mpSegDict, const string& hmmSegDict,
const string& userDict = "") const string& userDict = "")
@ -20,7 +21,10 @@ class MixSegment: public SegmentBase {
~MixSegment() { ~MixSegment() {
} }
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const { void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, true);
}
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
vector<Word> tmp; vector<Word> tmp;
Cut(sentence, tmp, hmm); Cut(sentence, tmp, hmm);
GetStringsFromWords(tmp, words); GetStringsFromWords(tmp, words);
@ -84,9 +88,15 @@ class MixSegment: public SegmentBase {
const DictTrie* GetDictTrie() const { const DictTrie* GetDictTrie() const {
return mpSeg_.GetDictTrie(); return mpSeg_.GetDictTrie();
} }
bool Tag(const string& src, vector<pair<string, string> >& res) const {
return tagger_.Tag(src, res, *this);
}
private: private:
MPSegment mpSeg_; MPSegment mpSeg_;
HMMSegment hmmSeg_; HMMSegment hmmSeg_;
PosTagger tagger_;
}; // class MixSegment }; // class MixSegment

View File

@ -1,8 +1,8 @@
#ifndef CPPJIEBA_POS_TAGGING_H #ifndef CPPJIEBA_POS_TAGGING_H
#define CPPJIEBA_POS_TAGGING_H #define CPPJIEBA_POS_TAGGING_H
#include "MixSegment.hpp"
#include "limonp/StringUtil.hpp" #include "limonp/StringUtil.hpp"
#include "SegmentTagged.hpp"
#include "DictTrie.hpp" #include "DictTrie.hpp"
namespace cppjieba { namespace cppjieba {
@ -14,24 +14,18 @@ static const char* const POS_X = "x";
class PosTagger { class PosTagger {
public: public:
PosTagger(const string& dictPath, PosTagger() {
const string& hmmFilePath,
const string& userDictPath = "")
: segment_(dictPath, hmmFilePath, userDictPath) {
}
PosTagger(const DictTrie* dictTrie, const HMMModel* model)
: segment_(dictTrie, model) {
} }
~PosTagger() { ~PosTagger() {
} }
bool Tag(const string& src, vector<pair<string, string> >& res) const { bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
vector<string> CutRes; vector<string> CutRes;
segment_.Cut(src, CutRes); segment.Cut(src, CutRes);
const DictUnit *tmp = NULL; const DictUnit *tmp = NULL;
RuneStrArray runes; RuneStrArray runes;
const DictTrie * dict = segment_.GetDictTrie(); const DictTrie * dict = segment.GetDictTrie();
assert(dict != NULL); assert(dict != NULL);
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) { for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
if (!DecodeRunesInString(*itr, runes)) { if (!DecodeRunesInString(*itr, runes)) {
@ -71,7 +65,6 @@ class PosTagger {
return POS_ENG; return POS_ENG;
} }
MixSegment segment_;
}; // class PosTagger }; // class PosTagger
} // namespace cppjieba } // namespace cppjieba

View File

@ -24,7 +24,11 @@ class QuerySegment: public SegmentBase {
} }
~QuerySegment() { ~QuerySegment() {
} }
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, true);
}
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
vector<Word> tmp; vector<Word> tmp;
Cut(sentence, tmp, hmm); Cut(sentence, tmp, hmm);
GetStringsFromWords(tmp, words); GetStringsFromWords(tmp, words);

View File

@ -17,9 +17,11 @@ class SegmentBase {
SegmentBase() { SegmentBase() {
XCHECK(ResetSeparators(SPECIAL_SEPARATORS)); XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
} }
~SegmentBase() { virtual ~SegmentBase() {
} }
virtual void Cut(const string& sentence, vector<string>& words) const = 0;
bool ResetSeparators(const string& s) { bool ResetSeparators(const string& s) {
symbols_.clear(); symbols_.clear();
RuneStrArray runes; RuneStrArray runes;

View File

@ -0,0 +1,25 @@
#ifndef CPPJIEBA_SEGMENTTAGGED_H
#define CPPJIEBA_SEGMENTTAGGED_H
#include "SegmentBase.hpp"
namespace cppjieba {
struct DictTrie;
// Abstract interface for segmenters that can also tag their output.
//
// It extends SegmentBase with two pure-virtual operations, so a concrete
// segmenter deriving from it must provide Tag() and GetDictTrie() in addition
// to whatever SegmentBase itself requires.
class SegmentTagged : public SegmentBase {
 public:
  SegmentTagged() {}

  // Virtual so that derived segmenters are destroyed correctly through a
  // SegmentTagged pointer or reference.
  virtual ~SegmentTagged() {}

  // Processes `src` and writes pairs of strings into `res`.
  // NOTE(review): presumably each pair is (word, tag) and the bool return
  // signals success -- confirm against the concrete implementations.
  virtual bool Tag(const string& src,
                   vector<pair<string, string> >& res) const = 0;

  // Returns the dictionary trie used by this segmenter.
  // NOTE(review): ownership and nullability are not specified here -- confirm
  // with implementers before storing or dereferencing the pointer.
  virtual const struct DictTrie* GetDictTrie() const = 0;
};  // class SegmentTagged
} // cppjieba
#endif

View File

@ -1,4 +1,4 @@
#include "cppjieba/PosTagger.hpp" #include "cppjieba/MixSegment.hpp"
#include "gtest/gtest.h" #include "gtest/gtest.h"
using namespace cppjieba; using namespace cppjieba;
@ -13,7 +13,7 @@ static const char * const ANS_TEST3 = "[iPhone6:eng, 手机:n, 的:uj, 最大:a,
//static const char * const ANS_TEST3 = ""; //static const char * const ANS_TEST3 = "";
TEST(PosTaggerTest, Test) { TEST(PosTaggerTest, Test) {
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8"); MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
{ {
vector<pair<string, string> > res; vector<pair<string, string> > res;
tagger.Tag(QUERY_TEST1, res); tagger.Tag(QUERY_TEST1, res);
@ -23,7 +23,7 @@ TEST(PosTaggerTest, Test) {
} }
} }
TEST(PosTagger, TestUserDict) { TEST(PosTagger, TestUserDict) {
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8"); MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
{ {
vector<pair<string, string> > res; vector<pair<string, string> > res;
tagger.Tag(QUERY_TEST2, res); tagger.Tag(QUERY_TEST2, res);