add tag capability for each segment

This commit is contained in:
Jaimin Pan 2016-06-27 18:10:42 +08:00
parent ec848581b2
commit ce8cafe54a
8 changed files with 73 additions and 31 deletions

View File

@ -2,7 +2,6 @@
#define CPPJIEAB_JIEBA_H
#include "QuerySegment.hpp"
#include "PosTagger.hpp"
//#include "LevelSegment.hpp"
namespace cppjieba {
@ -16,9 +15,9 @@ class Jieba {
hmm_seg_(&model_),
mix_seg_(&dict_trie_, &model_),
full_seg_(&dict_trie_),
query_seg_(&dict_trie_, &model_),
query_seg_(&dict_trie_, &model_)
//level_seg_(&dict_trie_),
pos_tagger_(&dict_trie_, &model_) {
{
}
~Jieba() {
}
@ -61,7 +60,7 @@ class Jieba {
}
void Tag(const string& sentence, vector<pair<string, string> >& words) const {
pos_tagger_.Tag(sentence, words);
mix_seg_.Tag(sentence, words);
}
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
return dict_trie_.InsertUserWord(word, tag);
@ -95,8 +94,6 @@ class Jieba {
QuerySegment query_seg_;
//LevelSegment level_seg_;
PosTagger pos_tagger_;
}; // class Jieba
} // namespace cppjieba

View File

@ -6,11 +6,12 @@
#include <cassert>
#include "limonp/Logging.hpp"
#include "DictTrie.hpp"
#include "SegmentBase.hpp"
#include "SegmentTagged.hpp"
#include "PosTagger.hpp"
namespace cppjieba {
class MPSegment: public SegmentBase {
class MPSegment: public SegmentTagged {
public:
MPSegment(const string& dictPath, const string& userDictPath = "")
: dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
@ -25,9 +26,13 @@ class MPSegment: public SegmentBase {
}
}
void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, MAX_WORD_LENGTH);
}
void Cut(const string& sentence,
vector<string>& words,
size_t max_word_len = MAX_WORD_LENGTH) const {
size_t max_word_len) const {
vector<Word> tmp;
Cut(sentence, tmp, max_word_len);
GetStringsFromWords(tmp, words);
@ -64,6 +69,10 @@ class MPSegment: public SegmentBase {
return dictTrie_;
}
bool Tag(const string& src, vector<pair<string, string> >& res) const {
return tagger_.Tag(src, res, *this);
}
// Forwards the query for `value` to the underlying dictionary trie and
// returns its answer unchanged.
bool IsUserDictSingleChineseWord(const Rune& value) const {
  const bool is_single_word = dictTrie_->IsUserDictSingleChineseWord(value);
  return is_single_word;
}
@ -119,6 +128,8 @@ class MPSegment: public SegmentBase {
const DictTrie* dictTrie_;
bool isNeedDestroy_;
PosTagger tagger_;
}; // class MPSegment
} // namespace cppjieba

View File

@ -5,9 +5,10 @@
#include "MPSegment.hpp"
#include "HMMSegment.hpp"
#include "limonp/StringUtil.hpp"
#include "PosTagger.hpp"
namespace cppjieba {
class MixSegment: public SegmentBase {
class MixSegment: public SegmentTagged {
public:
MixSegment(const string& mpSegDict, const string& hmmSegDict,
const string& userDict = "")
@ -20,7 +21,10 @@ class MixSegment: public SegmentBase {
~MixSegment() {
}
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, true);
}
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
vector<Word> tmp;
Cut(sentence, tmp, hmm);
GetStringsFromWords(tmp, words);
@ -84,9 +88,15 @@ class MixSegment: public SegmentBase {
// Returns the dictionary trie reported by the internal MPSegment (mpSeg_).
const DictTrie* GetDictTrie() const {
  const DictTrie* trie = mpSeg_.GetDictTrie();
  return trie;
}
bool Tag(const string& src, vector<pair<string, string> >& res) const {
return tagger_.Tag(src, res, *this);
}
private:
MPSegment mpSeg_;
HMMSegment hmmSeg_;
PosTagger tagger_;
}; // class MixSegment

View File

@ -1,8 +1,8 @@
#ifndef CPPJIEBA_POS_TAGGING_H
#define CPPJIEBA_POS_TAGGING_H
#include "MixSegment.hpp"
#include "limonp/StringUtil.hpp"
#include "SegmentTagged.hpp"
#include "DictTrie.hpp"
namespace cppjieba {
@ -14,24 +14,18 @@ static const char* const POS_X = "x";
class PosTagger {
public:
PosTagger(const string& dictPath,
const string& hmmFilePath,
const string& userDictPath = "")
: segment_(dictPath, hmmFilePath, userDictPath) {
}
PosTagger(const DictTrie* dictTrie, const HMMModel* model)
: segment_(dictTrie, model) {
PosTagger() {
}
~PosTagger() {
}
bool Tag(const string& src, vector<pair<string, string> >& res) const {
bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
vector<string> CutRes;
segment_.Cut(src, CutRes);
segment.Cut(src, CutRes);
const DictUnit *tmp = NULL;
RuneStrArray runes;
const DictTrie * dict = segment_.GetDictTrie();
const DictTrie * dict = segment.GetDictTrie();
assert(dict != NULL);
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
if (!DecodeRunesInString(*itr, runes)) {
@ -71,7 +65,6 @@ class PosTagger {
return POS_ENG;
}
MixSegment segment_;
}; // class PosTagger
} // namespace cppjieba

View File

@ -24,7 +24,11 @@ class QuerySegment: public SegmentBase {
}
~QuerySegment() {
}
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, true);
}
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
vector<Word> tmp;
Cut(sentence, tmp, hmm);
GetStringsFromWords(tmp, words);

View File

@ -17,9 +17,11 @@ class SegmentBase {
SegmentBase() {
XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
}
~SegmentBase() {
virtual ~SegmentBase() {
}
virtual void Cut(const string& sentence, vector<string>& words) const = 0;
bool ResetSeparators(const string& s) {
symbols_.clear();
RuneStrArray runes;

View File

@ -0,0 +1,25 @@
#ifndef CPPJIEBA_SEGMENTTAGGED_H
#define CPPJIEBA_SEGMENTTAGGED_H
#include "SegmentBase.hpp"
namespace cppjieba {
// Forward declaration only; the full definition is not needed here because
// the interface below handles DictTrie purely through a pointer.
// NOTE(review): declared with the `class` key to match the definition in
// DictTrie.hpp (expected to be `class DictTrie`) -- confirm. A mismatched
// class-key triggers MSVC C4099 / clang -Wmismatched-tags.
class DictTrie;

// Abstract interface for segmenters that can also produce part-of-speech
// tags. It extends SegmentBase with two pure-virtual operations that a
// concrete segmenter must implement.
class SegmentTagged : public SegmentBase {
 public:
  SegmentTagged() {
  }
  virtual ~SegmentTagged() {
  }

  // Tags `src`, writing (word, tag) pairs into `res`; the meaning of the
  // bool result is defined by the implementing class.
  virtual bool Tag(const string& src, vector<pair<string, string> >& res) const = 0;

  // Returns the dictionary trie backing this segmenter.
  virtual const DictTrie* GetDictTrie() const = 0;
}; // class SegmentTagged
} // cppjieba
#endif

View File

@ -1,4 +1,4 @@
#include "cppjieba/PosTagger.hpp"
#include "cppjieba/MixSegment.hpp"
#include "gtest/gtest.h"
using namespace cppjieba;
@ -13,7 +13,7 @@ static const char * const ANS_TEST3 = "[iPhone6:eng, 手机:n, 的:uj, 最大:a,
//static const char * const ANS_TEST3 = "";
TEST(PosTaggerTest, Test) {
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
{
vector<pair<string, string> > res;
tagger.Tag(QUERY_TEST1, res);
@ -23,7 +23,7 @@ TEST(PosTaggerTest, Test) {
}
}
TEST(PosTagger, TestUserDict) {
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
{
vector<pair<string, string> > res;
tagger.Tag(QUERY_TEST2, res);