mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add tag capability for each segment
This commit is contained in:
parent
ec848581b2
commit
ce8cafe54a
@ -2,7 +2,6 @@
|
||||
#define CPPJIEAB_JIEBA_H
|
||||
|
||||
#include "QuerySegment.hpp"
|
||||
#include "PosTagger.hpp"
|
||||
//#include "LevelSegment.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
@ -16,9 +15,9 @@ class Jieba {
|
||||
hmm_seg_(&model_),
|
||||
mix_seg_(&dict_trie_, &model_),
|
||||
full_seg_(&dict_trie_),
|
||||
query_seg_(&dict_trie_, &model_),
|
||||
query_seg_(&dict_trie_, &model_)
|
||||
//level_seg_(&dict_trie_),
|
||||
pos_tagger_(&dict_trie_, &model_) {
|
||||
{
|
||||
}
|
||||
~Jieba() {
|
||||
}
|
||||
@ -61,7 +60,7 @@ class Jieba {
|
||||
}
|
||||
|
||||
void Tag(const string& sentence, vector<pair<string, string> >& words) const {
|
||||
pos_tagger_.Tag(sentence, words);
|
||||
mix_seg_.Tag(sentence, words);
|
||||
}
|
||||
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
||||
return dict_trie_.InsertUserWord(word, tag);
|
||||
@ -94,9 +93,7 @@ class Jieba {
|
||||
FullSegment full_seg_;
|
||||
QuerySegment query_seg_;
|
||||
//LevelSegment level_seg_;
|
||||
|
||||
PosTagger pos_tagger_;
|
||||
|
||||
|
||||
}; // class Jieba
|
||||
|
||||
} // namespace cppjieba
|
||||
|
@ -6,11 +6,12 @@
|
||||
#include <cassert>
|
||||
#include "limonp/Logging.hpp"
|
||||
#include "DictTrie.hpp"
|
||||
#include "SegmentBase.hpp"
|
||||
#include "SegmentTagged.hpp"
|
||||
#include "PosTagger.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
class MPSegment: public SegmentBase {
|
||||
class MPSegment: public SegmentTagged {
|
||||
public:
|
||||
MPSegment(const string& dictPath, const string& userDictPath = "")
|
||||
: dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
|
||||
@ -25,9 +26,13 @@ class MPSegment: public SegmentBase {
|
||||
}
|
||||
}
|
||||
|
||||
void Cut(const string& sentence,
|
||||
vector<string>& words,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
void Cut(const string& sentence, vector<string>& words) const {
|
||||
Cut(sentence, words, MAX_WORD_LENGTH);
|
||||
}
|
||||
|
||||
void Cut(const string& sentence,
|
||||
vector<string>& words,
|
||||
size_t max_word_len) const {
|
||||
vector<Word> tmp;
|
||||
Cut(sentence, tmp, max_word_len);
|
||||
GetStringsFromWords(tmp, words);
|
||||
@ -64,6 +69,10 @@ class MPSegment: public SegmentBase {
|
||||
return dictTrie_;
|
||||
}
|
||||
|
||||
bool Tag(const string& src, vector<pair<string, string> >& res) const {
|
||||
return tagger_.Tag(src, res, *this);
|
||||
}
|
||||
|
||||
bool IsUserDictSingleChineseWord(const Rune& value) const {
|
||||
return dictTrie_->IsUserDictSingleChineseWord(value);
|
||||
}
|
||||
@ -119,6 +128,8 @@ class MPSegment: public SegmentBase {
|
||||
|
||||
const DictTrie* dictTrie_;
|
||||
bool isNeedDestroy_;
|
||||
PosTagger tagger_;
|
||||
|
||||
}; // class MPSegment
|
||||
|
||||
} // namespace cppjieba
|
||||
|
@ -5,9 +5,10 @@
|
||||
#include "MPSegment.hpp"
|
||||
#include "HMMSegment.hpp"
|
||||
#include "limonp/StringUtil.hpp"
|
||||
#include "PosTagger.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
class MixSegment: public SegmentBase {
|
||||
class MixSegment: public SegmentTagged {
|
||||
public:
|
||||
MixSegment(const string& mpSegDict, const string& hmmSegDict,
|
||||
const string& userDict = "")
|
||||
@ -20,7 +21,10 @@ class MixSegment: public SegmentBase {
|
||||
~MixSegment() {
|
||||
}
|
||||
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
void Cut(const string& sentence, vector<string>& words) const {
|
||||
Cut(sentence, words, true);
|
||||
}
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
|
||||
vector<Word> tmp;
|
||||
Cut(sentence, tmp, hmm);
|
||||
GetStringsFromWords(tmp, words);
|
||||
@ -84,9 +88,15 @@ class MixSegment: public SegmentBase {
|
||||
const DictTrie* GetDictTrie() const {
|
||||
return mpSeg_.GetDictTrie();
|
||||
}
|
||||
|
||||
bool Tag(const string& src, vector<pair<string, string> >& res) const {
|
||||
return tagger_.Tag(src, res, *this);
|
||||
}
|
||||
|
||||
private:
|
||||
MPSegment mpSeg_;
|
||||
HMMSegment hmmSeg_;
|
||||
PosTagger tagger_;
|
||||
|
||||
}; // class MixSegment
|
||||
|
||||
|
@ -1,8 +1,8 @@
|
||||
#ifndef CPPJIEBA_POS_TAGGING_H
|
||||
#define CPPJIEBA_POS_TAGGING_H
|
||||
|
||||
#include "MixSegment.hpp"
|
||||
#include "limonp/StringUtil.hpp"
|
||||
#include "SegmentTagged.hpp"
|
||||
#include "DictTrie.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
@ -14,24 +14,18 @@ static const char* const POS_X = "x";
|
||||
|
||||
class PosTagger {
|
||||
public:
|
||||
PosTagger(const string& dictPath,
|
||||
const string& hmmFilePath,
|
||||
const string& userDictPath = "")
|
||||
: segment_(dictPath, hmmFilePath, userDictPath) {
|
||||
}
|
||||
PosTagger(const DictTrie* dictTrie, const HMMModel* model)
|
||||
: segment_(dictTrie, model) {
|
||||
PosTagger() {
|
||||
}
|
||||
~PosTagger() {
|
||||
}
|
||||
|
||||
bool Tag(const string& src, vector<pair<string, string> >& res) const {
|
||||
bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
|
||||
vector<string> CutRes;
|
||||
segment_.Cut(src, CutRes);
|
||||
segment.Cut(src, CutRes);
|
||||
|
||||
const DictUnit *tmp = NULL;
|
||||
RuneStrArray runes;
|
||||
const DictTrie * dict = segment_.GetDictTrie();
|
||||
const DictTrie * dict = segment.GetDictTrie();
|
||||
assert(dict != NULL);
|
||||
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
|
||||
if (!DecodeRunesInString(*itr, runes)) {
|
||||
@ -71,7 +65,6 @@ class PosTagger {
|
||||
return POS_ENG;
|
||||
}
|
||||
|
||||
MixSegment segment_;
|
||||
}; // class PosTagger
|
||||
|
||||
} // namespace cppjieba
|
||||
|
@ -24,7 +24,11 @@ class QuerySegment: public SegmentBase {
|
||||
}
|
||||
~QuerySegment() {
|
||||
}
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
|
||||
void Cut(const string& sentence, vector<string>& words) const {
|
||||
Cut(sentence, words, true);
|
||||
}
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
|
||||
vector<Word> tmp;
|
||||
Cut(sentence, tmp, hmm);
|
||||
GetStringsFromWords(tmp, words);
|
||||
|
@ -17,9 +17,11 @@ class SegmentBase {
|
||||
SegmentBase() {
|
||||
XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
|
||||
}
|
||||
~SegmentBase() {
|
||||
virtual ~SegmentBase() {
|
||||
}
|
||||
|
||||
virtual void Cut(const string& sentence, vector<string>& words) const = 0;
|
||||
|
||||
bool ResetSeparators(const string& s) {
|
||||
symbols_.clear();
|
||||
RuneStrArray runes;
|
||||
|
25
include/cppjieba/SegmentTagged.hpp
Normal file
25
include/cppjieba/SegmentTagged.hpp
Normal file
@ -0,0 +1,25 @@
|
||||
#ifndef CPPJIEBA_SEGMENTTAGGED_H
|
||||
#define CPPJIEBA_SEGMENTTAGGED_H
|
||||
|
||||
#include "SegmentBase.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
struct DictTrie;
|
||||
|
||||
// Abstract base for segmenters that can also produce tagged output.
// Extends SegmentBase with two pure-virtual members, Tag() and GetDictTrie().
// In this change MPSegment and MixSegment derive from it, and
// PosTagger::Tag() accepts a `const SegmentTagged&` to cut and look up words.
class SegmentTagged : public SegmentBase{
|
||||
public:
|
||||
// Nothing to initialise here; the class holds no data members.
SegmentTagged() {
|
||||
}
|
||||
// Virtual so that a derived segmenter is destroyed correctly when deleted
// through a SegmentTagged pointer.
virtual ~SegmentTagged() {
|
||||
}
|
||||
|
||||
// Tags the text in `src` and writes the result into `res` as pairs of
// strings (presumably (word, tag) -- confirm against PosTagger::Tag).
// The implementations visible in this change (MPSegment, MixSegment) forward
// to PosTagger::Tag(src, res, *this).
// NOTE(review): the meaning of the bool return is not shown here; verify.
virtual bool Tag(const string& src, vector<pair<string, string> >& res) const = 0;
|
||||
|
||||
// Returns the DictTrie associated with this segmenter. PosTagger::Tag()
// calls this and asserts that the result is not NULL.
// NOTE(review): DictTrie is referred to with the `struct` key here; if it is
// defined with `class` elsewhere, some compilers warn about the mismatched
// tag -- confirm against DictTrie.hpp.
virtual const struct DictTrie* GetDictTrie() const = 0;
|
||||
|
||||
}; // class SegmentTagged
|
||||
|
||||
} // namespace cppjieba
|
||||
|
||||
#endif
|
@ -1,4 +1,4 @@
|
||||
#include "cppjieba/PosTagger.hpp"
|
||||
#include "cppjieba/MixSegment.hpp"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace cppjieba;
|
||||
@ -13,7 +13,7 @@ static const char * const ANS_TEST3 = "[iPhone6:eng, 手机:n, 的:uj, 最大:a,
|
||||
//static const char * const ANS_TEST3 = "";
|
||||
|
||||
TEST(PosTaggerTest, Test) {
|
||||
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
|
||||
MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
|
||||
{
|
||||
vector<pair<string, string> > res;
|
||||
tagger.Tag(QUERY_TEST1, res);
|
||||
@ -23,7 +23,7 @@ TEST(PosTaggerTest, Test) {
|
||||
}
|
||||
}
|
||||
TEST(PosTagger, TestUserDict) {
|
||||
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
|
||||
MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
|
||||
{
|
||||
vector<pair<string, string> > res;
|
||||
tagger.Tag(QUERY_TEST2, res);
|
||||
|
Loading…
x
Reference in New Issue
Block a user