mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Merge pull request #71 from jaiminpan/master
add tag capbility for each segments
This commit is contained in:
commit
667acdeb7b
@ -2,7 +2,6 @@
|
|||||||
#define CPPJIEAB_JIEBA_H
|
#define CPPJIEAB_JIEBA_H
|
||||||
|
|
||||||
#include "QuerySegment.hpp"
|
#include "QuerySegment.hpp"
|
||||||
#include "PosTagger.hpp"
|
|
||||||
//#include "LevelSegment.hpp"
|
//#include "LevelSegment.hpp"
|
||||||
|
|
||||||
namespace cppjieba {
|
namespace cppjieba {
|
||||||
@ -16,9 +15,9 @@ class Jieba {
|
|||||||
hmm_seg_(&model_),
|
hmm_seg_(&model_),
|
||||||
mix_seg_(&dict_trie_, &model_),
|
mix_seg_(&dict_trie_, &model_),
|
||||||
full_seg_(&dict_trie_),
|
full_seg_(&dict_trie_),
|
||||||
query_seg_(&dict_trie_, &model_),
|
query_seg_(&dict_trie_, &model_)
|
||||||
//level_seg_(&dict_trie_),
|
//level_seg_(&dict_trie_),
|
||||||
pos_tagger_(&dict_trie_, &model_) {
|
{
|
||||||
}
|
}
|
||||||
~Jieba() {
|
~Jieba() {
|
||||||
}
|
}
|
||||||
@ -61,7 +60,7 @@ class Jieba {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void Tag(const string& sentence, vector<pair<string, string> >& words) const {
|
void Tag(const string& sentence, vector<pair<string, string> >& words) const {
|
||||||
pos_tagger_.Tag(sentence, words);
|
mix_seg_.Tag(sentence, words);
|
||||||
}
|
}
|
||||||
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
||||||
return dict_trie_.InsertUserWord(word, tag);
|
return dict_trie_.InsertUserWord(word, tag);
|
||||||
@ -95,8 +94,6 @@ class Jieba {
|
|||||||
QuerySegment query_seg_;
|
QuerySegment query_seg_;
|
||||||
//LevelSegment level_seg_;
|
//LevelSegment level_seg_;
|
||||||
|
|
||||||
PosTagger pos_tagger_;
|
|
||||||
|
|
||||||
}; // class Jieba
|
}; // class Jieba
|
||||||
|
|
||||||
} // namespace cppjieba
|
} // namespace cppjieba
|
||||||
|
@ -6,11 +6,12 @@
|
|||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include "limonp/Logging.hpp"
|
#include "limonp/Logging.hpp"
|
||||||
#include "DictTrie.hpp"
|
#include "DictTrie.hpp"
|
||||||
#include "SegmentBase.hpp"
|
#include "SegmentTagged.hpp"
|
||||||
|
#include "PosTagger.hpp"
|
||||||
|
|
||||||
namespace cppjieba {
|
namespace cppjieba {
|
||||||
|
|
||||||
class MPSegment: public SegmentBase {
|
class MPSegment: public SegmentTagged {
|
||||||
public:
|
public:
|
||||||
MPSegment(const string& dictPath, const string& userDictPath = "")
|
MPSegment(const string& dictPath, const string& userDictPath = "")
|
||||||
: dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
|
: dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
|
||||||
@ -25,9 +26,13 @@ class MPSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Cut(const string& sentence, vector<string>& words) const {
|
||||||
|
Cut(sentence, words, MAX_WORD_LENGTH);
|
||||||
|
}
|
||||||
|
|
||||||
void Cut(const string& sentence,
|
void Cut(const string& sentence,
|
||||||
vector<string>& words,
|
vector<string>& words,
|
||||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
size_t max_word_len) const {
|
||||||
vector<Word> tmp;
|
vector<Word> tmp;
|
||||||
Cut(sentence, tmp, max_word_len);
|
Cut(sentence, tmp, max_word_len);
|
||||||
GetStringsFromWords(tmp, words);
|
GetStringsFromWords(tmp, words);
|
||||||
@ -64,6 +69,10 @@ class MPSegment: public SegmentBase {
|
|||||||
return dictTrie_;
|
return dictTrie_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Tag(const string& src, vector<pair<string, string> >& res) const {
|
||||||
|
return tagger_.Tag(src, res, *this);
|
||||||
|
}
|
||||||
|
|
||||||
bool IsUserDictSingleChineseWord(const Rune& value) const {
|
bool IsUserDictSingleChineseWord(const Rune& value) const {
|
||||||
return dictTrie_->IsUserDictSingleChineseWord(value);
|
return dictTrie_->IsUserDictSingleChineseWord(value);
|
||||||
}
|
}
|
||||||
@ -119,6 +128,8 @@ class MPSegment: public SegmentBase {
|
|||||||
|
|
||||||
const DictTrie* dictTrie_;
|
const DictTrie* dictTrie_;
|
||||||
bool isNeedDestroy_;
|
bool isNeedDestroy_;
|
||||||
|
PosTagger tagger_;
|
||||||
|
|
||||||
}; // class MPSegment
|
}; // class MPSegment
|
||||||
|
|
||||||
} // namespace cppjieba
|
} // namespace cppjieba
|
||||||
|
@ -5,9 +5,10 @@
|
|||||||
#include "MPSegment.hpp"
|
#include "MPSegment.hpp"
|
||||||
#include "HMMSegment.hpp"
|
#include "HMMSegment.hpp"
|
||||||
#include "limonp/StringUtil.hpp"
|
#include "limonp/StringUtil.hpp"
|
||||||
|
#include "PosTagger.hpp"
|
||||||
|
|
||||||
namespace cppjieba {
|
namespace cppjieba {
|
||||||
class MixSegment: public SegmentBase {
|
class MixSegment: public SegmentTagged {
|
||||||
public:
|
public:
|
||||||
MixSegment(const string& mpSegDict, const string& hmmSegDict,
|
MixSegment(const string& mpSegDict, const string& hmmSegDict,
|
||||||
const string& userDict = "")
|
const string& userDict = "")
|
||||||
@ -20,7 +21,10 @@ class MixSegment: public SegmentBase {
|
|||||||
~MixSegment() {
|
~MixSegment() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
void Cut(const string& sentence, vector<string>& words) const {
|
||||||
|
Cut(sentence, words, true);
|
||||||
|
}
|
||||||
|
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
|
||||||
vector<Word> tmp;
|
vector<Word> tmp;
|
||||||
Cut(sentence, tmp, hmm);
|
Cut(sentence, tmp, hmm);
|
||||||
GetStringsFromWords(tmp, words);
|
GetStringsFromWords(tmp, words);
|
||||||
@ -84,9 +88,15 @@ class MixSegment: public SegmentBase {
|
|||||||
const DictTrie* GetDictTrie() const {
|
const DictTrie* GetDictTrie() const {
|
||||||
return mpSeg_.GetDictTrie();
|
return mpSeg_.GetDictTrie();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Tag(const string& src, vector<pair<string, string> >& res) const {
|
||||||
|
return tagger_.Tag(src, res, *this);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
MPSegment mpSeg_;
|
MPSegment mpSeg_;
|
||||||
HMMSegment hmmSeg_;
|
HMMSegment hmmSeg_;
|
||||||
|
PosTagger tagger_;
|
||||||
|
|
||||||
}; // class MixSegment
|
}; // class MixSegment
|
||||||
|
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
#ifndef CPPJIEBA_POS_TAGGING_H
|
#ifndef CPPJIEBA_POS_TAGGING_H
|
||||||
#define CPPJIEBA_POS_TAGGING_H
|
#define CPPJIEBA_POS_TAGGING_H
|
||||||
|
|
||||||
#include "MixSegment.hpp"
|
|
||||||
#include "limonp/StringUtil.hpp"
|
#include "limonp/StringUtil.hpp"
|
||||||
|
#include "SegmentTagged.hpp"
|
||||||
#include "DictTrie.hpp"
|
#include "DictTrie.hpp"
|
||||||
|
|
||||||
namespace cppjieba {
|
namespace cppjieba {
|
||||||
@ -14,24 +14,18 @@ static const char* const POS_X = "x";
|
|||||||
|
|
||||||
class PosTagger {
|
class PosTagger {
|
||||||
public:
|
public:
|
||||||
PosTagger(const string& dictPath,
|
PosTagger() {
|
||||||
const string& hmmFilePath,
|
|
||||||
const string& userDictPath = "")
|
|
||||||
: segment_(dictPath, hmmFilePath, userDictPath) {
|
|
||||||
}
|
|
||||||
PosTagger(const DictTrie* dictTrie, const HMMModel* model)
|
|
||||||
: segment_(dictTrie, model) {
|
|
||||||
}
|
}
|
||||||
~PosTagger() {
|
~PosTagger() {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Tag(const string& src, vector<pair<string, string> >& res) const {
|
bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
|
||||||
vector<string> CutRes;
|
vector<string> CutRes;
|
||||||
segment_.Cut(src, CutRes);
|
segment.Cut(src, CutRes);
|
||||||
|
|
||||||
const DictUnit *tmp = NULL;
|
const DictUnit *tmp = NULL;
|
||||||
RuneStrArray runes;
|
RuneStrArray runes;
|
||||||
const DictTrie * dict = segment_.GetDictTrie();
|
const DictTrie * dict = segment.GetDictTrie();
|
||||||
assert(dict != NULL);
|
assert(dict != NULL);
|
||||||
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
|
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
|
||||||
if (!DecodeRunesInString(*itr, runes)) {
|
if (!DecodeRunesInString(*itr, runes)) {
|
||||||
@ -71,7 +65,6 @@ class PosTagger {
|
|||||||
return POS_ENG;
|
return POS_ENG;
|
||||||
}
|
}
|
||||||
|
|
||||||
MixSegment segment_;
|
|
||||||
}; // class PosTagger
|
}; // class PosTagger
|
||||||
|
|
||||||
} // namespace cppjieba
|
} // namespace cppjieba
|
||||||
|
@ -24,7 +24,11 @@ class QuerySegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
~QuerySegment() {
|
~QuerySegment() {
|
||||||
}
|
}
|
||||||
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
|
||||||
|
void Cut(const string& sentence, vector<string>& words) const {
|
||||||
|
Cut(sentence, words, true);
|
||||||
|
}
|
||||||
|
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
|
||||||
vector<Word> tmp;
|
vector<Word> tmp;
|
||||||
Cut(sentence, tmp, hmm);
|
Cut(sentence, tmp, hmm);
|
||||||
GetStringsFromWords(tmp, words);
|
GetStringsFromWords(tmp, words);
|
||||||
|
@ -17,9 +17,11 @@ class SegmentBase {
|
|||||||
SegmentBase() {
|
SegmentBase() {
|
||||||
XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
|
XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
|
||||||
}
|
}
|
||||||
~SegmentBase() {
|
virtual ~SegmentBase() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
virtual void Cut(const string& sentence, vector<string>& words) const = 0;
|
||||||
|
|
||||||
bool ResetSeparators(const string& s) {
|
bool ResetSeparators(const string& s) {
|
||||||
symbols_.clear();
|
symbols_.clear();
|
||||||
RuneStrArray runes;
|
RuneStrArray runes;
|
||||||
|
25
include/cppjieba/SegmentTagged.hpp
Normal file
25
include/cppjieba/SegmentTagged.hpp
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
#ifndef CPPJIEBA_SEGMENTTAGGED_H
|
||||||
|
#define CPPJIEBA_SEGMENTTAGGED_H
|
||||||
|
|
||||||
|
#include "SegmentBase.hpp"
|
||||||
|
|
||||||
|
namespace cppjieba {
|
||||||
|
|
||||||
|
struct DictTrie;
|
||||||
|
|
||||||
|
class SegmentTagged : public SegmentBase{
|
||||||
|
public:
|
||||||
|
SegmentTagged() {
|
||||||
|
}
|
||||||
|
virtual ~SegmentTagged() {
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual bool Tag(const string& src, vector<pair<string, string> >& res) const = 0;
|
||||||
|
|
||||||
|
virtual const struct DictTrie* GetDictTrie() const = 0;
|
||||||
|
|
||||||
|
}; // class SegmentTagged
|
||||||
|
|
||||||
|
} // cppjieba
|
||||||
|
|
||||||
|
#endif
|
@ -1,4 +1,4 @@
|
|||||||
#include "cppjieba/PosTagger.hpp"
|
#include "cppjieba/MixSegment.hpp"
|
||||||
#include "gtest/gtest.h"
|
#include "gtest/gtest.h"
|
||||||
|
|
||||||
using namespace cppjieba;
|
using namespace cppjieba;
|
||||||
@ -13,7 +13,7 @@ static const char * const ANS_TEST3 = "[iPhone6:eng, 手机:n, 的:uj, 最大:a,
|
|||||||
//static const char * const ANS_TEST3 = "";
|
//static const char * const ANS_TEST3 = "";
|
||||||
|
|
||||||
TEST(PosTaggerTest, Test) {
|
TEST(PosTaggerTest, Test) {
|
||||||
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
|
MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
|
||||||
{
|
{
|
||||||
vector<pair<string, string> > res;
|
vector<pair<string, string> > res;
|
||||||
tagger.Tag(QUERY_TEST1, res);
|
tagger.Tag(QUERY_TEST1, res);
|
||||||
@ -23,7 +23,7 @@ TEST(PosTaggerTest, Test) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
TEST(PosTagger, TestUserDict) {
|
TEST(PosTagger, TestUserDict) {
|
||||||
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
|
MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
|
||||||
{
|
{
|
||||||
vector<pair<string, string> > res;
|
vector<pair<string, string> > res;
|
||||||
tagger.Tag(QUERY_TEST2, res);
|
tagger.Tag(QUERY_TEST2, res);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user