Mirror of https://github.com/yanyiwu/cppjieba.git (synced 2025-07-18 00:00:12 +08:00)
Refactor: add constructors to each segmenter class, in preparation for the bigger change coming later.
commit d56bf2cc68
parent b99d0698f0
FullSegment.hpp

@@ -13,40 +13,23 @@
 namespace CppJieba {
 
 class FullSegment: public SegmentBase {
  public:
-  FullSegment() {
-    dictTrie_ = NULL;
-    isBorrowed_ = false;
-  }
-  explicit FullSegment(const string& dictPath) {
-    dictTrie_ = NULL;
-    init(dictPath);
-  }
-  explicit FullSegment(const DictTrie* dictTrie) {
-    dictTrie_ = NULL;
-    init(dictTrie);
-  }
-  virtual ~FullSegment() {
-    if(dictTrie_ && ! isBorrowed_) {
-      delete dictTrie_;
-    }
-  };
-  bool init(const string& dictPath) {
-    assert(dictTrie_ == NULL);
-    dictTrie_ = new DictTrie(dictPath);
-    isBorrowed_ = false;
-    return true;
-  }
-  bool init(const DictTrie* dictTrie) {
-    assert(dictTrie_ == NULL);
-    assert(dictTrie);
-    dictTrie_ = dictTrie;
-    isBorrowed_ = true;
-    return true;
-  }
+  FullSegment(const string& dictPath) {
+    dictTrie_ = new DictTrie(dictPath);
+    isNeedDestroy_ = true;
+    LogInfo("FullSegment init %s ok", dictPath.c_str());
+  }
+  FullSegment(const DictTrie* dictTrie)
+    : dictTrie_(dictTrie), isNeedDestroy_(false) {
+    assert(dictTrie_);
+  }
+  virtual ~FullSegment() {
+    if(isNeedDestroy_) {
+      delete dictTrie_;
+    }
+  }
 
   using SegmentBase::cut;
-  bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
+  bool cut(Unicode::const_iterator begin, Unicode::const_iterator end,
+        vector<Unicode>& res) const {
     //resut of searching in trie tree
     DagType tRes;

@@ -87,7 +70,8 @@ class FullSegment: public SegmentBase {
     return true;
   }
 
-  bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
+  bool cut(Unicode::const_iterator begin, Unicode::const_iterator end,
+        vector<string>& res) const {
     vector<Unicode> uRes;
     if (!cut(begin, end, uRes)) {
       LogError("get unicode cut result error.");

@@ -95,7 +79,8 @@ class FullSegment: public SegmentBase {
     }
 
     string tmp;
-    for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) {
+    for (vector<Unicode>::const_iterator uItr = uRes.begin();
+          uItr != uRes.end(); uItr++) {
       TransCode::encode(*uItr, tmp);
       res.push_back(tmp);
     }

@@ -104,7 +89,7 @@ class FullSegment: public SegmentBase {
   }
  private:
   const DictTrie* dictTrie_;
-  bool isBorrowed_;
+  bool isNeedDestroy_;
 };
 }
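The two remaining constructors encode an ownership convention: the path overload allocates its own DictTrie and marks isNeedDestroy_, while the pointer overload borrows a caller-owned trie and never deletes it. A minimal usage sketch, illustrative rather than part of the commit; the dictionary path and the string overload of cut() inherited from SegmentBase are assumptions:

    // Illustrative only: exercises both FullSegment constructors.
    // Assumes the cppjieba headers are on the include path and that
    // dict/jieba.dict.utf8 is the stock dictionary shipped with the repo.
    #include "DictTrie.hpp"
    #include "FullSegment.hpp"
    #include <string>
    #include <vector>
    using namespace CppJieba;

    int main() {
      // Owning form: the segmenter news up its own DictTrie and deletes it
      // in ~FullSegment() because isNeedDestroy_ is true.
      FullSegment owning("dict/jieba.dict.utf8");

      // Borrowing form: the caller keeps ownership; isNeedDestroy_ is false,
      // so the destructor leaves the trie alone.
      DictTrie trie("dict/jieba.dict.utf8");
      FullSegment borrowing(&trie);

      std::vector<std::string> words;
      borrowing.cut("南京市长江大桥", words); // string overload from SegmentBase
      return 0;
    }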
HMMSegment.hpp

@@ -12,9 +12,17 @@ namespace CppJieba {
 
 class HMMSegment: public SegmentBase {
  public:
-  explicit HMMSegment(const string& filePath): model_(filePath) {
+  HMMSegment(const string& filePath) {
+    model_ = new HMMModel(filePath);
   }
-  virtual ~HMMSegment() {}
+  HMMSegment(const HMMModel* model)
+    : model_(model), isNeedDestroy_(false) {
+  }
+  virtual ~HMMSegment() {
+    if(isNeedDestroy_) {
+      delete model_;
+    }
+  }
 
   using SegmentBase::cut;
   bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const {

@@ -138,7 +146,7 @@ class HMMSegment: public SegmentBase {
 
     //start
     for(size_t y = 0; y < Y; y++) {
-      weight[0 + y * X] = model_.startProb[y] + model_.getEmitProb(model_.emitProbVec[y], *begin, MIN_DOUBLE);
+      weight[0 + y * X] = model_->startProb[y] + model_->getEmitProb(model_->emitProbVec[y], *begin, MIN_DOUBLE);
       path[0 + y * X] = -1;
     }
 

@@ -149,10 +157,10 @@ class HMMSegment: public SegmentBase {
         now = x + y*X;
         weight[now] = MIN_DOUBLE;
         path[now] = HMMModel::E; // warning
-        emitProb = model_.getEmitProb(model_.emitProbVec[y], *(begin+x), MIN_DOUBLE);
+        emitProb = model_->getEmitProb(model_->emitProbVec[y], *(begin+x), MIN_DOUBLE);
         for(size_t preY = 0; preY < Y; preY++) {
           old = x - 1 + preY * X;
-          tmp = weight[old] + model_.transProb[preY][y] + emitProb;
+          tmp = weight[old] + model_->transProb[preY][y] + emitProb;
           if(tmp > weight[now]) {
             weight[now] = tmp;
             path[now] = preY;

@@ -179,7 +187,9 @@ class HMMSegment: public SegmentBase {
     return true;
   }
 
-  HMMModel model_;
+ private:
+  const HMMModel* model_;
+  bool isNeedDestroy_;
 }; // class HMMSegment
 
 } // namespace CppJieba
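Here model_ changes from an embedded HMMModel to a const HMMModel* guarded by isNeedDestroy_, which is why every model_. access in the Viterbi code becomes model_->. Note that, as rendered in this hunk, the path-based constructor does not set isNeedDestroy_. A sketch of sharing one model between two segmenters, illustrative only; file paths and the string cut() overload from SegmentBase are assumptions:

    // Illustrative only: one HMMModel shared by two HMMSegment instances.
    #include "HMMModel.hpp"
    #include "HMMSegment.hpp"
    #include <string>
    #include <vector>
    using namespace CppJieba;

    int main() {
      HMMModel model("dict/hmm_model.utf8"); // loaded once, owned by the caller
      HMMSegment segA(&model);               // borrows: isNeedDestroy_ == false
      HMMSegment segB(&model);               // sharing is fine, cut() is const

      std::vector<std::string> words;
      segA.cut("我来到北京清华大学", words);  // string overload from SegmentBase
      return 0;
    }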
KeywordExtractor.hpp

@@ -20,6 +20,14 @@ class KeywordExtractor {
     loadIdfDict_(idfPath);
     loadStopWordDict_(stopWordPath);
   }
+  KeywordExtractor(const DictTrie* dictTrie,
+        const HMMModel* model,
+        const string& idfPath,
+        const string& stopWordPath)
+    : segment_(dictTrie, model) {
+    loadIdfDict_(idfPath);
+    loadStopWordDict_(stopWordPath);
+  }
   ~KeywordExtractor() {
   }
 
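The new constructor lets the extractor's internal MixSegment (segment_) run on a caller-owned trie and model instead of loading its own copies from disk. A construction sketch, illustrative only; the idf and stop-word paths refer to the stock cppjieba dictionary files and are assumptions here:

    // Illustrative only: KeywordExtractor on top of caller-owned resources.
    #include "DictTrie.hpp"
    #include "HMMModel.hpp"
    #include "KeywordExtractor.hpp"
    using namespace CppJieba;

    int main() {
      DictTrie trie("dict/jieba.dict.utf8");
      HMMModel model("dict/hmm_model.utf8");
      // segment_ inside the extractor is a MixSegment built from the same
      // trie and model, so the only extra disk loads are the idf and
      // stop-word tables.
      KeywordExtractor extractor(&trie, &model,
                                 "dict/idf.utf8", "dict/stop_words.utf8");
      return 0;
    }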
MPSegment.hpp

@@ -12,28 +12,28 @@
 namespace CppJieba {
 
 class MPSegment: public SegmentBase {
 
  public:
-  MPSegment() {};
   MPSegment(const string& dictPath, const string& userDictPath = "") {
-    init(dictPath, userDictPath);
-  };
-  virtual ~MPSegment() {};
-
-  void init(const string& dictPath, const string& userDictPath = "") {
-    dictTrie_.init(dictPath, userDictPath);
+    dictTrie_ = new DictTrie(dictPath, userDictPath);
+    isNeedDestroy_ = true;
     LogInfo("MPSegment init(%s) ok", dictPath.c_str());
   }
+  MPSegment(const DictTrie* dictTrie)
+    : dictTrie_(dictTrie), isNeedDestroy_(false) {
+    assert(dictTrie_);
+  }
+  virtual ~MPSegment() {
+    if(isNeedDestroy_) {
+      delete dictTrie_;
+    }
+  }
 
   bool isUserDictSingleChineseWord(const Unicode::value_type & value) const {
-    return dictTrie_.isUserDictSingleChineseWord(value);
+    return dictTrie_->isUserDictSingleChineseWord(value);
   }
 
   using SegmentBase::cut;
   virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
     if(begin == end) {
       return false;
     }
 
     vector<Unicode> words;
     words.reserve(end - begin);
     if(!cut(begin, end, words)) {

@@ -48,12 +48,9 @@ class MPSegment: public SegmentBase {
   }
 
   bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
     if(end == begin) {
       return false;
     }
     vector<SegmentChar> segmentChars;
-
-    dictTrie_.find(begin, end, segmentChars);
-
+    dictTrie_->find(begin, end, segmentChars);
     calcDP_(segmentChars);
 

@@ -62,7 +59,7 @@ class MPSegment: public SegmentBase {
     return true;
   }
   const DictTrie* getDictTrie() const {
-    return &dictTrie_;
+    return dictTrie_;
   }
 
  private:

@@ -86,7 +83,7 @@ class MPSegment: public SegmentBase {
       if(p) {
         val += p->weight;
       } else {
-        val += dictTrie_.getMinWeight();
+        val += dictTrie_->getMinWeight();
       }
       if(val > rit->weight) {
         rit->pInfo = p;

@@ -95,7 +92,8 @@ class MPSegment: public SegmentBase {
       }
     }
   }
-  void cut_(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const {
+  void cut_(const vector<SegmentChar>& segmentChars,
+        vector<Unicode>& res) const {
     size_t i = 0;
     while(i < segmentChars.size()) {
       const DictUnit* p = segmentChars[i].pInfo;

@@ -110,9 +108,10 @@ class MPSegment: public SegmentBase {
   }
 
  private:
-  DictTrie dictTrie_;
+  const DictTrie* dictTrie_;
+  bool isNeedDestroy_;
+}; // class MPSegment
 
-};
-}
+} // namespace CppJieba
 
 #endif
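In MPSegment the member dictTrie_ turns from an embedded DictTrie into a const DictTrie* guarded by isNeedDestroy_, and getDictTrie() now hands back the stored pointer directly, so other components can reuse the same trie. A sketch, illustrative only; the dictionary paths and the string cut() overload are assumptions:

    // Illustrative only: reuse MPSegment's trie to seed a FullSegment.
    #include "FullSegment.hpp"
    #include "MPSegment.hpp"
    #include <string>
    #include <vector>
    using namespace CppJieba;

    int main() {
      // MPSegment owns its trie here (isNeedDestroy_ == true) ...
      MPSegment mpSeg("dict/jieba.dict.utf8", "dict/user.dict.utf8");

      // ... and getDictTrie() now returns the stored pointer, so FullSegment
      // can borrow it without loading the dictionary a second time.
      FullSegment fullSeg(mpSeg.getDictTrie());

      std::vector<std::string> words;
      fullSeg.cut("长春市长春药店", words);
      return 0;
    }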
MixSegment.hpp

@@ -15,6 +15,9 @@ class MixSegment: public SegmentBase {
       hmmSeg_(hmmSegDict) {
     LogInfo("MixSegment init %s, %s", mpSegDict.c_str(), hmmSegDict.c_str());
   }
+  MixSegment(const DictTrie* dictTrie, const HMMModel* model)
+    : mpSeg_(dictTrie), hmmSeg_(model) {
+  }
   virtual ~MixSegment() {
   }
   using SegmentBase::cut;

@@ -90,7 +93,9 @@ class MixSegment: public SegmentBase {
  private:
   MPSegment mpSeg_;
   HMMSegment hmmSeg_;
-};
-}
+}; // class MixSegment
+
+} // namespace CppJieba
 
 #endif
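The added MixSegment constructor forwards borrowed resources to its members: mpSeg_ takes the DictTrie and hmmSeg_ takes the HMMModel, so nothing is loaded from disk inside MixSegment itself. Sketch, illustrative only; paths and the string cut() overload are assumptions:

    // Illustrative only: MixSegment over externally owned resources.
    #include "DictTrie.hpp"
    #include "HMMModel.hpp"
    #include "MixSegment.hpp"
    #include <string>
    #include <vector>
    using namespace CppJieba;

    int main() {
      DictTrie trie("dict/jieba.dict.utf8");
      HMMModel model("dict/hmm_model.utf8");
      MixSegment mixSeg(&trie, &model); // mpSeg_ borrows the trie, hmmSeg_ the model

      std::vector<std::string> words;
      mixSeg.cut("他来到了网易杭研大厦", words);
      return 0;
    }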
PosTagger.hpp

@@ -18,8 +18,9 @@ class PosTagger {
       const string& hmmFilePath,
       const string& userDictPath = "")
     : segment_(dictPath, hmmFilePath, userDictPath) {
-    dictTrie_ = segment_.getDictTrie();
-    LIMONP_CHECK(dictTrie_);
   }
+  PosTagger(const DictTrie* dictTrie, const HMMModel* model)
+    : segment_(dictTrie, model) {
+  }
   ~PosTagger() {
   }

@@ -33,12 +34,14 @@ class PosTagger {
 
     const DictUnit *tmp = NULL;
     Unicode unico;
+    const DictTrie * dict = segment_.getDictTrie();
+    assert(dict != NULL);
     for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) {
       if (!TransCode::decode(*itr, unico)) {
         LogError("decode failed.");
         return false;
       }
-      tmp = dictTrie_->find(unico.begin(), unico.end());
+      tmp = dict->find(unico.begin(), unico.end());
       if(tmp == NULL || tmp->tag.empty()) {
         res.push_back(make_pair(*itr, specialRule_(unico)));
       } else {

@@ -72,8 +75,8 @@ class PosTagger {
   }
  private:
   MixSegment segment_;
-  const DictTrie * dictTrie_;
-};
-}
+}; // class PosTagger
+
+} // namespace CppJieba
 
 #endif
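The cached dictTrie_ member disappears; the tagging loop now asks segment_.getDictTrie() for the trie on each call, which works the same for both construction paths. Sketch, illustrative only; the tag() entry point and its pair<string, string> result shape are assumptions based on the method body shown above, and the paths are assumptions:

    // Illustrative only: POS tagging with a shared trie and model.
    #include "DictTrie.hpp"
    #include "HMMModel.hpp"
    #include "PosTagger.hpp"
    #include <string>
    #include <utility>
    #include <vector>
    using namespace CppJieba;

    int main() {
      DictTrie trie("dict/jieba.dict.utf8");
      HMMModel model("dict/hmm_model.utf8");
      PosTagger tagger(&trie, &model); // no cached dictTrie_ member any more

      std::vector<std::pair<std::string, std::string> > res;
      tagger.tag("我是蓝翔技工拖拉机学院手扶拖拉机专业的", res);
      return 0;
    }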
QuerySegment.hpp

@@ -19,11 +19,15 @@ class QuerySegment: public SegmentBase {
   QuerySegment(const string& dict, const string& model, size_t maxWordLen = 4,
         const string& userDict = "")
     : mixSeg_(dict, model, userDict),
-      fullSeg_(mixSeg_.getDictTrie()) {
-    assert(maxWordLen);
-    maxWordLen_ = maxWordLen;
-  };
-  virtual ~QuerySegment() {};
+      fullSeg_(mixSeg_.getDictTrie()),
+      maxWordLen_(maxWordLen) {
+    assert(maxWordLen_);
+  }
+  QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
+    : mixSeg_(dictTrie, model), fullSeg_(dictTrie) {
+  }
+  virtual ~QuerySegment() {
+  }
   using SegmentBase::cut;
   bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
     //use mix cut first
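Taken together, the new constructors let an application load the DictTrie and HMMModel once and pass the same pointers to every segmenter, which is what the commit message is preparing for. A combined sketch, illustrative only; paths are assumptions:

    // Illustrative only: load the heavy resources once, share them everywhere.
    #include "DictTrie.hpp"
    #include "HMMModel.hpp"
    #include "FullSegment.hpp"
    #include "MixSegment.hpp"
    #include "QuerySegment.hpp"
    using namespace CppJieba;

    int main() {
      DictTrie trie("dict/jieba.dict.utf8", "dict/user.dict.utf8");
      HMMModel model("dict/hmm_model.utf8");

      MixSegment mixSeg(&trie, &model);     // MP + HMM over the shared pair
      QuerySegment querySeg(&trie, &model); // mixSeg_ and fullSeg_ both borrow
      FullSegment fullSeg(&trie);

      // None of the segmenters deletes the borrowed pointers: their
      // isNeedDestroy_ flags are false, so trie and model stay valid until
      // they go out of scope here.
      return 0;
    }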