统一在SegmentBase搞定所有Unicode和string的转码事情

This commit is contained in:
yanyiwu 2015-07-24 13:42:24 +08:00
parent 4d86abb001
commit 0f79fa6c24
7 changed files with 31 additions and 117 deletions

View File

@ -58,49 +58,6 @@ class FullSegment: public SegmentBase {
} }
uIdx++; uIdx++;
} }
/*
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
//find word start from uItr
if (dictTrie_->find(uItr, end, tRes, 0)) {
for(LocalVector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) {
wordLen = itr->second->word.size();
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) {
res.push_back(itr->second->word);
}
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
}
tRes.clear();
} else { // not found word start from uItr
if (maxIdx <= uIdx) { // never exist in prev results
//put itr itself in res
res.push_back(Unicode(1, *uItr));
//mark it exists
++maxIdx;
}
}
++uIdx;
}
*/
return true;
}
// String-output overload of cut().
// Runs the vector<Unicode> overload on [begin, end); if that call reports
// failure, logs an error and returns false before res is modified.
// Otherwise each resulting word is converted with TransCode::encode into a
// temporary string and pushed onto the back of res (existing entries are kept).
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end,
vector<string>& res) const {
vector<Unicode> uRes;
if (!cut(begin, end, uRes)) {
LogError("get unicode cut result error.");
return false;
}
// tmp is reused as the encode target for every word.
string tmp;
for (vector<Unicode>::const_iterator uItr = uRes.begin();
uItr != uRes.end(); uItr++) {
TransCode::encode(*uItr, tmp);
res.push_back(tmp);
}
return true; return true;
} }
private: private:

View File

@ -55,22 +55,6 @@ class HMMSegment: public SegmentBase {
} }
return true; return true;
} }
// String-output overload: segments [begin, end) with the vector<Unicode>
// overload of cut() and appends every word to res, converted through
// TransCode::encode. Returns false without touching res when the range is
// empty or the Unicode-level cut fails.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
  if (begin == end) {
    return false;
  }

  vector<Unicode> pieces;
  pieces.reserve(end - begin);
  const bool ok = cut(begin, end, pieces);
  if (!ok) {
    return false;
  }

  // Grow res once, then encode each word directly into its new slot.
  const size_t base = res.size();
  res.resize(base + pieces.size());
  size_t slot = base;
  for (vector<Unicode>::const_iterator it = pieces.begin(); it != pieces.end(); ++it, ++slot) {
    TransCode::encode(*it, res[slot]);
  }
  return true;
}
private: private:
// sequential letters rule // sequential letters rule
Unicode::const_iterator sequentialLetterRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const { Unicode::const_iterator sequentialLetterRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {

View File

@ -1,14 +1,15 @@
#ifndef CPPJIEBA_SEGMENTINTERFACE_H #ifndef CPPJIEBA_ISEGMENT_H
#define CPPJIEBA_SEGMENTINTERFACE_H #define CPPJIEBA_ISEGMENT_H
namespace CppJieba { namespace CppJieba {
class ISegment { class ISegment {
public: public:
virtual ~ISegment() {}; virtual ~ISegment() {
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0; };
virtual bool cut(const string& str, vector<string>& res) const = 0; virtual bool cut(const string& str, vector<string>& res) const = 0;
}; };
}
#endif } // namespace CppJieba
#endif // CPPJIEBA_ISEGMENT_H

View File

@ -33,20 +33,6 @@ class MPSegment: public SegmentBase {
} }
using SegmentBase::cut; using SegmentBase::cut;
// String-output overload: delegates to the vector<Unicode> overload of cut()
// for [begin, end) and appends the words to res after converting each one
// with TransCode::encode. Returns false, leaving res unchanged, if the
// Unicode-level cut fails.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
  vector<Unicode> segments;
  segments.reserve(end - begin);
  if (!cut(begin, end, segments)) {
    return false;
  }

  // Existing entries of res are preserved; new words start at index `first`.
  const size_t first = res.size();
  const size_t count = segments.size();
  res.resize(first + count);
  for (size_t k = 0; k != count; ++k) {
    TransCode::encode(segments[k], res[first + k]);
  }
  return true;
}
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const { bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
vector<Dag> dags; vector<Dag> dags;

View File

@ -68,25 +68,6 @@ class MixSegment: public SegmentBase {
return true; return true;
} }
// String-output overload: cuts [begin, end) via the vector<Unicode> overload
// and appends each word, converted by TransCode::encode, to res.
// An empty range or a failed Unicode-level cut yields false with res untouched.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
  if (begin == end) {
    return false;
  }

  vector<Unicode> words;
  words.reserve(end - begin);
  if (!cut(begin, end, words)) {
    return false;
  }

  // Reserve the output slots up front and fill them in order.
  const size_t start = res.size();
  res.resize(start + words.size());
  size_t idx = 0;
  while (idx < words.size()) {
    TransCode::encode(words[idx], res[start + idx]);
    ++idx;
  }
  return true;
}
const DictTrie* getDictTrie() const { const DictTrie* getDictTrie() const {
return mpSeg_.getDictTrie(); return mpSeg_.getDictTrie();
} }

View File

@ -55,22 +55,6 @@ class QuerySegment: public SegmentBase {
return true; return true;
} }
// String-output overload: obtains the words for [begin, end) from the
// vector<Unicode> overload of cut(); on failure logs an error and returns
// false before res is modified. On success each word is converted with
// TransCode::encode and pushed onto the back of res.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
  vector<Unicode> pieces;
  const bool ok = cut(begin, end, pieces);
  if (!ok) {
    LogError("get unicode cut result error.");
    return false;
  }

  // One scratch string is shared by all iterations, as the encode target.
  string encoded;
  for (size_t i = 0; i < pieces.size(); ++i) {
    TransCode::encode(pieces[i], encoded);
    res.push_back(encoded);
  }
  return true;
}
private: private:
MixSegment mixSeg_; MixSegment mixSeg_;
FullSegment fullSeg_; FullSegment fullSeg_;

View File

@ -24,9 +24,10 @@ class SegmentBase: public ISegment, public NonCopyable {
SegmentBase() { SegmentBase() {
loadSpecialSymbols_(); loadSpecialSymbols_();
}; };
virtual ~SegmentBase() {}; virtual ~SegmentBase() {
};
public: public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0; virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res) const { virtual bool cut(const string& str, vector<string>& res) const {
res.clear(); res.clear();
@ -54,6 +55,26 @@ class SegmentBase: public ISegment, public NonCopyable {
return true; return true;
} }
// Shared string-output overload: segments the Unicode range [begin, end)
// through the vector<Unicode> overload of cut() and appends each resulting
// word to res after converting it with TransCode::encode.
// Returns false, with res left unmodified, when the range is empty or the
// Unicode-level cut fails; entries already in res are always kept.
virtual bool cut(Unicode::const_iterator begin,
                 Unicode::const_iterator end,
                 vector<string>& res) const {
  if (begin == end) {
    return false;
  }

  vector<Unicode> words;
  words.reserve(end - begin);
  const bool ok = cut(begin, end, words);
  if (!ok) {
    return false;
  }

  // Extend res in one step, then encode straight into the new tail slots.
  const size_t base = res.size();
  res.resize(base + words.size());
  for (size_t i = 0; i != words.size(); ++i) {
    TransCode::encode(words[i], res[base + i]);
  }
  return true;
}
private: private:
void loadSpecialSymbols_() { void loadSpecialSymbols_() {
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL); size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);