统一在SegmentBase搞定所有Unicode和string的转码事情

This commit is contained in:
yanyiwu 2015-07-24 13:42:24 +08:00
parent 4d86abb001
commit 0f79fa6c24
7 changed files with 31 additions and 117 deletions

View File

@ -58,49 +58,6 @@ class FullSegment: public SegmentBase {
}
uIdx++;
}
/*
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
//find word start from uItr
if (dictTrie_->find(uItr, end, tRes, 0)) {
for(LocalVector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) {
wordLen = itr->second->word.size();
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) {
res.push_back(itr->second->word);
}
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
}
tRes.clear();
} else { // not found word start from uItr
if (maxIdx <= uIdx) { // never exist in prev results
//put itr itself in res
res.push_back(Unicode(1, *uItr));
//mark it as existing
++maxIdx;
}
}
++uIdx;
}
*/
return true;
}
// Segments [begin, end) through the vector<Unicode> overload of cut() and
// appends one string per resulting word to res, each produced by
// TransCode::encode. Logs and returns false when the Unicode-level cut fails;
// nothing is appended to res in that case.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end,
         vector<string>& res) const {
  vector<Unicode> words;
  const bool ok = cut(begin, end, words);
  if (!ok) {
    LogError("get unicode cut result error.");
    return false;
  }
  // A single scratch string is handed to encode() for every word and then
  // copied into res.
  string encoded;
  vector<Unicode>::const_iterator it = words.begin();
  while (it != words.end()) {
    TransCode::encode(*it, encoded);
    res.push_back(encoded);
    ++it;
  }
  return true;
}
private:

View File

@ -55,22 +55,6 @@ class HMMSegment: public SegmentBase {
}
return true;
}
// String-producing cut: segments [begin, end) with the vector<Unicode>
// overload, then encodes every piece into a new slot appended to res.
// Returns false for an empty range or when the Unicode-level cut fails.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
  // Empty range: report failure before doing any work.
  if (begin == end) {
    return false;
  }
  vector<Unicode> pieces;
  pieces.reserve(end - begin);
  const bool ok = cut(begin, end, pieces);
  if (!ok) {
    return false;
  }
  // Grow res once, then encode each piece directly into its own new element.
  const size_t base = res.size();
  const size_t count = pieces.size();
  res.resize(base + count);
  for (size_t k = 0; k != count; ++k) {
    TransCode::encode(pieces[k], res[base + k]);
  }
  return true;
}
private:
// sequential letters rule
Unicode::const_iterator sequentialLetterRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {

View File

@ -1,14 +1,15 @@
#ifndef CPPJIEBA_SEGMENTINTERFACE_H
#define CPPJIEBA_SEGMENTINTERFACE_H
#ifndef CPPJIEBA_ISEGMENT_H
#define CPPJIEBA_ISEGMENT_H
namespace CppJieba {
// Abstract segmenter interface: the cut() declarations below are pure virtual
// and the destructor is virtual.
// NOTE(review): this excerpt looks like a rendered diff, so the two destructor
// definitions and the two cut() declarations are presumably the before/after
// versions of one change rather than coexisting members -- confirm against the
// real header.
class ISegment {
public:
virtual ~ISegment() {};
// Takes a Unicode iterator range [begin, end) and a vector<string> to receive
// output; returns bool.
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
virtual ~ISegment() {
};
// Takes an input string and a vector<string> to receive output; returns bool.
virtual bool cut(const string& str, vector<string>& res) const = 0;
};
}
#endif
} // namespace CppJieba
#endif // CPPJIEBA_ISEGMENT_H

View File

@ -33,20 +33,6 @@ class MPSegment: public SegmentBase {
}
using SegmentBase::cut;
// String-producing cut: runs the vector<Unicode> overload over [begin, end)
// and appends the TransCode-encoded form of each segment to res.
// Returns false when the Unicode-level cut fails.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
  vector<Unicode> segments;
  segments.reserve(end - begin);
  const bool succeeded = cut(begin, end, segments);
  if (!succeeded) {
    return false;
  }
  // slot walks the freshly appended tail of res in step with segments.
  size_t slot = res.size();
  res.resize(slot + segments.size());
  for (size_t idx = 0; idx < segments.size(); ++idx, ++slot) {
    TransCode::encode(segments[idx], res[slot]);
  }
  return true;
}
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
vector<Dag> dags;

View File

@ -68,25 +68,6 @@ class MixSegment: public SegmentBase {
return true;
}
// String-producing cut: segments [begin, end) with the vector<Unicode>
// overload and appends each encoded word to res.
// Returns false for an empty range or when the Unicode-level cut fails.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
  if (begin == end) {
    return false;
  }
  vector<Unicode> unicodeWords;
  unicodeWords.reserve(end - begin);
  if (cut(begin, end, unicodeWords)) {
    // Make room for every new word up front, then encode into place.
    const size_t first = res.size();
    res.resize(first + unicodeWords.size());
    for (size_t n = 0; n < unicodeWords.size(); ++n) {
      TransCode::encode(unicodeWords[n], res[first + n]);
    }
    return true;
  }
  return false;
}
// Forwards to mpSeg_.getDictTrie() and returns its result.
const DictTrie* getDictTrie() const {
  const DictTrie* trie = mpSeg_.getDictTrie();
  return trie;
}

View File

@ -55,22 +55,6 @@ class QuerySegment: public SegmentBase {
return true;
}
// String-producing cut: runs the vector<Unicode> overload over [begin, end)
// and appends one TransCode-encoded string per word to res.
// Logs and returns false when the Unicode-level cut fails.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
  vector<Unicode> segmented;
  const bool ok = cut(begin, end, segmented);
  if (!ok) {
    LogError("get unicode cut result error.");
    return false;
  }
  // The same buffer is passed to encode() for each word and then copied
  // into res.
  string buffer;
  for (vector<Unicode>::const_iterator word = segmented.begin(), last = segmented.end();
       word != last; ++word) {
    TransCode::encode(*word, buffer);
    res.push_back(buffer);
  }
  return true;
}
private:
MixSegment mixSeg_;
FullSegment fullSeg_;

View File

@ -24,9 +24,10 @@ class SegmentBase: public ISegment, public NonCopyable {
// Constructor: calls loadSpecialSymbols_() on construction.
SegmentBase() {
loadSpecialSymbols_();
};
// NOTE(review): this excerpt looks like a rendered diff; the repeated
// destructor and the two pure-virtual cut() declarations below are presumably
// the before/after lines of the change -- confirm against the real header.
virtual ~SegmentBase() {};
virtual ~SegmentBase() {
};
public:
// Pure-virtual cut over a Unicode iterator range with vector<string> output.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
// Pure-virtual cut over a Unicode iterator range with vector<Unicode> output.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res) const {
res.clear();
@ -54,6 +55,26 @@ class SegmentBase: public ISegment, public NonCopyable {
return true;
}
// Shared Unicode-range-to-strings cut: delegates segmentation of [begin, end)
// to the vector<Unicode> overload, then appends the TransCode-encoded form of
// every resulting word to res.
// Returns false for an empty range or when the Unicode-level cut fails.
virtual bool cut(Unicode::const_iterator begin,
                 Unicode::const_iterator end,
                 vector<string>& res) const {
  if (begin == end) {
    return false;
  }
  vector<Unicode> units;
  units.reserve(end - begin);
  const bool ok = cut(begin, end, units);
  if (ok) {
    // Append units.size() empty strings, then fill them in order.
    const size_t base = res.size();
    res.resize(base + units.size());
    for (size_t k = 0; k < units.size(); ++k) {
      TransCode::encode(units[k], res[base + k]);
    }
  }
  return ok;
}
private:
void loadSpecialSymbols_() {
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);