mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
统一在SegmentBase搞定所有Unicode和string的转码事情
This commit is contained in:
parent
4d86abb001
commit
0f79fa6c24
@ -58,49 +58,6 @@ class FullSegment: public SegmentBase {
|
||||
}
|
||||
uIdx++;
|
||||
}
|
||||
/*
|
||||
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
|
||||
//find word start from uItr
|
||||
if (dictTrie_->find(uItr, end, tRes, 0)) {
|
||||
for(LocalVector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) {
|
||||
wordLen = itr->second->word.size();
|
||||
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) {
|
||||
res.push_back(itr->second->word);
|
||||
}
|
||||
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
|
||||
}
|
||||
tRes.clear();
|
||||
} else { // not found word start from uItr
|
||||
if (maxIdx <= uIdx) { // never exist in prev results
|
||||
//put itr itself in res
|
||||
res.push_back(Unicode(1, *uItr));
|
||||
|
||||
//mark it exits
|
||||
++maxIdx;
|
||||
}
|
||||
}
|
||||
++uIdx;
|
||||
}
|
||||
*/
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Segments the range [begin, end) and appends every resulting word to `res`
// after converting it with TransCode::encode.
//
// The segmentation itself is delegated to the vector<Unicode> overload of
// cut(). If that overload reports failure, an error is logged and false is
// returned before anything is appended to `res`; otherwise returns true.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end,
         vector<string>& res) const {
  vector<Unicode> unicodeWords;
  const bool segmented = cut(begin, end, unicodeWords);
  if (!segmented) {
    LogError("get unicode cut result error.");
    return false;
  }

  // One scratch string is handed to encode() for every word and then copied
  // into the output vector.
  string encoded;
  const size_t wordCount = unicodeWords.size();
  for (size_t idx = 0; idx < wordCount; ++idx) {
    TransCode::encode(unicodeWords[idx], encoded);
    res.push_back(encoded);
  }
  return true;
}
|
||||
private:
|
||||
|
@ -55,22 +55,6 @@ class HMMSegment: public SegmentBase {
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// Segments the range [begin, end) and appends the words, converted with
// TransCode::encode, to the end of `res`.
//
// Returns false for an empty range, or when the vector<Unicode> overload of
// cut() fails; in both cases nothing is appended to `res`. Returns true
// otherwise.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
  if (begin == end) {
    return false;
  }

  vector<Unicode> unicodeWords;
  unicodeWords.reserve(end - begin);
  const bool segmented = cut(begin, end, unicodeWords);
  if (!segmented) {
    return false;
  }

  // Grow the output once, then encode each word directly into its new slot.
  const size_t base = res.size();
  res.resize(base + unicodeWords.size());
  size_t idx = 0;
  while (idx < unicodeWords.size()) {
    TransCode::encode(unicodeWords[idx], res[base + idx]);
    ++idx;
  }
  return true;
}
|
||||
private:
|
||||
// sequential letters rule
|
||||
Unicode::const_iterator sequentialLetterRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||
|
@ -1,14 +1,15 @@
|
||||
#ifndef CPPJIEBA_SEGMENTINTERFACE_H
|
||||
#define CPPJIEBA_SEGMENTINTERFACE_H
|
||||
|
||||
#ifndef CPPJIEBA_ISEGMENT_H
|
||||
#define CPPJIEBA_ISEGMENT_H
|
||||
|
||||
namespace CppJieba {
|
||||
|
||||
class ISegment {
|
||||
public:
|
||||
virtual ~ISegment() {};
|
||||
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
|
||||
virtual ~ISegment() {
|
||||
};
|
||||
virtual bool cut(const string& str, vector<string>& res) const = 0;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
} // namespace CppJieba
|
||||
|
||||
#endif // CPPJIEBA_ISEGMENT_H
|
||||
|
@ -33,20 +33,6 @@ class MPSegment: public SegmentBase {
|
||||
}
|
||||
|
||||
using SegmentBase::cut;
|
||||
// Segments the range [begin, end) via the vector<Unicode> overload of cut()
// and appends the words, converted with TransCode::encode, to the end of
// `res`.
//
// Returns false (appending nothing) when the underlying cut fails, true
// otherwise.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
  vector<Unicode> unicodeWords;
  unicodeWords.reserve(end - begin);
  if (!cut(begin, end, unicodeWords)) {
    return false;
  }

  // Existing entries of `res` are kept; new words start at index `first`.
  const size_t first = res.size();
  const size_t added = unicodeWords.size();
  res.resize(first + added);
  for (size_t k = 0; k != unicodeWords.size(); ++k) {
    string& slot = res[first + k];
    TransCode::encode(unicodeWords[k], slot);
  }
  return true;
}
|
||||
|
||||
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||
vector<Dag> dags;
|
||||
|
||||
|
@ -68,25 +68,6 @@ class MixSegment: public SegmentBase {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Segments the range [begin, end) and appends the words, converted with
// TransCode::encode, to the end of `res`.
//
// An empty range is rejected with false. A failure of the vector<Unicode>
// overload of cut() is also reported as false. Nothing is appended to `res`
// in either failure case.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
  const bool emptyInput = (begin == end);
  if (emptyInput) {
    return false;
  }

  vector<Unicode> pieces;
  pieces.reserve(end - begin);
  if (!cut(begin, end, pieces)) {
    return false;
  }

  // `writePos` walks the freshly added tail of `res` in step with `pieces`.
  size_t writePos = res.size();
  res.resize(writePos + pieces.size());
  for (size_t readPos = 0; readPos < pieces.size(); ++readPos) {
    TransCode::encode(pieces[readPos], res[writePos]);
    ++writePos;
  }
  return true;
}
|
||||
|
||||
// Returns the dictionary trie pointer obtained from the mpSeg_ member.
const DictTrie* getDictTrie() const {
  const DictTrie* trie = mpSeg_.getDictTrie();
  return trie;
}
|
||||
|
@ -55,22 +55,6 @@ class QuerySegment: public SegmentBase {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Segments the range [begin, end) with the vector<Unicode> overload of cut()
// and appends each word to `res` after converting it with TransCode::encode.
//
// On failure of the underlying cut, logs an error and returns false without
// appending anything; returns true otherwise.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
  vector<Unicode> pieces;
  if (!cut(begin, end, pieces)) {
    LogError("get unicode cut result error.");
    return false;
  }

  // A single scratch string receives each encoded word before it is copied
  // into the output.
  string scratch;
  vector<Unicode>::const_iterator piece = pieces.begin();
  while (piece != pieces.end()) {
    TransCode::encode(*piece, scratch);
    res.push_back(scratch);
    ++piece;
  }
  return true;
}
|
||||
private:
|
||||
MixSegment mixSeg_;
|
||||
FullSegment fullSeg_;
|
||||
|
@ -24,9 +24,10 @@ class SegmentBase: public ISegment, public NonCopyable {
|
||||
SegmentBase() {
|
||||
loadSpecialSymbols_();
|
||||
};
|
||||
virtual ~SegmentBase() {};
|
||||
virtual ~SegmentBase() {
|
||||
};
|
||||
public:
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
|
||||
virtual bool cut(const string& str, vector<string>& res) const {
|
||||
res.clear();
|
||||
|
||||
@ -54,6 +55,26 @@ class SegmentBase: public ISegment, public NonCopyable {
|
||||
|
||||
return true;
|
||||
}
|
||||
// Shared Unicode-to-string adapter: segments the range [begin, end) with the
// vector<Unicode> overload of cut() and appends the words, converted with
// TransCode::encode, to the end of `res`.
//
// Returns false for an empty range or when the underlying cut fails; nothing
// is appended to `res` in those cases. Returns true otherwise.
virtual bool cut(Unicode::const_iterator begin,
                 Unicode::const_iterator end,
                 vector<string>& res) const {
  if (begin == end) {
    return false;
  }

  vector<Unicode> unicodeWords;
  unicodeWords.reserve(end - begin);
  const bool segmented = cut(begin, end, unicodeWords);
  if (!segmented) {
    return false;
  }

  // Resize once, then encode each word in place in the newly added tail.
  const size_t tailStart = res.size();
  res.resize(tailStart + unicodeWords.size());
  size_t n = 0;
  while (n < unicodeWords.size()) {
    TransCode::encode(unicodeWords[n], res[tailStart + n]);
    ++n;
  }
  return true;
}
|
||||
private:
|
||||
void loadSpecialSymbols_() {
|
||||
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
||||
|
Loading…
x
Reference in New Issue
Block a user