mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
统一在SegmentBase搞定所有Unicode和string的转码事情
This commit is contained in:
parent
4d86abb001
commit
0f79fa6c24
@ -58,49 +58,6 @@ class FullSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
uIdx++;
|
uIdx++;
|
||||||
}
|
}
|
||||||
/*
|
|
||||||
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
|
|
||||||
//find word start from uItr
|
|
||||||
if (dictTrie_->find(uItr, end, tRes, 0)) {
|
|
||||||
for(LocalVector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) {
|
|
||||||
wordLen = itr->second->word.size();
|
|
||||||
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) {
|
|
||||||
res.push_back(itr->second->word);
|
|
||||||
}
|
|
||||||
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
|
|
||||||
}
|
|
||||||
tRes.clear();
|
|
||||||
} else { // not found word start from uItr
|
|
||||||
if (maxIdx <= uIdx) { // never exist in prev results
|
|
||||||
//put itr itself in res
|
|
||||||
res.push_back(Unicode(1, *uItr));
|
|
||||||
|
|
||||||
//mark it as existing
|
|
||||||
++maxIdx;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
++uIdx;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// String-producing overload of cut() for the range [begin, end).
// It delegates to the cut() overload that fills a vector<Unicode>, then passes
// each resulting word through TransCode::encode and appends it to `res`.
// Existing elements of `res` are kept; new words are pushed after them.
// Returns false (after logging an error) when the Unicode-level cut fails,
// true otherwise.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end,
|
|
||||||
vector<string>& res) const {
|
|
||||||
vector<Unicode> uRes;
|
|
||||||
if (!cut(begin, end, uRes)) {
|
|
||||||
LogError("get unicode cut result error.");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Scratch string reused for every word; presumably TransCode::encode
// overwrites it on each call -- confirm against TransCode.
string tmp;
|
|
||||||
for (vector<Unicode>::const_iterator uItr = uRes.begin();
|
|
||||||
uItr != uRes.end(); uItr++) {
|
|
||||||
TransCode::encode(*uItr, tmp);
|
|
||||||
// push_back copies tmp, so each word costs one extra string copy.
res.push_back(tmp);
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
|
@ -55,22 +55,6 @@ class HMMSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
// String-producing overload of cut() for the range [begin, end).
// Returns false without touching `res` when the range is empty or when the
// vector<Unicode> overload of cut() fails; returns true otherwise.
// On success the segmented words are appended after the existing contents of
// `res`.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
|
|
||||||
if(begin == end) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
vector<Unicode> words;
|
|
||||||
// Reserve one slot per input element before segmenting.
words.reserve(end - begin);
|
|
||||||
if(!cut(begin, end, words)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
// Remember where the new entries start so earlier contents are preserved.
size_t offset = res.size();
|
|
||||||
res.resize(res.size() + words.size());
|
|
||||||
for(size_t i = 0; i < words.size(); i++) {
|
|
||||||
// The destination slot in res is passed as the second argument;
// presumably encode() writes the encoded word into it -- confirm.
TransCode::encode(words[i], res[offset + i]);
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
private:
|
private:
|
||||||
// sequential letters rule
|
// sequential letters rule
|
||||||
Unicode::const_iterator sequentialLetterRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
Unicode::const_iterator sequentialLetterRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||||
|
@ -1,14 +1,15 @@
|
|||||||
#ifndef CPPJIEBA_SEGMENTINTERFACE_H
|
#ifndef CPPJIEBA_ISEGMENT_H
|
||||||
#define CPPJIEBA_SEGMENTINTERFACE_H
|
#define CPPJIEBA_ISEGMENT_H
|
||||||
|
|
||||||
|
|
||||||
namespace CppJieba {
|
namespace CppJieba {
|
||||||
|
|
||||||
// Abstract segmenter interface: a virtual destructor plus pure-virtual cut()
// overloads that concrete segmenters must implement.
// NOTE(review): two copies of the class are interleaved here. One declares
// both the iterator-range cut() and the string cut(); the other declares only
// the string cut().
class ISegment {
|
class ISegment {
|
||||||
public:
|
public:
|
||||||
virtual ~ISegment() {};
|
virtual ~ISegment() {
|
||||||
// Iterator-range overload: takes [begin, end) and a vector<string> output.
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
|
};
|
||||||
// String overload: takes the input string and a vector<string> output.
virtual bool cut(const string& str, vector<string>& res) const = 0;
|
virtual bool cut(const string& str, vector<string>& res) const = 0;
|
||||||
};
|
};
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
} // namespace CppJieba
|
||||||
|
|
||||||
|
#endif // CPPJIEBA_ISEGMENT_H
|
||||||
|
@ -33,20 +33,6 @@ class MPSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
using SegmentBase::cut;
|
using SegmentBase::cut;
|
||||||
// String-producing overload of cut() for the range [begin, end).
// It delegates to the vector<Unicode> overload of cut() and returns false if
// that call fails. On success the words are appended after the existing
// contents of `res` and true is returned.
// Unlike some sibling implementations in this view, it has no explicit
// begin == end check.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
|
|
||||||
vector<Unicode> words;
|
|
||||||
// Reserve one slot per input element before segmenting.
words.reserve(end - begin);
|
|
||||||
if(!cut(begin, end, words)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
// Index of the first newly added slot in res.
size_t offset = res.size();
|
|
||||||
res.resize(res.size() + words.size());
|
|
||||||
for(size_t i = 0; i < words.size(); i++) {
|
|
||||||
// The destination slot in res is passed as the second argument;
// presumably encode() writes the encoded word into it -- confirm.
TransCode::encode(words[i], res[i + offset]);
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
|
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||||
vector<Dag> dags;
|
vector<Dag> dags;
|
||||||
|
|
||||||
|
@ -68,25 +68,6 @@ class MixSegment: public SegmentBase {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// String-producing overload of cut() for the range [begin, end).
// Returns false without touching `res` when the range is empty or when the
// vector<Unicode> overload of cut() fails; returns true otherwise.
// On success the words are appended after the existing contents of `res`.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
|
|
||||||
if(begin == end) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
vector<Unicode> uRes;
|
|
||||||
// Reserve one slot per input element before segmenting.
uRes.reserve(end - begin);
|
|
||||||
if (!cut(begin, end, uRes)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// offset starts at the first newly added slot and advances with i.
size_t offset = res.size();
|
|
||||||
res.resize(res.size() + uRes.size());
|
|
||||||
for(size_t i = 0; i < uRes.size(); i ++, offset++) {
|
|
||||||
// The destination slot in res is passed as the second argument;
// presumably encode() writes the encoded word into it -- confirm.
TransCode::encode(uRes[i], res[offset]);
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Accessor that forwards to mpSeg_.getDictTrie() and returns its
// const DictTrie* result unchanged.
const DictTrie* getDictTrie() const {
|
const DictTrie* getDictTrie() const {
|
||||||
return mpSeg_.getDictTrie();
|
return mpSeg_.getDictTrie();
|
||||||
}
|
}
|
||||||
|
@ -55,22 +55,6 @@ class QuerySegment: public SegmentBase {
|
|||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// String-producing overload of cut() for the range [begin, end).
// It delegates to the vector<Unicode> overload of cut(), then passes each
// resulting word through TransCode::encode and appends it to `res`.
// Returns false (after logging an error) when the Unicode-level cut fails,
// true otherwise.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
|
|
||||||
vector<Unicode> uRes;
|
|
||||||
if (!cut(begin, end, uRes)) {
|
|
||||||
LogError("get unicode cut result error.");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Scratch string reused for every word; presumably TransCode::encode
// overwrites it on each call -- confirm against TransCode.
string tmp;
|
|
||||||
for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) {
|
|
||||||
TransCode::encode(*uItr, tmp);
|
|
||||||
// push_back copies tmp, so each word costs one extra string copy.
res.push_back(tmp);
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
private:
|
private:
|
||||||
MixSegment mixSeg_;
|
MixSegment mixSeg_;
|
||||||
FullSegment fullSeg_;
|
FullSegment fullSeg_;
|
||||||
|
@ -24,9 +24,10 @@ class SegmentBase: public ISegment, public NonCopyable {
|
|||||||
// Default constructor: its only action is to call loadSpecialSymbols_().
SegmentBase() {
|
SegmentBase() {
|
||||||
loadSpecialSymbols_();
|
loadSpecialSymbols_();
|
||||||
};
|
};
|
||||||
// Virtual destructor with an empty body.
virtual ~SegmentBase() {};
|
virtual ~SegmentBase() {
|
||||||
|
};
|
||||||
public:
|
public:
|
||||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
|
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
|
||||||
virtual bool cut(const string& str, vector<string>& res) const {
|
virtual bool cut(const string& str, vector<string>& res) const {
|
||||||
res.clear();
|
res.clear();
|
||||||
|
|
||||||
@ -54,6 +55,26 @@ class SegmentBase: public ISegment, public NonCopyable {
|
|||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
// Shared string-producing overload of cut() for the range [begin, end).
// It calls the vector<Unicode> overload of cut(), then encodes every word
// into `res`.
// Returns false without touching `res` when the range is empty or when the
// Unicode-level cut fails; returns true otherwise.
// On success the words are appended after the existing contents of `res`.
virtual bool cut(Unicode::const_iterator begin,
|
||||||
|
Unicode::const_iterator end,
|
||||||
|
vector<string>& res) const {
|
||||||
|
if(begin == end) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
vector<Unicode> uRes;
|
||||||
|
// Reserve one slot per input element before segmenting.
uRes.reserve(end - begin);
|
||||||
|
if (!cut(begin, end, uRes)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// offset starts at the first newly added slot and advances with i.
size_t offset = res.size();
|
||||||
|
res.resize(res.size() + uRes.size());
|
||||||
|
for(size_t i = 0; i < uRes.size(); i ++, offset++) {
|
||||||
|
// The destination slot in res is passed as the second argument;
// presumably encode() writes the encoded word into it -- confirm.
TransCode::encode(uRes[i], res[offset]);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
private:
|
private:
|
||||||
void loadSpecialSymbols_() {
|
void loadSpecialSymbols_() {
|
||||||
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user