remove chinesefilter

This commit is contained in:
wyy 2013-12-06 06:19:54 -08:00
parent 5692220756
commit cde97bf9b8
2 changed files with 131 additions and 99 deletions

View File

@ -6,98 +6,98 @@
namespace CppJieba namespace CppJieba
{ {
enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1}; //enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1};
typedef Unicode::const_iterator UniConIter; //typedef Unicode::const_iterator UniConIter;
class ChineseFilter; //class ChineseFilter;
class ChFilterIterator //class ChFilterIterator
{ //{
public: // public:
const Unicode * ptUnico; // const Unicode * ptUnico;
UniConIter begin; // UniConIter begin;
UniConIter end; // UniConIter end;
CHAR_TYPE charType; // CHAR_TYPE charType;
ChFilterIterator& operator++() // ChFilterIterator& operator++()
{ // {
return *this = _get(end); // return *this = _get(end);
} // }
ChFilterIterator operator++(int) // ChFilterIterator operator++(int)
{ // {
ChFilterIterator res = *this; // ChFilterIterator res = *this;
*this = _get(end); // *this = _get(end);
return res; // return res;
} // }
bool operator==(const ChFilterIterator& iter) // bool operator==(const ChFilterIterator& iter)
{ // {
return begin == iter.begin && end == iter.end; // return begin == iter.begin && end == iter.end;
} // }
bool operator!=(const ChFilterIterator& iter) // bool operator!=(const ChFilterIterator& iter)
{ // {
return !(*this == iter); // return !(*this == iter);
} // }
ChFilterIterator& operator=(const ChFilterIterator& iter) // ChFilterIterator& operator=(const ChFilterIterator& iter)
{ // {
ptUnico = iter.ptUnico; // ptUnico = iter.ptUnico;
begin = iter.begin; // begin = iter.begin;
end = iter.end; // end = iter.end;
charType = iter.charType; // charType = iter.charType;
return *this; // return *this;
} // }
//
public: // public:
ChFilterIterator(const Unicode * ptu, UniConIter be, UniConIter en, CHAR_TYPE is):ptUnico(ptu), begin(be), end(en), charType(is){}; // ChFilterIterator(const Unicode * ptu, UniConIter be, UniConIter en, CHAR_TYPE is):ptUnico(ptu), begin(be), end(en), charType(is){};
ChFilterIterator(const Unicode * ptu):ptUnico(ptu){*this = _get(ptUnico->begin());}; // ChFilterIterator(const Unicode * ptu):ptUnico(ptu){*this = _get(ptUnico->begin());};
private: // private:
ChFilterIterator(){} // ChFilterIterator(){}
private: // private:
CHAR_TYPE _charType(uint16_t x)const // CHAR_TYPE _charType(uint16_t x)const
{ // {
if(x < 0x0080) // if(x < 0x0080)
{ // {
return DIGIT_OR_LETTER; // return DIGIT_OR_LETTER;
} // }
return CHWORD; // return CHWORD;
} // }
ChFilterIterator _get(UniConIter iter) // ChFilterIterator _get(UniConIter iter)
{ // {
UniConIter _begin = iter; // UniConIter _begin = iter;
const UniConIter& _end = ptUnico->end(); // const UniConIter& _end = ptUnico->end();
if(iter == _end) // if(iter == _end)
{ // {
return ChFilterIterator(ptUnico, end, end, DIGIT_OR_LETTER); // return ChFilterIterator(ptUnico, end, end, DIGIT_OR_LETTER);
} // }
CHAR_TYPE charType = _charType(*iter); // CHAR_TYPE charType = _charType(*iter);
iter ++; // iter ++;
while(iter != _end &&charType == _charType(*iter)) // while(iter != _end &&charType == _charType(*iter))
{ // {
iter++; // iter++;
} // }
return ChFilterIterator(ptUnico, _begin, iter, charType); // return ChFilterIterator(ptUnico, _begin, iter, charType);
} // }
}; //};
class ChineseFilter //class ChineseFilter
{ //{
private: // private:
Unicode _unico; // Unicode _unico;
public: // public:
typedef ChFilterIterator iterator; // typedef ChFilterIterator iterator;
public: // public:
ChineseFilter(){}; // ChineseFilter(){};
~ChineseFilter(){}; // ~ChineseFilter(){};
public: // public:
bool feed(const string& str) // bool feed(const string& str)
{ // {
return TransCode::decode(str, _unico); // return TransCode::decode(str, _unico);
} // }
iterator begin() // iterator begin()
{ // {
return iterator(&_unico); // return iterator(&_unico);
} // }
iterator end() // iterator end()
{ // {
return iterator(&_unico, _unico.end(), _unico.end(), DIGIT_OR_LETTER); // return iterator(&_unico, _unico.end(), _unico.end(), DIGIT_OR_LETTER);
} // }
}; //};
/* /*
* if char is ascii, count the ascii string's length and return 0; * if char is ascii, count the ascii string's length and return 0;

View File

@ -31,24 +31,56 @@ namespace CppJieba
LogError("not inited."); LogError("not inited.");
return false; return false;
} }
ChineseFilter filter; const char * cstr = str.c_str();
filter.feed(str); uint size = str.size();
for(ChineseFilter::iterator it = filter.begin(); it != filter.end(); it++) uint offset = 0;
string subs;
int ret;
uint len;
Unicode unico;
while(offset < size)
{ {
if(it.charType == CHWORD) if(-1 == (ret = filterAscii(cstr + offset, size, len)))
{ {
cut(it.begin, it.end, res); LogFatal("str[%s] illegal.", cstr);
return false;
}
subs.assign(cstr + offset, len);
if(!ret)
{
res.push_back(subs);
} }
else else
{ {
string tmp; unico.clear();
if(TransCode::encode(it.begin, it.end, tmp)) if(!TransCode::decode(subs, unico))
{ {
res.push_back(tmp); LogFatal("str[%s] decode failed.", subs.c_str());
return false;
} }
cut(unico.begin(), unico.end(), res);
} }
offset += len;
} }
return true; return true;
//ChineseFilter filter;
//filter.feed(str);
//for(ChineseFilter::iterator it = filter.begin(); it != filter.end(); it++)
//{
// if(it.charType == CHWORD)
// {
// cut(it.begin, it.end, res);
// }
// else
// {
// string tmp;
// if(TransCode::encode(it.begin, it.end, tmp))
// {
// res.push_back(tmp);
// }
// }
//}
//return true;
} }
}; };