remove chinesefilter

This commit is contained in:
wyy 2013-12-06 06:19:54 -08:00
parent 5692220756
commit cde97bf9b8
2 changed files with 131 additions and 99 deletions

View File

@ -6,98 +6,98 @@
namespace CppJieba
{
// Classification of a code unit as used by ChFilterIterator::_charType:
// values below 0x0080 are DIGIT_OR_LETTER, everything else is CHWORD.
enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1};
// Read-only iterator over the code units of a Unicode sequence.
typedef Unicode::const_iterator UniConIter;
// Forward declaration; the class itself is defined after ChFilterIterator.
class ChineseFilter;
class ChFilterIterator
{
public:
const Unicode * ptUnico;
UniConIter begin;
UniConIter end;
CHAR_TYPE charType;
ChFilterIterator& operator++()
{
return *this = _get(end);
}
ChFilterIterator operator++(int)
{
ChFilterIterator res = *this;
*this = _get(end);
return res;
}
bool operator==(const ChFilterIterator& iter)
{
return begin == iter.begin && end == iter.end;
}
bool operator!=(const ChFilterIterator& iter)
{
return !(*this == iter);
}
ChFilterIterator& operator=(const ChFilterIterator& iter)
{
ptUnico = iter.ptUnico;
begin = iter.begin;
end = iter.end;
charType = iter.charType;
return *this;
}
public:
ChFilterIterator(const Unicode * ptu, UniConIter be, UniConIter en, CHAR_TYPE is):ptUnico(ptu), begin(be), end(en), charType(is){};
ChFilterIterator(const Unicode * ptu):ptUnico(ptu){*this = _get(ptUnico->begin());};
private:
ChFilterIterator(){}
private:
CHAR_TYPE _charType(uint16_t x)const
{
if(x < 0x0080)
{
return DIGIT_OR_LETTER;
}
return CHWORD;
}
ChFilterIterator _get(UniConIter iter)
{
UniConIter _begin = iter;
const UniConIter& _end = ptUnico->end();
if(iter == _end)
{
return ChFilterIterator(ptUnico, end, end, DIGIT_OR_LETTER);
}
CHAR_TYPE charType = _charType(*iter);
iter ++;
while(iter != _end &&charType == _charType(*iter))
{
iter++;
}
return ChFilterIterator(ptUnico, _begin, iter, charType);
}
//enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1};
//typedef Unicode::const_iterator UniConIter;
//class ChineseFilter;
//class ChFilterIterator
//{
// public:
// const Unicode * ptUnico;
// UniConIter begin;
// UniConIter end;
// CHAR_TYPE charType;
// ChFilterIterator& operator++()
// {
// return *this = _get(end);
// }
// ChFilterIterator operator++(int)
// {
// ChFilterIterator res = *this;
// *this = _get(end);
// return res;
// }
// bool operator==(const ChFilterIterator& iter)
// {
// return begin == iter.begin && end == iter.end;
// }
// bool operator!=(const ChFilterIterator& iter)
// {
// return !(*this == iter);
// }
// ChFilterIterator& operator=(const ChFilterIterator& iter)
// {
// ptUnico = iter.ptUnico;
// begin = iter.begin;
// end = iter.end;
// charType = iter.charType;
// return *this;
// }
//
// public:
// ChFilterIterator(const Unicode * ptu, UniConIter be, UniConIter en, CHAR_TYPE is):ptUnico(ptu), begin(be), end(en), charType(is){};
// ChFilterIterator(const Unicode * ptu):ptUnico(ptu){*this = _get(ptUnico->begin());};
// private:
// ChFilterIterator(){}
// private:
// CHAR_TYPE _charType(uint16_t x)const
// {
// if(x < 0x0080)
// {
// return DIGIT_OR_LETTER;
// }
// return CHWORD;
// }
// ChFilterIterator _get(UniConIter iter)
// {
// UniConIter _begin = iter;
// const UniConIter& _end = ptUnico->end();
// if(iter == _end)
// {
// return ChFilterIterator(ptUnico, end, end, DIGIT_OR_LETTER);
// }
// CHAR_TYPE charType = _charType(*iter);
// iter ++;
// while(iter != _end &&charType == _charType(*iter))
// {
// iter++;
// }
// return ChFilterIterator(ptUnico, _begin, iter, charType);
// }
};
/**
 * Holds a decoded Unicode buffer and exposes it as a sequence of
 * ChFilterIterator runs through begin() / end().
 *
 * The iterators it hands out store a pointer to this object's buffer.
 */
class ChineseFilter
{
    public:
        typedef ChFilterIterator iterator;

        ChineseFilter()
        {
        }

        ~ChineseFilter()
        {
        }

        /**
         * Decodes str into the internal buffer with TransCode::decode.
         * Returns whatever TransCode::decode reports.
         */
        bool feed(const string& str)
        {
            const bool decoded = TransCode::decode(str, _unico);
            return decoded;
        }

        /** Cursor on the first run of the buffer. */
        iterator begin()
        {
            iterator first(&_unico);
            return first;
        }

        /** Past-the-end cursor: an empty range at the end of the buffer. */
        iterator end()
        {
            const UniConIter tail = _unico.end();
            return iterator(&_unico, tail, tail, DIGIT_OR_LETTER);
        }

    private:
        Unicode _unico; // buffer filled by feed()
};
//};
//class ChineseFilter
//{
// private:
// Unicode _unico;
// public:
// typedef ChFilterIterator iterator;
// public:
// ChineseFilter(){};
// ~ChineseFilter(){};
// public:
// bool feed(const string& str)
// {
// return TransCode::decode(str, _unico);
// }
// iterator begin()
// {
// return iterator(&_unico);
// }
// iterator end()
// {
// return iterator(&_unico, _unico.end(), _unico.end(), DIGIT_OR_LETTER);
// }
//};
/*
* if char is ascii, count the ascii string's length and return 0;

View File

@ -31,24 +31,56 @@ namespace CppJieba
LogError("not inited.");
return false;
}
ChineseFilter filter;
filter.feed(str);
for(ChineseFilter::iterator it = filter.begin(); it != filter.end(); it++)
const char * cstr = str.c_str();
uint size = str.size();
uint offset = 0;
string subs;
int ret;
uint len;
Unicode unico;
while(offset < size)
{
if(it.charType == CHWORD)
if(-1 == (ret = filterAscii(cstr + offset, size, len)))
{
cut(it.begin, it.end, res);
LogFatal("str[%s] illegal.", cstr);
return false;
}
subs.assign(cstr + offset, len);
if(!ret)
{
res.push_back(subs);
}
else
{
string tmp;
if(TransCode::encode(it.begin, it.end, tmp))
unico.clear();
if(!TransCode::decode(subs, unico))
{
res.push_back(tmp);
LogFatal("str[%s] decode failed.", subs.c_str());
return false;
}
cut(unico.begin(), unico.end(), res);
}
offset += len;
}
return true;
//ChineseFilter filter;
//filter.feed(str);
//for(ChineseFilter::iterator it = filter.begin(); it != filter.end(); it++)
//{
// if(it.charType == CHWORD)
// {
// cut(it.begin, it.end, res);
// }
// else
// {
// string tmp;
// if(TransCode::encode(it.begin, it.end, tmp))
// {
// res.push_back(tmp);
// }
// }
//}
//return true;
}
};