diff --git a/src/ChineseFilter.hpp b/src/ChineseFilter.hpp
index 621c0ea..c969607 100644
--- a/src/ChineseFilter.hpp
+++ b/src/ChineseFilter.hpp
@@ -6,98 +6,98 @@
 namespace CppJieba
 {
 
-    enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1};
-    typedef Unicode::const_iterator UniConIter;
-    class ChineseFilter;
-    class ChFilterIterator
-    {
-        public:
-            const Unicode * ptUnico;
-            UniConIter begin;
-            UniConIter end;
-            CHAR_TYPE charType;
-            ChFilterIterator& operator++()
-            {
-                return *this = _get(end);
-            }
-            ChFilterIterator operator++(int)
-            {
-                ChFilterIterator res = *this;
-                *this = _get(end);
-                return res;
-            }
-            bool operator==(const ChFilterIterator& iter)
-            {
-                return begin == iter.begin && end == iter.end;
-            }
-            bool operator!=(const ChFilterIterator& iter)
-            {
-                return !(*this == iter);
-            }
-            ChFilterIterator& operator=(const ChFilterIterator& iter)
-            {
-                ptUnico = iter.ptUnico;
-                begin = iter.begin;
-                end = iter.end;
-                charType = iter.charType;
-                return *this;
-            }
-
-        public:
-            ChFilterIterator(const Unicode * ptu, UniConIter be, UniConIter en, CHAR_TYPE is):ptUnico(ptu), begin(be), end(en), charType(is){};
-            ChFilterIterator(const Unicode * ptu):ptUnico(ptu){*this = _get(ptUnico->begin());};
-        private:
-            ChFilterIterator(){}
-        private:
-            CHAR_TYPE _charType(uint16_t x)const
-            {
-                if(x < 0x0080)
-                {
-                    return DIGIT_OR_LETTER;
-                }
-                return CHWORD;
-            }
-            ChFilterIterator _get(UniConIter iter)
-            {
-                UniConIter _begin = iter;
-                const UniConIter& _end = ptUnico->end();
-                if(iter == _end)
-                {
-                    return ChFilterIterator(ptUnico, end, end, DIGIT_OR_LETTER);
-                }
-                CHAR_TYPE charType = _charType(*iter);
-                iter ++;
-                while(iter != _end &&charType == _charType(*iter))
-                {
-                    iter++;
-                }
-                return ChFilterIterator(ptUnico, _begin, iter, charType);
-            }
+    //enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1};
+    //typedef Unicode::const_iterator UniConIter;
+    //class ChineseFilter;
+    //class ChFilterIterator
+    //{
+    //    public:
+    //        const Unicode * ptUnico;
+    //        UniConIter begin;
+    //        UniConIter end;
+    //        CHAR_TYPE charType;
+    //        ChFilterIterator& operator++()
+    //        {
+    //            return *this = _get(end);
+    //        }
+    //        ChFilterIterator operator++(int)
+    //        {
+    //            ChFilterIterator res = *this;
+    //            *this = _get(end);
+    //            return res;
+    //        }
+    //        bool operator==(const ChFilterIterator& iter)
+    //        {
+    //            return begin == iter.begin && end == iter.end;
+    //        }
+    //        bool operator!=(const ChFilterIterator& iter)
+    //        {
+    //            return !(*this == iter);
+    //        }
+    //        ChFilterIterator& operator=(const ChFilterIterator& iter)
+    //        {
+    //            ptUnico = iter.ptUnico;
+    //            begin = iter.begin;
+    //            end = iter.end;
+    //            charType = iter.charType;
+    //            return *this;
+    //        }
+    //
+    //    public:
+    //        ChFilterIterator(const Unicode * ptu, UniConIter be, UniConIter en, CHAR_TYPE is):ptUnico(ptu), begin(be), end(en), charType(is){};
+    //        ChFilterIterator(const Unicode * ptu):ptUnico(ptu){*this = _get(ptUnico->begin());};
+    //    private:
+    //        ChFilterIterator(){}
+    //    private:
+    //        CHAR_TYPE _charType(uint16_t x)const
+    //        {
+    //            if(x < 0x0080)
+    //            {
+    //                return DIGIT_OR_LETTER;
+    //            }
+    //            return CHWORD;
+    //        }
+    //        ChFilterIterator _get(UniConIter iter)
+    //        {
+    //            UniConIter _begin = iter;
+    //            const UniConIter& _end = ptUnico->end();
+    //            if(iter == _end)
+    //            {
+    //                return ChFilterIterator(ptUnico, end, end, DIGIT_OR_LETTER);
+    //            }
+    //            CHAR_TYPE charType = _charType(*iter);
+    //            iter ++;
+    //            while(iter != _end &&charType == _charType(*iter))
+    //            {
+    //                iter++;
+    //            }
+    //            return ChFilterIterator(ptUnico, _begin, iter, charType);
+    //        }
 
-    };
-    class ChineseFilter
-    {
-        private:
-            Unicode _unico;
-        public:
-            typedef ChFilterIterator iterator;
-        public:
-            ChineseFilter(){};
-            ~ChineseFilter(){};
-        public:
-            bool feed(const string& str)
-            {
-                return TransCode::decode(str, _unico);
-            }
-            iterator begin()
-            {
-                return iterator(&_unico);
-            }
-            iterator end()
-            {
-                return iterator(&_unico, _unico.end(), _unico.end(), DIGIT_OR_LETTER);
-            }
-    };
+    //};
+    //class ChineseFilter
+    //{
+    //    private:
+    //        Unicode _unico;
+    //    public:
+    //        typedef ChFilterIterator iterator;
+    //    public:
+    //        ChineseFilter(){};
+    //        ~ChineseFilter(){};
+    //    public:
+    //        bool feed(const string& str)
+    //        {
+    //            return TransCode::decode(str, _unico);
+    //        }
+    //        iterator begin()
+    //        {
+    //            return iterator(&_unico);
+    //        }
+    //        iterator end()
+    //        {
+    //            return iterator(&_unico, _unico.end(), _unico.end(), DIGIT_OR_LETTER);
+    //        }
+    //};
 
     /*
      * if char is ascii, count the ascii string's length and return 0;
diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp
index 44e4df3..5536047 100644
--- a/src/SegmentBase.hpp
+++ b/src/SegmentBase.hpp
@@ -31,24 +31,59 @@ namespace CppJieba
                     LogError("not inited.");
                     return false;
                 }
-                ChineseFilter filter;
-                filter.feed(str);
-                for(ChineseFilter::iterator it = filter.begin(); it != filter.end(); it++)
+                const char * cstr = str.c_str();
+                uint size = str.size();
+                uint offset = 0;
+                string subs;
+                int ret;
+                uint len;
+                Unicode unico;
+                // Consume str run by run: a 0 result from filterAscii is emitted verbatim,
+                // any other non-error result is decoded to Unicode and segmented by cut().
+                while(offset < size)
                 {
-                    if(it.charType == CHWORD)
+                    // Bound the scan by the bytes left after offset, not the whole length.
+                    if(-1 == (ret = filterAscii(cstr + offset, size - offset, len)))
                     {
-                        cut(it.begin, it.end, res);
+                        LogFatal("str[%s] illegal.", cstr);
+                        return false;
+                    }
+                    subs.assign(cstr + offset, len);
+                    if(!ret)
+                    {
+                        res.push_back(subs);
                     }
                     else
                     {
-                        string tmp;
-                        if(TransCode::encode(it.begin, it.end, tmp))
+                        unico.clear();
+                        if(!TransCode::decode(subs, unico))
                         {
-                            res.push_back(tmp);
+                            LogFatal("str[%s] decode failed.", subs.c_str());
+                            return false;
                         }
+                        cut(unico.begin(), unico.end(), res);
                     }
+                    offset += len;
                 }
                 return true;
+                //ChineseFilter filter;
+                //filter.feed(str);
+                //for(ChineseFilter::iterator it = filter.begin(); it != filter.end(); it++)
+                //{
+                //    if(it.charType == CHWORD)
+                //    {
+                //        cut(it.begin, it.end, res);
+                //    }
+                //    else
+                //    {
+                //        string tmp;
+                //        if(TransCode::encode(it.begin, it.end, tmp))
+                //        {
+                //            res.push_back(tmp);
+                //        }
+                //    }
+                //}
+                //return true;
             }
     };
 