diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp index 576cec5..98a1276 100644 --- a/src/SegmentBase.hpp +++ b/src/SegmentBase.hpp @@ -12,55 +12,109 @@ namespace CppJieba { using namespace Limonp; - const char* const SPECIAL_CHARS = " \t\n"; + //const char* const SPECIAL_CHARS = " \t\n"; +#ifndef CPPJIEBA_GBK + const uint32_t SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u}; +#else + const uint32_t SPECIAL_SYMBOL[] = {32u, 9u, 10u}; +#endif class SegmentBase: public ISegment, public InitOnOff { public: - SegmentBase(){}; + SegmentBase(){_loadSpecialSymbols();}; virtual ~SegmentBase(){}; + private: + unordered_set _specialSymbols; + private: + void _loadSpecialSymbols() + { + size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL); + for(size_t i = 0; i < size; i ++) + { + _specialSymbols.insert(SPECIAL_SYMBOL[i]); + } + assert(_specialSymbols.size()); + } public: - virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const = 0; + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const = 0; virtual bool cut(const string& str, vector& res)const { assert(_getInitFlag()); - Unicode unico; + + Unicode unicode; + TransCode::decode(str, unicode); res.clear(); - const char * const cstr = str.c_str(); - size_t size = str.size(); - size_t offset = 0; - string subs; - int ret; - size_t len; - while(offset < size) + + Unicode::const_iterator left = unicode.begin(); + Unicode::const_iterator right = unicode.begin(); + + string oneword; + while(right != unicode.end()) { - const char * const nstr = cstr + offset; - size_t nsize = size - offset; - if(-1 == (ret = filterSpecialChars(nstr, nsize, len)) || 0 == len || len > nsize) + if(isIn(_specialSymbols, *right)) { - LogFatal("str[%s] illegal.", cstr); - return false; - } - subs.assign(nstr, len); - if(!ret) - { - res.push_back(subs); + if(left != right) + { + cut(left, right, res); + } + TransCode::encode(right, right + 1, oneword); + res.push_back(oneword); + right ++; + left = right; } else { - unico.clear(); - if(!TransCode::decode(subs, unico)) - { - LogFatal("str[%s] decode failed.", subs.c_str()); - return false; - } - cut(unico.begin(), unico.end(), res); + right ++; } - offset += len; } + if(left != right) + { + cut(left, right, res); + } + return true; } + //virtual bool cut(const string& str, vector& res)const + //{ + // assert(_getInitFlag()); + // Unicode unico; + // res.clear(); + // const char * const cstr = str.c_str(); + // size_t size = str.size(); + // size_t offset = 0; + // string subs; + // int ret; + // size_t len; + // while(offset < size) + // { + // const char * const nstr = cstr + offset; + // size_t nsize = size - offset; + // if(-1 == (ret = filterSpecialChars(nstr, nsize, len)) || 0 == len || len > nsize) + // { + // LogFatal("str[%s] illegal.", cstr); + // return false; + // } + // subs.assign(nstr, len); + // if(!ret) + // { + // res.push_back(subs); + // } + // else + // { + // unico.clear(); + // if(!TransCode::decode(subs, unico)) + // { + // LogFatal("str[%s] decode failed.", subs.c_str()); + // return false; + // } + // cut(unico.begin(), unico.end(), res); + // } + // offset += len; + // } + // return true; + //} public: /* @@ -68,22 +122,22 @@ namespace CppJieba * else count the NO SPECIAL_CHARS string's length and return 1; * if errors, return -1; * */ - static int filterSpecialChars(const char* str, size_t len, size_t& resLen) - { - if(!str || !len) - { - return -1; - } + //static int filterSpecialChars(const char* str, size_t len, size_t& resLen) + //{ + // if(!str || !len) + // { + // return -1; + // } - resLen = 1; - int flag = (strchr(SPECIAL_CHARS, *str) ? 0: 1); - for(size_t i = 1; i < len && bool(flag) != bool(strchr(SPECIAL_CHARS, str[i])); i++) - { - resLen ++; - } - return flag; + // resLen = 1; + // int flag = (strchr(SPECIAL_CHARS, *str) ? 0: 1); + // for(size_t i = 1; i < len && bool(flag) != bool(strchr(SPECIAL_CHARS, str[i])); i++) + // { + // resLen ++; + // } + // return flag; - } + //} /* * if char is ascii, count the ascii string's length and return 0;