Rewrite cut() to handle Chinese special symbols

This commit is contained in:
wyy 2014-04-19 11:25:13 +08:00
parent 3d6bade24f
commit a585471e76

View File

@ -12,55 +12,109 @@ namespace CppJieba
{ {
using namespace Limonp; using namespace Limonp;
const char* const SPECIAL_CHARS = " \t\n"; //const char* const SPECIAL_CHARS = " \t\n";
// Code points that act as separators when a string is segmented.
//   32u    -> ' '  (space)
//   9u     -> '\t' (horizontal tab)
//   10u    -> '\n' (line feed)
//   12290u -> U+3002 IDEOGRAPHIC FULL STOP
//   65292u -> U+FF0C FULLWIDTH COMMA
#ifndef CPPJIEBA_GBK
const uint32_t SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
#else
// Only the ASCII whitespace separators are listed when CPPJIEBA_GBK is defined.
// NOTE(review): presumably the two Chinese punctuation marks have different
// numeric values in a GBK build and so are left out here - confirm.
const uint32_t SPECIAL_SYMBOL[] = {32u, 9u, 10u};
#endif
class SegmentBase: public ISegment, public InitOnOff class SegmentBase: public ISegment, public InitOnOff
{ {
public: public:
SegmentBase(){}; SegmentBase(){_loadSpecialSymbols();};
virtual ~SegmentBase(){}; virtual ~SegmentBase(){};
private:
unordered_set<uint32_t> _specialSymbols;
private:
/*
 * Fill _specialSymbols with every entry of the SPECIAL_SYMBOL table.
 * The table must yield at least one symbol; this is enforced by an assert.
 * */
void _loadSpecialSymbols()
{
    const uint32_t* const tableBegin = SPECIAL_SYMBOL;
    const uint32_t* const tableEnd = tableBegin + sizeof(SPECIAL_SYMBOL) / sizeof(SPECIAL_SYMBOL[0]);
    // Range insert adds each table entry once; duplicates are ignored by the set.
    _specialSymbols.insert(tableBegin, tableEnd);
    assert(!_specialSymbols.empty());
}
public: public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0; virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res)const virtual bool cut(const string& str, vector<string>& res)const
{ {
assert(_getInitFlag()); assert(_getInitFlag());
Unicode unico;
Unicode unicode;
TransCode::decode(str, unicode);
res.clear(); res.clear();
const char * const cstr = str.c_str();
size_t size = str.size(); Unicode::const_iterator left = unicode.begin();
size_t offset = 0; Unicode::const_iterator right = unicode.begin();
string subs;
int ret; string oneword;
size_t len; while(right != unicode.end())
while(offset < size)
{ {
const char * const nstr = cstr + offset; if(isIn(_specialSymbols, *right))
size_t nsize = size - offset;
if(-1 == (ret = filterSpecialChars(nstr, nsize, len)) || 0 == len || len > nsize)
{ {
LogFatal("str[%s] illegal.", cstr); if(left != right)
return false; {
} cut(left, right, res);
subs.assign(nstr, len); }
if(!ret) TransCode::encode(right, right + 1, oneword);
{ res.push_back(oneword);
res.push_back(subs); right ++;
left = right;
} }
else else
{ {
unico.clear(); right ++;
if(!TransCode::decode(subs, unico))
{
LogFatal("str[%s] decode failed.", subs.c_str());
return false;
}
cut(unico.begin(), unico.end(), res);
} }
offset += len;
} }
if(left != right)
{
cut(left, right, res);
}
return true; return true;
} }
//virtual bool cut(const string& str, vector<string>& res)const
//{
// assert(_getInitFlag());
// Unicode unico;
// res.clear();
// const char * const cstr = str.c_str();
// size_t size = str.size();
// size_t offset = 0;
// string subs;
// int ret;
// size_t len;
// while(offset < size)
// {
// const char * const nstr = cstr + offset;
// size_t nsize = size - offset;
// if(-1 == (ret = filterSpecialChars(nstr, nsize, len)) || 0 == len || len > nsize)
// {
// LogFatal("str[%s] illegal.", cstr);
// return false;
// }
// subs.assign(nstr, len);
// if(!ret)
// {
// res.push_back(subs);
// }
// else
// {
// unico.clear();
// if(!TransCode::decode(subs, unico))
// {
// LogFatal("str[%s] decode failed.", subs.c_str());
// return false;
// }
// cut(unico.begin(), unico.end(), res);
// }
// offset += len;
// }
// return true;
//}
public: public:
/* /*
@ -68,22 +122,22 @@ namespace CppJieba
* else count the NO SPECIAL_CHARS string's length and return 1; * else count the NO SPECIAL_CHARS string's length and return 1;
* if errors, return -1; * if errors, return -1;
* */ * */
static int filterSpecialChars(const char* str, size_t len, size_t& resLen) //static int filterSpecialChars(const char* str, size_t len, size_t& resLen)
{ //{
if(!str || !len) // if(!str || !len)
{ // {
return -1; // return -1;
} // }
resLen = 1; // resLen = 1;
int flag = (strchr(SPECIAL_CHARS, *str) ? 0: 1); // int flag = (strchr(SPECIAL_CHARS, *str) ? 0: 1);
for(size_t i = 1; i < len && bool(flag) != bool(strchr(SPECIAL_CHARS, str[i])); i++) // for(size_t i = 1; i < len && bool(flag) != bool(strchr(SPECIAL_CHARS, str[i])); i++)
{ // {
resLen ++; // resLen ++;
} // }
return flag; // return flag;
} //}
/* /*
* if char is ascii, count the ascii string's length and return 0; * if char is ascii, count the ascii string's length and return 0;