rewrite cut for chinese special symbol

This commit is contained in:
wyy 2014-04-19 11:25:13 +08:00
parent 3d6bade24f
commit a585471e76

View File

@ -12,55 +12,109 @@ namespace CppJieba
{
using namespace Limonp;
const char* const SPECIAL_CHARS = " \t\n";
//const char* const SPECIAL_CHARS = " \t\n";
#ifndef CPPJIEBA_GBK
const uint32_t SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
#else
const uint32_t SPECIAL_SYMBOL[] = {32u, 9u, 10u};
#endif
class SegmentBase: public ISegment, public InitOnOff
{
public:
SegmentBase(){};
SegmentBase(){_loadSpecialSymbols();};
virtual ~SegmentBase(){};
private:
unordered_set<uint32_t> _specialSymbols;
private:
void _loadSpecialSymbols()
{
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
for(size_t i = 0; i < size; i ++)
{
_specialSymbols.insert(SPECIAL_SYMBOL[i]);
}
assert(_specialSymbols.size());
}
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res)const
{
assert(_getInitFlag());
Unicode unico;
Unicode unicode;
TransCode::decode(str, unicode);
res.clear();
const char * const cstr = str.c_str();
size_t size = str.size();
size_t offset = 0;
string subs;
int ret;
size_t len;
while(offset < size)
Unicode::const_iterator left = unicode.begin();
Unicode::const_iterator right = unicode.begin();
string oneword;
while(right != unicode.end())
{
const char * const nstr = cstr + offset;
size_t nsize = size - offset;
if(-1 == (ret = filterSpecialChars(nstr, nsize, len)) || 0 == len || len > nsize)
if(isIn(_specialSymbols, *right))
{
LogFatal("str[%s] illegal.", cstr);
return false;
if(left != right)
{
cut(left, right, res);
}
subs.assign(nstr, len);
if(!ret)
{
res.push_back(subs);
TransCode::encode(right, right + 1, oneword);
res.push_back(oneword);
right ++;
left = right;
}
else
{
unico.clear();
if(!TransCode::decode(subs, unico))
right ++;
}
}
if(left != right)
{
LogFatal("str[%s] decode failed.", subs.c_str());
return false;
}
cut(unico.begin(), unico.end(), res);
}
offset += len;
cut(left, right, res);
}
return true;
}
//virtual bool cut(const string& str, vector<string>& res)const
//{
// assert(_getInitFlag());
// Unicode unico;
// res.clear();
// const char * const cstr = str.c_str();
// size_t size = str.size();
// size_t offset = 0;
// string subs;
// int ret;
// size_t len;
// while(offset < size)
// {
// const char * const nstr = cstr + offset;
// size_t nsize = size - offset;
// if(-1 == (ret = filterSpecialChars(nstr, nsize, len)) || 0 == len || len > nsize)
// {
// LogFatal("str[%s] illegal.", cstr);
// return false;
// }
// subs.assign(nstr, len);
// if(!ret)
// {
// res.push_back(subs);
// }
// else
// {
// unico.clear();
// if(!TransCode::decode(subs, unico))
// {
// LogFatal("str[%s] decode failed.", subs.c_str());
// return false;
// }
// cut(unico.begin(), unico.end(), res);
// }
// offset += len;
// }
// return true;
//}
public:
/*
@ -68,22 +122,22 @@ namespace CppJieba
* else count the NO SPECIAL_CHARS string's length and return 1;
* if errors, return -1;
* */
static int filterSpecialChars(const char* str, size_t len, size_t& resLen)
{
if(!str || !len)
{
return -1;
}
//static int filterSpecialChars(const char* str, size_t len, size_t& resLen)
//{
// if(!str || !len)
// {
// return -1;
// }
resLen = 1;
int flag = (strchr(SPECIAL_CHARS, *str) ? 0: 1);
for(size_t i = 1; i < len && bool(flag) != bool(strchr(SPECIAL_CHARS, str[i])); i++)
{
resLen ++;
}
return flag;
// resLen = 1;
// int flag = (strchr(SPECIAL_CHARS, *str) ? 0: 1);
// for(size_t i = 1; i < len && bool(flag) != bool(strchr(SPECIAL_CHARS, str[i])); i++)
// {
// resLen ++;
// }
// return flag;
}
//}
/*
* if char is ascii, count the ascii string's length and return 0;