mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
use filterSpecialChars in segmentbase.hpp
This commit is contained in:
parent
59dae88689
commit
76d640b26e
@ -11,6 +11,9 @@
|
||||
namespace CppJieba
|
||||
{
|
||||
using namespace Limonp;
|
||||
|
||||
const char* const SPECIAL_CHARS = " \t\n";
|
||||
|
||||
class SegmentBase: public ISegment, public InitOnOff
|
||||
{
|
||||
public:
|
||||
@ -24,14 +27,6 @@ namespace CppJieba
|
||||
assert(_getInitFlag());
|
||||
Unicode unico;
|
||||
res.clear();
|
||||
#ifdef NO_FILTER
|
||||
if(!TransCode::decode(str, unico))
|
||||
{
|
||||
LogFatal("str[%s] decode failed.", str.c_str());
|
||||
return false;
|
||||
}
|
||||
return cut(unico.begin(), unico.end(), res);
|
||||
#else
|
||||
const char * const cstr = str.c_str();
|
||||
size_t size = str.size();
|
||||
size_t offset = 0;
|
||||
@ -42,7 +37,7 @@ namespace CppJieba
|
||||
{
|
||||
const char * const nstr = cstr + offset;
|
||||
size_t nsize = size - offset;
|
||||
if(-1 == (ret = filterAscii(nstr, nsize, len)) || 0 == len || len > nsize)
|
||||
if(-1 == (ret = filterSpecialChars(nstr, nsize, len)) || 0 == len || len > nsize)
|
||||
{
|
||||
LogFatal("str[%s] illegal.", cstr);
|
||||
return false;
|
||||
@ -65,10 +60,31 @@ namespace CppJieba
|
||||
offset += len;
|
||||
}
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
public:
|
||||
|
||||
/*
|
||||
* if char is SPECIAL_CHARS, count the SPECITAL_CHARS string's length and return 0;
|
||||
* else count the NO SPECIAL_CHARS string's length and return 1;
|
||||
* if errors, return -1;
|
||||
* */
|
||||
static int filterSpecialChars(const char* str, size_t len, size_t& resLen)
|
||||
{
|
||||
if(!str || !len)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
resLen = 1;
|
||||
int flag = (strchr(SPECIAL_CHARS, *str) ? 0: 1);
|
||||
for(size_t i = 1; i < len && bool(flag) != bool(strchr(SPECIAL_CHARS, str[i])); i++)
|
||||
{
|
||||
resLen ++;
|
||||
}
|
||||
return flag;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* if char is ascii, count the ascii string's length and return 0;
|
||||
* else count the nonascii string's length and return 1;
|
||||
|
1
test/testdata/testlines.utf8
vendored
1
test/testdata/testlines.utf8
vendored
@ -7,3 +7,4 @@
|
||||
我来到南京市长江大桥
|
||||
请在一米线外等候
|
||||
人事处女干事
|
||||
去医院做B超,叫号123
|
||||
|
Loading…
x
Reference in New Issue
Block a user