mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
use filterSpecialChars in segmentbase.hpp
This commit is contained in:
parent
59dae88689
commit
76d640b26e
@ -11,6 +11,9 @@
|
|||||||
namespace CppJieba
|
namespace CppJieba
|
||||||
{
|
{
|
||||||
using namespace Limonp;
|
using namespace Limonp;
|
||||||
|
|
||||||
|
// Set of separator bytes (space, tab, newline) that filterSpecialChars tests each input byte against.
const char* const SPECIAL_CHARS = " \t\n";
|
||||||
|
|
||||||
class SegmentBase: public ISegment, public InitOnOff
|
class SegmentBase: public ISegment, public InitOnOff
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
@ -24,14 +27,6 @@ namespace CppJieba
|
|||||||
assert(_getInitFlag());
|
assert(_getInitFlag());
|
||||||
Unicode unico;
|
Unicode unico;
|
||||||
res.clear();
|
res.clear();
|
||||||
#ifdef NO_FILTER
|
|
||||||
if(!TransCode::decode(str, unico))
|
|
||||||
{
|
|
||||||
LogFatal("str[%s] decode failed.", str.c_str());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return cut(unico.begin(), unico.end(), res);
|
|
||||||
#else
|
|
||||||
const char * const cstr = str.c_str();
|
const char * const cstr = str.c_str();
|
||||||
size_t size = str.size();
|
size_t size = str.size();
|
||||||
size_t offset = 0;
|
size_t offset = 0;
|
||||||
@ -42,7 +37,7 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
const char * const nstr = cstr + offset;
|
const char * const nstr = cstr + offset;
|
||||||
size_t nsize = size - offset;
|
size_t nsize = size - offset;
|
||||||
if(-1 == (ret = filterAscii(nstr, nsize, len)) || 0 == len || len > nsize)
|
if(-1 == (ret = filterSpecialChars(nstr, nsize, len)) || 0 == len || len > nsize)
|
||||||
{
|
{
|
||||||
LogFatal("str[%s] illegal.", cstr);
|
LogFatal("str[%s] illegal.", cstr);
|
||||||
return false;
|
return false;
|
||||||
@ -65,10 +60,31 @@ namespace CppJieba
|
|||||||
offset += len;
|
offset += len;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
/*
|
||||||
|
* if char is SPECIAL_CHARS, count the SPECITAL_CHARS string's length and return 0;
|
||||||
|
* else count the NO SPECIAL_CHARS string's length and return 1;
|
||||||
|
* if errors, return -1;
|
||||||
|
* */
|
||||||
|
static int filterSpecialChars(const char* str, size_t len, size_t& resLen)
|
||||||
|
{
|
||||||
|
if(!str || !len)
|
||||||
|
{
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
resLen = 1;
|
||||||
|
int flag = (strchr(SPECIAL_CHARS, *str) ? 0: 1);
|
||||||
|
for(size_t i = 1; i < len && bool(flag) != bool(strchr(SPECIAL_CHARS, str[i])); i++)
|
||||||
|
{
|
||||||
|
resLen ++;
|
||||||
|
}
|
||||||
|
return flag;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* if char is ascii, count the ascii string's length and return 0;
|
* if char is ascii, count the ascii string's length and return 0;
|
||||||
* else count the nonascii string's length and return 1;
|
* else count the nonascii string's length and return 1;
|
||||||
|
1
test/testdata/testlines.utf8
vendored
1
test/testdata/testlines.utf8
vendored
@ -7,3 +7,4 @@
|
|||||||
我来到南京市长江大桥
|
我来到南京市长江大桥
|
||||||
请在一米线外等候
|
请在一米线外等候
|
||||||
人事处女干事
|
人事处女干事
|
||||||
|
去医院做B超,叫号123
|
||||||
|
Loading…
x
Reference in New Issue
Block a user