/**
 * Measure the leading run of bytes in `str` that belong to the same class
 * (ASCII or non-ASCII) as the first byte.
 *
 * A byte is treated as non-ASCII when its high bit (0x80) is set. The run is
 * counted in bytes, not in characters.
 *
 * @param str     Buffer to scan; may contain embedded NUL bytes, since only
 *                `len` bounds the scan.
 * @param len     Number of readable bytes in `str`.
 * @param resLen  Receives the length of the leading run (1..len) on success,
 *                and 0 on error so callers never read a stale value.
 * @return 0 if the run is ASCII, 1 if it is non-ASCII, -1 if `str` is null
 *         or `len` is 0.
 */
inline int filterAscii(const char* str, uint len, uint& resLen)
{
    // Give the out-parameter a defined value before any early return.
    resLen = 0;
    if(!str || !len)
    {
        return -1;
    }

    // Test the high bit on unsigned bytes: storing 0x80 in a plain `char`
    // is an out-of-range conversion wherever `char` is signed.
    const unsigned char HIGH_BIT = 0x80;
    const bool nonAscii = (static_cast<unsigned char>(str[0]) & HIGH_BIT) != 0;

    // The first byte always belongs to the run; extend it while the
    // following bytes fall in the same class.
    resLen = 1;
    while(resLen < len
          && ((static_cast<unsigned char>(str[resLen]) & HIGH_BIT) != 0) == nonAscii)
    {
        ++resLen;
    }
    return nonAscii ? 1 : 0;
}