add filterAscii

This commit is contained in:
wyy 2013-12-06 06:01:45 -08:00
parent 1bdce8904f
commit 5692220756
2 changed files with 54 additions and 12 deletions

View File

@ -99,6 +99,36 @@ namespace CppJieba
}
};
/*
 * Measure the leading run of bytes in str[0..len) that belong to the same
 * class as str[0]: ASCII (high bit clear) or non-ASCII (high bit set).
 *
 * str    - input buffer; only the first len bytes are ever read
 * len    - number of readable bytes in str
 * resLen - out: length of the leading run (>= 1 on success, 0 on error)
 *
 * Returns 0 if the run is ASCII, 1 if it is non-ASCII,
 * and -1 if str is null or len is 0.
 * */
inline int filterAscii(const char* str, uint len, uint& resLen)
{
    if(!str || !len)
    {
        // Give the out-parameter a defined value so that a caller who
        // ignores the return code never reads an indeterminate length.
        resLen = 0;
        return -1;
    }
    // Test the high bit through unsigned char so the result does not depend
    // on whether plain char is signed on this platform.
    const unsigned char highBit = 0x80;
    const bool nonAscii = (static_cast<unsigned char>(str[0]) & highBit) != 0;
    resLen = 1;
    // Extend the run while the following bytes are in the same class as the first.
    while(resLen < len
          && ((static_cast<unsigned char>(str[resLen]) & highBit) != 0) == nonAscii)
    {
        ++resLen;
    }
    return nonAscii ? 1 : 0;
}
}
#endif

View File

@ -1,26 +1,38 @@
#include <ChineseFilter.h>
#include "../src/ChineseFilter.hpp"
#ifdef UT
using namespace CppJieba;
/*
 * Test driver: reads the file named by argv[1] line by line, splits each
 * line into consecutive ASCII / non-ASCII runs with filterAscii, and prints
 * one "<run>,<flag>,<length>" record per run.
 *
 * Returns 0 on success, 1 when no input path is given or the file cannot
 * be opened.
 * */
int main(int argc, char** argv)
{
    if(argc < 2)
    {
        // argv[1] would be a null pointer here; refuse instead of dereferencing it.
        cout<<"usage: <program> <input file>"<<endl;
        return 1;
    }
    //ChineseFilter chFilter;
    ifstream ifs(argv[1]);
    if(!ifs)
    {
        cout<<"cannot open input file: "<<argv[1]<<endl;
        return 1;
    }
    string line;
    string s;
    while(getline(ifs, line))
    {
        const char * str = line.c_str();
        uint size = line.size();
        uint offset = 0;
        while(offset < size)
        {
            uint len = 0;
            const char* t = str+offset;
            // Only size - offset bytes remain starting at t; passing the full
            // size would let the scan run past the end of the line buffer.
            int ret = filterAscii(t, size - offset, len);
            if(ret < 0 || !len)
            {
                // Not expected while offset < size, but never loop without progress.
                break;
            }
            s.assign(t, len);
            cout<<s<<","<<ret<<","<<len<<endl;
            offset += len;
        }
        //chFilter.feed(line);
        //for(ChineseFilter::iterator it = chFilter.begin(); it != chFilter.end(); it++)
        //{
        //    //cout<<__FILE__<<__LINE__<<endl;
        //    string tmp;
        //    TransCode::encode(it.begin, it.end, tmp);
        //    cout<<tmp<<endl;
        //}
    }
    return 0;
}
#endif