add unicode decoding unittest

This commit is contained in:
yanyiwu 2016-04-18 14:37:17 +08:00
parent 6fa843b527
commit 63e9c94fb7
4 changed files with 52 additions and 1 deletions

View File

@ -2,6 +2,7 @@
#define CPPJIEBA_PRE_FILTER_H
#include "Trie.hpp"
#include "limonp/Logging.hpp"
namespace cppjieba {
@ -16,7 +17,9 @@ class PreFilter {
// Builds a filter over `sentence`, using `symbols` as the rune set stored in
// symbols_. The sentence is decoded into sentence_ exactly once; a decode
// failure is logged rather than silently ignored. cursor_ is then positioned
// at the beginning of whatever sentence_ holds after decoding.
PreFilter(const unordered_set<Rune>& symbols,
const string& sentence)
: symbols_(symbols) {
  // Single, checked decode: a preceding unchecked call with the same
  // arguments would only repeat the work and hide the failure.
  if (!DecodeRunesInString(sentence, sentence_)) {
    XLOG(ERROR) << "decode string: " << sentence << " failed";
  }
  cursor_ = sentence_.begin();
}
~PreFilter() {

View File

@ -38,6 +38,9 @@ struct RuneStr {
}
}; // struct RuneStr
// Streams a RuneStr as a JSON-like object with three fields: "rune" (value
// written inside double quotes), "offset" and "len" (both written unquoted).
// Returns the same stream so calls can be chained.
inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
  os << "{\"rune\": \"" << r.rune;
  os << "\", \"offset\": " << r.offset;
  os << ", \"len\": " << r.len << "}";
  return os;
}
typedef limonp::LocalVector<Rune> Unicode;
typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
@ -132,6 +135,7 @@ inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes)
for (size_t i = 0; i < len;) {
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
if (rp.len == 0) {
runes.clear();
return false;
}
RuneStr x(rp.rune, i, rp.len);

View File

@ -13,6 +13,7 @@ ADD_EXECUTABLE(test.run
pos_tagger_test.cpp
jieba_test.cpp
pre_filter_test.cpp
unicode_test.cpp
)
# Link the test.run executable against the gtest and pthread libraries.
TARGET_LINK_LIBRARIES(test.run gtest pthread)

View File

@ -0,0 +1,43 @@
#include "cppjieba/Unicode.hpp"
#include "limonp/StdExtension.hpp"
#include "gtest/gtest.h"
using namespace cppjieba;
using namespace std;
// Decoding a well-formed four-character string must succeed and yield one
// entry per character; the expected dump shows each character taking 3 bytes
// (offsets 0, 3, 6, 9).
TEST(UnicodeTest, Test1) {
  const string input = "你好世界";
  const string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]";

  RuneStrArray decoded;
  ASSERT_TRUE(DecodeRunesInString(input, decoded));

  // Serialize the decoded runes to text so the whole array is compared at once.
  string dumped;
  dumped << decoded;
  ASSERT_EQ(expected, dumped);
}
// Input ending in the byte 0x80 must be rejected, and the output array must
// be left empty (it serializes as "[]") even though the leading "123" is
// decodable on its own.
TEST(UnicodeTest, Illegal) {
  const string input = "123\x80";
  const string expected = "[]";

  RuneStrArray decoded;
  ASSERT_FALSE(DecodeRunesInString(input, decoded));

  string dumped;
  dumped << decoded;
  ASSERT_EQ(expected, dumped);
}
// Fuzz-style smoke test: feeds ITERATION pseudo-random byte strings of length
// < MAX_LEN to DecodeRunesInString. Arbitrary bytes may or may not decode, so
// either outcome is accepted, but a failed decode must leave the output array
// empty (the same contract UnicodeTest.Illegal checks for a fixed input).
TEST(UnicodeTest, Rand) {
  const size_t ITERATION = 1024;
  const size_t MAX_LEN = 256;

  // Keep the wall-clock seed in a variable so it can be reported on failure;
  // without it a failing random input could never be replayed.
  const unsigned int seed = static_cast<unsigned int>(time(NULL));
  srand(seed);

  string s;
  for (size_t i = 0; i < ITERATION; i++) {
    size_t len = rand() % MAX_LEN;
    s.resize(len);
    // Overwrite `len` randomly chosen positions; positions that are not hit
    // keep whatever the buffer held before. When len == 0 the loop body never
    // runs, so the modulo below is never evaluated with a zero divisor.
    for (size_t j = 0; j < len; j++) {
      s[rand() % len] = static_cast<char>(rand());
    }
    RuneStrArray runes;
    const bool ok = DecodeRunesInString(s, runes);
    ASSERT_TRUE(ok || runes.size() == 0)
        << "failed decode left partial output; seed=" << seed
        << ", iteration=" << i;
  }
}