From 63e9c94fb784202ece11ca9f5dca9d7f42e91304 Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Mon, 18 Apr 2016 14:37:17 +0800 Subject: [PATCH] add unicode decoding unittest --- include/cppjieba/PreFilter.hpp | 5 +++- include/cppjieba/Unicode.hpp | 4 ++++ test/unittest/CMakeLists.txt | 1 + test/unittest/unicode_test.cpp | 43 ++++++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 test/unittest/unicode_test.cpp diff --git a/include/cppjieba/PreFilter.hpp b/include/cppjieba/PreFilter.hpp index 0d5b877..7d6bdee 100644 --- a/include/cppjieba/PreFilter.hpp +++ b/include/cppjieba/PreFilter.hpp @@ -2,6 +2,7 @@ #define CPPJIEBA_PRE_FILTER_H #include "Trie.hpp" +#include "limonp/Logging.hpp" namespace cppjieba { @@ -16,7 +17,9 @@ class PreFilter { PreFilter(const unordered_set& symbols, const string& sentence) : symbols_(symbols) { - DecodeRunesInString(sentence, sentence_); + if (!DecodeRunesInString(sentence, sentence_)) { + XLOG(ERROR) << "decode string: " << sentence << " failed"; + } cursor_ = sentence_.begin(); } ~PreFilter() { diff --git a/include/cppjieba/Unicode.hpp b/include/cppjieba/Unicode.hpp index 1f2aec2..22a9d83 100644 --- a/include/cppjieba/Unicode.hpp +++ b/include/cppjieba/Unicode.hpp @@ -38,6 +38,9 @@ struct RuneStr { } }; // struct RuneStr +inline std::ostream& operator << (std::ostream& os, const RuneStr& r) { + return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}"; +} typedef limonp::LocalVector Unicode; typedef limonp::LocalVector RuneStrArray; @@ -132,6 +135,7 @@ inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) for (size_t i = 0; i < len;) { RuneStrLite rp = DecodeRuneInString(s + i, len - i); if (rp.len == 0) { + runes.clear(); return false; } RuneStr x(rp.rune, i, rp.len); diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index 2655215..de3cf04 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -13,6 +13,7 @@ ADD_EXECUTABLE(test.run pos_tagger_test.cpp jieba_test.cpp pre_filter_test.cpp + unicode_test.cpp ) TARGET_LINK_LIBRARIES(test.run gtest pthread) diff --git a/test/unittest/unicode_test.cpp b/test/unittest/unicode_test.cpp new file mode 100644 index 0000000..a22096e --- /dev/null +++ b/test/unittest/unicode_test.cpp @@ -0,0 +1,43 @@ +#include "cppjieba/Unicode.hpp" +#include "limonp/StdExtension.hpp" +#include "gtest/gtest.h" + +using namespace cppjieba; +using namespace std; + +TEST(UnicodeTest, Test1) { + string s = "你好世界"; + RuneStrArray runes; + ASSERT_TRUE(DecodeRunesInString(s, runes)); + string actual; + string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]"; + actual << runes; + ASSERT_EQ(expected, actual); +} + +TEST(UnicodeTest, Illegal) { + string s = "123\x80"; + RuneStrArray runes; + ASSERT_FALSE(DecodeRunesInString(s, runes)); + string actual; + string expected = "[]"; + actual << runes; + ASSERT_EQ(expected, actual); +} + +TEST(UnicodeTest, Rand) { + const size_t ITERATION = 1024; + const size_t MAX_LEN = 256; + string s; + srand(time(NULL)); + + for (size_t i = 0; i < ITERATION; i++) { + size_t len = rand() % MAX_LEN; + s.resize(len); + for (size_t j = 0; j < len; j++) { + s[rand() % len] = rand(); + } + RuneStrArray runes; + DecodeRunesInString(s, runes); + } +}