mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add unicode decoding unittest
This commit is contained in:
parent
6fa843b527
commit
63e9c94fb7
@ -2,6 +2,7 @@
|
||||
#define CPPJIEBA_PRE_FILTER_H
|
||||
|
||||
#include "Trie.hpp"
|
||||
#include "limonp/Logging.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
@ -16,7 +17,9 @@ class PreFilter {
|
||||
PreFilter(const unordered_set<Rune>& symbols,
|
||||
const string& sentence)
|
||||
: symbols_(symbols) {
|
||||
DecodeRunesInString(sentence, sentence_);
|
||||
if (!DecodeRunesInString(sentence, sentence_)) {
|
||||
XLOG(ERROR) << "decode string: " << sentence << " failed";
|
||||
}
|
||||
cursor_ = sentence_.begin();
|
||||
}
|
||||
~PreFilter() {
|
||||
|
@ -38,6 +38,9 @@ struct RuneStr {
|
||||
}
|
||||
}; // struct RuneStr
|
||||
|
||||
inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
|
||||
return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
|
||||
}
|
||||
|
||||
typedef limonp::LocalVector<Rune> Unicode;
|
||||
typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
|
||||
@ -132,6 +135,7 @@ inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes)
|
||||
for (size_t i = 0; i < len;) {
|
||||
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
|
||||
if (rp.len == 0) {
|
||||
runes.clear();
|
||||
return false;
|
||||
}
|
||||
RuneStr x(rp.rune, i, rp.len);
|
||||
|
@ -13,6 +13,7 @@ ADD_EXECUTABLE(test.run
|
||||
pos_tagger_test.cpp
|
||||
jieba_test.cpp
|
||||
pre_filter_test.cpp
|
||||
unicode_test.cpp
|
||||
)
|
||||
TARGET_LINK_LIBRARIES(test.run gtest pthread)
|
||||
|
||||
|
43
test/unittest/unicode_test.cpp
Normal file
43
test/unittest/unicode_test.cpp
Normal file
@ -0,0 +1,43 @@
|
||||
#include "cppjieba/Unicode.hpp"
|
||||
#include "limonp/StdExtension.hpp"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace cppjieba;
|
||||
using namespace std;
|
||||
|
||||
// Decodes a four-character Chinese string and compares the serialized rune
// array against the expected rune values, byte offsets and byte lengths.
TEST(UnicodeTest, Test1) {
  const string input = "你好世界";
  // One array element per decoded rune; adjacent literals are concatenated
  // by the compiler into a single expected string.
  const string expected =
      "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", "
      "\"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", "
      "\"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", "
      "\"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]";

  RuneStrArray decoded;
  ASSERT_TRUE(DecodeRunesInString(input, decoded));

  // Serialize the decoded runes into a string for comparison.
  string actual;
  actual << decoded;
  ASSERT_EQ(expected, actual);
}
|
||||
|
||||
// Decoding must fail on "123" followed by the byte 0x80, and the output
// array must serialize as an empty list afterwards.
TEST(UnicodeTest, Illegal) {
  const string input = "123\x80";
  const string expected = "[]";

  RuneStrArray decoded;
  ASSERT_FALSE(DecodeRunesInString(input, decoded));

  // Serialize whatever the failed decode left behind.
  string actual;
  actual << decoded;
  ASSERT_EQ(expected, actual);
}
|
||||
|
||||
// Fuzz-style test: feeds random byte strings of random length (0 to
// MAX_LEN - 1) to DecodeRunesInString. Besides checking that decoding does
// not crash, it verifies that a failed decode leaves the output array empty.
TEST(UnicodeTest, Rand) {
  const size_t ITERATION = 1024;
  const size_t MAX_LEN = 256;
  string s;
  // Time-based seed: every run exercises a different set of inputs.
  srand(time(NULL));

  for (size_t i = 0; i < ITERATION; i++) {
    const size_t len = rand() % MAX_LEN;
    s.resize(len);
    // Overwrite every byte so that no content from an earlier iteration
    // survives in the buffer.
    for (size_t j = 0; j < len; j++) {
      s[j] = static_cast<char>(rand());
    }
    RuneStrArray runes;
    const bool ok = DecodeRunesInString(s, runes);
    // Most random inputs are not valid UTF-8, so failure is expected; what
    // matters is that a failure never leaves partial results behind.
    ASSERT_TRUE(ok || runes.empty());
  }
}
|
Loading…
x
Reference in New Issue
Block a user