mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add unicode decoding unittest
This commit is contained in:
parent
6fa843b527
commit
63e9c94fb7
@ -2,6 +2,7 @@
|
|||||||
#define CPPJIEBA_PRE_FILTER_H
|
#define CPPJIEBA_PRE_FILTER_H
|
||||||
|
|
||||||
#include "Trie.hpp"
|
#include "Trie.hpp"
|
||||||
|
#include "limonp/Logging.hpp"
|
||||||
|
|
||||||
namespace cppjieba {
|
namespace cppjieba {
|
||||||
|
|
||||||
@ -16,7 +17,9 @@ class PreFilter {
|
|||||||
PreFilter(const unordered_set<Rune>& symbols,
|
PreFilter(const unordered_set<Rune>& symbols,
|
||||||
const string& sentence)
|
const string& sentence)
|
||||||
: symbols_(symbols) {
|
: symbols_(symbols) {
|
||||||
DecodeRunesInString(sentence, sentence_);
|
if (!DecodeRunesInString(sentence, sentence_)) {
|
||||||
|
XLOG(ERROR) << "decode string: " << sentence << " failed";
|
||||||
|
}
|
||||||
cursor_ = sentence_.begin();
|
cursor_ = sentence_.begin();
|
||||||
}
|
}
|
||||||
~PreFilter() {
|
~PreFilter() {
|
||||||
|
@ -38,6 +38,9 @@ struct RuneStr {
|
|||||||
}
|
}
|
||||||
}; // struct RuneStr
|
}; // struct RuneStr
|
||||||
|
|
||||||
|
inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
|
||||||
|
return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
|
||||||
|
}
|
||||||
|
|
||||||
typedef limonp::LocalVector<Rune> Unicode;
|
typedef limonp::LocalVector<Rune> Unicode;
|
||||||
typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
|
typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
|
||||||
@ -132,6 +135,7 @@ inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes)
|
|||||||
for (size_t i = 0; i < len;) {
|
for (size_t i = 0; i < len;) {
|
||||||
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
|
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
|
||||||
if (rp.len == 0) {
|
if (rp.len == 0) {
|
||||||
|
runes.clear();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
RuneStr x(rp.rune, i, rp.len);
|
RuneStr x(rp.rune, i, rp.len);
|
||||||
|
@ -13,6 +13,7 @@ ADD_EXECUTABLE(test.run
|
|||||||
pos_tagger_test.cpp
|
pos_tagger_test.cpp
|
||||||
jieba_test.cpp
|
jieba_test.cpp
|
||||||
pre_filter_test.cpp
|
pre_filter_test.cpp
|
||||||
|
unicode_test.cpp
|
||||||
)
|
)
|
||||||
TARGET_LINK_LIBRARIES(test.run gtest pthread)
|
TARGET_LINK_LIBRARIES(test.run gtest pthread)
|
||||||
|
|
||||||
|
43
test/unittest/unicode_test.cpp
Normal file
43
test/unittest/unicode_test.cpp
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
#include "cppjieba/Unicode.hpp"
|
||||||
|
#include "limonp/StdExtension.hpp"
|
||||||
|
#include "gtest/gtest.h"
|
||||||
|
|
||||||
|
using namespace cppjieba;
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Decodes a four-character string and compares the dumped rune array with the
// expected text: four runes, each of length 3, at offsets 0, 3, 6 and 9.
TEST(UnicodeTest, Test1) {
  const string input = "你好世界";
  const string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]";

  RuneStrArray decoded;
  ASSERT_TRUE(DecodeRunesInString(input, decoded));

  // Serialize the decoded array into a string so it can be compared as text.
  string dumped;
  dumped << decoded;
  ASSERT_EQ(expected, dumped);
}
|
||||||
|
|
||||||
|
// A string ending in the byte 0x80 must be rejected by the decoder, and the
// output array must then dump as an empty list.
TEST(UnicodeTest, Illegal) {
  const string input = "123\x80";
  const string expected = "[]";

  RuneStrArray decoded;
  ASSERT_FALSE(DecodeRunesInString(input, decoded));

  // Serialize the (expected empty) array into a string for comparison.
  string dumped;
  dumped << decoded;
  ASSERT_EQ(expected, dumped);
}
|
||||||
|
|
||||||
|
// Fuzz-style smoke test: feeds ITERATION random byte strings (each shorter than
// MAX_LEN) to DecodeRunesInString. The decoder may accept or reject any given
// input; what is checked is that it does not crash and that a rejected input
// leaves the output array dumping as "[]", the same contract the Illegal test
// checks for a fixed input.
TEST(UnicodeTest, Rand) {
  const size_t ITERATION = 1024;
  const size_t MAX_LEN = 256;

  // Seed from the clock for variety, but keep the value so that it can be
  // reported on failure and the run replayed.
  const unsigned int seed = static_cast<unsigned int>(time(NULL));
  srand(seed);

  string s;
  for (size_t i = 0; i < ITERATION; i++) {
    const size_t len = rand() % MAX_LEN;
    s.resize(len);
    // Randomize every byte; the buffer is reused across iterations, so any
    // position left unwritten would carry stale data from the previous round.
    for (size_t j = 0; j < len; j++) {
      s[j] = static_cast<char>(rand());
    }

    RuneStrArray runes;
    if (!DecodeRunesInString(s, runes)) {
      string dumped;
      dumped << runes;
      ASSERT_EQ(string("[]"), dumped) << "seed=" << seed << " iteration=" << i;
    }
  }
}
|
Loading…
x
Reference in New Issue
Block a user