Refactor decoding functions to use UTF-8 compliant methods

Updated multiple files to rename the decoding helpers so that their names state the expected encoding explicitly: every instance of DecodeRunesInString is replaced with DecodeUTF8RunesInString, and the single-rune helper DecodeRuneInString is renamed to DecodeUTF8ToRune. This makes it clear at each call site that the input is treated as a UTF-8 encoded string. The rename is applied across the cppjieba library, including updates in the DictTrie, HMMModel, PosTagger, PreFilter, SegmentBase, and Unicode files. Additionally, the corresponding unit tests have been modified to call the new names.
This commit is contained in:
yanyiwu 2024-12-08 16:46:24 +08:00
parent 5ee74d788e
commit 42a93a4b98
8 changed files with 34 additions and 34 deletions

View File

@ -85,7 +85,7 @@ class DictTrie {
{ {
const DictUnit *tmp = NULL; const DictUnit *tmp = NULL;
RuneStrArray runes; RuneStrArray runes;
if (!DecodeRunesInString(word, runes)) if (!DecodeUTF8RunesInString(word, runes))
{ {
XLOG(ERROR) << "Decode failed."; XLOG(ERROR) << "Decode failed.";
} }
@ -197,7 +197,7 @@ class DictTrie {
const string& word, const string& word,
double weight, double weight,
const string& tag) { const string& tag) {
if (!DecodeRunesInString(word, node_info.word)) { if (!DecodeUTF8RunesInString(word, node_info.word)) {
XLOG(ERROR) << "Decode " << word << " failed."; XLOG(ERROR) << "Decode " << word << " failed.";
return false; return false;
} }

View File

@ -105,7 +105,7 @@ struct HMMModel {
XLOG(ERROR) << "emitProb illegal."; XLOG(ERROR) << "emitProb illegal.";
return false; return false;
} }
if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) { if (!DecodeUTF8RunesInString(tmp2[0], unicode) || unicode.size() != 1) {
XLOG(ERROR) << "TransCode failed."; XLOG(ERROR) << "TransCode failed.";
return false; return false;
} }

View File

@ -34,7 +34,7 @@ class PosTagger {
RuneStrArray runes; RuneStrArray runes;
const DictTrie * dict = segment.GetDictTrie(); const DictTrie * dict = segment.GetDictTrie();
assert(dict != NULL); assert(dict != NULL);
if (!DecodeRunesInString(str, runes)) { if (!DecodeUTF8RunesInString(str, runes)) {
XLOG(ERROR) << "Decode failed."; XLOG(ERROR) << "Decode failed.";
return POS_X; return POS_X;
} }

View File

@ -17,7 +17,7 @@ class PreFilter {
PreFilter(const unordered_set<Rune>& symbols, PreFilter(const unordered_set<Rune>& symbols,
const string& sentence) const string& sentence)
: symbols_(symbols) { : symbols_(symbols) {
if (!DecodeRunesInString(sentence, sentence_)) { if (!DecodeUTF8RunesInString(sentence, sentence_)) {
XLOG(ERROR) << "decode failed. "; XLOG(ERROR) << "decode failed. ";
} }
cursor_ = sentence_.begin(); cursor_ = sentence_.begin();

View File

@ -25,7 +25,7 @@ class SegmentBase {
bool ResetSeparators(const string& s) { bool ResetSeparators(const string& s) {
symbols_.clear(); symbols_.clear();
RuneStrArray runes; RuneStrArray runes;
if (!DecodeRunesInString(s, runes)) { if (!DecodeUTF8RunesInString(s, runes)) {
XLOG(ERROR) << "decode " << s << " failed"; XLOG(ERROR) << "decode " << s << " failed";
return false; return false;
} }

View File

@ -84,7 +84,7 @@ struct RuneStrLite {
} }
}; // struct RuneStrLite }; // struct RuneStrLite
inline RuneStrLite DecodeRuneInString(const char* str, size_t len) { inline RuneStrLite DecodeUTF8ToRune(const char* str, size_t len) {
RuneStrLite rp(0, 0); RuneStrLite rp(0, 0);
if (str == NULL || len == 0) { if (str == NULL || len == 0) {
return rp; return rp;
@ -139,11 +139,11 @@ inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
return rp; return rp;
} }
inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) { inline bool DecodeUTF8RunesInString(const char* s, size_t len, RuneStrArray& runes) {
runes.clear(); runes.clear();
runes.reserve(len / 2); runes.reserve(len / 2);
for (uint32_t i = 0, j = 0; i < len;) { for (uint32_t i = 0, j = 0; i < len;) {
RuneStrLite rp = DecodeRuneInString(s + i, len - i); RuneStrLite rp = DecodeUTF8ToRune(s + i, len - i);
if (rp.len == 0) { if (rp.len == 0) {
runes.clear(); runes.clear();
return false; return false;
@ -156,14 +156,14 @@ inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes)
return true; return true;
} }
inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) { inline bool DecodeUTF8RunesInString(const string& s, RuneStrArray& runes) {
return DecodeRunesInString(s.c_str(), s.size(), runes); return DecodeUTF8RunesInString(s.c_str(), s.size(), runes);
} }
inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) { inline bool DecodeUTF8RunesInString(const char* s, size_t len, Unicode& unicode) {
unicode.clear(); unicode.clear();
RuneStrArray runes; RuneStrArray runes;
if (!DecodeRunesInString(s, len, runes)) { if (!DecodeUTF8RunesInString(s, len, runes)) {
return false; return false;
} }
unicode.reserve(runes.size()); unicode.reserve(runes.size());
@ -174,17 +174,17 @@ inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
} }
inline bool IsSingleWord(const string& str) { inline bool IsSingleWord(const string& str) {
RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size()); RuneStrLite rp = DecodeUTF8ToRune(str.c_str(), str.size());
return rp.len == str.size(); return rp.len == str.size();
} }
inline bool DecodeRunesInString(const string& s, Unicode& unicode) { inline bool DecodeUTF8RunesInString(const string& s, Unicode& unicode) {
return DecodeRunesInString(s.c_str(), s.size(), unicode); return DecodeUTF8RunesInString(s.c_str(), s.size(), unicode);
} }
inline Unicode DecodeRunesInString(const string& s) { inline Unicode DecodeUTF8RunesInString(const string& s) {
Unicode result; Unicode result;
DecodeRunesInString(s, result); DecodeUTF8RunesInString(s, result);
return result; return result;
} }

View File

@ -15,7 +15,7 @@ TEST(TrieTest, Empty) {
TEST(TrieTest, Construct) { TEST(TrieTest, Construct) {
vector<Unicode> keys; vector<Unicode> keys;
vector<const DictUnit*> values; vector<const DictUnit*> values;
keys.push_back(DecodeRunesInString("")); keys.push_back(DecodeUTF8RunesInString(""));
values.push_back((const DictUnit*)(NULL)); values.push_back((const DictUnit*)(NULL));
Trie trie(keys, values); Trie trie(keys, values);
} }
@ -32,7 +32,7 @@ TEST(DictTrieTest, Test1) {
ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001); ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001);
string word("来到"); string word("来到");
cppjieba::RuneStrArray uni; cppjieba::RuneStrArray uni;
ASSERT_TRUE(DecodeRunesInString(word, uni)); ASSERT_TRUE(DecodeUTF8RunesInString(word, uni));
//DictUnit nodeInfo; //DictUnit nodeInfo;
//nodeInfo.word = uni; //nodeInfo.word = uni;
//nodeInfo.tag = "v"; //nodeInfo.tag = "v";
@ -52,13 +52,13 @@ TEST(DictTrieTest, Test1) {
LocalVector<pair<size_t, const DictUnit*> > res; LocalVector<pair<size_t, const DictUnit*> > res;
const char * words[] = {"", "清华", "清华大学"}; const char * words[] = {"", "清华", "清华大学"};
for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
ASSERT_TRUE(DecodeRunesInString(words[i], uni)); ASSERT_TRUE(DecodeUTF8RunesInString(words[i], uni));
res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end()))); res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end())));
//resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end()); //resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end());
} }
vector<pair<size_t, const DictUnit*> > vec; vector<pair<size_t, const DictUnit*> > vec;
vector<struct Dag> dags; vector<struct Dag> dags;
ASSERT_TRUE(DecodeRunesInString(word, uni)); ASSERT_TRUE(DecodeUTF8RunesInString(word, uni));
trie.Find(uni.begin(), uni.end(), dags); trie.Find(uni.begin(), uni.end(), dags);
ASSERT_EQ(dags.size(), uni.size()); ASSERT_EQ(dags.size(), uni.size());
ASSERT_NE(dags.size(), 0u); ASSERT_NE(dags.size(), 0u);
@ -72,20 +72,20 @@ TEST(DictTrieTest, UserDict) {
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
string word = "云计算"; string word = "云计算";
cppjieba::RuneStrArray unicode; cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode)); ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
const DictUnit * unit = trie.Find(unicode.begin(), unicode.end()); const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit != NULL); ASSERT_TRUE(unit != NULL);
ASSERT_NEAR(unit->weight, -14.100, 0.001); ASSERT_NEAR(unit->weight, -14.100, 0.001);
word = "蓝翔"; word = "蓝翔";
ASSERT_TRUE(DecodeRunesInString(word, unicode)); ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
unit = trie.Find(unicode.begin(), unicode.end()); unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit != NULL); ASSERT_TRUE(unit != NULL);
ASSERT_EQ(unit->tag, "nz"); ASSERT_EQ(unit->tag, "nz");
ASSERT_NEAR(unit->weight, -14.100, 0.001); ASSERT_NEAR(unit->weight, -14.100, 0.001);
word = "区块链"; word = "区块链";
ASSERT_TRUE(DecodeRunesInString(word, unicode)); ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
unit = trie.Find(unicode.begin(), unicode.end()); unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit != NULL); ASSERT_TRUE(unit != NULL);
ASSERT_EQ(unit->tag, "nz"); ASSERT_EQ(unit->tag, "nz");
@ -96,7 +96,7 @@ TEST(DictTrieTest, UserDictWithMaxWeight) {
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax); DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax);
string word = "云计算"; string word = "云计算";
cppjieba::RuneStrArray unicode; cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode)); ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
const DictUnit * unit = trie.Find(unicode.begin(), unicode.end()); const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit); ASSERT_TRUE(unit);
ASSERT_NEAR(unit->weight, -2.975, 0.001); ASSERT_NEAR(unit->weight, -2.975, 0.001);
@ -108,7 +108,7 @@ TEST(DictTrieTest, Dag) {
{ {
string word = "清华大学"; string word = "清华大学";
cppjieba::RuneStrArray unicode; cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode)); ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res; vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res); trie.Find(unicode.begin(), unicode.end(), res);
@ -122,7 +122,7 @@ TEST(DictTrieTest, Dag) {
{ {
string word = "北京邮电大学"; string word = "北京邮电大学";
cppjieba::RuneStrArray unicode; cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode)); ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res; vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res); trie.Find(unicode.begin(), unicode.end(), res);
@ -136,7 +136,7 @@ TEST(DictTrieTest, Dag) {
{ {
string word = "长江大桥"; string word = "长江大桥";
cppjieba::RuneStrArray unicode; cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode)); ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res; vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res); trie.Find(unicode.begin(), unicode.end(), res);
@ -150,7 +150,7 @@ TEST(DictTrieTest, Dag) {
{ {
string word = "长江大桥"; string word = "长江大桥";
cppjieba::RuneStrArray unicode; cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode)); ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res; vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res, 3); trie.Find(unicode.begin(), unicode.end(), res, 3);
@ -164,7 +164,7 @@ TEST(DictTrieTest, Dag) {
{ {
string word = "长江大桥"; string word = "长江大桥";
cppjieba::RuneStrArray unicode; cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode)); ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res; vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res, 4); trie.Find(unicode.begin(), unicode.end(), res, 4);

View File

@ -8,7 +8,7 @@ using namespace std;
TEST(UnicodeTest, Test1) { TEST(UnicodeTest, Test1) {
string s = "你好世界"; string s = "你好世界";
RuneStrArray runes; RuneStrArray runes;
ASSERT_TRUE(DecodeRunesInString(s, runes)); ASSERT_TRUE(DecodeUTF8RunesInString(s, runes));
string actual; string actual;
string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]"; string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]";
actual << runes; actual << runes;
@ -18,7 +18,7 @@ TEST(UnicodeTest, Test1) {
TEST(UnicodeTest, Illegal) { TEST(UnicodeTest, Illegal) {
string s = "123\x80"; string s = "123\x80";
RuneStrArray runes; RuneStrArray runes;
ASSERT_FALSE(DecodeRunesInString(s, runes)); ASSERT_FALSE(DecodeUTF8RunesInString(s, runes));
string actual; string actual;
string expected = "[]"; string expected = "[]";
actual << runes; actual << runes;
@ -38,6 +38,6 @@ TEST(UnicodeTest, Rand) {
s[rand() % len] = rand(); s[rand() % len] = rand();
} }
RuneStrArray runes; RuneStrArray runes;
DecodeRunesInString(s, runes); DecodeUTF8RunesInString(s, runes);
} }
} }