Rename decoding functions to make their UTF-8 input encoding explicit

Renamed DecodeRunesInString to DecodeUTF8RunesInString (all overloads) and DecodeRuneInString to DecodeUTF8ToRune, and updated every call site across the cppjieba library: DictTrie, HMMModel, PosTagger, PreFilter, SegmentBase, and Unicode. The new names make it explicit that these functions expect UTF-8 encoded input, which makes misuse with other encodings easier to spot. This is a pure rename (34 lines added, 34 removed); the decoding logic itself is unchanged. The corresponding unit tests have been updated to use the new names.
This commit is contained in:
yanyiwu 2024-12-08 16:46:24 +08:00
parent 5ee74d788e
commit 42a93a4b98
8 changed files with 34 additions and 34 deletions

View File

@ -85,7 +85,7 @@ class DictTrie {
{
const DictUnit *tmp = NULL;
RuneStrArray runes;
if (!DecodeRunesInString(word, runes))
if (!DecodeUTF8RunesInString(word, runes))
{
XLOG(ERROR) << "Decode failed.";
}
@ -197,7 +197,7 @@ class DictTrie {
const string& word,
double weight,
const string& tag) {
if (!DecodeRunesInString(word, node_info.word)) {
if (!DecodeUTF8RunesInString(word, node_info.word)) {
XLOG(ERROR) << "Decode " << word << " failed.";
return false;
}

View File

@ -105,7 +105,7 @@ struct HMMModel {
XLOG(ERROR) << "emitProb illegal.";
return false;
}
if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
if (!DecodeUTF8RunesInString(tmp2[0], unicode) || unicode.size() != 1) {
XLOG(ERROR) << "TransCode failed.";
return false;
}

View File

@ -34,7 +34,7 @@ class PosTagger {
RuneStrArray runes;
const DictTrie * dict = segment.GetDictTrie();
assert(dict != NULL);
if (!DecodeRunesInString(str, runes)) {
if (!DecodeUTF8RunesInString(str, runes)) {
XLOG(ERROR) << "Decode failed.";
return POS_X;
}

View File

@ -17,7 +17,7 @@ class PreFilter {
PreFilter(const unordered_set<Rune>& symbols,
const string& sentence)
: symbols_(symbols) {
if (!DecodeRunesInString(sentence, sentence_)) {
if (!DecodeUTF8RunesInString(sentence, sentence_)) {
XLOG(ERROR) << "decode failed. ";
}
cursor_ = sentence_.begin();

View File

@ -25,7 +25,7 @@ class SegmentBase {
bool ResetSeparators(const string& s) {
symbols_.clear();
RuneStrArray runes;
if (!DecodeRunesInString(s, runes)) {
if (!DecodeUTF8RunesInString(s, runes)) {
XLOG(ERROR) << "decode " << s << " failed";
return false;
}

View File

@ -84,7 +84,7 @@ struct RuneStrLite {
}
}; // struct RuneStrLite
inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
inline RuneStrLite DecodeUTF8ToRune(const char* str, size_t len) {
RuneStrLite rp(0, 0);
if (str == NULL || len == 0) {
return rp;
@ -139,11 +139,11 @@ inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
return rp;
}
inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
inline bool DecodeUTF8RunesInString(const char* s, size_t len, RuneStrArray& runes) {
runes.clear();
runes.reserve(len / 2);
for (uint32_t i = 0, j = 0; i < len;) {
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
RuneStrLite rp = DecodeUTF8ToRune(s + i, len - i);
if (rp.len == 0) {
runes.clear();
return false;
@ -156,14 +156,14 @@ inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes)
return true;
}
inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
return DecodeRunesInString(s.c_str(), s.size(), runes);
inline bool DecodeUTF8RunesInString(const string& s, RuneStrArray& runes) {
return DecodeUTF8RunesInString(s.c_str(), s.size(), runes);
}
inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
inline bool DecodeUTF8RunesInString(const char* s, size_t len, Unicode& unicode) {
unicode.clear();
RuneStrArray runes;
if (!DecodeRunesInString(s, len, runes)) {
if (!DecodeUTF8RunesInString(s, len, runes)) {
return false;
}
unicode.reserve(runes.size());
@ -174,17 +174,17 @@ inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
}
inline bool IsSingleWord(const string& str) {
RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
RuneStrLite rp = DecodeUTF8ToRune(str.c_str(), str.size());
return rp.len == str.size();
}
inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
return DecodeRunesInString(s.c_str(), s.size(), unicode);
inline bool DecodeUTF8RunesInString(const string& s, Unicode& unicode) {
return DecodeUTF8RunesInString(s.c_str(), s.size(), unicode);
}
inline Unicode DecodeRunesInString(const string& s) {
inline Unicode DecodeUTF8RunesInString(const string& s) {
Unicode result;
DecodeRunesInString(s, result);
DecodeUTF8RunesInString(s, result);
return result;
}

View File

@ -15,7 +15,7 @@ TEST(TrieTest, Empty) {
TEST(TrieTest, Construct) {
vector<Unicode> keys;
vector<const DictUnit*> values;
keys.push_back(DecodeRunesInString(""));
keys.push_back(DecodeUTF8RunesInString(""));
values.push_back((const DictUnit*)(NULL));
Trie trie(keys, values);
}
@ -32,7 +32,7 @@ TEST(DictTrieTest, Test1) {
ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001);
string word("来到");
cppjieba::RuneStrArray uni;
ASSERT_TRUE(DecodeRunesInString(word, uni));
ASSERT_TRUE(DecodeUTF8RunesInString(word, uni));
//DictUnit nodeInfo;
//nodeInfo.word = uni;
//nodeInfo.tag = "v";
@ -52,13 +52,13 @@ TEST(DictTrieTest, Test1) {
LocalVector<pair<size_t, const DictUnit*> > res;
const char * words[] = {"", "清华", "清华大学"};
for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
ASSERT_TRUE(DecodeRunesInString(words[i], uni));
ASSERT_TRUE(DecodeUTF8RunesInString(words[i], uni));
res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end())));
//resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end());
}
vector<pair<size_t, const DictUnit*> > vec;
vector<struct Dag> dags;
ASSERT_TRUE(DecodeRunesInString(word, uni));
ASSERT_TRUE(DecodeUTF8RunesInString(word, uni));
trie.Find(uni.begin(), uni.end(), dags);
ASSERT_EQ(dags.size(), uni.size());
ASSERT_NE(dags.size(), 0u);
@ -72,20 +72,20 @@ TEST(DictTrieTest, UserDict) {
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
string word = "云计算";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit != NULL);
ASSERT_NEAR(unit->weight, -14.100, 0.001);
word = "蓝翔";
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit != NULL);
ASSERT_EQ(unit->tag, "nz");
ASSERT_NEAR(unit->weight, -14.100, 0.001);
word = "区块链";
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit != NULL);
ASSERT_EQ(unit->tag, "nz");
@ -96,7 +96,7 @@ TEST(DictTrieTest, UserDictWithMaxWeight) {
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax);
string word = "云计算";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit);
ASSERT_NEAR(unit->weight, -2.975, 0.001);
@ -108,7 +108,7 @@ TEST(DictTrieTest, Dag) {
{
string word = "清华大学";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res);
@ -122,7 +122,7 @@ TEST(DictTrieTest, Dag) {
{
string word = "北京邮电大学";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res);
@ -136,7 +136,7 @@ TEST(DictTrieTest, Dag) {
{
string word = "长江大桥";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res);
@ -150,7 +150,7 @@ TEST(DictTrieTest, Dag) {
{
string word = "长江大桥";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res, 3);
@ -164,7 +164,7 @@ TEST(DictTrieTest, Dag) {
{
string word = "长江大桥";
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
ASSERT_TRUE(DecodeUTF8RunesInString(word, unicode));
vector<struct Dag> res;
trie.Find(unicode.begin(), unicode.end(), res, 4);

View File

@ -8,7 +8,7 @@ using namespace std;
TEST(UnicodeTest, Test1) {
string s = "你好世界";
RuneStrArray runes;
ASSERT_TRUE(DecodeRunesInString(s, runes));
ASSERT_TRUE(DecodeUTF8RunesInString(s, runes));
string actual;
string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]";
actual << runes;
@ -18,7 +18,7 @@ TEST(UnicodeTest, Test1) {
TEST(UnicodeTest, Illegal) {
string s = "123\x80";
RuneStrArray runes;
ASSERT_FALSE(DecodeRunesInString(s, runes));
ASSERT_FALSE(DecodeUTF8RunesInString(s, runes));
string actual;
string expected = "[]";
actual << runes;
@ -38,6 +38,6 @@ TEST(UnicodeTest, Rand) {
s[rand() % len] = rand();
}
RuneStrArray runes;
DecodeRunesInString(s, runes);
DecodeUTF8RunesInString(s, runes);
}
}