Track rune-based (unicode) offsets and lengths next to the existing byte offsets.

Word and RuneStr gain unicode_offset / unicode_length. DecodeRunesInString
numbers each decoded rune, and GetWordFromRunes derives a word's rune offset
and rune count from its first and last rune.

The pre-existing constructors now zero the new members: GetWordFromRunes reads
them unconditionally, so leaving them unset would propagate indeterminate
values for any RuneStr/Word built through the old signatures.

Note: leading whitespace of context lines was reconstructed; if the patch does
not apply cleanly, retry with `git apply --ignore-whitespace` or `patch -l`.
The post-image blob id on the index line predates the constructor fix.

diff --git a/include/cppjieba/Unicode.hpp b/include/cppjieba/Unicode.hpp
index 22a9d83..87de505 100644
--- a/include/cppjieba/Unicode.hpp
+++ b/include/cppjieba/Unicode.hpp
@@ -18,9 +18,14 @@ typedef uint32_t Rune;
 struct Word {
   string word;
   uint32_t offset;
+  uint32_t unicode_offset; // index of the word's first rune in the decoded rune sequence
+  uint32_t unicode_length; // number of runes covered by the word
   Word(const string& w, uint32_t o)
-   : word(w), offset(o) {
+   : word(w), offset(o), unicode_offset(0), unicode_length(0) { // rune info unknown here: zero it
   }
+  Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
+    : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
+  }
 }; // struct Word
 
 inline std::ostream& operator << (std::ostream& os, const Word& w) {
@@ -31,11 +36,16 @@ struct RuneStr {
   Rune rune;
   uint32_t offset;
   uint32_t len;
+  uint32_t unicode_offset; // index of this rune in the decoded rune sequence
+  uint32_t unicode_length; // rune count of this entry (DecodeRunesInString stores 1)
-  RuneStr(): rune(0), offset(0), len(0) {
+  RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) {
   }
   RuneStr(Rune r, uint32_t o, uint32_t l)
-    : rune(r), offset(o), len(l) {
+    : rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) { // rune info unknown here: zero it
   }
+  RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
+    : rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
+  }
 }; // struct RuneStr
 
 inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
@@ -132,15 +142,16 @@ inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
 inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
   runes.clear();
   runes.reserve(len / 2);
-  for (size_t i = 0; i < len;) {
+  for (size_t i = 0, j = 0; i < len;) { // i: byte offset into s, j: index of the rune being decoded
     RuneStrLite rp = DecodeRuneInString(s + i, len - i);
     if (rp.len == 0) {
       runes.clear();
       return false;
     }
-    RuneStr x(rp.rune, i, rp.len);
+    RuneStr x(rp.rune, i, rp.len, j, 1); // each entry is exactly one rune
     runes.push_back(x);
     i += rp.len;
+    ++j;
   }
   return true;
 }
@@ -182,7 +193,8 @@ inline Unicode DecodeRunesInString(const string& s) {
 inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
   assert(right->offset >= left->offset);
   uint32_t len = right->offset - left->offset + right->len;
-  return Word(s.substr(left->offset, len), left->offset);
+  uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length; // inclusive rune span [left, right]
+  return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length);
 }
 
 inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {