Merge pull request #79 from royguo/master

Add Unicode offset/length support for `Word`
This commit is contained in:
Yanyi Wu 2016-10-18 23:02:01 +08:00 committed by GitHub
commit e5d9eb8816

View File

@ -18,9 +18,14 @@ typedef uint32_t Rune;
// A segmented word plus where it was found in the source string.
//
// `offset` is the offset handed in by the caller (GetWordFromRunes passes the
// byte offset of the first rune). `unicode_offset` and `unicode_length` hold
// the same position and the word's length counted in runes instead of bytes.
struct Word {
  std::string word;
  uint32_t offset;
  uint32_t unicode_offset;
  uint32_t unicode_length;

  // Legacy constructor for callers that only know the byte offset. The
  // rune-based fields are zeroed so they never hold indeterminate values.
  Word(const std::string& w, uint32_t o)
    : word(w), offset(o), unicode_offset(0), unicode_length(0) {
  }

  // Full constructor: byte offset plus rune-based offset and length.
  Word(const std::string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
    : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
  }
}; // struct Word
inline std::ostream& operator << (std::ostream& os, const Word& w) { inline std::ostream& operator << (std::ostream& os, const Word& w) {
@ -31,11 +36,16 @@ struct RuneStr {
Rune rune; Rune rune;
uint32_t offset; uint32_t offset;
uint32_t len; uint32_t len;
uint32_t unicode_offset;
uint32_t unicode_length;
// Default constructor: every field, including the rune-based offset and
// length, starts at zero so no member is left indeterminate.
RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) {
}
// Legacy constructor taking only the rune, its offset and its length.
// The rune-based fields are zeroed rather than left uninitialized.
RuneStr(Rune r, uint32_t o, uint32_t l)
  : rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) {
}
// Full constructor: stores the rune with its offset/length (o, l) and its
// rune-based offset/length (unicode_offset, unicode_length).
RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
  : rune(r),
    offset(o),
    len(l),
    unicode_offset(unicode_offset),
    unicode_length(unicode_length) {
}
}; // struct RuneStr }; // struct RuneStr
inline std::ostream& operator << (std::ostream& os, const RuneStr& r) { inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
@ -132,15 +142,16 @@ inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) { inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
runes.clear(); runes.clear();
runes.reserve(len / 2); runes.reserve(len / 2);
for (size_t i = 0; i < len;) { for (size_t i = 0, j = 0; i < len;) {
RuneStrLite rp = DecodeRuneInString(s + i, len - i); RuneStrLite rp = DecodeRuneInString(s + i, len - i);
if (rp.len == 0) { if (rp.len == 0) {
runes.clear(); runes.clear();
return false; return false;
} }
RuneStr x(rp.rune, i, rp.len); RuneStr x(rp.rune, i, rp.len, j, 1);
runes.push_back(x); runes.push_back(x);
i += rp.len; i += rp.len;
++j;
} }
return true; return true;
} }
@ -182,7 +193,8 @@ inline Unicode DecodeRunesInString(const string& s) {
// Builds the Word spanning the runes from `left` through `right`, both
// inclusive (the length of `right` itself is counted).
//
// The word text is cut from `s` using the runes' byte offsets; the Word also
// carries the rune-based offset of `left` and the rune count of the span.
// Asserts that `right` does not start before `left`.
inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
  assert(right->offset >= left->offset);

  const uint32_t byte_begin = left->offset;
  const uint32_t byte_count = right->offset - byte_begin + right->len;

  const uint32_t rune_begin = left->unicode_offset;
  const uint32_t rune_count = right->unicode_offset - rune_begin + right->unicode_length;

  return Word(s.substr(byte_begin, byte_count), byte_begin, rune_begin, rune_count);
}
inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {