mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Merge pull request #79 from royguo/master
Add Unicode offset/length support for `Word`
This commit is contained in:
commit
e5d9eb8816
@ -18,9 +18,14 @@ typedef uint32_t Rune;
|
|||||||
// A piece of text cut out of a source string, together with where it sits in
// that string measured both in bytes and in decoded runes.
struct Word {
  string word;              // the text itself
  uint32_t offset;          // byte offset of the word in the source string
  uint32_t unicode_offset;  // index of the word's first rune in the source string
  uint32_t unicode_length;  // number of runes the word spans

  // Builds a Word when only the byte offset is known.
  // The rune-based fields are zeroed so they never hold an indeterminate value.
  Word(const string& w, uint32_t o)
    : word(w), offset(o), unicode_offset(0), unicode_length(0) {
  }

  // Builds a Word with both the byte position and the rune position/length.
  // The last two parameters share their names with the members they initialize;
  // inside the initializer list the name in parentheses is the parameter.
  Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
    : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
  }
}; // struct Word
|
||||||
|
|
||||||
inline std::ostream& operator << (std::ostream& os, const Word& w) {
|
inline std::ostream& operator << (std::ostream& os, const Word& w) {
|
||||||
@ -31,11 +36,16 @@ struct RuneStr {
|
|||||||
Rune rune;
|
Rune rune;
|
||||||
uint32_t offset;
|
uint32_t offset;
|
||||||
uint32_t len;
|
uint32_t len;
|
||||||
|
uint32_t unicode_offset;
|
||||||
|
uint32_t unicode_length;
|
||||||
// Default constructor: zeroes every field, including unicode_offset and
// unicode_length, so no member is left with an indeterminate value.
RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) {
}
|
||||||
// Builds a rune record from the code point `r`, its offset `o` and its
// length `l`. The rune-based position fields are not supplied here, so they
// are zeroed instead of being left uninitialized.
RuneStr(Rune r, uint32_t o, uint32_t l)
  : rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) {
}
|
||||||
|
// Builds a fully specified rune record: code point `r`, offset `o`, length
// `l`, plus the rune-based offset and length. The last two parameters share
// their names with the members they initialize.
RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
  : rune(r),
    offset(o),
    len(l),
    unicode_offset(unicode_offset),
    unicode_length(unicode_length) {
}
|
||||||
}; // struct RuneStr
|
}; // struct RuneStr
|
||||||
|
|
||||||
inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
|
inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
|
||||||
@ -132,15 +142,16 @@ inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
|
|||||||
inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
|
inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
|
||||||
runes.clear();
|
runes.clear();
|
||||||
runes.reserve(len / 2);
|
runes.reserve(len / 2);
|
||||||
for (size_t i = 0; i < len;) {
|
for (size_t i = 0, j = 0; i < len;) {
|
||||||
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
|
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
|
||||||
if (rp.len == 0) {
|
if (rp.len == 0) {
|
||||||
runes.clear();
|
runes.clear();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
RuneStr x(rp.rune, i, rp.len);
|
RuneStr x(rp.rune, i, rp.len, j, 1);
|
||||||
runes.push_back(x);
|
runes.push_back(x);
|
||||||
i += rp.len;
|
i += rp.len;
|
||||||
|
++j;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -182,7 +193,8 @@ inline Unicode DecodeRunesInString(const string& s) {
|
|||||||
// Builds the Word covering the runes from `left` through `right`; `right` is
// inclusive, since its own len / unicode_length is counted in the span.
// The word text is the matching byte range of `s`; the Word carries the byte
// offset and rune offset of `left` plus the span measured in runes.
// Asserts that `right` does not start before `left`.
inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
  assert(right->offset >= left->offset);

  const uint32_t byte_begin = left->offset;
  const uint32_t byte_span = right->offset - byte_begin + right->len;

  const uint32_t rune_begin = left->unicode_offset;
  const uint32_t rune_span = right->unicode_offset - rune_begin + right->unicode_length;

  return Word(s.substr(byte_begin, byte_span), byte_begin, rune_begin, rune_span);
}
|
||||||
|
|
||||||
inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
|
inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user