diff --git a/include/cppjieba/FullSegment.hpp b/include/cppjieba/FullSegment.hpp index 2f101ee..d859e81 100644 --- a/include/cppjieba/FullSegment.hpp +++ b/include/cppjieba/FullSegment.hpp @@ -37,7 +37,7 @@ class FullSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, diff --git a/include/cppjieba/HMMSegment.hpp b/include/cppjieba/HMMSegment.hpp index 84e69aa..0038a49 100644 --- a/include/cppjieba/HMMSegment.hpp +++ b/include/cppjieba/HMMSegment.hpp @@ -35,7 +35,7 @@ class HMMSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res) const { RuneStrArray::const_iterator left = begin; diff --git a/include/cppjieba/MPSegment.hpp b/include/cppjieba/MPSegment.hpp index b386ae9..2bffeb8 100644 --- a/include/cppjieba/MPSegment.hpp +++ b/include/cppjieba/MPSegment.hpp @@ -38,7 +38,7 @@ class MPSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, diff --git a/include/cppjieba/MixSegment.hpp b/include/cppjieba/MixSegment.hpp index e096815..82f078b 100644 --- a/include/cppjieba/MixSegment.hpp +++ b/include/cppjieba/MixSegment.hpp @@ -31,7 +31,7 @@ class MixSegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { diff --git a/include/cppjieba/QuerySegment.hpp b/include/cppjieba/QuerySegment.hpp index 15a684e..0b04ce6 100644 --- a/include/cppjieba/QuerySegment.hpp +++ b/include/cppjieba/QuerySegment.hpp @@ -37,7 +37,7 @@ class QuerySegment: public SegmentBase { } words.clear(); words.reserve(wrs.size()); - GetStringsFromWordRanges(wrs, words); + GetStringsFromWordRanges(sentence, wrs, words); } void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { //use mix Cut first diff --git a/include/cppjieba/Unicode.hpp b/include/cppjieba/Unicode.hpp index 37813d2..923fcdb 100644 --- a/include/cppjieba/Unicode.hpp +++ b/include/cppjieba/Unicode.hpp @@ -9,16 +9,19 @@ namespace cppjieba { +using std::string; +using std::vector; + typedef uint32_t Rune; struct RuneStr { Rune rune; - const char* str; + uint32_t offset; uint32_t len; - RuneStr(): rune(0), str(NULL), len(0) { + RuneStr(): rune(0), offset(0), len(0) { } - RuneStr(Rune r, const char* s, uint32_t l) - : rune(r), str(s), len(l) { + RuneStr(Rune r, uint32_t o, uint32_t l) + : rune(r), offset(o), len(l) { } }; // struct RuneStr @@ -118,14 +121,14 @@ inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) if (rp.len == 0) { return false; } - RuneStr x(rp.rune, s + i, rp.len); + RuneStr x(rp.rune, i, rp.len); runes.push_back(x); i += rp.len; } return true; } -inline bool DecodeRunesInString(const std::string& s, RuneStrArray& runes) { +inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) { return DecodeRunesInString(s.c_str(), s.size(), runes); } @@ -142,37 +145,38 @@ inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) { return true; } -inline bool IsSingleWord(const std::string& str) { +inline bool IsSingleWord(const string& str) { RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size()); return rp.len == str.size(); } -inline bool DecodeRunesInString(const std::string& s, Unicode& unicode) { +inline bool DecodeRunesInString(const string& s, Unicode& unicode) { return DecodeRunesInString(s.c_str(), s.size(), unicode); } -inline Unicode DecodeRunesInString(const std::string& s) { +inline Unicode DecodeRunesInString(const string& s) { Unicode result; DecodeRunesInString(s, result); return result; } -//[left, right] -inline std::string GetStringFromRunes(RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { - assert(right->str >= left->str); - return std::string(left->str, right->str - left->str + right->len); +// [left, right] +inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { + assert(right->offset >= left->offset); + uint32_t len = right->offset - left->offset + right->len; + return s.substr(left->offset, len); } -inline void GetStringsFromWordRanges(const std::vector& wrs, std::vector& words) { +inline void GetStringsFromWordRanges(const string& s, const vector& wrs, vector& words) { for (size_t i = 0; i < wrs.size(); i++) { - words.push_back(GetStringFromRunes(wrs[i].left, wrs[i].right)); + words.push_back(GetStringFromRunes(s, wrs[i].left, wrs[i].right)); } } -inline std::vector GetStringsFromWordRanges(const std::vector& wrs) { - std::vector result; - GetStringsFromWordRanges(wrs, result); +inline vector GetStringsFromWordRanges(const string& s, const vector& wrs) { + vector result; + GetStringsFromWordRanges(s, wrs, result); return result; } diff --git a/test/unittest/pre_filter_test.cpp b/test/unittest/pre_filter_test.cpp index 7a532b9..7ff080e 100644 --- a/test/unittest/pre_filter_test.cpp +++ b/test/unittest/pre_filter_test.cpp @@ -20,7 +20,7 @@ TEST(PreFilterTest, Test1) { while (filter.HasNext()) { PreFilter::Range range; range = filter.Next(); - words.push_back(GetStringFromRunes(range.begin, range.end - 1)); + words.push_back(GetStringFromRunes(s, range.begin, range.end - 1)); } res = limonp::Join(words.begin(), words.end(), "/"); ASSERT_EQ(res, expected); @@ -35,7 +35,7 @@ TEST(PreFilterTest, Test1) { while (filter.HasNext()) { PreFilter::Range range; range = filter.Next(); - words.push_back(GetStringFromRunes(range.begin, range.end - 1)); + words.push_back(GetStringFromRunes(s, range.begin, range.end - 1)); } res = limonp::Join(words.begin(), words.end(), "/"); ASSERT_EQ(res, expected);