// Unified git diff for include/cppjieba/SegmentBase.hpp (blob 8630cfd -> 9ce21c4),
// a single hunk: @@ -8,27 +8,50 @@.
//
// Summary of the change recorded below:
//   * Removes the commented-out SPECIAL_CHARS string and the numeric
//     `const Rune SPECIAL_SYMBOL[]` table (32, 9, 10, 65292, 12290) and adds
//     the string constant `SPECIAL_RUNES = " \t\n,。"` in their place.
//   * SegmentBase() no longer calls LoadSpecialSymbols(); it now evaluates
//     XCHECK(Insert(SPECIAL_RUNES)).
//   * LoadSpecialSymbols() (which looped over SPECIAL_SYMBOL, inserted each
//     entry into symbols_ and then asserted symbols_.size()) is replaced by
//     `bool Insert(const string& s)`:
//       - calls DecodeRunesInString(s, runes); if that returns false it logs
//         "decode <s> failed" via XLOG(ERROR) and returns false;
//       - inserts each runes[i].rune into symbols_; if the insert reports the
//         rune was already present, it logs the substring
//         s.substr(offset, len) followed by " already exists" and returns false;
//       - returns true once every rune has been inserted.
//     Runes inserted before a duplicate is hit are not rolled back.
//   * Adds a fully commented-out `Remove(const string&)` that mirrors Insert
//     (find each rune, log "<substr> not found" and return false when it is
//     missing, otherwise erase it).
//   * The old `assert(symbols_.size());` is dropped.
//
// NOTE(review): the whole patch is on one physical line here, so the original
// line breaks and blank lines cannot be read from this text -- presumably lost
// in extraction; re-fetch the patch before trying to apply it.
// NOTE(review): `unordered_set symbols_;` and `unordered_set::iterator` appear
// without a template argument -- presumably `<Rune>` was stripped in
// extraction; confirm against the real header.
diff --git a/include/cppjieba/SegmentBase.hpp b/include/cppjieba/SegmentBase.hpp index 8630cfd..9ce21c4 100644 --- a/include/cppjieba/SegmentBase.hpp +++ b/include/cppjieba/SegmentBase.hpp @@ -8,27 +8,50 @@ namespace cppjieba { -//const char* const SPECIAL_CHARS = " \t\n,。"; -const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 65292u, 12290u}; +const char* const SPECIAL_RUNES = " \t\n,。"; using namespace limonp; class SegmentBase { public: SegmentBase() { - LoadSpecialSymbols(); + XCHECK(Insert(SPECIAL_RUNES)); } ~SegmentBase() { } protected: - void LoadSpecialSymbols() { - size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL); - for (size_t i = 0; i < size; i ++) { - symbols_.insert(SPECIAL_SYMBOL[i]); + bool Insert(const string& s) { + RuneStrArray runes; + if (!DecodeRunesInString(s, runes)) { + XLOG(ERROR) << "decode " << s << " failed"; + return false; } - assert(symbols_.size()); + for (size_t i = 0; i < runes.size(); i++) { + if (!symbols_.insert(runes[i].rune).second) { + XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists"; + return false; + } + } + return true; } + //bool Remove(const string& s) { + // RuneStrArray runes; + // if (!DecodeRunesInString(s, runes)) { + // XLOG(ERROR) << "decode " << s << " failed"; + // return false; + // } + // for (size_t i = 0; i < runes.size(); i++) { + // unordered_set::iterator iter = symbols_.find(runes[i].rune); + // if (iter == symbols_.end()) { + // XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " not found"; + // return false; + // } + // symbols_.erase(iter); + // } + // return true; + //} + unordered_set symbols_; }; // class SegmentBase