diff --git a/src/PreFilter.hpp b/src/PreFilter.hpp index fa71624..bea43f0 100644 --- a/src/PreFilter.hpp +++ b/src/PreFilter.hpp @@ -5,7 +5,16 @@ namespace CppJieba { -const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u}; +//class PreFilterIterator { +// public: +// PreFilterIterator() { +// } +// ~PreFilterIterator() { +// } +// +// private: +// const unordered_set& specialSymbols_; +//}; // PreFilterIterator class PreFilter { public: @@ -14,17 +23,14 @@ class PreFilter { Unicode::const_iterator end; }; // struct Range - PreFilter() { - LoadSpecialSymbols(); - } - ~PreFilter() { - } - - void Reset(const string& sentence) { + PreFilter(const unordered_set& symbols, + const string& sentence) + : symbols_(symbols) { TransCode::decode(sentence, sentence_); cursor_ = sentence_.begin(); } - + ~PreFilter() { + } bool HasNext() const { return cursor_ != sentence_.end(); } @@ -32,7 +38,7 @@ class PreFilter { Range range; range.begin = cursor_; while (cursor_ != sentence_.end()) { - if (isIn(specialSymbols_, *cursor_)) { + if (isIn(symbols_, *cursor_)) { if (range.begin == cursor_) { cursor_ ++; } @@ -45,18 +51,9 @@ class PreFilter { return range; } private: - Unicode sentence_; Unicode::const_iterator cursor_; - - void LoadSpecialSymbols() { - size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL); - for(size_t i = 0; i < size; i ++) { - specialSymbols_.insert(SPECIAL_SYMBOL[i]); - } - assert(specialSymbols_.size()); - } - - unordered_set specialSymbols_; + Unicode sentence_; + const unordered_set& symbols_; }; // class PreFilter } // namespace CppJieba diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp index 6484965..8612f9e 100644 --- a/src/SegmentBase.hpp +++ b/src/SegmentBase.hpp @@ -1,57 +1,56 @@ #ifndef CPPJIEBA_SEGMENTBASE_H #define CPPJIEBA_SEGMENTBASE_H -#include "TransCode.hpp" #include "limonp/Logger.hpp" -#include "limonp/NonCopyable.hpp" -#include "limonp/HandyMacro.hpp" +#include "PreFilter.hpp" #include "ISegment.hpp" #include namespace CppJieba { + +//const char* const SPECIAL_CHARS = " \t\n,。"; +const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 65292u, 12290u}; + using namespace limonp; -//const char* const SPECIAL_CHARS = " \t\n"; -#ifndef CPPJIEBA_GBK -const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u}; -#else -const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u}; -#endif - -class SegmentBase: public ISegment, public NonCopyable { +class SegmentBase: public ISegment { public: SegmentBase() { - LoadSpecialSymbols(); } virtual ~SegmentBase() { } public: virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const = 0; virtual bool cut(const string& str, vector& res) const { + PreFilter pre_filter(symbols_, str); + PreFilter::Range range; res.clear(); - - Unicode unicode; - unicode.reserve(str.size()); - - TransCode::decode(str, unicode); - - Unicode::const_iterator left = unicode.begin(); - Unicode::const_iterator right; - - for(right = unicode.begin(); right != unicode.end(); right++) { - if(isIn(specialSymbols_, *right)) { - if(left != right) { - cut(left, right, res); - } - res.resize(res.size() + 1); - TransCode::encode(right, right + 1, res.back()); - left = right + 1; - } - } - if(left != right) { - cut(left, right, res); + while (pre_filter.HasNext()) { + range = pre_filter.Next(); + cut(range.begin, range.end, res); } + //Unicode unicode; + //unicode.reserve(str.size()); + + //TransCode::decode(str, unicode); + + //Unicode::const_iterator left = unicode.begin(); + //Unicode::const_iterator right; + + //for(right = unicode.begin(); right != unicode.end(); right++) { + // if(isIn(specialSymbols_, *right)) { + // if(left != right) { + // cut(left, right, res); + // } + // res.resize(res.size() + 1); + // TransCode::encode(right, right + 1, res.back()); + // left = right + 1; + // } + //} + //if(left != right) { + // cut(left, right, res); + //} return true; } @@ -66,19 +65,20 @@ class SegmentBase: public ISegment, public NonCopyable { res.resize(res.size() + uRes.size()); for(size_t i = 0; i < uRes.size(); i ++, offset++) { TransCode::encode(uRes[i], res[offset]); + cout << __FILE__ << __LINE__ << endl; + cout << res[offset] << endl; } } private: void LoadSpecialSymbols() { size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL); for(size_t i = 0; i < size; i ++) { - specialSymbols_.insert(SPECIAL_SYMBOL[i]); + symbols_.insert(SPECIAL_SYMBOL[i]); } - assert(specialSymbols_.size()); + assert(symbols_.size()); } - unordered_set specialSymbols_; - + unordered_set symbols_; }; // class SegmentBase } // CppJieba diff --git a/test/unittest/TPreFilter.cpp b/test/unittest/TPreFilter.cpp index 5974825..12409e2 100644 --- a/test/unittest/TPreFilter.cpp +++ b/test/unittest/TPreFilter.cpp @@ -4,15 +4,39 @@ using namespace CppJieba; TEST(PreFilterTest, Test1) { - PreFilter filter; - filter.Reset("你好,美丽的,世界"); - const char* expected[] = {"你好", ",", "美丽的", ",", "世界"}; - ASSERT_TRUE(filter.HasNext()); - vector words; - while (filter.HasNext()) { - PreFilter::Range range; - range = filter.Next(); - words.push_back(TransCode::encode(range.begin, range.end)); + unordered_set symbol; + symbol.insert(65292u); // "," + symbol.insert(12290u); // "。" + string expected; + string res; + + { + PreFilter filter(symbol, "你好,美丽的,世界"); + expected = "你好/,/美丽的/,/世界"; + ASSERT_TRUE(filter.HasNext()); + vector words; + while (filter.HasNext()) { + PreFilter::Range range; + range = filter.Next(); + words.push_back(TransCode::encode(range.begin, range.end)); + } + res = join(words.begin(), words.end(), "/"); + ASSERT_EQ(res, expected); + } + + { + PreFilter filter(symbol, "我来自北京邮电大学。。。学号123456,用AK47"); + expected = "我来自北京邮电大学/。/。/。/学号123456/,/用AK47"; + ASSERT_TRUE(filter.HasNext()); + vector words; + while (filter.HasNext()) { + PreFilter::Range range; + range = filter.Next(); + words.push_back(TransCode::encode(range.begin, range.end)); + } + res = join(words.begin(), words.end(), "/"); + for (size_t i = 0; i < words.size(); i++) { + } + ASSERT_EQ(res, expected); } - ASSERT_EQ(vector(expected, expected + sizeof(expected)/sizeof(*expected)), words); } diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp index 1d4f6bd..0fc1ed4 100644 --- a/test/unittest/TSegments.cpp +++ b/test/unittest/TSegments.cpp @@ -18,6 +18,7 @@ TEST(MixSegmentTest, Test1) { const char* res2[] = {"B超"," ", "T恤"}; vector words; ASSERT_TRUE(segment.cut(str, words)); + cout << words << endl; ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); ASSERT_TRUE(segment.cut(str2, words)); ASSERT_EQ(words, vector(res2, res2 + sizeof(res2)/sizeof(res2[0])));