From 0542dd1cfdcf8449ba9a0670859bd2f7bdfda00d Mon Sep 17 00:00:00 2001
From: yanyiwu
Date: Sun, 13 Sep 2015 15:10:10 +0800
Subject: [PATCH] add PreFilter

---
Review notes (this section is not part of the commit message):
 - PreFilter initializes cursor_ in its constructor, so HasNext()/Next()
   are defined before the first Reset().
 - The unit test separates words with U+FF0C, the comma listed in
   SPECIAL_SYMBOL, instead of an ASCII comma.
 - Template arguments that had been lost from the patch text were restored;
   context lines should be checked against the target tree.
 - Hunk sizes and the diffstat were recomputed; the blob ids on the
   "index" lines are still those of the original commit.

 src/PreFilter.hpp            | 85 ++++++++++++++++++++++++++++++++++++
 src/SegmentBase.hpp          |  4 +--
 src/TransCode.hpp            |  9 ++++
 test/unittest/CMakeLists.txt |  1 +
 test/unittest/TPreFilter.cpp | 26 +++++++++++
 5 files changed, 123 insertions(+), 2 deletions(-)
 create mode 100644 src/PreFilter.hpp
 create mode 100644 test/unittest/TPreFilter.cpp

diff --git a/src/PreFilter.hpp b/src/PreFilter.hpp
new file mode 100644
index 0000000..fa71624
--- /dev/null
+++ b/src/PreFilter.hpp
@@ -0,0 +1,85 @@
+#ifndef CPPJIEBA_PRE_FILTER_H
+#define CPPJIEBA_PRE_FILTER_H
+
+#include "TransCode.hpp"
+
+namespace CppJieba {
+
+// Code points that delimit ranges: space, tab, line feed,
+// U+3002 (ideographic full stop) and U+FF0C (fullwidth comma).
+const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
+
+// Walks a decoded sentence and hands it out piece by piece: every special
+// symbol is returned as a one-element range, and every run of other code
+// points between two special symbols is returned as a single range.
+class PreFilter {
+ public:
+  // Half-open range [begin, end) of iterators into the filter's own buffer;
+  // Reset() overwrites that buffer.
+  struct Range {
+    Unicode::const_iterator begin;
+    Unicode::const_iterator end;
+  }; // struct Range
+
+  // Points the cursor at the (still empty) buffer so that HasNext() and
+  // Next() do not read an uninitialized iterator before the first Reset().
+  PreFilter() : cursor_(sentence_.begin()) {
+    LoadSpecialSymbols();
+  }
+  ~PreFilter() {
+  }
+
+  // Decodes sentence into the internal buffer and rewinds the cursor.
+  // NOTE(review): the outcome of decode() is not checked; confirm how
+  // undecodable input should be handled.
+  void Reset(const string& sentence) {
+    TransCode::decode(sentence, sentence_);
+    cursor_ = sentence_.begin();
+  }
+
+  // True while the cursor has not reached the end of the buffer.
+  bool HasNext() const {
+    return cursor_ != sentence_.end();
+  }
+
+  // Returns the next range and advances the cursor past it. When nothing
+  // is left, the returned range is empty (begin == end).
+  Range Next() {
+    Range range;
+    range.begin = cursor_;
+    while (cursor_ != sentence_.end()) {
+      if (isIn(specialSymbols_, *cursor_)) {
+        if (range.begin == cursor_) {
+          // The range starts on a symbol: emit the symbol by itself.
+          ++cursor_;
+        }
+        // Otherwise stop in front of the symbol; the next call emits it.
+        range.end = cursor_;
+        return range;
+      }
+      ++cursor_;
+    }
+    range.end = sentence_.end();
+    return range;
+  }
+
+ private:
+  // Copies SPECIAL_SYMBOL into the lookup set used by Next().
+  void LoadSpecialSymbols() {
+    size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
+    for(size_t i = 0; i < size; i ++) {
+      specialSymbols_.insert(SPECIAL_SYMBOL[i]);
+    }
+    assert(specialSymbols_.size());
+  }
+
+  // sentence_ must be declared before cursor_: the constructor initializes
+  // cursor_ from sentence_, and members are initialized in declaration order.
+  Unicode sentence_;
+  Unicode::const_iterator cursor_;
+  unordered_set<Rune> specialSymbols_;
+}; // class PreFilter
+
+} // namespace CppJieba
+
+#endif // CPPJIEBA_PRE_FILTER_H
diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp
index c28f0a3..6484965 100644
--- a/src/SegmentBase.hpp
+++ b/src/SegmentBase.hpp
@@ -23,9 +23,9 @@ class SegmentBase: public ISegment, public NonCopyable {
  public:
   SegmentBase() {
     LoadSpecialSymbols();
-  };
+  }
   virtual ~SegmentBase() {
-  };
+  }
  public:
   virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
   virtual bool cut(const string& str, vector<string>& res) const {
diff --git a/src/TransCode.hpp b/src/TransCode.hpp
index 52e5ac7..5583f46 100644
--- a/src/TransCode.hpp
+++ b/src/TransCode.hpp
@@ -57,6 +57,15 @@ inline Unicode decode(const string& str) {
   return unicode;
 }
 
+// Encodes every element of input into the string at the same index of
+// output; output is resized to input.size().
+inline void encode(const vector<Unicode>& input, vector<string>& output) {
+  output.resize(input.size());
+  for (size_t i = 0; i < output.size(); i++) {
+    encode(input[i], output[i]);
+  }
+}
+
 } // namespace TransCode
 } // namespace CppJieba
 
diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt
index 35a2fa0..46449d8 100644
--- a/test/unittest/CMakeLists.txt
+++ b/test/unittest/CMakeLists.txt
@@ -23,6 +23,7 @@ ADD_EXECUTABLE(test.run
     TSegments.cpp
     TPosTagger.cpp
     TApplication.cpp
+    TPreFilter.cpp
     )
 TARGET_LINK_LIBRARIES(gtest pthread)
 TARGET_LINK_LIBRARIES(test.run gtest pthread)
diff --git a/test/unittest/TPreFilter.cpp b/test/unittest/TPreFilter.cpp
new file mode 100644
index 0000000..5974825
--- /dev/null
+++ b/test/unittest/TPreFilter.cpp
@@ -0,0 +1,26 @@
+#include "gtest/gtest.h"
+#include "src/PreFilter.hpp"
+
+using namespace CppJieba;
+
+// The separators are U+FF0C (fullwidth comma), which is in SPECIAL_SYMBOL;
+// an ASCII comma is not and would leave the sentence as a single range.
+TEST(PreFilterTest, Test1) {
+  PreFilter filter;
+  filter.Reset("你好，美丽的，世界");
+  const char* expected[] = {"你好", "，", "美丽的", "，", "世界"};
+  ASSERT_TRUE(filter.HasNext());
+  vector<string> words;
+  while (filter.HasNext()) {
+    PreFilter::Range range;
+    range = filter.Next();
+    words.push_back(TransCode::encode(range.begin, range.end));
+  }
+  ASSERT_EQ(vector<string>(expected, expected + sizeof(expected)/sizeof(*expected)), words);
+}
+
+// A filter that was never Reset() holds an empty sentence.
+TEST(PreFilterTest, NoSentence) {
+  PreFilter filter;
+  ASSERT_FALSE(filter.HasNext());
+}