Use PreFilter in SegmentBase

2025-07-18 00:00:12 +08:00 · 2015-09-13 16:05:17 +08:00 · 2015-09-13 16:05:17 +08:00 · 28bcb3bf57
commit 28bcb3bf57
parent 0542dd1cfd
4 changed files with 90 additions and 68 deletions
--- a/src/PreFilter.hpp
+++ b/src/PreFilter.hpp
@ -5,7 +5,16 @@

 namespace CppJieba {

-const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
+//class PreFilterIterator {
+// public:
+//  PreFilterIterator() {
+//  }
+//  ~PreFilterIterator() {
+//  }
+//  
+// private:
+//  const unordered_set<Rune>& specialSymbols_;
+//}; // PreFilterIterator

 class PreFilter {
 public:
@ -14,17 +23,14 @@ class PreFilter {
    Unicode::const_iterator end;
  }; // struct Range

-  PreFilter() {
-    LoadSpecialSymbols();
-  }
-  ~PreFilter() {
-  }
-
-  void Reset(const string& sentence) {
+  PreFilter(const unordered_set<Rune>& symbols, 
+        const string& sentence)
+    : symbols_(symbols) {
    TransCode::decode(sentence, sentence_);
    cursor_ = sentence_.begin();
  }
-  
+  ~PreFilter() {
+  }
  bool HasNext() const {
    return cursor_ != sentence_.end();
  }
@ -32,7 +38,7 @@ class PreFilter {
    Range range;
    range.begin = cursor_;
    while (cursor_ != sentence_.end()) {
-      if (isIn(specialSymbols_, *cursor_)) {
+      if (isIn(symbols_, *cursor_)) {
        if (range.begin == cursor_) {
          cursor_ ++;
        }
@ -45,18 +51,9 @@ class PreFilter {
    return range;
  }
 private:
-  Unicode sentence_;
  Unicode::const_iterator cursor_;
-
-  void LoadSpecialSymbols() {
-    size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
-    for(size_t i = 0; i < size; i ++) {
-      specialSymbols_.insert(SPECIAL_SYMBOL[i]);
-    }
-    assert(specialSymbols_.size());
-  }
-
-  unordered_set<Rune> specialSymbols_;
+  Unicode sentence_;
+  const unordered_set<Rune>& symbols_;
 }; // class PreFilter

 } // namespace CppJieba
--- a/src/SegmentBase.hpp
+++ b/src/SegmentBase.hpp
@ -1,57 +1,56 @@
 #ifndef CPPJIEBA_SEGMENTBASE_H
 #define CPPJIEBA_SEGMENTBASE_H

-#include "TransCode.hpp"
 #include "limonp/Logger.hpp"
-#include "limonp/NonCopyable.hpp"
-#include "limonp/HandyMacro.hpp"
+#include "PreFilter.hpp"
 #include "ISegment.hpp"
 #include <cassert>


 namespace CppJieba {
+
+//const char* const SPECIAL_CHARS = " \t\n，。";
+const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 65292u, 12290u};
+
 using namespace limonp;

-//const char* const SPECIAL_CHARS = " \t\n";
-#ifndef CPPJIEBA_GBK
-const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
-#else
-const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u};
-#endif
-
-class SegmentBase: public ISegment, public NonCopyable {
+class SegmentBase: public ISegment {
 public:
  SegmentBase() {
-    LoadSpecialSymbols();
  }
  virtual ~SegmentBase() {
  }
 public:
  virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
  virtual bool cut(const string& str, vector<string>& res) const {
+    PreFilter pre_filter(symbols_, str);
+    PreFilter::Range range;
    res.clear();
-
-    Unicode unicode;
-    unicode.reserve(str.size());
-
-    TransCode::decode(str, unicode);
-
-    Unicode::const_iterator left = unicode.begin();
-    Unicode::const_iterator right;
-
-    for(right = unicode.begin(); right != unicode.end(); right++) {
-      if(isIn(specialSymbols_, *right)) {
-        if(left != right) {
-          cut(left, right, res);
-        }
-        res.resize(res.size() + 1);
-        TransCode::encode(right, right + 1, res.back());
-        left = right + 1;
-      }
-    }
-    if(left != right) {
-      cut(left, right, res);
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      cut(range.begin, range.end, res);
    }
+    //Unicode unicode;
+    //unicode.reserve(str.size());
+
+    //TransCode::decode(str, unicode);
+
+    //Unicode::const_iterator left = unicode.begin();
+    //Unicode::const_iterator right;
+
+    //for(right = unicode.begin(); right != unicode.end(); right++) {
+    //  if(isIn(specialSymbols_, *right)) {
+    //    if(left != right) {
+    //      cut(left, right, res);
+    //    }
+    //    res.resize(res.size() + 1);
+    //    TransCode::encode(right, right + 1, res.back());
+    //    left = right + 1;
+    //  }
+    //}
+    //if(left != right) {
+    //  cut(left, right, res);
+    //}

    return true;
  }
@ -66,19 +65,20 @@ class SegmentBase: public ISegment, public NonCopyable {
    res.resize(res.size() + uRes.size());
    for(size_t i = 0; i < uRes.size(); i ++, offset++) {
      TransCode::encode(uRes[i], res[offset]);
+      cout << __FILE__ << __LINE__ << endl;
+      cout << res[offset] << endl;
    }
  }
 private:
  void LoadSpecialSymbols() {
    size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
    for(size_t i = 0; i < size; i ++) {
-      specialSymbols_.insert(SPECIAL_SYMBOL[i]);
+      symbols_.insert(SPECIAL_SYMBOL[i]);
    }
-    assert(specialSymbols_.size());
+    assert(symbols_.size());
  }

-  unordered_set<Rune> specialSymbols_;
-
+  unordered_set<Rune> symbols_;
 }; // class SegmentBase

 } // CppJieba
--- a/test/unittest/TPreFilter.cpp
+++ b/test/unittest/TPreFilter.cpp
@ -4,9 +4,15 @@
 using namespace CppJieba;

 TEST(PreFilterTest, Test1) {
-  PreFilter filter;
-  filter.Reset("你好，美丽的，世界");
-  const char* expected[] = {"你好", "，", "美丽的", "，", "世界"};
+  unordered_set<Rune> symbol;
+  symbol.insert(65292u); // "，"
+  symbol.insert(12290u); // "。"
+  string expected;
+  string res;
+
+  {
+    PreFilter filter(symbol, "你好，美丽的，世界");
+    expected = "你好/，/美丽的/，/世界";
    ASSERT_TRUE(filter.HasNext());
    vector<string> words;
    while (filter.HasNext()) {
@ -14,5 +20,23 @@ TEST(PreFilterTest, Test1) {
      range = filter.Next();
      words.push_back(TransCode::encode(range.begin, range.end));
    }
-  ASSERT_EQ(vector<string>(expected, expected + sizeof(expected)/sizeof(*expected)), words);
+    res = join(words.begin(), words.end(), "/");
+    ASSERT_EQ(res, expected);
+  }
+
+  {
+    PreFilter filter(symbol, "我来自北京邮电大学。。。学号123456，用AK47");
+    expected = "我来自北京邮电大学/。/。/。/学号123456/，/用AK47";
+    ASSERT_TRUE(filter.HasNext());
+    vector<string> words;
+    while (filter.HasNext()) {
+      PreFilter::Range range;
+      range = filter.Next();
+      words.push_back(TransCode::encode(range.begin, range.end));
+    }
+    res = join(words.begin(), words.end(), "/");
+    for (size_t i = 0; i < words.size(); i++) {
+    }
+    ASSERT_EQ(res, expected);
+  }
 }
--- a/test/unittest/TSegments.cpp
+++ b/test/unittest/TSegments.cpp
@ -18,6 +18,7 @@ TEST(MixSegmentTest, Test1) {
  const char* res2[] = {"B超"," ", "T恤"};
  vector<string> words;
  ASSERT_TRUE(segment.cut(str, words));
+  cout << words << endl;
  ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
  ASSERT_TRUE(segment.cut(str2, words));
  ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));