abandon ISegment

This commit is contained in:
yanyiwu 2015-09-13 17:02:04 +08:00
parent 6d69363145
commit 14974d51b4
11 changed files with 105 additions and 98 deletions

View File

@ -6,7 +6,6 @@
#include <cassert>
#include "limonp/Logger.hpp"
#include "DictTrie.hpp"
#include "ISegment.hpp"
#include "SegmentBase.hpp"
#include "TransCode.hpp"
@ -22,13 +21,24 @@ class FullSegment: public SegmentBase {
: dictTrie_(dictTrie), isNeedDestroy_(false) {
assert(dictTrie_);
}
virtual ~FullSegment() {
~FullSegment() {
if(isNeedDestroy_) {
delete dictTrie_;
}
}
using SegmentBase::cut;
virtual void cut(Unicode::const_iterator begin,
void cut(const string& sentence,
vector<string>& words) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<Unicode> uwords;
uwords.reserve(sentence.size());
while (pre_filter.HasNext()) {
range = pre_filter.Next();
cut(range.begin, range.end, uwords);
}
TransCode::encode(uwords, words);
}
void cut(Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<Unicode>& res) const {
//result of searching in trie tree

View File

@ -17,14 +17,25 @@ class HMMSegment: public SegmentBase {
HMMSegment(const HMMModel* model)
: model_(model), isNeedDestroy_(false) {
}
virtual ~HMMSegment() {
~HMMSegment() {
if(isNeedDestroy_) {
delete model_;
}
}
using SegmentBase::cut;
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const {
void cut(const string& sentence,
vector<string>& words) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<Unicode> uwords;
uwords.reserve(sentence.size());
while (pre_filter.HasNext()) {
range = pre_filter.Next();
cut(range.begin, range.end, uwords);
}
TransCode::encode(uwords, words);
}
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
Unicode::const_iterator left = begin;
Unicode::const_iterator right = begin;
while(right != end) {

View File

@ -1,15 +0,0 @@
#ifndef CPPJIEBA_ISEGMENT_H
#define CPPJIEBA_ISEGMENT_H
namespace CppJieba {
class ISegment {
public:
virtual ~ISegment() {
}
virtual bool cut(const string& str, vector<string>& res) const = 0;
};
} // namespace CppJieba
#endif // CPPJIEBA_ISEGMENT_H

View File

@ -44,10 +44,7 @@ class KeywordExtractor {
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
vector<string> words;
if(!segment_.cut(str, words)) {
LogError("segment cut(%s) failed.", str.c_str());
return false;
}
segment_.cut(str, words);
map<string, double> wordmap;
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {

View File

@ -5,7 +5,7 @@
namespace CppJieba {
class LevelSegment: public ISegment {
class LevelSegment: public SegmentBase{
public:
LevelSegment(const string& dictPath,
const string& userDictPath = "")
@ -15,7 +15,7 @@ class LevelSegment: public ISegment {
LevelSegment(const DictTrie* dictTrie)
: mpSeg_(dictTrie) {
}
virtual ~LevelSegment() {
~LevelSegment() {
}
void cut(Unicode::const_iterator begin,

View File

@ -6,7 +6,6 @@
#include <cassert>
#include "limonp/Logger.hpp"
#include "DictTrie.hpp"
#include "ISegment.hpp"
#include "SegmentBase.hpp"
namespace CppJieba {
@ -22,50 +21,38 @@ class MPSegment: public SegmentBase {
: dictTrie_(dictTrie), isNeedDestroy_(false) {
assert(dictTrie_);
}
virtual ~MPSegment() {
~MPSegment() {
if(isNeedDestroy_) {
delete dictTrie_;
}
}
using SegmentBase::cut;
void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& words) const {
vector<Dag> dags;
dictTrie_->find(begin, end, dags);
CalcDP(dags);
Cut(dags, words);
}
bool cut(const string& sentence,
void cut(const string& sentence,
vector<string>& words,
size_t max_word_len) const {
Unicode unicode;
if (!TransCode::decode(sentence, unicode)) {
return false;
size_t max_word_len = MAX_WORD_LENGTH) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<Unicode> uwords;
uwords.reserve(sentence.size());
while (pre_filter.HasNext()) {
range = pre_filter.Next();
cut(range.begin, range.end, uwords, max_word_len);
}
vector<Unicode> unicodeWords;
cut(unicode.begin(), unicode.end(),
unicodeWords, max_word_len);
words.resize(unicodeWords.size());
for (size_t i = 0; i < words.size(); i++) {
TransCode::encode(unicodeWords[i], words[i]);
}
return true;
TransCode::encode(uwords, words);
}
void cut(Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<Unicode>& words,
size_t max_word_len) const {
size_t max_word_len = MAX_WORD_LENGTH) const {
vector<Dag> dags;
dictTrie_->find(begin,
end,
dags,
max_word_len);
CalcDP(dags);
Cut(dags, words);
CutByDag(dags, words);
}
const DictTrie* getDictTrie() const {
return dictTrie_;
}
@ -103,7 +90,7 @@ class MPSegment: public SegmentBase {
}
}
}
void Cut(const vector<Dag>& dags,
void CutByDag(const vector<Dag>& dags,
vector<Unicode>& words) const {
size_t i = 0;
while(i < dags.size()) {

View File

@ -18,10 +18,23 @@ class MixSegment: public SegmentBase {
MixSegment(const DictTrie* dictTrie, const HMMModel* model)
: mpSeg_(dictTrie), hmmSeg_(model) {
}
virtual ~MixSegment() {
~MixSegment() {
}
using SegmentBase::cut;
virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
void cut(const string& sentence,
vector<string>& words) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<Unicode> uwords;
uwords.reserve(sentence.size());
while (pre_filter.HasNext()) {
range = pre_filter.Next();
cut(range.begin, range.end, uwords);
}
TransCode::encode(uwords, words);
}
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
vector<Unicode> words;
words.reserve(end - begin);
mpSeg_.cut(begin, end, words);

View File

@ -27,10 +27,7 @@ class PosTagger {
bool tag(const string& src, vector<pair<string, string> >& res) const {
vector<string> cutRes;
if (!segment_.cut(src, cutRes)) {
LogError("mixSegment_ cut failed");
return false;
}
segment_.cut(src, cutRes);
const DictUnit *tmp = NULL;
Unicode unico;

View File

@ -6,7 +6,6 @@
#include <cassert>
#include "limonp/Logger.hpp"
#include "DictTrie.hpp"
#include "ISegment.hpp"
#include "SegmentBase.hpp"
#include "FullSegment.hpp"
#include "MixSegment.hpp"
@ -25,9 +24,20 @@ class QuerySegment: public SegmentBase {
QuerySegment(const DictTrie* dictTrie, const HMMModel* model, size_t maxWordLen = 4)
: mixSeg_(dictTrie, model), fullSeg_(dictTrie), maxWordLen_(maxWordLen) {
}
virtual ~QuerySegment() {
~QuerySegment() {
}
void cut(const string& sentence,
vector<string>& words) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<Unicode> uwords;
uwords.reserve(sentence.size());
while (pre_filter.HasNext()) {
range = pre_filter.Next();
cut(range.begin, range.end, uwords);
}
TransCode::encode(uwords, words);
}
using SegmentBase::cut;
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
//use mix cut first
vector<Unicode> mixRes;

View File

@ -3,7 +3,6 @@
#include "limonp/Logger.hpp"
#include "PreFilter.hpp"
#include "ISegment.hpp"
#include <cassert>
@ -14,16 +13,17 @@ const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 65292u, 12290u};
using namespace limonp;
class SegmentBase: public ISegment {
class SegmentBase {
public:
SegmentBase() {
LoadSpecialSymbols();
}
virtual ~SegmentBase() {
~SegmentBase() {
}
/*
public:
virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
virtual bool cut(const string& sentence, vector<string>& words) const {
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
bool cut(const string& sentence, vector<string>& words) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<Unicode> uwords;
@ -32,14 +32,12 @@ class SegmentBase: public ISegment {
range = pre_filter.Next();
cut(range.begin, range.end, uwords);
}
words.resize(uwords.size());
for (size_t i = 0; i < uwords.size(); i++) {
TransCode::encode(uwords[i], words[i]);
}
TransCode::encode(uwords, words);
return true;
}
*/
private:
protected:
void LoadSpecialSymbols() {
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
for(size_t i = 0; i < size; i ++) {
@ -47,7 +45,6 @@ class SegmentBase: public ISegment {
}
assert(symbols_.size());
}
unordered_set<Rune> symbols_;
}; // class SegmentBase

View File

@ -17,9 +17,9 @@ TEST(MixSegmentTest, Test1) {
const char* str2 = "B超 T恤";
const char* res2[] = {"B超"," ", "T恤"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
ASSERT_TRUE(segment.cut(str2, words));
segment.cut(str2, words);
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
}
@ -27,7 +27,7 @@ TEST(MixSegmentTest, NoUserDict) {
MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
const char* str = "令狐冲是云计算方面的专家";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string res;
ASSERT_EQ("[\"令狐冲\", \"\", \"\", \"计算\", \"方面\", \"\", \"专家\"]", res << words);
@ -37,14 +37,14 @@ TEST(MixSegmentTest, UserDict) {
{
const char* str = "令狐冲是云计算方面的专家";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string res;
ASSERT_EQ("[\"令狐冲\", \"\", \"云计算\", \"方面\", \"\", \"专家\"]", res << words);
}
{
const char* str = "小明先就职于IBM,后在日本京都大学深造";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string res;
res << words;
ASSERT_EQ("[\"小明\", \"\", \"就职\", \"\", \"IBM\", \",\", \"\", \"\", \"日本\", \"京都大学\", \"深造\"]", res);
@ -52,7 +52,7 @@ TEST(MixSegmentTest, UserDict) {
{
const char* str = "IBM,3.14";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string res;
res << words;
ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
@ -63,14 +63,14 @@ TEST(MixSegmentTest, UserDict2) {
{
const char* str = "令狐冲是云计算方面的专家";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string res;
ASSERT_EQ("[\"令狐冲\", \"\", \"云计算\", \"方面\", \"\", \"专家\"]", res << words);
}
{
const char* str = "小明先就职于IBM,后在日本京都大学深造";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string res;
res << words;
ASSERT_EQ("[\"小明\", \"\", \"就职\", \"\", \"I\", \"B\", \"M\", \",\", \"\", \"\", \"日本\", \"京都大学\", \"深造\"]", res);
@ -78,7 +78,7 @@ TEST(MixSegmentTest, UserDict2) {
{
const char* str = "IBM,3.14";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string res;
res << words;
ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
@ -89,20 +89,20 @@ TEST(MPSegmentTest, Test1) {
MPSegment segment("../dict/jieba.dict.utf8");;
string s;
vector<string> words;
ASSERT_TRUE(segment.cut("我来自北京邮电大学。", words));
segment.cut("我来自北京邮电大学。", words);
ASSERT_EQ("[\"\", \"来自\", \"北京邮电大学\", \"\"]", s << words);
ASSERT_TRUE(segment.cut("B超 T恤", words));
segment.cut("B超 T恤", words);
ASSERT_EQ(s << words, "[\"B超\", \" \", \"T恤\"]");
ASSERT_TRUE(segment.cut("南京市长江大桥", words));
segment.cut("南京市长江大桥", words);
ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
// MaxWordLen
ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
segment.cut("南京市长江大桥", words, 3);
ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
ASSERT_TRUE(segment.cut("南京市长江大桥", words, 0));
segment.cut("南京市长江大桥", words, 0);
ASSERT_EQ("[\"\", \"\", \"\", \"\", \"\", \"\", \"\"]", s << words);
}
@ -142,7 +142,7 @@ TEST(HMMSegmentTest, Test1) {
const char* str = "我来自北京邮电大学。。。学号123456";
const char* res[] = {"我来", "自北京", "邮电大学", "", "", "", "学号", "123456"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
@ -150,7 +150,7 @@ TEST(HMMSegmentTest, Test1) {
const char* str = "IBM,1.2,123";
const char* res[] = {"IBM", ",", "1.2", ",", "123"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
}
@ -160,12 +160,12 @@ TEST(FullSegment, Test1) {
vector<string> words;
string s;
ASSERT_TRUE(segment.cut("我来自北京邮电大学", words));
segment.cut("我来自北京邮电大学", words);
s << words;
ASSERT_EQ(s, "[\"\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]");
ASSERT_TRUE(segment.cut("上市公司CEO", words));
segment.cut("上市公司CEO", words);
s << words;
ASSERT_EQ(s, "[\"上市\", \"公司\", \"C\", \"E\", \"O\"]");
}
@ -175,7 +175,7 @@ TEST(QuerySegment, Test1) {
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string s1, s2;
s1 << words;
@ -191,7 +191,7 @@ TEST(QuerySegment, Test2) {
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string s1, s2;
s1 << words;
@ -203,7 +203,7 @@ TEST(QuerySegment, Test2) {
const char* str = "小明硕士毕业于中国科学院计算所iPhone6";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string s1, s2;
s1 << words;