diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index c3a3ca4..b0b4dc9 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -6,7 +6,6 @@ #include #include "limonp/Logger.hpp" #include "DictTrie.hpp" -#include "ISegment.hpp" #include "SegmentBase.hpp" #include "TransCode.hpp" @@ -22,13 +21,24 @@ class FullSegment: public SegmentBase { : dictTrie_(dictTrie), isNeedDestroy_(false) { assert(dictTrie_); } - virtual ~FullSegment() { + ~FullSegment() { if(isNeedDestroy_) { delete dictTrie_; } } - using SegmentBase::cut; - virtual void cut(Unicode::const_iterator begin, + void cut(const string& sentence, + vector& words) const { + PreFilter pre_filter(symbols_, sentence); + PreFilter::Range range; + vector uwords; + uwords.reserve(sentence.size()); + while (pre_filter.HasNext()) { + range = pre_filter.Next(); + cut(range.begin, range.end, uwords); + } + TransCode::encode(uwords, words); + } + void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { //resut of searching in trie tree diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index a354fae..565ce4a 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -17,14 +17,25 @@ class HMMSegment: public SegmentBase { HMMSegment(const HMMModel* model) : model_(model), isNeedDestroy_(false) { } - virtual ~HMMSegment() { + ~HMMSegment() { if(isNeedDestroy_) { delete model_; } } - using SegmentBase::cut; - void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { + void cut(const string& sentence, + vector& words) const { + PreFilter pre_filter(symbols_, sentence); + PreFilter::Range range; + vector uwords; + uwords.reserve(sentence.size()); + while (pre_filter.HasNext()) { + range = pre_filter.Next(); + cut(range.begin, range.end, uwords); + } + TransCode::encode(uwords, words); + } + void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { Unicode::const_iterator left = begin; Unicode::const_iterator right = begin; while(right != end) { diff --git a/src/ISegment.hpp b/src/ISegment.hpp deleted file mode 100644 index 38ef0a2..0000000 --- a/src/ISegment.hpp +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef CPPJIEBA_ISEGMENT_H -#define CPPJIEBA_ISEGMENT_H - -namespace CppJieba { - -class ISegment { - public: - virtual ~ISegment() { - } - virtual bool cut(const string& str, vector& res) const = 0; -}; - -} // namespace CppJieba - -#endif // CPPJIEBA_ISEGMENT_H diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index 7626313..29ef466 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -44,10 +44,7 @@ class KeywordExtractor { bool extract(const string& str, vector >& keywords, size_t topN) const { vector words; - if(!segment_.cut(str, words)) { - LogError("segment cut(%s) failed.", str.c_str()); - return false; - } + segment_.cut(str, words); map wordmap; for(vector::iterator iter = words.begin(); iter != words.end(); iter++) { diff --git a/src/LevelSegment.hpp b/src/LevelSegment.hpp index a1033d1..dbc37d1 100644 --- a/src/LevelSegment.hpp +++ b/src/LevelSegment.hpp @@ -5,7 +5,7 @@ namespace CppJieba { -class LevelSegment: public ISegment { +class LevelSegment: public SegmentBase{ public: LevelSegment(const string& dictPath, const string& userDictPath = "") @@ -15,7 +15,7 @@ class LevelSegment: public ISegment { LevelSegment(const DictTrie* dictTrie) : mpSeg_(dictTrie) { } - virtual ~LevelSegment() { + ~LevelSegment() { } void cut(Unicode::const_iterator begin, diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 4d05ff5..bae6bbf 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -6,7 +6,6 @@ #include #include "limonp/Logger.hpp" #include "DictTrie.hpp" -#include "ISegment.hpp" #include "SegmentBase.hpp" namespace CppJieba { @@ -22,50 +21,38 @@ class MPSegment: public SegmentBase { : dictTrie_(dictTrie), isNeedDestroy_(false) { assert(dictTrie_); } - virtual ~MPSegment() { + ~MPSegment() { if(isNeedDestroy_) { delete dictTrie_; } } - using SegmentBase::cut; - void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& words) const { - vector dags; - - dictTrie_->find(begin, end, dags); - - CalcDP(dags); - - Cut(dags, words); - } - bool cut(const string& sentence, + void cut(const string& sentence, vector& words, - size_t max_word_len) const { - Unicode unicode; - if (!TransCode::decode(sentence, unicode)) { - return false; + size_t max_word_len = MAX_WORD_LENGTH) const { + PreFilter pre_filter(symbols_, sentence); + PreFilter::Range range; + vector uwords; + uwords.reserve(sentence.size()); + while (pre_filter.HasNext()) { + range = pre_filter.Next(); + cut(range.begin, range.end, uwords, max_word_len); } - vector unicodeWords; - cut(unicode.begin(), unicode.end(), - unicodeWords, max_word_len); - words.resize(unicodeWords.size()); - for (size_t i = 0; i < words.size(); i++) { - TransCode::encode(unicodeWords[i], words[i]); - } - return true; + TransCode::encode(uwords, words); } void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& words, - size_t max_word_len) const { + size_t max_word_len = MAX_WORD_LENGTH) const { vector dags; dictTrie_->find(begin, end, dags, max_word_len); CalcDP(dags); - Cut(dags, words); + CutByDag(dags, words); } + const DictTrie* getDictTrie() const { return dictTrie_; } @@ -103,7 +90,7 @@ class MPSegment: public SegmentBase { } } } - void Cut(const vector& dags, + void CutByDag(const vector& dags, vector& words) const { size_t i = 0; while(i < dags.size()) { diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index 3c91899..a0dfce6 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -18,10 +18,23 @@ class MixSegment: public SegmentBase { MixSegment(const DictTrie* dictTrie, const HMMModel* model) : mpSeg_(dictTrie), hmmSeg_(model) { } - virtual ~MixSegment() { + ~MixSegment() { } - using SegmentBase::cut; - virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + + void cut(const string& sentence, + vector& words) const { + PreFilter pre_filter(symbols_, sentence); + PreFilter::Range range; + vector uwords; + uwords.reserve(sentence.size()); + while (pre_filter.HasNext()) { + range = pre_filter.Next(); + cut(range.begin, range.end, uwords); + } + TransCode::encode(uwords, words); + } + + void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { vector words; words.reserve(end - begin); mpSeg_.cut(begin, end, words); diff --git a/src/PosTagger.hpp b/src/PosTagger.hpp index 94b89b9..50d70b5 100644 --- a/src/PosTagger.hpp +++ b/src/PosTagger.hpp @@ -27,10 +27,7 @@ class PosTagger { bool tag(const string& src, vector >& res) const { vector cutRes; - if (!segment_.cut(src, cutRes)) { - LogError("mixSegment_ cut failed"); - return false; - } + segment_.cut(src, cutRes); const DictUnit *tmp = NULL; Unicode unico; diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index 9bb2801..c75fd43 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -6,7 +6,6 @@ #include #include "limonp/Logger.hpp" #include "DictTrie.hpp" -#include "ISegment.hpp" #include "SegmentBase.hpp" #include "FullSegment.hpp" #include "MixSegment.hpp" @@ -25,9 +24,20 @@ class QuerySegment: public SegmentBase { QuerySegment(const DictTrie* dictTrie, const HMMModel* model, size_t maxWordLen = 4) : mixSeg_(dictTrie, model), fullSeg_(dictTrie), maxWordLen_(maxWordLen) { } - virtual ~QuerySegment() { + ~QuerySegment() { + } + void cut(const string& sentence, + vector& words) const { + PreFilter pre_filter(symbols_, sentence); + PreFilter::Range range; + vector uwords; + uwords.reserve(sentence.size()); + while (pre_filter.HasNext()) { + range = pre_filter.Next(); + cut(range.begin, range.end, uwords); + } + TransCode::encode(uwords, words); } - using SegmentBase::cut; void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { //use mix cut first vector mixRes; diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp index a5c766d..ff00a98 100644 --- a/src/SegmentBase.hpp +++ b/src/SegmentBase.hpp @@ -3,7 +3,6 @@ #include "limonp/Logger.hpp" #include "PreFilter.hpp" -#include "ISegment.hpp" #include @@ -14,16 +13,17 @@ const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 65292u, 12290u}; using namespace limonp; -class SegmentBase: public ISegment { +class SegmentBase { public: SegmentBase() { LoadSpecialSymbols(); } - virtual ~SegmentBase() { + ~SegmentBase() { } + /* public: - virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const = 0; - virtual bool cut(const string& sentence, vector& words) const { + void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const = 0; + bool cut(const string& sentence, vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; vector uwords; @@ -32,14 +32,12 @@ class SegmentBase: public ISegment { range = pre_filter.Next(); cut(range.begin, range.end, uwords); } - words.resize(uwords.size()); - for (size_t i = 0; i < uwords.size(); i++) { - TransCode::encode(uwords[i], words[i]); - } + TransCode::encode(uwords, words); return true; } + */ - private: + protected: void LoadSpecialSymbols() { size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL); for(size_t i = 0; i < size; i ++) { @@ -47,7 +45,6 @@ class SegmentBase: public ISegment { } assert(symbols_.size()); } - unordered_set symbols_; }; // class SegmentBase diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp index 1d4f6bd..97a36f2 100644 --- a/test/unittest/TSegments.cpp +++ b/test/unittest/TSegments.cpp @@ -17,9 +17,9 @@ TEST(MixSegmentTest, Test1) { const char* str2 = "B超 T恤"; const char* res2[] = {"B超"," ", "T恤"}; vector words; - ASSERT_TRUE(segment.cut(str, words)); + segment.cut(str, words); ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); - ASSERT_TRUE(segment.cut(str2, words)); + segment.cut(str2, words); ASSERT_EQ(words, vector(res2, res2 + sizeof(res2)/sizeof(res2[0]))); } @@ -27,7 +27,7 @@ TEST(MixSegmentTest, NoUserDict) { MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8"); const char* str = "令狐冲是云计算方面的专家"; vector words; - ASSERT_TRUE(segment.cut(str, words)); + segment.cut(str, words); string res; ASSERT_EQ("[\"令狐冲\", \"是\", \"云\", \"计算\", \"方面\", \"的\", \"专家\"]", res << words); @@ -37,14 +37,14 @@ TEST(MixSegmentTest, UserDict) { { const char* str = "令狐冲是云计算方面的专家"; vector words; - ASSERT_TRUE(segment.cut(str, words)); + segment.cut(str, words); string res; ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words); } { const char* str = "小明先就职于IBM,后在日本京都大学深造"; vector words; - ASSERT_TRUE(segment.cut(str, words)); + segment.cut(str, words); string res; res << words; ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res); @@ -52,7 +52,7 @@ TEST(MixSegmentTest, UserDict) { { const char* str = "IBM,3.14"; vector words; - ASSERT_TRUE(segment.cut(str, words)); + segment.cut(str, words); string res; res << words; ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res); @@ -63,14 +63,14 @@ TEST(MixSegmentTest, UserDict2) { { const char* str = "令狐冲是云计算方面的专家"; vector words; - ASSERT_TRUE(segment.cut(str, words)); + segment.cut(str, words); string res; ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words); } { const char* str = "小明先就职于IBM,后在日本京都大学深造"; vector words; - ASSERT_TRUE(segment.cut(str, words)); + segment.cut(str, words); string res; res << words; ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"I\", \"B\", \"M\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res); @@ -78,7 +78,7 @@ TEST(MixSegmentTest, UserDict2) { { const char* str = "IBM,3.14"; vector words; - ASSERT_TRUE(segment.cut(str, words)); + segment.cut(str, words); string res; res << words; ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res); @@ -89,20 +89,20 @@ TEST(MPSegmentTest, Test1) { MPSegment segment("../dict/jieba.dict.utf8");; string s; vector words; - ASSERT_TRUE(segment.cut("我来自北京邮电大学。", words)); + segment.cut("我来自北京邮电大学。", words); ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", s << words); - ASSERT_TRUE(segment.cut("B超 T恤", words)); + segment.cut("B超 T恤", words); ASSERT_EQ(s << words, "[\"B超\", \" \", \"T恤\"]"); - ASSERT_TRUE(segment.cut("南京市长江大桥", words)); + segment.cut("南京市长江大桥", words); ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words); // MaxWordLen - ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3)); + segment.cut("南京市长江大桥", words, 3); ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words); - ASSERT_TRUE(segment.cut("南京市长江大桥", words, 0)); + segment.cut("南京市长江大桥", words, 0); ASSERT_EQ("[\"南\", \"京\", \"市\", \"长\", \"江\", \"大\", \"桥\"]", s << words); } @@ -142,7 +142,7 @@ TEST(HMMSegmentTest, Test1) { const char* str = "我来自北京邮电大学。。。学号123456"; const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"}; vector words; - ASSERT_TRUE(segment.cut(str, words)); + segment.cut(str, words); ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); } @@ -150,7 +150,7 @@ TEST(HMMSegmentTest, Test1) { const char* str = "IBM,1.2,123"; const char* res[] = {"IBM", ",", "1.2", ",", "123"}; vector words; - ASSERT_TRUE(segment.cut(str, words)); + segment.cut(str, words); ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); } } @@ -160,12 +160,12 @@ TEST(FullSegment, Test1) { vector words; string s; - ASSERT_TRUE(segment.cut("我来自北京邮电大学", words)); + segment.cut("我来自北京邮电大学", words); s << words; ASSERT_EQ(s, "[\"我\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]"); - ASSERT_TRUE(segment.cut("上市公司CEO", words)); + segment.cut("上市公司CEO", words); s << words; ASSERT_EQ(s, "[\"上市\", \"公司\", \"C\", \"E\", \"O\"]"); } @@ -175,7 +175,7 @@ TEST(QuerySegment, Test1) { const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"; vector words; - ASSERT_TRUE(segment.cut(str, words)); + segment.cut(str, words); string s1, s2; s1 << words; @@ -191,7 +191,7 @@ TEST(QuerySegment, Test2) { const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"; vector words; - ASSERT_TRUE(segment.cut(str, words)); + segment.cut(str, words); string s1, s2; s1 << words; @@ -203,7 +203,7 @@ TEST(QuerySegment, Test2) { const char* str = "小明硕士毕业于中国科学院计算所iPhone6"; vector words; - ASSERT_TRUE(segment.cut(str, words)); + segment.cut(str, words); string s1, s2; s1 << words;