abandon ISegment

yanyiwu 2015-09-13 17:02:04 +08:00
parent 6d69363145
commit 14974d51b4
11 changed files with 105 additions and 98 deletions
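The caller-facing effect of this commit: the segmenters' string-level cut() no longer returns a bool to be checked; it simply fills the output vector, so call sites (KeywordExtractor, PosTagger, the unit tests below) drop their error handling. A minimal, hypothetical caller illustrating the change (not part of the commit; the dictionary paths are the ones used in the tests, the file name and include path are assumptions):

// demo.cpp (hypothetical), built against the headers changed in this commit
#include <cstdio>
#include <string>
#include <vector>
#include "MixSegment.hpp"

int main() {
  CppJieba::MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
  std::vector<std::string> words;
  // before this commit: if (!segment.cut("我来自北京邮电大学。", words)) { /* handle error */ }
  // after this commit: cut() is void and always produces words
  segment.cut("我来自北京邮电大学。", words);
  for (size_t i = 0; i < words.size(); i++) {
    printf("%s\n", words[i].c_str());   // 我 / 来自 / 北京邮电大学 / 。
  }
  return 0;
}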

FullSegment.hpp

@@ -6,7 +6,6 @@
 #include <cassert>
 #include "limonp/Logger.hpp"
 #include "DictTrie.hpp"
-#include "ISegment.hpp"
 #include "SegmentBase.hpp"
 #include "TransCode.hpp"
@@ -22,13 +21,24 @@ class FullSegment: public SegmentBase {
     : dictTrie_(dictTrie), isNeedDestroy_(false) {
     assert(dictTrie_);
   }
-  virtual ~FullSegment() {
+  ~FullSegment() {
     if(isNeedDestroy_) {
       delete dictTrie_;
     }
   }
-  using SegmentBase::cut;
-  virtual void cut(Unicode::const_iterator begin,
+  void cut(const string& sentence,
+      vector<string>& words) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<Unicode> uwords;
+    uwords.reserve(sentence.size());
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      cut(range.begin, range.end, uwords);
+    }
+    TransCode::encode(uwords, words);
+  }
+  void cut(Unicode::const_iterator begin,
       Unicode::const_iterator end,
       vector<Unicode>& res) const {
     //resut of searching in trie tree

HMMSegment.hpp

@@ -17,14 +17,25 @@ class HMMSegment: public SegmentBase {
   HMMSegment(const HMMModel* model)
     : model_(model), isNeedDestroy_(false) {
   }
-  virtual ~HMMSegment() {
+  ~HMMSegment() {
     if(isNeedDestroy_) {
       delete model_;
     }
   }
-  using SegmentBase::cut;
-  void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const {
+  void cut(const string& sentence,
+      vector<string>& words) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<Unicode> uwords;
+    uwords.reserve(sentence.size());
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      cut(range.begin, range.end, uwords);
+    }
+    TransCode::encode(uwords, words);
+  }
+  void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
     Unicode::const_iterator left = begin;
     Unicode::const_iterator right = begin;
     while(right != end) {

ISegment.hpp (deleted)

@@ -1,15 +0,0 @@
-#ifndef CPPJIEBA_ISEGMENT_H
-#define CPPJIEBA_ISEGMENT_H
-
-namespace CppJieba {
-
-class ISegment {
- public:
-  virtual ~ISegment() {
-  }
-  virtual bool cut(const string& str, vector<string>& res) const = 0;
-};
-
-} // namespace CppJieba
-
-#endif // CPPJIEBA_ISEGMENT_H

KeywordExtractor.hpp

@@ -44,10 +44,7 @@ class KeywordExtractor {
   bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
     vector<string> words;
-    if(!segment_.cut(str, words)) {
-      LogError("segment cut(%s) failed.", str.c_str());
-      return false;
-    }
+    segment_.cut(str, words);
     map<string, double> wordmap;
     for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {

LevelSegment.hpp

@@ -5,7 +5,7 @@
 namespace CppJieba {
-class LevelSegment: public ISegment {
+class LevelSegment: public SegmentBase{
  public:
   LevelSegment(const string& dictPath,
       const string& userDictPath = "")
@@ -15,7 +15,7 @@ class LevelSegment: public ISegment {
   LevelSegment(const DictTrie* dictTrie)
     : mpSeg_(dictTrie) {
   }
-  virtual ~LevelSegment() {
+  ~LevelSegment() {
   }
   void cut(Unicode::const_iterator begin,

MPSegment.hpp

@@ -6,7 +6,6 @@
 #include <cassert>
 #include "limonp/Logger.hpp"
 #include "DictTrie.hpp"
-#include "ISegment.hpp"
 #include "SegmentBase.hpp"
 namespace CppJieba {
@@ -22,50 +21,38 @@ class MPSegment: public SegmentBase {
     : dictTrie_(dictTrie), isNeedDestroy_(false) {
     assert(dictTrie_);
   }
-  virtual ~MPSegment() {
+  ~MPSegment() {
     if(isNeedDestroy_) {
       delete dictTrie_;
     }
   }
-  using SegmentBase::cut;
-  void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& words) const {
-    vector<Dag> dags;
-    dictTrie_->find(begin, end, dags);
-    CalcDP(dags);
-    Cut(dags, words);
-  }
-  bool cut(const string& sentence,
-      vector<string>& words,
-      size_t max_word_len) const {
-    Unicode unicode;
-    if (!TransCode::decode(sentence, unicode)) {
-      return false;
-    }
-    vector<Unicode> unicodeWords;
-    cut(unicode.begin(), unicode.end(),
-        unicodeWords, max_word_len);
-    words.resize(unicodeWords.size());
-    for (size_t i = 0; i < words.size(); i++) {
-      TransCode::encode(unicodeWords[i], words[i]);
-    }
-    return true;
-  }
+  void cut(const string& sentence,
+      vector<string>& words,
+      size_t max_word_len = MAX_WORD_LENGTH) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<Unicode> uwords;
+    uwords.reserve(sentence.size());
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      cut(range.begin, range.end, uwords, max_word_len);
+    }
+    TransCode::encode(uwords, words);
+  }
   void cut(Unicode::const_iterator begin,
       Unicode::const_iterator end,
       vector<Unicode>& words,
-      size_t max_word_len) const {
+      size_t max_word_len = MAX_WORD_LENGTH) const {
     vector<Dag> dags;
     dictTrie_->find(begin,
         end,
         dags,
         max_word_len);
     CalcDP(dags);
-    Cut(dags, words);
+    CutByDag(dags, words);
   }
   const DictTrie* getDictTrie() const {
     return dictTrie_;
   }
@@ -103,7 +90,7 @@ class MPSegment: public SegmentBase {
       }
     }
   }
-  void Cut(const vector<Dag>& dags,
+  void CutByDag(const vector<Dag>& dags,
       vector<Unicode>& words) const {
     size_t i = 0;
     while(i < dags.size()) {

MixSegment.hpp

@@ -18,10 +18,23 @@ class MixSegment: public SegmentBase {
   MixSegment(const DictTrie* dictTrie, const HMMModel* model)
     : mpSeg_(dictTrie), hmmSeg_(model) {
   }
-  virtual ~MixSegment() {
+  ~MixSegment() {
   }
-  using SegmentBase::cut;
-  virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
+  void cut(const string& sentence,
+      vector<string>& words) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<Unicode> uwords;
+    uwords.reserve(sentence.size());
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      cut(range.begin, range.end, uwords);
+    }
+    TransCode::encode(uwords, words);
+  }
+  void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
     vector<Unicode> words;
     words.reserve(end - begin);
     mpSeg_.cut(begin, end, words);

PosTagger.hpp

@@ -27,10 +27,7 @@ class PosTagger {
   bool tag(const string& src, vector<pair<string, string> >& res) const {
     vector<string> cutRes;
-    if (!segment_.cut(src, cutRes)) {
-      LogError("mixSegment_ cut failed");
-      return false;
-    }
+    segment_.cut(src, cutRes);
     const DictUnit *tmp = NULL;
     Unicode unico;

QuerySegment.hpp

@@ -6,7 +6,6 @@
 #include <cassert>
 #include "limonp/Logger.hpp"
 #include "DictTrie.hpp"
-#include "ISegment.hpp"
 #include "SegmentBase.hpp"
 #include "FullSegment.hpp"
 #include "MixSegment.hpp"
@@ -25,9 +24,20 @@ class QuerySegment: public SegmentBase {
   QuerySegment(const DictTrie* dictTrie, const HMMModel* model, size_t maxWordLen = 4)
     : mixSeg_(dictTrie, model), fullSeg_(dictTrie), maxWordLen_(maxWordLen) {
   }
-  virtual ~QuerySegment() {
+  ~QuerySegment() {
   }
-  using SegmentBase::cut;
+  void cut(const string& sentence,
+      vector<string>& words) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<Unicode> uwords;
+    uwords.reserve(sentence.size());
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      cut(range.begin, range.end, uwords);
+    }
+    TransCode::encode(uwords, words);
+  }
   void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
     //use mix cut first
     vector<Unicode> mixRes;

SegmentBase.hpp

@@ -3,7 +3,6 @@
 #include "limonp/Logger.hpp"
 #include "PreFilter.hpp"
-#include "ISegment.hpp"
 #include <cassert>
@@ -14,16 +13,17 @@ const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 65292u, 12290u};
 using namespace limonp;
-class SegmentBase: public ISegment {
+class SegmentBase {
  public:
   SegmentBase() {
     LoadSpecialSymbols();
   }
-  virtual ~SegmentBase() {
+  ~SegmentBase() {
   }
+  /*
  public:
-  virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
-  virtual bool cut(const string& sentence, vector<string>& words) const {
+  void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
+  bool cut(const string& sentence, vector<string>& words) const {
     PreFilter pre_filter(symbols_, sentence);
     PreFilter::Range range;
     vector<Unicode> uwords;
@@ -32,14 +32,12 @@ class SegmentBase: public ISegment {
       range = pre_filter.Next();
       cut(range.begin, range.end, uwords);
     }
-    words.resize(uwords.size());
-    for (size_t i = 0; i < uwords.size(); i++) {
-      TransCode::encode(uwords[i], words[i]);
-    }
+    TransCode::encode(uwords, words);
     return true;
   }
+  */
- private:
+ protected:
   void LoadSpecialSymbols() {
     size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
     for(size_t i = 0; i < size; i ++) {
@@ -47,7 +45,6 @@ class SegmentBase: public ISegment {
     }
     assert(symbols_.size());
   }
   unordered_set<Rune> symbols_;
 }; // class SegmentBase
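The net effect of the SegmentBase change: the base-class cut() interface is commented out and symbols_ moves from private to protected, so each segmenter constructs its own PreFilter. The string-level wrapper that every subclass now repeats looks like this (copied from the added hunks above; the comments are added here for explanation and are not in the commit):

void cut(const string& sentence, vector<string>& words) const {
  PreFilter pre_filter(symbols_, sentence);   // split the sentence at special symbols (space, tab, newline, "，", "。")
  PreFilter::Range range;
  vector<Unicode> uwords;
  uwords.reserve(sentence.size());
  while (pre_filter.HasNext()) {
    range = pre_filter.Next();
    cut(range.begin, range.end, uwords);      // class-specific Unicode-level segmentation
  }
  TransCode::encode(uwords, words);           // encode the Unicode pieces back into UTF-8 strings
}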

segment unit tests

@@ -17,9 +17,9 @@ TEST(MixSegmentTest, Test1) {
   const char* str2 = "B超 T恤";
   const char* res2[] = {"B超"," ", "T恤"};
   vector<string> words;
-  ASSERT_TRUE(segment.cut(str, words));
+  segment.cut(str, words);
   ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
-  ASSERT_TRUE(segment.cut(str2, words));
+  segment.cut(str2, words);
   ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
 }
@@ -27,7 +27,7 @@ TEST(MixSegmentTest, NoUserDict) {
   MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
   const char* str = "令狐冲是云计算方面的专家";
   vector<string> words;
-  ASSERT_TRUE(segment.cut(str, words));
+  segment.cut(str, words);
   string res;
   ASSERT_EQ("[\"令狐冲\", \"是\", \"云\", \"计算\", \"方面\", \"的\", \"专家\"]", res << words);
@@ -37,14 +37,14 @@ TEST(MixSegmentTest, UserDict) {
   {
     const char* str = "令狐冲是云计算方面的专家";
     vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
     string res;
     ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
   }
   {
     const char* str = "小明先就职于IBM,后在日本京都大学深造";
     vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
     string res;
     res << words;
     ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
@@ -52,7 +52,7 @@ TEST(MixSegmentTest, UserDict) {
   {
     const char* str = "IBM,3.14";
     vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
     string res;
     res << words;
     ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
@@ -63,14 +63,14 @@ TEST(MixSegmentTest, UserDict2) {
   {
     const char* str = "令狐冲是云计算方面的专家";
     vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
     string res;
     ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
   }
   {
     const char* str = "小明先就职于IBM,后在日本京都大学深造";
     vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
     string res;
     res << words;
     ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"I\", \"B\", \"M\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
@@ -78,7 +78,7 @@ TEST(MixSegmentTest, UserDict2) {
   {
     const char* str = "IBM,3.14";
     vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
     string res;
     res << words;
     ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
@@ -89,20 +89,20 @@ TEST(MPSegmentTest, Test1) {
   MPSegment segment("../dict/jieba.dict.utf8");;
   string s;
   vector<string> words;
-  ASSERT_TRUE(segment.cut("我来自北京邮电大学。", words));
+  segment.cut("我来自北京邮电大学。", words);
   ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", s << words);
-  ASSERT_TRUE(segment.cut("B超 T恤", words));
+  segment.cut("B超 T恤", words);
   ASSERT_EQ(s << words, "[\"B超\", \" \", \"T恤\"]");
-  ASSERT_TRUE(segment.cut("南京市长江大桥", words));
+  segment.cut("南京市长江大桥", words);
   ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
   // MaxWordLen
-  ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
+  segment.cut("南京市长江大桥", words, 3);
   ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
-  ASSERT_TRUE(segment.cut("南京市长江大桥", words, 0));
+  segment.cut("南京市长江大桥", words, 0);
   ASSERT_EQ("[\"南\", \"京\", \"市\", \"长\", \"江\", \"大\", \"桥\"]", s << words);
 }
@@ -142,7 +142,7 @@ TEST(HMMSegmentTest, Test1) {
   const char* str = "我来自北京邮电大学。。。学号123456";
   const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
   vector<string> words;
-  ASSERT_TRUE(segment.cut(str, words));
+  segment.cut(str, words);
   ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
 }
@@ -150,7 +150,7 @@ TEST(HMMSegmentTest, Test1) {
     const char* str = "IBM,1.2,123";
     const char* res[] = {"IBM", ",", "1.2", ",", "123"};
     vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
     ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
   }
 }
@@ -160,12 +160,12 @@ TEST(FullSegment, Test1) {
   vector<string> words;
   string s;
-  ASSERT_TRUE(segment.cut("我来自北京邮电大学", words));
+  segment.cut("我来自北京邮电大学", words);
   s << words;
   ASSERT_EQ(s, "[\"我\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]");
-  ASSERT_TRUE(segment.cut("上市公司CEO", words));
+  segment.cut("上市公司CEO", words);
   s << words;
   ASSERT_EQ(s, "[\"上市\", \"公司\", \"C\", \"E\", \"O\"]");
 }
@@ -175,7 +175,7 @@ TEST(QuerySegment, Test1) {
   const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
   vector<string> words;
-  ASSERT_TRUE(segment.cut(str, words));
+  segment.cut(str, words);
   string s1, s2;
   s1 << words;
@@ -191,7 +191,7 @@ TEST(QuerySegment, Test2) {
   const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
   vector<string> words;
-  ASSERT_TRUE(segment.cut(str, words));
+  segment.cut(str, words);
   string s1, s2;
   s1 << words;
@@ -203,7 +203,7 @@ TEST(QuerySegment, Test2) {
   const char* str = "小明硕士毕业于中国科学院计算所iPhone6";
   vector<string> words;
-  ASSERT_TRUE(segment.cut(str, words));
+  segment.cut(str, words);
   string s1, s2;
   s1 << words;