abandon ISegment

This commit is contained in:
yanyiwu 2015-09-13 17:02:04 +08:00
parent 6d69363145
commit 14974d51b4
11 changed files with 105 additions and 98 deletions

View File

@ -6,7 +6,6 @@
#include <cassert>
#include "limonp/Logger.hpp"
#include "DictTrie.hpp"
#include "ISegment.hpp"
#include "SegmentBase.hpp"
#include "TransCode.hpp"
@ -22,13 +21,24 @@ class FullSegment: public SegmentBase {
: dictTrie_(dictTrie), isNeedDestroy_(false) {
assert(dictTrie_);
}
virtual ~FullSegment() {
~FullSegment() {
if(isNeedDestroy_) {
delete dictTrie_;
}
}
using SegmentBase::cut;
virtual void cut(Unicode::const_iterator begin,
void cut(const string& sentence,
vector<string>& words) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<Unicode> uwords;
uwords.reserve(sentence.size());
while (pre_filter.HasNext()) {
range = pre_filter.Next();
cut(range.begin, range.end, uwords);
}
TransCode::encode(uwords, words);
}
void cut(Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<Unicode>& res) const {
//result of searching in trie tree

View File

@ -17,14 +17,25 @@ class HMMSegment: public SegmentBase {
HMMSegment(const HMMModel* model)
: model_(model), isNeedDestroy_(false) {
}
virtual ~HMMSegment() {
~HMMSegment() {
if(isNeedDestroy_) {
delete model_;
}
}
using SegmentBase::cut;
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const {
void cut(const string& sentence,
vector<string>& words) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<Unicode> uwords;
uwords.reserve(sentence.size());
while (pre_filter.HasNext()) {
range = pre_filter.Next();
cut(range.begin, range.end, uwords);
}
TransCode::encode(uwords, words);
}
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
Unicode::const_iterator left = begin;
Unicode::const_iterator right = begin;
while(right != end) {

View File

@ -1,15 +0,0 @@
#ifndef CPPJIEBA_ISEGMENT_H
#define CPPJIEBA_ISEGMENT_H
namespace CppJieba {
class ISegment {
public:
virtual ~ISegment() {
}
virtual bool cut(const string& str, vector<string>& res) const = 0;
};
} // namespace CppJieba
#endif // CPPJIEBA_ISEGMENT_H

View File

@ -44,10 +44,7 @@ class KeywordExtractor {
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
vector<string> words;
if(!segment_.cut(str, words)) {
LogError("segment cut(%s) failed.", str.c_str());
return false;
}
segment_.cut(str, words);
map<string, double> wordmap;
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {

View File

@ -5,7 +5,7 @@
namespace CppJieba {
class LevelSegment: public ISegment {
class LevelSegment: public SegmentBase{
public:
LevelSegment(const string& dictPath,
const string& userDictPath = "")
@ -15,7 +15,7 @@ class LevelSegment: public ISegment {
LevelSegment(const DictTrie* dictTrie)
: mpSeg_(dictTrie) {
}
virtual ~LevelSegment() {
~LevelSegment() {
}
void cut(Unicode::const_iterator begin,

View File

@ -6,7 +6,6 @@
#include <cassert>
#include "limonp/Logger.hpp"
#include "DictTrie.hpp"
#include "ISegment.hpp"
#include "SegmentBase.hpp"
namespace CppJieba {
@ -22,50 +21,38 @@ class MPSegment: public SegmentBase {
: dictTrie_(dictTrie), isNeedDestroy_(false) {
assert(dictTrie_);
}
virtual ~MPSegment() {
~MPSegment() {
if(isNeedDestroy_) {
delete dictTrie_;
}
}
using SegmentBase::cut;
void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& words) const {
vector<Dag> dags;
dictTrie_->find(begin, end, dags);
CalcDP(dags);
Cut(dags, words);
}
bool cut(const string& sentence,
void cut(const string& sentence,
vector<string>& words,
size_t max_word_len) const {
Unicode unicode;
if (!TransCode::decode(sentence, unicode)) {
return false;
size_t max_word_len = MAX_WORD_LENGTH) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<Unicode> uwords;
uwords.reserve(sentence.size());
while (pre_filter.HasNext()) {
range = pre_filter.Next();
cut(range.begin, range.end, uwords, max_word_len);
}
vector<Unicode> unicodeWords;
cut(unicode.begin(), unicode.end(),
unicodeWords, max_word_len);
words.resize(unicodeWords.size());
for (size_t i = 0; i < words.size(); i++) {
TransCode::encode(unicodeWords[i], words[i]);
}
return true;
TransCode::encode(uwords, words);
}
void cut(Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<Unicode>& words,
size_t max_word_len) const {
size_t max_word_len = MAX_WORD_LENGTH) const {
vector<Dag> dags;
dictTrie_->find(begin,
end,
dags,
max_word_len);
CalcDP(dags);
Cut(dags, words);
CutByDag(dags, words);
}
const DictTrie* getDictTrie() const {
return dictTrie_;
}
@ -103,7 +90,7 @@ class MPSegment: public SegmentBase {
}
}
}
void Cut(const vector<Dag>& dags,
void CutByDag(const vector<Dag>& dags,
vector<Unicode>& words) const {
size_t i = 0;
while(i < dags.size()) {

View File

@ -18,10 +18,23 @@ class MixSegment: public SegmentBase {
MixSegment(const DictTrie* dictTrie, const HMMModel* model)
: mpSeg_(dictTrie), hmmSeg_(model) {
}
virtual ~MixSegment() {
~MixSegment() {
}
using SegmentBase::cut;
virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
void cut(const string& sentence,
vector<string>& words) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<Unicode> uwords;
uwords.reserve(sentence.size());
while (pre_filter.HasNext()) {
range = pre_filter.Next();
cut(range.begin, range.end, uwords);
}
TransCode::encode(uwords, words);
}
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
vector<Unicode> words;
words.reserve(end - begin);
mpSeg_.cut(begin, end, words);

View File

@ -27,10 +27,7 @@ class PosTagger {
bool tag(const string& src, vector<pair<string, string> >& res) const {
vector<string> cutRes;
if (!segment_.cut(src, cutRes)) {
LogError("mixSegment_ cut failed");
return false;
}
segment_.cut(src, cutRes);
const DictUnit *tmp = NULL;
Unicode unico;

View File

@ -6,7 +6,6 @@
#include <cassert>
#include "limonp/Logger.hpp"
#include "DictTrie.hpp"
#include "ISegment.hpp"
#include "SegmentBase.hpp"
#include "FullSegment.hpp"
#include "MixSegment.hpp"
@ -25,9 +24,20 @@ class QuerySegment: public SegmentBase {
QuerySegment(const DictTrie* dictTrie, const HMMModel* model, size_t maxWordLen = 4)
: mixSeg_(dictTrie, model), fullSeg_(dictTrie), maxWordLen_(maxWordLen) {
}
virtual ~QuerySegment() {
~QuerySegment() {
}
void cut(const string& sentence,
vector<string>& words) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<Unicode> uwords;
uwords.reserve(sentence.size());
while (pre_filter.HasNext()) {
range = pre_filter.Next();
cut(range.begin, range.end, uwords);
}
TransCode::encode(uwords, words);
}
using SegmentBase::cut;
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
//use mix cut first
vector<Unicode> mixRes;

View File

@ -3,7 +3,6 @@
#include "limonp/Logger.hpp"
#include "PreFilter.hpp"
#include "ISegment.hpp"
#include <cassert>
@ -14,16 +13,17 @@ const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 65292u, 12290u};
using namespace limonp;
class SegmentBase: public ISegment {
class SegmentBase {
public:
SegmentBase() {
LoadSpecialSymbols();
}
virtual ~SegmentBase() {
~SegmentBase() {
}
/*
public:
virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
virtual bool cut(const string& sentence, vector<string>& words) const {
void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
bool cut(const string& sentence, vector<string>& words) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<Unicode> uwords;
@ -32,14 +32,12 @@ class SegmentBase: public ISegment {
range = pre_filter.Next();
cut(range.begin, range.end, uwords);
}
words.resize(uwords.size());
for (size_t i = 0; i < uwords.size(); i++) {
TransCode::encode(uwords[i], words[i]);
}
TransCode::encode(uwords, words);
return true;
}
*/
private:
protected:
void LoadSpecialSymbols() {
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
for(size_t i = 0; i < size; i ++) {
@ -47,7 +45,6 @@ class SegmentBase: public ISegment {
}
assert(symbols_.size());
}
unordered_set<Rune> symbols_;
}; // class SegmentBase

View File

@ -17,9 +17,9 @@ TEST(MixSegmentTest, Test1) {
const char* str2 = "B超 T恤";
const char* res2[] = {"B超"," ", "T恤"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
ASSERT_TRUE(segment.cut(str2, words));
segment.cut(str2, words);
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
}
@ -27,7 +27,7 @@ TEST(MixSegmentTest, NoUserDict) {
MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
const char* str = "令狐冲是云计算方面的专家";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string res;
ASSERT_EQ("[\"令狐冲\", \"\", \"\", \"计算\", \"方面\", \"\", \"专家\"]", res << words);
@ -37,14 +37,14 @@ TEST(MixSegmentTest, UserDict) {
{
const char* str = "令狐冲是云计算方面的专家";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string res;
ASSERT_EQ("[\"令狐冲\", \"\", \"云计算\", \"方面\", \"\", \"专家\"]", res << words);
}
{
const char* str = "小明先就职于IBM,后在日本京都大学深造";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string res;
res << words;
ASSERT_EQ("[\"小明\", \"\", \"就职\", \"\", \"IBM\", \",\", \"\", \"\", \"日本\", \"京都大学\", \"深造\"]", res);
@ -52,7 +52,7 @@ TEST(MixSegmentTest, UserDict) {
{
const char* str = "IBM,3.14";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string res;
res << words;
ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
@ -63,14 +63,14 @@ TEST(MixSegmentTest, UserDict2) {
{
const char* str = "令狐冲是云计算方面的专家";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string res;
ASSERT_EQ("[\"令狐冲\", \"\", \"云计算\", \"方面\", \"\", \"专家\"]", res << words);
}
{
const char* str = "小明先就职于IBM,后在日本京都大学深造";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string res;
res << words;
ASSERT_EQ("[\"小明\", \"\", \"就职\", \"\", \"I\", \"B\", \"M\", \",\", \"\", \"\", \"日本\", \"京都大学\", \"深造\"]", res);
@ -78,7 +78,7 @@ TEST(MixSegmentTest, UserDict2) {
{
const char* str = "IBM,3.14";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string res;
res << words;
ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
@ -89,20 +89,20 @@ TEST(MPSegmentTest, Test1) {
MPSegment segment("../dict/jieba.dict.utf8");;
string s;
vector<string> words;
ASSERT_TRUE(segment.cut("我来自北京邮电大学。", words));
segment.cut("我来自北京邮电大学。", words);
ASSERT_EQ("[\"\", \"来自\", \"北京邮电大学\", \"\"]", s << words);
ASSERT_TRUE(segment.cut("B超 T恤", words));
segment.cut("B超 T恤", words);
ASSERT_EQ(s << words, "[\"B超\", \" \", \"T恤\"]");
ASSERT_TRUE(segment.cut("南京市长江大桥", words));
segment.cut("南京市长江大桥", words);
ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
// MaxWordLen
ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
segment.cut("南京市长江大桥", words, 3);
ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
ASSERT_TRUE(segment.cut("南京市长江大桥", words, 0));
segment.cut("南京市长江大桥", words, 0);
ASSERT_EQ("[\"\", \"\", \"\", \"\", \"\", \"\", \"\"]", s << words);
}
@ -142,7 +142,7 @@ TEST(HMMSegmentTest, Test1) {
const char* str = "我来自北京邮电大学。。。学号123456";
const char* res[] = {"我来", "自北京", "邮电大学", "", "", "", "学号", "123456"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
@ -150,7 +150,7 @@ TEST(HMMSegmentTest, Test1) {
const char* str = "IBM,1.2,123";
const char* res[] = {"IBM", ",", "1.2", ",", "123"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
}
@ -160,12 +160,12 @@ TEST(FullSegment, Test1) {
vector<string> words;
string s;
ASSERT_TRUE(segment.cut("我来自北京邮电大学", words));
segment.cut("我来自北京邮电大学", words);
s << words;
ASSERT_EQ(s, "[\"\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]");
ASSERT_TRUE(segment.cut("上市公司CEO", words));
segment.cut("上市公司CEO", words);
s << words;
ASSERT_EQ(s, "[\"上市\", \"公司\", \"C\", \"E\", \"O\"]");
}
@ -175,7 +175,7 @@ TEST(QuerySegment, Test1) {
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string s1, s2;
s1 << words;
@ -191,7 +191,7 @@ TEST(QuerySegment, Test2) {
const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string s1, s2;
s1 << words;
@ -203,7 +203,7 @@ TEST(QuerySegment, Test2) {
const char* str = "小明硕士毕业于中国科学院计算所iPhone6";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
segment.cut(str, words);
string s1, s2;
s1 << words;