Mirror of https://github.com/yanyiwu/cppjieba.git (synced 2025-07-18 00:00:12 +08:00)
abandon ISegment
parent 6d69363145
commit 14974d51b4
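The recurring change in this commit: the ISegment interface is removed, SegmentBase no longer inherits from it (its generic cut methods are commented out), and each segmenter (FullSegment, HMMSegment, MPSegment, MixSegment, QuerySegment) now defines its own non-virtual void cut(const string&, vector<string>&) const wrapper that pre-filters the sentence, cuts each range at the Unicode level, and encodes the runes back to strings. The sketch below is a minimal, self-contained illustration of that wrapper pattern, not code from the repository: Unicode, the decode/encode helpers, ToyPreFilter, and ToySegment are simplified ASCII-only stand-ins for cppjieba's TransCode, PreFilter, and dictionary/HMM-based segmenters.

// Minimal sketch of the "string-level cut wrapper over a rune-level cut" shape
// added to each segmenter in this commit. All types below are simplified
// stand-ins for illustration only.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

typedef std::vector<uint32_t> Unicode;  // stand-in for cppjieba's rune sequence

// Stand-ins for TransCode::decode/encode: treat each byte as one rune,
// so the sketch stays ASCII-only and dependency-free.
static Unicode decode(const std::string& s) {
  return Unicode(s.begin(), s.end());
}
static void encode(const std::vector<Unicode>& uwords, std::vector<std::string>& words) {
  words.clear();
  for (size_t i = 0; i < uwords.size(); i++) {
    words.push_back(std::string(uwords[i].begin(), uwords[i].end()));
  }
}

// Stand-in for PreFilter: yields ranges of the input split on a separator.
struct ToyPreFilter {
  struct Range {
    Unicode::const_iterator begin;
    Unicode::const_iterator end;
  };
  ToyPreFilter(const Unicode& sentence) : cur_(sentence.begin()), end_(sentence.end()) {}
  bool HasNext() const { return cur_ != end_; }
  Range Next() {
    Range r;
    r.begin = cur_;
    while (cur_ != end_ && *cur_ != ' ') cur_++;  // split on space only, for brevity
    r.end = cur_;
    if (cur_ != end_) cur_++;                     // skip the separator itself
    return r;
  }
 private:
  Unicode::const_iterator cur_, end_;
};

// Stand-in segmenter with the two-overload shape used after this commit:
// a rune-level cut plus a non-virtual string-level wrapper, no ISegment base.
class ToySegment {
 public:
  // Rune-level cut: here it simply emits the whole range as one "word".
  void cut(Unicode::const_iterator begin, Unicode::const_iterator end,
           std::vector<Unicode>& res) const {
    if (begin != end) res.push_back(Unicode(begin, end));
  }
  // String-level wrapper, mirroring the bodies added in the diff below.
  void cut(const std::string& sentence, std::vector<std::string>& words) const {
    Unicode unicode = decode(sentence);
    ToyPreFilter pre_filter(unicode);
    ToyPreFilter::Range range;
    std::vector<Unicode> uwords;
    uwords.reserve(sentence.size());
    while (pre_filter.HasNext()) {
      range = pre_filter.Next();
      cut(range.begin, range.end, uwords);
    }
    encode(uwords, words);
  }
};

int main() {
  ToySegment seg;
  std::vector<std::string> words;
  seg.cut("hello cppjieba world", words);  // no bool return to check anymore
  for (size_t i = 0; i < words.size(); i++) {
    std::cout << words[i] << "\n";
  }
  return 0;
}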
@@ -6,7 +6,6 @@
 #include <cassert>
 #include "limonp/Logger.hpp"
 #include "DictTrie.hpp"
-#include "ISegment.hpp"
 #include "SegmentBase.hpp"
 #include "TransCode.hpp"
 
@@ -22,13 +21,24 @@ class FullSegment: public SegmentBase {
     : dictTrie_(dictTrie), isNeedDestroy_(false) {
     assert(dictTrie_);
   }
-  virtual ~FullSegment() {
+  ~FullSegment() {
     if(isNeedDestroy_) {
       delete dictTrie_;
     }
   }
-  using SegmentBase::cut;
-  virtual void cut(Unicode::const_iterator begin,
+  void cut(const string& sentence,
+        vector<string>& words) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<Unicode> uwords;
+    uwords.reserve(sentence.size());
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      cut(range.begin, range.end, uwords);
+    }
+    TransCode::encode(uwords, words);
+  }
+  void cut(Unicode::const_iterator begin,
         Unicode::const_iterator end,
         vector<Unicode>& res) const {
     //resut of searching in trie tree
@@ -17,14 +17,25 @@ class HMMSegment: public SegmentBase {
   HMMSegment(const HMMModel* model)
     : model_(model), isNeedDestroy_(false) {
   }
-  virtual ~HMMSegment() {
+  ~HMMSegment() {
     if(isNeedDestroy_) {
       delete model_;
     }
   }
 
-  using SegmentBase::cut;
-  void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const {
+  void cut(const string& sentence,
+        vector<string>& words) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<Unicode> uwords;
+    uwords.reserve(sentence.size());
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      cut(range.begin, range.end, uwords);
+    }
+    TransCode::encode(uwords, words);
+  }
+  void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
     Unicode::const_iterator left = begin;
     Unicode::const_iterator right = begin;
     while(right != end) {
@@ -1,15 +0,0 @@
-#ifndef CPPJIEBA_ISEGMENT_H
-#define CPPJIEBA_ISEGMENT_H
-
-namespace CppJieba {
-
-class ISegment {
- public:
-  virtual ~ISegment() {
-  }
-  virtual bool cut(const string& str, vector<string>& res) const = 0;
-};
-
-} // namespace CppJieba
-
-#endif // CPPJIEBA_ISEGMENT_H
@@ -44,10 +44,7 @@ class KeywordExtractor {
 
   bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
     vector<string> words;
-    if(!segment_.cut(str, words)) {
-      LogError("segment cut(%s) failed.", str.c_str());
-      return false;
-    }
+    segment_.cut(str, words);
 
     map<string, double> wordmap;
     for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
@@ -5,7 +5,7 @@
 
 namespace CppJieba {
 
-class LevelSegment: public ISegment {
+class LevelSegment: public SegmentBase{
  public:
   LevelSegment(const string& dictPath,
         const string& userDictPath = "")
@@ -15,7 +15,7 @@ class LevelSegment: public ISegment {
   LevelSegment(const DictTrie* dictTrie)
     : mpSeg_(dictTrie) {
   }
-  virtual ~LevelSegment() {
+  ~LevelSegment() {
   }
 
   void cut(Unicode::const_iterator begin,
@@ -6,7 +6,6 @@
 #include <cassert>
 #include "limonp/Logger.hpp"
 #include "DictTrie.hpp"
-#include "ISegment.hpp"
 #include "SegmentBase.hpp"
 
 namespace CppJieba {
@@ -22,50 +21,38 @@ class MPSegment: public SegmentBase {
     : dictTrie_(dictTrie), isNeedDestroy_(false) {
     assert(dictTrie_);
   }
-  virtual ~MPSegment() {
+  ~MPSegment() {
     if(isNeedDestroy_) {
       delete dictTrie_;
     }
   }
 
-  using SegmentBase::cut;
-  void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& words) const {
-    vector<Dag> dags;
-
-    dictTrie_->find(begin, end, dags);
-
-    CalcDP(dags);
-
-    Cut(dags, words);
-  }
-  bool cut(const string& sentence,
+  void cut(const string& sentence,
         vector<string>& words,
-        size_t max_word_len) const {
-    Unicode unicode;
-    if (!TransCode::decode(sentence, unicode)) {
-      return false;
+        size_t max_word_len = MAX_WORD_LENGTH) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<Unicode> uwords;
+    uwords.reserve(sentence.size());
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      cut(range.begin, range.end, uwords, max_word_len);
     }
-    vector<Unicode> unicodeWords;
-    cut(unicode.begin(), unicode.end(),
-          unicodeWords, max_word_len);
-    words.resize(unicodeWords.size());
-    for (size_t i = 0; i < words.size(); i++) {
-      TransCode::encode(unicodeWords[i], words[i]);
-    }
-    return true;
+    TransCode::encode(uwords, words);
   }
   void cut(Unicode::const_iterator begin,
         Unicode::const_iterator end,
         vector<Unicode>& words,
-        size_t max_word_len) const {
+        size_t max_word_len = MAX_WORD_LENGTH) const {
     vector<Dag> dags;
     dictTrie_->find(begin,
           end,
           dags,
           max_word_len);
     CalcDP(dags);
-    Cut(dags, words);
+    CutByDag(dags, words);
   }
 
   const DictTrie* getDictTrie() const {
     return dictTrie_;
   }
@@ -103,7 +90,7 @@ class MPSegment: public SegmentBase {
       }
     }
   }
-  void Cut(const vector<Dag>& dags,
+  void CutByDag(const vector<Dag>& dags,
         vector<Unicode>& words) const {
     size_t i = 0;
     while(i < dags.size()) {
@@ -18,10 +18,23 @@ class MixSegment: public SegmentBase {
   MixSegment(const DictTrie* dictTrie, const HMMModel* model)
     : mpSeg_(dictTrie), hmmSeg_(model) {
   }
-  virtual ~MixSegment() {
+  ~MixSegment() {
   }
-  using SegmentBase::cut;
-  virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
+
+  void cut(const string& sentence,
+        vector<string>& words) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<Unicode> uwords;
+    uwords.reserve(sentence.size());
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      cut(range.begin, range.end, uwords);
+    }
+    TransCode::encode(uwords, words);
+  }
+
+  void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
     vector<Unicode> words;
     words.reserve(end - begin);
     mpSeg_.cut(begin, end, words);
@@ -27,10 +27,7 @@ class PosTagger {
 
   bool tag(const string& src, vector<pair<string, string> >& res) const {
     vector<string> cutRes;
-    if (!segment_.cut(src, cutRes)) {
-      LogError("mixSegment_ cut failed");
-      return false;
-    }
+    segment_.cut(src, cutRes);
 
     const DictUnit *tmp = NULL;
     Unicode unico;
@@ -6,7 +6,6 @@
 #include <cassert>
 #include "limonp/Logger.hpp"
 #include "DictTrie.hpp"
-#include "ISegment.hpp"
 #include "SegmentBase.hpp"
 #include "FullSegment.hpp"
 #include "MixSegment.hpp"
@@ -25,9 +24,20 @@ class QuerySegment: public SegmentBase {
   QuerySegment(const DictTrie* dictTrie, const HMMModel* model, size_t maxWordLen = 4)
     : mixSeg_(dictTrie, model), fullSeg_(dictTrie), maxWordLen_(maxWordLen) {
   }
-  virtual ~QuerySegment() {
+  ~QuerySegment() {
   }
+  void cut(const string& sentence,
+        vector<string>& words) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<Unicode> uwords;
+    uwords.reserve(sentence.size());
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      cut(range.begin, range.end, uwords);
+    }
+    TransCode::encode(uwords, words);
+  }
-  using SegmentBase::cut;
   void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
     //use mix cut first
     vector<Unicode> mixRes;
@@ -3,7 +3,6 @@
 
 #include "limonp/Logger.hpp"
 #include "PreFilter.hpp"
-#include "ISegment.hpp"
 #include <cassert>
 
 
@@ -14,16 +13,17 @@ const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 65292u, 12290u};
 
 using namespace limonp;
 
-class SegmentBase: public ISegment {
+class SegmentBase {
  public:
   SegmentBase() {
     LoadSpecialSymbols();
   }
-  virtual ~SegmentBase() {
+  ~SegmentBase() {
   }
+  /*
  public:
-  virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
-  virtual bool cut(const string& sentence, vector<string>& words) const {
+  void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
+  bool cut(const string& sentence, vector<string>& words) const {
     PreFilter pre_filter(symbols_, sentence);
     PreFilter::Range range;
     vector<Unicode> uwords;
@@ -32,14 +32,12 @@ class SegmentBase: public ISegment {
       range = pre_filter.Next();
       cut(range.begin, range.end, uwords);
     }
-    words.resize(uwords.size());
-    for (size_t i = 0; i < uwords.size(); i++) {
-      TransCode::encode(uwords[i], words[i]);
-    }
+    TransCode::encode(uwords, words);
     return true;
   }
+  */
 
- private:
+ protected:
  void LoadSpecialSymbols() {
    size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
    for(size_t i = 0; i < size; i ++) {
@@ -47,7 +45,6 @@ class SegmentBase: public ISegment {
     }
     assert(symbols_.size());
   }
-
   unordered_set<Rune> symbols_;
 }; // class SegmentBase
 
@@ -17,9 +17,9 @@ TEST(MixSegmentTest, Test1) {
   const char* str2 = "B超 T恤";
   const char* res2[] = {"B超"," ", "T恤"};
   vector<string> words;
-  ASSERT_TRUE(segment.cut(str, words));
+  segment.cut(str, words);
   ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
-  ASSERT_TRUE(segment.cut(str2, words));
+  segment.cut(str2, words);
   ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
 }
 
@@ -27,7 +27,7 @@ TEST(MixSegmentTest, NoUserDict) {
   MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
   const char* str = "令狐冲是云计算方面的专家";
   vector<string> words;
-  ASSERT_TRUE(segment.cut(str, words));
+  segment.cut(str, words);
   string res;
   ASSERT_EQ("[\"令狐冲\", \"是\", \"云\", \"计算\", \"方面\", \"的\", \"专家\"]", res << words);
 
@@ -37,14 +37,14 @@ TEST(MixSegmentTest, UserDict) {
   {
     const char* str = "令狐冲是云计算方面的专家";
     vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
     string res;
     ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
   }
   {
     const char* str = "小明先就职于IBM,后在日本京都大学深造";
     vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
     string res;
     res << words;
     ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
@@ -52,7 +52,7 @@ TEST(MixSegmentTest, UserDict) {
   {
     const char* str = "IBM,3.14";
     vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
     string res;
     res << words;
     ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
@@ -63,14 +63,14 @@ TEST(MixSegmentTest, UserDict2) {
   {
     const char* str = "令狐冲是云计算方面的专家";
     vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
     string res;
     ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
   }
   {
     const char* str = "小明先就职于IBM,后在日本京都大学深造";
     vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
     string res;
     res << words;
     ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"I\", \"B\", \"M\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
@@ -78,7 +78,7 @@ TEST(MixSegmentTest, UserDict2) {
   {
     const char* str = "IBM,3.14";
     vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
     string res;
     res << words;
     ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
@@ -89,20 +89,20 @@ TEST(MPSegmentTest, Test1) {
   MPSegment segment("../dict/jieba.dict.utf8");;
   string s;
   vector<string> words;
-  ASSERT_TRUE(segment.cut("我来自北京邮电大学。", words));
+  segment.cut("我来自北京邮电大学。", words);
   ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", s << words);
 
-  ASSERT_TRUE(segment.cut("B超 T恤", words));
+  segment.cut("B超 T恤", words);
   ASSERT_EQ(s << words, "[\"B超\", \" \", \"T恤\"]");
 
-  ASSERT_TRUE(segment.cut("南京市长江大桥", words));
+  segment.cut("南京市长江大桥", words);
   ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
 
   // MaxWordLen
-  ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
+  segment.cut("南京市长江大桥", words, 3);
   ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
 
-  ASSERT_TRUE(segment.cut("南京市长江大桥", words, 0));
+  segment.cut("南京市长江大桥", words, 0);
   ASSERT_EQ("[\"南\", \"京\", \"市\", \"长\", \"江\", \"大\", \"桥\"]", s << words);
 }
 
@@ -142,7 +142,7 @@ TEST(HMMSegmentTest, Test1) {
   const char* str = "我来自北京邮电大学。。。学号123456";
   const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
   vector<string> words;
-  ASSERT_TRUE(segment.cut(str, words));
+  segment.cut(str, words);
   ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
 }
 
@@ -150,7 +150,7 @@ TEST(HMMSegmentTest, Test1) {
     const char* str = "IBM,1.2,123";
     const char* res[] = {"IBM", ",", "1.2", ",", "123"};
     vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
     ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
   }
 }
@@ -160,12 +160,12 @@ TEST(FullSegment, Test1) {
   vector<string> words;
   string s;
 
-  ASSERT_TRUE(segment.cut("我来自北京邮电大学", words));
+  segment.cut("我来自北京邮电大学", words);
   s << words;
   ASSERT_EQ(s, "[\"我\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]");
 
 
-  ASSERT_TRUE(segment.cut("上市公司CEO", words));
+  segment.cut("上市公司CEO", words);
   s << words;
   ASSERT_EQ(s, "[\"上市\", \"公司\", \"C\", \"E\", \"O\"]");
 }
@@ -175,7 +175,7 @@ TEST(QuerySegment, Test1) {
   const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
   vector<string> words;
 
-  ASSERT_TRUE(segment.cut(str, words));
+  segment.cut(str, words);
 
   string s1, s2;
   s1 << words;
@@ -191,7 +191,7 @@ TEST(QuerySegment, Test2) {
   const char* str = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
   vector<string> words;
 
-  ASSERT_TRUE(segment.cut(str, words));
+  segment.cut(str, words);
 
   string s1, s2;
   s1 << words;
@@ -203,7 +203,7 @@ TEST(QuerySegment, Test2) {
     const char* str = "小明硕士毕业于中国科学院计算所iPhone6";
     vector<string> words;
 
-    ASSERT_TRUE(segment.cut(str, words));
+    segment.cut(str, words);
 
     string s1, s2;
     s1 << words;
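Caller-side, the effect is visible in the updated KeywordExtractor, PosTagger, and tests above: the string-level cut no longer returns a bool, so the ASSERT_TRUE and error-handling guards around it are simply dropped. Below is a short usage sketch against the post-commit API; the dictionary paths are the ones the tests use and are an assumption about the working directory, not part of this commit.

// Usage sketch (not from the diff): driving MixSegment after ISegment is gone.
#include <iostream>
#include <string>
#include <vector>
#include "MixSegment.hpp"  // cppjieba header touched by this commit

using namespace CppJieba;

int main() {
  // Paths mirror TEST(MixSegmentTest, NoUserDict); adjust to your layout.
  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8",
                     "../dict/hmm_model.utf8");
  std::vector<std::string> words;
  segment.cut("我来自北京邮电大学。", words);  // no bool return to check anymore
  for (size_t i = 0; i < words.size(); i++) {
    std::cout << words[i] << "/";
  }
  std::cout << std::endl;
  return 0;
}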