mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
remove InitOnOff to make code lighter
This commit is contained in:
parent
5bfd3d0c49
commit
9571a4d0d5
@ -10,7 +10,6 @@
|
||||
#include <limits>
|
||||
#include "Limonp/StringUtil.hpp"
|
||||
#include "Limonp/Logger.hpp"
|
||||
#include "Limonp/InitOnOff.hpp"
|
||||
#include "TransCode.hpp"
|
||||
#include "Trie.hpp"
|
||||
|
||||
@ -41,7 +40,7 @@ namespace CppJieba
|
||||
|
||||
typedef map<size_t, const DictUnit*> DagType;
|
||||
|
||||
class DictTrie: public InitOnOff
|
||||
class DictTrie
|
||||
{
|
||||
public:
|
||||
typedef Trie<Unicode::value_type, DictUnit, Unicode, vector<Unicode>, vector<const DictUnit*> > TrieType;
|
||||
@ -65,12 +64,11 @@ namespace CppJieba
|
||||
{
|
||||
_trie = NULL;
|
||||
_minWeight = MAX_DOUBLE;
|
||||
_setInitFlag(false);
|
||||
}
|
||||
DictTrie(const string& dictPath, const string& userDictPath = "")
|
||||
{
|
||||
new (this) DictTrie();
|
||||
_setInitFlag(init(dictPath, userDictPath));
|
||||
init(dictPath, userDictPath);
|
||||
}
|
||||
~DictTrie()
|
||||
{
|
||||
@ -83,7 +81,7 @@ namespace CppJieba
|
||||
public:
|
||||
bool init(const string& dictPath, const string& userDictPath = "")
|
||||
{
|
||||
assert(!_getInitFlag());
|
||||
assert(!_trie);
|
||||
_loadDict(dictPath, _nodeInfos);
|
||||
_calculateWeight(_nodeInfos);
|
||||
_minWeight = _findMinWeight(_nodeInfos);
|
||||
@ -96,7 +94,7 @@ namespace CppJieba
|
||||
_shrink(_nodeInfos);
|
||||
_trie = _creatTrie(_nodeInfos);
|
||||
assert(_trie);
|
||||
return _setInitFlag(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
public:
|
||||
|
@ -35,20 +35,15 @@ namespace CppJieba
|
||||
vector<EmitProbMap* > _emitProbVec;
|
||||
|
||||
public:
|
||||
HMMSegment(){_setInitFlag(false);}
|
||||
HMMSegment(){}
|
||||
explicit HMMSegment(const string& filePath)
|
||||
{
|
||||
_setInitFlag(init(filePath));
|
||||
LIMONP_CHECK(init(filePath));
|
||||
}
|
||||
virtual ~HMMSegment(){}
|
||||
public:
|
||||
bool init(const string& filePath)
|
||||
{
|
||||
if(_getInitFlag())
|
||||
{
|
||||
LogError("inited already.");
|
||||
return false;
|
||||
}
|
||||
memset(_startProb, 0, sizeof(_startProb));
|
||||
memset(_transProb, 0, sizeof(_transProb));
|
||||
_statMap[0] = 'B';
|
||||
@ -59,11 +54,7 @@ namespace CppJieba
|
||||
_emitProbVec.push_back(&_emitProbE);
|
||||
_emitProbVec.push_back(&_emitProbM);
|
||||
_emitProbVec.push_back(&_emitProbS);
|
||||
if(!_setInitFlag(_loadModel(filePath.c_str())))
|
||||
{
|
||||
LogError("_loadModel(%s) failed.", filePath.c_str());
|
||||
return false;
|
||||
}
|
||||
LIMONP_CHECK(_loadModel(filePath.c_str()));
|
||||
LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
|
||||
return true;
|
||||
}
|
||||
@ -104,7 +95,6 @@ namespace CppJieba
|
||||
private:
|
||||
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
vector<size_t> status;
|
||||
if(!_viterbi(begin, end, status))
|
||||
{
|
||||
@ -128,7 +118,6 @@ namespace CppJieba
|
||||
public:
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
if(begin == end)
|
||||
{
|
||||
return false;
|
||||
|
@ -10,7 +10,7 @@ namespace CppJieba
|
||||
using namespace Limonp;
|
||||
|
||||
/*utf8*/
|
||||
class KeywordExtractor: public InitOnOff
|
||||
class KeywordExtractor
|
||||
{
|
||||
private:
|
||||
MixSegment _segment;
|
||||
@ -20,10 +20,10 @@ namespace CppJieba
|
||||
|
||||
unordered_set<string> _stopWords;
|
||||
public:
|
||||
KeywordExtractor(){_setInitFlag(false);};
|
||||
KeywordExtractor(){};
|
||||
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
|
||||
{
|
||||
_setInitFlag(init(dictPath, hmmFilePath, idfPath, stopWordPath));
|
||||
LIMONP_CHECK(init(dictPath, hmmFilePath, idfPath, stopWordPath));
|
||||
};
|
||||
~KeywordExtractor(){};
|
||||
|
||||
@ -32,13 +32,13 @@ namespace CppJieba
|
||||
{
|
||||
_loadIdfDict(idfPath);
|
||||
_loadStopWordDict(stopWordPath);
|
||||
return _setInitFlag(_segment.init(dictPath, hmmFilePath));
|
||||
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath));
|
||||
return true;
|
||||
};
|
||||
public:
|
||||
|
||||
bool extract(const string& str, vector<string>& keywords, size_t topN) const
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
vector<pair<string, double> > topWords;
|
||||
if(!extract(str, topWords, topN))
|
||||
{
|
||||
|
@ -4,7 +4,7 @@
|
||||
#include <stdio.h>
|
||||
|
||||
#define LIMONP_CHECK(exp) \
|
||||
if(exp){fprintf(stderr, "File:%s, Line:%d Exp:[" #exp "] is true, abort.\n", __FILE__, __LINE__); abort();}
|
||||
if(!(exp)){fprintf(stderr, "File:%s, Line:%d Exp:[" #exp "] is false, abort.\n", __FILE__, __LINE__); abort();} /* exp must hold: when it evaluates to false, report file/line plus the stringified expression on stderr and abort */
|
||||
|
||||
#define print(x) cout<< #x": " << x <<endl
|
||||
/*
|
||||
|
@ -34,24 +34,18 @@ namespace CppJieba
|
||||
DictTrie _dictTrie;
|
||||
|
||||
public:
|
||||
MPSegment(){_setInitFlag(false);};
|
||||
explicit MPSegment(const string& dictPath, const string& userDictPath = "")
|
||||
MPSegment(){};
|
||||
MPSegment(const string& dictPath, const string& userDictPath = "")
|
||||
{
|
||||
_setInitFlag(init(dictPath, userDictPath));
|
||||
LIMONP_CHECK(init(dictPath, userDictPath));
|
||||
};
|
||||
virtual ~MPSegment(){};
|
||||
public:
|
||||
bool init(const string& dictPath, const string& userDictPath = "")
|
||||
{
|
||||
if(_getInitFlag())
|
||||
{
|
||||
LogError("already inited before now.");
|
||||
return false;
|
||||
}
|
||||
_dictTrie.init(dictPath, userDictPath);
|
||||
assert(_dictTrie);
|
||||
LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
|
||||
LogInfo("MPSegment init(%s) ok", dictPath.c_str());
|
||||
return _setInitFlag(true);
|
||||
return true;
|
||||
}
|
||||
bool isUserDictSingleChineseWord(const Unicode::value_type & value) const
|
||||
{
|
||||
@ -61,7 +55,6 @@ namespace CppJieba
|
||||
using SegmentBase::cut;
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
if(begin == end)
|
||||
{
|
||||
return false;
|
||||
@ -92,7 +85,6 @@ namespace CppJieba
|
||||
{
|
||||
return false;
|
||||
}
|
||||
assert(_getInitFlag());
|
||||
vector<SegmentChar> segmentChars(end - begin);
|
||||
|
||||
//calc DAG
|
||||
|
@ -14,36 +14,25 @@ namespace CppJieba
|
||||
MPSegment _mpSeg;
|
||||
HMMSegment _hmmSeg;
|
||||
public:
|
||||
MixSegment(){_setInitFlag(false);};
|
||||
MixSegment(){};
|
||||
MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
|
||||
{
|
||||
_setInitFlag(init(mpSegDict, hmmSegDict, userDict));
|
||||
assert(_getInitFlag());
|
||||
LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict));
|
||||
}
|
||||
virtual ~MixSegment(){}
|
||||
public:
|
||||
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
|
||||
{
|
||||
assert(!_getInitFlag());
|
||||
if(!_mpSeg.init(mpSegDict, userDict))
|
||||
{
|
||||
LogError("_mpSeg init");
|
||||
return false;
|
||||
}
|
||||
if(!_hmmSeg.init(hmmSegDict))
|
||||
{
|
||||
LogError("_hmmSeg init");
|
||||
return false;
|
||||
}
|
||||
LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
|
||||
LIMONP_CHECK(_hmmSeg.init(hmmSegDict));
|
||||
LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
|
||||
return _setInitFlag(true);
|
||||
return true;
|
||||
}
|
||||
public:
|
||||
using SegmentBase::cut;
|
||||
public:
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
vector<Unicode> words;
|
||||
words.reserve(end - begin);
|
||||
if(!_mpSeg.cut(begin, end, words))
|
||||
@ -98,7 +87,6 @@ namespace CppJieba
|
||||
|
||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
if(begin == end)
|
||||
{
|
||||
return false;
|
||||
|
@ -9,32 +9,29 @@ namespace CppJieba
|
||||
{
|
||||
using namespace Limonp;
|
||||
|
||||
class PosTagger: public InitOnOff
|
||||
class PosTagger
|
||||
{
|
||||
private:
|
||||
MixSegment _segment;
|
||||
DictTrie _dictTrie;
|
||||
|
||||
public:
|
||||
PosTagger(){_setInitFlag(false);};
|
||||
PosTagger(){};
|
||||
PosTagger(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string& transProb)
|
||||
{
|
||||
_setInitFlag(init(dictPath, hmmFilePath, charStatus, startProb, emitProb, endProb, transProb));
|
||||
LIMONP_CHECK(init(dictPath, hmmFilePath, charStatus, startProb, emitProb, endProb, transProb));
|
||||
};
|
||||
~PosTagger(){};
|
||||
public:
|
||||
bool init(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string& transProb)
|
||||
{
|
||||
|
||||
assert(!_getInitFlag());
|
||||
_dictTrie.init(dictPath);
|
||||
assert(_dictTrie);
|
||||
return _setInitFlag(_segment.init(dictPath, hmmFilePath));
|
||||
LIMONP_CHECK(_dictTrie.init(dictPath));
|
||||
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath));
|
||||
return true;
|
||||
};
|
||||
|
||||
bool tag(const string& src, vector<pair<string, string> >& res)
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
vector<string> cutRes;
|
||||
if (!_segment.cut(src, cutRes))
|
||||
{
|
||||
|
@ -23,32 +23,20 @@ namespace CppJieba
|
||||
size_t _maxWordLen;
|
||||
|
||||
public:
|
||||
QuerySegment(){_setInitFlag(false);};
|
||||
QuerySegment(){};
|
||||
QuerySegment(const string& dict, const string& model, size_t maxWordLen)
|
||||
{
|
||||
_setInitFlag(init(dict, model, maxWordLen));
|
||||
init(dict, model, maxWordLen);
|
||||
};
|
||||
virtual ~QuerySegment(){};
|
||||
public:
|
||||
bool init(const string& dict, const string& model, size_t maxWordLen)
|
||||
{
|
||||
if (_getInitFlag())
|
||||
{
|
||||
LogError("inited already.");
|
||||
return false;
|
||||
}
|
||||
if (!_mixSeg.init(dict, model))
|
||||
{
|
||||
LogError("_mixSeg init");
|
||||
return false;
|
||||
}
|
||||
if (!_fullSeg.init(_mixSeg.getDictTrie()))
|
||||
{
|
||||
LogError("_fullSeg init");
|
||||
return false;
|
||||
}
|
||||
LIMONP_CHECK(_mixSeg.init(dict, model));
|
||||
LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie()));
|
||||
assert(maxWordLen);
|
||||
_maxWordLen = maxWordLen;
|
||||
return _setInitFlag(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
public:
|
||||
@ -57,7 +45,6 @@ namespace CppJieba
|
||||
public:
|
||||
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
if (begin >= end)
|
||||
{
|
||||
LogError("begin >= end");
|
||||
@ -102,7 +89,6 @@ namespace CppJieba
|
||||
|
||||
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
if (begin >= end)
|
||||
{
|
||||
LogError("begin >= end");
|
||||
|
@ -3,8 +3,8 @@
|
||||
|
||||
#include "TransCode.hpp"
|
||||
#include "Limonp/Logger.hpp"
|
||||
#include "Limonp/InitOnOff.hpp"
|
||||
#include "Limonp/NonCopyable.hpp"
|
||||
#include "Limonp/HandyMacro.hpp"
|
||||
#include "ISegment.hpp"
|
||||
#include <cassert>
|
||||
|
||||
@ -20,7 +20,7 @@ namespace CppJieba
|
||||
const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u};
|
||||
#endif
|
||||
|
||||
class SegmentBase: public ISegment, public InitOnOff, public NonCopyable
|
||||
class SegmentBase: public ISegment, public NonCopyable
|
||||
{
|
||||
public:
|
||||
SegmentBase(){_loadSpecialSymbols();};
|
||||
|
@ -11,7 +11,6 @@ using namespace CppJieba;
|
||||
void cut(size_t times = 20)
|
||||
{
|
||||
MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
|
||||
assert(seg);
|
||||
vector<string> res;
|
||||
string doc;
|
||||
ifstream ifs("../test/testdata/weicheng.utf8");
|
||||
@ -32,7 +31,6 @@ void cut(size_t times = 20)
|
||||
void extract(size_t times = 400)
|
||||
{
|
||||
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
||||
assert(extractor);
|
||||
vector<string> words;
|
||||
string doc;
|
||||
ifstream ifs("../test/testdata/review.100");
|
||||
|
@ -17,8 +17,6 @@ TEST(MixSegmentTest, Test1)
|
||||
const char* str2 = "B超 T恤";
|
||||
const char* res2[] = {"B超"," ", "T恤"};
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment);
|
||||
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||
ASSERT_TRUE(segment.cut(str2, words));
|
||||
@ -29,7 +27,6 @@ TEST(MixSegmentTest, Test1)
|
||||
TEST(MixSegmentTest, NoUserDict)
|
||||
{
|
||||
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
|
||||
ASSERT_TRUE(segment);
|
||||
const char* str = "令狐冲是云计算方面的专家";
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
@ -40,7 +37,6 @@ TEST(MixSegmentTest, NoUserDict)
|
||||
TEST(MixSegmentTest, UserDict)
|
||||
{
|
||||
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
|
||||
ASSERT_TRUE(segment);
|
||||
const char* str = "令狐冲是云计算方面的专家";
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
@ -55,7 +51,6 @@ TEST(MPSegmentTest, Test1)
|
||||
const char* str = "我来自北京邮电大学。";
|
||||
const char* res[] = {"我", "来自", "北京邮电大学", "。"};
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment);
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||
|
||||
@ -105,7 +100,6 @@ TEST(HMMSegmentTest, Test1)
|
||||
const char* str = "我来自北京邮电大学。。。学号123456";
|
||||
const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment);
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||
}
|
||||
|
@ -55,7 +55,6 @@ TEST(DictTrieTest, Test1)
|
||||
TEST(DictTrieTest, UserDict)
|
||||
{
|
||||
DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
|
||||
ASSERT_TRUE(trie);
|
||||
string word = "云计算";
|
||||
Unicode unicode;
|
||||
ASSERT_TRUE(TransCode::decode(word, unicode));
|
||||
|
Loading…
x
Reference in New Issue
Block a user