From f89cf005525434f8a2010b82e9fad8a811024ae8 Mon Sep 17 00:00:00 2001 From: wyy Date: Fri, 20 Dec 2013 08:57:10 -0800 Subject: [PATCH 1/2] init TfIdfKeyWord.hpp --- src/TfIdfKeyWord.hpp | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 src/TfIdfKeyWord.hpp diff --git a/src/TfIdfKeyWord.hpp b/src/TfIdfKeyWord.hpp new file mode 100644 index 0000000..c155af6 --- /dev/null +++ b/src/TfIdfKeyWord.hpp @@ -0,0 +1,32 @@ +#ifndef CPPJIEBA_TFIDF_H +#define CPPJIEBA_TFIDF_H + +#include "MPSegment.hpp" + +namespace CppJieba +{ + using namespace Limonp; + + class TfIdfKeyWord + { + private: + MPSegment _segment; + public: + TfIdfKeyWord(const char* dictFile): _segment(dictFile){}; + ~TfIdfKeyWord(){}; + public: + bool init(){return _segment.init();}; + bool dispose(){return _segment.dispose();}; + public: + bool extract(const string& str, vector& words, uint topN) + { + return _segment.cut(words); + return true; + } + + }; +} + +#endif + + From fa75f0f3193d7e136374f7feaf856753ade00cce Mon Sep 17 00:00:00 2001 From: wyy Date: Sat, 21 Dec 2013 09:37:12 -0800 Subject: [PATCH 2/2] modify construction and init for segments --- src/FullSegment.hpp | 205 ++++++++++++++++------------------ src/HMMSegment.hpp | 36 +++--- src/Husky/ServerFrame.hpp | 17 +-- src/ISegment.hpp | 3 - src/MPSegment.hpp | 25 ++--- src/MixSegment.hpp | 26 ++--- src/QuerySegment.hpp | 30 ++--- src/SegmentBase.hpp | 6 +- src/segment.cpp | 15 +-- src/server.cpp | 4 +- test/segment.cpp | 9 +- test/test_performance.cpp | 24 +--- test/unittest/THMMSegment.cpp | 4 +- test/unittest/TMPSegment.cpp | 4 +- test/unittest/TMixSegment.cpp | 36 +----- 15 files changed, 160 insertions(+), 284 deletions(-) diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index d4bfa31..b722b07 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -15,127 +15,118 @@ namespace CppJieba { class FullSegment: public SegmentBase { - private: - Trie* _trie; - const string _dictPath; + private: + Trie* _trie; - public: - FullSegment(const char* dictPath): _dictPath(dictPath){}; - virtual ~FullSegment(){dispose();}; - public: - bool init() - { - if(_getInitFlag()) + public: + FullSegment(){_setInitFlag(false);}; + explicit FullSegment(const string& dictPath){_setInitFlag(init(dictPath));} + virtual ~FullSegment(){}; + public: + bool init(const string& dictPath) { - LogError("already inited before now."); - return false; + if(_getInitFlag()) + { + LogError("already inited before now."); + return false; + } + _trie = TrieManager::getInstance().getTrie(dictPath.c_str()); + if (NULL == _trie) + { + LogError("get NULL pointor from getTrie(\"%s\")", dictPath.c_str()); + return false; + } + return _setInitFlag(true); } - _trie = TrieManager::getInstance().getTrie(_dictPath.c_str()); - if (NULL == _trie) - { - LogError("get NULL pointor from getTrie(\"%s\")", _dictPath.c_str()); - return false; - } - return _setInitFlag(true); - } - bool dispose() - { - if(!_getInitFlag()) + + public: + using SegmentBase::cut; + + public: + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + assert(_getInitFlag()); + if (begin >= end) + { + LogError("begin >= end"); + return false; + } + + //resut of searching in trie tree + vector > tRes; + + //max index of res's words + int maxIdx = 0; + + // always equals to (uItr - begin) + int uIdx = 0; + + //tmp variables + int wordLen = 0; + for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) + { + //find word start from uItr + if (_trie->find(uItr, end, tRes)) + { + for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) + { + wordLen = itr->second->word.size(); + if (wordLen >= 2 || tRes.size() == 1 && maxIdx <= uIdx) + { + res.push_back(itr->second->word); + } + maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; + } + tRes.clear(); + } + else // not found word start from uItr + { + if (maxIdx <= uIdx) // never exist in prev results + { + //put itr itself in res + res.push_back(Unicode(1, *uItr)); + + //mark it exits + ++maxIdx; + } + } + ++uIdx; + } + return true; } - _setInitFlag(false); - return true; - } - public: - using SegmentBase::cut; - - public: - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const - { - assert(_getInitFlag()); - if (begin >= end) + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { - LogError("begin >= end"); - return false; - } - - //resut of searching in trie tree - vector > tRes; - - //max index of res's words - int maxIdx = 0; - - // always equals to (uItr - begin) - int uIdx = 0; - - //tmp variables - int wordLen = 0; - for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) - { - //find word start from uItr - if (_trie->find(uItr, end, tRes)) + assert(_getInitFlag()); + if (begin >= end) { - for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) - { - wordLen = itr->second->word.size(); - if (wordLen >= 2 || tRes.size() == 1 && maxIdx <= uIdx) - { - res.push_back(itr->second->word); - } - maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; - } - tRes.clear(); + LogError("begin >= end"); + return false; } - else // not found word start from uItr - { - if (maxIdx <= uIdx) // never exist in prev results - { - //put itr itself in res - res.push_back(Unicode(1, *uItr)); - //mark it exits - ++maxIdx; + vector uRes; + if (!cut(begin, end, uRes)) + { + LogError("get unicode cut result error."); + return false; + } + + string tmp; + for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) + { + if (TransCode::encode(*uItr, tmp)) + { + res.push_back(tmp); + } + else + { + LogError("encode failed."); } } - ++uIdx; + + return true; } - - return true; - } - - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const - { - assert(_getInitFlag()); - if (begin >= end) - { - LogError("begin >= end"); - return false; - } - - vector uRes; - if (!cut(begin, end, uRes)) - { - LogError("get unicode cut result error."); - return false; - } - - string tmp; - for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) - { - if (TransCode::encode(*uItr, tmp)) - { - res.push_back(tmp); - } - else - { - LogError("encode failed."); - } - } - - return true; - } }; } diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index 1a3f774..3ef0097 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -33,12 +33,21 @@ namespace CppJieba EmitProbMap _emitProbM; EmitProbMap _emitProbS; vector _emitProbVec; - private: - const string _hmmModelPath; public: - HMMSegment(const char * const filePath): _hmmModelPath(filePath) + HMMSegment(){_setInitFlag(false);} + explicit HMMSegment(const string& filePath) { + _setInitFlag(init(filePath)); + } + virtual ~HMMSegment(){} + public: + bool init(const string& filePath) + { + if(_getInitFlag()) + { + return false; + } memset(_startProb, 0, sizeof(_startProb)); memset(_transProb, 0, sizeof(_transProb)); _statMap[0] = 'B'; @@ -49,20 +58,7 @@ namespace CppJieba _emitProbVec.push_back(&_emitProbE); _emitProbVec.push_back(&_emitProbM); _emitProbVec.push_back(&_emitProbS); - } - virtual ~HMMSegment() - { - dispose(); - } - public: - virtual bool init() - { - return _setInitFlag(_loadModel(_hmmModelPath.c_str())); - } - virtual bool dispose() - { - _setInitFlag(false); - return true; + return _setInitFlag(_loadModel(filePath.c_str())); } public: using SegmentBase::cut; @@ -96,11 +92,6 @@ namespace CppJieba public: virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { - //if(!_getInitFlag()) - //{ - // LogError("not inited."); - // return false; - //} assert(_getInitFlag()); if(begin == end) { @@ -121,7 +112,6 @@ namespace CppJieba } return true; } - //virtual bool cut(const string& str, vector& res)const; private: bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector& status)const diff --git a/src/Husky/ServerFrame.hpp b/src/Husky/ServerFrame.hpp index 748ed54..114c14c 100644 --- a/src/Husky/ServerFrame.hpp +++ b/src/Husky/ServerFrame.hpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -40,9 +41,6 @@ namespace Husky public: virtual ~IRequestHandler(){}; public: - virtual bool init() = 0; - virtual bool dispose() = 0; - virtual bool do_GET(const HttpReqInfo& httpReq, string& res) = 0; }; @@ -63,10 +61,11 @@ namespace Husky public: ServerFrame(unsigned nPort, unsigned nThreadCount, IRequestHandler* pHandler) { + m_bShutdown = false; m_nLsnPort = nPort; m_nThreadCount = nThreadCount; m_pHandler = pHandler; - m_bShutdown = false; + assert(pHandler); pthread_mutex_init(&m_pmAccept,NULL); }; virtual ~ServerFrame(){pthread_mutex_destroy(&m_pmAccept);}; @@ -80,11 +79,6 @@ namespace Husky } LogInfo("init ok {port:%d, threadNum:%d}", m_nLsnPort, m_nThreadCount); - if(!m_pHandler->init()) - { - LogFatal("m_pHandler init failed."); - return false; - } return true; } virtual bool dispose() @@ -96,7 +90,6 @@ namespace Husky return false; } - int sockfd; struct sockaddr_in dest; @@ -120,10 +113,6 @@ namespace Husky LogError("error [%s]", strerror(errno)); } close(sockfd); - if(!m_pHandler->dispose()) - { - LogFatal("m_pHandler dispose failed."); - } return true; } virtual bool run() diff --git a/src/ISegment.hpp b/src/ISegment.hpp index 8821289..5099fa0 100644 --- a/src/ISegment.hpp +++ b/src/ISegment.hpp @@ -8,9 +8,6 @@ namespace CppJieba { public: virtual ~ISegment(){}; - public: - virtual bool init() = 0; - virtual bool dispose() = 0; public: virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& res) const = 0; virtual bool cut(const string& str, vector& res) const = 0; diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 81cd45d..0b85137 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -34,37 +34,30 @@ namespace CppJieba { private: Trie* _trie; - private: - const string _dictPath; public: - MPSegment(const char * const dictPath): _dictPath(dictPath){}; - virtual ~MPSegment(){dispose();}; + MPSegment(){_setInitFlag(false);}; + explicit MPSegment(const string& dictPath) + { + _setInitFlag(init(dictPath)); + }; + virtual ~MPSegment(){}; public: - virtual bool init() + bool init(const string& dictPath) { if(_getInitFlag()) { LogError("already inited before now."); return false; } - _trie = TrieManager::getInstance().getTrie(_dictPath.c_str()); + _trie = TrieManager::getInstance().getTrie(dictPath.c_str()); if (_trie == NULL) { - LogError("get a NULL pointor form getTrie(\"%s\").", _dictPath.c_str()); + LogError("get a NULL pointor form getTrie(\"%s\").", dictPath.c_str()); return false; } return _setInitFlag(true); } - virtual bool dispose() - { - if(!_getInitFlag()) - { - return true; - } - _setInitFlag(false); - return true; - } public: using SegmentBase::cut; virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index 79c40d1..6e9abed 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -14,44 +14,32 @@ namespace CppJieba MPSegment _mpSeg; HMMSegment _hmmSeg; public: - MixSegment(const char * const mpSegDict, const char * const hmmSegDict): _mpSeg(mpSegDict), _hmmSeg(hmmSegDict) + MixSegment(){_setInitFlag(false);}; + explicit MixSegment(const string& mpSegDict, const string& hmmSegDict): _mpSeg(mpSegDict), _hmmSeg(hmmSegDict) { + _setInitFlag(_mpSeg && _hmmSeg); } - virtual ~MixSegment() - { - dispose(); - } + virtual ~MixSegment(){} public: - virtual bool init() + bool init(const string& mpSegDict, const string& hmmSegDict) { if(_getInitFlag()) { LogError("inited."); return false; } - if(!_mpSeg.init()) + if(!_mpSeg.init(mpSegDict)) { LogError("_mpSeg init"); return false; } - if(!_hmmSeg.init()) + if(!_hmmSeg.init(hmmSegDict)) { LogError("_hmmSeg init"); return false; } return _setInitFlag(true); } - virtual bool dispose() - { - if(!_getInitFlag()) - { - return true; - } - _mpSeg.dispose(); - _hmmSeg.dispose(); - _setInitFlag(false); - return true; - } public: using SegmentBase::cut; public: diff --git a/src/QuerySegment.hpp b/src/QuerySegment.hpp index a57f710..74f69f9 100644 --- a/src/QuerySegment.hpp +++ b/src/QuerySegment.hpp @@ -20,41 +20,35 @@ namespace CppJieba private: MixSegment _mixSeg; FullSegment _fullSeg; - int _maxWordLen; + size_t _maxWordLen; public: - QuerySegment(const char* dict, const char* model, int maxWordLen): _mixSeg(dict, model), _fullSeg(dict), _maxWordLen(maxWordLen){}; - virtual ~QuerySegment(){dispose();}; + QuerySegment(){_setInitFlag(false);}; + explicit QuerySegment(const string& dict, const string& model, size_t maxWordLen) + { + _setInitFlag(init(dict, model, maxWordLen)); + }; + virtual ~QuerySegment(){}; public: - bool init() + bool init(const string& dict, const string& model, size_t maxWordLen) { if (_getInitFlag()) { - LogError("inited."); + LogError("inited already."); + return false; } - if (!_mixSeg.init()) + if (!_mixSeg.init(dict, model)) { LogError("_mixSeg init"); return false; } - if (!_fullSeg.init()) + if (!_fullSeg.init(dict)) { LogError("_fullSeg init"); return false; } return _setInitFlag(true); } - bool dispose() - { - if(!_getInitFlag()) - { - return true; - } - _fullSeg.dispose(); - _mixSeg.dispose(); - _setInitFlag(false); - return true; - } public: using SegmentBase::cut; diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp index ee990ac..2dea84f 100644 --- a/src/SegmentBase.hpp +++ b/src/SegmentBase.hpp @@ -18,11 +18,9 @@ namespace CppJieba bool _isInited; bool _getInitFlag()const{return _isInited;}; bool _setInitFlag(bool flag){return _isInited = flag;}; + public: - virtual bool init() = 0; - virtual bool dispose() = 0; - - public: + operator bool(){return _getInitFlag();}; virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const = 0; virtual bool cut(const string& str, vector& res)const { diff --git a/src/segment.cpp b/src/segment.cpp index 37cad63..5dafe25 100644 --- a/src/segment.cpp +++ b/src/segment.cpp @@ -63,57 +63,52 @@ int main(int argc, char ** argv) if("cutHMM" == algorithm) { HMMSegment seg(modelPath.c_str()); - if(!seg.init()) + if(!seg) { cout<<"seg init failed."< buf(res, res + sizeof(res)/sizeof(res[0])); vector words; - ASSERT_EQ(segment.init(), true); - ASSERT_EQ(segment.cut(str, words), true); + ASSERT_TRUE(segment); + ASSERT_TRUE(segment.cut(str, words)); //print(words); EXPECT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); } diff --git a/test/unittest/TMPSegment.cpp b/test/unittest/TMPSegment.cpp index e8ee8a2..e812243 100644 --- a/test/unittest/TMPSegment.cpp +++ b/test/unittest/TMPSegment.cpp @@ -9,8 +9,8 @@ TEST(MPSegmentTest, Test1) const char* str = "我来自北京邮电大学。。。 学号 123456"; const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。"," ","学","号", " 123456"}; vector words; - ASSERT_EQ(segment.init(), true); - ASSERT_EQ(segment.cut(str, words), true); + ASSERT_TRUE(segment); + ASSERT_TRUE(segment.cut(str, words)); //print(words); EXPECT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); } diff --git a/test/unittest/TMixSegment.cpp b/test/unittest/TMixSegment.cpp index efeb333..5c24fc3 100644 --- a/test/unittest/TMixSegment.cpp +++ b/test/unittest/TMixSegment.cpp @@ -8,41 +8,9 @@ TEST(MixSegmentTest, Test1) MixSegment segment("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8");; const char* str = "我来自北京邮电大学。。。 学号 123456"; const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。"," ","学号", " 123456"}; - //string s; - //vector buf(res, res + sizeof(res)/sizeof(res[0])); vector words; - ASSERT_EQ(segment.init(), true); - ASSERT_EQ(segment.cut(str, words), true); + ASSERT_TRUE(segment); + ASSERT_TRUE(segment.cut(str, words)); EXPECT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); - //print(words); - - //for(uint i = 0; i < sizeof(res)/sizeof(res[0]); i++) - //{ - // buf.push_back() - //} - //buf.push_back(""); - //buf.push_back("你好"); - //buf.push_back("...hh"); - //vector res; - //uint size = strlen(str); - //uint offset = 0; - //while(offset < size) - //{ - // uint len; - // const char* t = str + offset; - // int ret = filterAscii(t, size - offset, len); - // s.assign(t, len); - // res.push_back(s); - // //cout<