merge head

This commit is contained in:
aholic 2013-12-17 01:19:04 +08:00
commit 9af21d9658
16 changed files with 191 additions and 287 deletions

View File

@ -17,36 +17,27 @@ namespace CppJieba
{ {
private: private:
Trie* _trie; Trie* _trie;
const string _dictPath;
public: public:
FullSegment(const char* dictPath): _dictPath(dictPath){}; FullSegment(){_setInitFlag(false);};
virtual ~FullSegment(){dispose();}; explicit FullSegment(const string& dictPath){_setInitFlag(init(dictPath));}
virtual ~FullSegment(){};
public: public:
bool init() bool init(const string& dictPath)
{ {
if(_getInitFlag()) if(_getInitFlag())
{ {
LogError("already inited before now."); LogError("already inited before now.");
return false; return false;
} }
_trie = TrieManager::getInstance().getTrie(_dictPath.c_str()); _trie = TrieManager::getInstance().getTrie(dictPath.c_str());
if (NULL == _trie) if (NULL == _trie)
{ {
LogError("get NULL pointor from getTrie(\"%s\")", _dictPath.c_str()); LogError("get NULL pointor from getTrie(\"%s\")", dictPath.c_str());
return false; return false;
} }
return _setInitFlag(true); return _setInitFlag(true);
} }
bool dispose()
{
if(!_getInitFlag())
{
return true;
}
_setInitFlag(false);
return true;
}
public: public:
using SegmentBase::cut; using SegmentBase::cut;

View File

@ -33,12 +33,21 @@ namespace CppJieba
EmitProbMap _emitProbM; EmitProbMap _emitProbM;
EmitProbMap _emitProbS; EmitProbMap _emitProbS;
vector<EmitProbMap* > _emitProbVec; vector<EmitProbMap* > _emitProbVec;
private:
const string _hmmModelPath;
public: public:
HMMSegment(const char * const filePath): _hmmModelPath(filePath) HMMSegment(){_setInitFlag(false);}
explicit HMMSegment(const string& filePath)
{ {
_setInitFlag(init(filePath));
}
virtual ~HMMSegment(){}
public:
bool init(const string& filePath)
{
if(_getInitFlag())
{
return false;
}
memset(_startProb, 0, sizeof(_startProb)); memset(_startProb, 0, sizeof(_startProb));
memset(_transProb, 0, sizeof(_transProb)); memset(_transProb, 0, sizeof(_transProb));
_statMap[0] = 'B'; _statMap[0] = 'B';
@ -49,20 +58,7 @@ namespace CppJieba
_emitProbVec.push_back(&_emitProbE); _emitProbVec.push_back(&_emitProbE);
_emitProbVec.push_back(&_emitProbM); _emitProbVec.push_back(&_emitProbM);
_emitProbVec.push_back(&_emitProbS); _emitProbVec.push_back(&_emitProbS);
} return _setInitFlag(_loadModel(filePath.c_str()));
virtual ~HMMSegment()
{
dispose();
}
public:
virtual bool init()
{
return _setInitFlag(_loadModel(_hmmModelPath.c_str()));
}
virtual bool dispose()
{
_setInitFlag(false);
return true;
} }
public: public:
using SegmentBase::cut; using SegmentBase::cut;
@ -96,11 +92,6 @@ namespace CppJieba
public: public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{ {
//if(!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
assert(_getInitFlag()); assert(_getInitFlag());
if(begin == end) if(begin == end)
{ {
@ -121,7 +112,6 @@ namespace CppJieba
} }
return true; return true;
} }
//virtual bool cut(const string& str, vector<string>& res)const;
private: private:
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<uint>& status)const bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<uint>& status)const

View File

@ -4,6 +4,7 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <cassert>
#include <sys/socket.h> #include <sys/socket.h>
#include <sys/types.h> #include <sys/types.h>
#include <arpa/inet.h> #include <arpa/inet.h>
@ -40,9 +41,6 @@ namespace Husky
public: public:
virtual ~IRequestHandler(){}; virtual ~IRequestHandler(){};
public: public:
virtual bool init() = 0;
virtual bool dispose() = 0;
virtual bool do_GET(const HttpReqInfo& httpReq, string& res) = 0; virtual bool do_GET(const HttpReqInfo& httpReq, string& res) = 0;
}; };
@ -63,10 +61,11 @@ namespace Husky
public: public:
ServerFrame(unsigned nPort, unsigned nThreadCount, IRequestHandler* pHandler) ServerFrame(unsigned nPort, unsigned nThreadCount, IRequestHandler* pHandler)
{ {
m_bShutdown = false;
m_nLsnPort = nPort; m_nLsnPort = nPort;
m_nThreadCount = nThreadCount; m_nThreadCount = nThreadCount;
m_pHandler = pHandler; m_pHandler = pHandler;
m_bShutdown = false; assert(pHandler);
pthread_mutex_init(&m_pmAccept,NULL); pthread_mutex_init(&m_pmAccept,NULL);
}; };
virtual ~ServerFrame(){pthread_mutex_destroy(&m_pmAccept);}; virtual ~ServerFrame(){pthread_mutex_destroy(&m_pmAccept);};
@ -80,11 +79,6 @@ namespace Husky
} }
LogInfo("init ok {port:%d, threadNum:%d}", m_nLsnPort, m_nThreadCount); LogInfo("init ok {port:%d, threadNum:%d}", m_nLsnPort, m_nThreadCount);
if(!m_pHandler->init())
{
LogFatal("m_pHandler init failed.");
return false;
}
return true; return true;
} }
virtual bool dispose() virtual bool dispose()
@ -96,7 +90,6 @@ namespace Husky
return false; return false;
} }
int sockfd; int sockfd;
struct sockaddr_in dest; struct sockaddr_in dest;
@ -120,10 +113,6 @@ namespace Husky
LogError("error [%s]", strerror(errno)); LogError("error [%s]", strerror(errno));
} }
close(sockfd); close(sockfd);
if(!m_pHandler->dispose())
{
LogFatal("m_pHandler dispose failed.");
}
return true; return true;
} }
virtual bool run() virtual bool run()

View File

@ -8,9 +8,6 @@ namespace CppJieba
{ {
public: public:
virtual ~ISegment(){}; virtual ~ISegment(){};
public:
virtual bool init() = 0;
virtual bool dispose() = 0;
public: public:
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0; virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res) const = 0; virtual bool cut(const string& str, vector<string>& res) const = 0;

View File

@ -34,37 +34,30 @@ namespace CppJieba
{ {
private: private:
Trie* _trie; Trie* _trie;
private:
const string _dictPath;
public: public:
MPSegment(const char * const dictPath): _dictPath(dictPath){}; MPSegment(){_setInitFlag(false);};
virtual ~MPSegment(){dispose();}; explicit MPSegment(const string& dictPath)
{
_setInitFlag(init(dictPath));
};
virtual ~MPSegment(){};
public: public:
virtual bool init() bool init(const string& dictPath)
{ {
if(_getInitFlag()) if(_getInitFlag())
{ {
LogError("already inited before now."); LogError("already inited before now.");
return false; return false;
} }
_trie = TrieManager::getInstance().getTrie(_dictPath.c_str()); _trie = TrieManager::getInstance().getTrie(dictPath.c_str());
if (_trie == NULL) if (_trie == NULL)
{ {
LogError("get a NULL pointor form getTrie(\"%s\").", _dictPath.c_str()); LogError("get a NULL pointor form getTrie(\"%s\").", dictPath.c_str());
return false; return false;
} }
return _setInitFlag(true); return _setInitFlag(true);
} }
virtual bool dispose()
{
if(!_getInitFlag())
{
return true;
}
_setInitFlag(false);
return true;
}
public: public:
using SegmentBase::cut; using SegmentBase::cut;
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const

View File

@ -14,44 +14,32 @@ namespace CppJieba
MPSegment _mpSeg; MPSegment _mpSeg;
HMMSegment _hmmSeg; HMMSegment _hmmSeg;
public: public:
MixSegment(const char * const mpSegDict, const char * const hmmSegDict): _mpSeg(mpSegDict), _hmmSeg(hmmSegDict) MixSegment(){_setInitFlag(false);};
explicit MixSegment(const string& mpSegDict, const string& hmmSegDict): _mpSeg(mpSegDict), _hmmSeg(hmmSegDict)
{ {
_setInitFlag(_mpSeg && _hmmSeg);
} }
virtual ~MixSegment() virtual ~MixSegment(){}
{
dispose();
}
public: public:
virtual bool init() bool init(const string& mpSegDict, const string& hmmSegDict)
{ {
if(_getInitFlag()) if(_getInitFlag())
{ {
LogError("inited."); LogError("inited.");
return false; return false;
} }
if(!_mpSeg.init()) if(!_mpSeg.init(mpSegDict))
{ {
LogError("_mpSeg init"); LogError("_mpSeg init");
return false; return false;
} }
if(!_hmmSeg.init()) if(!_hmmSeg.init(hmmSegDict))
{ {
LogError("_hmmSeg init"); LogError("_hmmSeg init");
return false; return false;
} }
return _setInitFlag(true); return _setInitFlag(true);
} }
virtual bool dispose()
{
if(!_getInitFlag())
{
return true;
}
_mpSeg.dispose();
_hmmSeg.dispose();
_setInitFlag(false);
return true;
}
public: public:
using SegmentBase::cut; using SegmentBase::cut;
public: public:

View File

@ -20,42 +20,35 @@ namespace CppJieba
private: private:
MixSegment _mixSeg; MixSegment _mixSeg;
FullSegment _fullSeg; FullSegment _fullSeg;
int _maxWordLen; size_t _maxWordLen;
public: public:
QuerySegment(const char* dict, const char* model, int maxWordLen): _mixSeg(dict, model), _fullSeg(dict), _maxWordLen(maxWordLen){}; QuerySegment(){_setInitFlag(false);};
virtual ~QuerySegment(){dispose();}; explicit QuerySegment(const string& dict, const string& model, size_t maxWordLen)
{
_setInitFlag(init(dict, model, maxWordLen));
};
virtual ~QuerySegment(){};
public: public:
bool init() bool init(const string& dict, const string& model, size_t maxWordLen)
{ {
if (_getInitFlag()) if (_getInitFlag())
{ {
LogError("inited."); LogError("inited already.");
return false; return false;
} }
if (!_mixSeg.init()) if (!_mixSeg.init(dict, model))
{ {
LogError("_mixSeg init"); LogError("_mixSeg init");
return false; return false;
} }
if (!_fullSeg.init()) if (!_fullSeg.init(dict))
{ {
LogError("_fullSeg init"); LogError("_fullSeg init");
return false; return false;
} }
return _setInitFlag(true); return _setInitFlag(true);
} }
bool dispose()
{
if(!_getInitFlag())
{
return true;
}
_fullSeg.dispose();
_mixSeg.dispose();
_setInitFlag(false);
return true;
}
public: public:
using SegmentBase::cut; using SegmentBase::cut;
@ -92,9 +85,6 @@ namespace CppJieba
res.push_back(*fullResItr); res.push_back(*fullResItr);
} }
} }
//clear fullRes
fullRes.clear();
} }
else // just use the mix result else // just use the mix result
{ {

View File

@ -18,11 +18,9 @@ namespace CppJieba
bool _isInited; bool _isInited;
bool _getInitFlag()const{return _isInited;}; bool _getInitFlag()const{return _isInited;};
bool _setInitFlag(bool flag){return _isInited = flag;}; bool _setInitFlag(bool flag){return _isInited = flag;};
public:
virtual bool init() = 0;
virtual bool dispose() = 0;
public: public:
operator bool(){return _getInitFlag();};
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0; virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
virtual bool cut(const string& str, vector<string>& res)const virtual bool cut(const string& str, vector<string>& res)const
{ {

32
src/TfIdfKeyWord.hpp Normal file
View File

@ -0,0 +1,32 @@
#ifndef CPPJIEBA_TFIDF_H
#define CPPJIEBA_TFIDF_H
#include "MPSegment.hpp"
namespace CppJieba
{
using namespace Limonp;
class TfIdfKeyWord
{
private:
MPSegment _segment;
public:
TfIdfKeyWord(const char* dictFile): _segment(dictFile){};
~TfIdfKeyWord(){};
public:
bool init(){return _segment.init();};
bool dispose(){return _segment.dispose();};
public:
bool extract(const string& str, vector<string>& words, uint topN)
{
return _segment.cut(words);
return true;
}
};
}
#endif

View File

@ -63,57 +63,52 @@ int main(int argc, char ** argv)
if("cutHMM" == algorithm) if("cutHMM" == algorithm)
{ {
HMMSegment seg(modelPath.c_str()); HMMSegment seg(modelPath.c_str());
if(!seg.init()) if(!seg)
{ {
cout<<"seg init failed."<<endl; cout<<"seg init failed."<<endl;
return EXIT_FAILURE; return EXIT_FAILURE;
} }
cut(&seg, arg[1].c_str()); cut(&seg, arg[1].c_str());
seg.dispose();
} }
else if("cutDAG" == algorithm) else if("cutDAG" == algorithm)
{ {
MPSegment seg(dictPath.c_str()); MPSegment seg(dictPath.c_str());
if(!seg.init()) if(!seg)
{ {
cout<<"seg init failed."<<endl; cout<<"seg init failed."<<endl;
return false; return false;
} }
cut(&seg, arg[1].c_str()); cut(&seg, arg[1].c_str());
seg.dispose();
} }
else if ("cutFull" == algorithm) else if ("cutFull" == algorithm)
{ {
FullSegment seg(dictPath.c_str()); FullSegment seg(dictPath.c_str());
if (!seg.init()) if (!seg)
{ {
cout << "seg init failed" << endl; cout << "seg init failed" << endl;
return false; return false;
} }
cut(&seg, arg[1].c_str()); cut(&seg, arg[1].c_str());
seg.dispose();
} }
else if ("cutQuery" == algorithm) else if ("cutQuery" == algorithm)
{ {
QuerySegment seg(dictPath.c_str(), modelPath.c_str(), maxLen); QuerySegment seg(dictPath.c_str(), modelPath.c_str(), maxLen);
if (!seg.init()) if (!seg)
{ {
cout << "seg init failed" << endl; cout << "seg init failed" << endl;
return false; return false;
} }
cut(&seg, arg[1].c_str()); cut(&seg, arg[1].c_str());
seg.dispose();
} }
else else
{ {
MixSegment seg(dictPath.c_str(), modelPath.c_str()); MixSegment seg(dictPath.c_str(), modelPath.c_str());
if(!seg.init()) if(!seg)
{ {
cout<<"seg init failed."<<endl; cout<<"seg init failed."<<endl;
return EXIT_FAILURE; return EXIT_FAILURE;
} }
cut(&seg, arg[1].c_str()); cut(&seg, arg[1].c_str());
seg.dispose();
} }
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }

View File

@ -15,10 +15,8 @@ using namespace CppJieba;
class ReqHandler: public IRequestHandler class ReqHandler: public IRequestHandler
{ {
public: public:
ReqHandler(const string& dictPath, const string& modelPath): _segment(dictPath.c_str(), modelPath.c_str()){}; ReqHandler(const string& dictPath, const string& modelPath): _segment(dictPath, modelPath){};
virtual ~ReqHandler(){}; virtual ~ReqHandler(){};
virtual bool init(){return _segment.init();};
virtual bool dispose(){return _segment.dispose();};
public: public:
virtual bool do_GET(const HttpReqInfo& httpReq, string& strSnd) virtual bool do_GET(const HttpReqInfo& httpReq, string& strSnd)
{ {

View File

@ -31,33 +31,30 @@ int main(int argc, char ** argv)
//demo //demo
{ {
HMMSegment seg(HMM_DICT_FILE); HMMSegment seg(HMM_DICT_FILE);
if(!seg.init()) if(!seg)
{ {
cout<<"seg init failed."<<endl; cout<<"seg init failed."<<endl;
return EXIT_FAILURE; return EXIT_FAILURE;
} }
cut(&seg, TEST_FILE); cut(&seg, TEST_FILE);
seg.dispose();
} }
{ {
MixSegment seg(JIEBA_DICT_FILE, HMM_DICT_FILE); MixSegment seg(JIEBA_DICT_FILE, HMM_DICT_FILE);
if(!seg.init()) if(!seg)
{ {
cout<<"seg init failed."<<endl; cout<<"seg init failed."<<endl;
return EXIT_FAILURE; return EXIT_FAILURE;
} }
cut(&seg, TEST_FILE); cut(&seg, TEST_FILE);
seg.dispose();
} }
{ {
MPSegment seg(JIEBA_DICT_FILE); MPSegment seg(JIEBA_DICT_FILE);
if(!seg.init()) if(!seg)
{ {
cout<<"seg init failed."<<endl; cout<<"seg init failed."<<endl;
return false; return false;
} }
cut(&seg, TEST_FILE); cut(&seg, TEST_FILE);
seg.dispose();
} }
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }

View File

@ -25,36 +25,14 @@ void cut(const ISegment * seg, const char * const filePath)
int main(int argc, char ** argv) int main(int argc, char ** argv)
{ {
//demo
//{
// HMMSegment seg;
// if(!seg.init("../dicts/hmm_model.utf8"))
// {
// cout<<"seg init failed."<<endl;
// return EXIT_FAILURE;
// }
// cut(&seg, "testlines.utf8");
// seg.dispose();
//}
//{
// MixSegment seg;
// if(!seg.init("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8"))
// {
// cout<<"seg init failed."<<endl;
// return EXIT_FAILURE;
// }
// cut(&seg, "testlines.utf8");
// seg.dispose();
//}
{ {
MixSegment seg("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8"); MixSegment seg("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8");
if(!seg.init()) if(!seg)
{ {
cout<<"seg init failed."<<endl; cout<<"seg init failed."<<endl;
return false; return false;
} }
cut(&seg, argv[1]); cut(&seg, argv[1]);
seg.dispose();
} }
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }

View File

@ -11,8 +11,8 @@ TEST(HMMSegmentTest, Test1)
//string s; //string s;
//vector<string> buf(res, res + sizeof(res)/sizeof(res[0])); //vector<string> buf(res, res + sizeof(res)/sizeof(res[0]));
vector<string> words; vector<string> words;
ASSERT_EQ(segment.init(), true); ASSERT_TRUE(segment);
ASSERT_EQ(segment.cut(str, words), true); ASSERT_TRUE(segment.cut(str, words));
//print(words); //print(words);
EXPECT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0]))); EXPECT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
} }

View File

@ -9,8 +9,8 @@ TEST(MPSegmentTest, Test1)
const char* str = "我来自北京邮电大学。。。 学号 123456"; const char* str = "我来自北京邮电大学。。。 学号 123456";
const char* res[] = {"", "来自", "北京邮电大学", "","",""," ","","", " 123456"}; const char* res[] = {"", "来自", "北京邮电大学", "","",""," ","","", " 123456"};
vector<string> words; vector<string> words;
ASSERT_EQ(segment.init(), true); ASSERT_TRUE(segment);
ASSERT_EQ(segment.cut(str, words), true); ASSERT_TRUE(segment.cut(str, words));
//print(words); //print(words);
EXPECT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0]))); EXPECT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
} }

View File

@ -8,41 +8,9 @@ TEST(MixSegmentTest, Test1)
MixSegment segment("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8");; MixSegment segment("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8");;
const char* str = "我来自北京邮电大学。。。 学号 123456"; const char* str = "我来自北京邮电大学。。。 学号 123456";
const char* res[] = {"", "来自", "北京邮电大学", "","",""," ","学号", " 123456"}; const char* res[] = {"", "来自", "北京邮电大学", "","",""," ","学号", " 123456"};
//string s;
//vector<string> buf(res, res + sizeof(res)/sizeof(res[0]));
vector<string> words; vector<string> words;
ASSERT_EQ(segment.init(), true); ASSERT_TRUE(segment);
ASSERT_EQ(segment.cut(str, words), true); ASSERT_TRUE(segment.cut(str, words));
EXPECT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0]))); EXPECT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
//print(words);
//for(uint i = 0; i < sizeof(res)/sizeof(res[0]); i++)
//{
// buf.push_back()
//}
//buf.push_back("");
//buf.push_back("你好");
//buf.push_back("...hh");
//vector<string> res;
//uint size = strlen(str);
//uint offset = 0;
//while(offset < size)
//{
// uint len;
// const char* t = str + offset;
// int ret = filterAscii(t, size - offset, len);
// s.assign(t, len);
// res.push_back(s);
// //cout<<s<<","<<ret<<","<<len<<endl;
// //cout<<str<<endl;
// offset += len;
//}
//EXPECT_EQ(res, buf);
} }
//int main(int argc, char** argv)
//{
// //ChineseFilter chFilter;
// return 0;
//}