merge head

This commit is contained in:
aholic 2013-12-17 01:19:04 +08:00
commit 9af21d9658
16 changed files with 191 additions and 287 deletions

View File

@ -17,36 +17,27 @@ namespace CppJieba
{
private:
Trie* _trie;
const string _dictPath;
public:
FullSegment(const char* dictPath): _dictPath(dictPath){};
virtual ~FullSegment(){dispose();};
FullSegment(){_setInitFlag(false);};
explicit FullSegment(const string& dictPath){_setInitFlag(init(dictPath));}
virtual ~FullSegment(){};
public:
bool init()
bool init(const string& dictPath)
{
if(_getInitFlag())
{
LogError("already inited before now.");
return false;
}
_trie = TrieManager::getInstance().getTrie(_dictPath.c_str());
_trie = TrieManager::getInstance().getTrie(dictPath.c_str());
if (NULL == _trie)
{
LogError("get NULL pointor from getTrie(\"%s\")", _dictPath.c_str());
LogError("get NULL pointor from getTrie(\"%s\")", dictPath.c_str());
return false;
}
return _setInitFlag(true);
}
bool dispose()
{
if(!_getInitFlag())
{
return true;
}
_setInitFlag(false);
return true;
}
public:
using SegmentBase::cut;

View File

@ -33,12 +33,21 @@ namespace CppJieba
EmitProbMap _emitProbM;
EmitProbMap _emitProbS;
vector<EmitProbMap* > _emitProbVec;
private:
const string _hmmModelPath;
public:
HMMSegment(const char * const filePath): _hmmModelPath(filePath)
HMMSegment(){_setInitFlag(false);}
explicit HMMSegment(const string& filePath)
{
_setInitFlag(init(filePath));
}
virtual ~HMMSegment(){}
public:
bool init(const string& filePath)
{
if(_getInitFlag())
{
return false;
}
memset(_startProb, 0, sizeof(_startProb));
memset(_transProb, 0, sizeof(_transProb));
_statMap[0] = 'B';
@ -49,20 +58,7 @@ namespace CppJieba
_emitProbVec.push_back(&_emitProbE);
_emitProbVec.push_back(&_emitProbM);
_emitProbVec.push_back(&_emitProbS);
}
virtual ~HMMSegment()
{
dispose();
}
public:
virtual bool init()
{
return _setInitFlag(_loadModel(_hmmModelPath.c_str()));
}
virtual bool dispose()
{
_setInitFlag(false);
return true;
return _setInitFlag(_loadModel(filePath.c_str()));
}
public:
using SegmentBase::cut;
@ -96,11 +92,6 @@ namespace CppJieba
public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
//if(!_getInitFlag())
//{
// LogError("not inited.");
// return false;
//}
assert(_getInitFlag());
if(begin == end)
{
@ -121,7 +112,6 @@ namespace CppJieba
}
return true;
}
//virtual bool cut(const string& str, vector<string>& res)const;
private:
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<uint>& status)const

View File

@ -4,6 +4,7 @@
#include <stdio.h>
#include <string.h>
#include <cassert>
#include <sys/socket.h>
#include <sys/types.h>
#include <arpa/inet.h>
@ -40,9 +41,6 @@ namespace Husky
public:
virtual ~IRequestHandler(){};
public:
virtual bool init() = 0;
virtual bool dispose() = 0;
virtual bool do_GET(const HttpReqInfo& httpReq, string& res) = 0;
};
@ -63,10 +61,11 @@ namespace Husky
public:
ServerFrame(unsigned nPort, unsigned nThreadCount, IRequestHandler* pHandler)
{
m_bShutdown = false;
m_nLsnPort = nPort;
m_nThreadCount = nThreadCount;
m_pHandler = pHandler;
m_bShutdown = false;
assert(pHandler);
pthread_mutex_init(&m_pmAccept,NULL);
};
virtual ~ServerFrame(){pthread_mutex_destroy(&m_pmAccept);};
@ -80,11 +79,6 @@ namespace Husky
}
LogInfo("init ok {port:%d, threadNum:%d}", m_nLsnPort, m_nThreadCount);
if(!m_pHandler->init())
{
LogFatal("m_pHandler init failed.");
return false;
}
return true;
}
virtual bool dispose()
@ -96,7 +90,6 @@ namespace Husky
return false;
}
int sockfd;
struct sockaddr_in dest;
@ -120,10 +113,6 @@ namespace Husky
LogError("error [%s]", strerror(errno));
}
close(sockfd);
if(!m_pHandler->dispose())
{
LogFatal("m_pHandler dispose failed.");
}
return true;
}
virtual bool run()

View File

@ -8,9 +8,6 @@ namespace CppJieba
{
public:
virtual ~ISegment(){};
public:
virtual bool init() = 0;
virtual bool dispose() = 0;
public:
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res) const = 0;

View File

@ -34,37 +34,30 @@ namespace CppJieba
{
private:
Trie* _trie;
private:
const string _dictPath;
public:
MPSegment(const char * const dictPath): _dictPath(dictPath){};
virtual ~MPSegment(){dispose();};
MPSegment(){_setInitFlag(false);};
explicit MPSegment(const string& dictPath)
{
_setInitFlag(init(dictPath));
};
virtual ~MPSegment(){};
public:
virtual bool init()
bool init(const string& dictPath)
{
if(_getInitFlag())
{
LogError("already inited before now.");
return false;
}
_trie = TrieManager::getInstance().getTrie(_dictPath.c_str());
_trie = TrieManager::getInstance().getTrie(dictPath.c_str());
if (_trie == NULL)
{
LogError("get a NULL pointor form getTrie(\"%s\").", _dictPath.c_str());
LogError("get a NULL pointor form getTrie(\"%s\").", dictPath.c_str());
return false;
}
return _setInitFlag(true);
}
virtual bool dispose()
{
if(!_getInitFlag())
{
return true;
}
_setInitFlag(false);
return true;
}
public:
using SegmentBase::cut;
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const

View File

@ -14,44 +14,32 @@ namespace CppJieba
MPSegment _mpSeg;
HMMSegment _hmmSeg;
public:
MixSegment(const char * const mpSegDict, const char * const hmmSegDict): _mpSeg(mpSegDict), _hmmSeg(hmmSegDict)
MixSegment(){_setInitFlag(false);};
explicit MixSegment(const string& mpSegDict, const string& hmmSegDict): _mpSeg(mpSegDict), _hmmSeg(hmmSegDict)
{
_setInitFlag(_mpSeg && _hmmSeg);
}
virtual ~MixSegment()
{
dispose();
}
virtual ~MixSegment(){}
public:
virtual bool init()
bool init(const string& mpSegDict, const string& hmmSegDict)
{
if(_getInitFlag())
{
LogError("inited.");
return false;
}
if(!_mpSeg.init())
if(!_mpSeg.init(mpSegDict))
{
LogError("_mpSeg init");
return false;
}
if(!_hmmSeg.init())
if(!_hmmSeg.init(hmmSegDict))
{
LogError("_hmmSeg init");
return false;
}
return _setInitFlag(true);
}
virtual bool dispose()
{
if(!_getInitFlag())
{
return true;
}
_mpSeg.dispose();
_hmmSeg.dispose();
_setInitFlag(false);
return true;
}
public:
using SegmentBase::cut;
public:

View File

@ -20,42 +20,35 @@ namespace CppJieba
private:
MixSegment _mixSeg;
FullSegment _fullSeg;
int _maxWordLen;
size_t _maxWordLen;
public:
QuerySegment(const char* dict, const char* model, int maxWordLen): _mixSeg(dict, model), _fullSeg(dict), _maxWordLen(maxWordLen){};
virtual ~QuerySegment(){dispose();};
QuerySegment(){_setInitFlag(false);};
explicit QuerySegment(const string& dict, const string& model, size_t maxWordLen)
{
_setInitFlag(init(dict, model, maxWordLen));
};
virtual ~QuerySegment(){};
public:
bool init()
bool init(const string& dict, const string& model, size_t maxWordLen)
{
if (_getInitFlag())
{
LogError("inited.");
LogError("inited already.");
return false;
}
if (!_mixSeg.init())
if (!_mixSeg.init(dict, model))
{
LogError("_mixSeg init");
return false;
}
if (!_fullSeg.init())
if (!_fullSeg.init(dict))
{
LogError("_fullSeg init");
return false;
}
return _setInitFlag(true);
}
bool dispose()
{
if(!_getInitFlag())
{
return true;
}
_fullSeg.dispose();
_mixSeg.dispose();
_setInitFlag(false);
return true;
}
public:
using SegmentBase::cut;
@ -92,9 +85,6 @@ namespace CppJieba
res.push_back(*fullResItr);
}
}
//clear fullRes
fullRes.clear();
}
else // just use the mix result
{

View File

@ -18,11 +18,9 @@ namespace CppJieba
bool _isInited;
bool _getInitFlag()const{return _isInited;};
bool _setInitFlag(bool flag){return _isInited = flag;};
public:
virtual bool init() = 0;
virtual bool dispose() = 0;
public:
operator bool(){return _getInitFlag();};
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const = 0;
virtual bool cut(const string& str, vector<string>& res)const
{

32
src/TfIdfKeyWord.hpp Normal file
View File

@ -0,0 +1,32 @@
#ifndef CPPJIEBA_TFIDF_H
#define CPPJIEBA_TFIDF_H
#include "MPSegment.hpp"
namespace CppJieba
{
using namespace Limonp;
class TfIdfKeyWord
{
private:
MPSegment _segment;
public:
TfIdfKeyWord(const char* dictFile): _segment(dictFile){};
~TfIdfKeyWord(){};
public:
bool init(){return _segment.init();};
bool dispose(){return _segment.dispose();};
public:
bool extract(const string& str, vector<string>& words, uint topN)
{
return _segment.cut(words);
return true;
}
};
}
#endif

View File

@ -63,57 +63,52 @@ int main(int argc, char ** argv)
if("cutHMM" == algorithm)
{
HMMSegment seg(modelPath.c_str());
if(!seg.init())
if(!seg)
{
cout<<"seg init failed."<<endl;
return EXIT_FAILURE;
}
cut(&seg, arg[1].c_str());
seg.dispose();
}
else if("cutDAG" == algorithm)
{
MPSegment seg(dictPath.c_str());
if(!seg.init())
if(!seg)
{
cout<<"seg init failed."<<endl;
return false;
}
cut(&seg, arg[1].c_str());
seg.dispose();
}
else if ("cutFull" == algorithm)
{
FullSegment seg(dictPath.c_str());
if (!seg.init())
if (!seg)
{
cout << "seg init failed" << endl;
return false;
}
cut(&seg, arg[1].c_str());
seg.dispose();
}
else if ("cutQuery" == algorithm)
{
QuerySegment seg(dictPath.c_str(), modelPath.c_str(), maxLen);
if (!seg.init())
if (!seg)
{
cout << "seg init failed" << endl;
return false;
}
cut(&seg, arg[1].c_str());
seg.dispose();
}
else
{
MixSegment seg(dictPath.c_str(), modelPath.c_str());
if(!seg.init())
if(!seg)
{
cout<<"seg init failed."<<endl;
return EXIT_FAILURE;
}
cut(&seg, arg[1].c_str());
seg.dispose();
}
return EXIT_SUCCESS;
}

View File

@ -15,10 +15,8 @@ using namespace CppJieba;
class ReqHandler: public IRequestHandler
{
public:
ReqHandler(const string& dictPath, const string& modelPath): _segment(dictPath.c_str(), modelPath.c_str()){};
ReqHandler(const string& dictPath, const string& modelPath): _segment(dictPath, modelPath){};
virtual ~ReqHandler(){};
virtual bool init(){return _segment.init();};
virtual bool dispose(){return _segment.dispose();};
public:
virtual bool do_GET(const HttpReqInfo& httpReq, string& strSnd)
{

View File

@ -31,33 +31,30 @@ int main(int argc, char ** argv)
//demo
{
HMMSegment seg(HMM_DICT_FILE);
if(!seg.init())
if(!seg)
{
cout<<"seg init failed."<<endl;
return EXIT_FAILURE;
}
cut(&seg, TEST_FILE);
seg.dispose();
}
{
MixSegment seg(JIEBA_DICT_FILE, HMM_DICT_FILE);
if(!seg.init())
if(!seg)
{
cout<<"seg init failed."<<endl;
return EXIT_FAILURE;
}
cut(&seg, TEST_FILE);
seg.dispose();
}
{
MPSegment seg(JIEBA_DICT_FILE);
if(!seg.init())
if(!seg)
{
cout<<"seg init failed."<<endl;
return false;
}
cut(&seg, TEST_FILE);
seg.dispose();
}
return EXIT_SUCCESS;
}

View File

@ -25,36 +25,14 @@ void cut(const ISegment * seg, const char * const filePath)
int main(int argc, char ** argv)
{
//demo
//{
// HMMSegment seg;
// if(!seg.init("../dicts/hmm_model.utf8"))
// {
// cout<<"seg init failed."<<endl;
// return EXIT_FAILURE;
// }
// cut(&seg, "testlines.utf8");
// seg.dispose();
//}
//{
// MixSegment seg;
// if(!seg.init("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8"))
// {
// cout<<"seg init failed."<<endl;
// return EXIT_FAILURE;
// }
// cut(&seg, "testlines.utf8");
// seg.dispose();
//}
{
MixSegment seg("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8");
if(!seg.init())
if(!seg)
{
cout<<"seg init failed."<<endl;
return false;
}
cut(&seg, argv[1]);
seg.dispose();
}
return EXIT_SUCCESS;
}

View File

@ -11,8 +11,8 @@ TEST(HMMSegmentTest, Test1)
//string s;
//vector<string> buf(res, res + sizeof(res)/sizeof(res[0]));
vector<string> words;
ASSERT_EQ(segment.init(), true);
ASSERT_EQ(segment.cut(str, words), true);
ASSERT_TRUE(segment);
ASSERT_TRUE(segment.cut(str, words));
//print(words);
EXPECT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}

View File

@ -9,8 +9,8 @@ TEST(MPSegmentTest, Test1)
const char* str = "我来自北京邮电大学。。。 学号 123456";
const char* res[] = {"", "来自", "北京邮电大学", "","",""," ","","", " 123456"};
vector<string> words;
ASSERT_EQ(segment.init(), true);
ASSERT_EQ(segment.cut(str, words), true);
ASSERT_TRUE(segment);
ASSERT_TRUE(segment.cut(str, words));
//print(words);
EXPECT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}

View File

@ -8,41 +8,9 @@ TEST(MixSegmentTest, Test1)
MixSegment segment("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8");;
const char* str = "我来自北京邮电大学。。。 学号 123456";
const char* res[] = {"", "来自", "北京邮电大学", "","",""," ","学号", " 123456"};
//string s;
//vector<string> buf(res, res + sizeof(res)/sizeof(res[0]));
vector<string> words;
ASSERT_EQ(segment.init(), true);
ASSERT_EQ(segment.cut(str, words), true);
ASSERT_TRUE(segment);
ASSERT_TRUE(segment.cut(str, words));
EXPECT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
//print(words);
//for(uint i = 0; i < sizeof(res)/sizeof(res[0]); i++)
//{
// buf.push_back()
//}
//buf.push_back("");
//buf.push_back("你好");
//buf.push_back("...hh");
//vector<string> res;
//uint size = strlen(str);
//uint offset = 0;
//while(offset < size)
//{
// uint len;
// const char* t = str + offset;
// int ret = filterAscii(t, size - offset, len);
// s.assign(t, len);
// res.push_back(s);
// //cout<<s<<","<<ret<<","<<len<<endl;
// //cout<<str<<endl;
// offset += len;
//}
//EXPECT_EQ(res, buf);
}
//int main(int argc, char** argv)
//{
// //ChineseFilter chFilter;
// return 0;
//}