Merge remote-tracking branch 'origin/hpp_ing' into dev

This commit is contained in:
wyy 2013-11-30 05:12:18 -08:00
commit bdb645ce69
21 changed files with 1190 additions and 1510 deletions

View File

@ -1,23 +1,15 @@
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
SET(LIBCPPJIEBA_SRC HMMSegment.cpp MixSegment.cpp MPSegment.cpp Trie.cpp)
ADD_LIBRARY(cppjieba STATIC ${LIBCPPJIEBA_SRC})
ADD_EXECUTABLE(cjsegment segment.cpp) ADD_EXECUTABLE(cjsegment segment.cpp)
ADD_EXECUTABLE(cjserver server.cpp) ADD_EXECUTABLE(cjserver server.cpp)
TARGET_LINK_LIBRARIES(cjserver husky pthread)
LINK_DIRECTORIES(Husky) LINK_DIRECTORIES(Husky)
TARGET_LINK_LIBRARIES(cjsegment cppjieba)
TARGET_LINK_LIBRARIES(cjserver cppjieba husky pthread)
SET_TARGET_PROPERTIES(cppjieba PROPERTIES VERSION 1.2 SOVERSION 1)
INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin) INSTALL(TARGETS cjsegment RUNTIME DESTINATION bin)
INSTALL(TARGETS cjserver RUNTIME DESTINATION bin) INSTALL(TARGETS cjserver RUNTIME DESTINATION bin)
INSTALL(TARGETS cppjieba ARCHIVE DESTINATION lib/CppJieba) INSTALL(FILES ChineseFilter.hpp HMMSegment.hpp MPSegment.hpp Trie.hpp ISegment.hpp MixSegment.hpp SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba)
INSTALL(FILES ChineseFilter.hpp HMMSegment.h MPSegment.h structs.h Trie.h globals.h ISegment.hpp MixSegment.h SegmentBase.hpp TransCode.hpp DESTINATION include/CppJieba)
ADD_SUBDIRECTORY(Husky) ADD_SUBDIRECTORY(Husky)
ADD_SUBDIRECTORY(Limonp) ADD_SUBDIRECTORY(Limonp)

View File

@ -1,14 +1,14 @@
#ifndef CPPJIEBA_CHINESEFILTER_H #ifndef CPPJIEBA_CHINESEFILTER_H
#define CPPJIEBA_CHINESEFILTER_H #define CPPJIEBA_CHINESEFILTER_H
#include "globals.h"
#include "TransCode.hpp" #include "TransCode.hpp"
namespace CppJieba namespace CppJieba
{ {
enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1, OTHERS = 2};
typedef Unicode::const_iterator UniConIter;
class ChineseFilter; class ChineseFilter;
class ChFilterIterator class ChFilterIterator
{ {
public: public:

View File

@ -1,341 +0,0 @@
#include "HMMSegment.h"
namespace CppJieba
{
HMMSegment::HMMSegment()
{
    // Start from all-zero start/transition tables; _loadModel() fills them in.
    memset(_transProb, 0, sizeof(_transProb));
    memset(_startProb, 0, sizeof(_startProb));

    // Printable letter for each state index.
    _statMap[B] = 'B';
    _statMap[E] = 'E';
    _statMap[M] = 'M';
    _statMap[S] = 'S';

    // _emitProbVec[y] is the emission table of state y, so the order must follow the enum.
    EmitProbMap* const tables[STATUS_SUM] = {&_emitProbB, &_emitProbE, &_emitProbM, &_emitProbS};
    for(uint y = 0; y < STATUS_SUM; y++)
    {
        _emitProbVec.push_back(tables[y]);
    }
}
HMMSegment::~HMMSegment()
{
    // Leave the object flagged as uninitialised when it goes away.
    dispose();
}
bool HMMSegment::init(const char* const modelPath)
{
    // The init flag is set to whatever the model load reported.
    const bool loaded = _loadModel(modelPath);
    return _setInitFlag(loaded);
}
bool HMMSegment::dispose()
{
    // Only the init flag is reset here; the call always reports success.
    _setInitFlag(false);
    return true;
}
bool HMMSegment::_loadModel(const char* const filePath)
{
LogInfo("loadModel [%s] start ...", filePath);
ifstream ifile(filePath);
string line;
vector<string> tmp;
vector<string> tmp2;
//load _startProb
if(!_getLine(ifile, line))
{
return false;
}
splitStr(line, tmp, " ");
if(tmp.size() != STATUS_SUM)
{
LogError("start_p illegal");
return false;
}
for(uint j = 0; j< tmp.size(); j++)
{
_startProb[j] = atof(tmp[j].c_str());
//cout<<_startProb[j]<<endl;
}
//load _transProb
for(uint i = 0; i < STATUS_SUM; i++)
{
if(!_getLine(ifile, line))
{
return false;
}
splitStr(line, tmp, " ");
if(tmp.size() != STATUS_SUM)
{
LogError("trans_p illegal");
return false;
}
for(uint j =0; j < STATUS_SUM; j++)
{
_transProb[i][j] = atof(tmp[j].c_str());
//cout<<_transProb[i][j]<<endl;
}
}
//load _emitProbB
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB))
{
return false;
}
//load _emitProbE
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE))
{
return false;
}
//load _emitProbM
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM))
{
return false;
}
//load _emitProbS
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS))
{
return false;
}
LogInfo("loadModel [%s] end.", filePath);
return true;
}
// Splits [begin, end) into words using the Viterbi state sequence and appends
// each word to res. Returns false when not initialised or when decoding fails.
bool HMMSegment::cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const
{
    if(!_getInitFlag())
    {
        LogError("not inited.");
        return false;
    }
    vector<uint> tags;
    if(!_viterbi(begin, end, tags))
    {
        LogError("_viterbi failed.");
        return false;
    }
    // E (1) and S (3) are the odd-valued states; either one closes a word.
    Unicode::const_iterator wordStart = begin;
    for(uint pos = 0; pos < tags.size(); ++pos)
    {
        if((tags[pos] & 1u) == 0)
        {
            continue;
        }
        const Unicode::const_iterator wordEnd = begin + pos + 1;
        res.push_back(Unicode(wordStart, wordEnd));
        wordStart = wordEnd;
    }
    return true;
}
bool HMMSegment::cut(const string& str, vector<string>& res)const
{
    // The string overload is implemented once, in SegmentBase.
    return SegmentBase::cut(str, res);
}
// Same segmentation as the vector<Unicode> overload, with every word encoded
// to a string before it is appended to res. An empty range is rejected.
bool HMMSegment::cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
{
    if(!_getInitFlag())
    {
        LogError("not inited.");
        return false;
    }
    if(begin == end)
    {
        return false;
    }
    vector<Unicode> pieces;
    if(!cut(begin, end, pieces))
    {
        return false;
    }
    string encoded;
    for(vector<Unicode>::iterator it = pieces.begin(); it != pieces.end(); ++it)
    {
        // Words that fail to encode are dropped without failing the call.
        if(!TransCode::encode(*it, encoded))
        {
            continue;
        }
        res.push_back(encoded);
    }
    return true;
}
// Viterbi decoding over the B/E/M/S states for the characters in [begin, end).
// On success status[x] holds the state chosen for character x and the last
// state is always E or S. Returns false for an empty range or when the
// working tables cannot be allocated.
bool HMMSegment::_viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<uint>& status)const
{
    if(begin == end)
    {
        return false;
    }
    const size_t Y = STATUS_SUM;
    const size_t X = end - begin;
    const size_t XYSize = X * Y;
    // Cell (x, y) of either table lives at index x + y * X.
    // Vectors own the storage, so nothing leaks if the second allocation throws.
    vector<int> path;
    vector<double> weight;
    try
    {
        path.resize(XYSize);
        weight.resize(XYSize);
    }
    catch(const std::bad_alloc&)
    {
        LogError("bad_alloc");
        return false;
    }
    //start
    for(size_t y = 0; y < Y; y++)
    {
        weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
        path[0 + y * X] = -1; // no predecessor for the first character
    }
    //process
    for(size_t x = 1; x < X; x++)
    {
        for(size_t y = 0; y < Y; y++)
        {
            const size_t now = x + y * X;
            // The emission term does not depend on the previous state: look it up once.
            const double emit = _getEmitProb(_emitProbVec[y], *(begin + x), MIN_DOUBLE);
            weight[now] = MIN_DOUBLE;
            path[now] = E; // fallback predecessor when no candidate beats MIN_DOUBLE
            for(size_t preY = 0; preY < Y; preY++)
            {
                const size_t old = x - 1 + preY * X;
                const double tmp = weight[old] + _transProb[preY][y] + emit;
                if(tmp > weight[now])
                {
                    weight[now] = tmp;
                    path[now] = static_cast<int>(preY);
                }
            }
        }
    }
    // A sentence can only finish in state E or S.
    const double endE = weight[X - 1 + E * X];
    const double endS = weight[X - 1 + S * X];
    size_t stat = (endE > endS) ? static_cast<size_t>(E) : static_cast<size_t>(S);
    status.assign(X, 0);
    // Walk the back-pointers from the last character to the first.
    for(size_t x = X; x-- > 0; )
    {
        status[x] = static_cast<uint>(stat);
        if(x > 0)
        {
            stat = static_cast<size_t>(path[x + stat * X]);
        }
    }
    return true;
}
// Advances ifile to the next line that, once trimmed, is neither empty nor a
// '#' comment, and leaves it in line. Returns false when the stream runs out.
bool HMMSegment::_getLine(ifstream& ifile, string& line)
{
    while(getline(ifile, line))
    {
        trim(line);
        const bool skip = line.empty() || strStartsWith(line, "#");
        if(!skip)
        {
            return true;
        }
    }
    return false;
}
bool HMMSegment::_loadEmitProb(const string& line, EmitProbMap& mp)
{
if(line.empty())
{
return false;
}
vector<string> tmp, tmp2;
uint16_t unico = 0;
splitStr(line, tmp, ",");
for(uint i = 0; i < tmp.size(); i++)
{
splitStr(tmp[i], tmp2, ":");
if(2 != tmp2.size())
{
LogError("_emitProb illegal.");
return false;
}
if(!_decodeOne(tmp2[0], unico))
{
LogError("TransCode failed.");
return false;
}
mp[unico] = atof(tmp2[1].c_str());
}
return true;
}
bool HMMSegment::_decodeOne(const string& str, uint16_t& res)
{
Unicode ui16;
if(!TransCode::decode(str, ui16) || ui16.size() != 1)
{
return false;
}
res = ui16[0];
return true;
}
double HMMSegment::_getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const
{
EmitProbMap::const_iterator cit = ptMp->find(key);
if(cit == ptMp->end())
{
return defVal;
}
return cit->second;
}
}
#ifdef HMMSEGMENT_UT
using namespace CppJieba;
// Returns the sum of a and b (the previous body multiplied them).
size_t add(size_t a, size_t b)
{
    return a + b;
}
int main()
{
TransCode::setUtf8Enc();
HMMSegment hmm;
hmm.loadModel("../dicts/hmm_model.utf8");
vector<string> res;
hmm.cut("小明硕士毕业于北邮网络研究院。。.", res);
cout<<joinStr(res, "/")<<endl;
return 0;
}
#endif

View File

@ -1,59 +0,0 @@
#ifndef CPPJIBEA_HMMSEGMENT_H
#define CPPJIBEA_HMMSEGMENT_H
#include <iostream>
#include <fstream>
#include <memory.h>
#include "Limonp/str_functs.hpp"
#include "Limonp/logger.hpp"
#include "globals.h"
#include "TransCode.hpp"
#include "ISegment.hpp"
#include "SegmentBase.hpp"
namespace CppJieba
{
using namespace Limonp;
// Hidden-Markov-model segmenter: tags every character with one of the
// B/E/M/S states and cuts the text after each E or S.
class HMMSegment: public SegmentBase
{
public:
/*
* STATUS:
* 0:B, 1:E, 2:M, 3:S
* */
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
private:
// Letter for each state index ('B', 'E', 'M', 'S').
char _statMap[STATUS_SUM];
// Start weight of each state, read from the model file.
double _startProb[STATUS_SUM];
// Transition weight, indexed [previous state][current state].
double _transProb[STATUS_SUM][STATUS_SUM];
// Per-state emission tables keyed by character.
EmitProbMap _emitProbB;
EmitProbMap _emitProbE;
EmitProbMap _emitProbM;
EmitProbMap _emitProbS;
// Pointers to the four tables above, indexed by state.
vector<EmitProbMap* > _emitProbVec;
public:
HMMSegment();
virtual ~HMMSegment();
public:
// Loads the model at modelPath and sets the init flag to the load result.
bool init(const char* const modelPath);
// Clears the init flag; always returns true.
bool dispose();
public:
// Segment [begin, end) into Unicode words appended to res.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const ;
// Forwards to SegmentBase::cut.
bool cut(const string& str, vector<string>& res)const;
// Segment [begin, end) and append the encoded words to res.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const;
//virtual bool cut(const string& str, vector<string>& res)const;
private:
// Fills status with the chosen state per character; false on an empty range.
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<uint>& status)const;
bool _loadModel(const char* const filePath);
// Reads the next non-empty line that does not start with '#'.
bool _getLine(ifstream& ifile, string& line);
// Parses a comma-separated list of "character:weight" entries into mp.
bool _loadEmitProb(const string& line, EmitProbMap& mp);
// Decodes str, succeeding only if it is exactly one code unit.
bool _decodeOne(const string& str, uint16_t& res);
// Returns the weight stored for key, or defVal when key is absent.
double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const ;
};
}
#endif

346
src/HMMSegment.hpp Normal file
View File

@ -0,0 +1,346 @@
#ifndef CPPJIBEA_HMMSEGMENT_H
#define CPPJIBEA_HMMSEGMENT_H
#include <iostream>
#include <fstream>
#include <memory.h>
#include "Limonp/str_functs.hpp"
#include "Limonp/logger.hpp"
#include "TransCode.hpp"
#include "ISegment.hpp"
#include "SegmentBase.hpp"
namespace CppJieba
{
using namespace Limonp;
typedef unordered_map<uint16_t, double> EmitProbMap;
class HMMSegment: public SegmentBase
{
public:
/*
* STATUS:
* 0:B, 1:E, 2:M, 3:S
* */
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
private:
char _statMap[STATUS_SUM];
double _startProb[STATUS_SUM];
double _transProb[STATUS_SUM][STATUS_SUM];
EmitProbMap _emitProbB;
EmitProbMap _emitProbE;
EmitProbMap _emitProbM;
EmitProbMap _emitProbS;
vector<EmitProbMap* > _emitProbVec;
public:
HMMSegment()
{
memset(_startProb, 0, sizeof(_startProb));
memset(_transProb, 0, sizeof(_transProb));
_statMap[0] = 'B';
_statMap[1] = 'E';
_statMap[2] = 'M';
_statMap[3] = 'S';
_emitProbVec.push_back(&_emitProbB);
_emitProbVec.push_back(&_emitProbE);
_emitProbVec.push_back(&_emitProbM);
_emitProbVec.push_back(&_emitProbS);
}
virtual ~HMMSegment()
{
    // Leave the object flagged as uninitialised when it goes away.
    dispose();
}
public:
bool init(const char* const modelPath)
{
    // The init flag is set to whatever the model load reported.
    const bool loaded = _loadModel(modelPath);
    return _setInitFlag(loaded);
}
bool dispose()
{
    // Only the init flag is reset here; the call always reports success.
    _setInitFlag(false);
    return true;
}
public:
// Splits [begin, end) into words using the Viterbi state sequence and appends
// each word to res. Returns false when not initialised or when decoding fails.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const
{
    if(!_getInitFlag())
    {
        LogError("not inited.");
        return false;
    }
    vector<uint> tags;
    if(!_viterbi(begin, end, tags))
    {
        LogError("_viterbi failed.");
        return false;
    }
    // E (1) and S (3) are the odd-valued states; either one closes a word.
    Unicode::const_iterator wordStart = begin;
    for(uint pos = 0; pos < tags.size(); ++pos)
    {
        if((tags[pos] & 1u) == 0)
        {
            continue;
        }
        const Unicode::const_iterator wordEnd = begin + pos + 1;
        res.push_back(Unicode(wordStart, wordEnd));
        wordStart = wordEnd;
    }
    return true;
}
bool cut(const string& str, vector<string>& res)const
{
    // The string overload is implemented once, in SegmentBase.
    return SegmentBase::cut(str, res);
}
// Same segmentation as the vector<Unicode> overload, with every word encoded
// to a string before it is appended to res. An empty range is rejected.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
    if(!_getInitFlag())
    {
        LogError("not inited.");
        return false;
    }
    if(begin == end)
    {
        return false;
    }
    vector<Unicode> pieces;
    if(!cut(begin, end, pieces))
    {
        return false;
    }
    string encoded;
    for(vector<Unicode>::iterator it = pieces.begin(); it != pieces.end(); ++it)
    {
        // Words that fail to encode are dropped without failing the call.
        if(!TransCode::encode(*it, encoded))
        {
            continue;
        }
        res.push_back(encoded);
    }
    return true;
}
//virtual bool cut(const string& str, vector<string>& res)const;
private:
// Viterbi decoding over the B/E/M/S states for the characters in [begin, end).
// On success status[x] holds the state chosen for character x and the last
// state is always E or S. Returns false for an empty range or when the
// working tables cannot be allocated.
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<uint>& status)const
{
    if(begin == end)
    {
        return false;
    }
    const size_t Y = STATUS_SUM;
    const size_t X = end - begin;
    const size_t XYSize = X * Y;
    // Cell (x, y) of either table lives at index x + y * X.
    // Vectors own the storage, so nothing leaks if the second allocation throws.
    vector<int> path;
    vector<double> weight;
    try
    {
        path.resize(XYSize);
        weight.resize(XYSize);
    }
    catch(const std::bad_alloc&)
    {
        LogError("bad_alloc");
        return false;
    }
    //start
    for(size_t y = 0; y < Y; y++)
    {
        weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
        path[0 + y * X] = -1; // no predecessor for the first character
    }
    //process
    for(size_t x = 1; x < X; x++)
    {
        for(size_t y = 0; y < Y; y++)
        {
            const size_t now = x + y * X;
            // The emission term does not depend on the previous state: look it up once.
            const double emit = _getEmitProb(_emitProbVec[y], *(begin + x), MIN_DOUBLE);
            weight[now] = MIN_DOUBLE;
            path[now] = E; // fallback predecessor when no candidate beats MIN_DOUBLE
            for(size_t preY = 0; preY < Y; preY++)
            {
                const size_t old = x - 1 + preY * X;
                const double tmp = weight[old] + _transProb[preY][y] + emit;
                if(tmp > weight[now])
                {
                    weight[now] = tmp;
                    path[now] = static_cast<int>(preY);
                }
            }
        }
    }
    // A sentence can only finish in state E or S.
    const double endE = weight[X - 1 + E * X];
    const double endS = weight[X - 1 + S * X];
    size_t stat = (endE > endS) ? static_cast<size_t>(E) : static_cast<size_t>(S);
    status.assign(X, 0);
    // Walk the back-pointers from the last character to the first.
    for(size_t x = X; x-- > 0; )
    {
        status[x] = static_cast<uint>(stat);
        if(x > 0)
        {
            stat = static_cast<size_t>(path[x + stat * X]);
        }
    }
    return true;
}
bool _loadModel(const char* const filePath)
{
LogInfo("loadModel [%s] start ...", filePath);
ifstream ifile(filePath);
string line;
vector<string> tmp;
vector<string> tmp2;
//load _startProb
if(!_getLine(ifile, line))
{
return false;
}
splitStr(line, tmp, " ");
if(tmp.size() != STATUS_SUM)
{
LogError("start_p illegal");
return false;
}
for(uint j = 0; j< tmp.size(); j++)
{
_startProb[j] = atof(tmp[j].c_str());
//cout<<_startProb[j]<<endl;
}
//load _transProb
for(uint i = 0; i < STATUS_SUM; i++)
{
if(!_getLine(ifile, line))
{
return false;
}
splitStr(line, tmp, " ");
if(tmp.size() != STATUS_SUM)
{
LogError("trans_p illegal");
return false;
}
for(uint j =0; j < STATUS_SUM; j++)
{
_transProb[i][j] = atof(tmp[j].c_str());
//cout<<_transProb[i][j]<<endl;
}
}
//load _emitProbB
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB))
{
return false;
}
//load _emitProbE
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE))
{
return false;
}
//load _emitProbM
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM))
{
return false;
}
//load _emitProbS
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS))
{
return false;
}
LogInfo("loadModel [%s] end.", filePath);
return true;
}
// Advances ifile to the next line that, once trimmed, is neither empty nor a
// '#' comment, and leaves it in line. Returns false when the stream runs out.
bool _getLine(ifstream& ifile, string& line)
{
    while(getline(ifile, line))
    {
        trim(line);
        const bool skip = line.empty() || strStartsWith(line, "#");
        if(!skip)
        {
            return true;
        }
    }
    return false;
}
bool _loadEmitProb(const string& line, EmitProbMap& mp)
{
if(line.empty())
{
return false;
}
vector<string> tmp, tmp2;
uint16_t unico = 0;
splitStr(line, tmp, ",");
for(uint i = 0; i < tmp.size(); i++)
{
splitStr(tmp[i], tmp2, ":");
if(2 != tmp2.size())
{
LogError("_emitProb illegal.");
return false;
}
if(!_decodeOne(tmp2[0], unico))
{
LogError("TransCode failed.");
return false;
}
mp[unico] = atof(tmp2[1].c_str());
}
return true;
}
bool _decodeOne(const string& str, uint16_t& res)
{
Unicode ui16;
if(!TransCode::decode(str, ui16) || ui16.size() != 1)
{
return false;
}
res = ui16[0];
return true;
}
double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const
{
EmitProbMap::const_iterator cit = ptMp->find(key);
if(cit == ptMp->end())
{
return defVal;
}
return cit->second;
}
};
}
#endif

View File

@ -1,7 +1,6 @@
#ifndef CPPJIEBA_SEGMENTINTERFACE_H #ifndef CPPJIEBA_SEGMENTINTERFACE_H
#define CPPJIEBA_SEGMENTINTERFACE_H #define CPPJIEBA_SEGMENTINTERFACE_H
#include "globals.h"
namespace CppJieba namespace CppJieba
{ {

View File

@ -1,265 +0,0 @@
/************************************
* file enc : ASCII
* author : wuyanyi09@gmail.com
************************************/
#include "MPSegment.h"
namespace CppJieba
{
// Initialises the trie and loads the dictionary at filePath into it.
// Fails if this segmenter is already initialised or if either step fails.
bool MPSegment::init(const char* const filePath)
{
    if(_getInitFlag())
    {
        LogError("already inited before now.");
        return false;
    }
    if(!_trie.init())
    {
        LogError("_trie.init failed.");
        return false;
    }
    LogInfo("_trie.loadDict(%s) start...", filePath);
    const bool dictLoaded = _trie.loadDict(filePath);
    if(!dictLoaded)
    {
        LogError("_trie.loadDict faield.");
        return false;
    }
    LogInfo("_trie.loadDict end.");
    return _setInitFlag(true);
}
// Releases the trie if this segmenter was initialised; always returns true.
bool MPSegment::dispose()
{
    if(_getInitFlag())
    {
        _trie.dispose();
        _setInitFlag(false);
    }
    return true;
}
bool MPSegment::cut(const string& str, vector<string>& res)const
{
    // The string overload is implemented once, in SegmentBase.
    return SegmentBase::cut(str, res);
}
// Segments [begin, end) and appends each word, encoded as a string, to res.
// A word that fails to encode is logged and left out.
bool MPSegment::cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
    if(!_getInitFlag())
    {
        LogError("not inited.");
        return false;
    }
    vector<TrieNodeInfo> words;
    if(!cut(begin, end, words))
    {
        return false;
    }
    string encoded;
    for(vector<TrieNodeInfo>::iterator it = words.begin(); it != words.end(); ++it)
    {
        if(!TransCode::encode(it->word, encoded))
        {
            LogError("encode failed.");
            continue;
        }
        res.push_back(encoded);
    }
    return true;
}
// Segments [begin, end) in three passes over a per-character context:
// build the word DAG, score it, then read off the chosen words into
// segWordInfos. Any failing pass is logged and aborts the call.
bool MPSegment::cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<TrieNodeInfo>& segWordInfos)const
{
    if(!_getInitFlag())
    {
        LogError("not inited.");
        return false;
    }
    SegmentContext context;
    for(Unicode::const_iterator ch = begin; ch != end; ++ch)
    {
        context.push_back(SegmentChar(*ch));
    }
    const bool dagOk = _calcDAG(context);
    if(!dagOk)
    {
        LogError("_calcDAG failed.");
        return false;
    }
    const bool dpOk = _calcDP(context);
    if(!dpOk)
    {
        LogError("_calcDP failed.");
        return false;
    }
    const bool cutOk = _cut(context, segWordInfos);
    if(!cutOk)
    {
        LogError("_cut failed.");
        return false;
    }
    return true;
}
bool MPSegment::cut(const string& str, vector<TrieNodeInfo>& segWordInfos)const
{
if(!_getInitFlag())
{
LogError("not inited.");
return false;
}
if(str.empty())
{
return false;
}
Unicode sentence;
if(!TransCode::decode(str, sentence))
{
LogError("TransCode::decode failed.");
return false;
}
return cut(sentence.begin(), sentence.end(), segWordInfos);
}
// For every start position, records in that character's dag each dictionary
// word beginning there, keyed by the index the trie offset maps to. Every
// position also gets an entry for itself (NULL when the trie has none), so a
// single character is always a candidate.
bool MPSegment::_calcDAG(SegmentContext& segContext)const
{
    if(segContext.empty())
    {
        LogError("segContext empty.");
        return false;
    }
    Unicode suffix;
    for(uint start = 0; start < segContext.size(); start++)
    {
        // The trie is queried with the text from this position to the end.
        suffix.clear();
        for(uint k = start; k < segContext.size(); k++)
        {
            suffix.push_back(segContext[k].uniCh);
        }
        vector<pair<uint, const TrieNodeInfo*> > matches;
        if(_trie.find(suffix, matches))
        {
            for(uint m = 0; m < matches.size(); m++)
            {
                // Offsets from the trie are relative to the suffix start.
                const uint nextp = matches[m].first + start;
                segContext[start].dag[nextp] = matches[m].second;
            }
        }
        DagType& dag = segContext[start].dag;
        if(dag.find(start) == dag.end())
        {
            dag[start] = NULL;
        }
    }
    return true;
}
// Right-to-left dynamic programme over the DAG: for each position keep the
// candidate whose own log-frequency plus the best weight of the position
// after it is largest. NULL candidates score the trie's minimum log-frequency.
bool MPSegment::_calcDP(SegmentContext& segContext)const
{
    if(segContext.empty())
    {
        LogError("segContext empty");
        return false;
    }
    for(size_t i = segContext.size(); i-- > 0; )
    {
        SegmentChar& cur = segContext[i];
        cur.pInfo = NULL;
        cur.weight = MIN_DOUBLE;
        for(DagType::const_iterator edge = cur.dag.begin(); edge != cur.dag.end(); edge++)
        {
            const uint nextPos = edge->first;
            const TrieNodeInfo* p = edge->second;
            double val = 0.0;
            // Nothing follows a candidate that ends on the last character.
            if(nextPos + 1 < segContext.size())
            {
                val += segContext[nextPos + 1].weight;
            }
            val += p ? p->logFreq : _trie.getMinLogFreq();
            if(val > cur.weight)
            {
                cur.pInfo = p;
                cur.weight = val;
            }
        }
    }
    return true;
}
// Walks the scored context from the front, appending the word chosen at each
// position to res and skipping over its characters. A position with no chosen
// word yields a one-character entry with freq 0 and the minimum log-frequency.
bool MPSegment::_cut(SegmentContext& segContext, vector<TrieNodeInfo>& res)const
{
    for(uint i = 0; i < segContext.size(); )
    {
        const TrieNodeInfo* p = segContext[i].pInfo;
        if(NULL == p)
        {
            //single chinese word
            TrieNodeInfo single;
            single.word.push_back(segContext[i].uniCh);
            single.freq = 0;
            single.logFreq = _trie.getMinLogFreq();
            res.push_back(single);
            i++;
            continue;
        }
        res.push_back(*p);
        i += p->word.size();
    }
    return true;
}
}
#ifdef SEGMENT_UT
using namespace CppJieba;
// Manual test driver: segments every line of the file "badcase" and waits for
// a key press between lines.
int main()
{
    MPSegment segment;
    // init() takes the dictionary path and loads it; MPSegment has no
    // zero-argument init() and no _loadSegDict() member.
    if(!segment.init("../dicts/segdict.gbk.v3.0"))
    {
        cerr<<"1"<<endl;
        return 1;
    }
    //segment.init("dicts/jieba.dict.utf8");
    //ifstream ifile("testtitle.gbk");
    ifstream ifile("badcase");
    vector<string> res;
    string line;
    while(getline(ifile, line))
    {
        res.clear();
        segment.cut(line, res);
        PRINT_VECTOR(res);
        getchar();
    }
    segment.dispose();
    return 0;
}
#endif

View File

@ -1,49 +0,0 @@
/************************************
* file enc : ASCII
* author : wuyanyi09@gmail.com
************************************/
#ifndef CPPJIEBA_MPSEGMENT_H
#define CPPJIEBA_MPSEGMENT_H
#include <algorithm>
#include <set>
#include "Limonp/logger.hpp"
#include "Trie.h"
#include "globals.h"
#include "ISegment.hpp"
#include "SegmentBase.hpp"
namespace CppJieba
{
typedef vector<SegmentChar> SegmentContext;
// Dictionary-based segmenter: builds a DAG of dictionary words over the input
// and picks the highest-scoring path.
class MPSegment: public SegmentBase
{
private:
// Dictionary trie used for word lookup and log-frequencies.
Trie _trie;
public:
MPSegment(){};
virtual ~MPSegment(){dispose();};
public:
// Initialises the trie and loads the dictionary at filePath.
bool init(const char* const filePath);
// Releases the trie if initialised; always returns true.
bool dispose();
public:
//bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos)const;
// Forwards to SegmentBase::cut.
bool cut(const string& str, vector<string>& res)const;
// Segment [begin, end) and append the encoded words to res.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const;
// Decode str, then segment it into dictionary entries.
bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos)const;
// Segment [begin, end) into dictionary entries.
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<TrieNodeInfo>& segWordInfos)const;
//virtual bool cut(const string& str, vector<string>& res)const;
private:
// Fill each character's dag with the dictionary words starting there.
bool _calcDAG(SegmentContext& segContext)const;
// Choose the best-scoring word per position, working right to left.
bool _calcDP(SegmentContext& segContext)const;
// Read the chosen words off the scored context into res.
bool _cut(SegmentContext& segContext, vector<TrieNodeInfo>& res)const;
};
}
#endif

264
src/MPSegment.hpp Normal file
View File

@ -0,0 +1,264 @@
/************************************
* file enc : ASCII
* author : wuyanyi09@gmail.com
************************************/
#ifndef CPPJIEBA_MPSEGMENT_H
#define CPPJIEBA_MPSEGMENT_H
#include <algorithm>
#include <set>
#include "Limonp/logger.hpp"
#include "Trie.hpp"
#include "ISegment.hpp"
#include "SegmentBase.hpp"
namespace CppJieba
{
// Per-character working state used by MPSegment while segmenting.
struct SegmentChar
{
    uint16_t uniCh;             // the character itself
    DagType dag;                // candidate words starting here, keyed by position (filled by _calcDAG)
    const TrieNodeInfo * pInfo; // word chosen for this position, NULL for a lone character
    double weight;              // best score found from this position onwards
    SegmentChar(uint16_t uni)
        : uniCh(uni), pInfo(NULL), weight(0.0)
    {
    }
};
typedef vector<SegmentChar> SegmentContext;
class MPSegment: public SegmentBase
{
private:
Trie _trie;
public:
MPSegment(){};
virtual ~MPSegment(){dispose();};
public:
// Initialises the trie and loads the dictionary at filePath into it.
// Fails if this segmenter is already initialised or if either step fails.
bool init(const char* const filePath)
{
    if(_getInitFlag())
    {
        LogError("already inited before now.");
        return false;
    }
    if(!_trie.init())
    {
        LogError("_trie.init failed.");
        return false;
    }
    LogInfo("_trie.loadDict(%s) start...", filePath);
    const bool dictLoaded = _trie.loadDict(filePath);
    if(!dictLoaded)
    {
        LogError("_trie.loadDict faield.");
        return false;
    }
    LogInfo("_trie.loadDict end.");
    return _setInitFlag(true);
}
// Releases the trie if this segmenter was initialised; always returns true.
bool dispose()
{
    if(_getInitFlag())
    {
        _trie.dispose();
        _setInitFlag(false);
    }
    return true;
}
public:
//bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos)const;
bool cut(const string& str, vector<string>& res)const
{
    // The string overload is implemented once, in SegmentBase.
    return SegmentBase::cut(str, res);
}
// Segments [begin, end) and appends each word, encoded as a string, to res.
// A word that fails to encode is logged and left out.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
    if(!_getInitFlag())
    {
        LogError("not inited.");
        return false;
    }
    vector<TrieNodeInfo> words;
    if(!cut(begin, end, words))
    {
        return false;
    }
    string encoded;
    for(vector<TrieNodeInfo>::iterator it = words.begin(); it != words.end(); ++it)
    {
        if(!TransCode::encode(it->word, encoded))
        {
            LogError("encode failed.");
            continue;
        }
        res.push_back(encoded);
    }
    return true;
}
bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos)const
{
if(!_getInitFlag())
{
LogError("not inited.");
return false;
}
if(str.empty())
{
return false;
}
Unicode sentence;
if(!TransCode::decode(str, sentence))
{
LogError("TransCode::decode failed.");
return false;
}
return cut(sentence.begin(), sentence.end(), segWordInfos);
}
// Segments [begin, end) in three passes over a per-character context:
// build the word DAG, score it, then read off the chosen words into
// segWordInfos. Any failing pass is logged and aborts the call.
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<TrieNodeInfo>& segWordInfos)const
{
    if(!_getInitFlag())
    {
        LogError("not inited.");
        return false;
    }
    SegmentContext context;
    for(Unicode::const_iterator ch = begin; ch != end; ++ch)
    {
        context.push_back(SegmentChar(*ch));
    }
    const bool dagOk = _calcDAG(context);
    if(!dagOk)
    {
        LogError("_calcDAG failed.");
        return false;
    }
    const bool dpOk = _calcDP(context);
    if(!dpOk)
    {
        LogError("_calcDP failed.");
        return false;
    }
    const bool cutOk = _cut(context, segWordInfos);
    if(!cutOk)
    {
        LogError("_cut failed.");
        return false;
    }
    return true;
}
//virtual bool cut(const string& str, vector<string>& res)const;
private:
// For every start position, records in that character's dag each dictionary
// word beginning there, keyed by the index the trie offset maps to. Every
// position also gets an entry for itself (NULL when the trie has none), so a
// single character is always a candidate.
bool _calcDAG(SegmentContext& segContext)const
{
    if(segContext.empty())
    {
        LogError("segContext empty.");
        return false;
    }
    Unicode suffix;
    for(uint start = 0; start < segContext.size(); start++)
    {
        // The trie is queried with the text from this position to the end.
        suffix.clear();
        for(uint k = start; k < segContext.size(); k++)
        {
            suffix.push_back(segContext[k].uniCh);
        }
        vector<pair<uint, const TrieNodeInfo*> > matches;
        if(_trie.find(suffix, matches))
        {
            for(uint m = 0; m < matches.size(); m++)
            {
                // Offsets from the trie are relative to the suffix start.
                const uint nextp = matches[m].first + start;
                segContext[start].dag[nextp] = matches[m].second;
            }
        }
        DagType& dag = segContext[start].dag;
        if(dag.find(start) == dag.end())
        {
            dag[start] = NULL;
        }
    }
    return true;
}
// Right-to-left dynamic programme over the DAG: for each position keep the
// candidate whose own log-frequency plus the best weight of the position
// after it is largest. NULL candidates score the trie's minimum log-frequency.
bool _calcDP(SegmentContext& segContext)const
{
    if(segContext.empty())
    {
        LogError("segContext empty");
        return false;
    }
    for(size_t i = segContext.size(); i-- > 0; )
    {
        SegmentChar& cur = segContext[i];
        cur.pInfo = NULL;
        cur.weight = MIN_DOUBLE;
        for(DagType::const_iterator edge = cur.dag.begin(); edge != cur.dag.end(); edge++)
        {
            const uint nextPos = edge->first;
            const TrieNodeInfo* p = edge->second;
            double val = 0.0;
            // Nothing follows a candidate that ends on the last character.
            if(nextPos + 1 < segContext.size())
            {
                val += segContext[nextPos + 1].weight;
            }
            val += p ? p->logFreq : _trie.getMinLogFreq();
            if(val > cur.weight)
            {
                cur.pInfo = p;
                cur.weight = val;
            }
        }
    }
    return true;
}
// Walks the scored context from the front, appending the word chosen at each
// position to res and skipping over its characters. A position with no chosen
// word yields a one-character entry with freq 0 and the minimum log-frequency.
bool _cut(SegmentContext& segContext, vector<TrieNodeInfo>& res)const
{
    for(uint i = 0; i < segContext.size(); )
    {
        const TrieNodeInfo* p = segContext[i].pInfo;
        if(NULL == p)
        {
            //single chinese word
            TrieNodeInfo single;
            single.word.push_back(segContext[i].uniCh);
            single.freq = 0;
            single.logFreq = _trie.getMinLogFreq();
            res.push_back(single);
            i++;
            continue;
        }
        res.push_back(*p);
        i += p->word.size();
    }
    return true;
}
};
}
#endif

View File

@ -1,125 +0,0 @@
#include "MixSegment.h"
namespace CppJieba
{
MixSegment::MixSegment()
{
}
MixSegment::~MixSegment()
{
    // Release both underlying segmenters when this object goes away.
    dispose();
}
// Initialises the dictionary segmenter from mpSegDict and the HMM segmenter
// from hmmSegDict, in that order. Fails if already initialised or if either
// one fails to initialise.
bool MixSegment::init(const char* const mpSegDict, const char* const hmmSegDict)
{
    if(_getInitFlag())
    {
        LogError("inited.");
        return false;
    }
    const bool mpReady = _mpSeg.init(mpSegDict);
    if(!mpReady)
    {
        LogError("_mpSeg init");
        return false;
    }
    const bool hmmReady = _hmmSeg.init(hmmSegDict);
    if(!hmmReady)
    {
        LogError("_hmmSeg init");
        return false;
    }
    return _setInitFlag(true);
}
// Disposes both underlying segmenters if initialised; always returns true.
bool MixSegment::dispose()
{
    if(_getInitFlag())
    {
        _mpSeg.dispose();
        _hmmSeg.dispose();
        _setInitFlag(false);
    }
    return true;
}
bool MixSegment::cut(const string& str, vector<string>& res)const
{
    // The string overload is implemented once, in SegmentBase.
    return SegmentBase::cut(str, res);
}
// Two-stage segmentation of [begin, end): the dictionary segmenter runs first,
// then every maximal run of consecutive one-character results is re-segmented
// by the HMM segmenter. Words are appended to res in text order.
// Returns false when not initialised, on an empty range, or when either
// segmenter fails. A word that cannot be encoded is logged and skipped, as in
// MPSegment::cut, instead of pushing whatever the output buffer held.
bool MixSegment::cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
    if(!_getInitFlag())
    {
        LogError("not inited.");
        return false;
    }
    if(begin == end)
    {
        return false;
    }
    vector<TrieNodeInfo> infos;
    if(!_mpSeg.cut(begin, end, infos))
    {
        LogError("mpSeg cutDAG failed.");
        return false;
    }
    Unicode unico;          // pending run of one-character words
    vector<Unicode> hmmRes;
    string tmp;
    // The extra pass with i == infos.size() flushes a run that reaches the end
    // of the input, so the flush logic exists only once.
    for(uint i = 0; i <= infos.size(); i++)
    {
        const bool atEnd = (i == infos.size());
        if(!atEnd && 1 == infos[i].word.size())
        {
            unico.push_back(infos[i].word[0]);
            continue;
        }
        if(!unico.empty())
        {
            hmmRes.clear();
            if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes))
            {
                LogError("_hmmSeg cut failed.");
                return false;
            }
            for(uint j = 0; j < hmmRes.size(); j++)
            {
                if(TransCode::encode(hmmRes[j], tmp))
                {
                    res.push_back(tmp);
                }
                else
                {
                    LogError("encode failed.");
                }
            }
            unico.clear();
        }
        if(!atEnd)
        {
            // Multi-character dictionary word: emitted as-is.
            if(TransCode::encode(infos[i].word, tmp))
            {
                res.push_back(tmp);
            }
            else
            {
                LogError("encode failed.");
            }
        }
    }
    return true;
}
}
#ifdef MIXSEGMENT_UT
using namespace CppJieba;
int main()
{
return 0;
}
#endif

View File

@ -1,28 +0,0 @@
#ifndef CPPJIEBA_MIXSEGMENT_H
#define CPPJIEBA_MIXSEGMENT_H
#include "MPSegment.h"
#include "HMMSegment.h"
#include "Limonp/str_functs.hpp"
namespace CppJieba
{
// Segmenter that runs the dictionary segmenter first and passes runs of
// single-character results to the HMM segmenter.
class MixSegment: public SegmentBase
{
private:
// Dictionary-based first pass.
MPSegment _mpSeg;
// HMM-based second pass for runs of single characters.
HMMSegment _hmmSeg;
public:
MixSegment();
virtual ~MixSegment();
public:
// Initialises both segmenters from their respective files.
bool init(const char* const _mpSegDict, const char* const _hmmSegDict);
// Disposes both segmenters if initialised; always returns true.
bool dispose();
public:
//virtual bool cut(const string& str, vector<string>& res) const;
// Forwards to SegmentBase::cut.
bool cut(const string& str, vector<string>& res)const;
// Segment [begin, end) and append the encoded words to res.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const;
};
}
#endif

128
src/MixSegment.hpp Normal file
View File

@ -0,0 +1,128 @@
#ifndef CPPJIEBA_MIXSEGMENT_H
#define CPPJIEBA_MIXSEGMENT_H
#include "MPSegment.hpp"
#include "HMMSegment.hpp"
#include "Limonp/str_functs.hpp"
namespace CppJieba
{
class MixSegment: public SegmentBase
{
private:
MPSegment _mpSeg;
HMMSegment _hmmSeg;
public:
MixSegment()
{
}
virtual ~MixSegment()
{
    // Release both underlying segmenters when this object goes away.
    dispose();
}
public:
// Initialises the dictionary segmenter from mpSegDict and the HMM segmenter
// from hmmSegDict, in that order. Fails if already initialised or if either
// one fails to initialise.
bool init(const char* const mpSegDict, const char* const hmmSegDict)
{
    if(_getInitFlag())
    {
        LogError("inited.");
        return false;
    }
    const bool mpReady = _mpSeg.init(mpSegDict);
    if(!mpReady)
    {
        LogError("_mpSeg init");
        return false;
    }
    const bool hmmReady = _hmmSeg.init(hmmSegDict);
    if(!hmmReady)
    {
        LogError("_hmmSeg init");
        return false;
    }
    return _setInitFlag(true);
}
// Disposes both underlying segmenters if initialised; always returns true.
bool dispose()
{
    if(_getInitFlag())
    {
        _mpSeg.dispose();
        _hmmSeg.dispose();
        _setInitFlag(false);
    }
    return true;
}
public:
//virtual bool cut(const string& str, vector<string>& res) const;
bool cut(const string& str, vector<string>& res)const
{
    // The string overload is implemented once, in SegmentBase.
    return SegmentBase::cut(str, res);
}
// Two-stage segmentation of [begin, end): the dictionary segmenter runs first,
// then every maximal run of consecutive one-character results is re-segmented
// by the HMM segmenter. Words are appended to res in text order.
// Returns false when not initialised, on an empty range, or when either
// segmenter fails. A word that cannot be encoded is logged and skipped, as in
// MPSegment::cut, instead of pushing whatever the output buffer held.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
    if(!_getInitFlag())
    {
        LogError("not inited.");
        return false;
    }
    if(begin == end)
    {
        return false;
    }
    vector<TrieNodeInfo> infos;
    if(!_mpSeg.cut(begin, end, infos))
    {
        LogError("mpSeg cutDAG failed.");
        return false;
    }
    Unicode unico;          // pending run of one-character words
    vector<Unicode> hmmRes;
    string tmp;
    // The extra pass with i == infos.size() flushes a run that reaches the end
    // of the input, so the flush logic exists only once.
    for(uint i = 0; i <= infos.size(); i++)
    {
        const bool atEnd = (i == infos.size());
        if(!atEnd && 1 == infos[i].word.size())
        {
            unico.push_back(infos[i].word[0]);
            continue;
        }
        if(!unico.empty())
        {
            hmmRes.clear();
            if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes))
            {
                LogError("_hmmSeg cut failed.");
                return false;
            }
            for(uint j = 0; j < hmmRes.size(); j++)
            {
                if(TransCode::encode(hmmRes[j], tmp))
                {
                    res.push_back(tmp);
                }
                else
                {
                    LogError("encode failed.");
                }
            }
            unico.clear();
        }
        if(!atEnd)
        {
            // Multi-character dictionary word: emitted as-is.
            if(TransCode::encode(infos[i].word, tmp))
            {
                res.push_back(tmp);
            }
            else
            {
                LogError("encode failed.");
            }
        }
    }
    return true;
}
};
}
#endif

View File

@ -1,7 +1,6 @@
#ifndef CPPJIEBA_SEGMENTBASE_H #ifndef CPPJIEBA_SEGMENTBASE_H
#define CPPJIEBA_SEGMENTBASE_H #define CPPJIEBA_SEGMENTBASE_H
#include "globals.h"
#include "ISegment.hpp" #include "ISegment.hpp"
#include "ChineseFilter.hpp" #include "ChineseFilter.hpp"
#include "Limonp/str_functs.hpp" #include "Limonp/str_functs.hpp"

View File

@ -6,13 +6,13 @@
#define CPPJIEBA_TRANSCODE_H #define CPPJIEBA_TRANSCODE_H
#include "globals.h"
#include "Limonp/str_functs.hpp" #include "Limonp/str_functs.hpp"
namespace CppJieba namespace CppJieba
{ {
using namespace Limonp; using namespace Limonp;
typedef std::vector<uint16_t> Unicode;
namespace TransCode namespace TransCode
{ {
inline bool decode(const string& str, vector<uint16_t>& vec) inline bool decode(const string& str, vector<uint16_t>& vec)

View File

@ -1,390 +0,0 @@
/************************************
* file enc : ASCII
* author : wuyanyi09@gmail.com
************************************/
#include "Trie.h"
namespace CppJieba
{
Trie::Trie()
{
    // A fresh trie is uninitialised and has no root; init() allocates it.
    _initFlag = false;
    _root = NULL;
    _freqSum = 0;
    _minLogFreq = MAX_DOUBLE;
}
Trie::~Trie()
{
    // Free the node tree if the trie was ever initialised.
    dispose();
}
// Allocates the root node and marks the trie initialised. Returns false if it
// was already initialised or the root cannot be allocated.
bool Trie::init()
{
    if(_getInitFlag())
    {
        LogError("already initted!");
        return false;
    }
    try
    {
        _root = new TrieNode;
    }
    catch(const bad_alloc& e)
    {
        return false;
    }
    if(NULL == _root)
    {
        return false;
    }
    _setInitFlag(true);
    return true;
}
// Loads the dictionary file at filePath: inserts every entry, then computes
// the weights. Requires a prior successful init().
bool Trie::loadDict(const char * const filePath)
{
    if(!_getInitFlag())
    {
        LogError("not initted.");
        return false;
    }
    if(!checkFileExist(filePath))
    {
        LogError("cann't find fiel[%s].",filePath);
        return false;
    }
    if(!_trieInsert(filePath))
    {
        LogError("_trieInsert failed.");
        return false;
    }
    if(!_countWeight())
    {
        LogError("_countWeight failed.");
        return false;
    }
    return true;
}
// Reads the dictionary file line by line and inserts one entry per line.
// Each line is split on single spaces into "word freq" or "word freq tag".
// Returns false when the file cannot be opened, a line has the wrong number
// of fields, or a word cannot be decoded. A failed insert() is only logged.
bool Trie::_trieInsert(const char * const filePath)
{
    ifstream ifile(filePath);
    if(!ifile)
    {
        LogError("open file[%s] failed.", filePath);
        return false;
    }
    string line;
    vector<string> vecBuf;
    TrieNodeInfo nodeInfo;
    while(getline(ifile, line))
    {
        vecBuf.clear();
        splitStr(line, vecBuf, " ");
        // Both bounds matter: vecBuf[0] and vecBuf[1] are read below, so a
        // blank or one-field line must be rejected as well as an over-long one.
        if(vecBuf.size() < 2 || 3 < vecBuf.size())
        {
            LogError("line[%s] illegal.", line.c_str());
            return false;
        }
        if(!TransCode::decode(vecBuf[0], nodeInfo.word))
        {
            return false;
        }
        nodeInfo.freq = atoi(vecBuf[1].c_str());
        if(3 == vecBuf.size())
        {
            nodeInfo.tag = vecBuf[2];
        }
        else
        {
            // nodeInfo is reused across lines; without this a two-field line
            // would inherit the tag of the previous three-field line.
            nodeInfo.tag.clear();
        }
        //insert node
        if(!insert(nodeInfo))
        {
            LogError("insert node failed!");
        }
    }
    return true;
}
bool Trie::dispose()
{
if(!_getInitFlag())
{
return false;
}
bool ret = _deleteNode(_root);
if(!ret)
{
LogFatal("_deleteNode failed!");
return false;
}
_root = NULL;
_nodeInfoVec.clear();
_setInitFlag(false);
return ret;
}
// Walks the trie along the decoded form of str and returns the entry of the
// last word-ending node passed on the way, or NULL if none was passed, the
// trie is not initialised, or str cannot be decoded.
// NOTE(review): a node is tested for isLeaf only before stepping to its
// child, so the node reached by the final code unit is never tested; a word
// equal to the whole of str is therefore not returned. Confirm this is the
// intended contract before changing it.
const TrieNodeInfo* Trie::findPrefix(const string& str)const
{
    if(!_getInitFlag())
    {
        LogFatal("trie not initted!");
        return NULL;
    }
    Unicode uintVec;
    if(!TransCode::decode(str, uintVec))
    {
        LogError("TransCode::decode failed.");
        return NULL;
    }
    const TrieNode* p = _root;
    const TrieNodeInfo* res = NULL;
    for(size_t i = 0; i < uintVec.size(); i++)
    {
        if(p->isLeaf)
        {
            const uint pos = p->nodeInfoVecPos;
            if(pos >= _nodeInfoVec.size())
            {
                LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
                return NULL;
            }
            res = &(_nodeInfoVec[pos]);
        }
        // One lookup serves both the existence test and the descent.
        TrieNodeMap::const_iterator child = p->hmap.find(uintVec[i]);
        if(child == p->hmap.end())
        {
            break;
        }
        p = child->second;
    }
    return res;
}
// Exact-match lookup for an encoded string: decodes it and delegates to the
// Unicode overload. Returns NULL when decoding fails.
const TrieNodeInfo* Trie::find(const string& str)const
{
    Unicode decoded;
    if(TransCode::decode(str, decoded))
    {
        return find(decoded);
    }
    return NULL;
}
// Exact-match lookup for an already decoded word; an empty word never matches.
const TrieNodeInfo* Trie::find(const Unicode& uintVec)const
{
    if(!uintVec.empty())
    {
        return find(uintVec.begin(), uintVec.end());
    }
    return NULL;
}
// Exact-match lookup for the code units in [begin, end).
// Returns the stored entry when the whole range spells a dictionary word,
// otherwise NULL (also when the trie is not initialised or the range is empty).
const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)const
{
    if(!_getInitFlag())
    {
        LogFatal("trie not initted!");
        return NULL;
    }
    if(begin >= end)
    {
        return NULL;
    }
    const TrieNode* p = _root;
    for(Unicode::const_iterator it = begin; it != end; ++it)
    {
        // A single find() replaces the former find() + operator[] pair.
        TrieNodeMap::const_iterator child = p->hmap.find(*it);
        if(child == p->hmap.end())
        {
            return NULL;
        }
        p = child->second;
    }
    if(!p->isLeaf)
    {
        return NULL;
    }
    const uint pos = p->nodeInfoVecPos;
    if(pos >= _nodeInfoVec.size())
    {
        LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
        return NULL;
    }
    return &(_nodeInfoVec[pos]);
}
bool Trie::find(const Unicode& unico, vector<pair<uint, const TrieNodeInfo*> >& res)const
{
if(!_getInitFlag())
{
LogFatal("trie not initted!");
return false;
}
TrieNode* p = _root;
//for(Unicode::const_iterator it = begin; it != end; it++)
for(uint i = 0; i < unico.size(); i++)
{
if(p->hmap.find(unico[i]) == p-> hmap.end())
{
break;
}
p = p->hmap[unico[i]];
if(p->isLeaf)
{
uint pos = p->nodeInfoVecPos;
if(pos < _nodeInfoVec.size())
{
res.push_back(make_pair(i, &_nodeInfoVec[pos]));
}
else
{
LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
return false;
}
}
}
return !res.empty();
}
// Recursively deletes node and every node reachable through its child table.
// Children are released before the node itself. Always returns true.
bool Trie::_deleteNode(TrieNode* node)
{
    TrieNodeMap& children = node->hmap;
    TrieNodeMap::iterator child = children.begin();
    while(child != children.end())
    {
        _deleteNode(child->second);
        child++;
    }
    delete node;
    return true;
}
bool Trie::insert(const TrieNodeInfo& nodeInfo)
{
if(!_getInitFlag())
{
LogFatal("not initted!");
return false;
}
const Unicode& uintVec = nodeInfo.word;
TrieNode* p = _root;
for(uint i = 0; i < uintVec.size(); i++)
{
uint16_t cu = uintVec[i];
if(NULL == p)
{
return false;
}
if(p->hmap.end() == p->hmap.find(cu))
{
TrieNode * next = NULL;
try
{
next = new TrieNode;
}
catch(const bad_alloc& e)
{
return false;
}
p->hmap[cu] = next;
p = next;
}
else
{
p = p->hmap[cu];
}
}
if(NULL == p)
{
return false;
}
if(p->isLeaf)
{
LogError("this node already inserted");
return false;
}
p->isLeaf = true;
_nodeInfoVec.push_back(nodeInfo);
p->nodeInfoVecPos = _nodeInfoVec.size() - 1;
return true;
}
bool Trie::_countWeight()
{
if(_nodeInfoVec.empty() || 0 != _freqSum)
{
LogError("_nodeInfoVec is empty or _freqSum has been counted already.");
return false;
}
//freq total freq
for(size_t i = 0; i < _nodeInfoVec.size(); i++)
{
_freqSum += _nodeInfoVec[i].freq;
}
if(0 == _freqSum)
{
LogError("_freqSum == 0 .");
return false;
}
//normalize
for(uint i = 0; i < _nodeInfoVec.size(); i++)
{
TrieNodeInfo& nodeInfo = _nodeInfoVec[i];
if(0 == nodeInfo.freq)
{
LogFatal("nodeInfo.freq == 0!");
return false;
}
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum));
if(_minLogFreq > nodeInfo.logFreq)
{
_minLogFreq = nodeInfo.logFreq;
}
}
return true;
}
}
#ifdef TRIE_UT
using namespace CppJieba;
// Manual smoke test (built only with TRIE_UT): loads the dictionary and
// prints the smallest log-frequency. Exits with 1 when setup fails instead of
// printing the untouched sentinel value.
int main()
{
    Trie trie;
    if(!trie.init())
    {
        return 1;
    }
    if(!trie.loadDict("../dicts/segdict.gbk.v2.1"))
    {
        return 1;
    }
    //trie.loadDict("tmp");
    cout<<trie.getMinLogFreq()<<endl;
    trie.dispose();
    return 0;
}
#endif

View File

@ -1,85 +0,0 @@
/************************************
* file enc : ASCII
* author : wuyanyi09@gmail.com
************************************/
#ifndef CPPJIEBA_TRIE_H
#define CPPJIEBA_TRIE_H
#include <iostream>
#include <fstream>
#include <map>
#include <cstring>
#include <stdint.h>
#include <cmath>
#include <limits>
#include "Limonp/str_functs.hpp"
#include "Limonp/logger.hpp"
#include "TransCode.hpp"
#include "globals.h"
#include "structs.h"
namespace CppJieba
{
using namespace Limonp;
struct TrieNode
{
TrieNodeMap hmap;
bool isLeaf;
uint nodeInfoVecPos;
TrieNode()
{
isLeaf = false;
nodeInfoVecPos = 0;
}
};
// Prefix tree over 16-bit code units that stores dictionary entries.
// NOTE(review): the class holds _root as a raw pointer and declares no copy
// constructor or assignment operator, so copying a Trie looks unsafe -
// confirm that no caller copies one.
class Trie
{
    private:
        TrieNode* _root;                     // root node; NULL until init() succeeds
        vector<TrieNodeInfo> _nodeInfoVec;   // one entry per inserted word
        bool _initFlag;
        int64_t _freqSum;                    // sum of all entry frequencies
        double _minLogFreq;                  // smallest logFreq among the entries

    public:
        // Lifecycle.
        Trie();
        ~Trie();
        bool init();
        bool loadDict(const char * const filePath);
        bool dispose();

    private:
        void _setInitFlag(bool on) { _initFlag = on; }
        bool _getInitFlag()const { return _initFlag; }

    public:
        // Lookups.
        const TrieNodeInfo* find(const string& str)const;
        const TrieNodeInfo* find(const Unicode& uintVec)const;
        const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const;
        bool find(const Unicode& unico, vector<pair<uint, const TrieNodeInfo*> >& res)const;
        const TrieNodeInfo* findPrefix(const string& str)const;

    public:
        double getMinLogFreq()const { return _minLogFreq; }
        bool insert(const TrieNodeInfo& nodeInfo);

    private:
        // Helpers used by loadDict() and dispose().
        bool _trieInsert(const char * const filePath);
        bool _countWeight();
        bool _deleteNode(TrieNode* node);
};
}
#endif

441
src/Trie.hpp Normal file
View File

@ -0,0 +1,441 @@
/************************************
* file enc : ASCII
* author : wuyanyi09@gmail.com
************************************/
#ifndef CPPJIEBA_TRIE_H
#define CPPJIEBA_TRIE_H
#include <iostream>
#include <fstream>
#include <map>
#include <cstring>
#include <stdint.h>
#include <cmath>
#include <limits>
#include "Limonp/str_functs.hpp"
#include "Limonp/logger.hpp"
#include "TransCode.hpp"
namespace CppJieba
{
using namespace Limonp;
// Large-magnitude sentinels: MAX_DOUBLE is the starting value of
// Trie::_minLogFreq, and MIN_DOUBLE is the logFreq given to a TrieNodeInfo
// constructed from a bare word.
const double MIN_DOUBLE = -3.14e+100;
const double MAX_DOUBLE = 3.14e+100;
// Child table of a trie node, keyed by a 16-bit code unit.
typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
struct TrieNode
{
TrieNodeMap hmap;
bool isLeaf;
uint nodeInfoVecPos;
TrieNode()
{
isLeaf = false;
nodeInfoVecPos = 0;
}
};
// Dictionary entry stored by the trie: the decoded word and its statistics.
struct TrieNodeInfo
{
    Unicode word;    // the word as 16-bit code units
    size_t freq;     // raw frequency of the word
    string tag;      // tag attached to the word; may be empty
    double logFreq;  // logFreq = log(freq/sum(freq));
    TrieNodeInfo():freq(0),logFreq(0.0)
    {
    }
    TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
    {
    }
    // Entry for a bare word: zero frequency and the MIN_DOUBLE sentinel as logFreq.
    TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
    {
    }
    // Renders the entry as "{word:...,freq:..., logFreq:...}" for diagnostics.
    string toString()const
    {
        string tmp;
        TransCode::encode(word, tmp);
        // freq is a size_t; "%d" expects an int, which is a mismatch wherever
        // size_t is wider than int. Cast explicitly and use "%lu".
        return string_format("{word:%s,freq:%lu, logFreq:%lf}", tmp.c_str(), static_cast<unsigned long>(freq), logFreq);
    }
};
// Maps a uint key to a dictionary entry.
// NOTE(review): the key is presumably a character position in the text being
// segmented - confirm against the segmenters that fill this map.
typedef unordered_map<uint, const TrieNodeInfo*> DagType;
class Trie
{
private:
TrieNode* _root;
vector<TrieNodeInfo> _nodeInfoVec;
bool _initFlag;
int64_t _freqSum;
double _minLogFreq;
public:
Trie()
{
_root = NULL;
_freqSum = 0;
_minLogFreq = MAX_DOUBLE;
_initFlag = false;
}
// Frees every node still owned by the trie.
~Trie()
{
    // The result is intentionally ignored: dispose() returns false when the
    // trie was never initialised, which is not an error during destruction.
    this->dispose();
}
bool init()
{
if(_getInitFlag())
{
LogError("already initted!");
return false;
}
try
{
_root = new TrieNode;
}
catch(const bad_alloc& e)
{
return false;
}
if(NULL == _root)
{
return false;
}
_setInitFlag(true);
return true;
}
bool dispose()
{
if(!_getInitFlag())
{
return false;
}
bool ret = _deleteNode(_root);
if(!ret)
{
LogFatal("_deleteNode failed!");
return false;
}
_root = NULL;
_nodeInfoVec.clear();
_setInitFlag(false);
return ret;
}
// Loads the dictionary at filePath into the trie and computes the
// log-frequency of every entry.
// Returns false when the trie is not initialised, the path is NULL or does
// not exist, a line cannot be parsed, or the weights cannot be computed.
bool loadDict(const char * const filePath)
{
    if(!_getInitFlag())
    {
        LogError("not initted.");
        return false;
    }
    // Guard first: a NULL path must never reach the "%s" formatter below.
    if(NULL == filePath)
    {
        LogError("filePath is NULL.");
        return false;
    }
    if(!checkFileExist(filePath))
    {
        LogError("can't find file[%s].", filePath);
        return false;
    }
    if(!_trieInsert(filePath))
    {
        LogError("_trieInsert failed.");
        return false;
    }
    if(!_countWeight())
    {
        LogError("_countWeight failed.");
        return false;
    }
    return true;
}
private:
// Records whether the trie is initialised; set by init(), cleared by dispose().
void _setInitFlag(bool on)
{
    _initFlag = on;
}
// Reports whether the trie is currently initialised.
bool _getInitFlag()const
{
    return _initFlag;
}
public:
// Exact-match lookup for an encoded string: decodes it and delegates to the
// Unicode overload. Returns NULL when decoding fails.
const TrieNodeInfo* find(const string& str)const
{
    Unicode decoded;
    if(TransCode::decode(str, decoded))
    {
        return find(decoded);
    }
    return NULL;
}
// Exact-match lookup for an already decoded word; an empty word never matches.
const TrieNodeInfo* find(const Unicode& uintVec)const
{
    if(!uintVec.empty())
    {
        return find(uintVec.begin(), uintVec.end());
    }
    return NULL;
}
// Exact-match lookup for the code units in [begin, end).
// Returns the stored entry when the whole range spells a dictionary word,
// otherwise NULL (also when the trie is not initialised or the range is empty).
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const
{
    if(!_getInitFlag())
    {
        LogFatal("trie not initted!");
        return NULL;
    }
    if(begin >= end)
    {
        return NULL;
    }
    const TrieNode* p = _root;
    for(Unicode::const_iterator it = begin; it != end; ++it)
    {
        // A single find() replaces the former find() + operator[] pair.
        TrieNodeMap::const_iterator child = p->hmap.find(*it);
        if(child == p->hmap.end())
        {
            return NULL;
        }
        p = child->second;
    }
    if(!p->isLeaf)
    {
        return NULL;
    }
    const uint pos = p->nodeInfoVecPos;
    if(pos >= _nodeInfoVec.size())
    {
        LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
        return NULL;
    }
    return &(_nodeInfoVec[pos]);
}
bool find(const Unicode& unico, vector<pair<uint, const TrieNodeInfo*> >& res)const
{
if(!_getInitFlag())
{
LogFatal("trie not initted!");
return false;
}
TrieNode* p = _root;
for(uint i = 0; i < unico.size(); i++)
{
if(p->hmap.find(unico[i]) == p-> hmap.end())
{
break;
}
p = p->hmap[unico[i]];
if(p->isLeaf)
{
uint pos = p->nodeInfoVecPos;
if(pos < _nodeInfoVec.size())
{
res.push_back(make_pair(i, &_nodeInfoVec[pos]));
}
else
{
LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
return false;
}
}
}
return !res.empty();
}
// Walks the trie along the decoded form of str and returns the entry of the
// last word-ending node passed on the way, or NULL if none was passed, the
// trie is not initialised, or str cannot be decoded.
// NOTE(review): a node is tested for isLeaf only before stepping to its
// child, so the node reached by the final code unit is never tested; a word
// equal to the whole of str is therefore not returned. Confirm this is the
// intended contract before changing it.
const TrieNodeInfo* findPrefix(const string& str)const
{
    if(!_getInitFlag())
    {
        LogFatal("trie not initted!");
        return NULL;
    }
    Unicode uintVec;
    if(!TransCode::decode(str, uintVec))
    {
        LogError("TransCode::decode failed.");
        return NULL;
    }
    const TrieNode* p = _root;
    const TrieNodeInfo* res = NULL;
    for(size_t i = 0; i < uintVec.size(); i++)
    {
        if(p->isLeaf)
        {
            const uint pos = p->nodeInfoVecPos;
            if(pos >= _nodeInfoVec.size())
            {
                LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
                return NULL;
            }
            res = &(_nodeInfoVec[pos]);
        }
        // One lookup serves both the existence test and the descent.
        TrieNodeMap::const_iterator child = p->hmap.find(uintVec[i]);
        if(child == p->hmap.end())
        {
            break;
        }
        p = child->second;
    }
    return res;
}
public:
// Smallest logFreq among the loaded entries; still the MAX_DOUBLE sentinel
// set by the constructor if _countWeight() has not assigned anything lower.
double getMinLogFreq()const
{
    return _minLogFreq;
}
bool insert(const TrieNodeInfo& nodeInfo)
{
if(!_getInitFlag())
{
LogFatal("not initted!");
return false;
}
const Unicode& uintVec = nodeInfo.word;
TrieNode* p = _root;
for(uint i = 0; i < uintVec.size(); i++)
{
uint16_t cu = uintVec[i];
if(NULL == p)
{
return false;
}
if(p->hmap.end() == p->hmap.find(cu))
{
TrieNode * next = NULL;
try
{
next = new TrieNode;
}
catch(const bad_alloc& e)
{
return false;
}
p->hmap[cu] = next;
p = next;
}
else
{
p = p->hmap[cu];
}
}
if(NULL == p)
{
return false;
}
if(p->isLeaf)
{
LogError("this node already inserted");
return false;
}
p->isLeaf = true;
_nodeInfoVec.push_back(nodeInfo);
p->nodeInfoVecPos = _nodeInfoVec.size() - 1;
return true;
}
private:
// Reads the dictionary file line by line and inserts one entry per line.
// Each line is split on single spaces into "word freq" or "word freq tag".
// Returns false when the file cannot be opened, a line has the wrong number
// of fields, or a word cannot be decoded. A failed insert() is only logged.
bool _trieInsert(const char * const filePath)
{
    ifstream ifile(filePath);
    if(!ifile)
    {
        LogError("open file[%s] failed.", filePath);
        return false;
    }
    string line;
    vector<string> vecBuf;
    TrieNodeInfo nodeInfo;
    while(getline(ifile, line))
    {
        vecBuf.clear();
        splitStr(line, vecBuf, " ");
        // Both bounds matter: vecBuf[0] and vecBuf[1] are read below, so a
        // blank or one-field line must be rejected as well as an over-long one.
        if(vecBuf.size() < 2 || 3 < vecBuf.size())
        {
            LogError("line[%s] illegal.", line.c_str());
            return false;
        }
        if(!TransCode::decode(vecBuf[0], nodeInfo.word))
        {
            return false;
        }
        nodeInfo.freq = atoi(vecBuf[1].c_str());
        if(3 == vecBuf.size())
        {
            nodeInfo.tag = vecBuf[2];
        }
        else
        {
            // nodeInfo is reused across lines; without this a two-field line
            // would inherit the tag of the previous three-field line.
            nodeInfo.tag.clear();
        }
        //insert node
        if(!insert(nodeInfo))
        {
            LogError("insert node failed!");
        }
    }
    return true;
}
bool _countWeight()
{
if(_nodeInfoVec.empty() || 0 != _freqSum)
{
LogError("_nodeInfoVec is empty or _freqSum has been counted already.");
return false;
}
//freq total freq
for(size_t i = 0; i < _nodeInfoVec.size(); i++)
{
_freqSum += _nodeInfoVec[i].freq;
}
if(0 == _freqSum)
{
LogError("_freqSum == 0 .");
return false;
}
//normalize
for(uint i = 0; i < _nodeInfoVec.size(); i++)
{
TrieNodeInfo& nodeInfo = _nodeInfoVec[i];
if(0 == nodeInfo.freq)
{
LogFatal("nodeInfo.freq == 0!");
return false;
}
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum));
if(_minLogFreq > nodeInfo.logFreq)
{
_minLogFreq = nodeInfo.logFreq;
}
}
return true;
}
// Recursively deletes node and every node reachable through its child table.
// Children are released before the node itself. Always returns true.
bool _deleteNode(TrieNode* node)
{
    TrieNodeMap& children = node->hmap;
    TrieNodeMap::iterator child = children.begin();
    while(child != children.end())
    {
        _deleteNode(child->second);
        child++;
    }
    delete node;
    return true;
}
};
}
#endif

View File

@ -1,36 +0,0 @@
/************************************
* file enc : ASCII
* author : wuyanyi09@gmail.com
************************************/
#ifndef CPPJIEBA_GLOBALS_H
#define CPPJIEBA_GLOBALS_H
#include <map>
#include <vector>
#include <string>
#include <sys/types.h>
#include <stdint.h>
//#include <hash_map>
#include <tr1/unordered_map>
//#include <ext/hash_map>
// Shared using-directives, typedefs, constants and enums for the CppJieba headers.
namespace CppJieba
{
// NOTE(review): a using-directive in a header makes all of std visible inside
// CppJieba for every includer; headers that include this file (e.g. Trie.h)
// use vector/string unqualified and appear to rely on it.
using namespace std;
using std::tr1::unordered_map;
//using __gnu_cxx::hash_map;
//using namespace stdext;
//typedefs
typedef std::vector<std::string>::iterator VSI;
// Text represented as a sequence of 16-bit code units.
typedef std::vector<uint16_t> Unicode;
typedef Unicode::const_iterator UniConIter;
// Child table of a trie node, keyed by a 16-bit code unit.
typedef unordered_map<uint16_t, struct TrieNode*> TrieNodeMap;
// Maps a 16-bit code unit to a double - presumably an HMM emission
// probability, judging by the name; confirm against HMMSegment.
typedef unordered_map<uint16_t, double> EmitProbMap;
// Large-magnitude sentinels used as initial/fallback values.
const double MIN_DOUBLE = -3.14e+100;
const double MAX_DOUBLE = 3.14e+100;
// Character classes - presumably used by ChineseFilter; verify against that header.
enum CHAR_TYPE { CHWORD = 0, DIGIT_OR_LETTER = 1, OTHERS = 2};
}
#endif

View File

@ -1,9 +1,9 @@
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
#include "Limonp/ArgvContext.hpp" #include "Limonp/ArgvContext.hpp"
#include "MPSegment.h" #include "MPSegment.hpp"
#include "HMMSegment.h" #include "HMMSegment.hpp"
#include "MixSegment.h" #include "MixSegment.hpp"
using namespace CppJieba; using namespace CppJieba;

View File

@ -7,9 +7,9 @@
#include "Limonp/Config.hpp" #include "Limonp/Config.hpp"
#include "Husky/Daemon.h" #include "Husky/Daemon.h"
#include "Husky/ServerFrame.h" #include "Husky/ServerFrame.h"
#include "MPSegment.h" #include "MPSegment.hpp"
#include "HMMSegment.h" #include "HMMSegment.hpp"
#include "MixSegment.h" #include "MixSegment.hpp"
using namespace Husky; using namespace Husky;
using namespace CppJieba; using namespace CppJieba;

View File

@ -1,111 +0,0 @@
#ifndef CPPJIEBA_STRUCTS_H
#define CPPJIEBA_STRUCTS_H
#include <limits>
#include "globals.h"
#include "Trie.h"
#include "TransCode.hpp"
namespace CppJieba
{
// Dictionary entry stored by the trie: the decoded word and its statistics.
struct TrieNodeInfo
{
    Unicode word;    // the word as 16-bit code units
    size_t freq;     // raw frequency of the word
    string tag;      // tag attached to the word; may be empty
    double logFreq;  // logFreq = log(freq/sum(freq));
    TrieNodeInfo():freq(0),logFreq(0.0)
    {
    }
    TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq)
    {
    }
    // Entry for a bare word: zero frequency and the MIN_DOUBLE sentinel as logFreq.
    TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE)
    {
    }
    // Renders the entry as "{word:...,freq:..., logFreq:...}" for diagnostics.
    string toString()const
    {
        string tmp;
        TransCode::encode(word, tmp);
        // freq is a size_t; "%d" expects an int, which is a mismatch wherever
        // size_t is wider than int. Cast explicitly and use "%lu".
        return string_format("{word:%s,freq:%lu, logFreq:%lf}", tmp.c_str(), static_cast<unsigned long>(freq), logFreq);
    }
};
// Maps a uint key to a dictionary entry.
// NOTE(review): the key is presumably a character position in the text being
// segmented - confirm against the segmenters that fill this map.
typedef unordered_map<uint, const TrieNodeInfo*> DagType;
// Per-character working state used while segmenting.
struct SegmentChar
{
    uint16_t uniCh;               // the 16-bit code unit itself
    DagType dag;                  // dictionary entries associated with this character
    const TrieNodeInfo * pInfo;   // entry selected for this character; NULL until assigned
    double weight;                // score associated with pInfo; starts at 0.0

    SegmentChar(uint16_t uni)
        : uniCh(uni), pInfo(NULL), weight(0.0)
    {
    }
};
/*
struct SegmentContext
{
vector<SegmentChar> context;
bool getDA
};*/
// Ordered SegmentChar entries for one input - presumably one per code unit of
// the text being segmented; confirm against the callers.
typedef vector<SegmentChar> SegmentContext;
// A dictionary entry extended with keyword-extraction scores.
struct KeyWordInfo: public TrieNodeInfo
{
    double idf;
    double weight;// log(wLen+1)*logFreq;
    KeyWordInfo():idf(0.0),weight(0.0)
    {
    }
    KeyWordInfo(const Unicode& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0)
    {
    }
    // Copies the dictionary fields. The scores start at 0.0 like in the other
    // constructors; they were previously left uninitialised here.
    KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo),idf(0.0),weight(0.0)
    {
    }
    // Renders the keyword as "{word:...,weight:..., idf:...}" for diagnostics.
    string toString() const
    {
        string tmp;
        TransCode::encode(word, tmp);
        return string_format("{word:%s,weight:%lf, idf:%lf}", tmp.c_str(), weight, idf);
    }
    // Replaces only the dictionary fields; idf and weight keep their current values.
    KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo)
    {
        word = trieNodeInfo.word;
        freq = trieNodeInfo.freq;
        tag = trieNodeInfo.tag;
        logFreq = trieNodeInfo.logFreq;
        return *this;
    }
};
// Streams a keyword as "{words:<encoded word>, weight:<weight>, idf:<idf>}".
inline ostream& operator << (ostream& os, const KeyWordInfo& info)
{
    string encoded;
    TransCode::encode(info.word, encoded);
    os << "{words:" << encoded;
    os << ", weight:" << info.weight;
    os << ", idf:" << info.idf << "}";
    return os;
}
//inline string joinWordInfos(const vector<KeyWordInfo>& vec)
//{
// vector<string> tmp;
// for(uint i = 0; i < vec.size(); i++)
// {
// tmp.push_back(vec[i].toString());
// }
// return joinStr(tmp, ",");
//}
}
#endif