cppjieba/src/Segment.cpp
2013-08-19 01:29:46 +08:00

238 lines
4.7 KiB
C++

/************************************
* file enc : ASCII
* author : wuyanyi09@gmail.com
************************************/
#include "Segment.h"
namespace CppJieba
{
// Default constructor. The body is intentionally empty: no set-up is done
// here; the trie is initialised separately through init().
Segment::Segment()
{
}
// Destructor. The body is empty; in particular it does not call dispose().
Segment::~Segment()
{
}
/**
 * Initialises the underlying trie.
 *
 * @return true when _trie.init() succeeds; false otherwise, after logging
 *         an error.
 */
bool Segment::init()
{
    const bool trieReady = _trie.init();
    if(trieReady)
    {
        return true;
    }
    LogError("_trie.init failed.");
    return false;
}
/**
 * Loads the segmentation dictionary into the trie, logging before and after.
 *
 * @param filePath  path handed unchanged to _trie.loadDict().
 * @return whatever _trie.loadDict() reported.
 */
bool Segment::loadSegDict(const char * const filePath)
{
    LogInfo(string_format("_trie.loadDict(%s) start...", filePath));
    const bool loaded = _trie.loadDict(filePath);
    LogInfo("_trie.loadDict end.");
    return loaded;
}
/**
 * Releases the trie by forwarding to _trie.dispose().
 *
 * @return the result of _trie.dispose().
 */
bool Segment::dispose()
{
    const bool released = _trie.dispose();
    return released;
}
/**
 * Segments `str` and returns only the text of each segment.
 *
 * Thin wrapper over the TrieNodeInfo overload of cutDAG(): it runs the full
 * segmentation and then copies the `word` field of every resulting node.
 *
 * @param str  text to segment.
 * @param res  output vector. It is reset on entry, so it is empty (never
 *             stale) when the function returns false.
 * @return true on success; false when the TrieNodeInfo overload fails.
 */
bool Segment::cutDAG(const string& str, vector<string>& res)
{
    // Reset the output first so a failed call cannot leave tokens from an
    // earlier call behind for the caller to misread.
    res.clear();

    vector<TrieNodeInfo> segWordInfos;
    if(!cutDAG(str, segWordInfos))
    {
        return false;
    }

    // The final size is known, so allocate once instead of growing.
    res.reserve(segWordInfos.size());
    for(size_t i = 0; i < segWordInfos.size(); ++i)
    {
        res.push_back(segWordInfos[i].word);
    }
    return true;
}
/**
 * Segments `str` into a sequence of TrieNodeInfo records.
 *
 * Steps, each of which aborts the call with `false` on failure:
 *   1. TransCode::strToVec() converts `str` into segContext.uintVec.
 *   2. _calcDAG() builds the candidate graph.
 *   3. _calcDP() scores the candidates.
 *   4. _cutDAG() emits the chosen segments into `segWordInfos`.
 *
 * @param str           text to segment; an empty string is rejected.
 * @param segWordInfos  output vector. It is reset on entry, so every failure
 *                      path (including the empty-input one) leaves it free
 *                      of results from a previous call.
 * @return true on success, false on empty input or when any step fails.
 */
bool Segment::cutDAG(const string& str, vector<TrieNodeInfo>& segWordInfos)
{
    // Clear before the empty-input check so all failure paths behave alike.
    segWordInfos.clear();
    if(str.empty())
    {
        return false;
    }

    SegmentContext segContext;
    if(!TransCode::strToVec(str, segContext.uintVec))
    {
        LogError("TransCode::strToVec failed.");
        return false;
    }

    // Build the graph of candidate words.
    if(!_calcDAG(segContext))
    {
        LogError("_calcDAG failed.");
        return false;
    }

    // Score the candidates.
    if(!_calcDP(segContext))
    {
        LogError("_calcDP failed.");
        return false;
    }

    // Walk the scored table and emit the segments.
    if(!_cutDAG(segContext, segWordInfos))
    {
        LogError("_cutDAG failed.");
        return false;
    }
    return true;
}
/**
 * Builds segContext.dag from segContext.uintVec.
 *
 * For every start index i, dag[i] receives a list of (end index, node)
 * pairs:
 *   - first the single-element candidate (i, NULL);
 *   - then, for every j > i, the pair (j, node) whenever
 *     _trie.find() on the range [i, j] returns a non-NULL node.
 *
 * The DAG is reset on entry, so calling this on a context that already
 * holds a DAG rebuilds it instead of appending to the old one.
 *
 * @param segContext  context whose uintVec is read and whose dag is written.
 * @return false when uintVec is empty, true otherwise.
 */
bool Segment::_calcDAG(SegmentContext& segContext)
{
    // Drop any previous content; _calcDP requires dag.size() == uintVec.size().
    segContext.dag.clear();
    if(segContext.uintVec.empty())
    {
        return false;
    }

    typedef pair<uint, const TrieNodeInfo*> DagEdge;
    const VUINT16_CONST_ITER beginIter = segContext.uintVec.begin();
    const VUINT16_CONST_ITER endIter = segContext.uintVec.end();

    vector<DagEdge> edges;
    for(VUINT16_CONST_ITER iterI = beginIter; iterI != endIter; ++iterI)
    {
        edges.clear();
        // A single element is always a candidate, with no trie node attached.
        edges.push_back(DagEdge(iterI - beginIter, NULL));
        for(VUINT16_CONST_ITER iterJ = iterI + 1; iterJ != endIter; ++iterJ)
        {
            // iterJ + 1 is the exclusive end of the looked-up range; it may
            // equal endIter but never goes past it because iterJ != endIter.
            const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1);
            if(NULL != ptNodeInfo)
            {
                edges.push_back(DagEdge(iterJ - beginIter, ptNodeInfo));
            }
        }
        segContext.dag.push_back(edges);
    }
    return true;
}
/**
 * Fills segContext.dp by scanning the DAG from the last position to the first.
 *
 * dp[i] ends up holding, among the candidates in dag[i], the node with the
 * highest score and that score. A candidate ending at index `pos` scores
 * dp[pos + 1].second plus the node's logFreq, or _trie.getMinLogFreq() when
 * the candidate carries no node.
 *
 * @param segContext  context whose uintVec and dag are read and whose dp is
 *                    rewritten; on success dp has one entry per uintVec element.
 * @return false (after logging) when uintVec is empty or dag's size differs
 *         from uintVec's; true otherwise.
 */
bool Segment::_calcDP(SegmentContext& segContext)
{
    const size_t len = segContext.uintVec.size();
    if(0 == len)
    {
        LogError("uintVec illegal");
        return false;
    }
    if(segContext.dag.size() != len)
    {
        LogError("dag is illegal!");
        return false;
    }

    // len + 1 slots: the extra trailing slot is the "past the end" state
    // (NULL node, score 0.0) that candidates reaching the end refer to.
    segContext.dp.assign(len + 1, pair<const TrieNodeInfo*, double>(NULL, 0.0));

    for(size_t start = len; start-- > 0; )
    {
        pair<const TrieNodeInfo*, double>& best = segContext.dp[start];
        // Start from the lowest possible score so any candidate beats it.
        best.first = NULL;
        best.second = -(numeric_limits<double>::max());

        for(uint k = 0; k < segContext.dag[start].size(); k++)
        {
            const pair<uint , const TrieNodeInfo*>& edge = segContext.dag[start][k];
            const int endPos = edge.first;
            double score = segContext.dp[endPos + 1].second;
            if(NULL == edge.second)
            {
                score += _trie.getMinLogFreq();
            }
            else
            {
                score += (edge.second)->logFreq;
            }
            if(score > best.second)
            {
                best.first = edge.second;
                best.second = score;
            }
        }
    }

    // Remove the helper slot so dp lines up with uintVec.
    segContext.dp.pop_back();
    return true;
}
/**
 * Emits the segments recorded in segContext.dp into `res`.
 *
 * Starting at position 0, the node stored in dp[pos] is copied to `res` and
 * the position advances by that node's wLen. Where dp[pos] holds no node, a
 * one-element TrieNodeInfo is synthesised (freq 0, logFreq taken from
 * _trie.getMinLogFreq()) and the position advances by one.
 *
 * @param segContext  context providing dp and uintVec (equal, non-zero sizes).
 * @param res         output vector; cleared once the sanity check passes.
 * @return false (after logging) when dp/uintVec are empty or differ in size,
 *         or when a node with wLen == 0 is met; true otherwise.
 */
bool Segment::_cutDAG(SegmentContext& segContext, vector<TrieNodeInfo>& res)
{
    const bool sizesMatch = (segContext.dp.size() == segContext.uintVec.size());
    if(segContext.dp.empty() || segContext.uintVec.empty() || !sizesMatch)
    {
        LogError("dp or uintVec illegal!");
        return false;
    }

    res.clear();
    const VUINT16_CONST_ITER first = segContext.uintVec.begin();
    for(uint pos = 0; pos < segContext.dp.size(); )
    {
        const TrieNodeInfo* node = segContext.dp[pos].first;
        if(NULL != node)
        {
            res.push_back(*node);
            // A zero-length word would never advance pos; bail out instead.
            if(0 == node->wLen)
            {
                LogFatal("TrieNodeInfo's wLen is 0!");
                return false;
            }
            pos += node->wLen;
            continue;
        }

        // No node chosen here: emit the single element at pos on its own.
        TrieNodeInfo single;
        single.word = TransCode::vecToStr(first + pos, first + pos + 1);
        single.wLen = 1;
        single.freq = 0;
        single.logFreq = _trie.getMinLogFreq();
        res.push_back(single);
        ++pos;
    }
    return true;
}
}
#ifdef SEGMENT_UT
using namespace CppJieba;
/**
 * Interactive driver, compiled only when SEGMENT_UT is defined.
 *
 * Initialises a Segment, loads the dictionary, then segments the file
 * "badcase" line by line, printing each result and waiting for a key press
 * before moving on to the next line.
 *
 * @return 0 on a normal run; 1 when initialisation, dictionary loading or
 *         opening the input file fails.
 */
int main()
{
    Segment segment;
    // init() reports failure through its return value; do not carry on
    // with an unusable trie.
    if(!segment.init())
    {
        cerr<<"segment.init failed."<<endl;
        return 1;
    }
    if(!segment.loadSegDict("../dicts/segdict.gbk.v3.0"))
    {
        cerr<<"1"<<endl;
        return 1;
    }

    // Alternative input: ifstream ifile("testtitle.gbk");
    ifstream ifile("badcase");
    if(!ifile)
    {
        // Without this check a missing file silently yields zero iterations.
        cerr<<"open badcase failed."<<endl;
        segment.dispose();
        return 1;
    }

    vector<string> res;
    string line;
    while(getline(ifile, line))
    {
        res.clear();
        // The return value is not checked on purpose: res was just cleared,
        // so a line that fails to segment is passed to PRINT_VECTOR empty.
        segment.cutDAG(line, res);
        PRINT_VECTOR(res);
        getchar(); // pause until the user presses a key
    }
    segment.dispose();
    return 0;
}
#endif