add fullSegment

This commit is contained in:
aholic 2013-11-27 16:16:10 +08:00
parent 7f57443829
commit 26af60d867
3 changed files with 150 additions and 8 deletions

114
src/FullSegment.hpp Normal file
View File

@ -0,0 +1,114 @@
// Include guard: the macro defined below must be the same one tested by #ifndef,
// otherwise a second inclusion of this header redefines everything in it.
#ifndef CPPJIEBA_FULLSEGMENT_H
#define CPPJIEBA_FULLSEGMENT_H
#include <algorithm>
#include <set>
#include "Limonp/logger.hpp"
#include "Trie.hpp"
#include "ISegment.hpp"
#include "SegmentBase.hpp"
#include "TransCode.hpp"
namespace CppJieba
{
class FullSegment: public SegmentBase
{
private:
Trie _trie;
public:
FullSegment(){};
virtual ~FullSegment(){dispose();};
public:
bool init(const char* const filePath)
{
if(_getInitFlag())
{
LogError("already inited before now.");
return false;
}
if(!_trie.init())
{
LogError("_trie.init failed.");
return false;
}
LogInfo("_trie.loadDict(%s) start...", filePath);
if(!_trie.loadDict(filePath))
{
LogError("_trie.loadDict faield.");
return false;
}
LogInfo("_trie.loadDict end.");
return _setInitFlag(true);
}
bool dispose()
{
if(!_getInitFlag())
{
return true;
}
_trie.dispose();
_setInitFlag(false);
return true;
}
bool cut(const string& str, vector<string>& res) const
{
return SegmentBase::cut(str, res);
}
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
{
//resut of searching in trie tree
vector<pair<uint, const TrieNodeInfo*> > tRes;
//max index of res's words
int maxIdx = 0;
// always equals to (uItr - begin)
int uIdx = 0;
//tmp variables
int wordLen = 0;
string tmp;
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
{
//find word start from itr
if (_trie.find(uItr, end, tRes))
{
for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
{
wordLen = itr->second->word.size();
if (wordLen >= 2 || tRes.size() == 1 && maxIdx <= uIdx)
{
if (TransCode::encode(itr->second->word, tmp))
res.push_back(tmp);
else
LogError("encode failed.");
tmp.clear();
}
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
}
tRes.clear();
}
else // not found word start from itr
{
if (maxIdx <= uIdx) // never exist in prev results
{
//put itr itself in res
Unicode uTmp(1, *uItr);
if (TransCode::encode(uTmp, tmp))
{
res.push_back(tmp);
}
tmp.clear();
++maxIdx;
}
}
++uIdx;
}
}
};
}
#endif

View File

@ -214,27 +214,33 @@ namespace CppJieba
} }
return NULL; return NULL;
} }
bool find(const Unicode& unico, vector<pair<uint, const TrieNodeInfo*> >& res)const
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, vector<pair<uint, const TrieNodeInfo*> >& res) const
{ {
if(!_getInitFlag()) if(!_getInitFlag())
{ {
LogFatal("trie not initted!"); LogFatal("trie not initted!");
return false; return false;
} }
TrieNode* p = _root; if (begin >= end)
for(uint i = 0; i < unico.size(); i++)
{ {
if(p->hmap.find(unico[i]) == p-> hmap.end()) LogFatal("begin >= end");
return false;
}
TrieNode* p = _root;
for (Unicode::const_iterator itr = begin; itr != end; itr++)
{
if(p->hmap.find(*itr) == p-> hmap.end())
{ {
break; break;
} }
p = p->hmap[unico[i]]; p = p->hmap[*itr];
if(p->isLeaf) if(p->isLeaf)
{ {
uint pos = p->nodeInfoVecPos; uint pos = p->nodeInfoVecPos;
if(pos < _nodeInfoVec.size()) if(pos < _nodeInfoVec.size())
{ {
res.push_back(make_pair(i, &_nodeInfoVec[pos])); res.push_back(make_pair(itr-begin, &_nodeInfoVec[pos]));
} }
else else
{ {
@ -246,6 +252,15 @@ namespace CppJieba
return !res.empty(); return !res.empty();
} }
// Convenience overload: looks up every dictionary word that is a prefix of
// `unico` by forwarding the whole sequence to the iterator-range overload.
// An empty `unico` yields false without calling that overload.
bool find(const Unicode& unico, vector<pair<uint, const TrieNodeInfo*> >& res)const
{
    if (unico.empty())
    {
        return false;
    }
    return find(unico.begin(), unico.end(), res);
}
const TrieNodeInfo* findPrefix(const string& str)const const TrieNodeInfo* findPrefix(const string& str)const
{ {
if(!_getInitFlag()) if(!_getInitFlag())

View File

@ -4,6 +4,7 @@
#include "MPSegment.hpp" #include "MPSegment.hpp"
#include "HMMSegment.hpp" #include "HMMSegment.hpp"
#include "MixSegment.hpp" #include "MixSegment.hpp"
#include "FullSegment.hpp"
using namespace CppJieba; using namespace CppJieba;
@ -16,6 +17,7 @@ void cut(const ISegment * seg, const char * const filePath)
{ {
if(!line.empty()) if(!line.empty())
{ {
cout << line << endl;
res.clear(); res.clear();
seg->cut(line, res); seg->cut(line, res);
cout<<join(res.begin(), res.end(),"/")<<endl; cout<<join(res.begin(), res.end(),"/")<<endl;
@ -29,7 +31,7 @@ int main(int argc, char ** argv)
{ {
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n" cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
<<"options:\n" <<"options:\n"
<<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutMix\n" <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutFull, cutMix] for now. \n\t\t\tIf not specified, the default is cutMix\n"
<<"\t--dictpath\tsee example\n" <<"\t--dictpath\tsee example\n"
<<"\t--modelpath\tsee example\n" <<"\t--modelpath\tsee example\n"
<<"example:\n" <<"example:\n"
@ -67,6 +69,17 @@ int main(int argc, char ** argv)
cut(&seg, arg[1].c_str()); cut(&seg, arg[1].c_str());
seg.dispose(); seg.dispose();
} }
else if ("cutFull" == algorithm)
{
FullSegment seg;
if (!seg.init(dictPath.c_str()))
{
cout << "seg init failed" << endl;
return false;
}
cut(&seg, arg[1].c_str());
seg.dispose();
}
else else
{ {
MixSegment seg; MixSegment seg;