mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
Merge branch 'master' into dev
This commit is contained in:
commit
7106a4475f
115
src/FullSegment.hpp
Normal file
115
src/FullSegment.hpp
Normal file
@ -0,0 +1,115 @@
|
||||
#ifndef CPPJIEBA_FULLSEGMENT_H
|
||||
#define CPPJIEBA_FULLSEGMENT_H
|
||||
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include "Limonp/logger.hpp"
|
||||
#include "Trie.hpp"
|
||||
#include "ISegment.hpp"
|
||||
#include "SegmentBase.hpp"
|
||||
#include "TransCode.hpp"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
class FullSegment: public SegmentBase
|
||||
{
|
||||
private:
|
||||
Trie _trie;
|
||||
const string _dictPath;
|
||||
|
||||
public:
|
||||
FullSegment(const char* dictPath): _dictPath(dictPath){};
|
||||
virtual ~FullSegment(){dispose();};
|
||||
public:
|
||||
bool init()
|
||||
{
|
||||
if(_getInitFlag())
|
||||
{
|
||||
LogError("already inited before now.");
|
||||
return false;
|
||||
}
|
||||
if(!_trie.init())
|
||||
{
|
||||
LogError("_trie.init failed.");
|
||||
return false;
|
||||
}
|
||||
LogInfo("_trie.loadDict(%s) start...", _dictPath.c_str());
|
||||
if(!_trie.loadDict(_dictPath.c_str()))
|
||||
{
|
||||
LogError("_trie.loadDict faield.");
|
||||
return false;
|
||||
}
|
||||
LogInfo("_trie.loadDict end.");
|
||||
return _setInitFlag(true);
|
||||
}
|
||||
bool dispose()
|
||||
{
|
||||
if(!_getInitFlag())
|
||||
{
|
||||
return true;
|
||||
}
|
||||
_trie.dispose();
|
||||
_setInitFlag(false);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool cut(const string& str, vector<string>& res) const
|
||||
{
|
||||
return SegmentBase::cut(str, res);
|
||||
}
|
||||
|
||||
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
|
||||
{
|
||||
//resut of searching in trie tree
|
||||
vector<pair<uint, const TrieNodeInfo*> > tRes;
|
||||
|
||||
//max index of res's words
|
||||
int maxIdx = 0;
|
||||
|
||||
// always equals to (uItr - begin)
|
||||
int uIdx = 0;
|
||||
|
||||
//tmp variables
|
||||
int wordLen = 0;
|
||||
string tmp;
|
||||
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
|
||||
{
|
||||
//find word start from itr
|
||||
if (_trie.find(uItr, end, tRes))
|
||||
{
|
||||
for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||
{
|
||||
wordLen = itr->second->word.size();
|
||||
if (wordLen >= 2 || tRes.size() == 1 && maxIdx <= uIdx)
|
||||
{
|
||||
if (TransCode::encode(itr->second->word, tmp))
|
||||
res.push_back(tmp);
|
||||
else
|
||||
LogError("encode failed.");
|
||||
tmp.clear();
|
||||
}
|
||||
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
|
||||
}
|
||||
tRes.clear();
|
||||
}
|
||||
else // not found word start from itr
|
||||
{
|
||||
if (maxIdx <= uIdx) // never exist in prev results
|
||||
{
|
||||
//put itr itself in res
|
||||
Unicode uTmp(1, *uItr);
|
||||
if (TransCode::encode(uTmp, tmp))
|
||||
{
|
||||
res.push_back(tmp);
|
||||
}
|
||||
tmp.clear();
|
||||
++maxIdx;
|
||||
}
|
||||
}
|
||||
++uIdx;
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
27
src/Trie.hpp
27
src/Trie.hpp
@ -206,27 +206,33 @@ namespace CppJieba
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
bool find(const Unicode& unico, vector<pair<uint, const TrieNodeInfo*> >& res)const
|
||||
|
||||
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, vector<pair<uint, const TrieNodeInfo*> >& res) const
|
||||
{
|
||||
if(!_getInitFlag())
|
||||
{
|
||||
LogFatal("trie not initted!");
|
||||
return false;
|
||||
}
|
||||
TrieNode* p = _root;
|
||||
for(uint i = 0; i < unico.size(); i++)
|
||||
if (begin >= end)
|
||||
{
|
||||
if(p->hmap.find(unico[i]) == p-> hmap.end())
|
||||
LogFatal("begin >= end");
|
||||
return false;
|
||||
}
|
||||
TrieNode* p = _root;
|
||||
for (Unicode::const_iterator itr = begin; itr != end; itr++)
|
||||
{
|
||||
if(p->hmap.find(*itr) == p-> hmap.end())
|
||||
{
|
||||
break;
|
||||
}
|
||||
p = p->hmap[unico[i]];
|
||||
p = p->hmap[*itr];
|
||||
if(p->isLeaf)
|
||||
{
|
||||
uint pos = p->nodeInfoVecPos;
|
||||
if(pos < _nodeInfoVec.size())
|
||||
{
|
||||
res.push_back(make_pair(i, &_nodeInfoVec[pos]));
|
||||
res.push_back(make_pair(itr-begin, &_nodeInfoVec[pos]));
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -238,6 +244,15 @@ namespace CppJieba
|
||||
return !res.empty();
|
||||
}
|
||||
|
||||
bool find(const Unicode& unico, vector<pair<uint, const TrieNodeInfo*> >& res)const
|
||||
{
|
||||
if (!unico.empty())
|
||||
{
|
||||
return find(unico.begin(), unico.end(), res);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public:
|
||||
// Returns the value currently stored in _minLogFreq.
double getMinLogFreq()const
{
    return _minLogFreq;
}
|
||||
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include "MPSegment.hpp"
|
||||
#include "HMMSegment.hpp"
|
||||
#include "MixSegment.hpp"
|
||||
#include "FullSegment.hpp"
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
@ -16,6 +17,7 @@ void cut(const ISegment * seg, const char * const filePath)
|
||||
{
|
||||
if(!line.empty())
|
||||
{
|
||||
cout << line << endl;
|
||||
res.clear();
|
||||
if(!seg->cut(line, res))
|
||||
{
|
||||
@ -35,7 +37,7 @@ int main(int argc, char ** argv)
|
||||
{
|
||||
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
|
||||
<<"options:\n"
|
||||
<<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutMix\n"
|
||||
<<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutFull, cutMix] for now. \n\t\t\tIf not specified, the default is cutMix\n"
|
||||
<<"\t--dictpath\tsee example\n"
|
||||
<<"\t--modelpath\tsee example\n"
|
||||
<<"example:\n"
|
||||
@ -73,6 +75,17 @@ int main(int argc, char ** argv)
|
||||
cut(&seg, arg[1].c_str());
|
||||
seg.dispose();
|
||||
}
|
||||
else if ("cutFull" == algorithm)
|
||||
{
|
||||
FullSegment seg(dictPath.c_str());
|
||||
if (!seg.init())
|
||||
{
|
||||
cout << "seg init failed" << endl;
|
||||
return false;
|
||||
}
|
||||
cut(&seg, arg[1].c_str());
|
||||
seg.dispose();
|
||||
}
|
||||
else
|
||||
{
|
||||
MixSegment seg(dictPath.c_str(), modelPath.c_str());
|
||||
|
Loading…
x
Reference in New Issue
Block a user