mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add QuerySegment
This commit is contained in:
parent
7106a4475f
commit
a25007f032
@ -1,5 +1,5 @@
|
|||||||
#ifndef CPPJIEBA_FULLSEGMENT_H
|
#ifndef CPPJIEBA_FULLSEGMENT_H
|
||||||
#define CPPJIEBAi_FULLSEGMENT_H
|
#define CPPJIEBA_FULLSEGMENT_H
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <set>
|
#include <set>
|
||||||
@ -23,11 +23,13 @@ namespace CppJieba
|
|||||||
public:
|
public:
|
||||||
bool init()
|
bool init()
|
||||||
{
|
{
|
||||||
|
#ifndef NO_CODING_LOG
|
||||||
if(_getInitFlag())
|
if(_getInitFlag())
|
||||||
{
|
{
|
||||||
LogError("already inited before now.");
|
LogError("already inited before now.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
if(!_trie.init())
|
if(!_trie.init())
|
||||||
{
|
{
|
||||||
LogError("_trie.init failed.");
|
LogError("_trie.init failed.");
|
||||||
@ -44,22 +46,35 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
bool dispose()
|
bool dispose()
|
||||||
{
|
{
|
||||||
|
#ifndef NO_CODING_LOG
|
||||||
if(!_getInitFlag())
|
if(!_getInitFlag())
|
||||||
{
|
{
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
_trie.dispose();
|
_trie.dispose();
|
||||||
_setInitFlag(false);
|
_setInitFlag(false);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool cut(const string& str, vector<string>& res) const
|
public:
|
||||||
{
|
using SegmentBase::cut;
|
||||||
return SegmentBase::cut(str, res);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
|
public:
|
||||||
|
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
||||||
{
|
{
|
||||||
|
#ifndef NO_CODING_LOG
|
||||||
|
if (!_getInitFlag())
|
||||||
|
{
|
||||||
|
LogError("not inited.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (begin > end)
|
||||||
|
{
|
||||||
|
LogError("begin > end");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
//result of searching in trie tree
|
//result of searching in trie tree
|
||||||
vector<pair<uint, const TrieNodeInfo*> > tRes;
|
vector<pair<uint, const TrieNodeInfo*> > tRes;
|
||||||
|
|
||||||
@ -71,10 +86,9 @@ namespace CppJieba
|
|||||||
|
|
||||||
//tmp variables
|
//tmp variables
|
||||||
int wordLen = 0;
|
int wordLen = 0;
|
||||||
string tmp;
|
|
||||||
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
|
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
|
||||||
{
|
{
|
||||||
//find word start from itr
|
//find word start from uItr
|
||||||
if (_trie.find(uItr, end, tRes))
|
if (_trie.find(uItr, end, tRes))
|
||||||
{
|
{
|
||||||
for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||||
@ -82,32 +96,64 @@ namespace CppJieba
|
|||||||
wordLen = itr->second->word.size();
|
wordLen = itr->second->word.size();
|
||||||
if (wordLen >= 2 || tRes.size() == 1 && maxIdx <= uIdx)
|
if (wordLen >= 2 || tRes.size() == 1 && maxIdx <= uIdx)
|
||||||
{
|
{
|
||||||
if (TransCode::encode(itr->second->word, tmp))
|
res.push_back(itr->second->word);
|
||||||
res.push_back(tmp);
|
|
||||||
else
|
|
||||||
LogError("encode failed.");
|
|
||||||
tmp.clear();
|
|
||||||
}
|
}
|
||||||
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
|
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
|
||||||
}
|
}
|
||||||
tRes.clear();
|
tRes.clear();
|
||||||
}
|
}
|
||||||
else // not found word start from itr
|
else // not found word start from uItr
|
||||||
{
|
{
|
||||||
if (maxIdx <= uIdx) // never exist in prev results
|
if (maxIdx <= uIdx) // never exist in prev results
|
||||||
{
|
{
|
||||||
//put itr itself in res
|
//put uItr itself in res
|
||||||
Unicode uTmp(1, *uItr);
|
res.push_back(Unicode(1, *uItr));
|
||||||
if (TransCode::encode(uTmp, tmp))
|
|
||||||
{
|
//mark it exists
|
||||||
res.push_back(tmp);
|
|
||||||
}
|
|
||||||
tmp.clear();
|
|
||||||
++maxIdx;
|
++maxIdx;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
++uIdx;
|
++uIdx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
|
||||||
|
{
|
||||||
|
#ifndef NO_CODING_LOG
|
||||||
|
if (!_getInitFlag())
|
||||||
|
{
|
||||||
|
LogError("not inited.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (begin > end)
|
||||||
|
{
|
||||||
|
LogError("begin > end");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
vector<Unicode> uRes;
|
||||||
|
if (!cut(begin, end, uRes))
|
||||||
|
{
|
||||||
|
LogError("get unicode cut result error.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
string tmp;
|
||||||
|
|
||||||
|
for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++)
|
||||||
|
{
|
||||||
|
if (TransCode::encode(*uItr, tmp))
|
||||||
|
{
|
||||||
|
res.push_back(tmp);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
LogError("encode failed.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -1,10 +1,13 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <cstdio>
|
||||||
#include "Limonp/ArgvContext.hpp"
|
#include "Limonp/ArgvContext.hpp"
|
||||||
#include "MPSegment.hpp"
|
#include "MPSegment.hpp"
|
||||||
#include "HMMSegment.hpp"
|
#include "HMMSegment.hpp"
|
||||||
#include "MixSegment.hpp"
|
#include "MixSegment.hpp"
|
||||||
#include "FullSegment.hpp"
|
#include "FullSegment.hpp"
|
||||||
|
#include "QuerySegment.hpp"
|
||||||
|
|
||||||
using namespace CppJieba;
|
using namespace CppJieba;
|
||||||
|
|
||||||
@ -37,13 +40,15 @@ int main(int argc, char ** argv)
|
|||||||
{
|
{
|
||||||
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
|
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
|
||||||
<<"options:\n"
|
<<"options:\n"
|
||||||
<<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutFull, cutMix] for now. \n\t\t\tIf not specified, the default is cutMix\n"
|
<<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutFull, cutQuery, cutMix] for now. \n\t\t\tIf not specified, the default is cutMix\n"
|
||||||
<<"\t--dictpath\tsee example\n"
|
<<"\t--dictpath\tsee example\n"
|
||||||
<<"\t--modelpath\tsee example\n"
|
<<"\t--modelpath\tsee example\n"
|
||||||
|
<<"\t--maxlen\tspecify the granularity of cut used in cutQuery, If not specified, the default is 3\n"
|
||||||
<<"example:\n"
|
<<"example:\n"
|
||||||
<<"\t"<<argv[0]<<" ../test/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --algorithm cutDAG\n"
|
<<"\t"<<argv[0]<<" ../test/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --algorithm cutDAG\n"
|
||||||
<<"\t"<<argv[0]<<" ../test/testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
|
<<"\t"<<argv[0]<<" ../test/testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
|
||||||
<<"\t"<<argv[0]<<" ../test/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
|
<<"\t"<<argv[0]<<" ../test/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
|
||||||
|
<<"\t"<<argv[0]<<" ../test/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutQuery --maxlen 3\n"
|
||||||
<<endl;
|
<<endl;
|
||||||
|
|
||||||
return EXIT_FAILURE;
|
return EXIT_FAILURE;
|
||||||
@ -52,6 +57,7 @@ int main(int argc, char ** argv)
|
|||||||
string dictPath = arg["--dictpath"];
|
string dictPath = arg["--dictpath"];
|
||||||
string modelPath = arg["--modelpath"];
|
string modelPath = arg["--modelpath"];
|
||||||
string algorithm = arg["--algorithm"];
|
string algorithm = arg["--algorithm"];
|
||||||
|
// Granularity for cutQuery: use the user-supplied --maxlen, falling back to the
// documented default of 3 when the option is absent. The operands used to be
// swapped, so an absent option fed "" to atoi (yielding 0) and a supplied
// value was ignored in favour of "3".
int maxLen = atoi(arg["--maxlen"] == "" ? "3" : arg["--maxlen"].c_str());
|
||||||
|
|
||||||
if("cutHMM" == algorithm)
|
if("cutHMM" == algorithm)
|
||||||
{
|
{
|
||||||
@ -86,6 +92,17 @@ int main(int argc, char ** argv)
|
|||||||
cut(&seg, arg[1].c_str());
|
cut(&seg, arg[1].c_str());
|
||||||
seg.dispose();
|
seg.dispose();
|
||||||
}
|
}
|
||||||
|
else if ("cutQuery" == algorithm)
|
||||||
|
{
|
||||||
|
QuerySegment seg(dictPath.c_str(), modelPath.c_str(), maxLen);
|
||||||
|
if (!seg.init())
|
||||||
|
{
|
||||||
|
cout << "seg init failed" << endl;
|
||||||
|
return EXIT_FAILURE; // `return false` converts to 0 in main, i.e. it reported success after a failed init
|
||||||
|
}
|
||||||
|
cut(&seg, arg[1].c_str());
|
||||||
|
seg.dispose();
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
MixSegment seg(dictPath.c_str(), modelPath.c_str());
|
MixSegment seg(dictPath.c_str(), modelPath.c_str());
|
||||||
|
Loading…
x
Reference in New Issue
Block a user