add QuerySegment

This commit is contained in:
aholic 2013-11-27 20:46:45 +08:00
parent 7106a4475f
commit a25007f032
2 changed files with 84 additions and 21 deletions

View File

@ -1,5 +1,5 @@
#ifndef CPPJIEBA_FULLSEGMENT_H #ifndef CPPJIEBA_FULLSEGMENT_H
#define CPPJIEBAi_FULLSEGMENT_H #define CPPJIEBA_FULLSEGMENT_H
#include <algorithm> #include <algorithm>
#include <set> #include <set>
@ -23,11 +23,13 @@ namespace CppJieba
public: public:
bool init() bool init()
{ {
#ifndef NO_CODING_LOG
if(_getInitFlag()) if(_getInitFlag())
{ {
LogError("already inited before now."); LogError("already inited before now.");
return false; return false;
} }
#endif
if(!_trie.init()) if(!_trie.init())
{ {
LogError("_trie.init failed."); LogError("_trie.init failed.");
@ -44,22 +46,35 @@ namespace CppJieba
} }
bool dispose() bool dispose()
{ {
#ifndef NO_CODING_LOG
if(!_getInitFlag()) if(!_getInitFlag())
{ {
return true; return true;
} }
#endif
_trie.dispose(); _trie.dispose();
_setInitFlag(false); _setInitFlag(false);
return true; return true;
} }
bool cut(const string& str, vector<string>& res) const public:
{ using SegmentBase::cut;
return SegmentBase::cut(str, res);
}
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const public:
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{ {
#ifndef NO_CODING_LOG
if (!_getInitFlag())
{
LogError("not inited.");
return false;
}
if (begin > end)
{
LogError("begin > end");
return false;
}
#endif
//result of searching in trie tree //result of searching in trie tree
vector<pair<uint, const TrieNodeInfo*> > tRes; vector<pair<uint, const TrieNodeInfo*> > tRes;
@ -71,10 +86,9 @@ namespace CppJieba
//tmp variables //tmp variables
int wordLen = 0; int wordLen = 0;
string tmp;
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
{ {
//find word start from itr //find word start from uItr
if (_trie.find(uItr, end, tRes)) if (_trie.find(uItr, end, tRes))
{ {
for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
@ -82,32 +96,64 @@ namespace CppJieba
wordLen = itr->second->word.size(); wordLen = itr->second->word.size();
if (wordLen >= 2 || tRes.size() == 1 && maxIdx <= uIdx) if (wordLen >= 2 || tRes.size() == 1 && maxIdx <= uIdx)
{ {
if (TransCode::encode(itr->second->word, tmp)) res.push_back(itr->second->word);
res.push_back(tmp);
else
LogError("encode failed.");
tmp.clear();
} }
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
} }
tRes.clear(); tRes.clear();
} }
else // not found word start from itr else // not found word start from uItr
{ {
if (maxIdx <= uIdx) // never exist in prev results if (maxIdx <= uIdx) // never exist in prev results
{ {
//put itr itself in res //put itr itself in res
Unicode uTmp(1, *uItr); res.push_back(Unicode(1, *uItr));
if (TransCode::encode(uTmp, tmp))
{ //mark it exits
res.push_back(tmp);
}
tmp.clear();
++maxIdx; ++maxIdx;
} }
} }
++uIdx; ++uIdx;
} }
return true;
}
// Cuts the unicode range [begin, end) into words and appends every word,
// converted with TransCode::encode, to res.
//
// The segmentation itself is delegated to the vector<Unicode> overload of
// cut(); this overload only converts the resulting words to strings.
//
// begin, end : the unicode range to cut.
// res        : output vector; words are appended, existing content is kept.
//
// Returns false when the vector<Unicode> overload returns false and, unless
// NO_CODING_LOG is defined, when the segmenter is not inited or begin > end.
// Returns true otherwise. A word that fails to encode is logged and skipped;
// it does not make the whole call fail.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
{
#ifndef NO_CODING_LOG
    if (!_getInitFlag())
    {
        LogError("not inited.");
        return false;
    }
    if (begin > end)
    {
        LogError("begin > end");
        return false;
    }
#endif
    vector<Unicode> uRes;
    if (!cut(begin, end, uRes))
    {
        LogError("get unicode cut result error.");
        return false;
    }
    for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); ++uItr)
    {
        // Use a fresh buffer for every word. Whether TransCode::encode
        // overwrites or appends to its output is not visible from here
        // (TODO confirm); a per-word buffer is correct in both cases and
        // guarantees no bytes of a previous word leak into the next one.
        string tmp;
        if (TransCode::encode(*uItr, tmp))
        {
            res.push_back(tmp);
        }
        else
        {
            LogError("encode failed.");
        }
    }
    return true;
}
}; };
} }

View File

@ -1,10 +1,13 @@
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
#include <cstdlib>
#include <cstdio>
#include "Limonp/ArgvContext.hpp" #include "Limonp/ArgvContext.hpp"
#include "MPSegment.hpp" #include "MPSegment.hpp"
#include "HMMSegment.hpp" #include "HMMSegment.hpp"
#include "MixSegment.hpp" #include "MixSegment.hpp"
#include "FullSegment.hpp" #include "FullSegment.hpp"
#include "QuerySegment.hpp"
using namespace CppJieba; using namespace CppJieba;
@ -37,13 +40,15 @@ int main(int argc, char ** argv)
{ {
cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n" cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n"
<<"options:\n" <<"options:\n"
<<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutFull, cutMix] for now. \n\t\t\tIf not specified, the default is cutMix\n" <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutFull, cutQuery, cutMix] for now. \n\t\t\tIf not specified, the default is cutMix\n"
<<"\t--dictpath\tsee example\n" <<"\t--dictpath\tsee example\n"
<<"\t--modelpath\tsee example\n" <<"\t--modelpath\tsee example\n"
<<"\t--maxlen\tspecify the granularity of cut used in cutQuery, If not specified, the default is 3\n"
<<"example:\n" <<"example:\n"
<<"\t"<<argv[0]<<" ../test/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --algorithm cutDAG\n" <<"\t"<<argv[0]<<" ../test/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --algorithm cutDAG\n"
<<"\t"<<argv[0]<<" ../test/testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n" <<"\t"<<argv[0]<<" ../test/testlines.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutHMM\n"
<<"\t"<<argv[0]<<" ../test/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n" <<"\t"<<argv[0]<<" ../test/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutMix\n"
<<"\t"<<argv[0]<<" ../test/testlines.utf8 --dictpath ../dicts/jieba.dict.utf8 --modelpath ../dicts/hmm_model.utf8 --algorithm cutQuery --maxlen 3\n"
<<endl; <<endl;
return EXIT_FAILURE; return EXIT_FAILURE;
@ -52,6 +57,7 @@ int main(int argc, char ** argv)
string dictPath = arg["--dictpath"]; string dictPath = arg["--dictpath"];
string modelPath = arg["--modelpath"]; string modelPath = arg["--modelpath"];
string algorithm = arg["--algorithm"]; string algorithm = arg["--algorithm"];
// Use the documented default of 3 when --maxlen is not given; otherwise parse
// the value supplied by the user.
int maxLen = atoi(arg["--maxlen"] == "" ? "3" : arg["--maxlen"].c_str());
if("cutHMM" == algorithm) if("cutHMM" == algorithm)
{ {
@ -86,6 +92,17 @@ int main(int argc, char ** argv)
cut(&seg, arg[1].c_str()); cut(&seg, arg[1].c_str());
seg.dispose(); seg.dispose();
} }
else if ("cutQuery" == algorithm)
{
    // Query-oriented segmentation; maxLen is the granularity given by --maxlen.
    QuerySegment seg(dictPath.c_str(), modelPath.c_str(), maxLen);
    if (!seg.init())
    {
        cout << "seg init failed" << endl;
        // main returns int: `false` converts to 0, which would report
        // success to the caller even though initialisation failed.
        return EXIT_FAILURE;
    }
    cut(&seg, arg[1].c_str());
    seg.dispose();
}
else else
{ {
MixSegment seg(dictPath.c_str(), modelPath.c_str()); MixSegment seg(dictPath.c_str(), modelPath.c_str());