mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
110 lines
3.2 KiB
C++
110 lines
3.2 KiB
C++
#ifndef CPPJIEBA_POS_TAGGING_H
|
|
#define CPPJIEBA_POS_TAGGING_H
|
|
|
|
#include "MixSegment.hpp"
|
|
#include "Limonp/StringUtil.hpp"
|
|
#include "DictTrie.hpp"
|
|
|
|
namespace CppJieba
|
|
{
|
|
using namespace Limonp;
|
|
|
|
// Fallback part-of-speech tag names, returned by PosTagger::_specialRule()
// for words that have no usable tag in the dictionary.
static char const* const POS_M   = "m";   // counted ASCII characters are all digits
static char const* const POS_ENG = "eng"; // counted ASCII characters include a non-digit
static char const* const POS_X   = "x";   // no ASCII character was counted
|
|
|
|
class PosTagger
|
|
{
|
|
private:
|
|
MixSegment _segment;
|
|
const DictTrie * _dictTrie;
|
|
|
|
public:
|
|
PosTagger()
|
|
{}
|
|
PosTagger(
|
|
const string& dictPath,
|
|
const string& hmmFilePath,
|
|
const string& userDictPath = ""
|
|
)
|
|
{
|
|
init(dictPath, hmmFilePath, userDictPath);
|
|
};
|
|
~PosTagger(){};
|
|
public:
|
|
void init(
|
|
const string& dictPath,
|
|
const string& hmmFilePath,
|
|
const string& userDictPath = ""
|
|
)
|
|
{
|
|
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));
|
|
_dictTrie = _segment.getDictTrie();
|
|
LIMONP_CHECK(_dictTrie);
|
|
};
|
|
|
|
|
|
bool tag(const string& src, vector<pair<string, string> >& res) const
|
|
{
|
|
vector<string> cutRes;
|
|
if (!_segment.cut(src, cutRes))
|
|
{
|
|
LogError("_mixSegment cut failed");
|
|
return false;
|
|
}
|
|
|
|
const DictUnit *tmp = NULL;
|
|
Unicode unico;
|
|
for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
|
|
{
|
|
if (!TransCode::decode(*itr, unico))
|
|
{
|
|
LogError("decode failed.");
|
|
return false;
|
|
}
|
|
tmp = _dictTrie->find(unico.begin(), unico.end());
|
|
if(tmp == NULL || tmp->tag.empty())
|
|
{
|
|
res.push_back(make_pair(*itr, _specialRule(unico)));
|
|
}
|
|
else
|
|
{
|
|
res.push_back(make_pair(*itr, tmp->tag));
|
|
}
|
|
}
|
|
return !res.empty();
|
|
}
|
|
private:
|
|
const char* _specialRule(const Unicode& unicode) const
|
|
{
|
|
size_t m = 0;
|
|
size_t eng = 0;
|
|
for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++)
|
|
{
|
|
if(unicode[i] < 0x80)
|
|
{
|
|
eng ++;
|
|
if('0' <= unicode[i] && unicode[i] <= '9')
|
|
{
|
|
m++;
|
|
}
|
|
}
|
|
}
|
|
// ascii char is not found
|
|
if(eng == 0)
|
|
{
|
|
return POS_X;
|
|
}
|
|
// all the ascii is number char
|
|
if(m == eng)
|
|
{
|
|
return POS_M;
|
|
}
|
|
// the ascii chars contain english letter
|
|
return POS_ENG;
|
|
}
|
|
};
|
|
}
|
|
|
|
#endif
|