Mirror of https://github.com/yanyiwu/cppjieba.git, synced 2025-07-18 00:00:12 +08:00

code style

commit 2b18a582fc (parent bb32234654)
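The diff below is a pure naming-convention refactor of the CppJieba headers: private data members and private helper methods are renamed from a leading-underscore style (_trie, _loadDict) to a trailing-underscore style (trie_, loadDict_); no behavior changes. A minimal sketch of the pattern follows (this class is illustrative only, not part of the commit):

    // Hypothetical example of the convention applied throughout this commit:
    // private members and private helpers carry a trailing underscore.
    class Counter {
     public:
      Counter() : count_(0) {}             // was: _count(0)
      int value() const { return count_; }
     private:
      void reset_() { count_ = 0; }        // was: void _reset()
      int count_;                          // was: int _count;
    };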
@@ -26,60 +26,60 @@ class DictTrie {
  public:

   DictTrie() {
-    _trie = NULL;
-    _minWeight = MAX_DOUBLE;
+    trie_ = NULL;
+    minWeight_ = MAX_DOUBLE;
   }
   DictTrie(const string& dictPath, const string& userDictPath = "") {
     new (this) DictTrie();
     init(dictPath, userDictPath);
   }
   ~DictTrie() {
-    if(_trie) {
-      delete _trie;
+    if(trie_) {
+      delete trie_;
     }
   }

   bool init(const string& dictPath, const string& userDictPath = "") {
-    if(_trie != NULL) {
+    if(trie_ != NULL) {
       LogFatal("trie already initted");
     }
-    _loadDict(dictPath);
-    _calculateWeight(_nodeInfos);
-    _minWeight = _findMinWeight(_nodeInfos);
+    loadDict_(dictPath);
+    calculateWeight_(nodeInfos_);
+    minWeight_ = findMinWeight_(nodeInfos_);

     if(userDictPath.size()) {
-      double maxWeight = _findMaxWeight(_nodeInfos);
-      _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
+      double maxWeight = findMaxWeight_(nodeInfos_);
+      loadUserDict_(userDictPath, maxWeight, UNKNOWN_TAG);
     }
-    _shrink(_nodeInfos);
-    _trie = _createTrie(_nodeInfos);
-    assert(_trie);
+    shrink_(nodeInfos_);
+    trie_ = createTrie_(nodeInfos_);
+    assert(trie_);
     return true;
   }

   const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
-    return _trie->find(begin, end);
+    return trie_->find(begin, end);
   }
   bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const {
-    return _trie->find(begin, end, dag, offset);
+    return trie_->find(begin, end, dag, offset);
   }
   void find(
     Unicode::const_iterator begin,
     Unicode::const_iterator end,
     vector<SegmentChar>& res
   ) const {
-    _trie->find(begin, end, res);
+    trie_->find(begin, end, res);
   }
   bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
-    return isIn(_userDictSingleChineseWord, word);
+    return isIn(userDictSingleChineseWord_, word);
   }
   double getMinWeight() const {
-    return _minWeight;
+    return minWeight_;
   };


  private:
-  Trie * _createTrie(const vector<DictUnit>& dictUnits) {
+  Trie * createTrie_(const vector<DictUnit>& dictUnits) {
     assert(dictUnits.size());
     vector<Unicode> words;
     vector<const DictUnit*> valuePointers;
@@ -91,7 +91,7 @@ class DictTrie {
     Trie * trie = new Trie(words, valuePointers);
     return trie;
   }
-  void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) {
+  void loadUserDict_(const string& filePath, double defaultWeight, const string& defaultTag) {
     ifstream ifs(filePath.c_str());
     if(!ifs.is_open()) {
       LogFatal("file %s open failed.", filePath.c_str());
@@ -111,15 +111,15 @@ class DictTrie {
         continue;
       }
       if(nodeInfo.word.size() == 1) {
-        _userDictSingleChineseWord.insert(nodeInfo.word[0]);
+        userDictSingleChineseWord_.insert(nodeInfo.word[0]);
       }
       nodeInfo.weight = defaultWeight;
       nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag);
-      _nodeInfos.push_back(nodeInfo);
+      nodeInfos_.push_back(nodeInfo);
     }
     LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
   }
-  void _loadDict(const string& filePath) {
+  void loadDict_(const string& filePath) {
     ifstream ifs(filePath.c_str());
     if(!ifs.is_open()) {
       LogFatal("file %s open failed.", filePath.c_str());
@@ -141,17 +141,17 @@ class DictTrie {
       nodeInfo.weight = atof(buf[1].c_str());
       nodeInfo.tag = buf[2];

-      _nodeInfos.push_back(nodeInfo);
+      nodeInfos_.push_back(nodeInfo);
     }
   }
-  double _findMinWeight(const vector<DictUnit>& nodeInfos) const {
+  double findMinWeight_(const vector<DictUnit>& nodeInfos) const {
     double ret = MAX_DOUBLE;
     for(size_t i = 0; i < nodeInfos.size(); i++) {
       ret = min(nodeInfos[i].weight, ret);
     }
     return ret;
   }
-  double _findMaxWeight(const vector<DictUnit>& nodeInfos) const {
+  double findMaxWeight_(const vector<DictUnit>& nodeInfos) const {
     double ret = MIN_DOUBLE;
     for(size_t i = 0; i < nodeInfos.size(); i++) {
       ret = max(nodeInfos[i].weight, ret);
@@ -159,7 +159,7 @@ class DictTrie {
     return ret;
   }

-  void _calculateWeight(vector<DictUnit>& nodeInfos) const {
+  void calculateWeight_(vector<DictUnit>& nodeInfos) const {
     double sum = 0.0;
     for(size_t i = 0; i < nodeInfos.size(); i++) {
       sum += nodeInfos[i].weight;
@@ -172,16 +172,16 @@ class DictTrie {
     }
   }

-  void _shrink(vector<DictUnit>& units) const {
+  void shrink_(vector<DictUnit>& units) const {
     vector<DictUnit>(units.begin(), units.end()).swap(units);
   }

  private:
-  vector<DictUnit> _nodeInfos;
-  Trie * _trie;
+  vector<DictUnit> nodeInfos_;
+  Trie * trie_;

-  double _minWeight;
-  unordered_set<Unicode::value_type> _userDictSingleChineseWord;
+  double minWeight_;
+  unordered_set<Unicode::value_type> userDictSingleChineseWord_;
 };
 }

@@ -14,40 +14,40 @@ namespace CppJieba {
 class FullSegment: public SegmentBase {
  public:
   FullSegment() {
-    _dictTrie = NULL;
-    _isBorrowed = false;
+    dictTrie_ = NULL;
+    isBorrowed_ = false;
   }
   explicit FullSegment(const string& dictPath) {
-    _dictTrie = NULL;
+    dictTrie_ = NULL;
     init(dictPath);
   }
   explicit FullSegment(const DictTrie* dictTrie) {
-    _dictTrie = NULL;
+    dictTrie_ = NULL;
     init(dictTrie);
   }
   virtual ~FullSegment() {
-    if(_dictTrie && ! _isBorrowed) {
-      delete _dictTrie;
+    if(dictTrie_ && ! isBorrowed_) {
+      delete dictTrie_;
     }

   };
   bool init(const string& dictPath) {
-    assert(_dictTrie == NULL);
-    _dictTrie = new DictTrie(dictPath);
-    _isBorrowed = false;
+    assert(dictTrie_ == NULL);
+    dictTrie_ = new DictTrie(dictPath);
+    isBorrowed_ = false;
     return true;
   }
   bool init(const DictTrie* dictTrie) {
-    assert(_dictTrie == NULL);
+    assert(dictTrie_ == NULL);
     assert(dictTrie);
-    _dictTrie = dictTrie;
-    _isBorrowed = true;
+    dictTrie_ = dictTrie;
+    isBorrowed_ = true;
     return true;
   }

   using SegmentBase::cut;
   bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
-    assert(_dictTrie);
+    assert(dictTrie_);
     if (begin >= end) {
       LogError("begin >= end");
       return false;
@@ -66,7 +66,7 @@ class FullSegment: public SegmentBase {
     int wordLen = 0;
     for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
       //find word start from uItr
-      if (_dictTrie->find(uItr, end, tRes, 0)) {
+      if (dictTrie_->find(uItr, end, tRes, 0)) {
         for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
         //for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
         {
@@ -93,7 +93,7 @@ class FullSegment: public SegmentBase {
   }

   bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
-    assert(_dictTrie);
+    assert(dictTrie_);
     if (begin >= end) {
       LogError("begin >= end");
       return false;
@@ -117,8 +117,8 @@ class FullSegment: public SegmentBase {
     return true;
   }
  private:
-  const DictTrie* _dictTrie;
-  bool _isBorrowed;
+  const DictTrie* dictTrie_;
+  bool isBorrowed_;
 };
 }

@@ -31,17 +31,17 @@ class HMMSegment: public SegmentBase {
   virtual ~HMMSegment() {}
  public:
   bool init(const string& filePath) {
-    memset(_startProb, 0, sizeof(_startProb));
-    memset(_transProb, 0, sizeof(_transProb));
-    _statMap[0] = 'B';
-    _statMap[1] = 'E';
-    _statMap[2] = 'M';
-    _statMap[3] = 'S';
-    _emitProbVec.push_back(&_emitProbB);
-    _emitProbVec.push_back(&_emitProbE);
-    _emitProbVec.push_back(&_emitProbM);
-    _emitProbVec.push_back(&_emitProbS);
-    LIMONP_CHECK(_loadModel(filePath.c_str()));
+    memset(startProb_, 0, sizeof(startProb_));
+    memset(transProb_, 0, sizeof(transProb_));
+    statMap_[0] = 'B';
+    statMap_[1] = 'E';
+    statMap_[2] = 'M';
+    statMap_[3] = 'S';
+    emitProbVec_.push_back(&emitProbB_);
+    emitProbVec_.push_back(&emitProbE_);
+    emitProbVec_.push_back(&emitProbM_);
+    emitProbVec_.push_back(&emitProbS_);
+    LIMONP_CHECK(loadModel_(filePath.c_str()));
     LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
     return true;
   }
@@ -53,16 +53,16 @@ class HMMSegment: public SegmentBase {
     Unicode::const_iterator right = begin;
     while(right != end) {
       if(*right < 0x80) {
-        if(left != right && !_cut(left, right, res)) {
+        if(left != right && !cut_(left, right, res)) {
           return false;
         }
         left = right;
         do {
-          right = _sequentialLetterRule(left, end);
+          right = sequentialLetterRule_(left, end);
           if(right != left) {
             break;
           }
-          right = _numbersRule(left, end);
+          right = numbersRule_(left, end);
           if(right != left) {
             break;
           }
@@ -74,7 +74,7 @@ class HMMSegment: public SegmentBase {
         right++;
       }
     }
-    if(left != right && !_cut(left, right, res)) {
+    if(left != right && !cut_(left, right, res)) {
       return false;
     }
     return true;
@@ -100,7 +100,7 @@ class HMMSegment: public SegmentBase {
   }
  private:
   // sequential letters rule
-  Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
+  Unicode::const_iterator sequentialLetterRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
     Unicode::value_type x = *begin;
     if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
       begin ++;
@@ -118,7 +118,7 @@ class HMMSegment: public SegmentBase {
     return begin;
   }
   //
-  Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
+  Unicode::const_iterator numbersRule_(Unicode::const_iterator begin, Unicode::const_iterator end) const {
     Unicode::value_type x = *begin;
     if('0' <= x && x <= '9') {
       begin ++;
@@ -135,10 +135,10 @@ class HMMSegment: public SegmentBase {
     }
     return begin;
   }
-  bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
+  bool cut_(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
     vector<size_t> status;
-    if(!_viterbi(begin, end, status)) {
-      LogError("_viterbi failed.");
+    if(!viterbi_(begin, end, status)) {
+      LogError("viterbi_ failed.");
       return false;
     }

@@ -154,7 +154,7 @@ class HMMSegment: public SegmentBase {
     return true;
   }

-  bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
+  bool viterbi_(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
     if(begin == end) {
       return false;
     }
@@ -171,7 +171,7 @@ class HMMSegment: public SegmentBase {

     //start
     for(size_t y = 0; y < Y; y++) {
-      weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
+      weight[0 + y * X] = startProb_[y] + getEmitProb_(emitProbVec_[y], *begin, MIN_DOUBLE);
       path[0 + y * X] = -1;
     }

@@ -183,10 +183,10 @@ class HMMSegment: public SegmentBase {
         now = x + y*X;
         weight[now] = MIN_DOUBLE;
         path[now] = E; // warning
-        emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
+        emitProb = getEmitProb_(emitProbVec_[y], *(begin+x), MIN_DOUBLE);
         for(size_t preY = 0; preY < Y; preY++) {
           old = x - 1 + preY * X;
-          tmp = weight[old] + _transProb[preY][y] + emitProb;
+          tmp = weight[old] + transProb_[preY][y] + emitProb;
           if(tmp > weight[now]) {
             weight[now] = tmp;
             path[now] = preY;
@@ -212,13 +212,13 @@ class HMMSegment: public SegmentBase {

     return true;
   }
-  bool _loadModel(const char* const filePath) {
+  bool loadModel_(const char* const filePath) {
     ifstream ifile(filePath);
     string line;
     vector<string> tmp;
     vector<string> tmp2;
-    //load _startProb
-    if(!_getLine(ifile, line)) {
+    //load startProb_
+    if(!getLine_(ifile, line)) {
       return false;
     }
     split(line, tmp, " ");
@@ -227,12 +227,12 @@ class HMMSegment: public SegmentBase {
       return false;
     }
     for(size_t j = 0; j< tmp.size(); j++) {
-      _startProb[j] = atof(tmp[j].c_str());
+      startProb_[j] = atof(tmp[j].c_str());
     }

-    //load _transProb
+    //load transProb_
     for(size_t i = 0; i < STATUS_SUM; i++) {
-      if(!_getLine(ifile, line)) {
+      if(!getLine_(ifile, line)) {
         return false;
       }
       split(line, tmp, " ");
@@ -241,33 +241,33 @@ class HMMSegment: public SegmentBase {
         return false;
       }
       for(size_t j =0; j < STATUS_SUM; j++) {
-        _transProb[i][j] = atof(tmp[j].c_str());
+        transProb_[i][j] = atof(tmp[j].c_str());
       }
     }

-    //load _emitProbB
-    if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) {
+    //load emitProbB_
+    if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbB_)) {
       return false;
     }

-    //load _emitProbE
-    if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) {
+    //load emitProbE_
+    if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbE_)) {
       return false;
     }

-    //load _emitProbM
-    if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) {
+    //load emitProbM_
+    if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbM_)) {
       return false;
     }

-    //load _emitProbS
-    if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) {
+    //load emitProbS_
+    if(!getLine_(ifile, line) || !loadEmitProb_(line, emitProbS_)) {
       return false;
     }

     return true;
   }
-  bool _getLine(ifstream& ifile, string& line) {
+  bool getLine_(ifstream& ifile, string& line) {
     while(getline(ifile, line)) {
       trim(line);
       if(line.empty()) {
@@ -280,7 +280,7 @@ class HMMSegment: public SegmentBase {
       }
     return false;
   }
-  bool _loadEmitProb(const string& line, EmitProbMap& mp) {
+  bool loadEmitProb_(const string& line, EmitProbMap& mp) {
     if(line.empty()) {
       return false;
     }
@@ -290,7 +290,7 @@ class HMMSegment: public SegmentBase {
     for(size_t i = 0; i < tmp.size(); i++) {
       split(tmp[i], tmp2, ":");
       if(2 != tmp2.size()) {
-        LogError("_emitProb illegal.");
+        LogError("emitProb_ illegal.");
         return false;
       }
       if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) {
@@ -301,7 +301,7 @@ class HMMSegment: public SegmentBase {
     }
     return true;
   }
-  double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
+  double getEmitProb_(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
     EmitProbMap::const_iterator cit = ptMp->find(key);
     if(cit == ptMp->end()) {
       return defVal;
@@ -311,14 +311,14 @@ class HMMSegment: public SegmentBase {
   }

  private:
-  char _statMap[STATUS_SUM];
-  double _startProb[STATUS_SUM];
-  double _transProb[STATUS_SUM][STATUS_SUM];
-  EmitProbMap _emitProbB;
-  EmitProbMap _emitProbE;
-  EmitProbMap _emitProbM;
-  EmitProbMap _emitProbS;
-  vector<EmitProbMap* > _emitProbVec;
+  char statMap_[STATUS_SUM];
+  double startProb_[STATUS_SUM];
+  double transProb_[STATUS_SUM][STATUS_SUM];
+  EmitProbMap emitProbB_;
+  EmitProbMap emitProbE_;
+  EmitProbMap emitProbM_;
+  EmitProbMap emitProbS_;
+  vector<EmitProbMap* > emitProbVec_;

 };
 }
@@ -18,9 +18,9 @@ class KeywordExtractor {
   ~KeywordExtractor() {};

   void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
-    _loadIdfDict(idfPath);
-    _loadStopWordDict(stopWordPath);
-    LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
+    loadIdfDict_(idfPath);
+    loadStopWordDict_(stopWordPath);
+    LIMONP_CHECK(segment_.init(dictPath, hmmFilePath, userDict));
   };

   bool extract(const string& str, vector<string>& keywords, size_t topN) const {
@@ -36,30 +36,30 @@ class KeywordExtractor {

   bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
     vector<string> words;
-    if(!_segment.cut(str, words)) {
+    if(!segment_.cut(str, words)) {
       LogError("segment cut(%s) failed.", str.c_str());
       return false;
     }

     map<string, double> wordmap;
     for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
-      if(_isSingleWord(*iter)) {
+      if(isSingleWord_(*iter)) {
         continue;
       }
       wordmap[*iter] += 1.0;
     }

     for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
-      if(_stopWords.end() != _stopWords.find(itr->first)) {
+      if(stopWords_.end() != stopWords_.find(itr->first)) {
         wordmap.erase(itr++);
         continue;
       }

-      unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
-      if(cit != _idfMap.end()) {
+      unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
+      if(cit != idfMap_.end()) {
         itr->second *= cit->second;
       } else {
-        itr->second *= _idfAverage;
+        itr->second *= idfAverage_;
       }
       itr ++;
     }
@@ -67,12 +67,12 @@ class KeywordExtractor {
     keywords.clear();
     std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
     topN = min(topN, keywords.size());
-    partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
+    partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), cmp_);
     keywords.resize(topN);
     return true;
   }
  private:
-  void _loadIdfDict(const string& idfPath) {
+  void loadIdfDict_(const string& idfPath) {
     ifstream ifs(idfPath.c_str());
     if(!ifs.is_open()) {
       LogFatal("open %s failed.", idfPath.c_str());
@@ -93,28 +93,28 @@ class KeywordExtractor {
         continue;
       }
       idf = atof(buf[1].c_str());
-      _idfMap[buf[0]] = idf;
+      idfMap_[buf[0]] = idf;
       idfSum += idf;

     }

     assert(lineno);
-    _idfAverage = idfSum / lineno;
-    assert(_idfAverage > 0.0);
+    idfAverage_ = idfSum / lineno;
+    assert(idfAverage_ > 0.0);
   }
-  void _loadStopWordDict(const string& filePath) {
+  void loadStopWordDict_(const string& filePath) {
     ifstream ifs(filePath.c_str());
     if(!ifs.is_open()) {
       LogFatal("open %s failed.", filePath.c_str());
     }
     string line ;
     while(getline(ifs, line)) {
-      _stopWords.insert(line);
+      stopWords_.insert(line);
     }
-    assert(_stopWords.size());
+    assert(stopWords_.size());
   }

-  bool _isSingleWord(const string& str) const {
+  bool isSingleWord_(const string& str) const {
     Unicode unicode;
     TransCode::decode(str, unicode);
     if(unicode.size() == 1)
@@ -122,16 +122,16 @@ class KeywordExtractor {
     return false;
   }

-  static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) {
+  static bool cmp_(const pair<string, double>& lhs, const pair<string, double>& rhs) {
     return lhs.second > rhs.second;
   }

  private:
-  MixSegment _segment;
-  unordered_map<string, double> _idfMap;
-  double _idfAverage;
+  MixSegment segment_;
+  unordered_map<string, double> idfMap_;
+  double idfAverage_;

-  unordered_set<string> _stopWords;
+  unordered_set<string> stopWords_;
 };
 }

@@ -21,12 +21,12 @@ class MPSegment: public SegmentBase {
   virtual ~MPSegment() {};

   bool init(const string& dictPath, const string& userDictPath = "") {
-    LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
+    LIMONP_CHECK(dictTrie_.init(dictPath, userDictPath));
     LogInfo("MPSegment init(%s) ok", dictPath.c_str());
     return true;
   }
   bool isUserDictSingleChineseWord(const Unicode::value_type & value) const {
-    return _dictTrie.isUserDictSingleChineseWord(value);
+    return dictTrie_.isUserDictSingleChineseWord(value);
   }

   using SegmentBase::cut;
@@ -57,20 +57,20 @@ class MPSegment: public SegmentBase {
     }
     vector<SegmentChar> segmentChars;

-    _dictTrie.find(begin, end, segmentChars);
+    dictTrie_.find(begin, end, segmentChars);

-    _calcDP(segmentChars);
+    calcDP_(segmentChars);

-    _cut(segmentChars, res);
+    cut_(segmentChars, res);

     return true;
   }
   const DictTrie* getDictTrie() const {
-    return &_dictTrie;
+    return &dictTrie_;
   }

  private:
-  void _calcDP(vector<SegmentChar>& segmentChars) const {
+  void calcDP_(vector<SegmentChar>& segmentChars) const {
     size_t nextPos;
     const DictUnit* p;
     double val;
@@ -90,7 +90,7 @@ class MPSegment: public SegmentBase {
       if(p) {
         val += p->weight;
       } else {
-        val += _dictTrie.getMinWeight();
+        val += dictTrie_.getMinWeight();
       }
       if(val > segmentChars[i].weight) {
         segmentChars[i].pInfo = p;
@@ -99,7 +99,7 @@ class MPSegment: public SegmentBase {
       }
     }
   }
-  void _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const {
+  void cut_(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const {
     size_t i = 0;
     while(i < segmentChars.size()) {
       const DictUnit* p = segmentChars[i].pInfo;
@@ -114,7 +114,7 @@ class MPSegment: public SegmentBase {
   }

  private:
-  DictTrie _dictTrie;
+  DictTrie dictTrie_;

 };
 }
@@ -17,8 +17,8 @@ class MixSegment: public SegmentBase {
   virtual ~MixSegment() {
   }
   bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") {
-    LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
-    LIMONP_CHECK(_hmmSeg.init(hmmSegDict));
+    LIMONP_CHECK(mpSeg_.init(mpSegDict, userDict));
+    LIMONP_CHECK(hmmSeg_.init(hmmSegDict));
     LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
     return true;
   }
@@ -26,7 +26,7 @@ class MixSegment: public SegmentBase {
   virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
     vector<Unicode> words;
     words.reserve(end - begin);
-    if(!_mpSeg.cut(begin, end, words)) {
+    if(!mpSeg_.cut(begin, end, words)) {
       LogError("mpSeg cutDAG failed.");
       return false;
     }
@@ -37,21 +37,21 @@ class MixSegment: public SegmentBase {
     piece.reserve(end - begin);
     for (size_t i = 0, j = 0; i < words.size(); i++) {
       //if mp get a word, it's ok, put it into result
-      if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) {
+      if (1 != words[i].size() || (words[i].size() == 1 && mpSeg_.isUserDictSingleChineseWord(words[i][0]))) {
         res.push_back(words[i]);
         continue;
       }

       // if mp get a single one and it is not in userdict, collect it in sequence
       j = i;
-      while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) {
+      while (j < words.size() && 1 == words[j].size() && !mpSeg_.isUserDictSingleChineseWord(words[j][0])) {
         piece.push_back(words[j][0]);
         j++;
       }

       // cut the sequence with hmm
-      if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) {
-        LogError("_hmmSeg cut failed.");
+      if (!hmmSeg_.cut(piece.begin(), piece.end(), hmmRes)) {
+        LogError("hmmSeg_ cut failed.");
         return false;
       }

@@ -92,11 +92,11 @@ class MixSegment: public SegmentBase {
   }

   const DictTrie* getDictTrie() const {
-    return _mpSeg.getDictTrie();
+    return mpSeg_.getDictTrie();
   }
  private:
-  MPSegment _mpSeg;
-  HMMSegment _hmmSeg;
+  MPSegment mpSeg_;
+  HMMSegment hmmSeg_;
 };
 }

@@ -30,16 +30,16 @@ class PosTagger {
     const string& hmmFilePath,
     const string& userDictPath = ""
   ) {
-    LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));
-    _dictTrie = _segment.getDictTrie();
-    LIMONP_CHECK(_dictTrie);
+    LIMONP_CHECK(segment_.init(dictPath, hmmFilePath, userDictPath));
+    dictTrie_ = segment_.getDictTrie();
+    LIMONP_CHECK(dictTrie_);
   };


   bool tag(const string& src, vector<pair<string, string> >& res) const {
     vector<string> cutRes;
-    if (!_segment.cut(src, cutRes)) {
-      LogError("_mixSegment cut failed");
+    if (!segment_.cut(src, cutRes)) {
+      LogError("mixSegment_ cut failed");
       return false;
     }

@@ -50,9 +50,9 @@ class PosTagger {
       LogError("decode failed.");
       return false;
     }
-    tmp = _dictTrie->find(unico.begin(), unico.end());
+    tmp = dictTrie_->find(unico.begin(), unico.end());
     if(tmp == NULL || tmp->tag.empty()) {
-      res.push_back(make_pair(*itr, _specialRule(unico)));
+      res.push_back(make_pair(*itr, specialRule_(unico)));
     } else {
       res.push_back(make_pair(*itr, tmp->tag));
     }
@@ -60,7 +60,7 @@ class PosTagger {
     return !res.empty();
   }
  private:
-  const char* _specialRule(const Unicode& unicode) const {
+  const char* specialRule_(const Unicode& unicode) const {
     size_t m = 0;
     size_t eng = 0;
     for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
@@ -83,8 +83,8 @@ class PosTagger {
     return POS_ENG;
   }
  private:
-  MixSegment _segment;
-  const DictTrie * _dictTrie;
+  MixSegment segment_;
+  const DictTrie * dictTrie_;
 };
 }

@@ -22,10 +22,10 @@ class QuerySegment: public SegmentBase {
   };
   virtual ~QuerySegment() {};
   bool init(const string& dict, const string& model, size_t maxWordLen, const string& userDict = "") {
-    LIMONP_CHECK(_mixSeg.init(dict, model, userDict));
-    LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie()));
+    LIMONP_CHECK(mixSeg_.init(dict, model, userDict));
+    LIMONP_CHECK(fullSeg_.init(mixSeg_.getDictTrie()));
     assert(maxWordLen);
-    _maxWordLen = maxWordLen;
+    maxWordLen_ = maxWordLen;
     return true;
   }
   using SegmentBase::cut;
@@ -37,17 +37,17 @@ class QuerySegment: public SegmentBase {

     //use mix cut first
     vector<Unicode> mixRes;
-    if (!_mixSeg.cut(begin, end, mixRes)) {
-      LogError("_mixSeg cut failed.");
+    if (!mixSeg_.cut(begin, end, mixRes)) {
+      LogError("mixSeg_ cut failed.");
       return false;
     }

     vector<Unicode> fullRes;
     for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {

-      // if it's too long, cut with _fullSeg, put fullRes in res
-      if (mixResItr->size() > _maxWordLen) {
-        if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes)) {
+      // if it's too long, cut with fullSeg_, put fullRes in res
+      if (mixResItr->size() > maxWordLen_) {
+        if (fullSeg_.cut(mixResItr->begin(), mixResItr->end(), fullRes)) {
           for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) {
             res.push_back(*fullResItr);
           }
@@ -88,9 +88,9 @@ class QuerySegment: public SegmentBase {
     return true;
   }
  private:
-  MixSegment _mixSeg;
-  FullSegment _fullSeg;
-  size_t _maxWordLen;
+  MixSegment mixSeg_;
+  FullSegment fullSeg_;
+  size_t maxWordLen_;

 };
 }
@@ -22,7 +22,7 @@ const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u};
 class SegmentBase: public ISegment, public NonCopyable {
  public:
   SegmentBase() {
-    _loadSpecialSymbols();
+    loadSpecialSymbols_();
   };
   virtual ~SegmentBase() {};
  public:
@@ -39,7 +39,7 @@ class SegmentBase: public ISegment, public NonCopyable {
     Unicode::const_iterator right;

     for(right = unicode.begin(); right != unicode.end(); right++) {
-      if(isIn(_specialSymbols, *right)) {
+      if(isIn(specialSymbols_, *right)) {
         if(left != right) {
           cut(left, right, res);
         }
@@ -55,15 +55,15 @@ class SegmentBase: public ISegment, public NonCopyable {
     return true;
   }
  private:
-  void _loadSpecialSymbols() {
+  void loadSpecialSymbols_() {
     size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
     for(size_t i = 0; i < size; i ++) {
-      _specialSymbols.insert(SPECIAL_SYMBOL[i]);
+      specialSymbols_.insert(SPECIAL_SYMBOL[i]);
     }
-    assert(_specialSymbols.size());
+    assert(specialSymbols_.size());
   }
  private:
-  unordered_set<UnicodeValueType> _specialSymbols;
+  unordered_set<UnicodeValueType> specialSymbols_;

 };
 }
src/Trie.hpp (50 lines changed)
@@ -61,19 +61,19 @@ class TrieNode {
 class Trie {
  public:
   Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
-    _root = new TrieNode;
-    _createTrie(keys, valuePointers);
-    _build();// build automation
+    root_ = new TrieNode;
+    createTrie_(keys, valuePointers);
+    build_();// build automation
   }
   ~Trie() {
-    if(_root) {
-      _deleteNode(_root);
+    if(root_) {
+      deleteNode_(root_);
     }
   }
  public:
   const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
     TrieNode::NextMap::const_iterator citer;
-    const TrieNode* ptNode = _root;
+    const TrieNode* ptNode = root_;
     for(Unicode::const_iterator it = begin; it != end; it++) {
       // build automation
       assert(ptNode);
@@ -91,7 +91,7 @@ class Trie {
     vector<struct SegmentChar>& res
   ) const {
     res.resize(end - begin);
-    const TrieNode * now = _root;
+    const TrieNode * now = root_;
     const TrieNode* node;
     // compiler will complain warnings if only "i < end - begin" .
     for (size_t i = 0; i < size_t(end - begin); i++) {
@@ -102,7 +102,7 @@ class Trie {
       bool flag = false;

       // rollback
-      while( now != _root ) {
+      while( now != root_ ) {
         node = now->findNext(ch);
         if (node != NULL) {
           flag = true;
@@ -116,11 +116,11 @@ class Trie {
         node = now->findNext(ch);
       }
       if(node == NULL) {
-        now = _root;
+        now = root_;
       } else {
         now = node;
         const TrieNode * temp = now;
-        while(temp != _root) {
+        while(temp != root_) {
           if (temp->ptValue) {
             size_t pos = i - temp->ptValue->word.size() + 1;
             res[pos].dag.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(i, temp->ptValue));
@@ -139,7 +139,7 @@ class Trie {
     Unicode::const_iterator end,
     DagType & res,
     size_t offset = 0) const {
-    const TrieNode * ptNode = _root;
+    const TrieNode * ptNode = root_;
     TrieNode::NextMap::const_iterator citer;
     for(Unicode::const_iterator itr = begin; itr != end ; itr++) {
       assert(ptNode);
@@ -158,13 +158,13 @@ class Trie {
     return !res.empty();
   }
  private:
-  void _build() {
+  void build_() {
     queue<TrieNode*> que;
-    assert(_root->ptValue == NULL);
-    assert(_root->next);
-    _root->fail = NULL;
-    for(TrieNode::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) {
-      iter->second->fail = _root;
+    assert(root_->ptValue == NULL);
+    assert(root_->next);
+    root_->fail = NULL;
+    for(TrieNode::NextMap::iterator iter = root_->next->begin(); iter != root_->next->end(); iter++) {
+      iter->second->fail = root_;
       que.push(iter->second);
     }
     TrieNode* back = NULL;
@@ -185,24 +185,24 @@ class Trie {
           back = back->fail;
         }
         if(back == NULL) {
-          iter->second->fail = _root;
+          iter->second->fail = root_;
         }
         que.push(iter->second);
       }
     }
   }
-  void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
+  void createTrie_(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers) {
     if(valuePointers.empty() || keys.empty()) {
       return;
     }
     assert(keys.size() == valuePointers.size());

     for(size_t i = 0; i < keys.size(); i++) {
-      _insertNode(keys[i], valuePointers[i]);
+      insertNode_(keys[i], valuePointers[i]);
     }
   }
-  void _insertNode(const Unicode& key, const DictUnit* ptValue) {
-    TrieNode* ptNode = _root;
+  void insertNode_(const Unicode& key, const DictUnit* ptValue) {
+    TrieNode* ptNode = root_;

     TrieNode::NextMap::const_iterator kmIter;

@@ -224,21 +224,21 @@ class Trie {
     }
     ptNode->ptValue = ptValue;
   }
-  void _deleteNode(TrieNode* node) {
+  void deleteNode_(TrieNode* node) {
     if(!node) {
       return;
     }
     if(node->next) {
       TrieNode::NextMap::iterator it;
       for(it = node->next->begin(); it != node->next->end(); it++) {
-        _deleteNode(it->second);
+        deleteNode_(it->second);
       }
       delete node->next;
     }
     delete node;
   }
  private:
-  TrieNode* _root;
+  TrieNode* root_;
 };
 }
