mirror of https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00

commit 931db7d1e5 (parent d1a112c0c4)
astyle

src/DictTrie.hpp: 321 changed lines
@@ -16,191 +16,164 @@ src/DictTrie.hpp
(the astyle pass only reattaches braces and reindents; the hunk after the change:)

namespace CppJieba {
using namespace Limonp;
const double MIN_DOUBLE = -3.14e+100;
const double MAX_DOUBLE = 3.14e+100;
const size_t DICT_COLUMN_NUM = 3;
const char* const UNKNOWN_TAG = "";

class DictTrie {
  public:
    DictTrie() {
      _trie = NULL;
      _minWeight = MAX_DOUBLE;
    }
    DictTrie(const string& dictPath, const string& userDictPath = "") {
      new (this) DictTrie();
      init(dictPath, userDictPath);
    }
    ~DictTrie() {
      if(_trie) {
        delete _trie;
      }
    }

    bool init(const string& dictPath, const string& userDictPath = "") {
      assert(!_trie);
      _loadDict(dictPath);
      _calculateWeight(_nodeInfos);
      _minWeight = _findMinWeight(_nodeInfos);

      if(userDictPath.size()) {
        double maxWeight = _findMaxWeight(_nodeInfos);
        _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
      }
      _shrink(_nodeInfos);
      _trie = _createTrie(_nodeInfos);
      assert(_trie);
      return true;
    }

    const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
      return _trie->find(begin, end);
    }
    bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const {
      return _trie->find(begin, end, dag, offset);
    }
    void find(
      Unicode::const_iterator begin,
      Unicode::const_iterator end,
      vector<SegmentChar>& res
    ) const {
      _trie->find(begin, end, res);
    }
    bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
      return isIn(_userDictSingleChineseWord, word);
    }
    double getMinWeight() const {
      return _minWeight;
    };

  private:
    UglyTrie * _createTrie(const vector<DictUnit>& dictUnits) {
      assert(dictUnits.size());
      vector<Unicode> words;
      vector<const DictUnit*> valuePointers;
      for(size_t i = 0 ; i < dictUnits.size(); i ++) {
        words.push_back(dictUnits[i].word);
        valuePointers.push_back(&dictUnits[i]);
      }

      UglyTrie * trie = new UglyTrie(words, valuePointers);
      return trie;
    }
    void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) {
      ifstream ifs(filePath.c_str());
      assert(ifs.is_open());
      string line;
      DictUnit nodeInfo;
      vector<string> buf;
      size_t lineno;
      for(lineno = 0; getline(ifs, line); lineno++) {
        buf.clear();
        split(line, buf, " ");
        assert(buf.size() >= 1);
        if(!TransCode::decode(buf[0], nodeInfo.word)) {
          LogError("line[%u:%s] illegal.", lineno, line.c_str());
          continue;
        }
        if(nodeInfo.word.size() == 1) {
          _userDictSingleChineseWord.insert(nodeInfo.word[0]);
        }
        nodeInfo.weight = defaultWeight;
        nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag);
        _nodeInfos.push_back(nodeInfo);
      }
      LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
    }
    void _loadDict(const string& filePath) {
      ifstream ifs(filePath.c_str());
      assert(ifs.is_open());
      string line;
      vector<string> buf;

      DictUnit nodeInfo;
      for(size_t lineno = 0 ; getline(ifs, line); lineno++) {
        split(line, buf, " ");
        assert(buf.size() == DICT_COLUMN_NUM);

        if(!TransCode::decode(buf[0], nodeInfo.word)) {
          LogError("line[%u:%s] illegal.", lineno, line.c_str());
          continue;
        }
        nodeInfo.weight = atof(buf[1].c_str());
        nodeInfo.tag = buf[2];

        _nodeInfos.push_back(nodeInfo);
      }
    }
    double _findMinWeight(const vector<DictUnit>& nodeInfos) const {
      double ret = MAX_DOUBLE;
      for(size_t i = 0; i < nodeInfos.size(); i++) {
        ret = min(nodeInfos[i].weight, ret);
      }
      return ret;
    }
    double _findMaxWeight(const vector<DictUnit>& nodeInfos) const {
      double ret = MIN_DOUBLE;
      for(size_t i = 0; i < nodeInfos.size(); i++) {
        ret = max(nodeInfos[i].weight, ret);
      }
      return ret;
    }

    void _calculateWeight(vector<DictUnit>& nodeInfos) const {
      double sum = 0.0;
      for(size_t i = 0; i < nodeInfos.size(); i++) {
        sum += nodeInfos[i].weight;
      }
      assert(sum);
      for(size_t i = 0; i < nodeInfos.size(); i++) {
        DictUnit& nodeInfo = nodeInfos[i];
        assert(nodeInfo.weight);
        nodeInfo.weight = log(double(nodeInfo.weight)/double(sum));
      }
    }

    void _shrink(vector<DictUnit>& units) const {
      vector<DictUnit>(units.begin(), units.end()).swap(units);
    }

  private:
    vector<DictUnit> _nodeInfos;
    UglyTrie * _trie;

    double _minWeight;
    unordered_set<Unicode::value_type> _userDictSingleChineseWord;
};
}

#endif
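Not part of the commit: a minimal usage sketch for the DictTrie API in the hunk above. The dictionary paths and the extra includes are assumptions for illustration, not taken from this diff.

#include "DictTrie.hpp"
#include "TransCode.hpp"   // assumed available, as in the other headers of this commit
#include <cstdio>
using namespace CppJieba;

int main() {
  // Hypothetical paths: main dictionary plus an optional user dictionary.
  DictTrie trie("../dict/jieba.dict.utf8", "../dict/user.dict.utf8");

  // Decode a UTF-8 word into the library's Unicode type, then look it up.
  Unicode word;
  if(TransCode::decode("北京", word)) {
    const DictUnit* unit = trie.find(word.begin(), word.end());
    if(unit) {
      printf("weight=%lf tag=%s\n", unit->weight, unit->tag.c_str());
    }
  }
  return 0;
}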
@@ -10,140 +10,116 @@ (class FullSegment)
(same astyle reformatting; the hunk after the change:)

#include "SegmentBase.hpp"
#include "TransCode.hpp"

namespace CppJieba {
class FullSegment: public SegmentBase {
  public:
    FullSegment() {
      _dictTrie = NULL;
      _isBorrowed = false;
    }
    explicit FullSegment(const string& dictPath) {
      _dictTrie = NULL;
      init(dictPath);
    }
    explicit FullSegment(const DictTrie* dictTrie) {
      _dictTrie = NULL;
      init(dictTrie);
    }
    virtual ~FullSegment() {
      if(_dictTrie && ! _isBorrowed) {
        delete _dictTrie;
      }
    };
    bool init(const string& dictPath) {
      assert(_dictTrie == NULL);
      _dictTrie = new DictTrie(dictPath);
      _isBorrowed = false;
      return true;
    }
    bool init(const DictTrie* dictTrie) {
      assert(_dictTrie == NULL);
      assert(dictTrie);
      _dictTrie = dictTrie;
      _isBorrowed = true;
      return true;
    }

    using SegmentBase::cut;
    bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
      assert(_dictTrie);
      if (begin >= end) {
        LogError("begin >= end");
        return false;
      }

      //resut of searching in trie tree
      DagType tRes;

      //max index of res's words
      int maxIdx = 0;

      // always equals to (uItr - begin)
      int uIdx = 0;

      //tmp variables
      int wordLen = 0;
      for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
        //find word start from uItr
        if (_dictTrie->find(uItr, end, tRes, 0)) {
          for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
            //for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
          {
            wordLen = itr->second->word.size();
            if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) {
              res.push_back(itr->second->word);
            }
            maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
          }
          tRes.clear();
        } else { // not found word start from uItr
          if (maxIdx <= uIdx) { // never exist in prev results
            //put itr itself in res
            res.push_back(Unicode(1, *uItr));

            //mark it exits
            ++maxIdx;
          }
        }
        ++uIdx;
      }

      return true;
    }

    bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
      assert(_dictTrie);
      if (begin >= end) {
        LogError("begin >= end");
        return false;
      }

      vector<Unicode> uRes;
      if (!cut(begin, end, uRes)) {
        LogError("get unicode cut result error.");
        return false;
      }

      string tmp;
      for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) {
        if (TransCode::encode(*uItr, tmp)) {
          res.push_back(tmp);
        } else {
          LogError("encode failed.");
        }
      }

      return true;
    }
  private:
    const DictTrie* _dictTrie;
    bool _isBorrowed;
};
}

#endif
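Not part of the commit: a sketch of driving FullSegment end to end. The dictionary path is a placeholder, and the string-in/string-out cut() overload is assumed to come from SegmentBase (that is what the using SegmentBase::cut; declaration above pulls in).

#include "FullSegment.hpp"
#include <cstdio>
using namespace CppJieba;

int main() {
  // Hypothetical dictionary path.
  FullSegment seg("../dict/jieba.dict.utf8");

  // Full-mode segmentation: every dictionary word found in the text is emitted.
  std::vector<std::string> words;
  seg.cut("南京市长江大桥", words);   // assumed SegmentBase string overload
  for (size_t i = 0; i < words.size(); i++) {
    printf("%s\n", words[i].c_str());
  }
  return 0;
}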
@@ -12,387 +12,315 @@ (class HMMSegment)
(same astyle reformatting; the hunk after the change:)

#include "SegmentBase.hpp"
#include "DictTrie.hpp"

namespace CppJieba {
using namespace Limonp;
typedef unordered_map<uint16_t, double> EmitProbMap;
class HMMSegment: public SegmentBase {
  public:
    /*
     * STATUS:
     * 0:B, 1:E, 2:M, 3:S
     * */
    enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};

  public:
    HMMSegment() {}
    explicit HMMSegment(const string& filePath) {
      LIMONP_CHECK(init(filePath));
    }
    virtual ~HMMSegment() {}
  public:
    bool init(const string& filePath) {
      memset(_startProb, 0, sizeof(_startProb));
      memset(_transProb, 0, sizeof(_transProb));
      _statMap[0] = 'B';
      _statMap[1] = 'E';
      _statMap[2] = 'M';
      _statMap[3] = 'S';
      _emitProbVec.push_back(&_emitProbB);
      _emitProbVec.push_back(&_emitProbE);
      _emitProbVec.push_back(&_emitProbM);
      _emitProbVec.push_back(&_emitProbS);
      LIMONP_CHECK(_loadModel(filePath.c_str()));
      LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
      return true;
    }
  public:
    using SegmentBase::cut;
  public:
    bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const {
      Unicode::const_iterator left = begin;
      Unicode::const_iterator right = begin;
      while(right != end) {
        if(*right < 0x80) {
          if(left != right && !_cut(left, right, res)) {
            return false;
          }
          left = right;
          do {
            right = _sequentialLetterRule(left, end);
            if(right != left) {
              break;
            }
            right = _numbersRule(left, end);
            if(right != left) {
              break;
            }
            right ++;
          } while(false);
          res.push_back(Unicode(left, right));
          left = right;
        } else {
          right++;
        }
      }
      if(left != right && !_cut(left, right, res)) {
        return false;
      }
      return true;
    }
  public:
    virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
      if(begin == end) {
        return false;
      }
      vector<Unicode> words;
      words.reserve(end - begin);
      if(!cut(begin, end, words)) {
        return false;
      }
      size_t offset = res.size();
      res.resize(res.size() + words.size());
      for(size_t i = 0; i < words.size(); i++) {
        if(!TransCode::encode(words[i], res[offset + i])) {
          LogError("encode failed.");
        }
      }
      return true;
    }
  private:
    // sequential letters rule
    Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
      Unicode::value_type x = *begin;
      if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
        begin ++;
      } else {
        return begin;
      }
      while(begin != end) {
        x = *begin;
        if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
          begin ++;
        } else {
          break;
        }
      }
      return begin;
    }
    //
    Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
      Unicode::value_type x = *begin;
      if('0' <= x && x <= '9') {
        begin ++;
      } else {
        return begin;
      }
      while(begin != end) {
        x = *begin;
        if( ('0' <= x && x <= '9') || x == '.') {
          begin++;
        } else {
          break;
        }
      }
      return begin;
    }
    bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
      vector<size_t> status;
      if(!_viterbi(begin, end, status)) {
        LogError("_viterbi failed.");
        return false;
      }

      Unicode::const_iterator left = begin;
      Unicode::const_iterator right;
      for(size_t i = 0; i < status.size(); i++) {
        if(status[i] % 2) { //if(E == status[i] || S == status[i])
          right = begin + i + 1;
          res.push_back(Unicode(left, right));
          left = right;
        }
      }
      return true;
    }

    bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
      if(begin == end) {
        return false;
      }

      size_t Y = STATUS_SUM;
      size_t X = end - begin;

      size_t XYSize = X * Y;
      size_t now, old, stat;
      double tmp, endE, endS;

      vector<int> path(XYSize);
      vector<double> weight(XYSize);

      //start
      for(size_t y = 0; y < Y; y++) {
        weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
        path[0 + y * X] = -1;
      }

      double emitProb;

      for(size_t x = 1; x < X; x++) {
        for(size_t y = 0; y < Y; y++) {
          now = x + y*X;
          weight[now] = MIN_DOUBLE;
          path[now] = E; // warning
          emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
          for(size_t preY = 0; preY < Y; preY++) {
            old = x - 1 + preY * X;
            tmp = weight[old] + _transProb[preY][y] + emitProb;
            if(tmp > weight[now]) {
              weight[now] = tmp;
              path[now] = preY;
            }
          }
        }
      }

      endE = weight[X-1+E*X];
      endS = weight[X-1+S*X];
      stat = 0;
      if(endE >= endS) {
        stat = E;
      } else {
        stat = S;
      }

      status.resize(X);
      for(int x = X -1 ; x >= 0; x--) {
        status[x] = stat;
        stat = path[x + stat*X];
      }

      return true;
    }
    bool _loadModel(const char* const filePath) {
      ifstream ifile(filePath);
      string line;
      vector<string> tmp;
      vector<string> tmp2;
      //load _startProb
      if(!_getLine(ifile, line)) {
        return false;
      }
      split(line, tmp, " ");
      if(tmp.size() != STATUS_SUM) {
        LogError("start_p illegal");
        return false;
      }
      for(size_t j = 0; j< tmp.size(); j++) {
        _startProb[j] = atof(tmp[j].c_str());
      }

      //load _transProb
      for(size_t i = 0; i < STATUS_SUM; i++) {
        if(!_getLine(ifile, line)) {
          return false;
        }
        split(line, tmp, " ");
        if(tmp.size() != STATUS_SUM) {
          LogError("trans_p illegal");
          return false;
        }
        for(size_t j =0; j < STATUS_SUM; j++) {
          _transProb[i][j] = atof(tmp[j].c_str());
        }
      }

      //load _emitProbB
      if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) {
        return false;
      }

      //load _emitProbE
      if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) {
        return false;
      }

      //load _emitProbM
      if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) {
        return false;
      }

      //load _emitProbS
      if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) {
        return false;
      }

      return true;
    }
    bool _getLine(ifstream& ifile, string& line) {
      while(getline(ifile, line)) {
        trim(line);
        if(line.empty()) {
          continue;
        }
        if(startsWith(line, "#")) {
          continue;
        }
        return true;
      }
      return false;
    }
    bool _loadEmitProb(const string& line, EmitProbMap& mp) {
      if(line.empty()) {
        return false;
      }
      vector<string> tmp, tmp2;
      Unicode unicode;
      split(line, tmp, ",");
      for(size_t i = 0; i < tmp.size(); i++) {
        split(tmp[i], tmp2, ":");
        if(2 != tmp2.size()) {
          LogError("_emitProb illegal.");
          return false;
        }
        if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) {
          LogError("TransCode failed.");
          return false;
        }
        mp[unicode[0]] = atof(tmp2[1].c_str());
      }
      return true;
    }
    double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
      EmitProbMap::const_iterator cit = ptMp->find(key);
      if(cit == ptMp->end()) {
        return defVal;
      }
      return cit->second;
    }

  private:
    char _statMap[STATUS_SUM];
    double _startProb[STATUS_SUM];
    double _transProb[STATUS_SUM][STATUS_SUM];
    EmitProbMap _emitProbB;
    EmitProbMap _emitProbE;
    EmitProbMap _emitProbM;
    EmitProbMap _emitProbS;
    vector<EmitProbMap* > _emitProbVec;

};
}

#endif
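Not part of the commit: a sketch of using the HMM-only segmenter on its own. The model path is a placeholder for the hmm_model file shipped with cppjieba, and the string overload of cut() is again assumed to come from SegmentBase.

#include "HMMSegment.hpp"
#include <cstdio>
using namespace CppJieba;

int main() {
  // Hypothetical model path; init() loads the start, transition and
  // emission probabilities through _loadModel().
  HMMSegment seg("../dict/hmm_model.utf8");

  std::vector<std::string> words;
  seg.cut("我来到北京清华大学", words);   // assumed SegmentBase string overload
  for (size_t i = 0; i < words.size(); i++) {
    printf("%s ", words[i].c_str());
  }
  printf("\n");
  return 0;
}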
@@ -2,15 +2,13 @@ (class ISegment)
(same astyle reformatting; the hunk after the change:)

#define CPPJIEBA_SEGMENTINTERFACE_H

namespace CppJieba {
class ISegment {
  public:
    virtual ~ISegment() {};
    virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
    virtual bool cut(const string& str, vector<string>& res) const = 0;
};

}

#endif
@@ -5,162 +5,136 @@ (class KeywordExtractor)
(same astyle reformatting; the hunk after the change:)

#include <cmath>
#include <set>

namespace CppJieba {
using namespace Limonp;

/*utf8*/
class KeywordExtractor {
  public:
    KeywordExtractor() {};
    KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
      init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict);
    };
    ~KeywordExtractor() {};

    void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
      _loadIdfDict(idfPath);
      _loadStopWordDict(stopWordPath);
      LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
    };

    bool extract(const string& str, vector<string>& keywords, size_t topN) const {
      vector<pair<string, double> > topWords;
      if(!extract(str, topWords, topN)) {
        return false;
      }
      for(size_t i = 0; i < topWords.size(); i++) {
        keywords.push_back(topWords[i].first);
      }
      return true;
    }

    bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
      vector<string> words;
      if(!_segment.cut(str, words)) {
        LogError("segment cut(%s) failed.", str.c_str());
        return false;
      }

      map<string, double> wordmap;
      for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
        if(_isSingleWord(*iter)) {
          continue;
        }
        wordmap[*iter] += 1.0;
      }

      for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
        if(_stopWords.end() != _stopWords.find(itr->first)) {
          wordmap.erase(itr);
          continue;
        }

        unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
        if(cit != _idfMap.end()) {
          itr->second *= cit->second;
        } else {
          itr->second *= _idfAverage;
        }
        itr ++;
      }

      keywords.clear();
      std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
      topN = min(topN, keywords.size());
      partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
      keywords.resize(topN);
      return true;
    }
  private:
    void _loadIdfDict(const string& idfPath) {
      ifstream ifs(idfPath.c_str());
      if(!ifs) {
        LogError("open %s failed.", idfPath.c_str());
        assert(false);
      }
      string line ;
      vector<string> buf;
      double idf = 0.0;
      double idfSum = 0.0;
      size_t lineno = 0;
      for(; getline(ifs, line); lineno++) {
        buf.clear();
        if(line.empty()) {
          LogError("line[%d] empty. skipped.", lineno);
          continue;
        }
        if(!split(line, buf, " ") || buf.size() != 2) {
          LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
          continue;
        }
        idf = atof(buf[1].c_str());
        _idfMap[buf[0]] = idf;
        idfSum += idf;

      }

      assert(lineno);
      _idfAverage = idfSum / lineno;
      assert(_idfAverage > 0.0);
    }
    void _loadStopWordDict(const string& filePath) {
      ifstream ifs(filePath.c_str());
      if(!ifs) {
        LogError("open %s failed.", filePath.c_str());
        assert(false);
      }
      string line ;
      while(getline(ifs, line)) {
        _stopWords.insert(line);
      }
      assert(_stopWords.size());
    }

    bool _isSingleWord(const string& str) const {
      Unicode unicode;
      TransCode::decode(str, unicode);
      if(unicode.size() == 1)
        return true;
      return false;
    }

    static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) {
      return lhs.second > rhs.second;
    }

  private:
    MixSegment _segment;
    unordered_map<string, double> _idfMap;
    double _idfAverage;

    unordered_set<string> _stopWords;
};
}

#endif
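Not part of the commit: a sketch of KeywordExtractor::extract as declared in the hunk above, ranking words by term frequency times idf. All four resource paths are placeholders for the files shipped with cppjieba.

#include "KeywordExtractor.hpp"
#include <cstdio>
using namespace CppJieba;

int main() {
  // Hypothetical paths: dictionary, HMM model, idf table, stop-word list.
  KeywordExtractor extractor("../dict/jieba.dict.utf8",
                             "../dict/hmm_model.utf8",
                             "../dict/idf.utf8",
                             "../dict/stop_words.utf8");

  // Top-5 keywords with their tf*idf scores.
  std::vector<std::pair<std::string, double> > keywords;
  extractor.extract("我是拖拉机学院手扶拖拉机专业的。", keywords, 5);
  for (size_t i = 0; i < keywords.size(); i++) {
    printf("%s: %lf\n", keywords[i].first.c_str(), keywords[i].second);
  }
  return 0;
}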