mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
astyle
This commit is contained in:
parent
d1a112c0c4
commit
931db7d1e5
@ -16,45 +16,37 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba {
|
||||||
{
|
|
||||||
using namespace Limonp;
|
using namespace Limonp;
|
||||||
const double MIN_DOUBLE = -3.14e+100;
|
const double MIN_DOUBLE = -3.14e+100;
|
||||||
const double MAX_DOUBLE = 3.14e+100;
|
const double MAX_DOUBLE = 3.14e+100;
|
||||||
const size_t DICT_COLUMN_NUM = 3;
|
const size_t DICT_COLUMN_NUM = 3;
|
||||||
const char* const UNKNOWN_TAG = "";
|
const char* const UNKNOWN_TAG = "";
|
||||||
|
|
||||||
class DictTrie
|
class DictTrie {
|
||||||
{
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
DictTrie()
|
DictTrie() {
|
||||||
{
|
|
||||||
_trie = NULL;
|
_trie = NULL;
|
||||||
_minWeight = MAX_DOUBLE;
|
_minWeight = MAX_DOUBLE;
|
||||||
}
|
}
|
||||||
DictTrie(const string& dictPath, const string& userDictPath = "")
|
DictTrie(const string& dictPath, const string& userDictPath = "") {
|
||||||
{
|
|
||||||
new (this) DictTrie();
|
new (this) DictTrie();
|
||||||
init(dictPath, userDictPath);
|
init(dictPath, userDictPath);
|
||||||
}
|
}
|
||||||
~DictTrie()
|
~DictTrie() {
|
||||||
{
|
if(_trie) {
|
||||||
if(_trie)
|
|
||||||
{
|
|
||||||
delete _trie;
|
delete _trie;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool init(const string& dictPath, const string& userDictPath = "")
|
bool init(const string& dictPath, const string& userDictPath = "") {
|
||||||
{
|
|
||||||
assert(!_trie);
|
assert(!_trie);
|
||||||
_loadDict(dictPath);
|
_loadDict(dictPath);
|
||||||
_calculateWeight(_nodeInfos);
|
_calculateWeight(_nodeInfos);
|
||||||
_minWeight = _findMinWeight(_nodeInfos);
|
_minWeight = _findMinWeight(_nodeInfos);
|
||||||
|
|
||||||
if(userDictPath.size())
|
if(userDictPath.size()) {
|
||||||
{
|
|
||||||
double maxWeight = _findMaxWeight(_nodeInfos);
|
double maxWeight = _findMaxWeight(_nodeInfos);
|
||||||
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
|
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
|
||||||
}
|
}
|
||||||
@ -64,37 +56,33 @@ namespace CppJieba
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const
|
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||||
{
|
|
||||||
return _trie->find(begin, end);
|
return _trie->find(begin, end);
|
||||||
}
|
}
|
||||||
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const
|
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const {
|
||||||
{
|
|
||||||
return _trie->find(begin, end, dag, offset);
|
return _trie->find(begin, end, dag, offset);
|
||||||
}
|
}
|
||||||
void find(
|
void find(
|
||||||
Unicode::const_iterator begin,
|
Unicode::const_iterator begin,
|
||||||
Unicode::const_iterator end,
|
Unicode::const_iterator end,
|
||||||
vector<SegmentChar>& res
|
vector<SegmentChar>& res
|
||||||
) const
|
) const {
|
||||||
{
|
|
||||||
_trie->find(begin, end, res);
|
_trie->find(begin, end, res);
|
||||||
}
|
}
|
||||||
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
|
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
|
||||||
{
|
|
||||||
return isIn(_userDictSingleChineseWord, word);
|
return isIn(_userDictSingleChineseWord, word);
|
||||||
}
|
}
|
||||||
double getMinWeight() const {return _minWeight;};
|
double getMinWeight() const {
|
||||||
|
return _minWeight;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
UglyTrie * _createTrie(const vector<DictUnit>& dictUnits)
|
UglyTrie * _createTrie(const vector<DictUnit>& dictUnits) {
|
||||||
{
|
|
||||||
assert(dictUnits.size());
|
assert(dictUnits.size());
|
||||||
vector<Unicode> words;
|
vector<Unicode> words;
|
||||||
vector<const DictUnit*> valuePointers;
|
vector<const DictUnit*> valuePointers;
|
||||||
for(size_t i = 0 ; i < dictUnits.size(); i ++)
|
for(size_t i = 0 ; i < dictUnits.size(); i ++) {
|
||||||
{
|
|
||||||
words.push_back(dictUnits[i].word);
|
words.push_back(dictUnits[i].word);
|
||||||
valuePointers.push_back(&dictUnits[i]);
|
valuePointers.push_back(&dictUnits[i]);
|
||||||
}
|
}
|
||||||
@ -102,26 +90,22 @@ namespace CppJieba
|
|||||||
UglyTrie * trie = new UglyTrie(words, valuePointers);
|
UglyTrie * trie = new UglyTrie(words, valuePointers);
|
||||||
return trie;
|
return trie;
|
||||||
}
|
}
|
||||||
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag)
|
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) {
|
||||||
{
|
|
||||||
ifstream ifs(filePath.c_str());
|
ifstream ifs(filePath.c_str());
|
||||||
assert(ifs.is_open());
|
assert(ifs.is_open());
|
||||||
string line;
|
string line;
|
||||||
DictUnit nodeInfo;
|
DictUnit nodeInfo;
|
||||||
vector<string> buf;
|
vector<string> buf;
|
||||||
size_t lineno;
|
size_t lineno;
|
||||||
for(lineno = 0; getline(ifs, line); lineno++)
|
for(lineno = 0; getline(ifs, line); lineno++) {
|
||||||
{
|
|
||||||
buf.clear();
|
buf.clear();
|
||||||
split(line, buf, " ");
|
split(line, buf, " ");
|
||||||
assert(buf.size() >= 1);
|
assert(buf.size() >= 1);
|
||||||
if(!TransCode::decode(buf[0], nodeInfo.word))
|
if(!TransCode::decode(buf[0], nodeInfo.word)) {
|
||||||
{
|
|
||||||
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if(nodeInfo.word.size() == 1)
|
if(nodeInfo.word.size() == 1) {
|
||||||
{
|
|
||||||
_userDictSingleChineseWord.insert(nodeInfo.word[0]);
|
_userDictSingleChineseWord.insert(nodeInfo.word[0]);
|
||||||
}
|
}
|
||||||
nodeInfo.weight = defaultWeight;
|
nodeInfo.weight = defaultWeight;
|
||||||
@ -130,21 +114,18 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
|
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
|
||||||
}
|
}
|
||||||
void _loadDict(const string& filePath)
|
void _loadDict(const string& filePath) {
|
||||||
{
|
|
||||||
ifstream ifs(filePath.c_str());
|
ifstream ifs(filePath.c_str());
|
||||||
assert(ifs.is_open());
|
assert(ifs.is_open());
|
||||||
string line;
|
string line;
|
||||||
vector<string> buf;
|
vector<string> buf;
|
||||||
|
|
||||||
DictUnit nodeInfo;
|
DictUnit nodeInfo;
|
||||||
for(size_t lineno = 0 ; getline(ifs, line); lineno++)
|
for(size_t lineno = 0 ; getline(ifs, line); lineno++) {
|
||||||
{
|
|
||||||
split(line, buf, " ");
|
split(line, buf, " ");
|
||||||
assert(buf.size() == DICT_COLUMN_NUM);
|
assert(buf.size() == DICT_COLUMN_NUM);
|
||||||
|
|
||||||
if(!TransCode::decode(buf[0], nodeInfo.word))
|
if(!TransCode::decode(buf[0], nodeInfo.word)) {
|
||||||
{
|
|
||||||
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
LogError("line[%u:%s] illegal.", lineno, line.c_str());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -154,43 +135,35 @@ namespace CppJieba
|
|||||||
_nodeInfos.push_back(nodeInfo);
|
_nodeInfos.push_back(nodeInfo);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
double _findMinWeight(const vector<DictUnit>& nodeInfos) const
|
double _findMinWeight(const vector<DictUnit>& nodeInfos) const {
|
||||||
{
|
|
||||||
double ret = MAX_DOUBLE;
|
double ret = MAX_DOUBLE;
|
||||||
for(size_t i = 0; i < nodeInfos.size(); i++)
|
for(size_t i = 0; i < nodeInfos.size(); i++) {
|
||||||
{
|
|
||||||
ret = min(nodeInfos[i].weight, ret);
|
ret = min(nodeInfos[i].weight, ret);
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const
|
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const {
|
||||||
{
|
|
||||||
double ret = MIN_DOUBLE;
|
double ret = MIN_DOUBLE;
|
||||||
for(size_t i = 0; i < nodeInfos.size(); i++)
|
for(size_t i = 0; i < nodeInfos.size(); i++) {
|
||||||
{
|
|
||||||
ret = max(nodeInfos[i].weight, ret);
|
ret = max(nodeInfos[i].weight, ret);
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void _calculateWeight(vector<DictUnit>& nodeInfos) const
|
void _calculateWeight(vector<DictUnit>& nodeInfos) const {
|
||||||
{
|
|
||||||
double sum = 0.0;
|
double sum = 0.0;
|
||||||
for(size_t i = 0; i < nodeInfos.size(); i++)
|
for(size_t i = 0; i < nodeInfos.size(); i++) {
|
||||||
{
|
|
||||||
sum += nodeInfos[i].weight;
|
sum += nodeInfos[i].weight;
|
||||||
}
|
}
|
||||||
assert(sum);
|
assert(sum);
|
||||||
for(size_t i = 0; i < nodeInfos.size(); i++)
|
for(size_t i = 0; i < nodeInfos.size(); i++) {
|
||||||
{
|
|
||||||
DictUnit& nodeInfo = nodeInfos[i];
|
DictUnit& nodeInfo = nodeInfos[i];
|
||||||
assert(nodeInfo.weight);
|
assert(nodeInfo.weight);
|
||||||
nodeInfo.weight = log(double(nodeInfo.weight)/double(sum));
|
nodeInfo.weight = log(double(nodeInfo.weight)/double(sum));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void _shrink(vector<DictUnit>& units) const
|
void _shrink(vector<DictUnit>& units) const {
|
||||||
{
|
|
||||||
vector<DictUnit>(units.begin(), units.end()).swap(units);
|
vector<DictUnit>(units.begin(), units.end()).swap(units);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10,43 +10,34 @@
|
|||||||
#include "SegmentBase.hpp"
|
#include "SegmentBase.hpp"
|
||||||
#include "TransCode.hpp"
|
#include "TransCode.hpp"
|
||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba {
|
||||||
{
|
class FullSegment: public SegmentBase {
|
||||||
class FullSegment: public SegmentBase
|
|
||||||
{
|
|
||||||
public:
|
public:
|
||||||
FullSegment()
|
FullSegment() {
|
||||||
{
|
|
||||||
_dictTrie = NULL;
|
_dictTrie = NULL;
|
||||||
_isBorrowed = false;
|
_isBorrowed = false;
|
||||||
}
|
}
|
||||||
explicit FullSegment(const string& dictPath)
|
explicit FullSegment(const string& dictPath) {
|
||||||
{
|
|
||||||
_dictTrie = NULL;
|
_dictTrie = NULL;
|
||||||
init(dictPath);
|
init(dictPath);
|
||||||
}
|
}
|
||||||
explicit FullSegment(const DictTrie* dictTrie)
|
explicit FullSegment(const DictTrie* dictTrie) {
|
||||||
{
|
|
||||||
_dictTrie = NULL;
|
_dictTrie = NULL;
|
||||||
init(dictTrie);
|
init(dictTrie);
|
||||||
}
|
}
|
||||||
virtual ~FullSegment()
|
virtual ~FullSegment() {
|
||||||
{
|
if(_dictTrie && ! _isBorrowed) {
|
||||||
if(_dictTrie && ! _isBorrowed)
|
|
||||||
{
|
|
||||||
delete _dictTrie;
|
delete _dictTrie;
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
bool init(const string& dictPath)
|
bool init(const string& dictPath) {
|
||||||
{
|
|
||||||
assert(_dictTrie == NULL);
|
assert(_dictTrie == NULL);
|
||||||
_dictTrie = new DictTrie(dictPath);
|
_dictTrie = new DictTrie(dictPath);
|
||||||
_isBorrowed = false;
|
_isBorrowed = false;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
bool init(const DictTrie* dictTrie)
|
bool init(const DictTrie* dictTrie) {
|
||||||
{
|
|
||||||
assert(_dictTrie == NULL);
|
assert(_dictTrie == NULL);
|
||||||
assert(dictTrie);
|
assert(dictTrie);
|
||||||
_dictTrie = dictTrie;
|
_dictTrie = dictTrie;
|
||||||
@ -55,11 +46,9 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
|
|
||||||
using SegmentBase::cut;
|
using SegmentBase::cut;
|
||||||
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||||
{
|
|
||||||
assert(_dictTrie);
|
assert(_dictTrie);
|
||||||
if (begin >= end)
|
if (begin >= end) {
|
||||||
{
|
|
||||||
LogError("begin >= end");
|
LogError("begin >= end");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -75,27 +64,21 @@ namespace CppJieba
|
|||||||
|
|
||||||
//tmp variables
|
//tmp variables
|
||||||
int wordLen = 0;
|
int wordLen = 0;
|
||||||
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
|
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
|
||||||
{
|
|
||||||
//find word start from uItr
|
//find word start from uItr
|
||||||
if (_dictTrie->find(uItr, end, tRes, 0))
|
if (_dictTrie->find(uItr, end, tRes, 0)) {
|
||||||
{
|
|
||||||
for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||||
//for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
//for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||||
{
|
{
|
||||||
wordLen = itr->second->word.size();
|
wordLen = itr->second->word.size();
|
||||||
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx))
|
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) {
|
||||||
{
|
|
||||||
res.push_back(itr->second->word);
|
res.push_back(itr->second->word);
|
||||||
}
|
}
|
||||||
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
|
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
|
||||||
}
|
}
|
||||||
tRes.clear();
|
tRes.clear();
|
||||||
}
|
} else { // not found word start from uItr
|
||||||
else // not found word start from uItr
|
if (maxIdx <= uIdx) { // never exist in prev results
|
||||||
{
|
|
||||||
if (maxIdx <= uIdx) // never exist in prev results
|
|
||||||
{
|
|
||||||
//put itr itself in res
|
//put itr itself in res
|
||||||
res.push_back(Unicode(1, *uItr));
|
res.push_back(Unicode(1, *uItr));
|
||||||
|
|
||||||
@ -109,31 +92,24 @@ namespace CppJieba
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
|
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
|
||||||
{
|
|
||||||
assert(_dictTrie);
|
assert(_dictTrie);
|
||||||
if (begin >= end)
|
if (begin >= end) {
|
||||||
{
|
|
||||||
LogError("begin >= end");
|
LogError("begin >= end");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<Unicode> uRes;
|
vector<Unicode> uRes;
|
||||||
if (!cut(begin, end, uRes))
|
if (!cut(begin, end, uRes)) {
|
||||||
{
|
|
||||||
LogError("get unicode cut result error.");
|
LogError("get unicode cut result error.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
string tmp;
|
string tmp;
|
||||||
for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++)
|
for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) {
|
||||||
{
|
if (TransCode::encode(*uItr, tmp)) {
|
||||||
if (TransCode::encode(*uItr, tmp))
|
|
||||||
{
|
|
||||||
res.push_back(tmp);
|
res.push_back(tmp);
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
LogError("encode failed.");
|
LogError("encode failed.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -12,12 +12,10 @@
|
|||||||
#include "SegmentBase.hpp"
|
#include "SegmentBase.hpp"
|
||||||
#include "DictTrie.hpp"
|
#include "DictTrie.hpp"
|
||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba {
|
||||||
{
|
|
||||||
using namespace Limonp;
|
using namespace Limonp;
|
||||||
typedef unordered_map<uint16_t, double> EmitProbMap;
|
typedef unordered_map<uint16_t, double> EmitProbMap;
|
||||||
class HMMSegment: public SegmentBase
|
class HMMSegment: public SegmentBase {
|
||||||
{
|
|
||||||
public:
|
public:
|
||||||
/*
|
/*
|
||||||
* STATUS:
|
* STATUS:
|
||||||
@ -27,14 +25,12 @@ namespace CppJieba
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
HMMSegment() {}
|
HMMSegment() {}
|
||||||
explicit HMMSegment(const string& filePath)
|
explicit HMMSegment(const string& filePath) {
|
||||||
{
|
|
||||||
LIMONP_CHECK(init(filePath));
|
LIMONP_CHECK(init(filePath));
|
||||||
}
|
}
|
||||||
virtual ~HMMSegment() {}
|
virtual ~HMMSegment() {}
|
||||||
public:
|
public:
|
||||||
bool init(const string& filePath)
|
bool init(const string& filePath) {
|
||||||
{
|
|
||||||
memset(_startProb, 0, sizeof(_startProb));
|
memset(_startProb, 0, sizeof(_startProb));
|
||||||
memset(_transProb, 0, sizeof(_transProb));
|
memset(_transProb, 0, sizeof(_transProb));
|
||||||
_statMap[0] = 'B';
|
_statMap[0] = 'B';
|
||||||
@ -52,65 +48,51 @@ namespace CppJieba
|
|||||||
public:
|
public:
|
||||||
using SegmentBase::cut;
|
using SegmentBase::cut;
|
||||||
public:
|
public:
|
||||||
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const
|
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const {
|
||||||
{
|
|
||||||
Unicode::const_iterator left = begin;
|
Unicode::const_iterator left = begin;
|
||||||
Unicode::const_iterator right = begin;
|
Unicode::const_iterator right = begin;
|
||||||
while(right != end)
|
while(right != end) {
|
||||||
{
|
if(*right < 0x80) {
|
||||||
if(*right < 0x80)
|
if(left != right && !_cut(left, right, res)) {
|
||||||
{
|
|
||||||
if(left != right && !_cut(left, right, res))
|
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
left = right;
|
left = right;
|
||||||
do {
|
do {
|
||||||
right = _sequentialLetterRule(left, end);
|
right = _sequentialLetterRule(left, end);
|
||||||
if(right != left)
|
if(right != left) {
|
||||||
{
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
right = _numbersRule(left, end);
|
right = _numbersRule(left, end);
|
||||||
if(right != left)
|
if(right != left) {
|
||||||
{
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
right ++;
|
right ++;
|
||||||
} while(false);
|
} while(false);
|
||||||
res.push_back(Unicode(left, right));
|
res.push_back(Unicode(left, right));
|
||||||
left = right;
|
left = right;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
right++;
|
right++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(left != right && !_cut(left, right, res))
|
if(left != right && !_cut(left, right, res)) {
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
|
||||||
{
|
if(begin == end) {
|
||||||
if(begin == end)
|
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
vector<Unicode> words;
|
vector<Unicode> words;
|
||||||
words.reserve(end - begin);
|
words.reserve(end - begin);
|
||||||
if(!cut(begin, end, words))
|
if(!cut(begin, end, words)) {
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
size_t offset = res.size();
|
size_t offset = res.size();
|
||||||
res.resize(res.size() + words.size());
|
res.resize(res.size() + words.size());
|
||||||
for(size_t i = 0; i < words.size(); i++)
|
for(size_t i = 0; i < words.size(); i++) {
|
||||||
{
|
if(!TransCode::encode(words[i], res[offset + i])) {
|
||||||
if(!TransCode::encode(words[i], res[offset + i]))
|
|
||||||
{
|
|
||||||
LogError("encode failed.");
|
LogError("encode failed.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -118,72 +100,52 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
// sequential letters rule
|
// sequential letters rule
|
||||||
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
|
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||||
{
|
|
||||||
Unicode::value_type x = *begin;
|
Unicode::value_type x = *begin;
|
||||||
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z'))
|
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
|
||||||
{
|
|
||||||
begin ++;
|
begin ++;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
return begin;
|
return begin;
|
||||||
}
|
}
|
||||||
while(begin != end)
|
while(begin != end) {
|
||||||
{
|
|
||||||
x = *begin;
|
x = *begin;
|
||||||
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9'))
|
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
|
||||||
{
|
|
||||||
begin ++;
|
begin ++;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return begin;
|
return begin;
|
||||||
}
|
}
|
||||||
//
|
//
|
||||||
Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
|
Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
|
||||||
{
|
|
||||||
Unicode::value_type x = *begin;
|
Unicode::value_type x = *begin;
|
||||||
if('0' <= x && x <= '9')
|
if('0' <= x && x <= '9') {
|
||||||
{
|
|
||||||
begin ++;
|
begin ++;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
return begin;
|
return begin;
|
||||||
}
|
}
|
||||||
while(begin != end)
|
while(begin != end) {
|
||||||
{
|
|
||||||
x = *begin;
|
x = *begin;
|
||||||
if( ('0' <= x && x <= '9') || x == '.')
|
if( ('0' <= x && x <= '9') || x == '.') {
|
||||||
{
|
|
||||||
begin++;
|
begin++;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return begin;
|
return begin;
|
||||||
}
|
}
|
||||||
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||||
{
|
|
||||||
vector<size_t> status;
|
vector<size_t> status;
|
||||||
if(!_viterbi(begin, end, status))
|
if(!_viterbi(begin, end, status)) {
|
||||||
{
|
|
||||||
LogError("_viterbi failed.");
|
LogError("_viterbi failed.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
Unicode::const_iterator left = begin;
|
Unicode::const_iterator left = begin;
|
||||||
Unicode::const_iterator right;
|
Unicode::const_iterator right;
|
||||||
for(size_t i = 0; i < status.size(); i++)
|
for(size_t i = 0; i < status.size(); i++) {
|
||||||
{
|
if(status[i] % 2) { //if(E == status[i] || S == status[i])
|
||||||
if(status[i] % 2) //if(E == status[i] || S == status[i])
|
|
||||||
{
|
|
||||||
right = begin + i + 1;
|
right = begin + i + 1;
|
||||||
res.push_back(Unicode(left, right));
|
res.push_back(Unicode(left, right));
|
||||||
left = right;
|
left = right;
|
||||||
@ -192,10 +154,8 @@ namespace CppJieba
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const
|
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
|
||||||
{
|
if(begin == end) {
|
||||||
if(begin == end)
|
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -210,8 +170,7 @@ namespace CppJieba
|
|||||||
vector<double> weight(XYSize);
|
vector<double> weight(XYSize);
|
||||||
|
|
||||||
//start
|
//start
|
||||||
for(size_t y = 0; y < Y; y++)
|
for(size_t y = 0; y < Y; y++) {
|
||||||
{
|
|
||||||
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
|
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
|
||||||
path[0 + y * X] = -1;
|
path[0 + y * X] = -1;
|
||||||
}
|
}
|
||||||
@ -219,20 +178,16 @@ namespace CppJieba
|
|||||||
|
|
||||||
double emitProb;
|
double emitProb;
|
||||||
|
|
||||||
for(size_t x = 1; x < X; x++)
|
for(size_t x = 1; x < X; x++) {
|
||||||
{
|
for(size_t y = 0; y < Y; y++) {
|
||||||
for(size_t y = 0; y < Y; y++)
|
|
||||||
{
|
|
||||||
now = x + y*X;
|
now = x + y*X;
|
||||||
weight[now] = MIN_DOUBLE;
|
weight[now] = MIN_DOUBLE;
|
||||||
path[now] = E; // warning
|
path[now] = E; // warning
|
||||||
emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
|
emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
|
||||||
for(size_t preY = 0; preY < Y; preY++)
|
for(size_t preY = 0; preY < Y; preY++) {
|
||||||
{
|
|
||||||
old = x - 1 + preY * X;
|
old = x - 1 + preY * X;
|
||||||
tmp = weight[old] + _transProb[preY][y] + emitProb;
|
tmp = weight[old] + _transProb[preY][y] + emitProb;
|
||||||
if(tmp > weight[now])
|
if(tmp > weight[now]) {
|
||||||
{
|
|
||||||
weight[now] = tmp;
|
weight[now] = tmp;
|
||||||
path[now] = preY;
|
path[now] = preY;
|
||||||
}
|
}
|
||||||
@ -243,127 +198,102 @@ namespace CppJieba
|
|||||||
endE = weight[X-1+E*X];
|
endE = weight[X-1+E*X];
|
||||||
endS = weight[X-1+S*X];
|
endS = weight[X-1+S*X];
|
||||||
stat = 0;
|
stat = 0;
|
||||||
if(endE >= endS)
|
if(endE >= endS) {
|
||||||
{
|
|
||||||
stat = E;
|
stat = E;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
stat = S;
|
stat = S;
|
||||||
}
|
}
|
||||||
|
|
||||||
status.resize(X);
|
status.resize(X);
|
||||||
for(int x = X -1 ; x >= 0; x--)
|
for(int x = X -1 ; x >= 0; x--) {
|
||||||
{
|
|
||||||
status[x] = stat;
|
status[x] = stat;
|
||||||
stat = path[x + stat*X];
|
stat = path[x + stat*X];
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
bool _loadModel(const char* const filePath)
|
bool _loadModel(const char* const filePath) {
|
||||||
{
|
|
||||||
ifstream ifile(filePath);
|
ifstream ifile(filePath);
|
||||||
string line;
|
string line;
|
||||||
vector<string> tmp;
|
vector<string> tmp;
|
||||||
vector<string> tmp2;
|
vector<string> tmp2;
|
||||||
//load _startProb
|
//load _startProb
|
||||||
if(!_getLine(ifile, line))
|
if(!_getLine(ifile, line)) {
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
split(line, tmp, " ");
|
split(line, tmp, " ");
|
||||||
if(tmp.size() != STATUS_SUM)
|
if(tmp.size() != STATUS_SUM) {
|
||||||
{
|
|
||||||
LogError("start_p illegal");
|
LogError("start_p illegal");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for(size_t j = 0; j< tmp.size(); j++)
|
for(size_t j = 0; j< tmp.size(); j++) {
|
||||||
{
|
|
||||||
_startProb[j] = atof(tmp[j].c_str());
|
_startProb[j] = atof(tmp[j].c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
//load _transProb
|
//load _transProb
|
||||||
for(size_t i = 0; i < STATUS_SUM; i++)
|
for(size_t i = 0; i < STATUS_SUM; i++) {
|
||||||
{
|
if(!_getLine(ifile, line)) {
|
||||||
if(!_getLine(ifile, line))
|
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
split(line, tmp, " ");
|
split(line, tmp, " ");
|
||||||
if(tmp.size() != STATUS_SUM)
|
if(tmp.size() != STATUS_SUM) {
|
||||||
{
|
|
||||||
LogError("trans_p illegal");
|
LogError("trans_p illegal");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for(size_t j =0; j < STATUS_SUM; j++)
|
for(size_t j =0; j < STATUS_SUM; j++) {
|
||||||
{
|
|
||||||
_transProb[i][j] = atof(tmp[j].c_str());
|
_transProb[i][j] = atof(tmp[j].c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//load _emitProbB
|
//load _emitProbB
|
||||||
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB))
|
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) {
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
//load _emitProbE
|
//load _emitProbE
|
||||||
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE))
|
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) {
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
//load _emitProbM
|
//load _emitProbM
|
||||||
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM))
|
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) {
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
//load _emitProbS
|
//load _emitProbS
|
||||||
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS))
|
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) {
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
bool _getLine(ifstream& ifile, string& line)
|
bool _getLine(ifstream& ifile, string& line) {
|
||||||
{
|
while(getline(ifile, line)) {
|
||||||
while(getline(ifile, line))
|
|
||||||
{
|
|
||||||
trim(line);
|
trim(line);
|
||||||
if(line.empty())
|
if(line.empty()) {
|
||||||
{
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if(startsWith(line, "#"))
|
if(startsWith(line, "#")) {
|
||||||
{
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
bool _loadEmitProb(const string& line, EmitProbMap& mp)
|
bool _loadEmitProb(const string& line, EmitProbMap& mp) {
|
||||||
{
|
if(line.empty()) {
|
||||||
if(line.empty())
|
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
vector<string> tmp, tmp2;
|
vector<string> tmp, tmp2;
|
||||||
Unicode unicode;
|
Unicode unicode;
|
||||||
split(line, tmp, ",");
|
split(line, tmp, ",");
|
||||||
for(size_t i = 0; i < tmp.size(); i++)
|
for(size_t i = 0; i < tmp.size(); i++) {
|
||||||
{
|
|
||||||
split(tmp[i], tmp2, ":");
|
split(tmp[i], tmp2, ":");
|
||||||
if(2 != tmp2.size())
|
if(2 != tmp2.size()) {
|
||||||
{
|
|
||||||
LogError("_emitProb illegal.");
|
LogError("_emitProb illegal.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1)
|
if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) {
|
||||||
{
|
|
||||||
LogError("TransCode failed.");
|
LogError("TransCode failed.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -371,11 +301,9 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const
|
double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
|
||||||
{
|
|
||||||
EmitProbMap::const_iterator cit = ptMp->find(key);
|
EmitProbMap::const_iterator cit = ptMp->find(key);
|
||||||
if(cit == ptMp->end())
|
if(cit == ptMp->end()) {
|
||||||
{
|
|
||||||
return defVal;
|
return defVal;
|
||||||
}
|
}
|
||||||
return cit->second;
|
return cit->second;
|
||||||
|
@ -2,10 +2,8 @@
|
|||||||
#define CPPJIEBA_SEGMENTINTERFACE_H
|
#define CPPJIEBA_SEGMENTINTERFACE_H
|
||||||
|
|
||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba {
|
||||||
{
|
class ISegment {
|
||||||
class ISegment
|
|
||||||
{
|
|
||||||
public:
|
public:
|
||||||
virtual ~ISegment() {};
|
virtual ~ISegment() {};
|
||||||
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
|
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
|
||||||
|
@ -5,76 +5,60 @@
|
|||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <set>
|
#include <set>
|
||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba {
|
||||||
{
|
|
||||||
using namespace Limonp;
|
using namespace Limonp;
|
||||||
|
|
||||||
/*utf8*/
|
/*utf8*/
|
||||||
class KeywordExtractor
|
class KeywordExtractor {
|
||||||
{
|
|
||||||
public:
|
public:
|
||||||
KeywordExtractor() {};
|
KeywordExtractor() {};
|
||||||
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
|
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
|
||||||
{
|
|
||||||
init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict);
|
init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict);
|
||||||
};
|
};
|
||||||
~KeywordExtractor() {};
|
~KeywordExtractor() {};
|
||||||
|
|
||||||
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "")
|
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
|
||||||
{
|
|
||||||
_loadIdfDict(idfPath);
|
_loadIdfDict(idfPath);
|
||||||
_loadStopWordDict(stopWordPath);
|
_loadStopWordDict(stopWordPath);
|
||||||
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
|
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
|
||||||
};
|
};
|
||||||
|
|
||||||
bool extract(const string& str, vector<string>& keywords, size_t topN) const
|
bool extract(const string& str, vector<string>& keywords, size_t topN) const {
|
||||||
{
|
|
||||||
vector<pair<string, double> > topWords;
|
vector<pair<string, double> > topWords;
|
||||||
if(!extract(str, topWords, topN))
|
if(!extract(str, topWords, topN)) {
|
||||||
{
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for(size_t i = 0; i < topWords.size(); i++)
|
for(size_t i = 0; i < topWords.size(); i++) {
|
||||||
{
|
|
||||||
keywords.push_back(topWords[i].first);
|
keywords.push_back(topWords[i].first);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const
|
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
|
||||||
{
|
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
if(!_segment.cut(str, words))
|
if(!_segment.cut(str, words)) {
|
||||||
{
|
|
||||||
LogError("segment cut(%s) failed.", str.c_str());
|
LogError("segment cut(%s) failed.", str.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
map<string, double> wordmap;
|
map<string, double> wordmap;
|
||||||
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++)
|
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
|
||||||
{
|
if(_isSingleWord(*iter)) {
|
||||||
if(_isSingleWord(*iter))
|
|
||||||
{
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
wordmap[*iter] += 1.0;
|
wordmap[*iter] += 1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
|
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
|
||||||
{
|
if(_stopWords.end() != _stopWords.find(itr->first)) {
|
||||||
if(_stopWords.end() != _stopWords.find(itr->first))
|
|
||||||
{
|
|
||||||
wordmap.erase(itr);
|
wordmap.erase(itr);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
|
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
|
||||||
if(cit != _idfMap.end())
|
if(cit != _idfMap.end()) {
|
||||||
{
|
|
||||||
itr->second *= cit->second;
|
itr->second *= cit->second;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
itr->second *= _idfAverage;
|
itr->second *= _idfAverage;
|
||||||
}
|
}
|
||||||
itr ++;
|
itr ++;
|
||||||
@ -88,11 +72,9 @@ namespace CppJieba
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
void _loadIdfDict(const string& idfPath)
|
void _loadIdfDict(const string& idfPath) {
|
||||||
{
|
|
||||||
ifstream ifs(idfPath.c_str());
|
ifstream ifs(idfPath.c_str());
|
||||||
if(!ifs)
|
if(!ifs) {
|
||||||
{
|
|
||||||
LogError("open %s failed.", idfPath.c_str());
|
LogError("open %s failed.", idfPath.c_str());
|
||||||
assert(false);
|
assert(false);
|
||||||
}
|
}
|
||||||
@ -101,16 +83,13 @@ namespace CppJieba
|
|||||||
double idf = 0.0;
|
double idf = 0.0;
|
||||||
double idfSum = 0.0;
|
double idfSum = 0.0;
|
||||||
size_t lineno = 0;
|
size_t lineno = 0;
|
||||||
for(;getline(ifs, line); lineno++)
|
for(; getline(ifs, line); lineno++) {
|
||||||
{
|
|
||||||
buf.clear();
|
buf.clear();
|
||||||
if(line.empty())
|
if(line.empty()) {
|
||||||
{
|
|
||||||
LogError("line[%d] empty. skipped.", lineno);
|
LogError("line[%d] empty. skipped.", lineno);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if(!split(line, buf, " ") || buf.size() != 2)
|
if(!split(line, buf, " ") || buf.size() != 2) {
|
||||||
{
|
|
||||||
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
|
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -124,24 +103,20 @@ namespace CppJieba
|
|||||||
_idfAverage = idfSum / lineno;
|
_idfAverage = idfSum / lineno;
|
||||||
assert(_idfAverage > 0.0);
|
assert(_idfAverage > 0.0);
|
||||||
}
|
}
|
||||||
void _loadStopWordDict(const string& filePath)
|
void _loadStopWordDict(const string& filePath) {
|
||||||
{
|
|
||||||
ifstream ifs(filePath.c_str());
|
ifstream ifs(filePath.c_str());
|
||||||
if(!ifs)
|
if(!ifs) {
|
||||||
{
|
|
||||||
LogError("open %s failed.", filePath.c_str());
|
LogError("open %s failed.", filePath.c_str());
|
||||||
assert(false);
|
assert(false);
|
||||||
}
|
}
|
||||||
string line ;
|
string line ;
|
||||||
while(getline(ifs, line))
|
while(getline(ifs, line)) {
|
||||||
{
|
|
||||||
_stopWords.insert(line);
|
_stopWords.insert(line);
|
||||||
}
|
}
|
||||||
assert(_stopWords.size());
|
assert(_stopWords.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool _isSingleWord(const string& str) const
|
bool _isSingleWord(const string& str) const {
|
||||||
{
|
|
||||||
Unicode unicode;
|
Unicode unicode;
|
||||||
TransCode::decode(str, unicode);
|
TransCode::decode(str, unicode);
|
||||||
if(unicode.size() == 1)
|
if(unicode.size() == 1)
|
||||||
@ -149,8 +124,7 @@ namespace CppJieba
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs)
|
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) {
|
||||||
{
|
|
||||||
return lhs.second > rhs.second;
|
return lhs.second > rhs.second;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user