This commit is contained in:
xuangong 2015-07-20 23:54:20 +08:00
parent d1a112c0c4
commit 931db7d1e5
5 changed files with 656 additions and 807 deletions

View File

@ -16,45 +16,37 @@
namespace CppJieba namespace CppJieba {
{ using namespace Limonp;
using namespace Limonp; const double MIN_DOUBLE = -3.14e+100;
const double MIN_DOUBLE = -3.14e+100; const double MAX_DOUBLE = 3.14e+100;
const double MAX_DOUBLE = 3.14e+100; const size_t DICT_COLUMN_NUM = 3;
const size_t DICT_COLUMN_NUM = 3; const char* const UNKNOWN_TAG = "";
const char* const UNKNOWN_TAG = "";
class DictTrie class DictTrie {
{
public: public:
DictTrie() DictTrie() {
{
_trie = NULL; _trie = NULL;
_minWeight = MAX_DOUBLE; _minWeight = MAX_DOUBLE;
} }
DictTrie(const string& dictPath, const string& userDictPath = "") DictTrie(const string& dictPath, const string& userDictPath = "") {
{
new (this) DictTrie(); new (this) DictTrie();
init(dictPath, userDictPath); init(dictPath, userDictPath);
} }
~DictTrie() ~DictTrie() {
{ if(_trie) {
if(_trie)
{
delete _trie; delete _trie;
} }
} }
bool init(const string& dictPath, const string& userDictPath = "") bool init(const string& dictPath, const string& userDictPath = "") {
{
assert(!_trie); assert(!_trie);
_loadDict(dictPath); _loadDict(dictPath);
_calculateWeight(_nodeInfos); _calculateWeight(_nodeInfos);
_minWeight = _findMinWeight(_nodeInfos); _minWeight = _findMinWeight(_nodeInfos);
if(userDictPath.size()) if(userDictPath.size()) {
{
double maxWeight = _findMaxWeight(_nodeInfos); double maxWeight = _findMaxWeight(_nodeInfos);
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG); _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
} }
@ -64,37 +56,33 @@ namespace CppJieba
return true; return true;
} }
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
{
return _trie->find(begin, end); return _trie->find(begin, end);
} }
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const {
{
return _trie->find(begin, end, dag, offset); return _trie->find(begin, end, dag, offset);
} }
void find( void find(
Unicode::const_iterator begin, Unicode::const_iterator begin,
Unicode::const_iterator end, Unicode::const_iterator end,
vector<SegmentChar>& res vector<SegmentChar>& res
) const ) const {
{
_trie->find(begin, end, res); _trie->find(begin, end, res);
} }
bool isUserDictSingleChineseWord(const Unicode::value_type& word) const bool isUserDictSingleChineseWord(const Unicode::value_type& word) const {
{
return isIn(_userDictSingleChineseWord, word); return isIn(_userDictSingleChineseWord, word);
} }
double getMinWeight() const {return _minWeight;}; double getMinWeight() const {
return _minWeight;
};
private: private:
UglyTrie * _createTrie(const vector<DictUnit>& dictUnits) UglyTrie * _createTrie(const vector<DictUnit>& dictUnits) {
{
assert(dictUnits.size()); assert(dictUnits.size());
vector<Unicode> words; vector<Unicode> words;
vector<const DictUnit*> valuePointers; vector<const DictUnit*> valuePointers;
for(size_t i = 0 ; i < dictUnits.size(); i ++) for(size_t i = 0 ; i < dictUnits.size(); i ++) {
{
words.push_back(dictUnits[i].word); words.push_back(dictUnits[i].word);
valuePointers.push_back(&dictUnits[i]); valuePointers.push_back(&dictUnits[i]);
} }
@ -102,26 +90,22 @@ namespace CppJieba
UglyTrie * trie = new UglyTrie(words, valuePointers); UglyTrie * trie = new UglyTrie(words, valuePointers);
return trie; return trie;
} }
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) {
{
ifstream ifs(filePath.c_str()); ifstream ifs(filePath.c_str());
assert(ifs.is_open()); assert(ifs.is_open());
string line; string line;
DictUnit nodeInfo; DictUnit nodeInfo;
vector<string> buf; vector<string> buf;
size_t lineno; size_t lineno;
for(lineno = 0; getline(ifs, line); lineno++) for(lineno = 0; getline(ifs, line); lineno++) {
{
buf.clear(); buf.clear();
split(line, buf, " "); split(line, buf, " ");
assert(buf.size() >= 1); assert(buf.size() >= 1);
if(!TransCode::decode(buf[0], nodeInfo.word)) if(!TransCode::decode(buf[0], nodeInfo.word)) {
{
LogError("line[%u:%s] illegal.", lineno, line.c_str()); LogError("line[%u:%s] illegal.", lineno, line.c_str());
continue; continue;
} }
if(nodeInfo.word.size() == 1) if(nodeInfo.word.size() == 1) {
{
_userDictSingleChineseWord.insert(nodeInfo.word[0]); _userDictSingleChineseWord.insert(nodeInfo.word[0]);
} }
nodeInfo.weight = defaultWeight; nodeInfo.weight = defaultWeight;
@ -130,21 +114,18 @@ namespace CppJieba
} }
LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno); LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
} }
void _loadDict(const string& filePath) void _loadDict(const string& filePath) {
{
ifstream ifs(filePath.c_str()); ifstream ifs(filePath.c_str());
assert(ifs.is_open()); assert(ifs.is_open());
string line; string line;
vector<string> buf; vector<string> buf;
DictUnit nodeInfo; DictUnit nodeInfo;
for(size_t lineno = 0 ; getline(ifs, line); lineno++) for(size_t lineno = 0 ; getline(ifs, line); lineno++) {
{
split(line, buf, " "); split(line, buf, " ");
assert(buf.size() == DICT_COLUMN_NUM); assert(buf.size() == DICT_COLUMN_NUM);
if(!TransCode::decode(buf[0], nodeInfo.word)) if(!TransCode::decode(buf[0], nodeInfo.word)) {
{
LogError("line[%u:%s] illegal.", lineno, line.c_str()); LogError("line[%u:%s] illegal.", lineno, line.c_str());
continue; continue;
} }
@ -154,43 +135,35 @@ namespace CppJieba
_nodeInfos.push_back(nodeInfo); _nodeInfos.push_back(nodeInfo);
} }
} }
double _findMinWeight(const vector<DictUnit>& nodeInfos) const double _findMinWeight(const vector<DictUnit>& nodeInfos) const {
{
double ret = MAX_DOUBLE; double ret = MAX_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++) for(size_t i = 0; i < nodeInfos.size(); i++) {
{
ret = min(nodeInfos[i].weight, ret); ret = min(nodeInfos[i].weight, ret);
} }
return ret; return ret;
} }
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const double _findMaxWeight(const vector<DictUnit>& nodeInfos) const {
{
double ret = MIN_DOUBLE; double ret = MIN_DOUBLE;
for(size_t i = 0; i < nodeInfos.size(); i++) for(size_t i = 0; i < nodeInfos.size(); i++) {
{
ret = max(nodeInfos[i].weight, ret); ret = max(nodeInfos[i].weight, ret);
} }
return ret; return ret;
} }
void _calculateWeight(vector<DictUnit>& nodeInfos) const void _calculateWeight(vector<DictUnit>& nodeInfos) const {
{
double sum = 0.0; double sum = 0.0;
for(size_t i = 0; i < nodeInfos.size(); i++) for(size_t i = 0; i < nodeInfos.size(); i++) {
{
sum += nodeInfos[i].weight; sum += nodeInfos[i].weight;
} }
assert(sum); assert(sum);
for(size_t i = 0; i < nodeInfos.size(); i++) for(size_t i = 0; i < nodeInfos.size(); i++) {
{
DictUnit& nodeInfo = nodeInfos[i]; DictUnit& nodeInfo = nodeInfos[i];
assert(nodeInfo.weight); assert(nodeInfo.weight);
nodeInfo.weight = log(double(nodeInfo.weight)/double(sum)); nodeInfo.weight = log(double(nodeInfo.weight)/double(sum));
} }
} }
void _shrink(vector<DictUnit>& units) const void _shrink(vector<DictUnit>& units) const {
{
vector<DictUnit>(units.begin(), units.end()).swap(units); vector<DictUnit>(units.begin(), units.end()).swap(units);
} }
@ -200,7 +173,7 @@ namespace CppJieba
double _minWeight; double _minWeight;
unordered_set<Unicode::value_type> _userDictSingleChineseWord; unordered_set<Unicode::value_type> _userDictSingleChineseWord;
}; };
} }
#endif #endif

View File

@ -10,43 +10,34 @@
#include "SegmentBase.hpp" #include "SegmentBase.hpp"
#include "TransCode.hpp" #include "TransCode.hpp"
namespace CppJieba namespace CppJieba {
{ class FullSegment: public SegmentBase {
class FullSegment: public SegmentBase
{
public: public:
FullSegment() FullSegment() {
{
_dictTrie = NULL; _dictTrie = NULL;
_isBorrowed = false; _isBorrowed = false;
} }
explicit FullSegment(const string& dictPath) explicit FullSegment(const string& dictPath) {
{
_dictTrie = NULL; _dictTrie = NULL;
init(dictPath); init(dictPath);
} }
explicit FullSegment(const DictTrie* dictTrie) explicit FullSegment(const DictTrie* dictTrie) {
{
_dictTrie = NULL; _dictTrie = NULL;
init(dictTrie); init(dictTrie);
} }
virtual ~FullSegment() virtual ~FullSegment() {
{ if(_dictTrie && ! _isBorrowed) {
if(_dictTrie && ! _isBorrowed)
{
delete _dictTrie; delete _dictTrie;
} }
}; };
bool init(const string& dictPath) bool init(const string& dictPath) {
{
assert(_dictTrie == NULL); assert(_dictTrie == NULL);
_dictTrie = new DictTrie(dictPath); _dictTrie = new DictTrie(dictPath);
_isBorrowed = false; _isBorrowed = false;
return true; return true;
} }
bool init(const DictTrie* dictTrie) bool init(const DictTrie* dictTrie) {
{
assert(_dictTrie == NULL); assert(_dictTrie == NULL);
assert(dictTrie); assert(dictTrie);
_dictTrie = dictTrie; _dictTrie = dictTrie;
@ -55,11 +46,9 @@ namespace CppJieba
} }
using SegmentBase::cut; using SegmentBase::cut;
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
{
assert(_dictTrie); assert(_dictTrie);
if (begin >= end) if (begin >= end) {
{
LogError("begin >= end"); LogError("begin >= end");
return false; return false;
} }
@ -75,27 +64,21 @@ namespace CppJieba
//tmp variables //tmp variables
int wordLen = 0; int wordLen = 0;
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) {
{
//find word start from uItr //find word start from uItr
if (_dictTrie->find(uItr, end, tRes, 0)) if (_dictTrie->find(uItr, end, tRes, 0)) {
{
for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
//for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) //for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
{ {
wordLen = itr->second->word.size(); wordLen = itr->second->word.size();
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) {
{
res.push_back(itr->second->word); res.push_back(itr->second->word);
} }
maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx;
} }
tRes.clear(); tRes.clear();
} } else { // not found word start from uItr
else // not found word start from uItr if (maxIdx <= uIdx) { // never exist in prev results
{
if (maxIdx <= uIdx) // never exist in prev results
{
//put itr itself in res //put itr itself in res
res.push_back(Unicode(1, *uItr)); res.push_back(Unicode(1, *uItr));
@ -109,31 +92,24 @@ namespace CppJieba
return true; return true;
} }
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const {
{
assert(_dictTrie); assert(_dictTrie);
if (begin >= end) if (begin >= end) {
{
LogError("begin >= end"); LogError("begin >= end");
return false; return false;
} }
vector<Unicode> uRes; vector<Unicode> uRes;
if (!cut(begin, end, uRes)) if (!cut(begin, end, uRes)) {
{
LogError("get unicode cut result error."); LogError("get unicode cut result error.");
return false; return false;
} }
string tmp; string tmp;
for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) {
{ if (TransCode::encode(*uItr, tmp)) {
if (TransCode::encode(*uItr, tmp))
{
res.push_back(tmp); res.push_back(tmp);
} } else {
else
{
LogError("encode failed."); LogError("encode failed.");
} }
} }
@ -143,7 +119,7 @@ namespace CppJieba
private: private:
const DictTrie* _dictTrie; const DictTrie* _dictTrie;
bool _isBorrowed; bool _isBorrowed;
}; };
} }
#endif #endif

View File

@ -12,12 +12,10 @@
#include "SegmentBase.hpp" #include "SegmentBase.hpp"
#include "DictTrie.hpp" #include "DictTrie.hpp"
namespace CppJieba namespace CppJieba {
{ using namespace Limonp;
using namespace Limonp; typedef unordered_map<uint16_t, double> EmitProbMap;
typedef unordered_map<uint16_t, double> EmitProbMap; class HMMSegment: public SegmentBase {
class HMMSegment: public SegmentBase
{
public: public:
/* /*
* STATUS: * STATUS:
@ -26,15 +24,13 @@ namespace CppJieba
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
public: public:
HMMSegment(){} HMMSegment() {}
explicit HMMSegment(const string& filePath) explicit HMMSegment(const string& filePath) {
{
LIMONP_CHECK(init(filePath)); LIMONP_CHECK(init(filePath));
} }
virtual ~HMMSegment(){} virtual ~HMMSegment() {}
public: public:
bool init(const string& filePath) bool init(const string& filePath) {
{
memset(_startProb, 0, sizeof(_startProb)); memset(_startProb, 0, sizeof(_startProb));
memset(_transProb, 0, sizeof(_transProb)); memset(_transProb, 0, sizeof(_transProb));
_statMap[0] = 'B'; _statMap[0] = 'B';
@ -52,65 +48,51 @@ namespace CppJieba
public: public:
using SegmentBase::cut; using SegmentBase::cut;
public: public:
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const {
{
Unicode::const_iterator left = begin; Unicode::const_iterator left = begin;
Unicode::const_iterator right = begin; Unicode::const_iterator right = begin;
while(right != end) while(right != end) {
{ if(*right < 0x80) {
if(*right < 0x80) if(left != right && !_cut(left, right, res)) {
{
if(left != right && !_cut(left, right, res))
{
return false; return false;
} }
left = right; left = right;
do { do {
right = _sequentialLetterRule(left, end); right = _sequentialLetterRule(left, end);
if(right != left) if(right != left) {
{
break; break;
} }
right = _numbersRule(left, end); right = _numbersRule(left, end);
if(right != left) if(right != left) {
{
break; break;
} }
right ++; right ++;
} while(false); } while(false);
res.push_back(Unicode(left, right)); res.push_back(Unicode(left, right));
left = right; left = right;
} } else {
else
{
right++; right++;
} }
} }
if(left != right && !_cut(left, right, res)) if(left != right && !_cut(left, right, res)) {
{
return false; return false;
} }
return true; return true;
} }
public: public:
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const {
{ if(begin == end) {
if(begin == end)
{
return false; return false;
} }
vector<Unicode> words; vector<Unicode> words;
words.reserve(end - begin); words.reserve(end - begin);
if(!cut(begin, end, words)) if(!cut(begin, end, words)) {
{
return false; return false;
} }
size_t offset = res.size(); size_t offset = res.size();
res.resize(res.size() + words.size()); res.resize(res.size() + words.size());
for(size_t i = 0; i < words.size(); i++) for(size_t i = 0; i < words.size(); i++) {
{ if(!TransCode::encode(words[i], res[offset + i])) {
if(!TransCode::encode(words[i], res[offset + i]))
{
LogError("encode failed."); LogError("encode failed.");
} }
} }
@ -118,72 +100,52 @@ namespace CppJieba
} }
private: private:
// sequential letters rule // sequential letters rule
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
{
Unicode::value_type x = *begin; Unicode::value_type x = *begin;
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
{
begin ++; begin ++;
} } else {
else
{
return begin; return begin;
} }
while(begin != end) while(begin != end) {
{
x = *begin; x = *begin;
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
{
begin ++; begin ++;
} } else {
else
{
break; break;
} }
} }
return begin; return begin;
} }
// //
Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
{
Unicode::value_type x = *begin; Unicode::value_type x = *begin;
if('0' <= x && x <= '9') if('0' <= x && x <= '9') {
{
begin ++; begin ++;
} } else {
else
{
return begin; return begin;
} }
while(begin != end) while(begin != end) {
{
x = *begin; x = *begin;
if( ('0' <= x && x <= '9') || x == '.') if( ('0' <= x && x <= '9') || x == '.') {
{
begin++; begin++;
} } else {
else
{
break; break;
} }
} }
return begin; return begin;
} }
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
{
vector<size_t> status; vector<size_t> status;
if(!_viterbi(begin, end, status)) if(!_viterbi(begin, end, status)) {
{
LogError("_viterbi failed."); LogError("_viterbi failed.");
return false; return false;
} }
Unicode::const_iterator left = begin; Unicode::const_iterator left = begin;
Unicode::const_iterator right; Unicode::const_iterator right;
for(size_t i = 0; i < status.size(); i++) for(size_t i = 0; i < status.size(); i++) {
{ if(status[i] % 2) { //if(E == status[i] || S == status[i])
if(status[i] % 2) //if(E == status[i] || S == status[i])
{
right = begin + i + 1; right = begin + i + 1;
res.push_back(Unicode(left, right)); res.push_back(Unicode(left, right));
left = right; left = right;
@ -192,10 +154,8 @@ namespace CppJieba
return true; return true;
} }
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const {
{ if(begin == end) {
if(begin == end)
{
return false; return false;
} }
@ -210,8 +170,7 @@ namespace CppJieba
vector<double> weight(XYSize); vector<double> weight(XYSize);
//start //start
for(size_t y = 0; y < Y; y++) for(size_t y = 0; y < Y; y++) {
{
weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
path[0 + y * X] = -1; path[0 + y * X] = -1;
} }
@ -219,20 +178,16 @@ namespace CppJieba
double emitProb; double emitProb;
for(size_t x = 1; x < X; x++) for(size_t x = 1; x < X; x++) {
{ for(size_t y = 0; y < Y; y++) {
for(size_t y = 0; y < Y; y++)
{
now = x + y*X; now = x + y*X;
weight[now] = MIN_DOUBLE; weight[now] = MIN_DOUBLE;
path[now] = E; // warning path[now] = E; // warning
emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
for(size_t preY = 0; preY < Y; preY++) for(size_t preY = 0; preY < Y; preY++) {
{
old = x - 1 + preY * X; old = x - 1 + preY * X;
tmp = weight[old] + _transProb[preY][y] + emitProb; tmp = weight[old] + _transProb[preY][y] + emitProb;
if(tmp > weight[now]) if(tmp > weight[now]) {
{
weight[now] = tmp; weight[now] = tmp;
path[now] = preY; path[now] = preY;
} }
@ -243,127 +198,102 @@ namespace CppJieba
endE = weight[X-1+E*X]; endE = weight[X-1+E*X];
endS = weight[X-1+S*X]; endS = weight[X-1+S*X];
stat = 0; stat = 0;
if(endE >= endS) if(endE >= endS) {
{
stat = E; stat = E;
} } else {
else
{
stat = S; stat = S;
} }
status.resize(X); status.resize(X);
for(int x = X -1 ; x >= 0; x--) for(int x = X -1 ; x >= 0; x--) {
{
status[x] = stat; status[x] = stat;
stat = path[x + stat*X]; stat = path[x + stat*X];
} }
return true; return true;
} }
bool _loadModel(const char* const filePath) bool _loadModel(const char* const filePath) {
{
ifstream ifile(filePath); ifstream ifile(filePath);
string line; string line;
vector<string> tmp; vector<string> tmp;
vector<string> tmp2; vector<string> tmp2;
//load _startProb //load _startProb
if(!_getLine(ifile, line)) if(!_getLine(ifile, line)) {
{
return false; return false;
} }
split(line, tmp, " "); split(line, tmp, " ");
if(tmp.size() != STATUS_SUM) if(tmp.size() != STATUS_SUM) {
{
LogError("start_p illegal"); LogError("start_p illegal");
return false; return false;
} }
for(size_t j = 0; j< tmp.size(); j++) for(size_t j = 0; j< tmp.size(); j++) {
{
_startProb[j] = atof(tmp[j].c_str()); _startProb[j] = atof(tmp[j].c_str());
} }
//load _transProb //load _transProb
for(size_t i = 0; i < STATUS_SUM; i++) for(size_t i = 0; i < STATUS_SUM; i++) {
{ if(!_getLine(ifile, line)) {
if(!_getLine(ifile, line))
{
return false; return false;
} }
split(line, tmp, " "); split(line, tmp, " ");
if(tmp.size() != STATUS_SUM) if(tmp.size() != STATUS_SUM) {
{
LogError("trans_p illegal"); LogError("trans_p illegal");
return false; return false;
} }
for(size_t j =0; j < STATUS_SUM; j++) for(size_t j =0; j < STATUS_SUM; j++) {
{
_transProb[i][j] = atof(tmp[j].c_str()); _transProb[i][j] = atof(tmp[j].c_str());
} }
} }
//load _emitProbB //load _emitProbB
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) {
{
return false; return false;
} }
//load _emitProbE //load _emitProbE
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) {
{
return false; return false;
} }
//load _emitProbM //load _emitProbM
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) {
{
return false; return false;
} }
//load _emitProbS //load _emitProbS
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) {
{
return false; return false;
} }
return true; return true;
} }
bool _getLine(ifstream& ifile, string& line) bool _getLine(ifstream& ifile, string& line) {
{ while(getline(ifile, line)) {
while(getline(ifile, line))
{
trim(line); trim(line);
if(line.empty()) if(line.empty()) {
{
continue; continue;
} }
if(startsWith(line, "#")) if(startsWith(line, "#")) {
{
continue; continue;
} }
return true; return true;
} }
return false; return false;
} }
bool _loadEmitProb(const string& line, EmitProbMap& mp) bool _loadEmitProb(const string& line, EmitProbMap& mp) {
{ if(line.empty()) {
if(line.empty())
{
return false; return false;
} }
vector<string> tmp, tmp2; vector<string> tmp, tmp2;
Unicode unicode; Unicode unicode;
split(line, tmp, ","); split(line, tmp, ",");
for(size_t i = 0; i < tmp.size(); i++) for(size_t i = 0; i < tmp.size(); i++) {
{
split(tmp[i], tmp2, ":"); split(tmp[i], tmp2, ":");
if(2 != tmp2.size()) if(2 != tmp2.size()) {
{
LogError("_emitProb illegal."); LogError("_emitProb illegal.");
return false; return false;
} }
if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) {
{
LogError("TransCode failed."); LogError("TransCode failed.");
return false; return false;
} }
@ -371,11 +301,9 @@ namespace CppJieba
} }
return true; return true;
} }
double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const {
{
EmitProbMap::const_iterator cit = ptMp->find(key); EmitProbMap::const_iterator cit = ptMp->find(key);
if(cit == ptMp->end()) if(cit == ptMp->end()) {
{
return defVal; return defVal;
} }
return cit->second; return cit->second;
@ -392,7 +320,7 @@ namespace CppJieba
EmitProbMap _emitProbS; EmitProbMap _emitProbS;
vector<EmitProbMap* > _emitProbVec; vector<EmitProbMap* > _emitProbVec;
}; };
} }
#endif #endif

View File

@ -2,15 +2,13 @@
#define CPPJIEBA_SEGMENTINTERFACE_H #define CPPJIEBA_SEGMENTINTERFACE_H
namespace CppJieba namespace CppJieba {
{ class ISegment {
class ISegment
{
public: public:
virtual ~ISegment(){}; virtual ~ISegment() {};
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0; virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res) const = 0; virtual bool cut(const string& str, vector<string>& res) const = 0;
}; };
} }
#endif #endif

View File

@ -5,76 +5,60 @@
#include <cmath> #include <cmath>
#include <set> #include <set>
namespace CppJieba namespace CppJieba {
{ using namespace Limonp;
using namespace Limonp;
/*utf8*/ /*utf8*/
class KeywordExtractor class KeywordExtractor {
{
public: public:
KeywordExtractor(){}; KeywordExtractor() {};
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
{
init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict); init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict);
}; };
~KeywordExtractor(){}; ~KeywordExtractor() {};
void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") {
{
_loadIdfDict(idfPath); _loadIdfDict(idfPath);
_loadStopWordDict(stopWordPath); _loadStopWordDict(stopWordPath);
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict)); LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict));
}; };
bool extract(const string& str, vector<string>& keywords, size_t topN) const bool extract(const string& str, vector<string>& keywords, size_t topN) const {
{
vector<pair<string, double> > topWords; vector<pair<string, double> > topWords;
if(!extract(str, topWords, topN)) if(!extract(str, topWords, topN)) {
{
return false; return false;
} }
for(size_t i = 0; i < topWords.size(); i++) for(size_t i = 0; i < topWords.size(); i++) {
{
keywords.push_back(topWords[i].first); keywords.push_back(topWords[i].first);
} }
return true; return true;
} }
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const {
{
vector<string> words; vector<string> words;
if(!_segment.cut(str, words)) if(!_segment.cut(str, words)) {
{
LogError("segment cut(%s) failed.", str.c_str()); LogError("segment cut(%s) failed.", str.c_str());
return false; return false;
} }
map<string, double> wordmap; map<string, double> wordmap;
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++) {
{ if(_isSingleWord(*iter)) {
if(_isSingleWord(*iter))
{
continue; continue;
} }
wordmap[*iter] += 1.0; wordmap[*iter] += 1.0;
} }
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) {
{ if(_stopWords.end() != _stopWords.find(itr->first)) {
if(_stopWords.end() != _stopWords.find(itr->first))
{
wordmap.erase(itr); wordmap.erase(itr);
continue; continue;
} }
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first); unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
if(cit != _idfMap.end()) if(cit != _idfMap.end()) {
{
itr->second *= cit->second; itr->second *= cit->second;
} } else {
else
{
itr->second *= _idfAverage; itr->second *= _idfAverage;
} }
itr ++; itr ++;
@ -88,11 +72,9 @@ namespace CppJieba
return true; return true;
} }
private: private:
void _loadIdfDict(const string& idfPath) void _loadIdfDict(const string& idfPath) {
{
ifstream ifs(idfPath.c_str()); ifstream ifs(idfPath.c_str());
if(!ifs) if(!ifs) {
{
LogError("open %s failed.", idfPath.c_str()); LogError("open %s failed.", idfPath.c_str());
assert(false); assert(false);
} }
@ -101,16 +83,13 @@ namespace CppJieba
double idf = 0.0; double idf = 0.0;
double idfSum = 0.0; double idfSum = 0.0;
size_t lineno = 0; size_t lineno = 0;
for(;getline(ifs, line); lineno++) for(; getline(ifs, line); lineno++) {
{
buf.clear(); buf.clear();
if(line.empty()) if(line.empty()) {
{
LogError("line[%d] empty. skipped.", lineno); LogError("line[%d] empty. skipped.", lineno);
continue; continue;
} }
if(!split(line, buf, " ") || buf.size() != 2) if(!split(line, buf, " ") || buf.size() != 2) {
{
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
continue; continue;
} }
@ -124,24 +103,20 @@ namespace CppJieba
_idfAverage = idfSum / lineno; _idfAverage = idfSum / lineno;
assert(_idfAverage > 0.0); assert(_idfAverage > 0.0);
} }
void _loadStopWordDict(const string& filePath) void _loadStopWordDict(const string& filePath) {
{
ifstream ifs(filePath.c_str()); ifstream ifs(filePath.c_str());
if(!ifs) if(!ifs) {
{
LogError("open %s failed.", filePath.c_str()); LogError("open %s failed.", filePath.c_str());
assert(false); assert(false);
} }
string line ; string line ;
while(getline(ifs, line)) while(getline(ifs, line)) {
{
_stopWords.insert(line); _stopWords.insert(line);
} }
assert(_stopWords.size()); assert(_stopWords.size());
} }
bool _isSingleWord(const string& str) const bool _isSingleWord(const string& str) const {
{
Unicode unicode; Unicode unicode;
TransCode::decode(str, unicode); TransCode::decode(str, unicode);
if(unicode.size() == 1) if(unicode.size() == 1)
@ -149,8 +124,7 @@ namespace CppJieba
return false; return false;
} }
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs) {
{
return lhs.second > rhs.second; return lhs.second > rhs.second;
} }
@ -160,7 +134,7 @@ namespace CppJieba
double _idfAverage; double _idfAverage;
unordered_set<string> _stopWords; unordered_set<string> _stopWords;
}; };
} }
#endif #endif