From 931db7d1e5dbe6b613db7170e463ccd88b3a943a Mon Sep 17 00:00:00 2001 From: xuangong Date: Mon, 20 Jul 2015 23:54:20 +0800 Subject: [PATCH] astyle --- src/DictTrie.hpp | 321 +++++++++---------- src/FullSegment.hpp | 220 ++++++------- src/HMMSegment.hpp | 646 +++++++++++++++++---------------------- src/ISegment.hpp | 16 +- src/KeywordExtractor.hpp | 260 +++++++--------- 5 files changed, 656 insertions(+), 807 deletions(-) diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 350fcd4..40602bf 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -16,191 +16,164 @@ -namespace CppJieba -{ - using namespace Limonp; - const double MIN_DOUBLE = -3.14e+100; - const double MAX_DOUBLE = 3.14e+100; - const size_t DICT_COLUMN_NUM = 3; - const char* const UNKNOWN_TAG = ""; +namespace CppJieba { +using namespace Limonp; +const double MIN_DOUBLE = -3.14e+100; +const double MAX_DOUBLE = 3.14e+100; +const size_t DICT_COLUMN_NUM = 3; +const char* const UNKNOWN_TAG = ""; - class DictTrie - { - public: +class DictTrie { + public: - DictTrie() - { - _trie = NULL; - _minWeight = MAX_DOUBLE; - } - DictTrie(const string& dictPath, const string& userDictPath = "") - { - new (this) DictTrie(); - init(dictPath, userDictPath); - } - ~DictTrie() - { - if(_trie) - { - delete _trie; - } - } - - bool init(const string& dictPath, const string& userDictPath = "") - { - assert(!_trie); - _loadDict(dictPath); - _calculateWeight(_nodeInfos); - _minWeight = _findMinWeight(_nodeInfos); - - if(userDictPath.size()) - { - double maxWeight = _findMaxWeight(_nodeInfos); - _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG); - } - _shrink(_nodeInfos); - _trie = _createTrie(_nodeInfos); - assert(_trie); - return true; - } + DictTrie() { + _trie = NULL; + _minWeight = MAX_DOUBLE; + } + DictTrie(const string& dictPath, const string& userDictPath = "") { + new (this) DictTrie(); + init(dictPath, userDictPath); + } + ~DictTrie() { + if(_trie) { + delete _trie; + } + } - const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const - { - return _trie->find(begin, end); - } - bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const - { - return _trie->find(begin, end, dag, offset); - } - void find( - Unicode::const_iterator begin, - Unicode::const_iterator end, - vector& res - ) const - { - _trie->find(begin, end, res); - } - bool isUserDictSingleChineseWord(const Unicode::value_type& word) const - { - return isIn(_userDictSingleChineseWord, word); - } - double getMinWeight() const {return _minWeight;}; + bool init(const string& dictPath, const string& userDictPath = "") { + assert(!_trie); + _loadDict(dictPath); + _calculateWeight(_nodeInfos); + _minWeight = _findMinWeight(_nodeInfos); + + if(userDictPath.size()) { + double maxWeight = _findMaxWeight(_nodeInfos); + _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG); + } + _shrink(_nodeInfos); + _trie = _createTrie(_nodeInfos); + assert(_trie); + return true; + } + + const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const { + return _trie->find(begin, end); + } + bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const { + return _trie->find(begin, end, dag, offset); + } + void find( + Unicode::const_iterator begin, + Unicode::const_iterator end, + vector& res + ) const { + _trie->find(begin, end, res); + } + bool isUserDictSingleChineseWord(const Unicode::value_type& word) const { + return isIn(_userDictSingleChineseWord, word); + } + double getMinWeight() const { + return _minWeight; + }; - private: - UglyTrie * _createTrie(const vector& dictUnits) - { - assert(dictUnits.size()); - vector words; - vector valuePointers; - for(size_t i = 0 ; i < dictUnits.size(); i ++) - { - words.push_back(dictUnits[i].word); - valuePointers.push_back(&dictUnits[i]); - } + private: + UglyTrie * _createTrie(const vector& dictUnits) { + assert(dictUnits.size()); + vector words; + vector valuePointers; + for(size_t i = 0 ; i < dictUnits.size(); i ++) { + words.push_back(dictUnits[i].word); + valuePointers.push_back(&dictUnits[i]); + } - UglyTrie * trie = new UglyTrie(words, valuePointers); - return trie; - } - void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) - { - ifstream ifs(filePath.c_str()); - assert(ifs.is_open()); - string line; - DictUnit nodeInfo; - vector buf; - size_t lineno; - for(lineno = 0; getline(ifs, line); lineno++) - { - buf.clear(); - split(line, buf, " "); - assert(buf.size() >= 1); - if(!TransCode::decode(buf[0], nodeInfo.word)) - { - LogError("line[%u:%s] illegal.", lineno, line.c_str()); - continue; - } - if(nodeInfo.word.size() == 1) - { - _userDictSingleChineseWord.insert(nodeInfo.word[0]); - } - nodeInfo.weight = defaultWeight; - nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag); - _nodeInfos.push_back(nodeInfo); - } - LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno); - } - void _loadDict(const string& filePath) - { - ifstream ifs(filePath.c_str()); - assert(ifs.is_open()); - string line; - vector buf; + UglyTrie * trie = new UglyTrie(words, valuePointers); + return trie; + } + void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) { + ifstream ifs(filePath.c_str()); + assert(ifs.is_open()); + string line; + DictUnit nodeInfo; + vector buf; + size_t lineno; + for(lineno = 0; getline(ifs, line); lineno++) { + buf.clear(); + split(line, buf, " "); + assert(buf.size() >= 1); + if(!TransCode::decode(buf[0], nodeInfo.word)) { + LogError("line[%u:%s] illegal.", lineno, line.c_str()); + continue; + } + if(nodeInfo.word.size() == 1) { + _userDictSingleChineseWord.insert(nodeInfo.word[0]); + } + nodeInfo.weight = defaultWeight; + nodeInfo.tag = (buf.size() == 2 ? buf[1] : defaultTag); + _nodeInfos.push_back(nodeInfo); + } + LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno); + } + void _loadDict(const string& filePath) { + ifstream ifs(filePath.c_str()); + assert(ifs.is_open()); + string line; + vector buf; - DictUnit nodeInfo; - for(size_t lineno = 0 ; getline(ifs, line); lineno++) - { - split(line, buf, " "); - assert(buf.size() == DICT_COLUMN_NUM); - - if(!TransCode::decode(buf[0], nodeInfo.word)) - { - LogError("line[%u:%s] illegal.", lineno, line.c_str()); - continue; - } - nodeInfo.weight = atof(buf[1].c_str()); - nodeInfo.tag = buf[2]; - - _nodeInfos.push_back(nodeInfo); - } - } - double _findMinWeight(const vector& nodeInfos) const - { - double ret = MAX_DOUBLE; - for(size_t i = 0; i < nodeInfos.size(); i++) - { - ret = min(nodeInfos[i].weight, ret); - } - return ret; - } - double _findMaxWeight(const vector& nodeInfos) const - { - double ret = MIN_DOUBLE; - for(size_t i = 0; i < nodeInfos.size(); i++) - { - ret = max(nodeInfos[i].weight, ret); - } - return ret; - } + DictUnit nodeInfo; + for(size_t lineno = 0 ; getline(ifs, line); lineno++) { + split(line, buf, " "); + assert(buf.size() == DICT_COLUMN_NUM); - void _calculateWeight(vector& nodeInfos) const - { - double sum = 0.0; - for(size_t i = 0; i < nodeInfos.size(); i++) - { - sum += nodeInfos[i].weight; - } - assert(sum); - for(size_t i = 0; i < nodeInfos.size(); i++) - { - DictUnit& nodeInfo = nodeInfos[i]; - assert(nodeInfo.weight); - nodeInfo.weight = log(double(nodeInfo.weight)/double(sum)); - } - } + if(!TransCode::decode(buf[0], nodeInfo.word)) { + LogError("line[%u:%s] illegal.", lineno, line.c_str()); + continue; + } + nodeInfo.weight = atof(buf[1].c_str()); + nodeInfo.tag = buf[2]; - void _shrink(vector& units) const - { - vector(units.begin(), units.end()).swap(units); - } + _nodeInfos.push_back(nodeInfo); + } + } + double _findMinWeight(const vector& nodeInfos) const { + double ret = MAX_DOUBLE; + for(size_t i = 0; i < nodeInfos.size(); i++) { + ret = min(nodeInfos[i].weight, ret); + } + return ret; + } + double _findMaxWeight(const vector& nodeInfos) const { + double ret = MIN_DOUBLE; + for(size_t i = 0; i < nodeInfos.size(); i++) { + ret = max(nodeInfos[i].weight, ret); + } + return ret; + } - private: - vector _nodeInfos; - UglyTrie * _trie; + void _calculateWeight(vector& nodeInfos) const { + double sum = 0.0; + for(size_t i = 0; i < nodeInfos.size(); i++) { + sum += nodeInfos[i].weight; + } + assert(sum); + for(size_t i = 0; i < nodeInfos.size(); i++) { + DictUnit& nodeInfo = nodeInfos[i]; + assert(nodeInfo.weight); + nodeInfo.weight = log(double(nodeInfo.weight)/double(sum)); + } + } - double _minWeight; - unordered_set _userDictSingleChineseWord; - }; + void _shrink(vector& units) const { + vector(units.begin(), units.end()).swap(units); + } + + private: + vector _nodeInfos; + UglyTrie * _trie; + + double _minWeight; + unordered_set _userDictSingleChineseWord; +}; } #endif diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index 0a3e747..a8b60a1 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -10,140 +10,116 @@ #include "SegmentBase.hpp" #include "TransCode.hpp" -namespace CppJieba -{ - class FullSegment: public SegmentBase - { - public: - FullSegment() - { - _dictTrie = NULL; - _isBorrowed = false; - } - explicit FullSegment(const string& dictPath) - { - _dictTrie = NULL; - init(dictPath); - } - explicit FullSegment(const DictTrie* dictTrie) - { - _dictTrie = NULL; - init(dictTrie); - } - virtual ~FullSegment() - { - if(_dictTrie && ! _isBorrowed) - { - delete _dictTrie; - } +namespace CppJieba { +class FullSegment: public SegmentBase { + public: + FullSegment() { + _dictTrie = NULL; + _isBorrowed = false; + } + explicit FullSegment(const string& dictPath) { + _dictTrie = NULL; + init(dictPath); + } + explicit FullSegment(const DictTrie* dictTrie) { + _dictTrie = NULL; + init(dictTrie); + } + virtual ~FullSegment() { + if(_dictTrie && ! _isBorrowed) { + delete _dictTrie; + } - }; - bool init(const string& dictPath) - { - assert(_dictTrie == NULL); - _dictTrie = new DictTrie(dictPath); - _isBorrowed = false; - return true; - } - bool init(const DictTrie* dictTrie) - { - assert(_dictTrie == NULL); - assert(dictTrie); - _dictTrie = dictTrie; - _isBorrowed = true; - return true; - } + }; + bool init(const string& dictPath) { + assert(_dictTrie == NULL); + _dictTrie = new DictTrie(dictPath); + _isBorrowed = false; + return true; + } + bool init(const DictTrie* dictTrie) { + assert(_dictTrie == NULL); + assert(dictTrie); + _dictTrie = dictTrie; + _isBorrowed = true; + return true; + } - using SegmentBase::cut; - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const - { - assert(_dictTrie); - if (begin >= end) - { - LogError("begin >= end"); - return false; - } + using SegmentBase::cut; + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + assert(_dictTrie); + if (begin >= end) { + LogError("begin >= end"); + return false; + } - //resut of searching in trie tree - DagType tRes; + //resut of searching in trie tree + DagType tRes; - //max index of res's words - int maxIdx = 0; + //max index of res's words + int maxIdx = 0; - // always equals to (uItr - begin) - int uIdx = 0; + // always equals to (uItr - begin) + int uIdx = 0; - //tmp variables - int wordLen = 0; - for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) - { - //find word start from uItr - if (_dictTrie->find(uItr, end, tRes, 0)) - { - for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) - //for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) - { - wordLen = itr->second->word.size(); - if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) - { - res.push_back(itr->second->word); - } - maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; - } - tRes.clear(); - } - else // not found word start from uItr - { - if (maxIdx <= uIdx) // never exist in prev results - { - //put itr itself in res - res.push_back(Unicode(1, *uItr)); + //tmp variables + int wordLen = 0; + for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) { + //find word start from uItr + if (_dictTrie->find(uItr, end, tRes, 0)) { + for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) + //for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) + { + wordLen = itr->second->word.size(); + if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) { + res.push_back(itr->second->word); + } + maxIdx = uIdx+wordLen > maxIdx ? uIdx+wordLen : maxIdx; + } + tRes.clear(); + } else { // not found word start from uItr + if (maxIdx <= uIdx) { // never exist in prev results + //put itr itself in res + res.push_back(Unicode(1, *uItr)); - //mark it exits - ++maxIdx; - } - } - ++uIdx; - } + //mark it exits + ++maxIdx; + } + } + ++uIdx; + } - return true; - } + return true; + } - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const - { - assert(_dictTrie); - if (begin >= end) - { - LogError("begin >= end"); - return false; - } + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + assert(_dictTrie); + if (begin >= end) { + LogError("begin >= end"); + return false; + } - vector uRes; - if (!cut(begin, end, uRes)) - { - LogError("get unicode cut result error."); - return false; - } + vector uRes; + if (!cut(begin, end, uRes)) { + LogError("get unicode cut result error."); + return false; + } - string tmp; - for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) - { - if (TransCode::encode(*uItr, tmp)) - { - res.push_back(tmp); - } - else - { - LogError("encode failed."); - } - } + string tmp; + for (vector::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++) { + if (TransCode::encode(*uItr, tmp)) { + res.push_back(tmp); + } else { + LogError("encode failed."); + } + } - return true; - } - private: - const DictTrie* _dictTrie; - bool _isBorrowed; - }; + return true; + } + private: + const DictTrie* _dictTrie; + bool _isBorrowed; +}; } #endif diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index d7c8c89..d000bce 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -12,387 +12,315 @@ #include "SegmentBase.hpp" #include "DictTrie.hpp" -namespace CppJieba -{ - using namespace Limonp; - typedef unordered_map EmitProbMap; - class HMMSegment: public SegmentBase - { - public: - /* - * STATUS: - * 0:B, 1:E, 2:M, 3:S - * */ - enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; +namespace CppJieba { +using namespace Limonp; +typedef unordered_map EmitProbMap; +class HMMSegment: public SegmentBase { + public: + /* + * STATUS: + * 0:B, 1:E, 2:M, 3:S + * */ + enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; - public: - HMMSegment(){} - explicit HMMSegment(const string& filePath) - { - LIMONP_CHECK(init(filePath)); - } - virtual ~HMMSegment(){} - public: - bool init(const string& filePath) - { - memset(_startProb, 0, sizeof(_startProb)); - memset(_transProb, 0, sizeof(_transProb)); - _statMap[0] = 'B'; - _statMap[1] = 'E'; - _statMap[2] = 'M'; - _statMap[3] = 'S'; - _emitProbVec.push_back(&_emitProbB); - _emitProbVec.push_back(&_emitProbE); - _emitProbVec.push_back(&_emitProbM); - _emitProbVec.push_back(&_emitProbS); - LIMONP_CHECK(_loadModel(filePath.c_str())); - LogInfo("HMMSegment init(%s) ok.", filePath.c_str()); - return true; - } - public: - using SegmentBase::cut; - public: - bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const - { - Unicode::const_iterator left = begin; - Unicode::const_iterator right = begin; - while(right != end) - { - if(*right < 0x80) - { - if(left != right && !_cut(left, right, res)) - { - return false; - } - left = right; - do { - right = _sequentialLetterRule(left, end); - if(right != left) - { - break; - } - right = _numbersRule(left, end); - if(right != left) - { - break; - } - right ++; - } while(false); - res.push_back(Unicode(left, right)); - left = right; - } - else - { - right++; - } - } - if(left != right && !_cut(left, right, res)) - { - return false; - } - return true; - } - public: - virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const - { - if(begin == end) - { - return false; - } - vector words; - words.reserve(end - begin); - if(!cut(begin, end, words)) - { - return false; - } - size_t offset = res.size(); - res.resize(res.size() + words.size()); - for(size_t i = 0; i < words.size(); i++) - { - if(!TransCode::encode(words[i], res[offset + i])) - { - LogError("encode failed."); - } - } - return true; - } - private: - // sequential letters rule - Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const - { - Unicode::value_type x = *begin; - if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) - { - begin ++; - } - else - { - return begin; - } - while(begin != end) - { - x = *begin; - if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) - { - begin ++; - } - else - { - break; - } - } - return begin; - } - // - Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const - { - Unicode::value_type x = *begin; - if('0' <= x && x <= '9') - { - begin ++; - } - else - { - return begin; - } - while(begin != end) - { - x = *begin; - if( ('0' <= x && x <= '9') || x == '.') - { - begin++; - } - else - { - break; - } - } - return begin; - } - bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const - { - vector status; - if(!_viterbi(begin, end, status)) - { - LogError("_viterbi failed."); - return false; - } + public: + HMMSegment() {} + explicit HMMSegment(const string& filePath) { + LIMONP_CHECK(init(filePath)); + } + virtual ~HMMSegment() {} + public: + bool init(const string& filePath) { + memset(_startProb, 0, sizeof(_startProb)); + memset(_transProb, 0, sizeof(_transProb)); + _statMap[0] = 'B'; + _statMap[1] = 'E'; + _statMap[2] = 'M'; + _statMap[3] = 'S'; + _emitProbVec.push_back(&_emitProbB); + _emitProbVec.push_back(&_emitProbE); + _emitProbVec.push_back(&_emitProbM); + _emitProbVec.push_back(&_emitProbS); + LIMONP_CHECK(_loadModel(filePath.c_str())); + LogInfo("HMMSegment init(%s) ok.", filePath.c_str()); + return true; + } + public: + using SegmentBase::cut; + public: + bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { + Unicode::const_iterator left = begin; + Unicode::const_iterator right = begin; + while(right != end) { + if(*right < 0x80) { + if(left != right && !_cut(left, right, res)) { + return false; + } + left = right; + do { + right = _sequentialLetterRule(left, end); + if(right != left) { + break; + } + right = _numbersRule(left, end); + if(right != left) { + break; + } + right ++; + } while(false); + res.push_back(Unicode(left, right)); + left = right; + } else { + right++; + } + } + if(left != right && !_cut(left, right, res)) { + return false; + } + return true; + } + public: + virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res)const { + if(begin == end) { + return false; + } + vector words; + words.reserve(end - begin); + if(!cut(begin, end, words)) { + return false; + } + size_t offset = res.size(); + res.resize(res.size() + words.size()); + for(size_t i = 0; i < words.size(); i++) { + if(!TransCode::encode(words[i], res[offset + i])) { + LogError("encode failed."); + } + } + return true; + } + private: + // sequential letters rule + Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const { + Unicode::value_type x = *begin; + if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) { + begin ++; + } else { + return begin; + } + while(begin != end) { + x = *begin; + if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) { + begin ++; + } else { + break; + } + } + return begin; + } + // + Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const { + Unicode::value_type x = *begin; + if('0' <= x && x <= '9') { + begin ++; + } else { + return begin; + } + while(begin != end) { + x = *begin; + if( ('0' <= x && x <= '9') || x == '.') { + begin++; + } else { + break; + } + } + return begin; + } + bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + vector status; + if(!_viterbi(begin, end, status)) { + LogError("_viterbi failed."); + return false; + } - Unicode::const_iterator left = begin; - Unicode::const_iterator right; - for(size_t i = 0; i < status.size(); i++) - { - if(status[i] % 2) //if(E == status[i] || S == status[i]) - { - right = begin + i + 1; - res.push_back(Unicode(left, right)); - left = right; - } - } - return true; - } + Unicode::const_iterator left = begin; + Unicode::const_iterator right; + for(size_t i = 0; i < status.size(); i++) { + if(status[i] % 2) { //if(E == status[i] || S == status[i]) + right = begin + i + 1; + res.push_back(Unicode(left, right)); + left = right; + } + } + return true; + } - bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector& status)const - { - if(begin == end) - { - return false; - } + bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector& status)const { + if(begin == end) { + return false; + } - size_t Y = STATUS_SUM; - size_t X = end - begin; + size_t Y = STATUS_SUM; + size_t X = end - begin; - size_t XYSize = X * Y; - size_t now, old, stat; - double tmp, endE, endS; + size_t XYSize = X * Y; + size_t now, old, stat; + double tmp, endE, endS; - vector path(XYSize); - vector weight(XYSize); + vector path(XYSize); + vector weight(XYSize); - //start - for(size_t y = 0; y < Y; y++) - { - weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); - path[0 + y * X] = -1; - } + //start + for(size_t y = 0; y < Y; y++) { + weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); + path[0 + y * X] = -1; + } - double emitProb; + double emitProb; - for(size_t x = 1; x < X; x++) - { - for(size_t y = 0; y < Y; y++) - { - now = x + y*X; - weight[now] = MIN_DOUBLE; - path[now] = E; // warning - emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); - for(size_t preY = 0; preY < Y; preY++) - { - old = x - 1 + preY * X; - tmp = weight[old] + _transProb[preY][y] + emitProb; - if(tmp > weight[now]) - { - weight[now] = tmp; - path[now] = preY; - } - } - } - } + for(size_t x = 1; x < X; x++) { + for(size_t y = 0; y < Y; y++) { + now = x + y*X; + weight[now] = MIN_DOUBLE; + path[now] = E; // warning + emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); + for(size_t preY = 0; preY < Y; preY++) { + old = x - 1 + preY * X; + tmp = weight[old] + _transProb[preY][y] + emitProb; + if(tmp > weight[now]) { + weight[now] = tmp; + path[now] = preY; + } + } + } + } - endE = weight[X-1+E*X]; - endS = weight[X-1+S*X]; - stat = 0; - if(endE >= endS) - { - stat = E; - } - else - { - stat = S; - } + endE = weight[X-1+E*X]; + endS = weight[X-1+S*X]; + stat = 0; + if(endE >= endS) { + stat = E; + } else { + stat = S; + } - status.resize(X); - for(int x = X -1 ; x >= 0; x--) - { - status[x] = stat; - stat = path[x + stat*X]; - } + status.resize(X); + for(int x = X -1 ; x >= 0; x--) { + status[x] = stat; + stat = path[x + stat*X]; + } - return true; - } - bool _loadModel(const char* const filePath) - { - ifstream ifile(filePath); - string line; - vector tmp; - vector tmp2; - //load _startProb - if(!_getLine(ifile, line)) - { - return false; - } - split(line, tmp, " "); - if(tmp.size() != STATUS_SUM) - { - LogError("start_p illegal"); - return false; - } - for(size_t j = 0; j< tmp.size(); j++) - { - _startProb[j] = atof(tmp[j].c_str()); - } + return true; + } + bool _loadModel(const char* const filePath) { + ifstream ifile(filePath); + string line; + vector tmp; + vector tmp2; + //load _startProb + if(!_getLine(ifile, line)) { + return false; + } + split(line, tmp, " "); + if(tmp.size() != STATUS_SUM) { + LogError("start_p illegal"); + return false; + } + for(size_t j = 0; j< tmp.size(); j++) { + _startProb[j] = atof(tmp[j].c_str()); + } - //load _transProb - for(size_t i = 0; i < STATUS_SUM; i++) - { - if(!_getLine(ifile, line)) - { - return false; - } - split(line, tmp, " "); - if(tmp.size() != STATUS_SUM) - { - LogError("trans_p illegal"); - return false; - } - for(size_t j =0; j < STATUS_SUM; j++) - { - _transProb[i][j] = atof(tmp[j].c_str()); - } - } + //load _transProb + for(size_t i = 0; i < STATUS_SUM; i++) { + if(!_getLine(ifile, line)) { + return false; + } + split(line, tmp, " "); + if(tmp.size() != STATUS_SUM) { + LogError("trans_p illegal"); + return false; + } + for(size_t j =0; j < STATUS_SUM; j++) { + _transProb[i][j] = atof(tmp[j].c_str()); + } + } - //load _emitProbB - if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) - { - return false; - } + //load _emitProbB + if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB)) { + return false; + } - //load _emitProbE - if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) - { - return false; - } + //load _emitProbE + if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE)) { + return false; + } - //load _emitProbM - if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) - { - return false; - } + //load _emitProbM + if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM)) { + return false; + } - //load _emitProbS - if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) - { - return false; - } + //load _emitProbS + if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS)) { + return false; + } - return true; - } - bool _getLine(ifstream& ifile, string& line) - { - while(getline(ifile, line)) - { - trim(line); - if(line.empty()) - { - continue; - } - if(startsWith(line, "#")) - { - continue; - } - return true; - } - return false; - } - bool _loadEmitProb(const string& line, EmitProbMap& mp) - { - if(line.empty()) - { - return false; - } - vector tmp, tmp2; - Unicode unicode; - split(line, tmp, ","); - for(size_t i = 0; i < tmp.size(); i++) - { - split(tmp[i], tmp2, ":"); - if(2 != tmp2.size()) - { - LogError("_emitProb illegal."); - return false; - } - if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) - { - LogError("TransCode failed."); - return false; - } - mp[unicode[0]] = atof(tmp2[1].c_str()); - } - return true; - } - double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const - { - EmitProbMap::const_iterator cit = ptMp->find(key); - if(cit == ptMp->end()) - { - return defVal; - } - return cit->second; + return true; + } + bool _getLine(ifstream& ifile, string& line) { + while(getline(ifile, line)) { + trim(line); + if(line.empty()) { + continue; + } + if(startsWith(line, "#")) { + continue; + } + return true; + } + return false; + } + bool _loadEmitProb(const string& line, EmitProbMap& mp) { + if(line.empty()) { + return false; + } + vector tmp, tmp2; + Unicode unicode; + split(line, tmp, ","); + for(size_t i = 0; i < tmp.size(); i++) { + split(tmp[i], tmp2, ":"); + if(2 != tmp2.size()) { + LogError("_emitProb illegal."); + return false; + } + if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) { + LogError("TransCode failed."); + return false; + } + mp[unicode[0]] = atof(tmp2[1].c_str()); + } + return true; + } + double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const { + EmitProbMap::const_iterator cit = ptMp->find(key); + if(cit == ptMp->end()) { + return defVal; + } + return cit->second; - } + } - private: - char _statMap[STATUS_SUM]; - double _startProb[STATUS_SUM]; - double _transProb[STATUS_SUM][STATUS_SUM]; - EmitProbMap _emitProbB; - EmitProbMap _emitProbE; - EmitProbMap _emitProbM; - EmitProbMap _emitProbS; - vector _emitProbVec; + private: + char _statMap[STATUS_SUM]; + double _startProb[STATUS_SUM]; + double _transProb[STATUS_SUM][STATUS_SUM]; + EmitProbMap _emitProbB; + EmitProbMap _emitProbE; + EmitProbMap _emitProbM; + EmitProbMap _emitProbS; + vector _emitProbVec; - }; +}; } #endif diff --git a/src/ISegment.hpp b/src/ISegment.hpp index 167e2f9..4faded5 100644 --- a/src/ISegment.hpp +++ b/src/ISegment.hpp @@ -2,15 +2,13 @@ #define CPPJIEBA_SEGMENTINTERFACE_H -namespace CppJieba -{ - class ISegment - { - public: - virtual ~ISegment(){}; - virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& res) const = 0; - virtual bool cut(const string& str, vector& res) const = 0; - }; +namespace CppJieba { +class ISegment { + public: + virtual ~ISegment() {}; + virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& res) const = 0; + virtual bool cut(const string& str, vector& res) const = 0; +}; } #endif diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index e3726d9..10f900e 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -5,162 +5,136 @@ #include #include -namespace CppJieba -{ - using namespace Limonp; +namespace CppJieba { +using namespace Limonp; - /*utf8*/ - class KeywordExtractor - { - public: - KeywordExtractor(){}; - KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") - { - init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict); - }; - ~KeywordExtractor(){}; +/*utf8*/ +class KeywordExtractor { + public: + KeywordExtractor() {}; + KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") { + init(dictPath, hmmFilePath, idfPath, stopWordPath, userDict); + }; + ~KeywordExtractor() {}; - void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") - { - _loadIdfDict(idfPath); - _loadStopWordDict(stopWordPath); - LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict)); - }; + void init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath, const string& userDict = "") { + _loadIdfDict(idfPath); + _loadStopWordDict(stopWordPath); + LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDict)); + }; - bool extract(const string& str, vector& keywords, size_t topN) const - { - vector > topWords; - if(!extract(str, topWords, topN)) - { - return false; - } - for(size_t i = 0; i < topWords.size(); i++) - { - keywords.push_back(topWords[i].first); - } - return true; - } + bool extract(const string& str, vector& keywords, size_t topN) const { + vector > topWords; + if(!extract(str, topWords, topN)) { + return false; + } + for(size_t i = 0; i < topWords.size(); i++) { + keywords.push_back(topWords[i].first); + } + return true; + } - bool extract(const string& str, vector >& keywords, size_t topN) const - { - vector words; - if(!_segment.cut(str, words)) - { - LogError("segment cut(%s) failed.", str.c_str()); - return false; - } + bool extract(const string& str, vector >& keywords, size_t topN) const { + vector words; + if(!_segment.cut(str, words)) { + LogError("segment cut(%s) failed.", str.c_str()); + return false; + } - map wordmap; - for(vector::iterator iter = words.begin(); iter != words.end(); iter++) - { - if(_isSingleWord(*iter)) - { - continue; - } - wordmap[*iter] += 1.0; - } + map wordmap; + for(vector::iterator iter = words.begin(); iter != words.end(); iter++) { + if(_isSingleWord(*iter)) { + continue; + } + wordmap[*iter] += 1.0; + } - for(map::iterator itr = wordmap.begin(); itr != wordmap.end(); ) - { - if(_stopWords.end() != _stopWords.find(itr->first)) - { - wordmap.erase(itr); - continue; - } + for(map::iterator itr = wordmap.begin(); itr != wordmap.end(); ) { + if(_stopWords.end() != _stopWords.find(itr->first)) { + wordmap.erase(itr); + continue; + } - unordered_map::const_iterator cit = _idfMap.find(itr->first); - if(cit != _idfMap.end()) - { - itr->second *= cit->second; - } - else - { - itr->second *= _idfAverage; - } - itr ++; - } + unordered_map::const_iterator cit = _idfMap.find(itr->first); + if(cit != _idfMap.end()) { + itr->second *= cit->second; + } else { + itr->second *= _idfAverage; + } + itr ++; + } - keywords.clear(); - std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin())); - topN = min(topN, keywords.size()); - partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp); - keywords.resize(topN); - return true; - } - private: - void _loadIdfDict(const string& idfPath) - { - ifstream ifs(idfPath.c_str()); - if(!ifs) - { - LogError("open %s failed.", idfPath.c_str()); - assert(false); - } - string line ; - vector buf; - double idf = 0.0; - double idfSum = 0.0; - size_t lineno = 0; - for(;getline(ifs, line); lineno++) - { - buf.clear(); - if(line.empty()) - { - LogError("line[%d] empty. skipped.", lineno); - continue; - } - if(!split(line, buf, " ") || buf.size() != 2) - { - LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); - continue; - } - idf = atof(buf[1].c_str()); - _idfMap[buf[0]] = idf; - idfSum += idf; + keywords.clear(); + std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin())); + topN = min(topN, keywords.size()); + partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp); + keywords.resize(topN); + return true; + } + private: + void _loadIdfDict(const string& idfPath) { + ifstream ifs(idfPath.c_str()); + if(!ifs) { + LogError("open %s failed.", idfPath.c_str()); + assert(false); + } + string line ; + vector buf; + double idf = 0.0; + double idfSum = 0.0; + size_t lineno = 0; + for(; getline(ifs, line); lineno++) { + buf.clear(); + if(line.empty()) { + LogError("line[%d] empty. skipped.", lineno); + continue; + } + if(!split(line, buf, " ") || buf.size() != 2) { + LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); + continue; + } + idf = atof(buf[1].c_str()); + _idfMap[buf[0]] = idf; + idfSum += idf; - } + } - assert(lineno); - _idfAverage = idfSum / lineno; - assert(_idfAverage > 0.0); - } - void _loadStopWordDict(const string& filePath) - { - ifstream ifs(filePath.c_str()); - if(!ifs) - { - LogError("open %s failed.", filePath.c_str()); - assert(false); - } - string line ; - while(getline(ifs, line)) - { - _stopWords.insert(line); - } - assert(_stopWords.size()); - } + assert(lineno); + _idfAverage = idfSum / lineno; + assert(_idfAverage > 0.0); + } + void _loadStopWordDict(const string& filePath) { + ifstream ifs(filePath.c_str()); + if(!ifs) { + LogError("open %s failed.", filePath.c_str()); + assert(false); + } + string line ; + while(getline(ifs, line)) { + _stopWords.insert(line); + } + assert(_stopWords.size()); + } - bool _isSingleWord(const string& str) const - { - Unicode unicode; - TransCode::decode(str, unicode); - if(unicode.size() == 1) - return true; - return false; - } + bool _isSingleWord(const string& str) const { + Unicode unicode; + TransCode::decode(str, unicode); + if(unicode.size() == 1) + return true; + return false; + } - static bool _cmp(const pair& lhs, const pair& rhs) - { - return lhs.second > rhs.second; - } - - private: - MixSegment _segment; - unordered_map _idfMap; - double _idfAverage; + static bool _cmp(const pair& lhs, const pair& rhs) { + return lhs.second > rhs.second; + } - unordered_set _stopWords; - }; + private: + MixSegment _segment; + unordered_map _idfMap; + double _idfAverage; + + unordered_set _stopWords; +}; } #endif