From 346bc54c35602272e7216debbe265aa531f0b1ff Mon Sep 17 00:00:00 2001 From: gwdwyy Date: Mon, 19 Aug 2013 01:29:46 +0800 Subject: [PATCH] add segmentContext in segment && run ok --- demo/keywordext_demo.cpp | 2 +- src/KeyWordExt.cpp | 18 ++-- src/KeyWordExt.h | 6 +- src/Segment.cpp | 188 ++++++++++++++++++--------------------- src/Segment.h | 12 +-- src/TransCode.cpp | 5 -- src/TransCode.h | 1 - src/Trie.cpp | 50 +++++------ src/Trie.h | 6 +- src/structs.h | 25 +++++- 10 files changed, 156 insertions(+), 157 deletions(-) diff --git a/demo/keywordext_demo.cpp b/demo/keywordext_demo.cpp index f3d71f0..c305c67 100644 --- a/demo/keywordext_demo.cpp +++ b/demo/keywordext_demo.cpp @@ -29,7 +29,7 @@ void testKeyWordExt(const char * dictPath, const char * filePath) if(!line.empty()) { ext.extract(line, res, 20); - cout< b.weight; } - bool KeyWordExt::_sortWLIDF(vector& wordInfos) + bool KeyWordExt::_sortWLIDF(vector& wordInfos) { for(uint i = 0; i < wordInfos.size(); i++) { - WordInfo& wInfo = wordInfos[i]; - double logWordFreq = _segment.getWordWeight(wInfo.word); + KeyWordInfo& wInfo = wordInfos[i]; + double logWordFreq = 1.0;//_segment.getWordWeight(wInfo.word); wInfo.idf = -logWordFreq; size_t wLen = TransCode::getWordLength(wInfo.word); if(0 == wLen) @@ -108,10 +108,10 @@ namespace CppJieba bool KeyWordExt::_extractTopN(const vector& words, vector& keywords, uint topN) { keywords.clear(); - vector wordInfos; + vector wordInfos; for(uint i = 0; i < words.size(); i++) { - WordInfo wInfo; + KeyWordInfo wInfo; wInfo.word = words[i]; wordInfos.push_back(wInfo); } @@ -358,16 +358,16 @@ namespace CppJieba return false; } - bool KeyWordExt::_prioritizeSubWords(vector& wordInfos) + bool KeyWordExt::_prioritizeSubWords(vector& wordInfos) { if(2 > wordInfos.size()) { return true; } - WordInfo prior; + KeyWordInfo prior; bool flag = false; - for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) + for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) { if(_isContainSubWords(it->word)) { diff --git a/src/KeyWordExt.h b/src/KeyWordExt.h index 51f33bf..3e8854f 100644 --- a/src/KeyWordExt.h +++ b/src/KeyWordExt.h @@ -36,12 +36,12 @@ namespace CppJieba bool extract(const string& title, vector& keywords, uint topN); bool extract(const vector& words, vector& keywords, uint topN); private: - static bool _wordInfoCompare(const WordInfo& a, const WordInfo& b); + static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b); private: bool _extractTopN(const vector& words, vector& keywords, uint topN); private: //sort by word len - idf - bool _sortWLIDF(vector& wordInfos); + bool _sortWLIDF(vector& wordInfos); private: bool _filter(vector& strs); bool _filterDuplicate(vector& strs); @@ -49,7 +49,7 @@ namespace CppJieba bool _filterSubstr(vector& strs); bool _filterStopWords(vector& strs); private: - bool _prioritizeSubWords(vector& wordInfos); + bool _prioritizeSubWords(vector& wordInfos); bool _isContainSubWords(const string& word); }; diff --git a/src/Segment.cpp b/src/Segment.cpp index 3b58745..68167fb 100644 --- a/src/Segment.cpp +++ b/src/Segment.cpp @@ -16,8 +16,7 @@ namespace CppJieba bool Segment::init() { - bool retFlag = _trie.init(); - if(!retFlag) + if(!_trie.init()) { LogError("_trie.init failed."); return false; @@ -39,179 +38,166 @@ namespace CppJieba return _trie.dispose(); } - double Segment::getWordWeight(const string& str) + bool Segment::cutDAG(const string& str, vector& res) { - return _trie.getWeight(str); + vector segWordInfos; + if(!cutDAG(str, segWordInfos)) + { + return false; + } + res.clear(); + for(uint i = 0; i < segWordInfos.size(); i++) + { + res.push_back(segWordInfos[i].word); + } + return true; } - bool Segment::cutDAG(const string& str, vector& res) + bool Segment::cutDAG(const string& str, vector& segWordInfos) { if(str.empty()) { return false; } - res.clear(); - - bool retFlag; - VUINT16 unicode; - retFlag = TransCode::strToVec(str, unicode); - if(!retFlag) + segWordInfos.clear(); + SegmentContext segContext; + + if(!TransCode::strToVec(str, segContext.uintVec)) { LogError("TransCode::strToVec failed."); return false; } //calc DAG - vector > dag; - retFlag = _calcDAG(unicode, dag); - if(!retFlag) + if(!_calcDAG(segContext)) { LogError("_calcDAG failed."); return false; } -#ifdef DEBUG - { - string tmp("{"); - FOR_VECTOR(dag, i) - { - tmp += "["; - FOR_VECTOR(dag[i], j) - { - tmp += string_format("%d,", dag[i][j]); - } - tmp += "],"; - } - tmp += "}"; - LogDebug(tmp); - } -#endif - - vector > dp; - retFlag = _calcDP(unicode, dag, dp); - if(!retFlag) + if(!_calcDP(segContext)) { LogError("_calcDP failed."); return false; } - - retFlag = _cutDAG(unicode, dp, res); - if(!retFlag) + if(!_cutDAG(segContext, segWordInfos)) { LogError("_cutDAG failed."); return false; } - + return true; } - bool Segment::_calcDAG(const VUINT16& unicode, vector >& dag) + bool Segment::_calcDAG(SegmentContext& segContext) { - if(unicode.empty()) + if(segContext.uintVec.empty()) { return false; } - VUINT16_CONST_ITER beginIter = unicode.begin(); - for(VUINT16_CONST_ITER iterI = unicode.begin(); iterI != unicode.end(); iterI++) + vector > vec; + VUINT16_CONST_ITER beginIter = segContext.uintVec.begin(); + for(VUINT16_CONST_ITER iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++) { - vector vec; - vec.push_back(iterI - beginIter); - for(VUINT16_CONST_ITER iterJ = iterI + 1; iterJ != unicode.end(); iterJ++) + vec.clear(); + vec.push_back(pair(iterI - beginIter, NULL)); + for(VUINT16_CONST_ITER iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++) { //care: the iterJ exceed iterEnd - if(NULL != _trie.find(iterI, iterJ + 1)) + const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1); + if(NULL != ptNodeInfo) { - vec.push_back(iterJ - beginIter); + vec.push_back(pair(iterJ - beginIter, ptNodeInfo)); } } - dag.push_back(vec); + segContext.dag.push_back(vec); } return true; } - bool Segment::_calcDP(const VUINT16& unicode, const vector >& dag, vector >& res) + bool Segment::_calcDP(SegmentContext& segContext) { - if(unicode.empty()) + if(segContext.uintVec.empty()) { - LogError("unicode illegal"); + LogError("uintVec illegal"); return false; } - if(unicode.size() != dag.size()) + if(segContext.uintVec.size() != segContext.dag.size()) { LogError("dag is illegal!"); return false; } - res.clear(); - res.assign(unicode.size() + 1, pair(-1, 0.0)); - res[unicode.size()].first = -1; - res[unicode.size()].second = 0.0; + segContext.dp.assign(segContext.uintVec.size() + 1, pair(NULL, 0.0)); + segContext.dp[segContext.uintVec.size()].first = NULL; + segContext.dp[segContext.uintVec.size()].second = 0.0; - VUINT16_CONST_ITER iterBegin = unicode.begin(); - - for(int i = unicode.size() - 1; i >= 0; i--) + for(int i = segContext.uintVec.size() - 1; i >= 0; i--) { // calc max - res[i].first = -1; - res[i].second = -(numeric_limits::max()); - for(uint j = 0; j < dag[i].size(); j++) + segContext.dp[i].first = NULL; + segContext.dp[i].second = -(numeric_limits::max()); + for(uint j = 0; j < segContext.dag[i].size(); j++) { - //cout<<(i/2)<<","< res[i].second) + const pair& p = segContext.dag[i][j]; + int pos = p.first; + double val = segContext.dp[pos+1].second; + if(NULL != p.second) { - res[i].first = pos; - res[i].second = val; + val += (p.second)->logFreq; + } + else + { + val += _trie.getMinLogFreq(); + } + + if(val > segContext.dp[i].second) + { + segContext.dp[i].first = p.second; + segContext.dp[i].second = val; } } } - //FOR_VECTOR(res, i) - //{ - // cout< >& dp, vector& res) + + bool Segment::_cutDAG(SegmentContext& segContext, vector& res) { - if(dp.size() != unicode.size()) + if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size()) { - LogError("dp or unicode illegal!"); + LogError("dp or uintVec illegal!"); return false; } - res.clear(); - uint begin = 0, end = 0; - VUINT16_CONST_ITER iterBegin = unicode.begin(); - //for(uint i = 0; i < dp.size(); i++) - while(begin < dp.size() && end <= dp.size()) + VUINT16_CONST_ITER iterBegin = segContext.uintVec.begin(); + uint i = 0; + while(i < segContext.dp.size()) { - //cout<wLen) + { + LogFatal("TrieNodeInfo's wLen is 0!"); + return false; + } + i += p->wLen; } - res.push_back(tmp); - begin = end; } return true; } diff --git a/src/Segment.h b/src/Segment.h index 1526411..2c8f4fa 100644 --- a/src/Segment.h +++ b/src/Segment.h @@ -23,14 +23,16 @@ namespace CppJieba bool init(); bool loadSegDict(const char * const filePath); bool dispose(); - double getWordWeight(const string& str); public: - bool cutDAG(const string& chStr, vector& res); + bool cutDAG(const string& str, vector& segWordInfos); + bool cutDAG(const string& str, vector& res); private: - bool _calcDAG(const VUINT16& unicode, vector >& dag); - bool _calcDP(const VUINT16& unicode, const vector >& dag, vector >& res); - bool _cutDAG(const VUINT16& unicode, const vector >& dp, vector& res); + bool _calcDAG(SegmentContext& segContext); + bool _calcDP(SegmentContext& segContext); + bool _cutDAG(SegmentContext& segContext, vector& res); + + //bool _fill(const string& ) }; } diff --git a/src/TransCode.cpp b/src/TransCode.cpp index 2eb35a3..ed259ef 100644 --- a/src/TransCode.cpp +++ b/src/TransCode.cpp @@ -44,11 +44,6 @@ namespace CppJieba return true; } - bool TransCode::a(const string& str, vector& vec) - { - return true; - } - bool TransCode::strToVec(const string& str, vector& vec) { if(NULL == _pf_strToVec) diff --git a/src/TransCode.h b/src/TransCode.h index e5467e8..60468d1 100644 --- a/src/TransCode.h +++ b/src/TransCode.h @@ -36,7 +36,6 @@ namespace CppJieba public: static bool init(); public: - static bool a(const string& str, vector& vec); static bool strToVec(const string& str, vector& vec); static string vecToStr(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end); static size_t getWordLength(const string& str); diff --git a/src/Trie.cpp b/src/Trie.cpp index 2d913b5..52182de 100644 --- a/src/Trie.cpp +++ b/src/Trie.cpp @@ -152,9 +152,9 @@ namespace CppJieba LogFatal("trie not initted!"); return NULL; } - VUINT16 unicode; + VUINT16 uintVec; - bool retFlag = TransCode::strToVec(str, unicode); + bool retFlag = TransCode::strToVec(str, uintVec); if(retFlag) { LogError("TransCode::strToVec failed."); @@ -164,9 +164,9 @@ namespace CppJieba //find TrieNode* p = _root; TrieNodeInfo * res = NULL; - for(uint i = 0; i < unicode.size(); i++) + for(uint i = 0; i < uintVec.size(); i++) { - uint16_t chUni = unicode[i]; + uint16_t chUni = uintVec[i]; if(p->isLeaf) { uint pos = p->nodeInfoVecPos; @@ -195,22 +195,22 @@ namespace CppJieba const TrieNodeInfo* Trie::find(const string& str) { - VUINT16 unicode; - bool retFlag = TransCode::strToVec(str, unicode); + VUINT16 uintVec; + bool retFlag = TransCode::strToVec(str, uintVec); if(!retFlag) { return NULL; } - return find(unicode); + return find(uintVec); } - const TrieNodeInfo* Trie::find(const VUINT16& unicode) + const TrieNodeInfo* Trie::find(const VUINT16& uintVec) { - if(unicode.empty()) + if(uintVec.empty()) { return NULL; } - return find(unicode.begin(), unicode.end()); + return find(uintVec.begin(), uintVec.end()); } const TrieNodeInfo* Trie::find(VUINT16_CONST_ITER begin, VUINT16_CONST_ITER end) @@ -257,25 +257,25 @@ namespace CppJieba double Trie::getWeight(const string& str) { - VUINT16 unicode; - TransCode::strToVec(str, unicode); - return getWeight(unicode); + VUINT16 uintVec; + TransCode::strToVec(str, uintVec); + return getWeight(uintVec); } - double Trie::getWeight(const VUINT16& unicode) + double Trie::getWeight(const VUINT16& uintVec) { - if(unicode.empty()) + if(uintVec.empty()) { - return getMinWeight(); + return getMinLogFreq(); } - const TrieNodeInfo * p = find(unicode); + const TrieNodeInfo * p = find(uintVec); if(NULL != p) { return p->logFreq; } else { - return getMinWeight(); + return getMinLogFreq(); } } @@ -289,11 +289,11 @@ namespace CppJieba } else { - return getMinWeight(); + return getMinLogFreq(); } } - double Trie::getMinWeight() + double Trie::getMinLogFreq() { return _minLogFreq; } @@ -326,8 +326,8 @@ namespace CppJieba const string& word = nodeInfo.word; - VUINT16 unicode; - bool retFlag = TransCode::strToVec(word, unicode); + VUINT16 uintVec; + bool retFlag = TransCode::strToVec(word, uintVec); if(!retFlag) { LogError("TransCode::strToVec error."); @@ -335,9 +335,9 @@ namespace CppJieba } TrieNode* p = _root; - for(uint i = 0; i < unicode.size(); i++) + for(uint i = 0; i < uintVec.size(); i++) { - uint16_t cu = unicode[i]; + uint16_t cu = uintVec[i]; if(NULL == p) { return false; @@ -426,7 +426,7 @@ int main() trie.init(); trie.loadDict("../dicts/segdict.gbk.v2.1"); //trie.loadDict("tmp"); - cout< uintVec; + vector< vector > > dag; + vector< pair > dp; + //vector words; + }; + + /* + struct SegmentWordInfo: public TrieNodeInfo + { + + }; + */ + - struct WordInfo: public TrieNodeInfo + struct KeyWordInfo: public TrieNodeInfo { double idf; double weight;// log(wLen+1)*logFreq; - WordInfo() + KeyWordInfo() { idf = 0.0; weight = 0.0; @@ -36,7 +53,7 @@ namespace CppJieba } }; - inline string joinWordInfos(const vector& vec) + inline string joinWordInfos(const vector& vec) { vector tmp; for(uint i = 0; i < vec.size(); i++)