diff --git a/demo/keywordext_demo.cpp b/demo/keywordext_demo.cpp index 6298be3..a69e66e 100644 --- a/demo/keywordext_demo.cpp +++ b/demo/keywordext_demo.cpp @@ -8,7 +8,7 @@ using namespace CppJieba; void testKeyWordExt(const char * dictPath, const char * filePath) { KeyWordExt ext; - if(!ext.init(dictPath, "../dicts/stopwords.gbk.v1.0")) + if(!ext.init(dictPath)) { return; } diff --git a/demo/segment_demo.cpp b/demo/segment_demo.cpp index aa512e0..9e4d7ef 100644 --- a/demo/segment_demo.cpp +++ b/demo/segment_demo.cpp @@ -32,7 +32,7 @@ void cut(const char * const filePath) { if(!line.empty()) { - seg.cutDAG(line, res); + seg.cut(line, res); cout< trieNodeInfos; - _segment.cutDAG(title, trieNodeInfos); + _segment.cut(title, trieNodeInfos); keyWordInfos.clear(); for(uint i = 0; i < trieNodeInfos.size(); i++) @@ -249,7 +246,7 @@ namespace CppJieba bool KeyWordExt::_filterDuplicate(vector& wordInfos) { - set st; + set st; for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) { if(st.find(it->word) != st.end()) @@ -271,7 +268,7 @@ namespace CppJieba { // filter single word - if(1 == it->wLen) + if(1 == it->word.size()) { it = wordInfos.erase(it); } @@ -285,79 +282,68 @@ namespace CppJieba bool KeyWordExt::_filterSubstr(vector& wordInfos) { - vector tmp ; + vector tmp ; for(uint i = 0; i < wordInfos.size(); i++) { tmp.push_back(wordInfos[i].word); } - set subs; - for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); it ++) - { - for(uint j = 0; j < tmp.size(); j++) - { - if(it->word != tmp[j] && string::npos != tmp[j].find(it->word, 0)) - { - subs.insert(it->word); - } - } - } - //erase subs from strs for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) { - if(subs.end() != subs.find(it->word)) - { - it = wordInfos.erase(it); - } - else - { - it ++; - } + if(_isSubIn(tmp, it->word)) + { + it = wordInfos.erase(it); + } + else + { + it++; + } } + return true; } - bool KeyWordExt::_isContainSubWords(const string& word) - { - for(uint i = 0; i < _priorSubWords.size(); i++) - { - if(string::npos != word.find(_priorSubWords[i])) - { - return true; - } - } - return false; - } + //bool KeyWordExt::_isContainSubWords(const string& word) + //{ + // for(uint i = 0; i < _priorSubWords.size(); i++) + // { + // if(string::npos != word.find(_priorSubWords[i])) + // { + // return true; + // } + // } + // return false; + //} - bool KeyWordExt::_prioritizeSubWords(vector& wordInfos) - { - if(2 > wordInfos.size()) - { - return true; - } + //bool KeyWordExt::_prioritizeSubWords(vector& wordInfos) + //{ + // if(2 > wordInfos.size()) + // { + // return true; + // } - KeyWordInfo prior; - bool flag = false; - for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) - { - if(_isContainSubWords(it->word)) - { - prior = *it; - it = wordInfos.erase(it); - flag = true; - break; - } - else - { - it ++; - } - } - if(flag) - { - wordInfos.insert(wordInfos.begin(), prior); - } - return true; - } + // KeyWordInfo prior; + // bool flag = false; + // for(vector::iterator it = wordInfos.begin(); it != wordInfos.end(); ) + // { + // if(_isContainSubWords(it->word)) + // { + // prior = *it; + // it = wordInfos.erase(it); + // flag = true; + // break; + // } + // else + // { + // it ++; + // } + // } + // if(flag) + // { + // wordInfos.insert(wordInfos.begin(), prior); + // } + // return true; + //} } @@ -375,12 +361,6 @@ int main() } ext._loadStopWords("../dicts/stopwords.gbk.v1.0"); - if(!ext._loadPriorSubWords("../dicts/prior.gbk")) - { - cerr<<"err"< res; string line; diff --git a/src/KeyWordExt.h b/src/KeyWordExt.h index 5f9432b..fbac900 100644 --- a/src/KeyWordExt.h +++ b/src/KeyWordExt.h @@ -1,7 +1,7 @@ /************************************ * file enc : ASCII * author : wuyanyi09@gmail.com -************************************/ + ************************************/ #ifndef CPPJIEBA_KEYWORDEXT_H #define CPPJIEBA_KEYWORDEXT_H @@ -11,45 +11,56 @@ namespace CppJieba { - class KeyWordExt - { - private: - MPSegment _segment; - vector _priorSubWords; - set _stopWords; - public: - KeyWordExt(); - ~KeyWordExt(); - bool init(const char* const segDictFile, const char* const stopWordDictFile); - bool dispose(); - - private: - bool _loadStopWords(const char * const filePath); - bool _loadPriorSubWords(const char * const filePath); + class KeyWordExt + { + private: + MPSegment _segment; + //vector _priorSubWords; + set _stopWords; + public: + KeyWordExt(); + ~KeyWordExt(); + bool init(const char* const segDictFile); + bool dispose(); + bool loadStopWords(const char * const filePath); + private: + //bool _loadPriorSubWords(const char * const filePath); - public: - bool extract(const string& title, vector& keyWordInfos, uint topN); - bool extract(const vector& words, vector& keyWordInfos, uint topN); - private: - static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b); - private: - bool _extract(vector& keyWordInfos, uint topN); - bool _extTopN(vector& wordInfos, uint topN); - private: - //sort by word len - idf - bool _sortWLIDF(vector& wordInfos); - private: - bool _filter(vector& ); - bool _filterDuplicate(vector& ); - bool _filterSingleWord(vector& ); - bool _filterSubstr(vector& ); - bool _filterStopWords(vector& ); - private: - bool _prioritizeSubWords(vector& wordInfos); - bool _isContainSubWords(const string& word); + public: + bool extract(const string& title, vector& keyWordInfos, uint topN); + bool extract(const vector& words, vector& keyWordInfos, uint topN); + private: + static bool _wordInfoCompare(const KeyWordInfo& a, const KeyWordInfo& b); + private: + bool _extract(vector& keyWordInfos, uint topN); + bool _extTopN(vector& wordInfos, uint topN); + private: + //sort by word len - idf + bool _sortWLIDF(vector& wordInfos); + private: + bool _filter(vector& ); + bool _filterDuplicate(vector& ); + bool _filterSingleWord(vector& ); + bool _filterSubstr(vector& ); + bool _filterStopWords(vector& ); + private: + inline bool _isSubIn(const vector& words, const Unicode& word)const + { - }; + for(uint j = 0; j < words.size(); j++) + { + if(word != words[j] && words[j].end() != search(words[j].begin(), words[j].end(), word.begin(), word.end())) + { + return true; + } + } + return false; + } + //bool _prioritizeSubWords(vector& wordInfos); + //bool _isContainSubWords(const string& word); + + }; } diff --git a/src/MPSegment.cpp b/src/MPSegment.cpp index 3409e63..2d81308 100644 --- a/src/MPSegment.cpp +++ b/src/MPSegment.cpp @@ -36,22 +36,22 @@ namespace CppJieba return _trie.dispose(); } - bool MPSegment::cutDAG(const string& str, vector& res) + bool MPSegment::cut(const string& str, vector& res) { vector segWordInfos; - if(!cutDAG(str, segWordInfos)) + if(!cut(str, segWordInfos)) { return false; } res.clear(); for(uint i = 0; i < segWordInfos.size(); i++) { - res.push_back(segWordInfos[i].word); + res.push_back(TransCode::vecToStr(segWordInfos[i].word.begin(), segWordInfos[i].word.end())); } return true; } - bool MPSegment::cutDAG(const string& str, vector& segWordInfos) + bool MPSegment::cut(const string& str, vector& segWordInfos) { if(str.empty()) { @@ -59,13 +59,19 @@ namespace CppJieba } segWordInfos.clear(); SegmentContext segContext; - - if(!TransCode::strToVec(str, segContext.uintVec)) + Unicode sentence; + + if(!TransCode::strToVec(str, sentence)) { LogError("TransCode::strToVec failed."); return false; } - + + for(uint i = 0; i < sentence.size(); i++) + { + segContext.push_back(SegmentChar(sentence[i])); + } + //calc DAG if(!_calcDAG(segContext)) { @@ -79,9 +85,9 @@ namespace CppJieba return false; } - if(!_cutDAG(segContext, segWordInfos)) + if(!_cut(segContext, segWordInfos)) { - LogError("_cutDAG failed."); + LogError("_cut failed."); return false; } @@ -90,111 +96,150 @@ namespace CppJieba bool MPSegment::_calcDAG(SegmentContext& segContext) { - if(segContext.uintVec.empty()) + if(segContext.empty()) { + LogError("segContext empty."); return false; } - vector > vec; - Unicode::const_iterator beginIter = segContext.uintVec.begin(); - for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++) - { - vec.clear(); - vec.push_back(pair(iterI - beginIter, NULL)); - for(Unicode::const_iterator iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++) - { - //care: the iterJ exceed iterEnd - const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1); - if(NULL != ptNodeInfo) - { - vec.push_back(pair(iterJ - beginIter, ptNodeInfo)); - } - } - segContext.dag.push_back(vec); - } - return true; + + Unicode unicode; + for(uint i = 0; i < segContext.size(); i++) + { + unicode.clear(); + for(uint j = i ; j < segContext.size(); j++) + { + unicode.push_back(segContext[j].uniCh); + const TrieNodeInfo* pInfo = _trie.find(unicode); + if(pInfo) + { + segContext[i].dag[j] = pInfo; + } + } + if(segContext[i].dag.end() == segContext[i].dag.find(i)) + { + segContext[i].dag[i] = NULL; + } + } + return true; + //vector > vec; + //Unicode::const_iterator beginIter = segContext.uintVec.begin(); + //for(Unicode::const_iterator iterI = segContext.uintVec.begin(); iterI != segContext.uintVec.end(); iterI++) + //{ + // vec.clear(); + // vec.push_back(pair(iterI - beginIter, NULL)); + // for(Unicode::const_iterator iterJ = iterI + 1; iterJ != segContext.uintVec.end(); iterJ++) + // { + // //care: the iterJ exceed iterEnd + // const TrieNodeInfo* ptNodeInfo = _trie.find(iterI, iterJ + 1); + // if(NULL != ptNodeInfo) + // { + // vec.push_back(pair(iterJ - beginIter, ptNodeInfo)); + // } + // } + // segContext.dag.push_back(vec); + //} + //return true; } bool MPSegment::_calcDP(SegmentContext& segContext) { - if(segContext.uintVec.empty()) + if(segContext.empty()) { - LogError("uintVec illegal"); + LogError("segContext empty"); return false; } + + for(int i = segContext.size() - 1; i >= 0; i--) + { + segContext[i].pInfo = NULL; + segContext[i].weight = MIN_DOUBLE; + for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++) + { + uint nextPos = it->first; + const TrieNodeInfo* p = it->second; + double val = 0.0; + if(nextPos + 1 < segContext.size()) + { + val += segContext[nextPos + 1].weight; + } - if(segContext.uintVec.size() != segContext.dag.size()) - { - LogError("dag is illegal!"); - return false; - } - - segContext.dp.assign(segContext.uintVec.size() + 1, pair(NULL, 0.0)); - segContext.dp[segContext.uintVec.size()].first = NULL; - segContext.dp[segContext.uintVec.size()].second = 0.0; - - for(int i = segContext.uintVec.size() - 1; i >= 0; i--) - { - // calc max - segContext.dp[i].first = NULL; - segContext.dp[i].second = MIN_DOUBLE; - for(uint j = 0; j < segContext.dag[i].size(); j++) - { - const pair& p = segContext.dag[i][j]; - int pos = p.first; - double val = segContext.dp[pos+1].second; - if(NULL != p.second) - { - val += (p.second)->logFreq; - } - else - { + if(p) + { + val += p->logFreq; + } + else + { val += _trie.getMinLogFreq(); - } - - if(val > segContext.dp[i].second) + } + if(val > segContext[i].weight) { - segContext.dp[i].first = p.second; - segContext.dp[i].second = val; + segContext[i].pInfo = p; + segContext[i].weight = val; } - } - } - segContext.dp.pop_back(); - return true; + } + } + return true; + + //segContext.dp.assign(segContext.uintVec.size() + 1, pair(NULL, 0.0)); + //segContext.dp[segContext.uintVec.size()].first = NULL; + //segContext.dp[segContext.uintVec.size()].second = 0.0; + + //for(int i = segContext.uintVec.size() - 1; i >= 0; i--) + //{ + // // calc max + // segContext.dp[i].first = NULL; + // segContext.dp[i].second = MIN_DOUBLE; + // for(uint j = 0; j < segContext.dag[i].size(); j++) + // { + // const pair& p = segContext.dag[i][j]; + // int pos = p.first; + // double val = segContext.dp[pos+1].second; + // if(NULL != p.second) + // { + // val += (p.second)->logFreq; + // } + // else + // { + // val += _trie.getMinLogFreq(); + // } + + // if(val > segContext.dp[i].second) + // { + // segContext.dp[i].first = p.second; + // segContext.dp[i].second = val; + // } + // } + //} + //segContext.dp.pop_back(); + //return true; } - bool MPSegment::_cutDAG(SegmentContext& segContext, vector& res) + bool MPSegment::_cut(SegmentContext& segContext, vector& res) { - if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size()) - { - LogError("dp or uintVec illegal!"); - return false; - } + //if(segContext.dp.empty() || segContext.uintVec.empty() || segContext.dp.size() != segContext.uintVec.size()) + //{ + // LogFatal("dp or uintVec illegal!"); + // return false; + //} res.clear(); - Unicode::const_iterator iterBegin = segContext.uintVec.begin(); uint i = 0; - while(i < segContext.dp.size()) + while(i < segContext.size()) { - const TrieNodeInfo* p = segContext.dp[i].first; - if(NULL == p) + const TrieNodeInfo* p = segContext[i].pInfo; + if(p) + { + res.push_back(*p); + i += p->word.size(); + } + else//single chinese word { TrieNodeInfo nodeInfo; - nodeInfo.word = TransCode::vecToStr(iterBegin + i, iterBegin + i +1); - nodeInfo.wLen = 1; + nodeInfo.word.push_back(segContext[i].uniCh); nodeInfo.freq = 0; nodeInfo.logFreq = _trie.getMinLogFreq(); res.push_back(nodeInfo); - i ++; - } - else - { - res.push_back(*p); - if(0 == p->wLen) - { - LogFatal("TrieNodeInfo's wLen is 0!"); - return false; - } - i += p->wLen; + i++; } } return true; @@ -223,7 +268,7 @@ int main() while(getline(ifile, line)) { res.clear(); - segment.cutDAG(line, res); + segment.cut(line, res); PRINT_VECTOR(res); getchar(); } diff --git a/src/MPSegment.h b/src/MPSegment.h index f4b5439..ca6670e 100644 --- a/src/MPSegment.h +++ b/src/MPSegment.h @@ -13,6 +13,9 @@ namespace CppJieba { + + typedef vector SegmentContext; + class MPSegment { private: @@ -25,15 +28,14 @@ namespace CppJieba bool init(const char* const filePath); bool dispose(); public: - bool cutDAG(const string& str, vector& segWordInfos); - bool cutDAG(const string& str, vector& res); + bool cut(const string& str, vector& segWordInfos); + bool cut(const string& str, vector& res); private: bool _calcDAG(SegmentContext& segContext); bool _calcDP(SegmentContext& segContext); - bool _cutDAG(SegmentContext& segContext, vector& res); + bool _cut(SegmentContext& segContext, vector& res); - //bool _fill(const string& ) }; } diff --git a/src/MixSegment.cpp b/src/MixSegment.cpp index 4b592d0..2b73510 100644 --- a/src/MixSegment.cpp +++ b/src/MixSegment.cpp @@ -35,12 +35,12 @@ namespace CppJieba bool MixSegment::cut(const string& str, vector& res) { vector infos; - if(!_mpSeg.cutDAG(str, infos)) + if(!_mpSeg.cut(str, infos)) { LogError("_mpSeg cutDAG failed."); return false; } - for(uint = 0; i < infos.size(); i++) + for(uint i= 0; i < infos.size(); i++) { } diff --git a/src/Trie.cpp b/src/Trie.cpp index 6bed239..35ae3ce 100644 --- a/src/Trie.cpp +++ b/src/Trie.cpp @@ -109,9 +109,11 @@ namespace CppJieba LogError(string_format("line[%s] illegal.", line.c_str())); return false; } - nodeInfo.word = vecBuf[0]; + if(!TransCode::strToVec(vecBuf[0], nodeInfo.word)) + { + return false; + } nodeInfo.freq = atoi(vecBuf[1].c_str()); - nodeInfo.wLen = TransCode::getWordLength(nodeInfo.word); if(3 == vecBuf.size()) { nodeInfo.tag = vecBuf[2]; @@ -193,7 +195,7 @@ namespace CppJieba return res; } - const TrieNodeInfo* Trie::find(const string& str) + TrieNodeInfo* Trie::find(const string& str) { Unicode uintVec; bool retFlag = TransCode::strToVec(str, uintVec); @@ -204,7 +206,7 @@ namespace CppJieba return find(uintVec); } - const TrieNodeInfo* Trie::find(const Unicode& uintVec) + TrieNodeInfo* Trie::find(const Unicode& uintVec) { if(uintVec.empty()) { @@ -213,7 +215,7 @@ namespace CppJieba return find(uintVec.begin(), uintVec.end()); } - const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end) + TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end) { if(!_getInitFlag()) @@ -324,16 +326,8 @@ namespace CppJieba return false; } - const string& word = nodeInfo.word; - - Unicode uintVec; - bool retFlag = TransCode::strToVec(word, uintVec); - if(!retFlag) - { - LogError("TransCode::strToVec error."); - return false; - } + const Unicode& uintVec = nodeInfo.word; TrieNode* p = _root; for(uint i = 0; i < uintVec.size(); i++) { diff --git a/src/Trie.h b/src/Trie.h index f991f8b..ea74882 100644 --- a/src/Trie.h +++ b/src/Trie.h @@ -66,9 +66,9 @@ namespace CppJieba bool _getInitFlag(); public: - const TrieNodeInfo* find(const string& str); - const TrieNodeInfo* find(const Unicode& uintVec); - const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end); + TrieNodeInfo* find(const string& str); + TrieNodeInfo* find(const Unicode& uintVec); + TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end); const TrieNodeInfo* findPrefix(const string& str); public: diff --git a/src/globals.h b/src/globals.h index 5b3937e..62d7069 100644 --- a/src/globals.h +++ b/src/globals.h @@ -27,7 +27,6 @@ namespace CppJieba typedef unordered_map TrieNodeMap; typedef unordered_map EmitProbMap; - const double MIN_DOUBLE = -3.14e+100; const double MAX_DOUBLE = 3.14e+100; } diff --git a/src/structs.h b/src/structs.h index 30045af..0ff505d 100644 --- a/src/structs.h +++ b/src/structs.h @@ -4,35 +4,63 @@ #include #include "globals.h" #include "Trie.h" +#include "TransCode.h" namespace CppJieba { struct TrieNodeInfo { - string word; - size_t wLen;// the word's len , not string.length(), + //string word; + //size_t wLen;// the word's len , not string.length(), + Unicode word; size_t freq; string tag; double logFreq; //logFreq = log(freq/sum(freq)); - TrieNodeInfo():wLen(0),freq(0),logFreq(0.0) + TrieNodeInfo():freq(0),logFreq(0.0) { } - TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), wLen(nodeInfo.wLen), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq) + TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq) { } - TrieNodeInfo(const string& _word):word(_word),freq(0),logFreq(MIN_DOUBLE) + TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE) { - wLen = TransCode::getWordLength(_word); } }; + + typedef unordered_map DagType; + struct SegmentChar + { + uint16_t uniCh; + DagType dag; + const TrieNodeInfo * pInfo; + double weight; + + SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0) + { + } + + /*const TrieNodeInfo* pInfo; + double weight; + SegmentChar(uint16_t unich, const TrieNodeInfo* p, double w):uniCh(unich), pInfo(p), weight(w) + { + }*/ + }; + /* + struct SegmentContext + { + vector context; + bool getDA + };*/ + typedef vector SegmentContext; - struct SegmentContext//: public TrieNodeInfo - { - vector uintVec; - vector< vector > > dag; - vector< pair > dp; - }; + //struct SegmentContext + //{ + // vector context; + // //vector uintVec; + // //vector< vector > > dag; + // //vector< pair > dp; + //}; /* struct SegmentWordInfo: public TrieNodeInfo @@ -48,7 +76,7 @@ namespace CppJieba KeyWordInfo():idf(0.0),weight(0.0) { } - KeyWordInfo(const string& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0) + KeyWordInfo(const Unicode& _word):TrieNodeInfo(_word),idf(0.0),weight(0.0) { } KeyWordInfo(const TrieNodeInfo& trieNodeInfo):TrieNodeInfo(trieNodeInfo) @@ -56,13 +84,12 @@ namespace CppJieba } string toString() const { - return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf); + return string_format("{word:%s,weight:%lf, idf:%lf}", TransCode::vecToStr(word.begin(), word.end()).c_str(), weight, idf); } KeyWordInfo& operator = (const TrieNodeInfo& trieNodeInfo) { word = trieNodeInfo.word; freq = trieNodeInfo.freq; - wLen = trieNodeInfo.wLen; tag = trieNodeInfo.tag; logFreq = trieNodeInfo.logFreq; return *this;