From ff34095252d23aa162e0c0fcbaf7851e5c853267 Mon Sep 17 00:00:00 2001 From: gwdwyy Date: Sun, 18 Aug 2013 14:04:15 +0800 Subject: [PATCH] mv *NodeInfo into structs.h and WordInfo is derived from TrieNodeInfo --- src/KeyWordExt.cpp | 19 ------------------ src/KeyWordExt.h | 30 +--------------------------- src/Segment.cpp | 7 +++---- src/Trie.cpp | 37 +++++++++++++++++----------------- src/Trie.h | 23 +++------------------ src/globals.h | 2 ++ src/structs.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 78 insertions(+), 90 deletions(-) create mode 100644 src/structs.h diff --git a/src/KeyWordExt.cpp b/src/KeyWordExt.cpp index 55c7beb..fdb440c 100644 --- a/src/KeyWordExt.cpp +++ b/src/KeyWordExt.cpp @@ -89,25 +89,6 @@ namespace CppJieba bool KeyWordExt::_sortWLIDF(vector& wordInfos) { - //size_t wLenSum = 0; - for(uint i = 0; i < wordInfos.size(); i++) - { - wordInfos[i].wLen = TransCode::getWordLength(wordInfos[i].word); - if(0 == wordInfos[i].wLen) - { - LogFatal("wLen is 0"); - return false; - } - //wLenSum += wordInfos[i].wLen; - } - - /* - if(0 == wLenSum) - { - LogFatal("wLenSum == 0."); - return false; - }*/ - for(uint i = 0; i < wordInfos.size(); i++) { WordInfo& wInfo = wordInfos[i]; diff --git a/src/KeyWordExt.h b/src/KeyWordExt.h index f204211..51f33bf 100644 --- a/src/KeyWordExt.h +++ b/src/KeyWordExt.h @@ -6,39 +6,11 @@ #define CPPJIEBA_KEYWORDEXT_H #include "Segment.h" +#include "structs.h" namespace CppJieba { - struct WordInfo - { - string word; - size_t wLen; - double weight; - double idf; - WordInfo() - { - word = ""; - wLen = 0; - weight = 0.0; - idf = 0.0; - } - string getInfoStr() const - { - return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf); - } - }; - - inline string joinWordInfos(const vector& vec) - { - vector tmp; - for(uint i = 0; i < vec.size(); i++) - { - tmp.push_back(vec[i].getInfoStr()); - } - return joinStr(tmp, ","); - } - class KeyWordExt { private: diff --git a/src/Segment.cpp b/src/Segment.cpp index 8254b39..3b58745 100644 --- a/src/Segment.cpp +++ b/src/Segment.cpp @@ -112,13 +112,12 @@ namespace CppJieba { return false; } - typedef VUINT16_CONST_ITER UCI; - UCI beginIter = unicode.begin(); - for(UCI iterI = unicode.begin(); iterI != unicode.end(); iterI++) + VUINT16_CONST_ITER beginIter = unicode.begin(); + for(VUINT16_CONST_ITER iterI = unicode.begin(); iterI != unicode.end(); iterI++) { vector vec; vec.push_back(iterI - beginIter); - for(UCI iterJ = iterI + 1; iterJ != unicode.end(); iterJ++) + for(VUINT16_CONST_ITER iterJ = iterI + 1; iterJ != unicode.end(); iterJ++) { //care: the iterJ exceed iterEnd if(NULL != _trie.find(iterI, iterJ + 1)) diff --git a/src/Trie.cpp b/src/Trie.cpp index 66215b2..2d913b5 100644 --- a/src/Trie.cpp +++ b/src/Trie.cpp @@ -20,8 +20,8 @@ namespace CppJieba { _root = NULL; - _totalCount = 0; - _minWeight = numeric_limits::max(); + _freqSum = 0; + _minLogFreq = numeric_limits::max(); _initFlag = false; } @@ -110,7 +110,8 @@ namespace CppJieba return false; } nodeInfo.word = vecBuf[0]; - nodeInfo.count = atoi(vecBuf[1].c_str()); + nodeInfo.freq = atoi(vecBuf[1].c_str()); + nodeInfo.wLen = TransCode::getWordLength(nodeInfo.word); if(3 == vecBuf.size()) { nodeInfo.tag = vecBuf[2]; @@ -270,7 +271,7 @@ namespace CppJieba const TrieNodeInfo * p = find(unicode); if(NULL != p) { - return p->weight; + return p->logFreq; } else { @@ -284,7 +285,7 @@ namespace CppJieba const TrieNodeInfo * p = find(begin, end); if(NULL != p) { - return p->weight; + return p->logFreq; } else { @@ -294,12 +295,12 @@ namespace CppJieba double Trie::getMinWeight() { - return _minWeight; + return _minLogFreq; } int64_t Trie::getTotalCount() { - return _totalCount; + return _freqSum; } bool Trie::_deleteNode(TrieNode* node) @@ -379,21 +380,21 @@ namespace CppJieba bool Trie::_countWeight() { - if(_nodeInfoVec.empty() || 0 != _totalCount) + if(_nodeInfoVec.empty() || 0 != _freqSum) { - LogError("_nodeInfoVec is empty or _totalCount has been counted already."); + LogError("_nodeInfoVec is empty or _freqSum has been counted already."); return false; } - //count total freq + //freq total freq for(size_t i = 0; i < _nodeInfoVec.size(); i++) { - _totalCount += _nodeInfoVec[i].count; + _freqSum += _nodeInfoVec[i].freq; } - if(0 == _totalCount) + if(0 == _freqSum) { - LogError("_totalCount == 0 ."); + LogError("_freqSum == 0 ."); return false; } @@ -401,15 +402,15 @@ namespace CppJieba for(uint i = 0; i < _nodeInfoVec.size(); i++) { TrieNodeInfo& nodeInfo = _nodeInfoVec[i]; - if(0 == nodeInfo.count) + if(0 == nodeInfo.freq) { - LogFatal("nodeInfo.count == 0!"); + LogFatal("nodeInfo.freq == 0!"); return false; } - nodeInfo.weight = log(double(nodeInfo.count)/double(_totalCount)); - if(_minWeight > nodeInfo.weight) + nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum)); + if(_minLogFreq > nodeInfo.logFreq) { - _minWeight = nodeInfo.weight; + _minLogFreq = nodeInfo.logFreq; } } diff --git a/src/Trie.h b/src/Trie.h index 574adfa..68c7998 100644 --- a/src/Trie.h +++ b/src/Trie.h @@ -18,29 +18,12 @@ #include "cppcommon/logger.h" #include "TransCode.h" #include "globals.h" +#include "structs.h" namespace CppJieba { using namespace CPPCOMMON; - using namespace std; - typedef map TrieNodeMap; - - struct TrieNodeInfo - { - string word; - size_t wLen;// the word's len , not string.size(), - size_t count; - string tag; - double weight; - TrieNodeInfo() - { - wLen = 0; - count = 0; - weight = 0.0; - } - }; - struct TrieNode { TrieNodeMap hmap; @@ -60,8 +43,8 @@ namespace CppJieba TrieNode* _root; vector _nodeInfoVec; - int64_t _totalCount; - double _minWeight; + int64_t _freqSum; + double _minLogFreq; bool _initFlag; public: diff --git a/src/globals.h b/src/globals.h index 709e67b..01d2ce6 100644 --- a/src/globals.h +++ b/src/globals.h @@ -14,11 +14,13 @@ namespace CppJieba { + using namespace std; //typedefs typedef unsigned int uint; typedef std::vector::iterator VSI; typedef std::vector VUINT16; typedef std::vector::const_iterator VUINT16_CONST_ITER; + typedef map TrieNodeMap; } diff --git a/src/structs.h b/src/structs.h new file mode 100644 index 0000000..13b70a6 --- /dev/null +++ b/src/structs.h @@ -0,0 +1,50 @@ +#ifndef CPPJIEBA_STRUCTS_H +#define CPPJIEBA_STRUCTS_H + +#include "globals.h" + +namespace CppJieba +{ + + struct TrieNodeInfo + { + string word; + size_t wLen;// the word's len , not string.length(), + size_t freq; + string tag; + double logFreq;//log(freq/sum(freq)); + TrieNodeInfo() + { + wLen = 0; + freq = 0; + logFreq = 0.0; + } + }; + + struct WordInfo: public TrieNodeInfo + { + double idf; + double weight;// log(wLen+1)*logFreq; + WordInfo() + { + idf = 0.0; + weight = 0.0; + } + string toString() const + { + return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf); + } + }; + + inline string joinWordInfos(const vector& vec) + { + vector tmp; + for(uint i = 0; i < vec.size(); i++) + { + tmp.push_back(vec[i].toString()); + } + return joinStr(tmp, ","); + } +} + +#endif