Move *NodeInfo into structs.h and derive WordInfo from TrieNodeInfo

This commit is contained in:
gwdwyy 2013-08-18 14:04:15 +08:00
parent abb507a029
commit ff34095252
7 changed files with 78 additions and 90 deletions

View File

@ -89,25 +89,6 @@ namespace CppJieba
bool KeyWordExt::_sortWLIDF(vector<WordInfo>& wordInfos)
{
//size_t wLenSum = 0;
for(uint i = 0; i < wordInfos.size(); i++)
{
wordInfos[i].wLen = TransCode::getWordLength(wordInfos[i].word);
if(0 == wordInfos[i].wLen)
{
LogFatal("wLen is 0");
return false;
}
//wLenSum += wordInfos[i].wLen;
}
/*
if(0 == wLenSum)
{
LogFatal("wLenSum == 0.");
return false;
}*/
for(uint i = 0; i < wordInfos.size(); i++)
{
WordInfo& wInfo = wordInfos[i];

View File

@ -6,39 +6,11 @@
#define CPPJIEBA_KEYWORDEXT_H
#include "Segment.h"
#include "structs.h"
namespace CppJieba
{
// Per-word record: the word text, its length, and its weight and idf values.
struct WordInfo
{
    string word;   // the word text
    size_t wLen;   // length of the word
    double weight;
    double idf;

    // Starts with an empty word and every numeric field set to zero.
    WordInfo(): wLen(0), weight(0.0), idf(0.0)
    {
    }

    // Formats the record as "{word:...,wLen:... weight:..., idf:...}".
    // NOTE(review): assumes string_format follows printf conversion rules -- confirm.
    string getInfoStr() const
    {
        // wLen is a size_t; it is widened to unsigned long and printed with %lu
        // because handing a size_t to %d mismatches the expected argument type.
        return string_format("{word:%s,wLen:%lu weight:%lf, idf:%lf}", word.c_str(), static_cast<unsigned long>(wLen), weight, idf);
    }
};
// Collects getInfoStr() of every element of vec, in order, and hands the
// collected strings to joinStr together with "," (presumably the separator).
inline string joinWordInfos(const vector<WordInfo>& vec)
{
    vector<string> infoStrs;
    for(vector<WordInfo>::const_iterator it = vec.begin(); it != vec.end(); ++it)
    {
        infoStrs.push_back(it->getInfoStr());
    }
    return joinStr(infoStrs, ",");
}
class KeyWordExt
{
private:

View File

@ -112,13 +112,12 @@ namespace CppJieba
{
return false;
}
typedef VUINT16_CONST_ITER UCI;
UCI beginIter = unicode.begin();
for(UCI iterI = unicode.begin(); iterI != unicode.end(); iterI++)
VUINT16_CONST_ITER beginIter = unicode.begin();
for(VUINT16_CONST_ITER iterI = unicode.begin(); iterI != unicode.end(); iterI++)
{
vector<uint> vec;
vec.push_back(iterI - beginIter);
for(UCI iterJ = iterI + 1; iterJ != unicode.end(); iterJ++)
for(VUINT16_CONST_ITER iterJ = iterI + 1; iterJ != unicode.end(); iterJ++)
{
//care: the iterJ exceed iterEnd
if(NULL != _trie.find(iterI, iterJ + 1))

View File

@ -20,8 +20,8 @@ namespace CppJieba
{
_root = NULL;
_totalCount = 0;
_minWeight = numeric_limits<double>::max();
_freqSum = 0;
_minLogFreq = numeric_limits<double>::max();
_initFlag = false;
}
@ -110,7 +110,8 @@ namespace CppJieba
return false;
}
nodeInfo.word = vecBuf[0];
nodeInfo.count = atoi(vecBuf[1].c_str());
nodeInfo.freq = atoi(vecBuf[1].c_str());
nodeInfo.wLen = TransCode::getWordLength(nodeInfo.word);
if(3 == vecBuf.size())
{
nodeInfo.tag = vecBuf[2];
@ -270,7 +271,7 @@ namespace CppJieba
const TrieNodeInfo * p = find(unicode);
if(NULL != p)
{
return p->weight;
return p->logFreq;
}
else
{
@ -284,7 +285,7 @@ namespace CppJieba
const TrieNodeInfo * p = find(begin, end);
if(NULL != p)
{
return p->weight;
return p->logFreq;
}
else
{
@ -294,12 +295,12 @@ namespace CppJieba
double Trie::getMinWeight()
{
return _minWeight;
return _minLogFreq;
}
int64_t Trie::getTotalCount()
{
return _totalCount;
return _freqSum;
}
bool Trie::_deleteNode(TrieNode* node)
@ -379,21 +380,21 @@ namespace CppJieba
bool Trie::_countWeight()
{
if(_nodeInfoVec.empty() || 0 != _totalCount)
if(_nodeInfoVec.empty() || 0 != _freqSum)
{
LogError("_nodeInfoVec is empty or _totalCount has been counted already.");
LogError("_nodeInfoVec is empty or _freqSum has been counted already.");
return false;
}
//count total freq
//sum up total freq
for(size_t i = 0; i < _nodeInfoVec.size(); i++)
{
_totalCount += _nodeInfoVec[i].count;
_freqSum += _nodeInfoVec[i].freq;
}
if(0 == _totalCount)
if(0 == _freqSum)
{
LogError("_totalCount == 0 .");
LogError("_freqSum == 0 .");
return false;
}
@ -401,15 +402,15 @@ namespace CppJieba
for(uint i = 0; i < _nodeInfoVec.size(); i++)
{
TrieNodeInfo& nodeInfo = _nodeInfoVec[i];
if(0 == nodeInfo.count)
if(0 == nodeInfo.freq)
{
LogFatal("nodeInfo.count == 0!");
LogFatal("nodeInfo.freq == 0!");
return false;
}
nodeInfo.weight = log(double(nodeInfo.count)/double(_totalCount));
if(_minWeight > nodeInfo.weight)
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum));
if(_minLogFreq > nodeInfo.logFreq)
{
_minWeight = nodeInfo.weight;
_minLogFreq = nodeInfo.logFreq;
}
}

View File

@ -18,29 +18,12 @@
#include "cppcommon/logger.h"
#include "TransCode.h"
#include "globals.h"
#include "structs.h"
namespace CppJieba
{
using namespace CPPCOMMON;
using namespace std;
typedef map<uint16_t, struct TrieNode*> TrieNodeMap;
// Per-word record: text, length, count, tag and weight.
struct TrieNodeInfo
{
    string word;
    size_t wLen;   // length of the word, which is not string.size()
    size_t count;
    string tag;
    double weight;

    // word and tag start out empty; the numeric members start at zero.
    TrieNodeInfo(): wLen(0), count(0), weight(0.0)
    {
    }
};
struct TrieNode
{
TrieNodeMap hmap;
@ -60,8 +43,8 @@ namespace CppJieba
TrieNode* _root;
vector<TrieNodeInfo> _nodeInfoVec;
int64_t _totalCount;
double _minWeight;
int64_t _freqSum;
double _minLogFreq;
bool _initFlag;
public:

View File

@ -14,11 +14,13 @@
// Project-wide type aliases shared by the CppJieba sources.
namespace CppJieba
{
using namespace std;
//typedefs
// Shorthand for unsigned int.
typedef unsigned int uint;
// Mutable iterator over a vector of strings.
typedef std::vector<std::string>::iterator VSI;
// Vector of uint16_t values.
typedef std::vector<uint16_t> VUINT16;
// Read-only iterator over a vector of uint16_t values.
typedef std::vector<uint16_t>::const_iterator VUINT16_CONST_ITER;
// Maps a uint16_t key to a TrieNode pointer.
typedef map<uint16_t, struct TrieNode*> TrieNodeMap;
}

50
src/structs.h Normal file
View File

@ -0,0 +1,50 @@
#ifndef CPPJIEBA_STRUCTS_H
#define CPPJIEBA_STRUCTS_H
#include "globals.h"
namespace CppJieba
{
// Per-word record: text, length, frequency, tag and log-frequency.
struct TrieNodeInfo
{
    string word;
    size_t wLen;    // length of the word, which is not string.length()
    size_t freq;
    string tag;
    double logFreq; // log(freq/sum(freq))

    // word and tag start out empty; the numeric members start at zero.
    TrieNodeInfo(): wLen(0), freq(0), logFreq(0.0)
    {
    }
};
// A TrieNodeInfo extended with an idf value and a weight.
struct WordInfo: public TrieNodeInfo
{
    double idf;
    double weight;// log(wLen+1)*logFreq;

    // idf and weight start at zero; the inherited members are set up by
    // TrieNodeInfo's own constructor.
    WordInfo(): idf(0.0), weight(0.0)
    {
    }

    // Formats the record as "{word:...,wLen:... weight:..., idf:...}".
    // NOTE(review): assumes string_format follows printf conversion rules -- confirm.
    string toString() const
    {
        // wLen is a size_t; it is widened to unsigned long and printed with %lu
        // because handing a size_t to %d mismatches the expected argument type.
        return string_format("{word:%s,wLen:%lu weight:%lf, idf:%lf}", word.c_str(), static_cast<unsigned long>(wLen), weight, idf);
    }
};
// Collects toString() of every element of vec, in order, and hands the
// collected strings to joinStr together with "," (presumably the separator).
inline string joinWordInfos(const vector<WordInfo>& vec)
{
    vector<string> rendered;
    for(vector<WordInfo>::const_iterator it = vec.begin(); it != vec.end(); ++it)
    {
        rendered.push_back(it->toString());
    }
    return joinStr(rendered, ",");
}
}
#endif