Move *NodeInfo into structs.h and derive WordInfo from TrieNodeInfo

This commit is contained in:
gwdwyy 2013-08-18 14:04:15 +08:00
parent abb507a029
commit ff34095252
7 changed files with 78 additions and 90 deletions

View File

@ -89,25 +89,6 @@ namespace CppJieba
bool KeyWordExt::_sortWLIDF(vector<WordInfo>& wordInfos) bool KeyWordExt::_sortWLIDF(vector<WordInfo>& wordInfos)
{ {
//size_t wLenSum = 0;
for(uint i = 0; i < wordInfos.size(); i++)
{
wordInfos[i].wLen = TransCode::getWordLength(wordInfos[i].word);
if(0 == wordInfos[i].wLen)
{
LogFatal("wLen is 0");
return false;
}
//wLenSum += wordInfos[i].wLen;
}
/*
if(0 == wLenSum)
{
LogFatal("wLenSum == 0.");
return false;
}*/
for(uint i = 0; i < wordInfos.size(); i++) for(uint i = 0; i < wordInfos.size(); i++)
{ {
WordInfo& wInfo = wordInfos[i]; WordInfo& wInfo = wordInfos[i];

View File

@ -6,39 +6,11 @@
#define CPPJIEBA_KEYWORDEXT_H #define CPPJIEBA_KEYWORDEXT_H
#include "Segment.h" #include "Segment.h"
#include "structs.h"
namespace CppJieba namespace CppJieba
{ {
struct WordInfo
{
string word;
size_t wLen;
double weight;
double idf;
WordInfo()
{
word = "";
wLen = 0;
weight = 0.0;
idf = 0.0;
}
string getInfoStr() const
{
return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
}
};
inline string joinWordInfos(const vector<WordInfo>& vec)
{
vector<string> tmp;
for(uint i = 0; i < vec.size(); i++)
{
tmp.push_back(vec[i].getInfoStr());
}
return joinStr(tmp, ",");
}
class KeyWordExt class KeyWordExt
{ {
private: private:

View File

@ -112,13 +112,12 @@ namespace CppJieba
{ {
return false; return false;
} }
typedef VUINT16_CONST_ITER UCI; VUINT16_CONST_ITER beginIter = unicode.begin();
UCI beginIter = unicode.begin(); for(VUINT16_CONST_ITER iterI = unicode.begin(); iterI != unicode.end(); iterI++)
for(UCI iterI = unicode.begin(); iterI != unicode.end(); iterI++)
{ {
vector<uint> vec; vector<uint> vec;
vec.push_back(iterI - beginIter); vec.push_back(iterI - beginIter);
for(UCI iterJ = iterI + 1; iterJ != unicode.end(); iterJ++) for(VUINT16_CONST_ITER iterJ = iterI + 1; iterJ != unicode.end(); iterJ++)
{ {
//care: the iterJ exceed iterEnd //care: the iterJ exceed iterEnd
if(NULL != _trie.find(iterI, iterJ + 1)) if(NULL != _trie.find(iterI, iterJ + 1))

View File

@ -20,8 +20,8 @@ namespace CppJieba
{ {
_root = NULL; _root = NULL;
_totalCount = 0; _freqSum = 0;
_minWeight = numeric_limits<double>::max(); _minLogFreq = numeric_limits<double>::max();
_initFlag = false; _initFlag = false;
} }
@ -110,7 +110,8 @@ namespace CppJieba
return false; return false;
} }
nodeInfo.word = vecBuf[0]; nodeInfo.word = vecBuf[0];
nodeInfo.count = atoi(vecBuf[1].c_str()); nodeInfo.freq = atoi(vecBuf[1].c_str());
nodeInfo.wLen = TransCode::getWordLength(nodeInfo.word);
if(3 == vecBuf.size()) if(3 == vecBuf.size())
{ {
nodeInfo.tag = vecBuf[2]; nodeInfo.tag = vecBuf[2];
@ -270,7 +271,7 @@ namespace CppJieba
const TrieNodeInfo * p = find(unicode); const TrieNodeInfo * p = find(unicode);
if(NULL != p) if(NULL != p)
{ {
return p->weight; return p->logFreq;
} }
else else
{ {
@ -284,7 +285,7 @@ namespace CppJieba
const TrieNodeInfo * p = find(begin, end); const TrieNodeInfo * p = find(begin, end);
if(NULL != p) if(NULL != p)
{ {
return p->weight; return p->logFreq;
} }
else else
{ {
@ -294,12 +295,12 @@ namespace CppJieba
double Trie::getMinWeight() double Trie::getMinWeight()
{ {
return _minWeight; return _minLogFreq;
} }
int64_t Trie::getTotalCount() int64_t Trie::getTotalCount()
{ {
return _totalCount; return _freqSum;
} }
bool Trie::_deleteNode(TrieNode* node) bool Trie::_deleteNode(TrieNode* node)
@ -379,21 +380,21 @@ namespace CppJieba
bool Trie::_countWeight() bool Trie::_countWeight()
{ {
if(_nodeInfoVec.empty() || 0 != _totalCount) if(_nodeInfoVec.empty() || 0 != _freqSum)
{ {
LogError("_nodeInfoVec is empty or _totalCount has been counted already."); LogError("_nodeInfoVec is empty or _freqSum has been counted already.");
return false; return false;
} }
//count total freq //freq total freq
for(size_t i = 0; i < _nodeInfoVec.size(); i++) for(size_t i = 0; i < _nodeInfoVec.size(); i++)
{ {
_totalCount += _nodeInfoVec[i].count; _freqSum += _nodeInfoVec[i].freq;
} }
if(0 == _totalCount) if(0 == _freqSum)
{ {
LogError("_totalCount == 0 ."); LogError("_freqSum == 0 .");
return false; return false;
} }
@ -401,15 +402,15 @@ namespace CppJieba
for(uint i = 0; i < _nodeInfoVec.size(); i++) for(uint i = 0; i < _nodeInfoVec.size(); i++)
{ {
TrieNodeInfo& nodeInfo = _nodeInfoVec[i]; TrieNodeInfo& nodeInfo = _nodeInfoVec[i];
if(0 == nodeInfo.count) if(0 == nodeInfo.freq)
{ {
LogFatal("nodeInfo.count == 0!"); LogFatal("nodeInfo.freq == 0!");
return false; return false;
} }
nodeInfo.weight = log(double(nodeInfo.count)/double(_totalCount)); nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum));
if(_minWeight > nodeInfo.weight) if(_minLogFreq > nodeInfo.logFreq)
{ {
_minWeight = nodeInfo.weight; _minLogFreq = nodeInfo.logFreq;
} }
} }

View File

@ -18,29 +18,12 @@
#include "cppcommon/logger.h" #include "cppcommon/logger.h"
#include "TransCode.h" #include "TransCode.h"
#include "globals.h" #include "globals.h"
#include "structs.h"
namespace CppJieba namespace CppJieba
{ {
using namespace CPPCOMMON; using namespace CPPCOMMON;
using namespace std;
typedef map<uint16_t, struct TrieNode*> TrieNodeMap;
struct TrieNodeInfo
{
string word;
size_t wLen;// the word's len , not string.size(),
size_t count;
string tag;
double weight;
TrieNodeInfo()
{
wLen = 0;
count = 0;
weight = 0.0;
}
};
struct TrieNode struct TrieNode
{ {
TrieNodeMap hmap; TrieNodeMap hmap;
@ -60,8 +43,8 @@ namespace CppJieba
TrieNode* _root; TrieNode* _root;
vector<TrieNodeInfo> _nodeInfoVec; vector<TrieNodeInfo> _nodeInfoVec;
int64_t _totalCount; int64_t _freqSum;
double _minWeight; double _minLogFreq;
bool _initFlag; bool _initFlag;
public: public:

View File

@ -14,11 +14,13 @@
namespace CppJieba namespace CppJieba
{ {
using namespace std;
//typedefs //typedefs
typedef unsigned int uint; typedef unsigned int uint;
typedef std::vector<std::string>::iterator VSI; typedef std::vector<std::string>::iterator VSI;
typedef std::vector<uint16_t> VUINT16; typedef std::vector<uint16_t> VUINT16;
typedef std::vector<uint16_t>::const_iterator VUINT16_CONST_ITER; typedef std::vector<uint16_t>::const_iterator VUINT16_CONST_ITER;
typedef map<uint16_t, struct TrieNode*> TrieNodeMap;
} }

50
src/structs.h Normal file
View File

@ -0,0 +1,50 @@
#ifndef CPPJIEBA_STRUCTS_H
#define CPPJIEBA_STRUCTS_H
#include "globals.h"
namespace CppJieba
{
// Per-word record: the word's text, its length, its frequency, an optional
// tag, and the log of its relative frequency.
struct TrieNodeInfo
{
    string word;    // the word's text
    size_t wLen;    // the word's length; this is NOT word.length()
    size_t freq;    // frequency of the word
    string tag;     // tag of the word; left empty unless assigned
    double logFreq; // log(freq/sum(freq))

    // Creates an empty record: empty strings and zeroed numeric fields.
    TrieNodeInfo()
        : word(), wLen(0), freq(0), tag(), logFreq(0.0)
    {
    }
};
// A TrieNodeInfo extended with the two scores used for keyword ranking.
struct WordInfo: public TrieNodeInfo
{
    double idf;
    double weight;// log(wLen+1)*logFreq;

    // Zeroes both scores; the TrieNodeInfo base constructor zeroes the rest.
    WordInfo()
        : idf(0.0), weight(0.0)
    {
    }

    // Returns a one-line, human-readable dump of word, wLen, weight and idf.
    string toString() const
    {
        // wLen is a size_t; passing it to "%d" is an argument/format mismatch
        // (undefined behaviour where size_t is wider than int). Cast to
        // unsigned long and print with "%lu" so the argument matches the
        // conversion.
        // NOTE(review): assumes string_format follows printf conventions - confirm.
        return string_format("{word:%s,wLen:%lu weight:%lf, idf:%lf}",
                             word.c_str(),
                             static_cast<unsigned long>(wLen),
                             weight,
                             idf);
    }
};
inline string joinWordInfos(const vector<WordInfo>& vec)
{
vector<string> tmp;
for(uint i = 0; i < vec.size(); i++)
{
tmp.push_back(vec[i].toString());
}
return joinStr(tmp, ",");
}
}
#endif