mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
mv *NodeInfo into structs.h and WordInfo is derived from TrieNodeInfo
This commit is contained in:
parent
abb507a029
commit
ff34095252
@ -89,25 +89,6 @@ namespace CppJieba
|
||||
|
||||
bool KeyWordExt::_sortWLIDF(vector<WordInfo>& wordInfos)
|
||||
{
|
||||
//size_t wLenSum = 0;
|
||||
for(uint i = 0; i < wordInfos.size(); i++)
|
||||
{
|
||||
wordInfos[i].wLen = TransCode::getWordLength(wordInfos[i].word);
|
||||
if(0 == wordInfos[i].wLen)
|
||||
{
|
||||
LogFatal("wLen is 0");
|
||||
return false;
|
||||
}
|
||||
//wLenSum += wordInfos[i].wLen;
|
||||
}
|
||||
|
||||
/*
|
||||
if(0 == wLenSum)
|
||||
{
|
||||
LogFatal("wLenSum == 0.");
|
||||
return false;
|
||||
}*/
|
||||
|
||||
for(uint i = 0; i < wordInfos.size(); i++)
|
||||
{
|
||||
WordInfo& wInfo = wordInfos[i];
|
||||
|
@ -6,39 +6,11 @@
|
||||
#define CPPJIEBA_KEYWORDEXT_H
|
||||
|
||||
#include "Segment.h"
|
||||
#include "structs.h"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
|
||||
struct WordInfo
|
||||
{
|
||||
string word;
|
||||
size_t wLen;
|
||||
double weight;
|
||||
double idf;
|
||||
WordInfo()
|
||||
{
|
||||
word = "";
|
||||
wLen = 0;
|
||||
weight = 0.0;
|
||||
idf = 0.0;
|
||||
}
|
||||
string getInfoStr() const
|
||||
{
|
||||
return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
|
||||
}
|
||||
};
|
||||
|
||||
inline string joinWordInfos(const vector<WordInfo>& vec)
|
||||
{
|
||||
vector<string> tmp;
|
||||
for(uint i = 0; i < vec.size(); i++)
|
||||
{
|
||||
tmp.push_back(vec[i].getInfoStr());
|
||||
}
|
||||
return joinStr(tmp, ",");
|
||||
}
|
||||
|
||||
class KeyWordExt
|
||||
{
|
||||
private:
|
||||
|
@ -112,13 +112,12 @@ namespace CppJieba
|
||||
{
|
||||
return false;
|
||||
}
|
||||
typedef VUINT16_CONST_ITER UCI;
|
||||
UCI beginIter = unicode.begin();
|
||||
for(UCI iterI = unicode.begin(); iterI != unicode.end(); iterI++)
|
||||
VUINT16_CONST_ITER beginIter = unicode.begin();
|
||||
for(VUINT16_CONST_ITER iterI = unicode.begin(); iterI != unicode.end(); iterI++)
|
||||
{
|
||||
vector<uint> vec;
|
||||
vec.push_back(iterI - beginIter);
|
||||
for(UCI iterJ = iterI + 1; iterJ != unicode.end(); iterJ++)
|
||||
for(VUINT16_CONST_ITER iterJ = iterI + 1; iterJ != unicode.end(); iterJ++)
|
||||
{
|
||||
//care: the iterJ exceed iterEnd
|
||||
if(NULL != _trie.find(iterI, iterJ + 1))
|
||||
|
37
src/Trie.cpp
37
src/Trie.cpp
@ -20,8 +20,8 @@ namespace CppJieba
|
||||
{
|
||||
|
||||
_root = NULL;
|
||||
_totalCount = 0;
|
||||
_minWeight = numeric_limits<double>::max();
|
||||
_freqSum = 0;
|
||||
_minLogFreq = numeric_limits<double>::max();
|
||||
_initFlag = false;
|
||||
}
|
||||
|
||||
@ -110,7 +110,8 @@ namespace CppJieba
|
||||
return false;
|
||||
}
|
||||
nodeInfo.word = vecBuf[0];
|
||||
nodeInfo.count = atoi(vecBuf[1].c_str());
|
||||
nodeInfo.freq = atoi(vecBuf[1].c_str());
|
||||
nodeInfo.wLen = TransCode::getWordLength(nodeInfo.word);
|
||||
if(3 == vecBuf.size())
|
||||
{
|
||||
nodeInfo.tag = vecBuf[2];
|
||||
@ -270,7 +271,7 @@ namespace CppJieba
|
||||
const TrieNodeInfo * p = find(unicode);
|
||||
if(NULL != p)
|
||||
{
|
||||
return p->weight;
|
||||
return p->logFreq;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -284,7 +285,7 @@ namespace CppJieba
|
||||
const TrieNodeInfo * p = find(begin, end);
|
||||
if(NULL != p)
|
||||
{
|
||||
return p->weight;
|
||||
return p->logFreq;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -294,12 +295,12 @@ namespace CppJieba
|
||||
|
||||
double Trie::getMinWeight()
|
||||
{
|
||||
return _minWeight;
|
||||
return _minLogFreq;
|
||||
}
|
||||
|
||||
int64_t Trie::getTotalCount()
|
||||
{
|
||||
return _totalCount;
|
||||
return _freqSum;
|
||||
}
|
||||
|
||||
bool Trie::_deleteNode(TrieNode* node)
|
||||
@ -379,21 +380,21 @@ namespace CppJieba
|
||||
|
||||
bool Trie::_countWeight()
|
||||
{
|
||||
if(_nodeInfoVec.empty() || 0 != _totalCount)
|
||||
if(_nodeInfoVec.empty() || 0 != _freqSum)
|
||||
{
|
||||
LogError("_nodeInfoVec is empty or _totalCount has been counted already.");
|
||||
LogError("_nodeInfoVec is empty or _freqSum has been counted already.");
|
||||
return false;
|
||||
}
|
||||
|
||||
//count total freq
|
||||
//freq total freq
|
||||
for(size_t i = 0; i < _nodeInfoVec.size(); i++)
|
||||
{
|
||||
_totalCount += _nodeInfoVec[i].count;
|
||||
_freqSum += _nodeInfoVec[i].freq;
|
||||
}
|
||||
|
||||
if(0 == _totalCount)
|
||||
if(0 == _freqSum)
|
||||
{
|
||||
LogError("_totalCount == 0 .");
|
||||
LogError("_freqSum == 0 .");
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -401,15 +402,15 @@ namespace CppJieba
|
||||
for(uint i = 0; i < _nodeInfoVec.size(); i++)
|
||||
{
|
||||
TrieNodeInfo& nodeInfo = _nodeInfoVec[i];
|
||||
if(0 == nodeInfo.count)
|
||||
if(0 == nodeInfo.freq)
|
||||
{
|
||||
LogFatal("nodeInfo.count == 0!");
|
||||
LogFatal("nodeInfo.freq == 0!");
|
||||
return false;
|
||||
}
|
||||
nodeInfo.weight = log(double(nodeInfo.count)/double(_totalCount));
|
||||
if(_minWeight > nodeInfo.weight)
|
||||
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum));
|
||||
if(_minLogFreq > nodeInfo.logFreq)
|
||||
{
|
||||
_minWeight = nodeInfo.weight;
|
||||
_minLogFreq = nodeInfo.logFreq;
|
||||
}
|
||||
}
|
||||
|
||||
|
23
src/Trie.h
23
src/Trie.h
@ -18,29 +18,12 @@
|
||||
#include "cppcommon/logger.h"
|
||||
#include "TransCode.h"
|
||||
#include "globals.h"
|
||||
#include "structs.h"
|
||||
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
using namespace CPPCOMMON;
|
||||
using namespace std;
|
||||
typedef map<uint16_t, struct TrieNode*> TrieNodeMap;
|
||||
|
||||
/// Per-word record stored by the trie.
struct TrieNodeInfo
{
    std::string word;
    size_t wLen;      // the word's length, not word.size()
    size_t count;
    std::string tag;
    double weight;

    /// Zeroes the numeric fields; `word` and `tag` start out empty.
    /// Members are set in the initializer list (declaration order) instead of
    /// being assigned in the constructor body.
    TrieNodeInfo() : word(), wLen(0), count(0), tag(), weight(0.0)
    {
    }
};
|
||||
|
||||
struct TrieNode
|
||||
{
|
||||
TrieNodeMap hmap;
|
||||
@ -60,8 +43,8 @@ namespace CppJieba
|
||||
TrieNode* _root;
|
||||
vector<TrieNodeInfo> _nodeInfoVec;
|
||||
|
||||
int64_t _totalCount;
|
||||
double _minWeight;
|
||||
int64_t _freqSum;
|
||||
double _minLogFreq;
|
||||
bool _initFlag;
|
||||
|
||||
public:
|
||||
|
@ -14,11 +14,13 @@
|
||||
namespace CppJieba
|
||||
{
|
||||
|
||||
using namespace std;
|
||||
//typedefs
|
||||
typedef unsigned int uint;
|
||||
typedef std::vector<std::string>::iterator VSI;
|
||||
typedef std::vector<uint16_t> VUINT16;
|
||||
typedef std::vector<uint16_t>::const_iterator VUINT16_CONST_ITER;
|
||||
typedef map<uint16_t, struct TrieNode*> TrieNodeMap;
|
||||
|
||||
}
|
||||
|
||||
|
50
src/structs.h
Normal file
50
src/structs.h
Normal file
@ -0,0 +1,50 @@
|
||||
#ifndef CPPJIEBA_STRUCTS_H
|
||||
#define CPPJIEBA_STRUCTS_H
|
||||
|
||||
#include "globals.h"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
|
||||
/// Per-word record stored by the trie.
struct TrieNodeInfo
{
    std::string word;
    size_t wLen;      // the word's length, not word.length()
    size_t freq;
    std::string tag;
    double logFreq;   // log(freq/sum(freq))

    /// Zeroes the numeric fields; `word` and `tag` start out empty.
    /// Members are set in the initializer list (declaration order) instead of
    /// being assigned in the constructor body.
    TrieNodeInfo() : word(), wLen(0), freq(0), tag(), logFreq(0.0)
    {
    }
};
|
||||
|
||||
struct WordInfo: public TrieNodeInfo
|
||||
{
|
||||
double idf;
|
||||
double weight;// log(wLen+1)*logFreq;
|
||||
WordInfo()
|
||||
{
|
||||
idf = 0.0;
|
||||
weight = 0.0;
|
||||
}
|
||||
string toString() const
|
||||
{
|
||||
return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
|
||||
}
|
||||
};
|
||||
|
||||
inline string joinWordInfos(const vector<WordInfo>& vec)
|
||||
{
|
||||
vector<string> tmp;
|
||||
for(uint i = 0; i < vec.size(); i++)
|
||||
{
|
||||
tmp.push_back(vec[i].toString());
|
||||
}
|
||||
return joinStr(tmp, ",");
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
Loading…
x
Reference in New Issue
Block a user