mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
mv *NodeInfo into structs.h and WordInfo is derived from TrieNodeInfo
This commit is contained in:
parent
abb507a029
commit
ff34095252
@ -89,25 +89,6 @@ namespace CppJieba
|
|||||||
|
|
||||||
bool KeyWordExt::_sortWLIDF(vector<WordInfo>& wordInfos)
|
bool KeyWordExt::_sortWLIDF(vector<WordInfo>& wordInfos)
|
||||||
{
|
{
|
||||||
//size_t wLenSum = 0;
|
|
||||||
for(uint i = 0; i < wordInfos.size(); i++)
|
|
||||||
{
|
|
||||||
wordInfos[i].wLen = TransCode::getWordLength(wordInfos[i].word);
|
|
||||||
if(0 == wordInfos[i].wLen)
|
|
||||||
{
|
|
||||||
LogFatal("wLen is 0");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
//wLenSum += wordInfos[i].wLen;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
if(0 == wLenSum)
|
|
||||||
{
|
|
||||||
LogFatal("wLenSum == 0.");
|
|
||||||
return false;
|
|
||||||
}*/
|
|
||||||
|
|
||||||
for(uint i = 0; i < wordInfos.size(); i++)
|
for(uint i = 0; i < wordInfos.size(); i++)
|
||||||
{
|
{
|
||||||
WordInfo& wInfo = wordInfos[i];
|
WordInfo& wInfo = wordInfos[i];
|
||||||
|
@ -6,39 +6,11 @@
|
|||||||
#define CPPJIEBA_KEYWORDEXT_H
|
#define CPPJIEBA_KEYWORDEXT_H
|
||||||
|
|
||||||
#include "Segment.h"
|
#include "Segment.h"
|
||||||
|
#include "structs.h"
|
||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba
|
||||||
{
|
{
|
||||||
|
|
||||||
struct WordInfo
|
|
||||||
{
|
|
||||||
string word;
|
|
||||||
size_t wLen;
|
|
||||||
double weight;
|
|
||||||
double idf;
|
|
||||||
WordInfo()
|
|
||||||
{
|
|
||||||
word = "";
|
|
||||||
wLen = 0;
|
|
||||||
weight = 0.0;
|
|
||||||
idf = 0.0;
|
|
||||||
}
|
|
||||||
string getInfoStr() const
|
|
||||||
{
|
|
||||||
return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
inline string joinWordInfos(const vector<WordInfo>& vec)
|
|
||||||
{
|
|
||||||
vector<string> tmp;
|
|
||||||
for(uint i = 0; i < vec.size(); i++)
|
|
||||||
{
|
|
||||||
tmp.push_back(vec[i].getInfoStr());
|
|
||||||
}
|
|
||||||
return joinStr(tmp, ",");
|
|
||||||
}
|
|
||||||
|
|
||||||
class KeyWordExt
|
class KeyWordExt
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
|
@ -112,13 +112,12 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
typedef VUINT16_CONST_ITER UCI;
|
VUINT16_CONST_ITER beginIter = unicode.begin();
|
||||||
UCI beginIter = unicode.begin();
|
for(VUINT16_CONST_ITER iterI = unicode.begin(); iterI != unicode.end(); iterI++)
|
||||||
for(UCI iterI = unicode.begin(); iterI != unicode.end(); iterI++)
|
|
||||||
{
|
{
|
||||||
vector<uint> vec;
|
vector<uint> vec;
|
||||||
vec.push_back(iterI - beginIter);
|
vec.push_back(iterI - beginIter);
|
||||||
for(UCI iterJ = iterI + 1; iterJ != unicode.end(); iterJ++)
|
for(VUINT16_CONST_ITER iterJ = iterI + 1; iterJ != unicode.end(); iterJ++)
|
||||||
{
|
{
|
||||||
//care: the iterJ exceed iterEnd
|
//care: the iterJ exceed iterEnd
|
||||||
if(NULL != _trie.find(iterI, iterJ + 1))
|
if(NULL != _trie.find(iterI, iterJ + 1))
|
||||||
|
37
src/Trie.cpp
37
src/Trie.cpp
@ -20,8 +20,8 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
|
|
||||||
_root = NULL;
|
_root = NULL;
|
||||||
_totalCount = 0;
|
_freqSum = 0;
|
||||||
_minWeight = numeric_limits<double>::max();
|
_minLogFreq = numeric_limits<double>::max();
|
||||||
_initFlag = false;
|
_initFlag = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -110,7 +110,8 @@ namespace CppJieba
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
nodeInfo.word = vecBuf[0];
|
nodeInfo.word = vecBuf[0];
|
||||||
nodeInfo.count = atoi(vecBuf[1].c_str());
|
nodeInfo.freq = atoi(vecBuf[1].c_str());
|
||||||
|
nodeInfo.wLen = TransCode::getWordLength(nodeInfo.word);
|
||||||
if(3 == vecBuf.size())
|
if(3 == vecBuf.size())
|
||||||
{
|
{
|
||||||
nodeInfo.tag = vecBuf[2];
|
nodeInfo.tag = vecBuf[2];
|
||||||
@ -270,7 +271,7 @@ namespace CppJieba
|
|||||||
const TrieNodeInfo * p = find(unicode);
|
const TrieNodeInfo * p = find(unicode);
|
||||||
if(NULL != p)
|
if(NULL != p)
|
||||||
{
|
{
|
||||||
return p->weight;
|
return p->logFreq;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -284,7 +285,7 @@ namespace CppJieba
|
|||||||
const TrieNodeInfo * p = find(begin, end);
|
const TrieNodeInfo * p = find(begin, end);
|
||||||
if(NULL != p)
|
if(NULL != p)
|
||||||
{
|
{
|
||||||
return p->weight;
|
return p->logFreq;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -294,12 +295,12 @@ namespace CppJieba
|
|||||||
|
|
||||||
double Trie::getMinWeight()
|
double Trie::getMinWeight()
|
||||||
{
|
{
|
||||||
return _minWeight;
|
return _minLogFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
int64_t Trie::getTotalCount()
|
int64_t Trie::getTotalCount()
|
||||||
{
|
{
|
||||||
return _totalCount;
|
return _freqSum;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Trie::_deleteNode(TrieNode* node)
|
bool Trie::_deleteNode(TrieNode* node)
|
||||||
@ -379,21 +380,21 @@ namespace CppJieba
|
|||||||
|
|
||||||
bool Trie::_countWeight()
|
bool Trie::_countWeight()
|
||||||
{
|
{
|
||||||
if(_nodeInfoVec.empty() || 0 != _totalCount)
|
if(_nodeInfoVec.empty() || 0 != _freqSum)
|
||||||
{
|
{
|
||||||
LogError("_nodeInfoVec is empty or _totalCount has been counted already.");
|
LogError("_nodeInfoVec is empty or _freqSum has been counted already.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
//count total freq
|
//freq total freq
|
||||||
for(size_t i = 0; i < _nodeInfoVec.size(); i++)
|
for(size_t i = 0; i < _nodeInfoVec.size(); i++)
|
||||||
{
|
{
|
||||||
_totalCount += _nodeInfoVec[i].count;
|
_freqSum += _nodeInfoVec[i].freq;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(0 == _totalCount)
|
if(0 == _freqSum)
|
||||||
{
|
{
|
||||||
LogError("_totalCount == 0 .");
|
LogError("_freqSum == 0 .");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -401,15 +402,15 @@ namespace CppJieba
|
|||||||
for(uint i = 0; i < _nodeInfoVec.size(); i++)
|
for(uint i = 0; i < _nodeInfoVec.size(); i++)
|
||||||
{
|
{
|
||||||
TrieNodeInfo& nodeInfo = _nodeInfoVec[i];
|
TrieNodeInfo& nodeInfo = _nodeInfoVec[i];
|
||||||
if(0 == nodeInfo.count)
|
if(0 == nodeInfo.freq)
|
||||||
{
|
{
|
||||||
LogFatal("nodeInfo.count == 0!");
|
LogFatal("nodeInfo.freq == 0!");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
nodeInfo.weight = log(double(nodeInfo.count)/double(_totalCount));
|
nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum));
|
||||||
if(_minWeight > nodeInfo.weight)
|
if(_minLogFreq > nodeInfo.logFreq)
|
||||||
{
|
{
|
||||||
_minWeight = nodeInfo.weight;
|
_minLogFreq = nodeInfo.logFreq;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
23
src/Trie.h
23
src/Trie.h
@ -18,29 +18,12 @@
|
|||||||
#include "cppcommon/logger.h"
|
#include "cppcommon/logger.h"
|
||||||
#include "TransCode.h"
|
#include "TransCode.h"
|
||||||
#include "globals.h"
|
#include "globals.h"
|
||||||
|
#include "structs.h"
|
||||||
|
|
||||||
|
|
||||||
namespace CppJieba
|
namespace CppJieba
|
||||||
{
|
{
|
||||||
using namespace CPPCOMMON;
|
using namespace CPPCOMMON;
|
||||||
using namespace std;
|
|
||||||
typedef map<uint16_t, struct TrieNode*> TrieNodeMap;
|
|
||||||
|
|
||||||
struct TrieNodeInfo
|
|
||||||
{
|
|
||||||
string word;
|
|
||||||
size_t wLen;// the word's len , not string.size(),
|
|
||||||
size_t count;
|
|
||||||
string tag;
|
|
||||||
double weight;
|
|
||||||
TrieNodeInfo()
|
|
||||||
{
|
|
||||||
wLen = 0;
|
|
||||||
count = 0;
|
|
||||||
weight = 0.0;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct TrieNode
|
struct TrieNode
|
||||||
{
|
{
|
||||||
TrieNodeMap hmap;
|
TrieNodeMap hmap;
|
||||||
@ -60,8 +43,8 @@ namespace CppJieba
|
|||||||
TrieNode* _root;
|
TrieNode* _root;
|
||||||
vector<TrieNodeInfo> _nodeInfoVec;
|
vector<TrieNodeInfo> _nodeInfoVec;
|
||||||
|
|
||||||
int64_t _totalCount;
|
int64_t _freqSum;
|
||||||
double _minWeight;
|
double _minLogFreq;
|
||||||
bool _initFlag;
|
bool _initFlag;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
@ -14,11 +14,13 @@
|
|||||||
namespace CppJieba
|
namespace CppJieba
|
||||||
{
|
{
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
//typedefs
|
//typedefs
|
||||||
typedef unsigned int uint;
|
typedef unsigned int uint;
|
||||||
typedef std::vector<std::string>::iterator VSI;
|
typedef std::vector<std::string>::iterator VSI;
|
||||||
typedef std::vector<uint16_t> VUINT16;
|
typedef std::vector<uint16_t> VUINT16;
|
||||||
typedef std::vector<uint16_t>::const_iterator VUINT16_CONST_ITER;
|
typedef std::vector<uint16_t>::const_iterator VUINT16_CONST_ITER;
|
||||||
|
typedef map<uint16_t, struct TrieNode*> TrieNodeMap;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
50
src/structs.h
Normal file
50
src/structs.h
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
#ifndef CPPJIEBA_STRUCTS_H
|
||||||
|
#define CPPJIEBA_STRUCTS_H
|
||||||
|
|
||||||
|
#include "globals.h"
|
||||||
|
|
||||||
|
namespace CppJieba
|
||||||
|
{
|
||||||
|
|
||||||
|
/// Per-word record kept for every entry held by the trie.
struct TrieNodeInfo
{
    std::string word;    ///< the word itself
    size_t wLen;         ///< length of the word in characters, not word.length()
    size_t freq;         ///< raw frequency of the word
    std::string tag;     ///< optional tag attached to the word; empty by default
    double logFreq;      ///< log(freq / sum(freq))

    /// Starts with empty strings and zeroed numeric fields.
    TrieNodeInfo()
        : wLen(0), freq(0), logFreq(0.0)
    {
    }
};
|
||||||
|
|
||||||
|
struct WordInfo: public TrieNodeInfo
|
||||||
|
{
|
||||||
|
double idf;
|
||||||
|
double weight;// log(wLen+1)*logFreq;
|
||||||
|
WordInfo()
|
||||||
|
{
|
||||||
|
idf = 0.0;
|
||||||
|
weight = 0.0;
|
||||||
|
}
|
||||||
|
string toString() const
|
||||||
|
{
|
||||||
|
return string_format("{word:%s,wLen:%d weight:%lf, idf:%lf}", word.c_str(), wLen, weight, idf);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
inline string joinWordInfos(const vector<WordInfo>& vec)
|
||||||
|
{
|
||||||
|
vector<string> tmp;
|
||||||
|
for(uint i = 0; i < vec.size(); i++)
|
||||||
|
{
|
||||||
|
tmp.push_back(vec[i].toString());
|
||||||
|
}
|
||||||
|
return joinStr(tmp, ",");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
Loading…
x
Reference in New Issue
Block a user