add _countWeight

This commit is contained in:
gwdwyy 2013-07-06 11:39:56 +08:00
parent 1974c49071
commit a97749898f
3 changed files with 59 additions and 10 deletions

View File

@ -129,7 +129,8 @@ using namespace CppJieba;
int main()
{
Segment segment;
segment.init("dict.utf8");
segment.init("jieba.dict.utf8");
vector<string> res;
string title = "我来到北京清华大学3D电视";
bool flag = segment.cutMM(title, res);

View File

@ -12,9 +12,8 @@ namespace CppJieba
return Trie::iterator(NULL);
}
Trie::Trie()
Trie::Trie():_root(NULL), _nodeInfoVec(), _totalWeight(0)
{
_root = NULL;
}
Trie::~Trie()
@ -22,7 +21,25 @@ namespace CppJieba
destroy();
}
bool Trie::init(const char* const filepath)
/**
 * Initialise the trie from the dictionary file at filePath.
 *
 * Runs _buildTree(filePath) first and _countWeight() second. The first step
 * that reports failure is logged and stops initialisation.
 *
 * @param filePath path passed unchanged to _buildTree.
 * @return true only when both steps returned true, false otherwise.
 */
bool Trie::init(const char* const filePath)
{
    // Step 1: build the tree from the file.
    if(!_buildTree(filePath))
    {
        LogError("_buildTree failed.");
        return false;
    }
    // Step 2: compute the per-entry weights.
    if(!_countWeight())
    {
        LogError("_countWeight failed.");
        return false;
    }
    return true;
}
bool Trie::_buildTree(const char* const filePath)
{
char msgBuf[bufSize];
if(NULL != _root)
@ -31,7 +48,7 @@ namespace CppJieba
return false;
}
_root = new TrieNode;
ifstream ifile(filepath);
ifstream ifile(filePath);
string line;
vector<string> vecBuf;
while(getline(ifile, line))
@ -281,6 +298,31 @@ namespace CppJieba
}
return true;
}
/**
 * Compute the weight of every entry in _nodeInfoVec.
 *
 * Sums the count of all entries into _totalWeight, then sets each entry's
 * weight to log(count / _totalWeight).
 *
 * @return false when _nodeInfoVec is empty, when _totalWeight is already
 *         non-zero on entry, or when the summed count is zero (the weights
 *         would otherwise all be NaN); true once every weight has been set.
 */
bool Trie::_countWeight()
{
    if(_nodeInfoVec.empty() || 0 != _totalWeight)
    {
        LogError("_nodeInfoVec is empty or _totalWeight has been counted already.");
        return false;
    }

    // count total freq
    for(size_t i = 0; i < _nodeInfoVec.size(); i++)
    {
        _totalWeight += _nodeInfoVec[i].count;
    }

    // Every count was 0: dividing by the total would yield 0/0 and log(NaN).
    if(0 == _totalWeight)
    {
        LogError("_totalWeight is zero, weights can not be normalized.");
        return false;
    }

    // normalize
    const double total = double(_totalWeight);
    for(size_t i = 0; i < _nodeInfoVec.size(); i++)
    {
        // An entry whose count is 0 still gets log(0), i.e. negative infinity.
        _nodeInfoVec[i].weight = log(double(_nodeInfoVec[i].count) / total);
    }
    return true;
}
}
#ifdef TRIE_UT
@ -288,9 +330,8 @@ using namespace CppJieba;
int main()
{
Trie trie;
trie.init("dict.utf8");
char utf[1024] = "我来到北京清华大学3D电视";
vector< vector<size_t> > res;
trie.init("jieba.dict.utf8");
//char utf[1024] = "我来到北京清华大学3D电视";
trie.destroy();
return 0;
}

11
Trie.h
View File

@ -5,6 +5,8 @@
//#include <ext/hash_map>
#include <map>
#include <cstring>
#include <stdint.h>
#include <cmath>
#include "cppcommon/str_functs.h"
#include "cppcommon/vec_functs.h"
#include "cppcommon/logger.h"
@ -25,7 +27,8 @@ namespace CppJieba
string word;
unsigned int count;
string tag;
TrieNodeInfo():word(),count(0),tag()
double weight;
TrieNodeInfo():word(),count(0),tag(),weight(0.0)
{
}
};
@ -91,6 +94,8 @@ namespace CppJieba
private:
TrieNode* _root;
vector<TrieNodeInfo> _nodeInfoVec;
int64_t _totalWeight;
public:
typedef TrieNodeIterator iterator;
@ -101,7 +106,7 @@ namespace CppJieba
public:
Trie();
~Trie();
bool init(const char* const filepath = DICT_FILE_PATH);
bool init(const char* const filePath);
bool destroy();
void display();
@ -117,9 +122,11 @@ namespace CppJieba
//bool cutMa
private:
bool _buildTree(const char* const filePath);
bool _destroyNode(TrieNode* node);
void _display(TrieNode* node, int level);
bool _insert(const TrieNodeInfo& nodeInfo);
bool _countWeight();
private:
enum {bufSize = 1024};