add _countWeight

This commit is contained in:
gwdwyy 2013-07-06 11:39:56 +08:00
parent 1974c49071
commit a97749898f
3 changed files with 59 additions and 10 deletions

View File

@ -129,7 +129,8 @@ using namespace CppJieba;
int main() int main()
{ {
Segment segment; Segment segment;
segment.init("dict.utf8"); segment.init("jieba.dict.utf8");
vector<string> res; vector<string> res;
string title = "我来到北京清华大学3D电视"; string title = "我来到北京清华大学3D电视";
bool flag = segment.cutMM(title, res); bool flag = segment.cutMM(title, res);

View File

@ -12,9 +12,8 @@ namespace CppJieba
return Trie::iterator(NULL); return Trie::iterator(NULL);
} }
Trie::Trie() Trie::Trie():_root(NULL), _nodeInfoVec(), _totalWeight(0)
{ {
_root = NULL;
} }
Trie::~Trie() Trie::~Trie()
@ -22,7 +21,25 @@ namespace CppJieba
destroy(); destroy();
} }
// Initializes the trie from the file at filePath in two steps: build the tree
// with _buildTree, then compute the entry weights with _countWeight.
// Returns true only when both steps succeed; if either step fails, an error
// is logged and false is returned.
bool Trie::init(const char* const filepath) bool Trie::init(const char* const filePath)
{
bool res = false;
// Step 1: build the tree from filePath.
res = _buildTree(filePath);
if(!res)
{
LogError("_buildTree failed.");
return false;
}
// Step 2: compute the weights; this runs only after the tree was built successfully.
res = _countWeight();
if(!res)
{
LogError("_countWeight failed.");
return false;
}
return true;
}
bool Trie::_buildTree(const char* const filePath)
{ {
char msgBuf[bufSize]; char msgBuf[bufSize];
if(NULL != _root) if(NULL != _root)
@ -31,7 +48,7 @@ namespace CppJieba
return false; return false;
} }
_root = new TrieNode; _root = new TrieNode;
ifstream ifile(filepath); ifstream ifile(filePath);
string line; string line;
vector<string> vecBuf; vector<string> vecBuf;
while(getline(ifile, line)) while(getline(ifile, line))
@ -281,6 +298,31 @@ namespace CppJieba
} }
return true; return true;
} }
// Converts the raw count of every entry in _nodeInfoVec into a weight equal to
// the natural logarithm of its relative frequency: log(count / _totalWeight).
//
// Returns false (after logging an error) when _nodeInfoVec is empty, when
// _totalWeight is already non-zero (the weights were counted before), or when
// all counts sum to zero. Returns true once every weight has been assigned.
bool Trie::_countWeight()
{
    if(_nodeInfoVec.empty() || 0 != _totalWeight)
    {
        LogError("_nodeInfoVec is empty or _totalWeight has been counted already.");
        return false;
    }

    // Accumulate the total count over all entries.
    for(size_t i = 0; i < _nodeInfoVec.size(); i++)
    {
        _totalWeight += _nodeInfoVec[i].count;
    }

    // If every count is zero the total is zero too, and the normalization
    // below would compute log(0.0 / 0.0), i.e. NaN, for every entry while
    // still reporting success. Fail explicitly instead.
    if(_totalWeight <= 0)
    {
        LogError("_totalWeight is zero, weights can not be normalized.");
        return false;
    }

    // Normalize: weight = log(count / total).
    const double total = double(_totalWeight);
    for(size_t i = 0; i < _nodeInfoVec.size(); i++)
    {
        // An individual entry whose count is 0 still gets log(0), which is
        // negative infinity.
        _nodeInfoVec[i].weight = log(double(_nodeInfoVec[i].count) / total);
    }
    return true;
}
} }
#ifdef TRIE_UT #ifdef TRIE_UT
@ -288,9 +330,8 @@ using namespace CppJieba;
int main() int main()
{ {
Trie trie; Trie trie;
trie.init("dict.utf8"); trie.init("jieba.dict.utf8");
char utf[1024] = "我来到北京清华大学3D电视"; //char utf[1024] = "我来到北京清华大学3D电视";
vector< vector<size_t> > res;
trie.destroy(); trie.destroy();
return 0; return 0;
} }

11
Trie.h
View File

@ -5,6 +5,8 @@
//#include <ext/hash_map> //#include <ext/hash_map>
#include <map> #include <map>
#include <cstring> #include <cstring>
#include <stdint.h>
#include <cmath>
#include "cppcommon/str_functs.h" #include "cppcommon/str_functs.h"
#include "cppcommon/vec_functs.h" #include "cppcommon/vec_functs.h"
#include "cppcommon/logger.h" #include "cppcommon/logger.h"
@ -25,7 +27,8 @@ namespace CppJieba
string word; string word;
unsigned int count; unsigned int count;
string tag; string tag;
TrieNodeInfo():word(),count(0),tag() double weight;
TrieNodeInfo():word(),count(0),tag(),weight(0.0)
{ {
} }
}; };
@ -91,6 +94,8 @@ namespace CppJieba
private: private:
TrieNode* _root; TrieNode* _root;
vector<TrieNodeInfo> _nodeInfoVec; vector<TrieNodeInfo> _nodeInfoVec;
int64_t _totalWeight;
public: public:
typedef TrieNodeIterator iterator; typedef TrieNodeIterator iterator;
@ -101,7 +106,7 @@ namespace CppJieba
public: public:
Trie(); Trie();
~Trie(); ~Trie();
bool init(const char* const filepath = DICT_FILE_PATH); bool init(const char* const filePath);
bool destroy(); bool destroy();
void display(); void display();
@ -117,9 +122,11 @@ namespace CppJieba
//bool cutMa //bool cutMa
private: private:
bool _buildTree(const char* const filePath);
bool _destroyNode(TrieNode* node); bool _destroyNode(TrieNode* node);
void _display(TrieNode* node, int level); void _display(TrieNode* node, int level);
bool _insert(const TrieNodeInfo& nodeInfo); bool _insert(const TrieNodeInfo& nodeInfo);
bool _countWeight();
private: private:
enum {bufSize = 1024}; enum {bufSize = 1024};