mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add _countWeight
This commit is contained in:
parent
1974c49071
commit
a97749898f
@ -129,7 +129,8 @@ using namespace CppJieba;
|
|||||||
int main()
|
int main()
|
||||||
{
|
{
|
||||||
Segment segment;
|
Segment segment;
|
||||||
segment.init("dict.utf8");
|
segment.init("jieba.dict.utf8");
|
||||||
|
|
||||||
vector<string> res;
|
vector<string> res;
|
||||||
string title = "我来到北京清华大学3D电视";
|
string title = "我来到北京清华大学3D电视";
|
||||||
bool flag = segment.cutMM(title, res);
|
bool flag = segment.cutMM(title, res);
|
||||||
|
55
Trie.cpp
55
Trie.cpp
@ -12,9 +12,8 @@ namespace CppJieba
|
|||||||
return Trie::iterator(NULL);
|
return Trie::iterator(NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
Trie::Trie()
|
Trie::Trie():_root(NULL), _nodeInfoVec(), _totalWeight(0)
|
||||||
{
|
{
|
||||||
_root = NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Trie::~Trie()
|
Trie::~Trie()
|
||||||
@ -22,7 +21,25 @@ namespace CppJieba
|
|||||||
destroy();
|
destroy();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Trie::init(const char* const filepath)
|
bool Trie::init(const char* const filePath)
|
||||||
|
{
|
||||||
|
bool res = false;
|
||||||
|
res = _buildTree(filePath);
|
||||||
|
if(!res)
|
||||||
|
{
|
||||||
|
LogError("_buildTree failed.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
res = _countWeight();
|
||||||
|
if(!res)
|
||||||
|
{
|
||||||
|
LogError("_countWeight failed.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool Trie::_buildTree(const char* const filePath)
|
||||||
{
|
{
|
||||||
char msgBuf[bufSize];
|
char msgBuf[bufSize];
|
||||||
if(NULL != _root)
|
if(NULL != _root)
|
||||||
@ -31,7 +48,7 @@ namespace CppJieba
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
_root = new TrieNode;
|
_root = new TrieNode;
|
||||||
ifstream ifile(filepath);
|
ifstream ifile(filePath);
|
||||||
string line;
|
string line;
|
||||||
vector<string> vecBuf;
|
vector<string> vecBuf;
|
||||||
while(getline(ifile, line))
|
while(getline(ifile, line))
|
||||||
@ -281,6 +298,31 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Trie::_countWeight()
|
||||||
|
{
|
||||||
|
if(_nodeInfoVec.empty() || 0 != _totalWeight)
|
||||||
|
{
|
||||||
|
LogError("_nodeInfoVec is empty or _totalWeight has been counted already.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
//count total freq
|
||||||
|
for(size_t i = 0; i < _nodeInfoVec.size(); i++)
|
||||||
|
{
|
||||||
|
_totalWeight += _nodeInfoVec[i].count;
|
||||||
|
//cout<<_nodeInfoVec[i].word<<_nodeInfoVec[i].count<<endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
//normalize
|
||||||
|
for(size_t i = 0; i < _nodeInfoVec.size(); i++)
|
||||||
|
{
|
||||||
|
_nodeInfoVec[i].weight = log(double(_nodeInfoVec[i].count)/double(_totalWeight));
|
||||||
|
//cout<<_nodeInfoVec[i].weight<<endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef TRIE_UT
|
#ifdef TRIE_UT
|
||||||
@ -288,9 +330,8 @@ using namespace CppJieba;
|
|||||||
int main()
|
int main()
|
||||||
{
|
{
|
||||||
Trie trie;
|
Trie trie;
|
||||||
trie.init("dict.utf8");
|
trie.init("jieba.dict.utf8");
|
||||||
char utf[1024] = "我来到北京清华大学3D电视";
|
//char utf[1024] = "我来到北京清华大学3D电视";
|
||||||
vector< vector<size_t> > res;
|
|
||||||
trie.destroy();
|
trie.destroy();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
11
Trie.h
11
Trie.h
@ -5,6 +5,8 @@
|
|||||||
//#include <ext/hash_map>
|
//#include <ext/hash_map>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <cmath>
|
||||||
#include "cppcommon/str_functs.h"
|
#include "cppcommon/str_functs.h"
|
||||||
#include "cppcommon/vec_functs.h"
|
#include "cppcommon/vec_functs.h"
|
||||||
#include "cppcommon/logger.h"
|
#include "cppcommon/logger.h"
|
||||||
@ -25,7 +27,8 @@ namespace CppJieba
|
|||||||
string word;
|
string word;
|
||||||
unsigned int count;
|
unsigned int count;
|
||||||
string tag;
|
string tag;
|
||||||
TrieNodeInfo():word(),count(0),tag()
|
double weight;
|
||||||
|
TrieNodeInfo():word(),count(0),tag(),weight(0.0)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -91,6 +94,8 @@ namespace CppJieba
|
|||||||
private:
|
private:
|
||||||
TrieNode* _root;
|
TrieNode* _root;
|
||||||
vector<TrieNodeInfo> _nodeInfoVec;
|
vector<TrieNodeInfo> _nodeInfoVec;
|
||||||
|
int64_t _totalWeight;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
typedef TrieNodeIterator iterator;
|
typedef TrieNodeIterator iterator;
|
||||||
|
|
||||||
@ -101,7 +106,7 @@ namespace CppJieba
|
|||||||
public:
|
public:
|
||||||
Trie();
|
Trie();
|
||||||
~Trie();
|
~Trie();
|
||||||
bool init(const char* const filepath = DICT_FILE_PATH);
|
bool init(const char* const filePath);
|
||||||
bool destroy();
|
bool destroy();
|
||||||
void display();
|
void display();
|
||||||
|
|
||||||
@ -117,9 +122,11 @@ namespace CppJieba
|
|||||||
//bool cutMa
|
//bool cutMa
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
bool _buildTree(const char* const filePath);
|
||||||
bool _destroyNode(TrieNode* node);
|
bool _destroyNode(TrieNode* node);
|
||||||
void _display(TrieNode* node, int level);
|
void _display(TrieNode* node, int level);
|
||||||
bool _insert(const TrieNodeInfo& nodeInfo);
|
bool _insert(const TrieNodeInfo& nodeInfo);
|
||||||
|
bool _countWeight();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
enum {bufSize = 1024};
|
enum {bufSize = 1024};
|
||||||
|
Loading…
x
Reference in New Issue
Block a user