mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
modify find function
This commit is contained in:
parent
a97749898f
commit
6c3028a2c9
@ -98,7 +98,7 @@ namespace CppJieba
|
|||||||
for(int j = 0; j <= i; j++)
|
for(int j = 0; j <= i; j++)
|
||||||
{
|
{
|
||||||
size_t uniLen = i - j + 1;
|
size_t uniLen = i - j + 1;
|
||||||
if(_trie.find(uniStr + j, uniLen))
|
if(NULL != _trie.find(uniStr + j, uniLen))
|
||||||
{
|
{
|
||||||
memset(utfBuf, 0 ,sizeof(utfBuf));
|
memset(utfBuf, 0 ,sizeof(utfBuf));
|
||||||
size_t ret = unicodeToUtf8(uniStr + j, uniLen, utfBuf);
|
size_t ret = unicodeToUtf8(uniStr + j, uniLen, utfBuf);
|
||||||
|
77
Trie.cpp
77
Trie.cpp
@ -4,16 +4,17 @@ namespace CppJieba
|
|||||||
{
|
{
|
||||||
Trie::iterator Trie::begin()
|
Trie::iterator Trie::begin()
|
||||||
{
|
{
|
||||||
return Trie::iterator(_root);
|
return _nodeInfoVec.begin();
|
||||||
}
|
}
|
||||||
|
|
||||||
Trie::iterator Trie::end()
|
Trie::iterator Trie::end()
|
||||||
{
|
{
|
||||||
return Trie::iterator(NULL);
|
return _nodeInfoVec.end();
|
||||||
}
|
}
|
||||||
|
|
||||||
Trie::Trie():_root(NULL), _nodeInfoVec(), _totalWeight(0)
|
Trie::Trie():_root(NULL), _totalCount(0)
|
||||||
{
|
{
|
||||||
|
_minWeight = numeric_limits<double>::max();
|
||||||
}
|
}
|
||||||
|
|
||||||
Trie::~Trie()
|
Trie::~Trie()
|
||||||
@ -103,6 +104,38 @@ namespace CppJieba
|
|||||||
_display(_root, 0);
|
_display(_root, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Look up the exact word chUniStr[0 .. len) in the trie.
 *
 * The walk starts at _root and follows one child per unit of the input.
 * A match requires that every unit has a child edge and that the node
 * reached after the last unit is flagged isLeaf.
 *
 * @param chUniStr  pointer to the first unit of the word to look up
 * @param len       number of units to read from chUniStr
 * @return pointer to the matching element of _nodeInfoVec, or NULL when
 *         the trie has no root, chUniStr is NULL while len is non-zero,
 *         the word is not present, or the leaf's stored index lies outside
 *         _nodeInfoVec (that last case is also logged as fatal).
 *         The result points into _nodeInfoVec, so it stays valid only
 *         while that vector is not reallocated.
 */
const TrieNodeInfo* Trie::find(const ChUnicode* const chUniStr, size_t len)
{
    // The constructor leaves _root as NULL, so a lookup issued before the
    // tree is built must not dereference it.
    if(NULL == _root)
    {
        return NULL;
    }
    // A NULL buffer is only a problem when we would actually read from it.
    if(NULL == chUniStr && 0 != len)
    {
        return NULL;
    }

    TrieNode* p = _root;
    for(size_t i = 0; i < len; i++)
    {
        ChUnicode chUni = chUniStr[i];
        // Test with find() first: operator[] alone would insert an empty
        // entry for every unit that is missing.
        if(p->hmap.find(chUni) == p->hmap.end())
        {
            return NULL;
        }
        p = p->hmap[chUni];
    }

    // Reaching a node is not enough; only nodes that terminate a stored
    // word carry a usable nodeInfoVecPos.
    if(!p->isLeaf)
    {
        return NULL;
    }

    unsigned int pos = p->nodeInfoVecPos;
    if(pos >= _nodeInfoVec.size())
    {
        LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
        return NULL;
    }
    return &(_nodeInfoVec[pos]);
}
|
||||||
|
|
||||||
|
/*
|
||||||
bool Trie::find(const ChUnicode* chUniStr, size_t len)
|
bool Trie::find(const ChUnicode* chUniStr, size_t len)
|
||||||
{
|
{
|
||||||
int res = -1;
|
int res = -1;
|
||||||
@ -121,6 +154,7 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
return p->isLeaf;
|
return p->isLeaf;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
bool Trie::find(const vector<ChUnicode>& uniVec)
|
bool Trie::find(const vector<ChUnicode>& uniVec)
|
||||||
@ -168,6 +202,12 @@ namespace CppJieba
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NOTE(review): unfinished stub. The body below is empty, so this function,
// declared to return double, never returns a value; callers must not rely on
// its result until it is implemented.
// NOTE(review): Trie.h in this same change declares a member
// `double getWeight(const ChUnicode* uniStr, size_t len);` inside class Trie,
// but this definition has no `Trie::` qualifier and therefore defines a
// separate free function -- presumably unintended; confirm and qualify it.
double getWeight(const ChUnicode* uniStr, size_t len)
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
bool Trie::cut(const ChUnicode* chUniStr, size_t len, vector< vector<size_t> >& res)
|
bool Trie::cut(const ChUnicode* chUniStr, size_t len, vector< vector<size_t> >& res)
|
||||||
{
|
{
|
||||||
res.clear();
|
res.clear();
|
||||||
@ -187,28 +227,8 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
bool Trie::cutUtf8(const string& str, vector< vector<size_t> >& res)
|
|
||||||
{
|
|
||||||
ChUnicode buf[ChUniMaxLen];
|
|
||||||
size_t len = utf8ToUnicode(str.c_str(), str.size(), buf);
|
|
||||||
if(0 == len)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return cut(buf, len, res);
|
|
||||||
/*
|
|
||||||
PRINT_MATRIX(res);
|
|
||||||
char buf[1024];
|
|
||||||
FOR_VECTOR(res, i)
|
|
||||||
{
|
|
||||||
FOR_VECTOR(res[i], j)
|
|
||||||
{
|
|
||||||
unicodeToUtf8(chUniStr + i, res[i][j] - i + 1, buf);
|
|
||||||
cout<<buf<<endl;
|
|
||||||
}
|
|
||||||
}*/
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Trie::_destroyNode(TrieNode* node)
|
bool Trie::_destroyNode(TrieNode* node)
|
||||||
{
|
{
|
||||||
@ -301,25 +321,26 @@ namespace CppJieba
|
|||||||
|
|
||||||
bool Trie::_countWeight()
|
bool Trie::_countWeight()
|
||||||
{
|
{
|
||||||
if(_nodeInfoVec.empty() || 0 != _totalWeight)
|
if(_nodeInfoVec.empty() || 0 != _totalCount)
|
||||||
{
|
{
|
||||||
LogError("_nodeInfoVec is empty or _totalWeight has been counted already.");
|
LogError("_nodeInfoVec is empty or _totalCount has been counted already.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
//count total freq
|
//count total freq
|
||||||
for(size_t i = 0; i < _nodeInfoVec.size(); i++)
|
for(size_t i = 0; i < _nodeInfoVec.size(); i++)
|
||||||
{
|
{
|
||||||
_totalWeight += _nodeInfoVec[i].count;
|
_totalCount += _nodeInfoVec[i].count;
|
||||||
//cout<<_nodeInfoVec[i].word<<_nodeInfoVec[i].count<<endl;
|
//cout<<_nodeInfoVec[i].word<<_nodeInfoVec[i].count<<endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
//normalize
|
//normalize
|
||||||
for(size_t i = 0; i < _nodeInfoVec.size(); i++)
|
for(size_t i = 0; i < _nodeInfoVec.size(); i++)
|
||||||
{
|
{
|
||||||
_nodeInfoVec[i].weight = log(double(_nodeInfoVec[i].count)/double(_totalWeight));
|
_nodeInfoVec[i].weight = log(double(_nodeInfoVec[i].count)/double(_totalCount));
|
||||||
//cout<<_nodeInfoVec[i].weight<<endl;
|
//cout<<_nodeInfoVec[i].weight<<endl;
|
||||||
}
|
}
|
||||||
|
cout<<_minWeight<<endl;
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
19
Trie.h
19
Trie.h
@ -7,6 +7,7 @@
|
|||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <limits>
|
||||||
#include "cppcommon/str_functs.h"
|
#include "cppcommon/str_functs.h"
|
||||||
#include "cppcommon/vec_functs.h"
|
#include "cppcommon/vec_functs.h"
|
||||||
#include "cppcommon/logger.h"
|
#include "cppcommon/logger.h"
|
||||||
@ -45,6 +46,7 @@ namespace CppJieba
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
struct TrieNodeIterator
|
struct TrieNodeIterator
|
||||||
{
|
{
|
||||||
TrieNode* ptNode;
|
TrieNode* ptNode;
|
||||||
@ -88,16 +90,19 @@ namespace CppJieba
|
|||||||
return ptNode != x.ptNode;
|
return ptNode != x.ptNode;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
*/
|
||||||
|
|
||||||
class Trie
|
class Trie
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
TrieNode* _root;
|
TrieNode* _root;
|
||||||
vector<TrieNodeInfo> _nodeInfoVec;
|
vector<TrieNodeInfo> _nodeInfoVec;
|
||||||
int64_t _totalWeight;
|
|
||||||
|
int64_t _totalCount;
|
||||||
|
double _minWeight;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
typedef TrieNodeIterator iterator;
|
typedef vector<TrieNodeInfo>::iterator iterator;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
iterator begin();
|
iterator begin();
|
||||||
@ -111,22 +116,24 @@ namespace CppJieba
|
|||||||
void display();
|
void display();
|
||||||
|
|
||||||
public:
|
public:
|
||||||
bool find(const ChUnicode* chUniStr, size_t len);
|
const TrieNodeInfo* find(const ChUnicode* const chUniStr, size_t len);
|
||||||
|
//bool find(const ChUnicode* chUniStr, size_t len);
|
||||||
//bool find(const vector<ChUnicode>& uniVec);
|
//bool find(const vector<ChUnicode>& uniVec);
|
||||||
int findMaxMatch(const ChUnicode* chUniStr, size_t len);
|
int findMaxMatch(const ChUnicode* chUniStr, size_t len);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
bool cut(const ChUnicode* chUniStr, size_t len, vector< vector<size_t> >& res);
|
double getWeight(const ChUnicode* uniStr, size_t len);
|
||||||
|
//bool cut(const ChUnicode* chUniStr, size_t len, vector< vector<size_t> >& res);
|
||||||
//bool cutUni(const vector<ChUnicode>& uniVec, )
|
//bool cutUni(const vector<ChUnicode>& uniVec, )
|
||||||
bool cutUtf8(const string& str, vector< vector<size_t> >& res);
|
//bool cutUtf8(const string& str, vector< vector<size_t> >& res);
|
||||||
//bool cutMa
|
//bool cutMa
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool _buildTree(const char* const filePath);
|
bool _buildTree(const char* const filePath);
|
||||||
|
bool _countWeight();
|
||||||
bool _destroyNode(TrieNode* node);
|
bool _destroyNode(TrieNode* node);
|
||||||
void _display(TrieNode* node, int level);
|
void _display(TrieNode* node, int level);
|
||||||
bool _insert(const TrieNodeInfo& nodeInfo);
|
bool _insert(const TrieNodeInfo& nodeInfo);
|
||||||
bool _countWeight();
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
enum {bufSize = 1024};
|
enum {bufSize = 1024};
|
||||||
|
Loading…
x
Reference in New Issue
Block a user