modify find function

This commit is contained in:
gwdwyy 2013-07-06 12:37:48 +08:00
parent a97749898f
commit 6c3028a2c9
3 changed files with 63 additions and 35 deletions

View File

@ -98,7 +98,7 @@ namespace CppJieba
for(int j = 0; j <= i; j++) for(int j = 0; j <= i; j++)
{ {
size_t uniLen = i - j + 1; size_t uniLen = i - j + 1;
if(_trie.find(uniStr + j, uniLen)) if(NULL != _trie.find(uniStr + j, uniLen))
{ {
memset(utfBuf, 0 ,sizeof(utfBuf)); memset(utfBuf, 0 ,sizeof(utfBuf));
size_t ret = unicodeToUtf8(uniStr + j, uniLen, utfBuf); size_t ret = unicodeToUtf8(uniStr + j, uniLen, utfBuf);

View File

@ -4,16 +4,17 @@ namespace CppJieba
{ {
Trie::iterator Trie::begin() Trie::iterator Trie::begin()
{ {
return Trie::iterator(_root); return _nodeInfoVec.begin();
} }
Trie::iterator Trie::end() Trie::iterator Trie::end()
{ {
return Trie::iterator(NULL); return _nodeInfoVec.end();
} }
Trie::Trie():_root(NULL), _nodeInfoVec(), _totalWeight(0) Trie::Trie():_root(NULL), _totalCount(0)
{ {
_minWeight = numeric_limits<double>::max();
} }
Trie::~Trie() Trie::~Trie()
@ -103,6 +104,38 @@ namespace CppJieba
_display(_root, 0); _display(_root, 0);
} }
// Looks up the exact sequence chUniStr[0 .. len) in the trie.
//
// chUniStr : array of code units to match; must hold at least len elements.
// len      : number of elements of chUniStr to match.
//
// Returns a pointer to the matching element of _nodeInfoVec, or NULL when
//   - the trie has no root node yet,
//   - chUniStr is NULL while len is non-zero,
//   - some element of the sequence has no child edge in the trie,
//   - the node reached after len steps is not marked as a leaf, or
//   - the leaf's stored index lies outside _nodeInfoVec (logged as fatal).
// The returned pointer refers to storage owned by _nodeInfoVec.
const TrieNodeInfo* Trie::find(const ChUnicode* const chUniStr, size_t len)
{
    // _root is initialised to NULL by the constructor, so it has to be
    // checked before the walk below dereferences it. A NULL input with a
    // non-zero length would likewise be dereferenced inside the loop.
    if(NULL == _root || (NULL == chUniStr && 0 != len))
    {
        return NULL;
    }

    // Walk one edge per input element, starting at the root.
    TrieNode* p = _root;
    for(size_t i = 0; i < len; i++)
    {
        const ChUnicode chUni = chUniStr[i];
        // Probe with find() first: operator[] alone would insert a new
        // entry into hmap for an element that is not present.
        if(p->hmap.find(chUni) == p->hmap.end())
        {
            return NULL;
        }
        p = p->hmap[chUni];
    }

    // Reaching a node is not enough; only nodes flagged as leaves match.
    if(!p->isLeaf)
    {
        return NULL;
    }

    // The leaf stores an index into _nodeInfoVec; validate it before use.
    const unsigned int pos = p->nodeInfoVecPos;
    if(pos >= _nodeInfoVec.size())
    {
        LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
        return NULL;
    }
    return &(_nodeInfoVec[pos]);
}
/*
bool Trie::find(const ChUnicode* chUniStr, size_t len) bool Trie::find(const ChUnicode* chUniStr, size_t len)
{ {
int res = -1; int res = -1;
@ -121,6 +154,7 @@ namespace CppJieba
} }
return p->isLeaf; return p->isLeaf;
} }
*/
/* /*
bool Trie::find(const vector<ChUnicode>& uniVec) bool Trie::find(const vector<ChUnicode>& uniVec)
@ -168,6 +202,12 @@ namespace CppJieba
return res; return res;
} }
// NOTE(review): unfinished stub. The body is empty, so this function, declared
// to return double, never returns a value; using its result is undefined behaviour.
// NOTE(review): this definition has no `Trie::` qualifier, which makes it a free
// function, while the class declares a member getWeight with the same
// parameters. Presumably it is meant to be the member definition - confirm
// and qualify it when the body is implemented.
double getWeight(const ChUnicode* uniStr, size_t len)
{
}
/*
bool Trie::cut(const ChUnicode* chUniStr, size_t len, vector< vector<size_t> >& res) bool Trie::cut(const ChUnicode* chUniStr, size_t len, vector< vector<size_t> >& res)
{ {
res.clear(); res.clear();
@ -187,28 +227,8 @@ namespace CppJieba
} }
return true; return true;
} }
*/
bool Trie::cutUtf8(const string& str, vector< vector<size_t> >& res)
{
ChUnicode buf[ChUniMaxLen];
size_t len = utf8ToUnicode(str.c_str(), str.size(), buf);
if(0 == len)
{
return false;
}
return cut(buf, len, res);
/*
PRINT_MATRIX(res);
char buf[1024];
FOR_VECTOR(res, i)
{
FOR_VECTOR(res[i], j)
{
unicodeToUtf8(chUniStr + i, res[i][j] - i + 1, buf);
cout<<buf<<endl;
}
}*/
}
bool Trie::_destroyNode(TrieNode* node) bool Trie::_destroyNode(TrieNode* node)
{ {
@ -301,25 +321,26 @@ namespace CppJieba
bool Trie::_countWeight() bool Trie::_countWeight()
{ {
if(_nodeInfoVec.empty() || 0 != _totalWeight) if(_nodeInfoVec.empty() || 0 != _totalCount)
{ {
LogError("_nodeInfoVec is empty or _totalWeight has been counted already."); LogError("_nodeInfoVec is empty or _totalCount has been counted already.");
return false; return false;
} }
//count total freq //count total freq
for(size_t i = 0; i < _nodeInfoVec.size(); i++) for(size_t i = 0; i < _nodeInfoVec.size(); i++)
{ {
_totalWeight += _nodeInfoVec[i].count; _totalCount += _nodeInfoVec[i].count;
//cout<<_nodeInfoVec[i].word<<_nodeInfoVec[i].count<<endl; //cout<<_nodeInfoVec[i].word<<_nodeInfoVec[i].count<<endl;
} }
//normalize //normalize
for(size_t i = 0; i < _nodeInfoVec.size(); i++) for(size_t i = 0; i < _nodeInfoVec.size(); i++)
{ {
_nodeInfoVec[i].weight = log(double(_nodeInfoVec[i].count)/double(_totalWeight)); _nodeInfoVec[i].weight = log(double(_nodeInfoVec[i].count)/double(_totalCount));
//cout<<_nodeInfoVec[i].weight<<endl; //cout<<_nodeInfoVec[i].weight<<endl;
} }
cout<<_minWeight<<endl;
return true; return true;
} }

19
Trie.h
View File

@ -7,6 +7,7 @@
#include <cstring> #include <cstring>
#include <stdint.h> #include <stdint.h>
#include <cmath> #include <cmath>
#include <limits>
#include "cppcommon/str_functs.h" #include "cppcommon/str_functs.h"
#include "cppcommon/vec_functs.h" #include "cppcommon/vec_functs.h"
#include "cppcommon/logger.h" #include "cppcommon/logger.h"
@ -45,6 +46,7 @@ namespace CppJieba
} }
}; };
/*
struct TrieNodeIterator struct TrieNodeIterator
{ {
TrieNode* ptNode; TrieNode* ptNode;
@ -88,16 +90,19 @@ namespace CppJieba
return ptNode != x.ptNode; return ptNode != x.ptNode;
} }
}; };
*/
class Trie class Trie
{ {
private: private:
TrieNode* _root; TrieNode* _root;
vector<TrieNodeInfo> _nodeInfoVec; vector<TrieNodeInfo> _nodeInfoVec;
int64_t _totalWeight;
int64_t _totalCount;
double _minWeight;
public: public:
typedef TrieNodeIterator iterator; typedef vector<TrieNodeInfo>::iterator iterator;
public: public:
iterator begin(); iterator begin();
@ -111,22 +116,24 @@ namespace CppJieba
void display(); void display();
public: public:
bool find(const ChUnicode* chUniStr, size_t len); const TrieNodeInfo* find(const ChUnicode* const chUniStr, size_t len);
//bool find(const ChUnicode* chUniStr, size_t len);
//bool find(const vector<ChUnicode>& uniVec); //bool find(const vector<ChUnicode>& uniVec);
int findMaxMatch(const ChUnicode* chUniStr, size_t len); int findMaxMatch(const ChUnicode* chUniStr, size_t len);
public: public:
bool cut(const ChUnicode* chUniStr, size_t len, vector< vector<size_t> >& res); double getWeight(const ChUnicode* uniStr, size_t len);
//bool cut(const ChUnicode* chUniStr, size_t len, vector< vector<size_t> >& res);
//bool cutUni(const vector<ChUnicode>& uniVec, ) //bool cutUni(const vector<ChUnicode>& uniVec, )
bool cutUtf8(const string& str, vector< vector<size_t> >& res); //bool cutUtf8(const string& str, vector< vector<size_t> >& res);
//bool cutMa //bool cutMa
private: private:
bool _buildTree(const char* const filePath); bool _buildTree(const char* const filePath);
bool _countWeight();
bool _destroyNode(TrieNode* node); bool _destroyNode(TrieNode* node);
void _display(TrieNode* node, int level); void _display(TrieNode* node, int level);
bool _insert(const TrieNodeInfo& nodeInfo); bool _insert(const TrieNodeInfo& nodeInfo);
bool _countWeight();
private: private:
enum {bufSize = 1024}; enum {bufSize = 1024};