rewriting trie.cpp/h

This commit is contained in:
gwdwyy 2013-07-22 14:49:00 +08:00
parent ca5e5517e7
commit d69411e998
4 changed files with 37 additions and 50 deletions

View File

@@ -16,8 +16,7 @@ namespace CppJieba
bool Segment::init() bool Segment::init()
{ {
bool retFlag; bool retFlag = _trie.init();
retFlag = _trie.init();
if(!retFlag) if(!retFlag)
{ {
LogError("_trie.init failed."); LogError("_trie.init failed.");
@@ -28,9 +27,8 @@ namespace CppJieba
bool Segment::loadSegDict(const string& filePath) bool Segment::loadSegDict(const string& filePath)
{ {
bool retFlag;
retFlag = _trie.loadDict(filePath);
LogInfo(string_format("_trie.loadDict(%s) start...", filePath.c_str())); LogInfo(string_format("_trie.loadDict(%s) start...", filePath.c_str()));
bool retFlag = _trie.loadDict(filePath);
LogInfo("_trie.loadDict end."); LogInfo("_trie.loadDict end.");
return retFlag; return retFlag;
} }
@@ -48,10 +46,10 @@ namespace CppJieba
string uniStr = gEncoding.decode(str); string uniStr = gEncoding.decode(str);
if(uniStr.empty()) if(uniStr.empty())
{ {
LogError("_utf8ToUni failed."); LogError("gEncoding.decode failed.");
return false; return false;
} }
//calc DAG //calc DAG
vector<vector<uint> > dag; vector<vector<uint> > dag;
retFlag = _calcDAG(uniStr, dag); retFlag = _calcDAG(uniStr, dag);
@@ -181,12 +179,8 @@ using namespace CppJieba;
int main() int main()
{ {
/*
cout<<__FILE__<<__LINE__<<endl;
Segment segment; Segment segment;
cout<<__FILE__<<__LINE__<<endl;
segment.init(); segment.init();
cout<<__FILE__<<__LINE__<<endl;
if(!segment.loadSegDict("../dicts/segdict.utf8.v2.1")) if(!segment.loadSegDict("../dicts/segdict.utf8.v2.1"))
{ {
cerr<<"1"<<endl; cerr<<"1"<<endl;
@@ -199,14 +193,12 @@ int main()
while(getline(ifile, line)) while(getline(ifile, line))
{ {
res.clear(); res.clear();
cout<<__FILE__<<__LINE__<<endl;
segment.cutDAG(line, res); segment.cutDAG(line, res);
PRINT_VECTOR(res); PRINT_VECTOR(res);
getchar(); getchar();
} }
segment.dispose(); segment.dispose();
*/
return 0; return 0;
} }

View File

@@ -114,13 +114,7 @@ namespace CppJieba
//insert node //insert node
TrieNodeInfo nodeInfo; TrieNodeInfo nodeInfo;
nodeInfo.word = chWord; nodeInfo.word = chWord;
size_t wLen = getUtf8WordLen(chWord); nodeInfo.wLen = 0;
if(0 == wLen)
{
LogFatal(string_format("getUtf8WordLen(%s) return 0", chWord.c_str()));
return false;
}
nodeInfo.wLen = wLen;
nodeInfo.count = count; nodeInfo.count = count;
nodeInfo.tag = tag; nodeInfo.tag = tag;
@@ -162,28 +156,21 @@ namespace CppJieba
LogFatal("trie not initted!"); LogFatal("trie not initted!");
return NULL; return NULL;
} }
if(str.empty()) vector<uint16_t> unicode;
{
LogError("str is empty"); bool retFlag = gEncoding.decode(str, unicode);
return NULL; if(retFlag)
}
string uniStr = gEncoding.decode(str);
if(uniStr.empty())
{ {
LogError("gEncoding.decode failed."); LogError("gEncoding.decode failed.");
return NULL; return NULL;
} }
if(uniStr.size() % 2)
{
LogError("utf8ToUnicode return uniStr illegal");
return NULL;
}
//find //find
TrieNode* p = _root; TrieNode* p = _root;
TrieNodeInfo * res = NULL; TrieNodeInfo * res = NULL;
for(uint i = 0; i < uniStr.size(); i+=2) for(uint i = 0; i < unicode.size(); i++)
{ {
ChUnicode chUni = twocharToUint16(uniStr[0], uniStr[i+1]); uint16_t chUni = unicode[i];
if(p->isLeaf) if(p->isLeaf)
{ {
uint pos = p->nodeInfoVecPos; uint pos = p->nodeInfoVecPos;
@@ -212,25 +199,32 @@ namespace CppJieba
const TrieNodeInfo* Trie::find(const string& str) const TrieNodeInfo* Trie::find(const string& str)
{ {
string uniStr = gEncoding.decode(str); vector<uint16_t> unicode;
return _findUniStr(uniStr); bool retFlag = gEncoding.decode(str, unicode);
if(!retFlag)
{
return NULL;
}
return find(unicode);
} }
const TrieNodeInfo* Trie::_findUniStr(const string& uniStr) const TrieNodeInfo* Trie::find(const vector<uint16_t>& unicode)
{ {
if(!_getInitFlag()) if(!_getInitFlag())
{ {
LogFatal("trie not initted!"); LogFatal("trie not initted!");
return NULL; return NULL;
} }
if(uniStr.empty() || uniStr.size() % 2) if(unicode.empty())
{ {
LogError("uniStr illegal"); LogError("unicode empty");
return NULL;
} }
TrieNode* p = _root; TrieNode* p = _root;
for(uint i = 0; i < uniStr.size(); i+=2) for(uint i = 0; i < unicode.size(); i++)
{ {
ChUnicode chUni = twocharToUint16(uniStr[i], uniStr[i+1]); uint16_t chUni = unicode[i];
if(p->hmap.find(chUni) == p-> hmap.end()) if(p->hmap.find(chUni) == p-> hmap.end())
{ {
return NULL; return NULL;
@@ -258,8 +252,10 @@ namespace CppJieba
double Trie::getWeight(const string& str) double Trie::getWeight(const string& str)
{ {
string uniStr = gEncoding.decode(str);
const TrieNodeInfo * p = _findUniStr(uniStr); vector<uint16_t> unicode;
gEncoding.decode(str, unicode);
const TrieNodeInfo * p = find(unicode);
if(NULL != p) if(NULL != p)
{ {
return p->weight; return p->weight;
@@ -303,17 +299,18 @@ namespace CppJieba
const string& word = nodeInfo.word; const string& word = nodeInfo.word;
string uniStr = gEncoding.decode(word); vector<uint16_t> unicode;
if(uniStr.empty()) bool retFlag = gEncoding.decode(word, unicode);
if(!retFlag)
{ {
LogError("gEncoding.decode error."); LogError("gEncoding.decode error.");
return false; return false;
} }
TrieNode* p = _root; TrieNode* p = _root;
for(uint i = 0; i < uniStr.size(); i+=2) for(uint i = 0; i < unicode.size(); i++)
{ {
ChUnicode cu = twocharToUint16(uniStr[i], uniStr[i+1]); uint16_t cu = unicode[i];
if(NULL == p) if(NULL == p)
{ {
return false; return false;
@@ -398,7 +395,6 @@ namespace CppJieba
using namespace CppJieba; using namespace CppJieba;
int main() int main()
{ {
cout<<__FILE__<<__FILE__<<endl;
Trie trie; Trie trie;
trie.init(); trie.init();
trie.loadDict("../dicts/segdict.utf8.v2.1"); trie.loadDict("../dicts/segdict.utf8.v2.1");

View File

@@ -26,7 +26,7 @@ namespace CppJieba
using namespace CPPCOMMON; using namespace CPPCOMMON;
using namespace std; using namespace std;
//using __gnu_cxx::hash_map; //using __gnu_cxx::hash_map;
typedef map<ChUnicode, struct TrieNode*> TrieNodeMap; typedef map<uint16_t, struct TrieNode*> TrieNodeMap;
struct TrieNodeInfo struct TrieNodeInfo
{ {
@@ -88,6 +88,7 @@ namespace CppJieba
public: public:
const TrieNodeInfo* find(const string& str); const TrieNodeInfo* find(const string& str);
const TrieNodeInfo* find(const vector<uint16_t>& unicode);
const TrieNodeInfo* findPrefix(const string& str); const TrieNodeInfo* findPrefix(const string& str);
public: public:
@@ -102,7 +103,6 @@ namespace CppJieba
bool _buildTree(const string& filePath); bool _buildTree(const string& filePath);
bool _countWeight(); bool _countWeight();
bool _deleteNode(TrieNode* node); bool _deleteNode(TrieNode* node);
const TrieNodeInfo* _findUniStr(const string& uniStr);
}; };
} }

View File

@@ -13,7 +13,6 @@ namespace CppJieba
{ {
//typedefs //typedefs
typedef uint16_t ChUnicode;
typedef unsigned int uint; typedef unsigned int uint;
typedef std::vector<std::string>::iterator VSI; typedef std::vector<std::string>::iterator VSI;