rewriting trie.cpp/h

This commit is contained in:
gwdwyy 2013-07-22 14:49:00 +08:00
parent ca5e5517e7
commit d69411e998
4 changed files with 37 additions and 50 deletions

View File

@ -16,8 +16,7 @@ namespace CppJieba
bool Segment::init()
{
bool retFlag;
retFlag = _trie.init();
bool retFlag = _trie.init();
if(!retFlag)
{
LogError("_trie.init failed.");
@ -28,9 +27,8 @@ namespace CppJieba
bool Segment::loadSegDict(const string& filePath)
{
bool retFlag;
retFlag = _trie.loadDict(filePath);
LogInfo(string_format("_trie.loadDict(%s) start...", filePath.c_str()));
bool retFlag = _trie.loadDict(filePath);
LogInfo("_trie.loadDict end.");
return retFlag;
}
@ -48,7 +46,7 @@ namespace CppJieba
string uniStr = gEncoding.decode(str);
if(uniStr.empty())
{
LogError("_utf8ToUni failed.");
LogError("gEncoding.decode failed.");
return false;
}
@ -181,12 +179,8 @@ using namespace CppJieba;
int main()
{
/*
cout<<__FILE__<<__LINE__<<endl;
Segment segment;
cout<<__FILE__<<__LINE__<<endl;
segment.init();
cout<<__FILE__<<__LINE__<<endl;
if(!segment.loadSegDict("../dicts/segdict.utf8.v2.1"))
{
cerr<<"1"<<endl;
@ -199,14 +193,12 @@ int main()
while(getline(ifile, line))
{
res.clear();
cout<<__FILE__<<__LINE__<<endl;
segment.cutDAG(line, res);
PRINT_VECTOR(res);
getchar();
}
segment.dispose();
*/
return 0;
}

View File

@ -114,13 +114,7 @@ namespace CppJieba
//insert node
TrieNodeInfo nodeInfo;
nodeInfo.word = chWord;
size_t wLen = getUtf8WordLen(chWord);
if(0 == wLen)
{
LogFatal(string_format("getUtf8WordLen(%s) return 0", chWord.c_str()));
return false;
}
nodeInfo.wLen = wLen;
nodeInfo.wLen = 0;
nodeInfo.count = count;
nodeInfo.tag = tag;
@ -162,28 +156,21 @@ namespace CppJieba
LogFatal("trie not initted!");
return NULL;
}
if(str.empty())
{
LogError("str is empty");
return NULL;
}
string uniStr = gEncoding.decode(str);
if(uniStr.empty())
vector<uint16_t> unicode;
bool retFlag = gEncoding.decode(str, unicode);
if(retFlag)
{
LogError("gEncoding.decode failed.");
return NULL;
}
if(uniStr.size() % 2)
{
LogError("utf8ToUnicode return uniStr illegal");
return NULL;
}
//find
TrieNode* p = _root;
TrieNodeInfo * res = NULL;
for(uint i = 0; i < uniStr.size(); i+=2)
for(uint i = 0; i < unicode.size(); i++)
{
ChUnicode chUni = twocharToUint16(uniStr[0], uniStr[i+1]);
uint16_t chUni = unicode[i];
if(p->isLeaf)
{
uint pos = p->nodeInfoVecPos;
@ -212,25 +199,32 @@ namespace CppJieba
const TrieNodeInfo* Trie::find(const string& str)
{
string uniStr = gEncoding.decode(str);
return _findUniStr(uniStr);
vector<uint16_t> unicode;
bool retFlag = gEncoding.decode(str, unicode);
if(!retFlag)
{
return NULL;
}
return find(unicode);
}
const TrieNodeInfo* Trie::_findUniStr(const string& uniStr)
const TrieNodeInfo* Trie::find(const vector<uint16_t>& unicode)
{
if(!_getInitFlag())
{
LogFatal("trie not initted!");
return NULL;
}
if(uniStr.empty() || uniStr.size() % 2)
if(unicode.empty())
{
LogError("uniStr illegal");
LogError("unicode empty");
return NULL;
}
TrieNode* p = _root;
for(uint i = 0; i < uniStr.size(); i+=2)
for(uint i = 0; i < unicode.size(); i++)
{
ChUnicode chUni = twocharToUint16(uniStr[i], uniStr[i+1]);
uint16_t chUni = unicode[i];
if(p->hmap.find(chUni) == p-> hmap.end())
{
return NULL;
@ -258,8 +252,10 @@ namespace CppJieba
double Trie::getWeight(const string& str)
{
string uniStr = gEncoding.decode(str);
const TrieNodeInfo * p = _findUniStr(uniStr);
vector<uint16_t> unicode;
gEncoding.decode(str, unicode);
const TrieNodeInfo * p = find(unicode);
if(NULL != p)
{
return p->weight;
@ -303,17 +299,18 @@ namespace CppJieba
const string& word = nodeInfo.word;
string uniStr = gEncoding.decode(word);
if(uniStr.empty())
vector<uint16_t> unicode;
bool retFlag = gEncoding.decode(word, unicode);
if(!retFlag)
{
LogError("gEncoding.decode error.");
return false;
}
TrieNode* p = _root;
for(uint i = 0; i < uniStr.size(); i+=2)
for(uint i = 0; i < unicode.size(); i++)
{
ChUnicode cu = twocharToUint16(uniStr[i], uniStr[i+1]);
uint16_t cu = unicode[i];
if(NULL == p)
{
return false;
@ -398,7 +395,6 @@ namespace CppJieba
using namespace CppJieba;
int main()
{
cout<<__FILE__<<__FILE__<<endl;
Trie trie;
trie.init();
trie.loadDict("../dicts/segdict.utf8.v2.1");

View File

@ -26,7 +26,7 @@ namespace CppJieba
using namespace CPPCOMMON;
using namespace std;
//using __gnu_cxx::hash_map;
typedef map<ChUnicode, struct TrieNode*> TrieNodeMap;
typedef map<uint16_t, struct TrieNode*> TrieNodeMap;
struct TrieNodeInfo
{
@ -88,6 +88,7 @@ namespace CppJieba
public:
const TrieNodeInfo* find(const string& str);
const TrieNodeInfo* find(const vector<uint16_t>& unicode);
const TrieNodeInfo* findPrefix(const string& str);
public:
@ -102,7 +103,6 @@ namespace CppJieba
bool _buildTree(const string& filePath);
bool _countWeight();
bool _deleteNode(TrieNode* node);
const TrieNodeInfo* _findUniStr(const string& uniStr);
};
}

View File

@ -13,7 +13,6 @@ namespace CppJieba
{
//typedefs
typedef uint16_t ChUnicode;
typedef unsigned int uint;
typedef std::vector<std::string>::iterator VSI;