From d69411e9988cbc17eeba6ddb558e5903aa01744f Mon Sep 17 00:00:00 2001 From: gwdwyy Date: Mon, 22 Jul 2013 14:49:00 +0800 Subject: [PATCH] rewriting trie.cpp/h --- src/Segment.cpp | 16 +++--------- src/Trie.cpp | 66 +++++++++++++++++++++++-------------------------- src/Trie.h | 4 +-- src/globals.h | 1 - 4 files changed, 37 insertions(+), 50 deletions(-) diff --git a/src/Segment.cpp b/src/Segment.cpp index 7e8f74b..0baf1d9 100644 --- a/src/Segment.cpp +++ b/src/Segment.cpp @@ -16,8 +16,7 @@ namespace CppJieba bool Segment::init() { - bool retFlag; - retFlag = _trie.init(); + bool retFlag = _trie.init(); if(!retFlag) { LogError("_trie.init failed."); @@ -28,9 +27,8 @@ namespace CppJieba bool Segment::loadSegDict(const string& filePath) { - bool retFlag; - retFlag = _trie.loadDict(filePath); LogInfo(string_format("_trie.loadDict(%s) start...", filePath.c_str())); + bool retFlag = _trie.loadDict(filePath); LogInfo("_trie.loadDict end."); return retFlag; } @@ -48,10 +46,10 @@ namespace CppJieba string uniStr = gEncoding.decode(str); if(uniStr.empty()) { - LogError("_utf8ToUni failed."); + LogError("gEncoding.decode failed."); return false; } - + //calc DAG vector > dag; retFlag = _calcDAG(uniStr, dag); @@ -181,12 +179,8 @@ using namespace CppJieba; int main() { - /* - cout<<__FILE__<<__LINE__< unicode; + + bool retFlag = gEncoding.decode(str, unicode); + if(retFlag) { LogError("gEncoding.decode failed."); return NULL; } - if(uniStr.size() % 2) - { - LogError("utf8ToUnicode return uniStr illegal"); - return NULL; - } + //find TrieNode* p = _root; TrieNodeInfo * res = NULL; - for(uint i = 0; i < uniStr.size(); i+=2) + for(uint i = 0; i < unicode.size(); i++) { - ChUnicode chUni = twocharToUint16(uniStr[0], uniStr[i+1]); + uint16_t chUni = unicode[i]; if(p->isLeaf) { uint pos = p->nodeInfoVecPos; @@ -212,25 +199,32 @@ namespace CppJieba const TrieNodeInfo* Trie::find(const string& str) { - string uniStr = gEncoding.decode(str); - return _findUniStr(uniStr); + vector unicode; + bool retFlag = gEncoding.decode(str, unicode); + if(!retFlag) + { + return NULL; + } + return find(unicode); } - const TrieNodeInfo* Trie::_findUniStr(const string& uniStr) + const TrieNodeInfo* Trie::find(const vector& unicode) { + if(!_getInitFlag()) { LogFatal("trie not initted!"); return NULL; } - if(uniStr.empty() || uniStr.size() % 2) + if(unicode.empty()) { - LogError("uniStr illegal"); + LogError("unicode empty"); + return NULL; } TrieNode* p = _root; - for(uint i = 0; i < uniStr.size(); i+=2) + for(uint i = 0; i < unicode.size(); i++) { - ChUnicode chUni = twocharToUint16(uniStr[i], uniStr[i+1]); + uint16_t chUni = unicode[i]; if(p->hmap.find(chUni) == p-> hmap.end()) { return NULL; @@ -258,8 +252,10 @@ namespace CppJieba double Trie::getWeight(const string& str) { - string uniStr = gEncoding.decode(str); - const TrieNodeInfo * p = _findUniStr(uniStr); + + vector unicode; + gEncoding.decode(str, unicode); + const TrieNodeInfo * p = find(unicode); if(NULL != p) { return p->weight; @@ -303,17 +299,18 @@ namespace CppJieba const string& word = nodeInfo.word; - string uniStr = gEncoding.decode(word); - if(uniStr.empty()) + vector unicode; + bool retFlag = gEncoding.decode(word, unicode); + if(!retFlag) { LogError("gEncoding.decode error."); return false; } TrieNode* p = _root; - for(uint i = 0; i < uniStr.size(); i+=2) + for(uint i = 0; i < unicode.size(); i++) { - ChUnicode cu = twocharToUint16(uniStr[i], uniStr[i+1]); + uint16_t cu = unicode[i]; if(NULL == p) { return false; @@ -398,7 +395,6 @@ namespace CppJieba using namespace CppJieba; int main() { - cout<<__FILE__<<__FILE__< TrieNodeMap; + typedef map TrieNodeMap; struct TrieNodeInfo { @@ -88,6 +88,7 @@ namespace CppJieba public: const TrieNodeInfo* find(const string& str); + const TrieNodeInfo* find(const vector& unicode); const TrieNodeInfo* findPrefix(const string& str); public: @@ -102,7 +103,6 @@ namespace CppJieba bool _buildTree(const string& filePath); bool _countWeight(); bool _deleteNode(TrieNode* node); - const TrieNodeInfo* _findUniStr(const string& uniStr); }; } diff --git a/src/globals.h b/src/globals.h index 1c73c45..4d89362 100644 --- a/src/globals.h +++ b/src/globals.h @@ -13,7 +13,6 @@ namespace CppJieba { //typedefs - typedef uint16_t ChUnicode; typedef unsigned int uint; typedef std::vector::iterator VSI;