From fe7e3ff80747809f97fcb73d35ddcc2cb25a35ce Mon Sep 17 00:00:00 2001 From: wyy Date: Sun, 16 Mar 2014 20:20:37 +0800 Subject: [PATCH] prettify Trie.hpp ing --- src/FullSegment.hpp | 4 +- src/HMMSegment.hpp | 26 +++--- src/KeywordExtractor.hpp | 8 +- src/MPSegment.hpp | 8 +- src/MixSegment.hpp | 4 +- src/SegmentBase.hpp | 10 +-- src/Trie.hpp | 145 ++++++++------------------------- test/unittest/TSegmentBase.cpp | 6 +- test/unittest/TTrie.cpp | 15 ++-- 9 files changed, 76 insertions(+), 150 deletions(-) diff --git a/src/FullSegment.hpp b/src/FullSegment.hpp index 654f6af..902c652 100644 --- a/src/FullSegment.hpp +++ b/src/FullSegment.hpp @@ -48,7 +48,7 @@ namespace CppJieba } //resut of searching in trie tree - vector > tRes; + vector > tRes; //max index of res's words int maxIdx = 0; @@ -63,7 +63,7 @@ namespace CppJieba //find word start from uItr if (_trie.find(uItr, end, tRes)) { - for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) + for (vector >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) { wordLen = itr->second->word.size(); if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index 4ad039b..6deb52c 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -76,7 +76,7 @@ namespace CppJieba LogError("not inited."); return false; } - vector status; + vector status; if(!_viterbi(begin, end, status)) { LogError("_viterbi failed."); @@ -85,7 +85,7 @@ namespace CppJieba Unicode::const_iterator left = begin; Unicode::const_iterator right; - for(uint i =0; i< status.size(); i++) + for(size_t i =0; i< status.size(); i++) { if(status[i] % 2) //if(E == status[i] || S == status[i]) { @@ -110,7 +110,7 @@ namespace CppJieba return false; } string tmp; - for(uint i = 0; i < words.size(); i++) + for(size_t i = 0; i < words.size(); i++) { if(TransCode::encode(words[i], tmp)) { @@ -121,7 +121,7 @@ namespace CppJieba } private: - bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector& status)const + bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector& status)const { if(begin == end) { @@ -133,7 +133,7 @@ namespace CppJieba size_t XYSize = X * Y; int * path; double * weight; - uint now, old, stat; + size_t now, old, stat; double tmp, endE, endS; try @@ -153,21 +153,21 @@ namespace CppJieba } //start - for(uint y = 0; y < Y; y++) + for(size_t y = 0; y < Y; y++) { weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE); path[0 + y * X] = -1; } //process //for(; begin != end; begin++) - for(uint x = 1; x < X; x++) + for(size_t x = 1; x < X; x++) { - for(uint y = 0; y < Y; y++) + for(size_t y = 0; y < Y; y++) { now = x + y*X; weight[now] = MIN_DOUBLE; path[now] = E; // warning - for(uint preY = 0; preY < Y; preY++) + for(size_t preY = 0; preY < Y; preY++) { old = x - 1 + preY * X; tmp = weight[old] + _transProb[preY][y] + _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE); @@ -221,14 +221,14 @@ namespace CppJieba LogError("start_p illegal"); return false; } - for(uint j = 0; j< tmp.size(); j++) + for(size_t j = 0; j< tmp.size(); j++) { _startProb[j] = atof(tmp[j].c_str()); //cout<<_startProb[j]< tmp, tmp2; uint16_t unico = 0; split(line, tmp, ","); - for(uint i = 0; i < tmp.size(); i++) + for(size_t i = 0; i < tmp.size(); i++) { split(tmp[i], tmp2, ":"); if(2 != tmp2.size()) diff --git a/src/KeywordExtractor.hpp b/src/KeywordExtractor.hpp index a59fd6b..28b1e0e 100644 --- a/src/KeywordExtractor.hpp +++ b/src/KeywordExtractor.hpp @@ -37,7 +37,7 @@ namespace CppJieba }; public: - bool extract(const string& str, vector& keywords, uint topN) const + bool extract(const string& str, vector& keywords, size_t topN) const { assert(_getInitFlag()); vector > topWords; @@ -45,14 +45,14 @@ namespace CppJieba { return false; } - for(uint i = 0; i < topWords.size(); i++) + for(size_t i = 0; i < topWords.size(); i++) { keywords.push_back(topWords[i].first); } return true; } - bool extract(const string& str, vector >& keywords, uint topN) const + bool extract(const string& str, vector >& keywords, size_t topN) const { vector words; if(!_segment.cut(str, words)) @@ -75,7 +75,7 @@ namespace CppJieba } map wordmap; - for(uint i = 0; i < words.size(); i ++) + for(size_t i = 0; i < words.size(); i ++) { wordmap[ words[i] ] += 1.0; } diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 1d3ca1d..a70e8b2 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -66,7 +66,7 @@ namespace CppJieba return false; } string tmp; - for(uint i = 0; i < segWordInfos.size(); i++) + for(size_t i = 0; i < segWordInfos.size(); i++) { if(TransCode::encode(segWordInfos[i].word, tmp)) { @@ -123,7 +123,7 @@ namespace CppJieba for(Unicode::const_iterator it = begin; it != end; it++) { SegmentChar schar(*it); - uint i = it - begin; + size_t i = it - begin; _trie.find(it, end, i, schar.dag); //DagType::iterator dagIter; if(schar.dag.end() == schar.dag.find(i)) @@ -148,7 +148,7 @@ namespace CppJieba segContext[i].weight = MIN_DOUBLE; for(DagType::const_iterator it = segContext[i].dag.begin(); it != segContext[i].dag.end(); it++) { - uint nextPos = it->first; + size_t nextPos = it->first; const TrieNodeInfo* p = it->second; double val = 0.0; if(nextPos + 1 < segContext.size()) @@ -176,7 +176,7 @@ namespace CppJieba } bool _cut(SegmentContext& segContext, vector& res)const { - uint i = 0; + size_t i = 0; while(i < segContext.size()) { const TrieNodeInfo* p = segContext[i].pInfo; diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp index 11c5dc7..b5a205f 100644 --- a/src/MixSegment.hpp +++ b/src/MixSegment.hpp @@ -59,7 +59,7 @@ namespace CppJieba vector hmmRes; Unicode piece; - for (uint i = 0, j = 0; i < infos.size(); i++) + for (size_t i = 0, j = 0; i < infos.size(); i++) { //if mp get a word, it's ok, put it into result if (1 != infos[i].word.size()) @@ -84,7 +84,7 @@ namespace CppJieba } //put hmm result to return - for (uint k = 0; k < hmmRes.size(); k++) + for (size_t k = 0; k < hmmRes.size(); k++) { res.push_back(hmmRes[k]); } diff --git a/src/SegmentBase.hpp b/src/SegmentBase.hpp index dabee0c..e67ee04 100644 --- a/src/SegmentBase.hpp +++ b/src/SegmentBase.hpp @@ -37,15 +37,15 @@ namespace CppJieba return cut(unico.begin(), unico.end(), res); #else const char * const cstr = str.c_str(); - uint size = str.size(); - uint offset = 0; + size_t size = str.size(); + size_t offset = 0; string subs; int ret; - uint len; + size_t len; while(offset < size) { const char * const nstr = cstr + offset; - uint nsize = size - offset; + size_t nsize = size - offset; if(-1 == (ret = filterAscii(nstr, nsize, len)) || 0 == len || len > nsize) { LogFatal("str[%s] illegal.", cstr); @@ -78,7 +78,7 @@ namespace CppJieba * else count the nonascii string's length and return 1; * if errors, return -1; * */ - static int filterAscii(const char* str, uint len, uint& resLen) + static int filterAscii(const char* str, size_t len, size_t& resLen) { if(!str || !len) { diff --git a/src/Trie.hpp b/src/Trie.hpp index 17812e9..decf6da 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -24,12 +24,13 @@ namespace CppJieba using namespace Limonp; const double MIN_DOUBLE = -3.14e+100; const double MAX_DOUBLE = 3.14e+100; + const size_t DICT_COLUMN_NUM = 3; typedef unordered_map TrieNodeMap; struct TrieNode { TrieNodeMap hmap; bool isLeaf; - uint nodeInfoVecPos; + size_t nodeInfoVecPos; TrieNode() { isLeaf = false; @@ -44,18 +45,11 @@ namespace CppJieba string tag; double logFreq; //logFreq = log(freq/sum(freq)); TrieNodeInfo():freq(0),logFreq(0.0) - { - } + {} TrieNodeInfo(const TrieNodeInfo& nodeInfo):word(nodeInfo.word), freq(nodeInfo.freq), tag(nodeInfo.tag), logFreq(nodeInfo.logFreq) - { - } + {} TrieNodeInfo(const Unicode& _word):word(_word),freq(0),logFreq(MIN_DOUBLE) - { - } - bool operator == (const TrieNodeInfo & rhs) const - { - return word == rhs.word && freq == rhs.freq && tag == rhs.tag && abs(logFreq - rhs.logFreq) < 0.001; - } + {} }; inline ostream& operator << (ostream& os, const TrieNodeInfo & nodeInfo) @@ -63,7 +57,7 @@ namespace CppJieba return os << nodeInfo.word << ":" << nodeInfo.freq << ":" << nodeInfo.tag << ":" << nodeInfo.logFreq ; } - typedef map DagType; + typedef map DagType; class Trie: public InitOnOff { @@ -89,10 +83,6 @@ namespace CppJieba } ~Trie() { - if(!_getInitFlag()) - { - return; - } _deleteNode(_root); } public: @@ -102,7 +92,7 @@ namespace CppJieba _root = new TrieNode; assert(_root); - if(!_trieInsert(filePath.c_str())) + if(!_trieInsert(filePath)) { LogError("_trieInsert failed."); return false; @@ -118,16 +108,6 @@ namespace CppJieba public: const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const { - - if(!_getInitFlag()) - { - LogFatal("trie not initted!"); - return NULL; - } - if(begin >= end) - { - return NULL; - } TrieNode* p = _root; for(Unicode::const_iterator it = begin; it != end; it++) { @@ -143,7 +123,7 @@ namespace CppJieba } if(p->isLeaf) { - uint pos = p->nodeInfoVecPos; + size_t pos = p->nodeInfoVecPos; if(pos < _nodeInfoVec.size()) { return &(_nodeInfoVec[pos]); @@ -157,18 +137,8 @@ namespace CppJieba return NULL; } - bool find(Unicode::const_iterator begin, Unicode::const_iterator end, vector >& res) const + bool find(Unicode::const_iterator begin, Unicode::const_iterator end, vector >& res) const { - if(!_getInitFlag()) - { - LogFatal("trie not initted!"); - return false; - } - if (begin >= end) - { - LogFatal("begin >= end"); - return false; - } TrieNode* p = _root; for (Unicode::const_iterator itr = begin; itr != end; itr++) { @@ -179,7 +149,7 @@ namespace CppJieba p = p->hmap[*itr]; if(p->isLeaf) { - uint pos = p->nodeInfoVecPos; + size_t pos = p->nodeInfoVecPos; if(pos < _nodeInfoVec.size()) { res.push_back(make_pair(itr-begin, &_nodeInfoVec[pos])); @@ -194,18 +164,8 @@ namespace CppJieba return !res.empty(); } - bool find(Unicode::const_iterator begin, Unicode::const_iterator end, uint offset, DagType & res) const + bool find(Unicode::const_iterator begin, Unicode::const_iterator end, size_t offset, DagType & res) const { - if(!_getInitFlag()) - { - LogFatal("trie not initted!"); - return false; - } - if (begin >= end) - { - LogFatal("begin >= end"); - return false; - } TrieNode* p = _root; for (Unicode::const_iterator itr = begin; itr != end; itr++) { @@ -216,10 +176,9 @@ namespace CppJieba p = p->hmap[*itr]; if(p->isLeaf) { - uint pos = p->nodeInfoVecPos; + size_t pos = p->nodeInfoVecPos; if(pos < _nodeInfoVec.size()) { - //res.push_back(make_pair(itr-begin, &_nodeInfoVec[pos])); res[itr-begin + offset] = &_nodeInfoVec[pos]; } else @@ -233,32 +192,22 @@ namespace CppJieba } public: - double getMinLogFreq()const{return _minLogFreq;}; + double getMinLogFreq() const {return _minLogFreq;}; private: - bool _insert(const TrieNodeInfo& nodeInfo) + void _insert(const TrieNodeInfo& nodeInfo) { const Unicode& uintVec = nodeInfo.word; TrieNode* p = _root; - for(uint i = 0; i < uintVec.size(); i++) + for(size_t i = 0; i < uintVec.size(); i++) { uint16_t cu = uintVec[i]; - if(NULL == p) - { - return false; - } + assert(p); if(p->hmap.end() == p->hmap.find(cu)) { - TrieNode * next = NULL; - try - { - next = new TrieNode; - } - catch(const bad_alloc& e) - { - return false; - } + TrieNode * next = new TrieNode; + assert(next); p->hmap[cu] = next; p = next; } @@ -267,62 +216,41 @@ namespace CppJieba p = p->hmap[cu]; } } - if(NULL == p) - { - return false; - } - if(p->isLeaf) - { - LogError("this node already _inserted"); - return false; - } + assert(p); + assert(!p->isLeaf); p->isLeaf = true; _nodeInfoVec.push_back(nodeInfo); p->nodeInfoVecPos = _nodeInfoVec.size() - 1; - return true; } private: - bool _trieInsert(const char * const filePath) + bool _trieInsert(const string& filePath) { - ifstream ifs(filePath); + ifstream ifs(filePath.c_str()); if(!ifs) { - LogError("open %s failed.", filePath); + LogError("open %s failed.", filePath.c_str()); return false; } string line; vector vecBuf; TrieNodeInfo nodeInfo; - size_t lineno = 0; - while(getline(ifs, line)) + for(size_t lineno = 0 ; getline(ifs, line); lineno++) { - vecBuf.clear(); - lineno ++; split(line, vecBuf, " "); - if(3 < vecBuf.size()) - { - LogError("line[%u:%s] illegal.", lineno, line.c_str()); - return false; - } + assert(vecBuf.size() == DICT_COLUMN_NUM); if(!TransCode::decode(vecBuf[0], nodeInfo.word)) { LogError("line[%u:%s] illegal.", lineno, line.c_str()); return false; } nodeInfo.freq = atoi(vecBuf[1].c_str()); - if(3 == vecBuf.size()) - { - nodeInfo.tag = vecBuf[2]; - } + nodeInfo.tag = vecBuf[2]; - if(!_insert(nodeInfo)) - { - assert(false); - } + _insert(nodeInfo); } return true; } @@ -340,21 +268,13 @@ namespace CppJieba _freqSum += _nodeInfoVec[i].freq; } - if(0 == _freqSum) - { - LogError("_freqSum == 0 ."); - return false; - } + assert(_freqSum); //normalize - for(uint i = 0; i < _nodeInfoVec.size(); i++) + for(size_t i = 0; i < _nodeInfoVec.size(); i++) { TrieNodeInfo& nodeInfo = _nodeInfoVec[i]; - if(0 == nodeInfo.freq) - { - LogFatal("nodeInfo.freq == 0!"); - return false; - } + assert(nodeInfo.freq); nodeInfo.logFreq = log(double(nodeInfo.freq)/double(_freqSum)); if(_minLogFreq > nodeInfo.logFreq) { @@ -367,12 +287,15 @@ namespace CppJieba void _deleteNode(TrieNode* node) { + if(!node) + { + return; + } for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++) { TrieNode* next = it->second; _deleteNode(next); } - delete node; } diff --git a/test/unittest/TSegmentBase.cpp b/test/unittest/TSegmentBase.cpp index 5b00bee..6ee708e 100644 --- a/test/unittest/TSegmentBase.cpp +++ b/test/unittest/TSegmentBase.cpp @@ -12,11 +12,11 @@ TEST(SegmentBaseTest, Test1) buf.push_back("你好"); buf.push_back("...hh"); vector res; - uint size = strlen(str); - uint offset = 0; + size_t size = strlen(str); + size_t offset = 0; while(offset < size) { - uint len = 0; + size_t len = 0; const char* t = str + offset; SegmentBase::filterAscii(t, size - offset, len); s.assign(t, len); diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp index 8b1c636..67dcf7a 100644 --- a/test/unittest/TTrie.cpp +++ b/test/unittest/TTrie.cpp @@ -7,6 +7,7 @@ static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8"; TEST(TrieTest, Test1) { + string s1, s2; Trie trie; ASSERT_TRUE(trie.init(DICT_FILE)); ASSERT_LT(trie.getMinLogFreq() + 15.6479, 0.001); @@ -18,14 +19,16 @@ TEST(TrieTest, Test1) nodeInfo.freq = 8779; nodeInfo.tag = "v"; nodeInfo.logFreq = -8.87033; + s1 << nodeInfo; + s2 << (*trie.find(uni.begin(), uni.end())); - EXPECT_EQ(nodeInfo, *trie.find(uni.begin(), uni.end())); + EXPECT_EQ("[\"26469\", \"21040\"]:8779:v:-8.87033", s2); word = "清华大学"; - vector > res; - map resMap; - map map; + vector > res; + map resMap; + map map; const char * words[] = {"清", "清华", "清华大学"}; - for(uint i = 0; i < sizeof(words)/sizeof(words[0]); i++) + for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { ASSERT_TRUE(TransCode::decode(words[i], uni)); res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end()))); @@ -34,7 +37,7 @@ TEST(TrieTest, Test1) //TrieNodeInfo //res.push_back(make_pair(0, )) - vector > vec; + vector > vec; ASSERT_TRUE(TransCode::decode(word, uni)); //print(uni); ASSERT_TRUE(trie.find(uni.begin(), uni.end(), vec));