rm TrieManager.hpp

This commit is contained in:
wyy 2014-03-15 22:02:48 +08:00
parent e3b58d6ddc
commit ddaa5589f1
11 changed files with 42 additions and 258 deletions

View File

@ -9,14 +9,13 @@
#include "ISegment.hpp" #include "ISegment.hpp"
#include "SegmentBase.hpp" #include "SegmentBase.hpp"
#include "TransCode.hpp" #include "TransCode.hpp"
#include "TrieManager.hpp"
namespace CppJieba namespace CppJieba
{ {
class FullSegment: public SegmentBase class FullSegment: public SegmentBase
{ {
private: private:
Trie* _trie; Trie _trie;
public: public:
FullSegment(){_setInitFlag(false);}; FullSegment(){_setInitFlag(false);};
@ -30,12 +29,8 @@ namespace CppJieba
LogError("already inited before now."); LogError("already inited before now.");
return false; return false;
} }
_trie = TrieManager::getInstance().getTrie(dictPath.c_str()); _trie.init(dictPath.c_str());
if (NULL == _trie) assert(_trie);
{
LogError("get NULL pointor from getTrie(\"%s\")", dictPath.c_str());
return false;
}
return _setInitFlag(true); return _setInitFlag(true);
} }
@ -66,7 +61,7 @@ namespace CppJieba
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++) for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
{ {
//find word start from uItr //find word start from uItr
if (_trie->find(uItr, end, tRes)) if (_trie.find(uItr, end, tRes))
{ {
for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
{ {

View File

@ -10,7 +10,7 @@
#include <cassert> #include <cassert>
#include "Limonp/logger.hpp" #include "Limonp/logger.hpp"
#include "Trie.hpp" #include "Trie.hpp"
#include "TrieManager.hpp" #include "Trie.hpp"
#include "ISegment.hpp" #include "ISegment.hpp"
#include "SegmentBase.hpp" #include "SegmentBase.hpp"
@ -25,15 +25,14 @@ namespace CppJieba
double weight; double weight;
SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0) SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0)
{ {}
}
}; };
typedef vector<SegmentChar> SegmentContext; typedef vector<SegmentChar> SegmentContext;
class MPSegment: public SegmentBase class MPSegment: public SegmentBase
{ {
protected: protected:
Trie* _trie; Trie _trie;
public: public:
MPSegment(){_setInitFlag(false);}; MPSegment(){_setInitFlag(false);};
@ -50,12 +49,8 @@ namespace CppJieba
LogError("already inited before now."); LogError("already inited before now.");
return false; return false;
} }
_trie = TrieManager::getInstance().getTrie(dictPath.c_str()); _trie.init(dictPath);
if (_trie == NULL) assert(_trie);
{
LogError("get a NULL pointor form getTrie(\"%s\").", dictPath.c_str());
return false;
}
LogInfo("MPSegment init(%s) ok", dictPath.c_str()); LogInfo("MPSegment init(%s) ok", dictPath.c_str());
return _setInitFlag(true); return _setInitFlag(true);
} }
@ -129,7 +124,7 @@ namespace CppJieba
{ {
SegmentChar schar(*it); SegmentChar schar(*it);
uint i = it - begin; uint i = it - begin;
_trie->find(it, end, i, schar.dag); _trie.find(it, end, i, schar.dag);
//DagType::iterator dagIter; //DagType::iterator dagIter;
if(schar.dag.end() == schar.dag.find(i)) if(schar.dag.end() == schar.dag.find(i))
{ {
@ -167,7 +162,7 @@ namespace CppJieba
} }
else else
{ {
val += _trie->getMinLogFreq(); val += _trie.getMinLogFreq();
} }
if(val > segContext[i].weight) if(val > segContext[i].weight)
{ {
@ -195,7 +190,7 @@ namespace CppJieba
TrieNodeInfo nodeInfo; TrieNodeInfo nodeInfo;
nodeInfo.word.push_back(segContext[i].uniCh); nodeInfo.word.push_back(segContext[i].uniCh);
nodeInfo.freq = 0; nodeInfo.freq = 0;
nodeInfo.logFreq = _trie->getMinLogFreq(); nodeInfo.logFreq = _trie.getMinLogFreq();
res.push_back(nodeInfo); res.push_back(nodeInfo);
i++; i++;
} }

View File

@ -18,16 +18,13 @@ namespace CppJieba
explicit MixSegment(const string& mpSegDict, const string& hmmSegDict) explicit MixSegment(const string& mpSegDict, const string& hmmSegDict)
{ {
_setInitFlag(init(mpSegDict, hmmSegDict)); _setInitFlag(init(mpSegDict, hmmSegDict));
assert(_getInitFlag());
} }
virtual ~MixSegment(){} virtual ~MixSegment(){}
public: public:
bool init(const string& mpSegDict, const string& hmmSegDict) bool init(const string& mpSegDict, const string& hmmSegDict)
{ {
if(_getInitFlag()) assert(!_getInitFlag());
{
LogError("inited.");
return false;
}
if(!_mpSeg.init(mpSegDict)) if(!_mpSeg.init(mpSegDict))
{ {
LogError("_mpSeg init"); LogError("_mpSeg init");

View File

@ -4,7 +4,6 @@
#include "MixSegment.hpp" #include "MixSegment.hpp"
#include "Limonp/str_functs.hpp" #include "Limonp/str_functs.hpp"
#include "Trie.hpp" #include "Trie.hpp"
#include "TrieManager.hpp"
namespace CppJieba namespace CppJieba
{ {
@ -14,7 +13,7 @@ namespace CppJieba
{ {
private: private:
MixSegment _segment; MixSegment _segment;
Trie* _trie; Trie _trie;
public: public:
PosTagger(){_setInitFlag(false);}; PosTagger(){_setInitFlag(false);};
@ -26,17 +25,10 @@ namespace CppJieba
public: public:
bool init(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string& transProb) bool init(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string& transProb)
{ {
if (_getInitFlag())
{ assert(!_getInitFlag());
LogError("already inited before."); _trie.init(dictPath);
return false; assert(_trie);
}
_trie = TrieManager::getInstance().getTrie(dictPath.c_str());
if (NULL == _trie)
{
LogError("get a NULL pointor from getTrie(\"%s\").", dictPath.c_str());
return false;
}
return _setInitFlag(_segment.init(dictPath, hmmFilePath)); return _setInitFlag(_segment.init(dictPath, hmmFilePath));
}; };
@ -59,7 +51,7 @@ namespace CppJieba
LogError("decode failed."); LogError("decode failed.");
return false; return false;
} }
tmp = _trie->find(unico.begin(), unico.end()); tmp = _trie.find(unico.begin(), unico.end());
res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag)); res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag));
} }
tmp = NULL; tmp = NULL;

View File

@ -11,7 +11,7 @@
#include "FullSegment.hpp" #include "FullSegment.hpp"
#include "MixSegment.hpp" #include "MixSegment.hpp"
#include "TransCode.hpp" #include "TransCode.hpp"
#include "TrieManager.hpp" #include "Trie.hpp"
namespace CppJieba namespace CppJieba
{ {

View File

@ -14,6 +14,7 @@
#include <limits> #include <limits>
#include "Limonp/str_functs.hpp" #include "Limonp/str_functs.hpp"
#include "Limonp/logger.hpp" #include "Limonp/logger.hpp"
#include "Limonp/InitOnOff.hpp"
#include "TransCode.hpp" #include "TransCode.hpp"
@ -64,14 +65,13 @@ namespace CppJieba
typedef map<uint, const TrieNodeInfo*> DagType; typedef map<uint, const TrieNodeInfo*> DagType;
class Trie class Trie: public InitOnOff
{ {
private: private:
TrieNode* _root; TrieNode* _root;
vector<TrieNodeInfo> _nodeInfoVec; vector<TrieNodeInfo> _nodeInfoVec;
bool _initFlag;
int64_t _freqSum; int64_t _freqSum;
double _minLogFreq; double _minLogFreq;
@ -81,57 +81,28 @@ namespace CppJieba
_root = NULL; _root = NULL;
_freqSum = 0; _freqSum = 0;
_minLogFreq = MAX_DOUBLE; _minLogFreq = MAX_DOUBLE;
_initFlag = false; _setInitFlag(false);
}
Trie(const string& filePath): Trie()
{
_setInitFlag(init(filePath));
} }
~Trie() ~Trie()
{
dispose();
}
bool init()
{
if(_getInitFlag())
{
LogError("already initted!");
return false;
}
try
{
_root = new TrieNode;
}
catch(const bad_alloc& e)
{
return false;
}
if(NULL == _root)
{
return false;
}
_setInitFlag(true);
return true;
}
bool dispose()
{ {
if(!_getInitFlag()) if(!_getInitFlag())
{ {
return false; return;
} }
bool ret = _deleteNode(_root); _deleteNode(_root);
if(!ret) }
public:
bool init(const string& filePath)
{ {
LogFatal("_deleteNode failed!"); assert(!_getInitFlag());
return false;
}
_root = NULL;
_nodeInfoVec.clear();
_setInitFlag(false); _root = new TrieNode;
return ret; assert(_root);
} if(!_trieInsert(filePath.c_str()))
bool loadDict(const char * const filePath)
{
assert(_getInitFlag());
if(!_trieInsert(filePath))
{ {
LogError("_trieInsert failed."); LogError("_trieInsert failed.");
return false; return false;
@ -141,13 +112,9 @@ namespace CppJieba
LogError("_countWeight failed."); LogError("_countWeight failed.");
return false; return false;
} }
return true; return _setInitFlag(true);
} }
private:
void _setInitFlag(bool on){_initFlag = on;};
bool _getInitFlag()const{return _initFlag;};
public: public:
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const
{ {
@ -271,12 +238,6 @@ namespace CppJieba
private: private:
bool _insert(const TrieNodeInfo& nodeInfo) bool _insert(const TrieNodeInfo& nodeInfo)
{ {
if(!_getInitFlag())
{
LogFatal("not initted!");
return false;
}
const Unicode& uintVec = nodeInfo.word; const Unicode& uintVec = nodeInfo.word;
TrieNode* p = _root; TrieNode* p = _root;
@ -358,10 +319,9 @@ namespace CppJieba
nodeInfo.tag = vecBuf[2]; nodeInfo.tag = vecBuf[2];
} }
//_insert node
if(!_insert(nodeInfo)) if(!_insert(nodeInfo))
{ {
LogError("_insert node failed!"); assert(false);
} }
} }
return true; return true;
@ -405,7 +365,7 @@ namespace CppJieba
return true; return true;
} }
bool _deleteNode(TrieNode* node) void _deleteNode(TrieNode* node)
{ {
for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++) for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++)
{ {
@ -414,7 +374,6 @@ namespace CppJieba
} }
delete node; delete node;
return true;
} }
}; };

View File

@ -1,73 +0,0 @@
#ifndef CPPJIEBA_TRIEMANAGER_H
#define CPPJIEBA_TRIEMANAGER_H
#include "Trie.hpp"
#include "Limonp/md5.hpp"
#include "Limonp/logger.hpp"
namespace CppJieba
{
using namespace Limonp;
/**
 * Singleton cache of loaded Trie objects.
 *
 * Tries are keyed by the digest string that md5File() produces for the
 * dictionary path, so two paths that yield the same digest share one Trie.
 * Cached Trie objects are owned by the manager and are never released by
 * this class (there is no destructor here).
 */
class TrieManager
{
    private:
        // digest string -> loaded Trie (owning raw pointer)
        unordered_map<std::string, Trie*> _tries;
        // Construction and copying are private: use getInstance().
        TrieManager(){};
        TrieManager(TrieManager& tm){};
    public:
        /**
         * Return the Trie loaded from dictpath, creating and caching it on
         * first request.
         *
         * @param dictpath path of the dictionary file to load.
         * @return the cached or freshly loaded Trie, or NULL when hashing
         *         the file, allocating, initialising or loading fails.
         */
        Trie* getTrie(const char* dictpath)
        {
            string md5;
            if (!md5File(dictpath, md5))
            {
                LogError("error when getting md5 for file '%s'", dictpath);
                return NULL;
            }

            // Single lookup: reuse the iterator instead of find() + operator[].
            unordered_map<std::string, Trie*>::const_iterator cached = _tries.find(md5);
            if (cached != _tries.end())
            {
                return cached->second;
            }

            //LogDebug("create a new trie for md5: '%s'", md5.c_str());
            Trie* trie = NULL;
            try
            {
                trie = new Trie();
            }
            catch (const bad_alloc& e)
            {
                LogError("error when new a trie for file '%s'", dictpath);
                return NULL;
            }
            if (NULL == trie)
            {
                LogError("get NULL from new trie for file '%s'", dictpath);
                return NULL;
            }

            if (!trie->init())
            {
                LogError("trie init error for file '%s'", dictpath);
                // The trie is not cached yet, so release it here to avoid a leak.
                delete trie;
                return NULL;
            }

            if (!trie->loadDict(dictpath))
            {
                LogError("trie->loadDict(%s) failed...", dictpath);
                // The trie is not cached yet, so release it here to avoid a leak.
                delete trie;
                return NULL;
            }

            _tries[md5] = trie;
            LogDebug("trie->loadDict(%s)", dictpath);
            return trie;
        }

        /** Return the single manager instance (function-local static). */
        static TrieManager& getInstance()
        {
            static TrieManager _this;
            return _this;
        }
};
}
#endif

View File

@ -6,7 +6,7 @@ SET(GTEST_ROOT_DIR gtest-1.6.0)
ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN) ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN)
INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR}) INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc) ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
ADD_EXECUTABLE(test.run gtest_main.cpp TKeywordExtractor.cpp TMPSegment.cpp TTrie.cpp TFullSegment.cpp TMd5.cpp TQuerySegment.cpp TTrieManager TFullSegment.cpp TMd5.cpp TQuerySegment.cpp TTrieManager.cpp THMMSegment.cpp TMixSegment.cpp TSegmentBase.cpp) ADD_EXECUTABLE(test.run gtest_main.cpp TKeywordExtractor.cpp TMPSegment.cpp TTrie.cpp TFullSegment.cpp TQuerySegment.cpp THMMSegment.cpp TMixSegment.cpp TSegmentBase.cpp)
TARGET_LINK_LIBRARIES(gtest pthread) TARGET_LINK_LIBRARIES(gtest pthread)
TARGET_LINK_LIBRARIES(test.run gtest pthread) TARGET_LINK_LIBRARIES(test.run gtest pthread)

View File

@ -1,28 +0,0 @@
#include "src/TrieManager.hpp"
#include "gtest/gtest.h"
using namespace CppJieba;
// Dictionary files whose digests are checked by the MD5 test below.
const char* const DICT_FILE[] = {
"../test/testdata/jieba.dict.0.utf8",
"../test/testdata/jieba.dict.0.1.utf8",
"../test/testdata/jieba.dict.1.utf8",
"../test/testdata/jieba.dict.2.utf8"};
// Expected digest for each entry of DICT_FILE, in the same order.
// The first two files are listed with the same digest.
const char* const DICT_FILE_MD5[] = {
"5aef74a56b363d994095c407c4809d84",
"5aef74a56b363d994095c407c4809d84",
"55f1116c05c8051ab53171f0b7455197",
"b123553a2418c4bda51abc64d705d5d4"};
// Checks that md5File() yields the expected digest for every entry of
// DICT_FILE.
TEST(Md5Test, Test1)
{
    const size_t fileCount = sizeof(DICT_FILE)/sizeof(DICT_FILE[0]);
    // The two tables must stay in lockstep.
    ASSERT_EQ(fileCount, sizeof(DICT_FILE_MD5)/sizeof(DICT_FILE_MD5[0]));
    for (size_t i = 0; i < fileCount; i++)
    {
        // Fresh buffer per file so a failed call cannot be masked by the
        // digest left over from the previous iteration.
        string digest;
        ASSERT_TRUE(md5File(DICT_FILE[i], digest));
        ASSERT_EQ(digest, string(DICT_FILE_MD5[i]));
    }
}

View File

@ -8,8 +8,7 @@ static const char* const DICT_FILE = "../dict/jieba.dict.utf8";
TEST(TrieTest, Test1) TEST(TrieTest, Test1)
{ {
Trie trie; Trie trie;
ASSERT_TRUE(trie.init()); ASSERT_TRUE(trie.init(DICT_FILE));
ASSERT_TRUE(trie.loadDict(DICT_FILE));
ASSERT_LT(trie.getMinLogFreq() + 17.2184, 0.001); ASSERT_LT(trie.getMinLogFreq() + 17.2184, 0.001);
string word("来到"); string word("来到");
Unicode uni; Unicode uni;

View File

@ -1,52 +0,0 @@
#include "src/TrieManager.hpp"
#include "gtest/gtest.h"
using namespace CppJieba;
// Pairs the digest string of a dictionary file with the Trie pointer that
// was obtained for it.
typedef struct md5_ptr
{
    string md5;   // digest filled in by md5File()
    Trie* ptr;    // pointer returned by TrieManager::getTrie()
} MD5_PTR;
// Dictionary paths requested from the TrieManager by the test below.
// Each path appears several times so that repeated requests for the same
// file are exercised.
static const char* const DICT_FILE[] = {
"../test/testdata/jieba.dict.0.utf8",
"../test/testdata/jieba.dict.0.utf8",
"../test/testdata/jieba.dict.0.utf8",
"../test/testdata/jieba.dict.0.1.utf8",
"../test/testdata/jieba.dict.0.1.utf8",
"../test/testdata/jieba.dict.0.1.utf8",
"../test/testdata/jieba.dict.1.utf8",
"../test/testdata/jieba.dict.1.utf8",
"../test/testdata/jieba.dict.1.utf8",
"../test/testdata/jieba.dict.2.utf8",
"../test/testdata/jieba.dict.2.utf8",
"../test/testdata/jieba.dict.2.utf8",
"../test/testdata/jieba.dict.2.utf8"};
// Verifies the TrieManager cache: files with equal digests must map to the
// same Trie pointer, files with different digests to different pointers.
TEST(TrieManagerTest, Test1)
{
    vector<MD5_PTR> tries(sizeof(DICT_FILE)/sizeof(DICT_FILE[0]));
    for (uint i = 0; i < tries.size(); i++)
    {
        tries[i].ptr = TrieManager::getInstance().getTrie(DICT_FILE[i]);
        // Fail at the point of loading rather than later, where NULL
        // pointers would show up as a misleading (in)equality failure.
        ASSERT_TRUE(tries[i].ptr != NULL);
        ASSERT_TRUE(md5File(DICT_FILE[i], tries[i].md5));
    }

    // Compare every unordered pair of entries exactly once.
    for (uint i = 0; i < tries.size(); i++)
    {
        for (uint j = i + 1; j < tries.size(); j++)
        {
            if (tries[i].md5 == tries[j].md5)
            {
                ASSERT_EQ(tries[i].ptr, tries[j].ptr);
            }
            else
            {
                ASSERT_NE(tries[i].ptr, tries[j].ptr);
            }
        }
    }
}