mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
rm TrieManager.hpp
This commit is contained in:
parent
e3b58d6ddc
commit
ddaa5589f1
@ -9,14 +9,13 @@
|
||||
#include "ISegment.hpp"
|
||||
#include "SegmentBase.hpp"
|
||||
#include "TransCode.hpp"
|
||||
#include "TrieManager.hpp"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
class FullSegment: public SegmentBase
|
||||
{
|
||||
private:
|
||||
Trie* _trie;
|
||||
Trie _trie;
|
||||
|
||||
public:
|
||||
FullSegment(){_setInitFlag(false);};
|
||||
@ -30,12 +29,8 @@ namespace CppJieba
|
||||
LogError("already inited before now.");
|
||||
return false;
|
||||
}
|
||||
_trie = TrieManager::getInstance().getTrie(dictPath.c_str());
|
||||
if (NULL == _trie)
|
||||
{
|
||||
LogError("get NULL pointor from getTrie(\"%s\")", dictPath.c_str());
|
||||
return false;
|
||||
}
|
||||
_trie.init(dictPath.c_str());
|
||||
assert(_trie);
|
||||
return _setInitFlag(true);
|
||||
}
|
||||
|
||||
@ -66,7 +61,7 @@ namespace CppJieba
|
||||
for (Unicode::const_iterator uItr = begin; uItr != end; uItr++)
|
||||
{
|
||||
//find word start from uItr
|
||||
if (_trie->find(uItr, end, tRes))
|
||||
if (_trie.find(uItr, end, tRes))
|
||||
{
|
||||
for (vector<pair<uint, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||
{
|
||||
|
@ -10,7 +10,7 @@
|
||||
#include <cassert>
|
||||
#include "Limonp/logger.hpp"
|
||||
#include "Trie.hpp"
|
||||
#include "TrieManager.hpp"
|
||||
#include "Trie.hpp"
|
||||
#include "ISegment.hpp"
|
||||
#include "SegmentBase.hpp"
|
||||
|
||||
@ -25,15 +25,14 @@ namespace CppJieba
|
||||
double weight;
|
||||
|
||||
SegmentChar(uint16_t uni):uniCh(uni), pInfo(NULL), weight(0.0)
|
||||
{
|
||||
}
|
||||
{}
|
||||
};
|
||||
typedef vector<SegmentChar> SegmentContext;
|
||||
|
||||
class MPSegment: public SegmentBase
|
||||
{
|
||||
protected:
|
||||
Trie* _trie;
|
||||
Trie _trie;
|
||||
|
||||
public:
|
||||
MPSegment(){_setInitFlag(false);};
|
||||
@ -50,12 +49,8 @@ namespace CppJieba
|
||||
LogError("already inited before now.");
|
||||
return false;
|
||||
}
|
||||
_trie = TrieManager::getInstance().getTrie(dictPath.c_str());
|
||||
if (_trie == NULL)
|
||||
{
|
||||
LogError("get a NULL pointor form getTrie(\"%s\").", dictPath.c_str());
|
||||
return false;
|
||||
}
|
||||
_trie.init(dictPath);
|
||||
assert(_trie);
|
||||
LogInfo("MPSegment init(%s) ok", dictPath.c_str());
|
||||
return _setInitFlag(true);
|
||||
}
|
||||
@ -129,7 +124,7 @@ namespace CppJieba
|
||||
{
|
||||
SegmentChar schar(*it);
|
||||
uint i = it - begin;
|
||||
_trie->find(it, end, i, schar.dag);
|
||||
_trie.find(it, end, i, schar.dag);
|
||||
//DagType::iterator dagIter;
|
||||
if(schar.dag.end() == schar.dag.find(i))
|
||||
{
|
||||
@ -167,7 +162,7 @@ namespace CppJieba
|
||||
}
|
||||
else
|
||||
{
|
||||
val += _trie->getMinLogFreq();
|
||||
val += _trie.getMinLogFreq();
|
||||
}
|
||||
if(val > segContext[i].weight)
|
||||
{
|
||||
@ -195,7 +190,7 @@ namespace CppJieba
|
||||
TrieNodeInfo nodeInfo;
|
||||
nodeInfo.word.push_back(segContext[i].uniCh);
|
||||
nodeInfo.freq = 0;
|
||||
nodeInfo.logFreq = _trie->getMinLogFreq();
|
||||
nodeInfo.logFreq = _trie.getMinLogFreq();
|
||||
res.push_back(nodeInfo);
|
||||
i++;
|
||||
}
|
||||
|
@ -18,16 +18,13 @@ namespace CppJieba
|
||||
explicit MixSegment(const string& mpSegDict, const string& hmmSegDict)
|
||||
{
|
||||
_setInitFlag(init(mpSegDict, hmmSegDict));
|
||||
assert(_getInitFlag());
|
||||
}
|
||||
virtual ~MixSegment(){}
|
||||
public:
|
||||
bool init(const string& mpSegDict, const string& hmmSegDict)
|
||||
{
|
||||
if(_getInitFlag())
|
||||
{
|
||||
LogError("inited.");
|
||||
return false;
|
||||
}
|
||||
assert(!_getInitFlag());
|
||||
if(!_mpSeg.init(mpSegDict))
|
||||
{
|
||||
LogError("_mpSeg init");
|
||||
|
@ -4,7 +4,6 @@
|
||||
#include "MixSegment.hpp"
|
||||
#include "Limonp/str_functs.hpp"
|
||||
#include "Trie.hpp"
|
||||
#include "TrieManager.hpp"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
@ -14,7 +13,7 @@ namespace CppJieba
|
||||
{
|
||||
private:
|
||||
MixSegment _segment;
|
||||
Trie* _trie;
|
||||
Trie _trie;
|
||||
|
||||
public:
|
||||
PosTagger(){_setInitFlag(false);};
|
||||
@ -26,17 +25,10 @@ namespace CppJieba
|
||||
public:
|
||||
bool init(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string& transProb)
|
||||
{
|
||||
if (_getInitFlag())
|
||||
{
|
||||
LogError("already inited before.");
|
||||
return false;
|
||||
}
|
||||
_trie = TrieManager::getInstance().getTrie(dictPath.c_str());
|
||||
if (NULL == _trie)
|
||||
{
|
||||
LogError("get a NULL pointor from getTrie(\"%s\").", dictPath.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
assert(!_getInitFlag());
|
||||
_trie.init(dictPath);
|
||||
assert(_trie);
|
||||
return _setInitFlag(_segment.init(dictPath, hmmFilePath));
|
||||
};
|
||||
|
||||
@ -59,7 +51,7 @@ namespace CppJieba
|
||||
LogError("decode failed.");
|
||||
return false;
|
||||
}
|
||||
tmp = _trie->find(unico.begin(), unico.end());
|
||||
tmp = _trie.find(unico.begin(), unico.end());
|
||||
res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag));
|
||||
}
|
||||
tmp = NULL;
|
||||
|
@ -11,7 +11,7 @@
|
||||
#include "FullSegment.hpp"
|
||||
#include "MixSegment.hpp"
|
||||
#include "TransCode.hpp"
|
||||
#include "TrieManager.hpp"
|
||||
#include "Trie.hpp"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
|
79
src/Trie.hpp
79
src/Trie.hpp
@ -14,6 +14,7 @@
|
||||
#include <limits>
|
||||
#include "Limonp/str_functs.hpp"
|
||||
#include "Limonp/logger.hpp"
|
||||
#include "Limonp/InitOnOff.hpp"
|
||||
#include "TransCode.hpp"
|
||||
|
||||
|
||||
@ -64,14 +65,13 @@ namespace CppJieba
|
||||
|
||||
typedef map<uint, const TrieNodeInfo*> DagType;
|
||||
|
||||
class Trie
|
||||
class Trie: public InitOnOff
|
||||
{
|
||||
|
||||
private:
|
||||
TrieNode* _root;
|
||||
vector<TrieNodeInfo> _nodeInfoVec;
|
||||
|
||||
bool _initFlag;
|
||||
int64_t _freqSum;
|
||||
double _minLogFreq;
|
||||
|
||||
@ -81,57 +81,28 @@ namespace CppJieba
|
||||
_root = NULL;
|
||||
_freqSum = 0;
|
||||
_minLogFreq = MAX_DOUBLE;
|
||||
_initFlag = false;
|
||||
_setInitFlag(false);
|
||||
}
|
||||
Trie(const string& filePath): Trie()
|
||||
{
|
||||
_setInitFlag(init(filePath));
|
||||
}
|
||||
~Trie()
|
||||
{
|
||||
dispose();
|
||||
}
|
||||
bool init()
|
||||
{
|
||||
if(_getInitFlag())
|
||||
{
|
||||
LogError("already initted!");
|
||||
return false;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
_root = new TrieNode;
|
||||
}
|
||||
catch(const bad_alloc& e)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
if(NULL == _root)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
_setInitFlag(true);
|
||||
return true;
|
||||
}
|
||||
bool dispose()
|
||||
{
|
||||
if(!_getInitFlag())
|
||||
{
|
||||
return false;
|
||||
return;
|
||||
}
|
||||
bool ret = _deleteNode(_root);
|
||||
if(!ret)
|
||||
{
|
||||
LogFatal("_deleteNode failed!");
|
||||
return false;
|
||||
}
|
||||
_root = NULL;
|
||||
_nodeInfoVec.clear();
|
||||
|
||||
_setInitFlag(false);
|
||||
return ret;
|
||||
_deleteNode(_root);
|
||||
}
|
||||
bool loadDict(const char * const filePath)
|
||||
public:
|
||||
bool init(const string& filePath)
|
||||
{
|
||||
assert(_getInitFlag());
|
||||
if(!_trieInsert(filePath))
|
||||
assert(!_getInitFlag());
|
||||
|
||||
_root = new TrieNode;
|
||||
assert(_root);
|
||||
if(!_trieInsert(filePath.c_str()))
|
||||
{
|
||||
LogError("_trieInsert failed.");
|
||||
return false;
|
||||
@ -141,13 +112,9 @@ namespace CppJieba
|
||||
LogError("_countWeight failed.");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
return _setInitFlag(true);
|
||||
}
|
||||
|
||||
private:
|
||||
void _setInitFlag(bool on){_initFlag = on;};
|
||||
bool _getInitFlag()const{return _initFlag;};
|
||||
|
||||
public:
|
||||
const TrieNodeInfo* find(Unicode::const_iterator begin, Unicode::const_iterator end)const
|
||||
{
|
||||
@ -271,12 +238,6 @@ namespace CppJieba
|
||||
private:
|
||||
bool _insert(const TrieNodeInfo& nodeInfo)
|
||||
{
|
||||
if(!_getInitFlag())
|
||||
{
|
||||
LogFatal("not initted!");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
const Unicode& uintVec = nodeInfo.word;
|
||||
TrieNode* p = _root;
|
||||
@ -358,10 +319,9 @@ namespace CppJieba
|
||||
nodeInfo.tag = vecBuf[2];
|
||||
}
|
||||
|
||||
//_insert node
|
||||
if(!_insert(nodeInfo))
|
||||
{
|
||||
LogError("_insert node failed!");
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
@ -405,7 +365,7 @@ namespace CppJieba
|
||||
return true;
|
||||
}
|
||||
|
||||
bool _deleteNode(TrieNode* node)
|
||||
void _deleteNode(TrieNode* node)
|
||||
{
|
||||
for(TrieNodeMap::iterator it = node->hmap.begin(); it != node->hmap.end(); it++)
|
||||
{
|
||||
@ -414,7 +374,6 @@ namespace CppJieba
|
||||
}
|
||||
|
||||
delete node;
|
||||
return true;
|
||||
}
|
||||
|
||||
};
|
||||
|
@ -1,73 +0,0 @@
|
||||
#ifndef CPPJIEBA_TRIEMANAGER_H
|
||||
#define CPPJIEBA_TRIEMANAGER_H
|
||||
|
||||
#include "Trie.hpp"
|
||||
#include "Limonp/md5.hpp"
|
||||
#include "Limonp/logger.hpp"
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
using namespace Limonp;
|
||||
class TrieManager
|
||||
{
|
||||
private:
|
||||
unordered_map<std::string, Trie*> _tries;
|
||||
TrieManager(){};
|
||||
TrieManager(TrieManager& tm){};
|
||||
public:
|
||||
Trie* getTrie(const char* dictpath)
|
||||
{
|
||||
string md5;
|
||||
if (!md5File(dictpath, md5))
|
||||
{
|
||||
LogError("error when getting md5 for file '%s'", dictpath);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (_tries.find(md5) != _tries.end())
|
||||
{
|
||||
return _tries[md5.c_str()];
|
||||
}
|
||||
|
||||
//LogDebug("create a new trie for md5: '%s'", md5.c_str());
|
||||
Trie* trie = NULL;
|
||||
try
|
||||
{
|
||||
trie = new Trie();
|
||||
}
|
||||
catch (const bad_alloc& e)
|
||||
{
|
||||
LogError("error when new a trie for file '%s'", dictpath);
|
||||
return NULL;
|
||||
}
|
||||
if (NULL == trie)
|
||||
{
|
||||
LogError("get NULL from new trie for file '%s'", dictpath);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!trie->init())
|
||||
{
|
||||
LogError("trie init error for file '%s'", dictpath);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!trie->loadDict(dictpath))
|
||||
{
|
||||
LogError("trie->loadDict(%s) failed...", dictpath);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
_tries[md5.c_str()] = trie;
|
||||
LogDebug("trie->loadDict(%s)", dictpath);
|
||||
return trie;
|
||||
}
|
||||
|
||||
static TrieManager& getInstance()
|
||||
{
|
||||
static TrieManager _this;
|
||||
return _this;
|
||||
}
|
||||
};
|
||||
}
|
||||
#endif
|
@ -6,7 +6,7 @@ SET(GTEST_ROOT_DIR gtest-1.6.0)
|
||||
ADD_DEFINITIONS(-DLOGGER_LEVEL=LL_WARN)
|
||||
INCLUDE_DIRECTORIES(${GTEST_ROOT_DIR} ${GTEST_ROOT_DIR}/include ${PROJECT_SOURCE_DIR})
|
||||
ADD_LIBRARY(gtest STATIC ${GTEST_ROOT_DIR}/src/gtest-all.cc)
|
||||
ADD_EXECUTABLE(test.run gtest_main.cpp TKeywordExtractor.cpp TMPSegment.cpp TTrie.cpp TFullSegment.cpp TMd5.cpp TQuerySegment.cpp TTrieManager TFullSegment.cpp TMd5.cpp TQuerySegment.cpp TTrieManager.cpp THMMSegment.cpp TMixSegment.cpp TSegmentBase.cpp)
|
||||
ADD_EXECUTABLE(test.run gtest_main.cpp TKeywordExtractor.cpp TMPSegment.cpp TTrie.cpp TFullSegment.cpp TQuerySegment.cpp THMMSegment.cpp TMixSegment.cpp TSegmentBase.cpp)
|
||||
TARGET_LINK_LIBRARIES(gtest pthread)
|
||||
TARGET_LINK_LIBRARIES(test.run gtest pthread)
|
||||
|
||||
|
@ -1,28 +0,0 @@
|
||||
#include "src/TrieManager.hpp"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
const char* const DICT_FILE[] = {
|
||||
"../test/testdata/jieba.dict.0.utf8",
|
||||
"../test/testdata/jieba.dict.0.1.utf8",
|
||||
"../test/testdata/jieba.dict.1.utf8",
|
||||
"../test/testdata/jieba.dict.2.utf8"};
|
||||
|
||||
const char* const DICT_FILE_MD5[] = {
|
||||
"5aef74a56b363d994095c407c4809d84",
|
||||
"5aef74a56b363d994095c407c4809d84",
|
||||
"55f1116c05c8051ab53171f0b7455197",
|
||||
"b123553a2418c4bda51abc64d705d5d4"};
|
||||
|
||||
TEST(Md5Test, Test1)
|
||||
{
|
||||
ASSERT_EQ(sizeof(DICT_FILE)/sizeof(DICT_FILE[0]), sizeof(DICT_FILE_MD5)/sizeof(DICT_FILE_MD5[0]));
|
||||
string tmp;
|
||||
for (uint i = 0; i < sizeof(DICT_FILE)/sizeof(DICT_FILE[0]); i++)
|
||||
{
|
||||
md5File(DICT_FILE[i], tmp);
|
||||
ASSERT_EQ(tmp, string(DICT_FILE_MD5[i]));
|
||||
}
|
||||
}
|
||||
|
@ -8,8 +8,7 @@ static const char* const DICT_FILE = "../dict/jieba.dict.utf8";
|
||||
TEST(TrieTest, Test1)
|
||||
{
|
||||
Trie trie;
|
||||
ASSERT_TRUE(trie.init());
|
||||
ASSERT_TRUE(trie.loadDict(DICT_FILE));
|
||||
ASSERT_TRUE(trie.init(DICT_FILE));
|
||||
ASSERT_LT(trie.getMinLogFreq() + 17.2184, 0.001);
|
||||
string word("来到");
|
||||
Unicode uni;
|
||||
|
@ -1,52 +0,0 @@
|
||||
#include "src/TrieManager.hpp"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
struct md5_ptr
|
||||
{
|
||||
string md5;
|
||||
Trie* ptr;
|
||||
};
|
||||
typedef struct md5_ptr MD5_PTR;
|
||||
|
||||
static const char* const DICT_FILE[] = {
|
||||
"../test/testdata/jieba.dict.0.utf8",
|
||||
"../test/testdata/jieba.dict.0.utf8",
|
||||
"../test/testdata/jieba.dict.0.utf8",
|
||||
"../test/testdata/jieba.dict.0.1.utf8",
|
||||
"../test/testdata/jieba.dict.0.1.utf8",
|
||||
"../test/testdata/jieba.dict.0.1.utf8",
|
||||
"../test/testdata/jieba.dict.1.utf8",
|
||||
"../test/testdata/jieba.dict.1.utf8",
|
||||
"../test/testdata/jieba.dict.1.utf8",
|
||||
"../test/testdata/jieba.dict.2.utf8",
|
||||
"../test/testdata/jieba.dict.2.utf8",
|
||||
"../test/testdata/jieba.dict.2.utf8",
|
||||
"../test/testdata/jieba.dict.2.utf8"};
|
||||
|
||||
TEST(TrieManagerTest, Test1)
|
||||
{
|
||||
vector<MD5_PTR> tries(sizeof(DICT_FILE)/sizeof(DICT_FILE[0]));
|
||||
for (uint i = 0; i < tries.size(); i++)
|
||||
{
|
||||
tries[i].ptr = TrieManager::getInstance().getTrie(DICT_FILE[i]);
|
||||
ASSERT_TRUE(md5File(DICT_FILE[i], tries[i].md5));
|
||||
}
|
||||
|
||||
for (uint i = 0; i < tries.size(); i++)
|
||||
{
|
||||
for (uint j = i + 1; j < tries.size(); j++)
|
||||
{
|
||||
if (tries[i].md5 == tries[j].md5)
|
||||
{
|
||||
ASSERT_EQ(tries[i].ptr, tries[j].ptr);
|
||||
}
|
||||
else
|
||||
{
|
||||
ASSERT_NE(tries[i].ptr, tries[j].ptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user