compile ok

This commit is contained in:
wyy 2014-04-10 09:16:35 -07:00
parent 776191b375
commit 24120c92b1
6 changed files with 108 additions and 56 deletions

View File

@ -12,7 +12,7 @@
#include "Limonp/logger.hpp" #include "Limonp/logger.hpp"
#include "Limonp/InitOnOff.hpp" #include "Limonp/InitOnOff.hpp"
#include "TransCode.hpp" #include "TransCode.hpp"
#include "DictTrie.hpp" #include "Trie.hpp"
@ -32,20 +32,30 @@ namespace CppJieba
double logFreq; //logFreq = log(freq/sum(freq)); double logFreq; //logFreq = log(freq/sum(freq));
}; };
// Streams a DictUnit as "<word> <freq> <tag> <logFreq>", separated by single spaces.
//
// os   : destination stream; returned to allow chaining.
// unit : dictionary entry to print.
inline ostream & operator << (ostream& os, const DictUnit& unit)
{
    string s;
    // presumably a project-provided overload that encodes the Unicode word
    // into the byte string -- confirm against TransCode/Limonp.
    s << unit.word;
    // "%f" is the printf conversion for a double. The previous "%llf" combined
    // the integer-only "ll" length modifier with "f", which is undefined
    // behaviour (glibc treats it as long double and prints garbage).
    // NOTE(review): "%u" assumes unit.freq is an unsigned int -- its declaration
    // is not visible here; confirm, or switch the specifier if it is size_t.
    return os << string_format("%s %u %s %f", s.c_str(), unit.freq, unit.tag.c_str(), unit.logFreq);
}
typedef map<size_t, const DictUnit*> DagType; typedef map<size_t, const DictUnit*> DagType;
class DictTrie: InitOnOff class DictTrie: public InitOnOff
{ {
public:
typedef Trie<Unicode::value_type, DictUnit> TrieType;
private: private:
vector<DictUnit> _nodeInfos; vector<DictUnit> _nodeInfos;
TrieType * _trie;
int64_t _freqSum; size_t _freqSum;
double _minLogFreq; double _minLogFreq;
public: public:
DictTrie() DictTrie()
{ {
_trie = NULL;
_freqSum = 0; _freqSum = 0;
_minLogFreq = MAX_DOUBLE; _minLogFreq = MAX_DOUBLE;
_setInitFlag(false); _setInitFlag(false);
@ -57,6 +67,10 @@ namespace CppJieba
} }
~DictTrie() ~DictTrie()
{ {
if(_trie)
{
delete _trie;
}
} }
private: private:
@ -69,14 +83,40 @@ namespace CppJieba
_freqSum = _calculateFreqSum(_nodeInfos); _freqSum = _calculateFreqSum(_nodeInfos);
assert(_freqSum); assert(_freqSum);
_minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum); _minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum);
return _setInitFlag(true); _trie = _creatTrie(_nodeInfos);
return _setInitFlag(_trie);
} }
public:
// Exact-match lookup of the code-point range [begin, end) in the dictionary.
//
// Returns the matching DictUnit, or NULL when the range is not a dictionary
// word. Also returns NULL when no trie has been built: _trie stays NULL after
// default construction and when the loaded dictionary is empty, so it must
// not be dereferenced unconditionally.
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
    if(NULL == _trie)
    {
        return NULL;
    }
    return _trie->find(begin, end);
}
// Collects every dictionary word that is a prefix of [begin, end) into `dag`.
//
// begin, end : code-point range to scan.
// dag        : output map, filled by the underlying trie lookup.
// offset     : forwarded to the trie lookup (defaults to 0).
//
// Returns the trie lookup's result. When no trie has been built (_trie is
// NULL after default construction or for an empty dictionary) `dag` is
// emptied and false is returned instead of dereferencing a null pointer.
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const
{
    if(NULL == _trie)
    {
        dag.clear();
        return false;
    }
    return _trie->find(begin, end, dag, offset);
}
public: public:
double getMinLogFreq() const {return _minLogFreq;}; double getMinLogFreq() const {return _minLogFreq;};
private: private:
// Builds a trie keyed by each unit's word, with values pointing at the units.
//
// dictUnits : dictionary entries. The trie is handed the ADDRESSES of these
//             elements, not copies, so the vector must stay alive and must
//             not be resized for as long as the trie is used.
//
// Returns a trie allocated with `new`, or NULL when dictUnits is empty.
TrieType * _creatTrie(const vector<DictUnit>& dictUnits)
{
    if(dictUnits.empty())
    {
        return NULL;
    }

    const size_t unitCount = dictUnits.size();
    vector<Unicode> words;
    vector<const DictUnit*> valuePointers;
    // The final size is known up front; reserve once instead of letting
    // push_back reallocate (and re-copy every Unicode key) as it grows.
    words.reserve(unitCount);
    valuePointers.reserve(unitCount);
    for(size_t i = 0; i < unitCount; i++)
    {
        words.push_back(dictUnits[i].word);
        valuePointers.push_back(&dictUnits[i]);
    }

    return new TrieType(words, valuePointers);
}
void _loadDict(const string& filePath, vector<DictUnit>& nodeInfos) const void _loadDict(const string& filePath, vector<DictUnit>& nodeInfos) const
{ {
ifstream ifs(filePath.c_str()); ifstream ifs(filePath.c_str());

View File

@ -64,7 +64,7 @@ namespace CppJieba
if (_dictTrie.find(uItr, end, tRes, 0)) if (_dictTrie.find(uItr, end, tRes, 0))
{ {
for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
//for (vector<pair<size_t, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++) //for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
{ {
wordLen = itr->second->word.size(); wordLen = itr->second->word.size();
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx)) if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx))

View File

@ -21,7 +21,7 @@ namespace CppJieba
{ {
uint16_t uniCh; uint16_t uniCh;
DagType dag; DagType dag;
const TrieNodeInfo * pInfo; const DictUnit * pInfo;
double weight; double weight;
SegmentChar():uniCh(0), pInfo(NULL), weight(0.0) SegmentChar():uniCh(0), pInfo(NULL), weight(0.0)
@ -142,7 +142,7 @@ namespace CppJieba
} }
size_t nextPos; size_t nextPos;
const TrieNodeInfo* p; const DictUnit* p;
double val; double val;
for(int i = segContext.size() - 1; i >= 0; i--) for(int i = segContext.size() - 1; i >= 0; i--)
@ -182,7 +182,7 @@ namespace CppJieba
size_t i = 0; size_t i = 0;
while(i < segContext.size()) while(i < segContext.size())
{ {
const TrieNodeInfo* p = segContext[i].pInfo; const DictUnit* p = segContext[i].pInfo;
if(p) if(p)
{ {
res.push_back(p->word); res.push_back(p->word);

View File

@ -42,7 +42,7 @@ namespace CppJieba
return false; return false;
} }
const TrieNodeInfo *tmp = NULL; const DictUnit *tmp = NULL;
Unicode unico; Unicode unico;
for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
{ {

View File

@ -6,26 +6,28 @@
namespace CppJieba namespace CppJieba
{ {
using namespace std;
template <class KeyType, class ValueType> template <class KeyType, class ValueType>
class TrieNode class TrieNode
{ {
public: public:
typedef unordered_map<KeyType, TrieNode*> TrieNodeMapType; typedef unordered_map<KeyType, TrieNode<KeyType, ValueType>* > KeyMapType;
public: public:
TrieNodeMap * ptKeyMap; KeyMapType * ptKeyMap;
const ValueType * ptValue; const ValueType * ptValue;
}; };
template <class KeyType, class ValueType> template <class KeyType, class ValueType>
class Trie class Trie
{ {
private:
TrieNode* _root;
private:
public: public:
Trie(const vector<KeyType>& keys, const vector<ValueType* >& valuePointers) typedef TrieNode<KeyType, ValueType> TrieNodeType;
private:
TrieNodeType* _root;
public:
Trie(const vector<vector<KeyType> >& keys, const vector<const ValueType* >& valuePointers)
{ {
_root = new TrieNode; _root = new TrieNodeType;
_root->ptKeyMap = NULL; _root->ptKeyMap = NULL;
_root->ptValue = NULL; _root->ptValue = NULL;
@ -33,28 +35,33 @@ namespace CppJieba
} }
~Trie() ~Trie()
{ {
if(_root)
{
_deleteNode(_root);
}
} }
public: public:
// Exact-match lookup of the key sequence [begin, end).
//
// Walks one trie level per key element, starting at the root.
// Returns the value pointer stored at the node reached, which is NULL when
// the sequence is only a prefix of stored keys, or NULL as soon as the walk
// cannot continue.
const ValueType* find(typename vector<KeyType>::const_iterator begin, typename vector<KeyType>::const_iterator end) const
{
    typename TrieNodeType::KeyMapType::const_iterator citer;
    const TrieNodeType* ptNode = _root;
    for(typename vector<KeyType>::const_iterator it = begin; it != end; ++it)
    {
        // Child maps are created lazily on insertion, so a leaf node (or the
        // root of an empty trie) has ptKeyMap == NULL. There is nothing
        // deeper to match and the map must not be dereferenced.
        if(NULL == ptNode->ptKeyMap)
        {
            return NULL;
        }
        citer = ptNode->ptKeyMap->find(*it);
        if(ptNode->ptKeyMap->end() == citer)
        {
            return NULL;
        }
        ptNode = citer->second;
    }
    return ptNode->ptValue;
}
bool find(KeyType::const_iterator begin, KeyType::const_iterator end, map<KeyType::size_type, const ValueType* >& ordererMap) const bool find(typename vector<KeyType>::const_iterator begin, typename vector<KeyType> ::const_iterator end, map<typename vector<KeyType>::size_type, const ValueType* >& ordererMap, size_t offset = 0) const
{ {
const TrieNode * ptNode = _root; const TrieNodeType * ptNode = _root;
TrieNodeMapType::const_iterator citer; typename TrieNodeType::KeyMapType::const_iterator citer;
for(KeyType::const_iterator itr = begin; itr != end ; itr++) ordererMap.clear();
for(typename vector<KeyType>::const_iterator itr = begin; itr != end ; itr++)
{ {
citer = ptNode->ptKeyMap->find(*itr); citer = ptNode->ptKeyMap->find(*itr);
if(ptNode->ptKeyMap->end() == citer) if(ptNode->ptKeyMap->end() == citer)
@ -64,14 +71,15 @@ namespace CppJieba
ptNode = citer->second; ptNode = citer->second;
if(ptNode->ptValue) if(ptNode->ptValue)
{ {
ordererMap[itr - begin] = ptNode->ptValue; ordererMap[itr - begin + offset] = ptNode->ptValue;
} }
} }
return ordererMap.size();
} }
private: private:
void _createTrie(const vector<KeyType>& keys, const vector<ValueType*>& valuePointers) void _createTrie(const vector<vector<KeyType> >& keys, const vector<const ValueType*>& valuePointers)
{ {
if(values.empty() || keys.empty()) if(valuePointers.empty() || keys.empty())
{ {
return; return;
} }
@ -83,27 +91,27 @@ namespace CppJieba
} }
} }
private: private:
void _insertNode(const KeyType& key, const Value* ptValue) void _insertNode(const vector<KeyType>& key, const ValueType* ptValue)
{ {
TrieNode* ptNode = _root; TrieNodeType* ptNode = _root;
TrieNode::KeyMapType::const_iterator kmIter; typename TrieNodeType::KeyMapType::const_iterator kmIter;
for(KeyType::const_iterator citer = key.begin(); citer != key.end(); citer++) for(typename vector<KeyType>::const_iterator citer = key.begin(); citer != key.end(); citer++)
{ {
if(NULL == ptNode->ptKeyMap) if(NULL == ptNode->ptKeyMap)
{ {
ptNode->ptKeyMap = new TrieNode::KeyMapType; ptNode->ptKeyMap = new typename TrieNodeType::KeyMapType;
} }
kmIter = ptNode->ptKeyMap->find(*citer); kmIter = ptNode->ptKeyMap->find(*citer);
if(ptNode->ptKeyMap->end() == kmIter) if(ptNode->ptKeyMap->end() == kmIter)
{ {
TrieNode * nextNode = new TrieNode; TrieNodeType * nextNode = new TrieNodeType;
nextNode->ptKeyMap = NULL; nextNode->ptKeyMap = NULL;
nextNode->ptValue = NULL; nextNode->ptValue = NULL;
ptNode->ptKeyMap[*citer] = nextNode; (*ptNode->ptKeyMap)[*citer] = nextNode;
ptNode = next; ptNode = nextNode;
} }
else else
{ {
@ -112,20 +120,24 @@ namespace CppJieba
} }
ptNode->ptValue = ptValue; ptNode->ptValue = ptValue;
} }
void _deleteNode(TrieNode* node) void _deleteNode(TrieNodeType* node)
{ {
if(!node) if(!node)
{ {
return; return;
} }
for(TrieNodeMapType::iterator it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++) if(node->ptKeyMap)
{
typename TrieNodeType::KeyMapType::iterator it;
for(it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++)
{ {
_deleteNode(it->second); _deleteNode(it->second);
} }
delete node->ptKeyMap; delete node->ptKeyMap;
}
delete node; delete node;
} }
} };
} }
#endif #endif

View File

@ -1,30 +1,30 @@
#include "src/Trie.hpp" #include "src/DictTrie.hpp"
#include "gtest/gtest.h" #include "gtest/gtest.h"
using namespace CppJieba; using namespace CppJieba;
static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8"; static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8";
TEST(TrieTest, NewAndDelete) TEST(DictTrieTest, NewAndDelete)
{ {
Trie * trie; DictTrie * trie;
trie = new Trie(DICT_FILE); trie = new DictTrie(DICT_FILE);
delete trie; delete trie;
trie = new Trie(); trie = new DictTrie();
delete trie; delete trie;
} }
TEST(TrieTest, Test1) TEST(DictTrieTest, Test1)
{ {
string s1, s2; string s1, s2;
Trie trie; DictTrie trie;
ASSERT_TRUE(trie.init(DICT_FILE)); ASSERT_TRUE(trie.init(DICT_FILE));
ASSERT_LT(trie.getMinLogFreq() + 15.6479, 0.001); ASSERT_LT(trie.getMinLogFreq() + 15.6479, 0.001);
string word("来到"); string word("来到");
Unicode uni; Unicode uni;
ASSERT_TRUE(TransCode::decode(word, uni)); ASSERT_TRUE(TransCode::decode(word, uni));
TrieNodeInfo nodeInfo; DictUnit nodeInfo;
nodeInfo.word = uni; nodeInfo.word = uni;
nodeInfo.freq = 8779; nodeInfo.freq = 8779;
nodeInfo.tag = "v"; nodeInfo.tag = "v";
@ -34,9 +34,9 @@ TEST(TrieTest, Test1)
EXPECT_EQ("[\"26469\", \"21040\"]:8779:v:-8.87033", s2); EXPECT_EQ("[\"26469\", \"21040\"]:8779:v:-8.87033", s2);
word = "清华大学"; word = "清华大学";
vector<pair<size_t, const TrieNodeInfo*> > res; vector<pair<size_t, const DictUnit*> > res;
map<size_t, const TrieNodeInfo* > resMap; map<size_t, const DictUnit* > resMap;
map<size_t, const TrieNodeInfo* > mp; map<size_t, const DictUnit* > mp;
const char * words[] = {"", "清华", "清华大学"}; const char * words[] = {"", "清华", "清华大学"};
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++)
{ {
@ -44,10 +44,10 @@ TEST(TrieTest, Test1)
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end()))); res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end()); resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
} }
//TrieNodeInfo //DictUnit
//res.push_back(make_pair(0, )) //res.push_back(make_pair(0, ))
vector<pair<size_t, const TrieNodeInfo*> > vec; vector<pair<size_t, const DictUnit*> > vec;
ASSERT_TRUE(TransCode::decode(word, uni)); ASSERT_TRUE(TransCode::decode(word, uni));
//print(uni); //print(uni);
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0)); ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0));