mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
compile ok
This commit is contained in:
parent
776191b375
commit
24120c92b1
@ -12,7 +12,7 @@
|
||||
#include "Limonp/logger.hpp"
|
||||
#include "Limonp/InitOnOff.hpp"
|
||||
#include "TransCode.hpp"
|
||||
#include "DictTrie.hpp"
|
||||
#include "Trie.hpp"
|
||||
|
||||
|
||||
|
||||
@ -32,20 +32,30 @@ namespace CppJieba
|
||||
double logFreq; //logFreq = log(freq/sum(freq));
|
||||
};
|
||||
|
||||
// Writes a one-line, space-separated dump of a dictionary entry to `os`:
//   <word> <freq> <tag> <logFreq>
// The word is first rendered into a std::string through the `string << Unicode`
// overload (declared outside this view) so it can be passed to the
// printf-style string_format helper.
// Returns `os` so calls can be chained.
inline ostream & operator << (ostream& os, const DictUnit& unit)
{
    string s;
    s << unit.word;
    // logFreq is a double, so the conversion must be "%lf"/"%f". The previous
    // "%llf" pairs an integer-only length modifier with 'f', which printf
    // does not define (some libcs then read a long double and print garbage).
    // NOTE(review): "%u" assumes unit.freq is unsigned-int sized -- confirm
    // against the DictUnit declaration.
    return os << string_format("%s %u %s %lf", s.c_str(), unit.freq, unit.tag.c_str(), unit.logFreq);
}
|
||||
|
||||
typedef map<size_t, const DictUnit*> DagType;
|
||||
|
||||
class DictTrie: InitOnOff
|
||||
class DictTrie: public InitOnOff
|
||||
{
|
||||
|
||||
public:
|
||||
typedef Trie<Unicode::value_type, DictUnit> TrieType;
|
||||
private:
|
||||
vector<DictUnit> _nodeInfos;
|
||||
TrieType * _trie;
|
||||
|
||||
int64_t _freqSum;
|
||||
size_t _freqSum;
|
||||
double _minLogFreq;
|
||||
|
||||
public:
|
||||
DictTrie()
|
||||
{
|
||||
_trie = NULL;
|
||||
_freqSum = 0;
|
||||
_minLogFreq = MAX_DOUBLE;
|
||||
_setInitFlag(false);
|
||||
@ -57,6 +67,10 @@ namespace CppJieba
|
||||
}
|
||||
// Frees the trie owned by this object.
~DictTrie()
{
    // _trie is NULL until a trie has been created; deleting NULL is a no-op,
    // so no explicit guard is required.
    delete _trie;
}
|
||||
private:
|
||||
|
||||
@ -69,14 +83,40 @@ namespace CppJieba
|
||||
_freqSum = _calculateFreqSum(_nodeInfos);
|
||||
assert(_freqSum);
|
||||
_minLogFreq = _calculateLogFreqAndGetMinValue(_nodeInfos, _freqSum);
|
||||
return _setInitFlag(true);
|
||||
_trie = _creatTrie(_nodeInfos);
|
||||
return _setInitFlag(_trie);
|
||||
}
|
||||
|
||||
public:
|
||||
// Exact lookup of the key [begin, end) in the underlying trie.
// Returns the result of _trie->find for that range, or NULL when no trie
// exists (object default-constructed, or _creatTrie produced NULL).
const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
    // Guard against use before a trie has been built: _trie starts as NULL.
    if(NULL == _trie)
    {
        return NULL;
    }
    return _trie->find(begin, end);
}
|
||||
// Collects into `dag` the dictionary entries found by _trie->find for the
// range [begin, end); `offset` is forwarded unchanged to the trie.
// Returns the trie's result, or false (with `dag` emptied) when no trie
// exists (object default-constructed, or _creatTrie produced NULL).
bool find(Unicode::const_iterator begin, Unicode::const_iterator end, DagType& dag, size_t offset = 0) const
{
    // Guard against use before a trie has been built: _trie starts as NULL.
    if(NULL == _trie)
    {
        dag.clear();
        return false;
    }
    return _trie->find(begin, end, dag, offset);
}
|
||||
|
||||
public:
|
||||
// Accessor for _minLogFreq (MAX_DOUBLE until it has been computed).
double getMinLogFreq() const
{
    return _minLogFreq;
}
|
||||
|
||||
private:
|
||||
// Builds a heap-allocated trie that indexes `dictUnits` by their `word`.
//
// The trie is handed the ADDRESS of every element of `dictUnits`, so the
// vector must outlive the returned trie and must not be reallocated while
// the trie is in use.
//
// Returns a new TrieType the caller must delete, or NULL when `dictUnits`
// is empty.
TrieType * _creatTrie(const vector<DictUnit>& dictUnits)
{
    if(dictUnits.empty())
    {
        return NULL;
    }

    vector<Unicode> words;
    vector<const DictUnit*> valuePointers;
    // Final sizes are known up front; reserve once instead of regrowing.
    words.reserve(dictUnits.size());
    valuePointers.reserve(dictUnits.size());

    for(size_t i = 0; i < dictUnits.size(); i++)
    {
        words.push_back(dictUnits[i].word);
        valuePointers.push_back(&dictUnits[i]);
    }

    return new TrieType(words, valuePointers);
}
|
||||
void _loadDict(const string& filePath, vector<DictUnit>& nodeInfos) const
|
||||
{
|
||||
ifstream ifs(filePath.c_str());
|
||||
|
@ -64,7 +64,7 @@ namespace CppJieba
|
||||
if (_dictTrie.find(uItr, end, tRes, 0))
|
||||
{
|
||||
for(DagType::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||
//for (vector<pair<size_t, const TrieNodeInfo*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||
//for (vector<pair<size_t, const DictUnit*> >::const_iterator itr = tRes.begin(); itr != tRes.end(); itr++)
|
||||
{
|
||||
wordLen = itr->second->word.size();
|
||||
if (wordLen >= 2 || (tRes.size() == 1 && maxIdx <= uIdx))
|
||||
|
@ -21,7 +21,7 @@ namespace CppJieba
|
||||
{
|
||||
uint16_t uniCh;
|
||||
DagType dag;
|
||||
const TrieNodeInfo * pInfo;
|
||||
const DictUnit * pInfo;
|
||||
double weight;
|
||||
|
||||
SegmentChar():uniCh(0), pInfo(NULL), weight(0.0)
|
||||
@ -142,7 +142,7 @@ namespace CppJieba
|
||||
}
|
||||
|
||||
size_t nextPos;
|
||||
const TrieNodeInfo* p;
|
||||
const DictUnit* p;
|
||||
double val;
|
||||
|
||||
for(int i = segContext.size() - 1; i >= 0; i--)
|
||||
@ -182,7 +182,7 @@ namespace CppJieba
|
||||
size_t i = 0;
|
||||
while(i < segContext.size())
|
||||
{
|
||||
const TrieNodeInfo* p = segContext[i].pInfo;
|
||||
const DictUnit* p = segContext[i].pInfo;
|
||||
if(p)
|
||||
{
|
||||
res.push_back(p->word);
|
||||
|
@ -42,7 +42,7 @@ namespace CppJieba
|
||||
return false;
|
||||
}
|
||||
|
||||
const TrieNodeInfo *tmp = NULL;
|
||||
const DictUnit *tmp = NULL;
|
||||
Unicode unico;
|
||||
for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
|
||||
{
|
||||
|
78
src/Trie.hpp
78
src/Trie.hpp
@ -6,26 +6,28 @@
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
using namespace std;
|
||||
// A single node of the trie.
//   ptKeyMap - children keyed by the next key element; NULL while the node
//              has no children.
//   ptValue  - value attached to the key that ends at this node, or NULL if
//              no key ends here. The node only stores the pointer.
template <class KeyType, class ValueType>
class TrieNode
{
    public:
        typedef unordered_map<KeyType, TrieNode<KeyType, ValueType>* > KeyMapType;
    public:
        KeyMapType * ptKeyMap;
        const ValueType * ptValue;
};
|
||||
|
||||
template <class KeyType, class ValueType>
|
||||
class Trie
|
||||
{
|
||||
private:
|
||||
TrieNode* _root;
|
||||
private:
|
||||
public:
|
||||
Trie(const vector<KeyType>& keys, const vector<ValueType* >& valuePointers)
|
||||
typedef TrieNode<KeyType, ValueType> TrieNodeType;
|
||||
private:
|
||||
TrieNodeType* _root;
|
||||
public:
|
||||
Trie(const vector<vector<KeyType> >& keys, const vector<const ValueType* >& valuePointers)
|
||||
{
|
||||
_root = new TrieNode;
|
||||
_root = new TrieNodeType;
|
||||
_root->ptKeyMap = NULL;
|
||||
_root->ptValue = NULL;
|
||||
|
||||
@ -33,28 +35,33 @@ namespace CppJieba
|
||||
}
|
||||
// Destroys every node reachable from the root.
~Trie()
{
    // _deleteNode returns immediately for a NULL node, so _root can be
    // passed without checking it first.
    _deleteNode(_root);
}
|
||||
public:
|
||||
// Exact-match lookup: walks the trie along the key elements in [begin, end).
// Returns the value pointer stored at the node reached by the whole key,
// or NULL if the path does not exist or no value is attached there.
// An empty range yields the root's value pointer.
const ValueType* find(typename vector<KeyType>::const_iterator begin, typename vector<KeyType>::const_iterator end) const
{
    const TrieNodeType* ptNode = _root;
    typename TrieNodeType::KeyMapType::const_iterator citer;
    for(typename vector<KeyType>::const_iterator it = begin; it != end; ++it)
    {
        // A node's child map is only allocated when its first child is
        // inserted, so leaves (and the root of an empty trie) have a NULL
        // ptKeyMap. Looking up a key longer than any stored path must stop
        // here instead of dereferencing it.
        if(NULL == ptNode->ptKeyMap)
        {
            return NULL;
        }
        citer = ptNode->ptKeyMap->find(*it);
        if(ptNode->ptKeyMap->end() == citer)
        {
            return NULL;
        }
        ptNode = citer->second;
    }
    return ptNode->ptValue;
}
|
||||
bool find(KeyType::const_iterator begin, KeyType::const_iterator end, map<KeyType::size_type, const ValueType* >& ordererMap) const
|
||||
bool find(typename vector<KeyType>::const_iterator begin, typename vector<KeyType> ::const_iterator end, map<typename vector<KeyType>::size_type, const ValueType* >& ordererMap, size_t offset = 0) const
|
||||
{
|
||||
const TrieNode * ptNode = _root;
|
||||
TrieNodeMapType::const_iterator citer;
|
||||
for(KeyType::const_iterator itr = begin; itr != end ; itr++)
|
||||
const TrieNodeType * ptNode = _root;
|
||||
typename TrieNodeType::KeyMapType::const_iterator citer;
|
||||
ordererMap.clear();
|
||||
for(typename vector<KeyType>::const_iterator itr = begin; itr != end ; itr++)
|
||||
{
|
||||
citer = ptNode->ptKeyMap->find(*itr);
|
||||
if(ptNode->ptKeyMap->end() == citer)
|
||||
@ -64,46 +71,47 @@ namespace CppJieba
|
||||
ptNode = citer->second;
|
||||
if(ptNode->ptValue)
|
||||
{
|
||||
ordererMap[itr - begin] = ptNode->ptValue;
|
||||
ordererMap[itr - begin + offset] = ptNode->ptValue;
|
||||
}
|
||||
}
|
||||
return ordererMap.size();
|
||||
}
|
||||
private:
|
||||
void _createTrie(const vector<KeyType>& keys, const vector<ValueType*>& valuePointers)
|
||||
void _createTrie(const vector<vector<KeyType> >& keys, const vector<const ValueType*>& valuePointers)
|
||||
{
|
||||
if(values.empty() || keys.empty())
|
||||
if(valuePointers.empty() || keys.empty())
|
||||
{
|
||||
return;
|
||||
}
|
||||
assert(keys.size() == valuePointers.size());
|
||||
|
||||
|
||||
for(size_t i = 0; i < keys.size(); i++)
|
||||
{
|
||||
_insertNode(keys[i], valuePointers[i]);
|
||||
}
|
||||
}
|
||||
private:
|
||||
void _insertNode(const KeyType& key, const Value* ptValue)
|
||||
void _insertNode(const vector<KeyType>& key, const ValueType* ptValue)
|
||||
{
|
||||
TrieNode* ptNode = _root;
|
||||
TrieNodeType* ptNode = _root;
|
||||
|
||||
TrieNode::KeyMapType::const_iterator kmIter;
|
||||
typename TrieNodeType::KeyMapType::const_iterator kmIter;
|
||||
|
||||
for(KeyType::const_iterator citer = key.begin(); citer != key.end(); citer++)
|
||||
for(typename vector<KeyType>::const_iterator citer = key.begin(); citer != key.end(); citer++)
|
||||
{
|
||||
if(NULL == ptNode->ptKeyMap)
|
||||
{
|
||||
ptNode->ptKeyMap = new TrieNode::KeyMapType;
|
||||
ptNode->ptKeyMap = new typename TrieNodeType::KeyMapType;
|
||||
}
|
||||
kmIter = ptNode->ptKeyMap->find(*citer);
|
||||
if(ptNode->ptKeyMap->end() == kmIter)
|
||||
{
|
||||
TrieNode * nextNode = new TrieNode;
|
||||
TrieNodeType * nextNode = new TrieNodeType;
|
||||
nextNode->ptKeyMap = NULL;
|
||||
nextNode->ptValue = NULL;
|
||||
|
||||
ptNode->ptKeyMap[*citer] = nextNode;
|
||||
ptNode = next;
|
||||
(*ptNode->ptKeyMap)[*citer] = nextNode;
|
||||
ptNode = nextNode;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -112,20 +120,24 @@ namespace CppJieba
|
||||
}
|
||||
ptNode->ptValue = ptValue;
|
||||
}
|
||||
void _deleteNode(TrieNode* node)
|
||||
void _deleteNode(TrieNodeType* node)
|
||||
{
|
||||
if(!node)
|
||||
{
|
||||
return;
|
||||
}
|
||||
for(TrieNodeMapType::iterator it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++)
|
||||
if(node->ptKeyMap)
|
||||
{
|
||||
_deleteNode(it->second);
|
||||
typename TrieNodeType::KeyMapType::iterator it;
|
||||
for(it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++)
|
||||
{
|
||||
_deleteNode(it->second);
|
||||
}
|
||||
delete node->ptKeyMap;
|
||||
}
|
||||
delete node->ptKeyMap;
|
||||
delete node;
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -1,30 +1,30 @@
|
||||
#include "src/Trie.hpp"
|
||||
#include "src/DictTrie.hpp"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace CppJieba;
|
||||
|
||||
static const char* const DICT_FILE = "../dict/extra_dict/jieba.dict.small.utf8";
|
||||
|
||||
// Smoke test: a DictTrie can be constructed and destroyed both from a
// dictionary path and via the default constructor.
TEST(DictTrieTest, NewAndDelete)
{
    DictTrie * trie;
    trie = new DictTrie(DICT_FILE);
    delete trie;
    trie = new DictTrie();
    delete trie;
}
|
||||
|
||||
TEST(TrieTest, Test1)
|
||||
TEST(DictTrieTest, Test1)
|
||||
{
|
||||
|
||||
string s1, s2;
|
||||
Trie trie;
|
||||
DictTrie trie;
|
||||
ASSERT_TRUE(trie.init(DICT_FILE));
|
||||
ASSERT_LT(trie.getMinLogFreq() + 15.6479, 0.001);
|
||||
string word("来到");
|
||||
Unicode uni;
|
||||
ASSERT_TRUE(TransCode::decode(word, uni));
|
||||
TrieNodeInfo nodeInfo;
|
||||
DictUnit nodeInfo;
|
||||
nodeInfo.word = uni;
|
||||
nodeInfo.freq = 8779;
|
||||
nodeInfo.tag = "v";
|
||||
@ -34,9 +34,9 @@ TEST(TrieTest, Test1)
|
||||
|
||||
EXPECT_EQ("[\"26469\", \"21040\"]:8779:v:-8.87033", s2);
|
||||
word = "清华大学";
|
||||
vector<pair<size_t, const TrieNodeInfo*> > res;
|
||||
map<size_t, const TrieNodeInfo* > resMap;
|
||||
map<size_t, const TrieNodeInfo* > mp;
|
||||
vector<pair<size_t, const DictUnit*> > res;
|
||||
map<size_t, const DictUnit* > resMap;
|
||||
map<size_t, const DictUnit* > mp;
|
||||
const char * words[] = {"清", "清华", "清华大学"};
|
||||
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++)
|
||||
{
|
||||
@ -44,10 +44,10 @@ TEST(TrieTest, Test1)
|
||||
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
|
||||
resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
|
||||
}
|
||||
//TrieNodeInfo
|
||||
//DictUnit
|
||||
//res.push_back(make_pair(0, ))
|
||||
|
||||
vector<pair<size_t, const TrieNodeInfo*> > vec;
|
||||
vector<pair<size_t, const DictUnit*> > vec;
|
||||
ASSERT_TRUE(TransCode::decode(word, uni));
|
||||
//print(uni);
|
||||
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0));
|
||||
|
Loading…
x
Reference in New Issue
Block a user