mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
use automation
This commit is contained in:
parent
b9736ee132
commit
3ced451212
@ -23,24 +23,6 @@ namespace CppJieba
|
||||
const size_t DICT_COLUMN_NUM = 3;
|
||||
const char* const UNKNOWN_TAG = "x";
|
||||
|
||||
|
||||
|
||||
struct DictUnit
|
||||
{
|
||||
Unicode word;
|
||||
double weight;
|
||||
string tag;
|
||||
};
|
||||
|
||||
inline ostream & operator << (ostream& os, const DictUnit& unit)
|
||||
{
|
||||
string s;
|
||||
s << unit.word;
|
||||
return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
|
||||
}
|
||||
|
||||
typedef std::vector<std::pair<size_t, const DictUnit*> > DagType;
|
||||
|
||||
class DictTrie
|
||||
{
|
||||
public:
|
||||
@ -107,6 +89,14 @@ namespace CppJieba
|
||||
{
|
||||
return _trie->find(begin, end, dag, offset);
|
||||
}
|
||||
// Whole-range lookup: hands [begin, end) and the output vector straight to
// the underlying trie, which fills `res`. Nothing is returned.
void find(Unicode::const_iterator begin, Unicode::const_iterator end, vector<SegmentChar>& res) const
{
    _trie->find(begin, end, res);
}
|
||||
|
||||
|
||||
private:
|
||||
|
@ -1,7 +1,3 @@
|
||||
/************************************
|
||||
* file enc : ASCII
|
||||
* author : wuyanyi09@gmail.com
|
||||
************************************/
|
||||
#ifndef CPPJIEBA_MPSEGMENT_H
|
||||
#define CPPJIEBA_MPSEGMENT_H
|
||||
|
||||
@ -17,17 +13,6 @@
|
||||
namespace CppJieba
|
||||
{
|
||||
|
||||
struct SegmentChar
|
||||
{
|
||||
uint16_t uniCh;
|
||||
DagType dag;
|
||||
const DictUnit * pInfo;
|
||||
double weight;
|
||||
size_t nextPos;
|
||||
SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0)
|
||||
{}
|
||||
};
|
||||
|
||||
class MPSegment: public SegmentBase
|
||||
{
|
||||
private:
|
||||
@ -85,16 +70,9 @@ namespace CppJieba
|
||||
{
|
||||
return false;
|
||||
}
|
||||
vector<SegmentChar> segmentChars(end - begin);
|
||||
vector<SegmentChar> segmentChars;
|
||||
|
||||
//calc DAG
|
||||
for(size_t i = 0; i < segmentChars.size(); i ++)
|
||||
{
|
||||
segmentChars[i].uniCh = *(begin + i);
|
||||
segmentChars[i].dag.clear();
|
||||
segmentChars[i].dag.push_back(std::pair<size_t, const DictUnit*>(i, NULL));
|
||||
_dictTrie.find(begin + i, end, segmentChars[i].dag, i);
|
||||
}
|
||||
_dictTrie.find(begin, end, segmentChars);
|
||||
|
||||
_calcDP(segmentChars);
|
||||
|
||||
@ -112,24 +90,25 @@ namespace CppJieba
|
||||
}
|
||||
|
||||
private:
|
||||
void _calcDP(vector<SegmentChar>& SegmentChars) const
|
||||
void _calcDP(vector<SegmentChar>& segmentChars) const
|
||||
{
|
||||
size_t nextPos;
|
||||
const DictUnit* p;
|
||||
double val;
|
||||
|
||||
for(int i = SegmentChars.size() - 1; i >= 0; i--)
|
||||
for(ssize_t i = segmentChars.size() - 1; i >= 0; i--)
|
||||
{
|
||||
SegmentChars[i].pInfo = NULL;
|
||||
SegmentChars[i].weight = MIN_DOUBLE;
|
||||
for(DagType::const_iterator it = SegmentChars[i].dag.begin(); it != SegmentChars[i].dag.end(); it++)
|
||||
segmentChars[i].pInfo = NULL;
|
||||
segmentChars[i].weight = MIN_DOUBLE;
|
||||
assert(!segmentChars[i].dag.empty());
|
||||
for(DagType::const_iterator it = segmentChars[i].dag.begin(); it != segmentChars[i].dag.end(); it++)
|
||||
{
|
||||
nextPos = it->first;
|
||||
p = it->second;
|
||||
val = 0.0;
|
||||
if(nextPos + 1 < SegmentChars.size())
|
||||
if(nextPos + 1 < segmentChars.size())
|
||||
{
|
||||
val += SegmentChars[nextPos + 1].weight;
|
||||
val += segmentChars[nextPos + 1].weight;
|
||||
}
|
||||
|
||||
if(p)
|
||||
@ -140,10 +119,10 @@ namespace CppJieba
|
||||
{
|
||||
val += _dictTrie.getMinWeight();
|
||||
}
|
||||
if(val > SegmentChars[i].weight)
|
||||
if(val > segmentChars[i].weight)
|
||||
{
|
||||
SegmentChars[i].pInfo = p;
|
||||
SegmentChars[i].weight = val;
|
||||
segmentChars[i].pInfo = p;
|
||||
segmentChars[i].weight = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
150
src/Trie.hpp
150
src/Trie.hpp
@ -3,18 +3,51 @@
|
||||
|
||||
#include "Limonp/StdExtension.hpp"
|
||||
#include <vector>
|
||||
#include <queue>
|
||||
|
||||
namespace CppJieba
|
||||
{
|
||||
using namespace std;
|
||||
|
||||
struct DictUnit
|
||||
{
|
||||
Unicode word;
|
||||
double weight;
|
||||
string tag;
|
||||
};
|
||||
|
||||
// Streams a DictUnit as "<word> <tag> <weight>", with the weight formatted
// by "%.3lf". The word is first rendered into a temporary string through the
// string << Unicode operator available in this codebase.
inline ostream & operator << (ostream& os, const DictUnit& unit)
{
    string wordText;
    wordText << unit.word;
    os << string_format("%s %s %.3lf", wordText.c_str(), unit.tag.c_str(), unit.weight);
    return os;
}
|
||||
|
||||
typedef std::vector<std::pair<size_t, const DictUnit*> > DagType;
|
||||
|
||||
struct SegmentChar
|
||||
{
|
||||
uint16_t uniCh;
|
||||
DagType dag;
|
||||
const DictUnit * pInfo;
|
||||
double weight;
|
||||
size_t nextPos;
|
||||
SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0)
|
||||
{}
|
||||
};
|
||||
|
||||
template <class KeyType, class ValueType>
|
||||
class TrieNode
|
||||
{
|
||||
public:
|
||||
typedef unordered_map<KeyType, TrieNode<KeyType, ValueType>* > KeyMapType;
|
||||
typedef unordered_map<KeyType, TrieNode<KeyType, ValueType>* > NextMap;
|
||||
public:
|
||||
KeyMapType * ptKeyMap;
|
||||
TrieNode * fail;
|
||||
NextMap * next;
|
||||
const ValueType * ptValue;
|
||||
public:
|
||||
TrieNode(): fail(NULL), next(NULL), ptValue(NULL) {
|
||||
}
|
||||
};
|
||||
|
||||
template <class KeyType, class ValueType, class KeyContainerType = vector<KeyType>, class KeysContainerType = vector<KeyContainerType>, class ValueContainerType = vector<const ValueType* > >
|
||||
@ -28,10 +61,8 @@ namespace CppJieba
|
||||
Trie(const KeysContainerType& keys, const ValueContainerType& valuePointers)
|
||||
{
|
||||
_root = new TrieNodeType;
|
||||
_root->ptKeyMap = NULL;
|
||||
_root->ptValue = NULL;
|
||||
|
||||
_createTrie(keys, valuePointers);
|
||||
_build();// build automation
|
||||
}
|
||||
~Trie()
|
||||
{
|
||||
@ -43,12 +74,12 @@ namespace CppJieba
|
||||
public:
|
||||
const ValueType* find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end) const
|
||||
{
|
||||
typename TrieNodeType::KeyMapType::const_iterator citer;
|
||||
typename TrieNodeType::NextMap::const_iterator citer;
|
||||
const TrieNodeType* ptNode = _root;
|
||||
for(typename KeyContainerType::const_iterator it = begin; it != end; it++)
|
||||
{
|
||||
{// build automation
|
||||
assert(ptNode);
|
||||
if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*it)))
|
||||
if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it)))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
@ -56,6 +87,47 @@ namespace CppJieba
|
||||
}
|
||||
return ptNode->ptValue;
|
||||
}
|
||||
// Scan [begin, end) once with the failure-link automaton and record, for
// every position, the stored entries the automaton reports as starting there.
//
// On return `res` has exactly (end - begin) elements and, for each index i:
//   - res[i].uniCh is the i-th key of the input;
//   - res[i].dag holds a (lastIndex, value) pair for each reported entry that
//     starts at i and whose last key sits at lastIndex;
//   - if no entry of length 1 was reported at i, res[i].dag additionally
//     holds (i, NULL), so every dag ends up non-empty.
//
// Preconditions:
//   - `res` is resized, not cleared, so any element that survives the resize
//     must arrive with an empty dag (asserted below);
//   - _build() must already have set the fail pointers;
//   - ValueType must expose a `word` member with size().
void find(
    typename KeyContainerType::const_iterator begin,
    typename KeyContainerType::const_iterator end,
    vector<struct SegmentChar>& res
    ) const
{
    typedef pair<typename KeysContainerType::size_type, const ValueType* > DagEntry;

    // Computed once and as an unsigned value, so the loop bound is not a
    // signed/unsigned comparison re-evaluated on every iteration.
    const size_t len = static_cast<size_t>(end - begin);
    res.resize(len);

    const TrieNodeType * now = _root;
    typename TrieNodeType::NextMap::const_iterator iter;
    for (size_t i = 0; i < len; i++) {
        res[i].uniCh = *(begin + i);
        assert(res[i].dag.empty());
        res[i].dag.reserve(4);//TODO

        // Roll back along failure links until some state has an outgoing
        // edge for the current key, or give up at the root. Each state is
        // probed exactly once; `iter` is only meaningful when matched is true.
        bool matched = false;
        for (;;) {
            if (now->next != NULL) {
                iter = now->next->find(*(begin + i));
                if (iter != now->next->end()) {
                    matched = true;
                    break;
                }
            }
            if (now == _root) {
                break;
            }
            now = now->fail;
        }

        bool hasSingle = false; // an entry of length 1 was reported at position i
        if (matched) {
            now = iter->second;
            // Every state on the failure chain that carries a value is an
            // entry ending at i; file it under the position where it starts.
            for (const TrieNodeType * temp = now; temp != _root; ) {
                if (temp->ptValue) {
                    const size_t wordLen = temp->ptValue->word.size();
                    res[i - wordLen + 1].dag.push_back(DagEntry(i, temp->ptValue));
                    if (wordLen == 1) {
                        hasSingle = true;
                    }
                }
                temp = temp->fail;
                assert(temp);
            }
        }
        // Guarantee a fallback edge covering just this key.
        if (!hasSingle) {
            res[i].dag.push_back(DagEntry(i, NULL));
        }
    }
}
|
||||
bool find(
|
||||
typename KeyContainerType::const_iterator begin,
|
||||
typename KeyContainerType::const_iterator end,
|
||||
@ -63,11 +135,11 @@ namespace CppJieba
|
||||
size_t offset = 0) const
|
||||
{
|
||||
const TrieNodeType * ptNode = _root;
|
||||
typename TrieNodeType::KeyMapType::const_iterator citer;
|
||||
typename TrieNodeType::NextMap::const_iterator citer;
|
||||
for(typename KeyContainerType::const_iterator itr = begin; itr != end ; itr++)
|
||||
{
|
||||
assert(ptNode);
|
||||
if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*itr)))
|
||||
if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr)))
|
||||
{
|
||||
break;
|
||||
}
|
||||
@ -86,6 +158,42 @@ namespace CppJieba
|
||||
}
|
||||
return !res.empty();
|
||||
}
|
||||
private:
|
||||
void _build()
|
||||
{
|
||||
queue<TrieNodeType*> que;
|
||||
assert(_root->ptValue == NULL);
|
||||
assert(_root->next);
|
||||
_root->fail = NULL;
|
||||
for(typename TrieNodeType::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) {
|
||||
iter->second->fail = _root;
|
||||
que.push(iter->second);
|
||||
}
|
||||
TrieNodeType* back = NULL;
|
||||
typename TrieNodeType::NextMap::iterator backiter;
|
||||
while(!que.empty()) {
|
||||
TrieNodeType * now = que.front();
|
||||
que.pop();
|
||||
if(now->next == NULL) {
|
||||
continue;
|
||||
}
|
||||
for(typename TrieNodeType::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) {
|
||||
back = now->fail;
|
||||
while(back != NULL) {
|
||||
if(back->next && (backiter = back->next->find(iter->first)) != back->next->end())
|
||||
{
|
||||
iter->second->fail = backiter->second;
|
||||
break;
|
||||
}
|
||||
back = back->fail;
|
||||
}
|
||||
if(back == NULL) {
|
||||
iter->second->fail = _root;
|
||||
}
|
||||
que.push(iter->second);
|
||||
}
|
||||
}
|
||||
}
|
||||
private:
|
||||
void _createTrie(const KeysContainerType& keys, const ValueContainerType& valuePointers)
|
||||
{
|
||||
@ -105,22 +213,22 @@ namespace CppJieba
|
||||
{
|
||||
TrieNodeType* ptNode = _root;
|
||||
|
||||
typename TrieNodeType::KeyMapType::const_iterator kmIter;
|
||||
typename TrieNodeType::NextMap::const_iterator kmIter;
|
||||
|
||||
for(typename KeyContainerType::const_iterator citer = key.begin(); citer != key.end(); citer++)
|
||||
{
|
||||
if(NULL == ptNode->ptKeyMap)
|
||||
if(NULL == ptNode->next)
|
||||
{
|
||||
ptNode->ptKeyMap = new typename TrieNodeType::KeyMapType;
|
||||
ptNode->next = new typename TrieNodeType::NextMap;
|
||||
}
|
||||
kmIter = ptNode->ptKeyMap->find(*citer);
|
||||
if(ptNode->ptKeyMap->end() == kmIter)
|
||||
kmIter = ptNode->next->find(*citer);
|
||||
if(ptNode->next->end() == kmIter)
|
||||
{
|
||||
TrieNodeType * nextNode = new TrieNodeType;
|
||||
nextNode->ptKeyMap = NULL;
|
||||
nextNode->next = NULL;
|
||||
nextNode->ptValue = NULL;
|
||||
|
||||
(*ptNode->ptKeyMap)[*citer] = nextNode;
|
||||
(*ptNode->next)[*citer] = nextNode;
|
||||
ptNode = nextNode;
|
||||
}
|
||||
else
|
||||
@ -136,14 +244,14 @@ namespace CppJieba
|
||||
{
|
||||
return;
|
||||
}
|
||||
if(node->ptKeyMap)
|
||||
if(node->next)
|
||||
{
|
||||
typename TrieNodeType::KeyMapType::iterator it;
|
||||
for(it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++)
|
||||
typename TrieNodeType::NextMap::iterator it;
|
||||
for(it = node->next->begin(); it != node->next->end(); it++)
|
||||
{
|
||||
_deleteNode(it->second);
|
||||
}
|
||||
delete node->ptKeyMap;
|
||||
delete node->next;
|
||||
}
|
||||
delete node;
|
||||
}
|
||||
|
@ -21,7 +21,6 @@ TEST(MixSegmentTest, Test1)
|
||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||
ASSERT_TRUE(segment.cut(str2, words));
|
||||
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
|
||||
//exit(0);
|
||||
}
|
||||
|
||||
TEST(MixSegmentTest, NoUserDict)
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include "src/DictTrie.hpp"
|
||||
#include "src/MPSegment.hpp"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace CppJieba;
|
||||
@ -64,3 +65,14 @@ TEST(DictTrieTest, UserDict)
|
||||
res << *unit;
|
||||
ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] x -2.975", res);
|
||||
}
|
||||
|
||||
// Runs the whole-range automaton lookup over a short ASCII string and checks
// the structural guarantees of the result: one record per input character,
// each carrying that character and a non-empty dag whose entries never end
// before the position they are filed under.
TEST(DictTrieTest, automation)
{
    DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
    //string word = "yasherhs";
    string word = "abcderf";
    Unicode unicode;
    ASSERT_TRUE(TransCode::decode(word, unicode));
    vector<struct SegmentChar> res;
    trie.find(unicode.begin(), unicode.end(), res);

    ASSERT_EQ(unicode.size(), res.size());
    for(size_t i = 0; i < res.size(); i++)
    {
        ASSERT_EQ(unicode[i], res[i].uniCh);
        ASSERT_FALSE(res[i].dag.empty());
        for(size_t j = 0; j < res[i].dag.size(); j++)
        {
            ASSERT_GE(res[i].dag[j].first, i);
            ASSERT_LT(res[i].dag[j].first, res.size());
        }
    }
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user