This commit is contained in:
aholic 2014-11-14 13:20:04 +08:00
commit 7791290473
7 changed files with 309 additions and 174 deletions

View File

@ -1,5 +1,11 @@
# CppJieba ChangeLog # CppJieba ChangeLog
## v2.4.4 (is coming)
1. 修改两条更细粒度的特殊过滤规则,将连续的数字(包括浮点数)和连续的字母单独切分出来(而不会混在一起)。
2. 修改最大概率法时动态规划过程需要使用的 DAG 数据结构(同时也修改 Trie 的 DAG 查询函数),提高分词速度 8% 。
3. 使用 Aho-Corasick 自动机(Aho-Corasick automaton)算法加速 Trie 的查找过程,提升性能。
## v2.4.3 ## v2.4.3
1. 更新 [Husky] 服务代码,新 [Husky] 为基于线程池的服务器简易框架。并且修复当 HTTP POST 请求时 body 过长数据可能丢失的问题。 1. 更新 [Husky] 服务代码,新 [Husky] 为基于线程池的服务器简易框架。并且修复当 HTTP POST 请求时 body 过长数据可能丢失的问题。

View File

@ -23,31 +23,11 @@ namespace CppJieba
const size_t DICT_COLUMN_NUM = 3; const size_t DICT_COLUMN_NUM = 3;
const char* const UNKNOWN_TAG = "x"; const char* const UNKNOWN_TAG = "x";
struct DictUnit
{
Unicode word;
double weight;
string tag;
};
inline ostream & operator << (ostream& os, const DictUnit& unit)
{
string s;
s << unit.word;
return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
}
typedef map<size_t, const DictUnit*> DagType;
class DictTrie class DictTrie
{ {
public:
typedef Trie<Unicode::value_type, DictUnit, Unicode, vector<Unicode>, vector<const DictUnit*> > TrieType;
private: private:
vector<DictUnit> _nodeInfos; vector<DictUnit> _nodeInfos;
TrieType * _trie; Trie * _trie;
double _minWeight; double _minWeight;
private: private:
@ -107,10 +87,18 @@ namespace CppJieba
{ {
return _trie->find(begin, end, dag, offset); return _trie->find(begin, end, dag, offset);
} }
// Forwards the character range [begin, end) and the output vector `res`
// unchanged to the SegmentChar overload of find() on the underlying trie.
void find(Unicode::const_iterator begin, Unicode::const_iterator end, vector<SegmentChar>& res) const
{
    (*_trie).find(begin, end, res);
}
private: private:
TrieType * _createTrie(const vector<DictUnit>& dictUnits) Trie * _createTrie(const vector<DictUnit>& dictUnits)
{ {
assert(dictUnits.size()); assert(dictUnits.size());
vector<Unicode> words; vector<Unicode> words;
@ -121,7 +109,7 @@ namespace CppJieba
valuePointers.push_back(&dictUnits[i]); valuePointers.push_back(&dictUnits[i]);
} }
TrieType * trie = new TrieType(words, valuePointers); Trie * trie = new Trie(words, valuePointers);
return trie; return trie;
} }
void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag) void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag)

View File

@ -1,7 +1,3 @@
/************************************
* file enc : ASCII
* author : wuyanyi09@gmail.com
************************************/
#ifndef CPPJIEBA_MPSEGMENT_H #ifndef CPPJIEBA_MPSEGMENT_H
#define CPPJIEBA_MPSEGMENT_H #define CPPJIEBA_MPSEGMENT_H
@ -17,17 +13,6 @@
namespace CppJieba namespace CppJieba
{ {
struct SegmentChar
{
uint16_t uniCh;
DagType dag;
const DictUnit * pInfo;
double weight;
size_t nextPos;
SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0)
{}
};
class MPSegment: public SegmentBase class MPSegment: public SegmentBase
{ {
private: private:
@ -85,24 +70,13 @@ namespace CppJieba
{ {
return false; return false;
} }
vector<SegmentChar> segmentChars(end - begin); vector<SegmentChar> segmentChars;
//calc DAG _dictTrie.find(begin, end, segmentChars);
for(size_t i = 0; i < segmentChars.size(); i ++)
{
segmentChars[i].uniCh = *(begin + i);
segmentChars[i].dag.clear();
_dictTrie.find(begin + i, end, segmentChars[i].dag, i);
segmentChars[i].dag.insert(pair<DagType::key_type, DagType::mapped_type>(i, NULL));
}
_calcDP(segmentChars); _calcDP(segmentChars);
if(!_cut(segmentChars, res)) _cut(segmentChars, res);
{
LogError("_cut failed.");
return false;
}
return true; return true;
} }
@ -112,24 +86,25 @@ namespace CppJieba
} }
private: private:
void _calcDP(vector<SegmentChar>& SegmentChars) const void _calcDP(vector<SegmentChar>& segmentChars) const
{ {
size_t nextPos; size_t nextPos;
const DictUnit* p; const DictUnit* p;
double val; double val;
for(int i = SegmentChars.size() - 1; i >= 0; i--) for(ssize_t i = segmentChars.size() - 1; i >= 0; i--)
{ {
SegmentChars[i].pInfo = NULL; segmentChars[i].pInfo = NULL;
SegmentChars[i].weight = MIN_DOUBLE; segmentChars[i].weight = MIN_DOUBLE;
for(DagType::const_iterator it = SegmentChars[i].dag.begin(); it != SegmentChars[i].dag.end(); it++) assert(!segmentChars[i].dag.empty());
for(DagType::const_iterator it = segmentChars[i].dag.begin(); it != segmentChars[i].dag.end(); it++)
{ {
nextPos = it->first; nextPos = it->first;
p = it->second; p = it->second;
val = 0.0; val = 0.0;
if(nextPos + 1 < SegmentChars.size()) if(nextPos + 1 < segmentChars.size())
{ {
val += SegmentChars[nextPos + 1].weight; val += segmentChars[nextPos + 1].weight;
} }
if(p) if(p)
@ -140,15 +115,15 @@ namespace CppJieba
{ {
val += _dictTrie.getMinWeight(); val += _dictTrie.getMinWeight();
} }
if(val > SegmentChars[i].weight) if(val > segmentChars[i].weight)
{ {
SegmentChars[i].pInfo = p; segmentChars[i].pInfo = p;
SegmentChars[i].weight = val; segmentChars[i].weight = val;
} }
} }
} }
} }
bool _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res)const void _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const
{ {
size_t i = 0; size_t i = 0;
while(i < segmentChars.size()) while(i < segmentChars.size())
@ -165,7 +140,6 @@ namespace CppJieba
i++; i++;
} }
} }
return true;
} }

View File

@ -3,141 +3,295 @@
#include "Limonp/StdExtension.hpp" #include "Limonp/StdExtension.hpp"
#include <vector> #include <vector>
#include <queue>
namespace CppJieba namespace CppJieba
{ {
using namespace std; using namespace std;
template <class KeyType, class ValueType>
class TrieNode
{
public:
typedef unordered_map<KeyType, TrieNode<KeyType, ValueType>* > KeyMapType;
public:
KeyMapType * ptKeyMap;
const ValueType * ptValue;
};
template <class KeyType, class ValueType, class KeyContainerType = vector<KeyType>, class KeysContainerType = vector<KeyContainerType>, class ValueContainerType = vector<const ValueType* > > struct DictUnit
class Trie {
{ Unicode word;
public: double weight;
typedef TrieNode<KeyType, ValueType> TrieNodeType; string tag;
private: };
TrieNodeType* _root;
public:
Trie(const KeysContainerType& keys, const ValueContainerType& valuePointers)
{
_root = new TrieNodeType;
_root->ptKeyMap = NULL;
_root->ptValue = NULL;
_createTrie(keys, valuePointers); // for debugging
} inline ostream & operator << (ostream& os, const DictUnit& unit)
~Trie() {
string s;
s << unit.word;
return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
}
typedef LocalVector<std::pair<size_t, const DictUnit*> > DagType;
// Per-character record: Trie::find fills uniCh and dag, and MPSegment::_calcDP
// fills pInfo and weight.
struct SegmentChar
{
    // The character at this position of the input.
    uint16_t uniCh;
    // Pairs of (index, dictionary entry). Trie::find always pushes (own index, NULL)
    // first and then one pair per dictionary word found starting at this position,
    // keyed by the index of that word's last character.
    DagType dag;
    // Dictionary entry selected for this position by _calcDP; NULL when none.
    const DictUnit * pInfo;
    // Best score computed for this position by _calcDP.
    double weight;
    // Zero-initialised; NOTE(review): no reader or writer is visible in this view.
    size_t nextPos;

    SegmentChar()
        : uniCh(0),
          pInfo(NULL),
          weight(0.0),
          nextPos(0)
    {
    }

    ~SegmentChar()
    {
    }
};
typedef Unicode::value_type TrieKey;
class TrieNode
{
public:
typedef unordered_map<TrieKey, TrieNode*> NextMap;
public:
TrieNode * fail;
NextMap * next;
const DictUnit * ptValue;
public:
TrieNode(): fail(NULL), next(NULL), ptValue(NULL)
{}
const TrieNode * findNext(TrieKey key) const
{
if(next == NULL)
{ {
if(_root) return NULL;
}
typename NextMap::const_iterator iter = next->find(key);
if(iter == next->end())
{
return NULL;
}
return iter->second;
}
};
class Trie
{
private:
TrieNode* _root;
public:
Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
{
_root = new TrieNode;
_createTrie(keys, valuePointers);
_build();// build automation
}
~Trie()
{
if(_root)
{
_deleteNode(_root);
}
}
public:
const DictUnit* find(typename Unicode::const_iterator begin, typename Unicode::const_iterator end) const
{
typename TrieNode::NextMap::const_iterator citer;
const TrieNode* ptNode = _root;
for(typename Unicode::const_iterator it = begin; it != end; it++)
{// build automation
assert(ptNode);
if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it)))
{ {
_deleteNode(_root); return NULL;
} }
ptNode = citer->second;
} }
public: return ptNode->ptValue;
const ValueType* find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end) const }
// aho-corasick-automation
void find(
typename Unicode::const_iterator begin,
typename Unicode::const_iterator end,
vector<struct SegmentChar>& res
) const
{
res.resize(end - begin);
const TrieNode * now = _root;
//typename TrieNode::NextMap::const_iterator iter;
const TrieNode* node;
for (size_t i = 0; i < end - begin; i++)
{ {
typename TrieNodeType::KeyMapType::const_iterator citer; Unicode::value_type ch = *(begin + i);
const TrieNodeType* ptNode = _root; res[i].uniCh = ch;
for(typename KeyContainerType::const_iterator it = begin; it != end; it++) assert(res[i].dag.empty());
res[i].dag.push_back(pair<typename vector<Unicode >::size_type, const DictUnit* >(i, NULL));
bool flag = false;
// rollback
while( now != _root )
{ {
assert(ptNode); node = now->findNext(ch);
if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*it))) if (node != NULL)
{
return NULL;
}
ptNode = citer->second;
}
return ptNode->ptValue;
}
bool find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end, map<typename KeyContainerType::size_type, const ValueType* >& ordererMap, size_t offset = 0) const
{
const TrieNodeType * ptNode = _root;
typename TrieNodeType::KeyMapType::const_iterator citer;
ordererMap.clear();
for(typename KeyContainerType::const_iterator itr = begin; itr != end ; itr++)
{
assert(ptNode);
if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*itr)))
{ {
flag = true;
break; break;
} }
ptNode = citer->second; else
if(ptNode->ptValue)
{ {
ordererMap[itr - begin + offset] = ptNode->ptValue; now = now->fail;
} }
} }
return ordererMap.size();
} if(!flag)
private:
void _createTrie(const KeysContainerType& keys, const ValueContainerType& valuePointers)
{
if(valuePointers.empty() || keys.empty())
{ {
return; node = now->findNext(ch);
} }
assert(keys.size() == valuePointers.size()); if(node == NULL)
for(size_t i = 0; i < keys.size(); i++)
{ {
_insertNode(keys[i], valuePointers[i]); now = _root;
} }
} else
private:
void _insertNode(const KeyContainerType& key, const ValueType* ptValue)
{
TrieNodeType* ptNode = _root;
typename TrieNodeType::KeyMapType::const_iterator kmIter;
for(typename KeyContainerType::const_iterator citer = key.begin(); citer != key.end(); citer++)
{ {
if(NULL == ptNode->ptKeyMap) now = node;
const TrieNode * temp = now;
while(temp != _root)
{ {
ptNode->ptKeyMap = new typename TrieNodeType::KeyMapType; if (temp->ptValue)
{
size_t pos = i - temp->ptValue->word.size() + 1;
res[pos].dag.push_back(pair<typename vector<Unicode >::size_type, const DictUnit* >(i, temp->ptValue));
if(pos == i)
{
res[pos].dag[0].second = temp->ptValue;
}
}
temp = temp->fail;
assert(temp);
} }
kmIter = ptNode->ptKeyMap->find(*citer); }
if(ptNode->ptKeyMap->end() == kmIter) }
}
bool find(
typename Unicode::const_iterator begin,
typename Unicode::const_iterator end,
DagType & res,
size_t offset = 0) const
{
const TrieNode * ptNode = _root;
typename TrieNode::NextMap::const_iterator citer;
for(typename Unicode::const_iterator itr = begin; itr != end ; itr++)
{
assert(ptNode);
if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr)))
{
break;
}
ptNode = citer->second;
if(ptNode->ptValue)
{
if(itr == begin && res.size() == 1) // first singleword
{ {
TrieNodeType * nextNode = new TrieNodeType; res[0].second = ptNode->ptValue;
nextNode->ptKeyMap = NULL;
nextNode->ptValue = NULL;
(*ptNode->ptKeyMap)[*citer] = nextNode;
ptNode = nextNode;
} }
else else
{ {
ptNode = kmIter->second; res.push_back(pair<typename vector<Unicode >::size_type, const DictUnit* >(itr - begin + offset, ptNode->ptValue));
} }
} }
ptNode->ptValue = ptValue;
} }
void _deleteNode(TrieNodeType* node) return !res.empty();
}
private:
void _build()
{
queue<TrieNode*> que;
assert(_root->ptValue == NULL);
assert(_root->next);
_root->fail = NULL;
for(typename TrieNode::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) {
iter->second->fail = _root;
que.push(iter->second);
}
TrieNode* back = NULL;
typename TrieNode::NextMap::iterator backiter;
while(!que.empty()) {
TrieNode * now = que.front();
que.pop();
if(now->next == NULL) {
continue;
}
for(typename TrieNode::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) {
back = now->fail;
while(back != NULL) {
if(back->next && (backiter = back->next->find(iter->first)) != back->next->end())
{
iter->second->fail = backiter->second;
break;
}
back = back->fail;
}
if(back == NULL) {
iter->second->fail = _root;
}
que.push(iter->second);
}
}
}
private:
void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
{
if(valuePointers.empty() || keys.empty())
{ {
if(!node) return;
{
return;
}
if(node->ptKeyMap)
{
typename TrieNodeType::KeyMapType::iterator it;
for(it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++)
{
_deleteNode(it->second);
}
delete node->ptKeyMap;
}
delete node;
} }
}; assert(keys.size() == valuePointers.size());
for(size_t i = 0; i < keys.size(); i++)
{
_insertNode(keys[i], valuePointers[i]);
}
}
private:
void _insertNode(const Unicode& key, const DictUnit* ptValue)
{
TrieNode* ptNode = _root;
typename TrieNode::NextMap::const_iterator kmIter;
for(typename Unicode::const_iterator citer = key.begin(); citer != key.end(); citer++)
{
if(NULL == ptNode->next)
{
ptNode->next = new typename TrieNode::NextMap;
}
kmIter = ptNode->next->find(*citer);
if(ptNode->next->end() == kmIter)
{
TrieNode * nextNode = new TrieNode;
nextNode->next = NULL;
nextNode->ptValue = NULL;
(*ptNode->next)[*citer] = nextNode;
ptNode = nextNode;
}
else
{
ptNode = kmIter->second;
}
}
ptNode->ptValue = ptValue;
}
// Recursively frees `node`: first every child stored in its `next` map, then the
// map itself, then the node. A NULL `node` is ignored.
void _deleteNode(TrieNode* node)
{
    if(node == NULL)
    {
        return;
    }
    if(node->next != NULL)
    {
        for(TrieNode::NextMap::iterator child = node->next->begin(); child != node->next->end(); child++)
        {
            _deleteNode(child->second);
        }
        delete node->next;
    }
    delete node;
}
};
} }
#endif #endif

View File

@ -8,7 +8,7 @@
using namespace CppJieba; using namespace CppJieba;
void cut(size_t times = 20) void cut(size_t times = 50)
{ {
MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8"); MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
vector<string> res; vector<string> res;

View File

@ -21,7 +21,6 @@ TEST(MixSegmentTest, Test1)
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0]))); ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
ASSERT_TRUE(segment.cut(str2, words)); ASSERT_TRUE(segment.cut(str2, words));
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0]))); ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
//exit(0);
} }
TEST(MixSegmentTest, NoUserDict) TEST(MixSegmentTest, NoUserDict)

View File

@ -1,4 +1,5 @@
#include "src/DictTrie.hpp" #include "src/DictTrie.hpp"
#include "src/MPSegment.hpp"
#include "gtest/gtest.h" #include "gtest/gtest.h"
using namespace CppJieba; using namespace CppJieba;
@ -33,23 +34,25 @@ TEST(DictTrieTest, Test1)
EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2); EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
word = "清华大学"; word = "清华大学";
vector<pair<size_t, const DictUnit*> > res; LocalVector<pair<size_t, const DictUnit*> > res;
map<size_t, const DictUnit* > resMap; //vector<pair<size_t, const DictUnit* > resMap;
map<size_t, const DictUnit* > mp; LocalVector<pair<size_t, const DictUnit*> > res2;
const char * words[] = {"", "清华", "清华大学"}; const char * words[] = {"", "清华", "清华大学"};
for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++)
{ {
ASSERT_TRUE(TransCode::decode(words[i], uni)); ASSERT_TRUE(TransCode::decode(words[i], uni));
res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end()))); res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end()); //resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
} }
//DictUnit //DictUnit
//res.push_back(make_pair(0, )) //res.push_back(make_pair(0, ))
vector<pair<size_t, const DictUnit*> > vec; vector<pair<size_t, const DictUnit*> > vec;
ASSERT_TRUE(TransCode::decode(word, uni)); ASSERT_TRUE(TransCode::decode(word, uni));
ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0)); ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0));
ASSERT_EQ(mp, resMap); s1 << res;
s2 << res;
ASSERT_EQ(s1, s2);
} }
TEST(DictTrieTest, UserDict) TEST(DictTrieTest, UserDict)
@ -64,3 +67,14 @@ TEST(DictTrieTest, UserDict)
res << *unit; res << *unit;
ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] x -2.975", res); ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] x -2.975", res);
} }
// Runs the SegmentChar overload of DictTrie::find over a plain ASCII string and
// checks the parts of the result that Trie::find sets for every input position:
// one record per character, the character copied into uniCh, and a non-empty dag.
TEST(DictTrieTest, automation)
{
    DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
    string word = "abcderf";
    Unicode unicode;
    ASSERT_TRUE(TransCode::decode(word, unicode));
    vector<struct SegmentChar> res;
    trie.find(unicode.begin(), unicode.end(), res);

    ASSERT_EQ(unicode.size(), res.size());
    for(size_t i = 0; i < res.size(); i++)
    {
        ASSERT_EQ(*(unicode.begin() + i), res[i].uniCh);
        ASSERT_FALSE(res[i].dag.empty());
    }
}