Mirror of https://github.com/yanyiwu/cppjieba.git
update trie and dag, make cut faster. see details in changelog.md
commit b9736ee132
parent 11b041ed52
ChangeLog.md
@@ -1,5 +1,10 @@
 # CppJieba ChangeLog
 
+## v2.4.4 (is coming)
+
+1. Refine the special filter rules into two finer-grained rules: consecutive digits (including floating-point numbers) and consecutive letters are now cut out as separate tokens instead of being mixed together.
+2. Change the DAG data structure used by the dynamic-programming step of the max-probability method (and the Trie's DAG lookup function along with it), improving cut speed by 8%.
+
 ## v2.4.3
 
 1. Update the [Husky] server code; the new [Husky] is a simple thread-pool based server framework. Also fix a bug where data could be lost when an HTTP POST request body is too long.
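The finer-grained filter rule (item 1) is not part of the hunks below; as a rough illustration of the idea only — the function name and ASCII-only handling are invented, not CppJieba's code — splitting runs of digits and runs of letters into separate tokens can look like this:

// Illustrative ASCII-only sketch: cut consecutive digits (including a
// decimal point) and consecutive letters into separate tokens instead of
// one mixed token. The function name and details are invented, not
// CppJieba's implementation.
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

static std::vector<std::string> splitDigitsAndLetters(const std::string& s)
{
    std::vector<std::string> tokens;
    size_t i = 0;
    while(i < s.size())
    {
        size_t j = i;
        if(isdigit((unsigned char)s[i]))
        {
            // a run of digits, allowing a decimal point: "6.1" stays whole
            while(j < s.size() && (isdigit((unsigned char)s[j]) || s[j] == '.')) j++;
        }
        else if(isalpha((unsigned char)s[i]))
        {
            while(j < s.size() && isalpha((unsigned char)s[j])) j++;
        }
        else
        {
            j++;  // anything else becomes a single-character token here
        }
        tokens.push_back(s.substr(i, j - i));
        i = j;
    }
    return tokens;
}

int main()
{
    // "iphone6.1plus" -> "iphone", "6.1", "plus"
    std::vector<std::string> t = splitDigitsAndLetters("iphone6.1plus");
    for(size_t i = 0; i < t.size(); i++) std::cout << t[i] << std::endl;
    return 0;
}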
@@ -39,7 +39,7 @@ namespace CppJieba
        return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
    }
 
-   typedef map<size_t, const DictUnit*> DagType;
+   typedef std::vector<std::pair<size_t, const DictUnit*> > DagType;
 
    class DictTrie
    {
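The effect of the DagType change above: the trie lookup emits candidate words for a start position in increasing end-offset order, and the dynamic-programming step only iterates those candidates, so a contiguous vector of (end offset, dictionary entry) pairs replaces the map without per-insert node allocation or ordered lookups. A minimal sketch, with a stand-in DictUnit and made-up weights:

// Sketch of the DagType change: a stand-in DictUnit, made-up weights, and a
// DAG for one start position stored as a sorted vector instead of a map.
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

struct DictUnit { double weight; };   // stand-in for CppJieba's DictUnit

typedef std::vector<std::pair<size_t, const DictUnit*> > DagType;

int main()
{
    DictUnit one = {-10.0}, two = {-8.5}, four = {-6.2};  // made-up weights

    // Candidate words starting at one position, ending at absolute offsets
    // 0, 1 and 3 (e.g. "清", "清华", "清华大学"); the trie lookup appends
    // them in increasing end-offset order, so the vector is already sorted.
    DagType dag;
    dag.push_back(std::make_pair((size_t)0, (const DictUnit*)&one));
    dag.push_back(std::make_pair((size_t)1, (const DictUnit*)&two));
    dag.push_back(std::make_pair((size_t)3, (const DictUnit*)&four));

    // The DP step only iterates the candidates; no keyed lookup is needed,
    // so a linear scan over contiguous memory replaces the map traversal.
    for(DagType::const_iterator it = dag.begin(); it != dag.end(); ++it)
    {
        std::cout << "ends at " << it->first << ", weight " << it->second->weight << std::endl;
    }
    return 0;
}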
@@ -92,8 +92,8 @@ namespace CppJieba
            {
                segmentChars[i].uniCh = *(begin + i);
                segmentChars[i].dag.clear();
+               segmentChars[i].dag.push_back(std::pair<size_t, const DictUnit*>(i, NULL));
                _dictTrie.find(begin + i, end, segmentChars[i].dag, i);
-               segmentChars[i].dag.insert(pair<DagType::key_type, DagType::mapped_type>(i, NULL));
            }
 
            _calcDP(segmentChars);
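With the vector DAG, the single-character fallback (i, NULL) is pushed before the trie lookup, so it always sits in slot 0 and find() can fill it in place (see the src/Trie.hpp hunk below). The dynamic-programming step then just walks each position's candidate list. A simplified, self-contained sketch of such a DP over absolute end offsets — not the library's actual _calcDP, and the weights are made up:

// Simplified max-probability DP over the vector-based DAG; not the library's
// actual _calcDP, and the weights below are made up.
#include <cstddef>
#include <utility>
#include <vector>

struct DictUnit { double weight; };

typedef std::vector<std::pair<size_t, const DictUnit*> > DagType;

struct SegmentChar
{
    DagType dag;     // candidates: (absolute end offset, dict entry or NULL)
    double logProb;  // best score achievable from this position onwards
    size_t nextPos;  // end offset of the word chosen at this position
};

static void calcDP(std::vector<SegmentChar>& chars, double minLogProb)
{
    // walk right to left; each position keeps the candidate that maximises
    // its own weight plus the best score after the candidate's end
    for(size_t k = chars.size(); k-- > 0; )
    {
        chars[k].logProb = -1e300;
        for(DagType::const_iterator it = chars[k].dag.begin(); it != chars[k].dag.end(); ++it)
        {
            double w = it->second ? it->second->weight : minLogProb;  // NULL entry = OOV single char
            double rest = (it->first + 1 < chars.size()) ? chars[it->first + 1].logProb : 0.0;
            if(w + rest > chars[k].logProb)
            {
                chars[k].logProb = w + rest;
                chars[k].nextPos = it->first;
            }
        }
    }
}

int main()
{
    // two characters; only the first starts a known two-character word
    DictUnit bigram = {-8.0};
    std::vector<SegmentChar> chars(2);
    chars[0].dag.push_back(std::make_pair((size_t)0, (const DictUnit*)NULL));
    chars[0].dag.push_back(std::make_pair((size_t)1, (const DictUnit*)&bigram));
    chars[1].dag.push_back(std::make_pair((size_t)1, (const DictUnit*)NULL));
    calcDP(chars, -17.0);
    return chars[0].nextPos == 1 ? 0 : 1;  // the two-character word wins
}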
src/Trie.hpp
@@ -56,11 +56,14 @@ namespace CppJieba
            }
            return ptNode->ptValue;
        }
-       bool find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end, map<typename KeyContainerType::size_type, const ValueType* >& ordererMap, size_t offset = 0) const
+       bool find(
+                   typename KeyContainerType::const_iterator begin,
+                   typename KeyContainerType::const_iterator end,
+                   std::vector<std::pair<typename KeyContainerType::size_type, const ValueType* > >& res,
+                   size_t offset = 0) const
        {
            const TrieNodeType * ptNode = _root;
            typename TrieNodeType::KeyMapType::const_iterator citer;
-           ordererMap.clear();
            for(typename KeyContainerType::const_iterator itr = begin; itr != end ; itr++)
            {
                assert(ptNode);
@@ -71,10 +74,17 @@ namespace CppJieba
                    ptNode = citer->second;
                    if(ptNode->ptValue)
                    {
-                       ordererMap[itr - begin + offset] = ptNode->ptValue;
+                       if(itr == begin && res.size() == 1) // first singleword
+                       {
+                           res[0].second = ptNode->ptValue;
+                       }
+                       else
+                       {
+                           res.push_back(pair<typename KeysContainerType::size_type, const ValueType* >(itr - begin + offset, ptNode->ptValue));
+                       }
                    }
                }
-           return ordererMap.size();
+           return !res.empty();
        }
    private:
        void _createTrie(const KeysContainerType& keys, const ValueContainerType& valuePointers)
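The caller pre-seeds res with (offset, NULL) for the single character, so the branch above fills that seed in place when the first character is itself a dictionary word, and appends longer matches after it in increasing end-offset order; find() now reports success via !res.empty(). A self-contained sketch of just that collection logic — collect() is a hypothetical helper, not part of Trie.hpp:

// Sketch of the collection logic added to find(): the caller seeds res with
// (offset, NULL); the first match at the first character overwrites the seed
// instead of appending a duplicate entry for the same end offset.
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

struct DictUnit { double weight; };
typedef std::vector<std::pair<size_t, const DictUnit*> > DagType;

// 'firstChar' plays the role of (itr == begin) in the real find()
static void collect(DagType& res, size_t endOffset, const DictUnit* value, bool firstChar)
{
    if(firstChar && res.size() == 1)      // first singleword: fill the seed
    {
        res[0].second = value;
    }
    else                                  // longer word: append in offset order
    {
        res.push_back(std::make_pair(endOffset, value));
    }
}

int main()
{
    DictUnit one = {-9.0}, two = {-7.5};  // made-up entries, e.g. "清", "清华"
    size_t i = 4;                         // some start position inside a text

    DagType res;
    res.push_back(std::make_pair(i, (const DictUnit*)NULL)); // caller's seed
    collect(res, i, &one, true);          // the single char is itself a word
    collect(res, i + 1, &two, false);     // a two-character word also matches

    assert(res.size() == 2);
    assert(res[0].second == &one);        // seed was filled in place
    assert(res[1].first == i + 1);
    return 0;
}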
@@ -34,22 +34,22 @@ TEST(DictTrieTest, Test1)
    EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
    word = "清华大学";
    vector<pair<size_t, const DictUnit*> > res;
-   map<size_t, const DictUnit* > resMap;
-   map<size_t, const DictUnit* > mp;
+   //vector<pair<size_t, const DictUnit* > resMap;
+   vector<pair<size_t, const DictUnit*> > res2;
    const char * words[] = {"清", "清华", "清华大学"};
    for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++)
    {
        ASSERT_TRUE(TransCode::decode(words[i], uni));
        res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
-       resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
+       //resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
    }
    //DictUnit
    //res.push_back(make_pair(0, ))
 
    vector<pair<size_t, const DictUnit*> > vec;
    ASSERT_TRUE(TransCode::decode(word, uni));
-   ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0));
-   ASSERT_EQ(mp, resMap);
+   ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0));
+   ASSERT_EQ(res, res2);
 }
 
 TEST(DictTrieTest, UserDict)