diff --git a/ChangeLog.md b/ChangeLog.md index c968114..5e5f78b 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,5 +1,10 @@ # CppJieba ChangeLog +## v2.4.4 (is coming) + +1. 修改两条更细粒度的特殊过滤规则,将连续的数字(包括浮点数)和连续的字母单独切分出来(而不会混在一起)。 +2. 修改最大概率法时动态规划过程需要使用的 DAG 数据结构(同时也修改 Trie 的 DAG 查询函数),提高分词速度 8% 。 + ## v2.4.3 1. 更新 [Husky] 服务代码,新 [Husky] 为基于线程池的服务器简易框架。并且修复当 HTTP POST 请求时 body 过长数据可能丢失的问题。 diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp index 16a46e5..8c2d44f 100644 --- a/src/DictTrie.hpp +++ b/src/DictTrie.hpp @@ -39,7 +39,7 @@ namespace CppJieba return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); } - typedef map DagType; + typedef std::vector > DagType; class DictTrie { diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp index 3b62f4a..f397928 100644 --- a/src/MPSegment.hpp +++ b/src/MPSegment.hpp @@ -92,8 +92,8 @@ namespace CppJieba { segmentChars[i].uniCh = *(begin + i); segmentChars[i].dag.clear(); + segmentChars[i].dag.push_back(std::pair(i, NULL)); _dictTrie.find(begin + i, end, segmentChars[i].dag, i); - segmentChars[i].dag.insert(pair(i, NULL)); } _calcDP(segmentChars); diff --git a/src/Trie.hpp b/src/Trie.hpp index dbaf989..d89b38b 100644 --- a/src/Trie.hpp +++ b/src/Trie.hpp @@ -56,11 +56,14 @@ namespace CppJieba } return ptNode->ptValue; } - bool find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end, map& ordererMap, size_t offset = 0) const + bool find( + typename KeyContainerType::const_iterator begin, + typename KeyContainerType::const_iterator end, + std::vector >& res, + size_t offset = 0) const { const TrieNodeType * ptNode = _root; typename TrieNodeType::KeyMapType::const_iterator citer; - ordererMap.clear(); for(typename KeyContainerType::const_iterator itr = begin; itr != end ; itr++) { assert(ptNode); @@ -71,10 +74,17 @@ namespace CppJieba ptNode = citer->second; if(ptNode->ptValue) { - ordererMap[itr - begin + offset] = ptNode->ptValue; + if(itr == begin && res.size() == 1) // first singleword + { + res[0].second = ptNode->ptValue; + } + else + { + res.push_back(pair(itr - begin + offset, ptNode->ptValue)); + } } } - return ordererMap.size(); + return !res.empty(); } private: void _createTrie(const KeysContainerType& keys, const ValueContainerType& valuePointers) diff --git a/test/unittest/TTrie.cpp b/test/unittest/TTrie.cpp index 2791884..18ff9a9 100644 --- a/test/unittest/TTrie.cpp +++ b/test/unittest/TTrie.cpp @@ -34,22 +34,22 @@ TEST(DictTrieTest, Test1) EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2); word = "清华大学"; vector > res; - map resMap; - map mp; + //vector resMap; + vector > res2; const char * words[] = {"清", "清华", "清华大学"}; for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) { ASSERT_TRUE(TransCode::decode(words[i], uni)); res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end()))); - resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end()); + //resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end()); } //DictUnit //res.push_back(make_pair(0, )) vector > vec; ASSERT_TRUE(TransCode::decode(word, uni)); - ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0)); - ASSERT_EQ(mp, resMap); + ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0)); + ASSERT_EQ(res, res2); } TEST(DictTrieTest, UserDict)