update trie and dag , make cut faster . see details in changelog.md

2025-07-18 00:00:12 +08:00 · 2014-11-05 15:31:09 +08:00 · 2014-11-05 15:31:09 +08:00 · b9736ee132
commit b9736ee132
parent 11b041ed52
5 changed files with 26 additions and 11 deletions
--- a/ChangeLog.md
+++ b/ChangeLog.md
@ -1,5 +1,10 @@
 # CppJieba ChangeLog

+## v2.4.4 (is coming)
+
+1. 修改两条更细粒度的特殊过滤规则，将连续的数字（包括浮点数）和连续的字母单独切分出来（而不会混在一起）。
+2. 修改最大概率法时动态规划过程需要使用的 DAG 数据结构（同时也修改 Trie 的 DAG 查询函数），提高分词速度 8% 。
+
 ## v2.4.3

 1. 更新 [Husky] 服务代码，新 [Husky] 为基于线程池的服务器简易框架。并且修复当 HTTP POST 请求时 body 过长数据可能丢失的问题。
--- a/src/DictTrie.hpp
+++ b/src/DictTrie.hpp
@ -39,7 +39,7 @@ namespace CppJieba
        return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
    }

-    typedef map<size_t, const DictUnit*> DagType;
+    typedef std::vector<std::pair<size_t, const DictUnit*> > DagType;

    class DictTrie
    {
--- a/src/MPSegment.hpp
+++ b/src/MPSegment.hpp
@ -92,8 +92,8 @@ namespace CppJieba
                {
                    segmentChars[i].uniCh = *(begin + i);
                    segmentChars[i].dag.clear();
+                    segmentChars[i].dag.push_back(std::pair<size_t, const DictUnit*>(i, NULL));
                    _dictTrie.find(begin + i, end, segmentChars[i].dag, i);
-                    segmentChars[i].dag.insert(pair<DagType::key_type, DagType::mapped_type>(i, NULL));
                }

                _calcDP(segmentChars);
--- a/src/Trie.hpp
+++ b/src/Trie.hpp
@ -56,11 +56,14 @@ namespace CppJieba
                    }
                    return ptNode->ptValue;
                }
-                bool find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end, map<typename KeyContainerType::size_type, const ValueType* >& ordererMap, size_t offset = 0) const
+                bool find(
+                            typename KeyContainerType::const_iterator begin, 
+                            typename KeyContainerType::const_iterator end, 
+                            std::vector<std::pair<typename KeyContainerType::size_type, const ValueType* > >& res, 
+                            size_t offset = 0) const
                {
                    const TrieNodeType * ptNode = _root;
                    typename TrieNodeType::KeyMapType::const_iterator citer;
-                    ordererMap.clear();
                    for(typename KeyContainerType::const_iterator itr = begin; itr != end ; itr++)
                    {
                        assert(ptNode);
@ -71,10 +74,17 @@ namespace CppJieba
                        ptNode = citer->second;
                        if(ptNode->ptValue)
                        {
-                            ordererMap[itr - begin + offset] = ptNode->ptValue;
+                            if(itr == begin && res.size() == 1) // first singleword
+                            {
+                                res[0].second = ptNode->ptValue;
+                            }
+                            else
+                            {
+                                res.push_back(pair<typename KeysContainerType::size_type, const ValueType* >(itr - begin + offset, ptNode->ptValue));
+                            }
                        }
                    }
-                    return ordererMap.size();
+                    return !res.empty();
                }
            private:
                void _createTrie(const KeysContainerType& keys, const ValueContainerType& valuePointers)
--- a/test/unittest/TTrie.cpp
+++ b/test/unittest/TTrie.cpp
@ -34,22 +34,22 @@ TEST(DictTrieTest, Test1)
    EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
    word = "清华大学";
    vector<pair<size_t, const DictUnit*> > res;
-    map<size_t, const DictUnit* > resMap;
-    map<size_t, const DictUnit* > mp;
+    //vector<pair<size_t, const DictUnit* > resMap;
+    vector<pair<size_t, const DictUnit*> > res2;
    const char * words[] = {"清", "清华", "清华大学"};
    for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++)
    {
        ASSERT_TRUE(TransCode::decode(words[i], uni));
        res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
-        resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
+        //resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
    }
    //DictUnit
    //res.push_back(make_pair(0, ))

    vector<pair<size_t, const DictUnit*> > vec;
    ASSERT_TRUE(TransCode::decode(word, uni));
-    ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0));
-    ASSERT_EQ(mp, resMap);
+    ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0));
+    ASSERT_EQ(res, res2);
 }

 TEST(DictTrieTest, UserDict)