Merge https://github.com/aszxqw/cppjieba

2025-07-18 00:00:12 +08:00 · 2014-11-14 13:20:04 +08:00 · 2014-11-14 13:20:04 +08:00 · 7791290473
commit 7791290473
parent 283c65db0a 9d5359fc34
7 changed files with 309 additions and 174 deletions
--- a/ChangeLog.md
+++ b/ChangeLog.md
@ -1,5 +1,11 @@
 # CppJieba ChangeLog

+## v2.4.4 (is coming)
+
+1. 修改两条更细粒度的特殊过滤规则，将连续的数字（包括浮点数）和连续的字母单独切分出来（而不会混在一起）。
+2. 修改最大概率法时动态规划过程需要使用的 DAG 数据结构（同时也修改 Trie 的 DAG 查询函数），提高分词速度 8% 。
+3. 使用了 `Aho-Corasick automaton`（AC 自动机）算法加速 Trie 的查找过程，并做了其他优化，提升性能。
+
 ## v2.4.3

 1. 更新 [Husky] 服务代码，新 [Husky] 为基于线程池的服务器简易框架。并且修复当 HTTP POST 请求时 body 过长数据可能丢失的问题。
--- a/src/DictTrie.hpp
+++ b/src/DictTrie.hpp
@ -23,31 +23,11 @@ namespace CppJieba
    const size_t DICT_COLUMN_NUM = 3;
    const char* const UNKNOWN_TAG = "x";

-
-
-    struct DictUnit
-    {
-        Unicode word;
-        double weight; 
-        string tag;
-    };
-
-    inline ostream & operator << (ostream& os, const DictUnit& unit)
-    {
-        string s;
-        s << unit.word;
-        return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
-    }
-
-    typedef map<size_t, const DictUnit*> DagType;
-
    class DictTrie
    {
-        public:
-            typedef Trie<Unicode::value_type, DictUnit, Unicode, vector<Unicode>, vector<const DictUnit*> > TrieType;
        private:
            vector<DictUnit> _nodeInfos;
-            TrieType * _trie;
+            Trie * _trie;

            double _minWeight;
        private:
@ -107,10 +87,18 @@ namespace CppJieba
            {
                return _trie->find(begin, end, dag, offset);
            }
+            // Fills `res` with the per-character lookup result for [begin, end)
+            // by delegating to the underlying Trie.
+            void find(Unicode::const_iterator begin, Unicode::const_iterator end, vector<SegmentChar>& res) const
+            {
+                _trie->find(begin, end, res);
+            }


        private:
-            TrieType * _createTrie(const vector<DictUnit>& dictUnits)
+            Trie * _createTrie(const vector<DictUnit>& dictUnits)
            {
                assert(dictUnits.size());
                vector<Unicode> words;
@ -121,7 +109,7 @@ namespace CppJieba
                    valuePointers.push_back(&dictUnits[i]);
                }

-                TrieType * trie = new TrieType(words, valuePointers);
+                Trie * trie = new Trie(words, valuePointers);
                return trie;
            }
            void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag)
--- a/src/MPSegment.hpp
+++ b/src/MPSegment.hpp
@ -1,7 +1,3 @@
-/************************************
- * file enc : ASCII
- * author   : wuyanyi09@gmail.com
- ************************************/
 #ifndef CPPJIEBA_MPSEGMENT_H
 #define CPPJIEBA_MPSEGMENT_H

@ -17,17 +13,6 @@
 namespace CppJieba
 {

-    struct SegmentChar 
-    {
-        uint16_t uniCh;
-        DagType dag;
-        const DictUnit * pInfo;
-        double weight;
-        size_t nextPos;
-        SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0)
-        {}
-    };
-
    class MPSegment: public SegmentBase
    {
        private:
@ -85,24 +70,13 @@ namespace CppJieba
                {
                    return false;
                }
-                vector<SegmentChar> segmentChars(end - begin);
+                vector<SegmentChar> segmentChars;

-                //calc DAG
-                for(size_t i = 0; i < segmentChars.size(); i ++)
-                {
-                    segmentChars[i].uniCh = *(begin + i);
-                    segmentChars[i].dag.clear();
-                    _dictTrie.find(begin + i, end, segmentChars[i].dag, i);
-                    segmentChars[i].dag.insert(pair<DagType::key_type, DagType::mapped_type>(i, NULL));
-                }
+                _dictTrie.find(begin, end, segmentChars);

                _calcDP(segmentChars);

-                if(!_cut(segmentChars, res))
-                {
-                    LogError("_cut failed.");
-                    return false;
-                }
+                _cut(segmentChars, res);

                return true;
            }
@ -112,24 +86,25 @@ namespace CppJieba
            }

        private:
-            void _calcDP(vector<SegmentChar>& SegmentChars) const
+            void _calcDP(vector<SegmentChar>& segmentChars) const
            {
                size_t nextPos;
                const DictUnit* p;
                double val;

-                for(int i = SegmentChars.size() - 1; i >= 0; i--)
+                for(ssize_t i = segmentChars.size() - 1; i >= 0; i--)
                {
-                    SegmentChars[i].pInfo = NULL;
-                    SegmentChars[i].weight = MIN_DOUBLE;
-                    for(DagType::const_iterator it = SegmentChars[i].dag.begin(); it != SegmentChars[i].dag.end(); it++)
+                    segmentChars[i].pInfo = NULL;
+                    segmentChars[i].weight = MIN_DOUBLE;
+                    assert(!segmentChars[i].dag.empty());
+                    for(DagType::const_iterator it = segmentChars[i].dag.begin(); it != segmentChars[i].dag.end(); it++)
                    {
                        nextPos = it->first;
                        p = it->second;
                        val = 0.0;
-                        if(nextPos + 1 < SegmentChars.size())
+                        if(nextPos + 1 < segmentChars.size())
                        {
-                            val += SegmentChars[nextPos + 1].weight;
+                            val += segmentChars[nextPos + 1].weight;
                        }

                        if(p)
@ -140,15 +115,15 @@ namespace CppJieba
                        {
                            val += _dictTrie.getMinWeight();
                        }
-                        if(val > SegmentChars[i].weight)
+                        if(val > segmentChars[i].weight)
                        {
-                            SegmentChars[i].pInfo = p;
-                            SegmentChars[i].weight = val;
+                            segmentChars[i].pInfo = p;
+                            segmentChars[i].weight = val;
                        }
                    }
                }
            }
-            bool _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res)const
+            void _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const
            {
                size_t i = 0;
                while(i < segmentChars.size())
@ -165,7 +140,6 @@ namespace CppJieba
                        i++;
                    }
                }
-                return true;
            }


--- a/src/Trie.hpp
+++ b/src/Trie.hpp
@ -3,141 +3,295 @@

 #include "Limonp/StdExtension.hpp"
 #include <vector>
+#include <queue>

 namespace CppJieba
 {
    using namespace std;
-    template <class KeyType, class ValueType>
-        class TrieNode
-        {
-            public:
-                typedef unordered_map<KeyType, TrieNode<KeyType, ValueType>* > KeyMapType;
-            public:
-                KeyMapType * ptKeyMap;
-                const ValueType * ptValue;
-        };

-    template <class KeyType, class ValueType, class KeyContainerType = vector<KeyType>, class KeysContainerType = vector<KeyContainerType>, class ValueContainerType = vector<const ValueType* > >
-        class Trie
-        {
-            public:
-                typedef TrieNode<KeyType, ValueType> TrieNodeType;
-            private:
-                TrieNodeType* _root;
-            public:
-                Trie(const KeysContainerType& keys, const ValueContainerType& valuePointers)
-                {
-                    _root = new TrieNodeType;
-                    _root->ptKeyMap = NULL;
-                    _root->ptValue = NULL;
+    // One dictionary entry; the Trie stores pointers to these as its values.
+    struct DictUnit
+    {
+        Unicode word;   // the entry's text
+        double weight;  // numeric weight of the entry (printed with three decimals by operator<<)
+        string tag;     // tag attached to the word (presumably a part-of-speech label -- confirm)
+    };

-                    _createTrie(keys, valuePointers);
-                }
-                ~Trie()
+    // Debug aid: writes a DictUnit to a stream as "<word> <tag> <weight>",
+    // with the weight printed to three decimal places.
+    inline ostream & operator << (ostream& os, const DictUnit& unit)
+    {
+        string wordText;
+        wordText << unit.word;
+        os << string_format("%s %s %.3lf", wordText.c_str(), unit.tag.c_str(), unit.weight);
+        return os;
+    }
+
+    // Outgoing edges of one character position in the segmentation DAG: each
+    // pair is (index of the word's last character, matching entry or NULL).
+    typedef LocalVector<std::pair<size_t, const DictUnit*> > DagType;
+
+    // Per-character record: the character itself, the dictionary words that
+    // start at it, and the best choice recorded for it by the segmenter.
+    struct SegmentChar 
+    {
+        uint16_t uniCh;            // the character at this position
+        DagType dag;               // candidate words starting here, as (end index, entry) pairs
+        const DictUnit * pInfo;    // best entry chosen for this position; NULL if none
+        double weight;             // best score found for the text starting at this position
+        size_t nextPos;            // NOTE(review): initialized to 0; no other use is visible here
+        SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0)
+        {}
+        ~SegmentChar() 
+        {}
+    };
+
+    // Key type of a Trie edge: one element of a Unicode string.
+    typedef Unicode::value_type TrieKey;
+
+    // A node of the dictionary trie, extended with the failure link used for
+    // Aho-Corasick matching. All members are public; Trie allocates, links and
+    // frees the nodes.
+    class TrieNode
+    {
+        public:
+            typedef unordered_map<TrieKey,  TrieNode*> NextMap;
+        public:
+            TrieNode * fail;            // failure link; NULL until Trie sets it
+            NextMap * next;             // children by key; NULL while the node has none
+            const DictUnit * ptValue;   // entry whose key ends at this node, or NULL
+        public:
+            TrieNode(): fail(NULL), next(NULL), ptValue(NULL) 
+            {}
+            // Returns the child reached by `key`, or NULL if there is no such child.
+            const TrieNode * findNext(TrieKey key) const
+            {
+                if(next == NULL)
                 {
-                    if(_root)
+                    return NULL;
+                }
+                // NextMap is not a dependent name, so no `typename` is needed
+                // (and pre-C++11 compilers reject it outside templates).
+                NextMap::const_iterator iter = next->find(key);
+                if(iter == next->end()) 
+                {
+                    return NULL;
+                }
+                return iter->second;
+            }
+    };
+
+    class Trie
+    {
+        private:
+            TrieNode* _root;
+        public:
+            // Builds the trie from parallel arrays of keys and value pointers,
+            // then computes the failure links used by the Aho-Corasick lookup.
+            Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
+                : _root(new TrieNode)
+            {
+                _createTrie(keys, valuePointers);
+                _build(); // compute the failure links
+            }
+            // Frees every node of the trie.
+            ~Trie()
+            {
+                // _deleteNode ignores NULL, so no separate check is needed here.
+                _deleteNode(_root);
+            }
+        public:
+            // Exact lookup: returns the entry whose key equals [begin, end), or
+            // NULL when that range is not a key of the trie.
+            const DictUnit* find(Unicode::const_iterator begin, Unicode::const_iterator end) const
+            {
+                TrieNode::NextMap::const_iterator citer;
+                const TrieNode* ptNode = _root;
+                for(Unicode::const_iterator it = begin; it != end; ++it)
+                {
+                    assert(ptNode);
+                    // Follow one edge per element; give up as soon as an edge is missing.
+                    if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it)))
                     {
-                        _deleteNode(_root);
+                        return NULL;
                     }
+                    ptNode = citer->second;
                 }
-            public:
-                const ValueType* find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end) const
+                // A node that is only a prefix of longer keys has a NULL ptValue.
+                return ptNode->ptValue;
+            }
+            // Aho-Corasick lookup: in a single pass over [begin, end), finds the
+            // dictionary words occurring in the text and records each one in the
+            // DAG of the position where it starts.
+            // On return `res` holds one SegmentChar per input character, and
+            // res[i].dag[0] is always the single-character edge (i, entry-or-NULL).
+            void find(
+                        Unicode::const_iterator begin, 
+                        Unicode::const_iterator end, 
+                        vector<struct SegmentChar>& res
+                        ) const
+            {
+                const size_t len = end - begin;
+                // Start from fresh elements so a reused `res` cannot carry stale DAG entries.
+                res.clear();
+                res.resize(len);
+                const TrieNode * now = _root;
+                const TrieNode* node = NULL;
+                for (size_t i = 0; i < len; i++) 
                 {
-                    typename TrieNodeType::KeyMapType::const_iterator citer;
-                    const TrieNodeType* ptNode = _root;
-                    for(typename KeyContainerType::const_iterator it = begin; it != end; it++)
+                    Unicode::value_type ch = *(begin + i);
+                    res[i].uniCh = ch;
+                    assert(res[i].dag.empty());
+                    // Every position can at least be cut as a single character.
+                    res[i].dag.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(i, NULL));
+                    bool flag = false;
+
+                    // Roll back along the failure links until some state has a
+                    // transition on `ch`, or the root is reached.
+                    while( now != _root )
                     {
-                        assert(ptNode);
-                        if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*it)))
-                        {
-                            return NULL;
-                        }
-                        ptNode = citer->second;
-                    }
-                    return ptNode->ptValue;
-                }
-                bool find(typename KeyContainerType::const_iterator begin, typename KeyContainerType::const_iterator end, map<typename KeyContainerType::size_type, const ValueType* >& ordererMap, size_t offset = 0) const
-                {
-                    const TrieNodeType * ptNode = _root;
-                    typename TrieNodeType::KeyMapType::const_iterator citer;
-                    ordererMap.clear();
-                    for(typename KeyContainerType::const_iterator itr = begin; itr != end ; itr++)
-                    {
-                        assert(ptNode);
-                        if(NULL == ptNode->ptKeyMap || ptNode->ptKeyMap->end() == (citer = ptNode->ptKeyMap->find(*itr)))
+                        node = now->findNext(ch);
+                        if (node != NULL) 
                         {
+                            flag = true;
                             break;
                         }
-                        ptNode = citer->second;
-                        if(ptNode->ptValue)
+                        else 
                         {
-                            ordererMap[itr - begin + offset] = ptNode->ptValue;
+                            now = now->fail;
                         }
                     }
-                    return ordererMap.size();
-                }
-            private:
-                void _createTrie(const KeysContainerType& keys, const ValueContainerType& valuePointers)
-                {
-                    if(valuePointers.empty() || keys.empty())
+
+                    // The rollback ended at the root without a match: try the root itself.
+                    if(!flag)
                     {
-                        return;
+                        node = now->findNext(ch);
                     }
-                    assert(keys.size() == valuePointers.size());
-
-                    for(size_t i = 0; i < keys.size(); i++)
+                    if(node == NULL) 
                     {
-                        _insertNode(keys[i], valuePointers[i]);
-                    }
-                }
-            private:
-                void _insertNode(const KeyContainerType& key, const ValueType* ptValue)
-                {
-                    TrieNodeType* ptNode  = _root;
-
-                    typename TrieNodeType::KeyMapType::const_iterator kmIter;
-
-                    for(typename KeyContainerType::const_iterator citer = key.begin(); citer != key.end(); citer++)
+                        now = _root;
+                    } 
+                    else 
                     {
-                        if(NULL == ptNode->ptKeyMap)
+                        now = node;
+                        // Report every word ending at position i: the current state
+                        // and each state reachable from it through failure links.
+                        const TrieNode * temp = now;
+                        while(temp != _root) 
                         {
-                            ptNode->ptKeyMap = new typename TrieNodeType::KeyMapType;
+                            if (temp->ptValue) 
+                            {
+                                // The word ends at i, so it starts word.size() - 1 earlier.
+                                size_t pos = i - temp->ptValue->word.size() + 1;
+                                res[pos].dag.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(i, temp->ptValue));
+                                if(pos == i) 
+                                {
+                                    // Single-character word: also fill in the default edge.
+                                    res[pos].dag[0].second = temp->ptValue;
+                                }
+                            }
+                            temp = temp->fail;
+                            assert(temp);
                         }
-                        kmIter = ptNode->ptKeyMap->find(*citer);
-                        if(ptNode->ptKeyMap->end() == kmIter)
+                    }
+                }
+            }
+            // Prefix lookup: appends to `res` one (end index, entry) pair for every
+            // dictionary word that is a prefix of [begin, end). End indexes are
+            // relative to `begin` and shifted by `offset`.
+            // `res` is not cleared first. If it already holds exactly one element
+            // and the first character alone is a word, that element's entry is
+            // overwritten instead of a new pair being appended.
+            // Returns true when `res` is non-empty on exit.
+            bool find(
+                        Unicode::const_iterator begin, 
+                        Unicode::const_iterator end, 
+                        DagType & res,
+                        size_t offset = 0) const
+            {
+                const TrieNode * ptNode = _root;
+                TrieNode::NextMap::const_iterator citer;
+                for(Unicode::const_iterator itr = begin; itr != end ; ++itr)
+                {
+                    assert(ptNode);
+                    // Stop at the first character with no matching edge.
+                    if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr)))
+                    {
+                        break;
+                    }
+                    ptNode = citer->second;
+                    if(ptNode->ptValue)
+                    {
+                        if(itr == begin && res.size() == 1) // first single-character word
                         {
-                            TrieNodeType * nextNode = new TrieNodeType;
-                            nextNode->ptKeyMap = NULL;
-                            nextNode->ptValue = NULL;
-
-                            (*ptNode->ptKeyMap)[*citer] = nextNode;
-                            ptNode = nextNode;
+                            res[0].second = ptNode->ptValue;
                         }
                         else
                         {
-                            ptNode = kmIter->second;
+                            res.push_back(pair<vector<Unicode >::size_type, const DictUnit* >(itr - begin + offset, ptNode->ptValue));
                         }
                     }
-                    ptNode->ptValue = ptValue;
                 }
-                void _deleteNode(TrieNodeType* node)
+                return !res.empty();
+            }
+        private:
+            // Computes the failure link of every node with a breadth-first walk
+            // from the root, turning the trie into an Aho-Corasick automaton.
+            void _build()
+            {
+                assert(_root->ptValue == NULL);
+                _root->fail = NULL;
+                if(_root->next == NULL)
+                {
+                    // Empty trie (no key was inserted): there are no links to compute.
+                    return;
+                }
+                queue<TrieNode*> que;
+                // Nodes directly below the root always fall back to the root.
+                for(TrieNode::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); ++iter) {
+                    iter->second->fail = _root;
+                    que.push(iter->second);
+                }
+                while(!que.empty()) {
+                    TrieNode * now = que.front();
+                    que.pop();
+                    if(now->next == NULL) {
+                        continue;
+                    }
+                    for(TrieNode::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); ++iter) {
+                        // Walk the parent's failure chain until some node has a child
+                        // with the same key; that child is this node's failure target.
+                        TrieNode* back = now->fail;
+                        TrieNode::NextMap::iterator backiter;
+                        while(back != NULL) {
+                            if(back->next && (backiter = back->next->find(iter->first)) != back->next->end()) 
+                            {
+                                iter->second->fail = backiter->second;
+                                break;
+                            }
+                            back = back->fail;
+                        }
+                        // The chain ran out without a match: fall back to the root.
+                        if(back == NULL) {
+                            iter->second->fail = _root;
+                        }
+                        que.push(iter->second);
+                    }
+                }
+            }
+        private:
+            // Inserts every (keys[i], valuePointers[i]) pair into the trie.
+            // Does nothing when either container is empty; otherwise both must
+            // have the same size (checked by assert only).
+            void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
+            {
+                if(valuePointers.empty() || keys.empty())
                 {
-                    if(!node)
-                    {
-                        return;
-                    }
-                    if(node->ptKeyMap)
-                    {
-                        typename TrieNodeType::KeyMapType::iterator it;
-                        for(it = node->ptKeyMap->begin(); it != node->ptKeyMap->end(); it++)
-                        {
-                            _deleteNode(it->second);
-                        }
-                        delete node->ptKeyMap;
-                    }
-                    delete node;
+                    return;
                 }
-        };
+                assert(keys.size() == valuePointers.size());
+
+                for(size_t i = 0; i < keys.size(); i++)
+                {
+                    _insertNode(keys[i], valuePointers[i]);
+                }
+            }
+        private:
+            // Inserts `key` into the trie, creating nodes as needed, and attaches
+            // `ptValue` to the node that ends the key (replacing any earlier value).
+            void _insertNode(const Unicode& key, const DictUnit* ptValue)
+            {
+                TrieNode* ptNode  = _root;
+
+                for(Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer)
+                {
+                    // Child maps are allocated lazily, on the first child.
+                    if(NULL == ptNode->next)
+                    {
+                        ptNode->next = new TrieNode::NextMap;
+                    }
+                    TrieNode::NextMap::const_iterator kmIter = ptNode->next->find(*citer);
+                    if(ptNode->next->end() == kmIter)
+                    {
+                        // TrieNode's constructor already nulls fail, next and ptValue.
+                        TrieNode * nextNode = new TrieNode;
+                        (*ptNode->next)[*citer] = nextNode;
+                        ptNode = nextNode;
+                    }
+                    else
+                    {
+                        ptNode = kmIter->second;
+                    }
+                }
+                ptNode->ptValue = ptValue;
+            }
+            // Recursively frees `node`, its child map and every node below it.
+            // A NULL `node` is ignored.
+            void _deleteNode(TrieNode* node)
+            {
+                if(!node)
+                {
+                    return;
+                }
+                if(node->next)
+                {
+                    for(TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it)
+                    {
+                        _deleteNode(it->second);
+                    }
+                    delete node->next;
+                }
+                delete node;
+            }
+    };
 }

 #endif
--- a/test/load_test.cpp
+++ b/test/load_test.cpp
@ -8,7 +8,7 @@

 using namespace CppJieba;

-void cut(size_t times = 20)
+void cut(size_t times = 50)
 {
    MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
    vector<string> res;
--- a/test/unittest/TSegments.cpp
+++ b/test/unittest/TSegments.cpp
@ -21,7 +21,6 @@ TEST(MixSegmentTest, Test1)
    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
    ASSERT_TRUE(segment.cut(str2, words));
    ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
-    //exit(0);
 }

 TEST(MixSegmentTest, NoUserDict)
--- a/test/unittest/TTrie.cpp
+++ b/test/unittest/TTrie.cpp
@ -1,4 +1,5 @@
 #include "src/DictTrie.hpp"
+#include "src/MPSegment.hpp"
 #include "gtest/gtest.h"

 using namespace CppJieba;
@ -33,23 +34,25 @@ TEST(DictTrieTest, Test1)
    
    EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
    word = "清华大学";
-    vector<pair<size_t, const DictUnit*> > res;
-    map<size_t, const DictUnit* > resMap;
-    map<size_t, const DictUnit* > mp;
+    LocalVector<pair<size_t, const DictUnit*> > res;
+    //vector<pair<size_t, const DictUnit* > resMap;
+    LocalVector<pair<size_t, const DictUnit*> > res2;
    const char * words[] = {"清", "清华", "清华大学"};
    for(size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++)
    {
        ASSERT_TRUE(TransCode::decode(words[i], uni));
        res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
-        resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
+        //resMap[uni.size() - 1] = trie.find(uni.begin(), uni.end());
    }
    //DictUnit
    //res.push_back(make_pair(0, ))

    vector<pair<size_t, const DictUnit*> > vec;
    ASSERT_TRUE(TransCode::decode(word, uni));
-    ASSERT_TRUE(trie.find(uni.begin(), uni.end(), mp, 0));
-    ASSERT_EQ(mp, resMap);
+    ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0));
+    s1 << res;
+    s2 << res;
+    ASSERT_EQ(s1, s2);
 }

 TEST(DictTrieTest, UserDict)
@ -64,3 +67,14 @@ TEST(DictTrieTest, UserDict)
    res << *unit;
    ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] x -2.975", res);
 }
+
+// Runs the Aho-Corasick lookup over an ASCII string and checks the shape of
+// the result: one SegmentChar per input character, each holding that
+// character and starting its DAG with its own single-character edge.
+TEST(DictTrieTest, automation)
+{
+    DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
+    //string word = "yasherhs";
+    string word = "abcderf";
+    Unicode unicode;
+    ASSERT_TRUE(TransCode::decode(word, unicode));
+    vector<struct SegmentChar> res;
+    trie.find(unicode.begin(), unicode.end(), res);
+    ASSERT_EQ(unicode.size(), res.size());
+    for(size_t i = 0; i < res.size(); i++)
+    {
+        ASSERT_EQ(*(unicode.begin() + i), res[i].uniCh);
+        ASSERT_FALSE(res[i].dag.empty());
+        ASSERT_EQ(i, res[i].dag[0].first);
+    }
+}