Ensure single-character Chinese words in the user dictionary are not ignored in MixSegment.hpp

2025-07-18 00:00:12 +08:00 · 2014-05-17 16:22:54 +08:00 · 2014-05-17 16:22:54 +08:00 · 5b654f66db
commit 5b654f66db
parent 5174ac098a
5 changed files with 25 additions and 7 deletions
--- a/src/DictTrie.hpp
+++ b/src/DictTrie.hpp
@ -50,6 +50,13 @@ namespace CppJieba
            TrieType * _trie;
            double _minWeight;
        private:
            unordered_set<Unicode::value_type> _userDictSingleChineseWord;
        public:
            // Checks `word` against _userDictSingleChineseWord, the set that
            // _loadUserDict fills with the only character of each user-dictionary
            // entry whose word has size 1.
            // Returns whatever isIn() reports for that set and `word`.
            bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
            {
                // NOTE(review): isIn is defined outside this view; presumably a plain
                // membership test on the container -- confirm.
                return isIn(_userDictSingleChineseWord, word);
            }
        public:
            double getMinWeight() const {return _minWeight;};
@ -84,7 +91,7 @@ namespace CppJieba
                if(userDictPath.size())
                {
                    double maxWeight = _findMaxWeight(_nodeInfos);
-                    _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos);
+                    _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
                }
                _shrink(_nodeInfos);
                _trie = _creatTrie(_nodeInfos);
@ -118,7 +125,7 @@ namespace CppJieba
                TrieType * trie = new TrieType(words, valuePointers);
                return trie;
            }
-            void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag,  vector<DictUnit>& nodeInfos) const
+            void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag)
            {
                ifstream ifs(filePath.c_str());
                assert(ifs);
@ -132,9 +139,13 @@ namespace CppJieba
                        LogError("line[%u:%s] illegal.", lineno, line.c_str());
                        continue;
                    }
                    if(nodeInfo.word.size() == 1)
                    {
                        _userDictSingleChineseWord.insert(nodeInfo.word[0]);
                    }
                    nodeInfo.weight = defaultWeight; 
                    nodeInfo.tag = defaultTag;
-                    nodeInfos.push_back(nodeInfo);
+                    _nodeInfos.push_back(nodeInfo);
                }
                LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
            }
--- a/src/MPSegment.hpp
+++ b/src/MPSegment.hpp
@ -53,6 +53,10 @@ namespace CppJieba
                LogInfo("MPSegment init(%s) ok", dictPath.c_str());
                return _setInitFlag(true);
            }
            // Thin pass-through: asks the owned _dictTrie whether `value` is a
            // single-character user-dictionary word and returns its answer unchanged.
            bool isUserDictSingleChineseWord(const Unicode::value_type & value) const
            {
                return _dictTrie.isUserDictSingleChineseWord(value);
            }
        public:
            using SegmentBase::cut;
            virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
--- a/src/MixSegment.hpp
+++ b/src/MixSegment.hpp
@ -56,15 +56,15 @@ namespace CppJieba
                for (size_t i = 0, j = 0; i < words.size(); i++)
                {
                    //if mp get a word, it's ok, put it into result
-                    if (1 != words[i].size())
+                    if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0])))
                    {
                        res.push_back(words[i]);
                        continue;
                    }
-                    // if mp get a single one, collect it in sequence
+                    // if mp get a single one and it is not in userdict, collect it in sequence
                    j = i;
-                    while (j < words.size() && words[j].size() == 1)
+                    while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0]))
                    {
                        piece.push_back(words[j][0]);
                        j++;
@ -77,7 +77,7 @@ namespace CppJieba
                        return false;
                    }
-                    //put hmm result to return
+                    //put hmm result to result
                    for (size_t k = 0; k < hmmRes.size(); k++)
                    {
                        res.push_back(hmmRes[k]);
--- a/test/testdata/testlines.utf8
+++ b/test/testdata/testlines.utf8
@ -5,3 +5,4 @@
 人事处女干事
 去医院做B超，编号123
 令狐冲是云计算行业的专家
 AB
--- a/test/testdata/userdict.utf8
+++ b/test/testdata/userdict.utf8
@ -1,2 +1,4 @@
 云计算
 韩玉鉴赏
 A
 B