From 5b654f66dbcd24b2356051e4754c253433b5955b Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Sat, 17 May 2014 16:22:54 +0800
Subject: [PATCH] ensure a single-character Chinese word in userdict is not
 ignored in MixSegment.hpp

---
 src/DictTrie.hpp             | 17 ++++++++++++++---
 src/MPSegment.hpp            |  4 ++++
 src/MixSegment.hpp           |  8 ++++----
 test/testdata/testlines.utf8 |  1 +
 test/testdata/userdict.utf8  |  2 ++
 5 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/src/DictTrie.hpp b/src/DictTrie.hpp
index 53a6291..f655d7d 100644
--- a/src/DictTrie.hpp
+++ b/src/DictTrie.hpp
@@ -50,6 +50,13 @@ namespace CppJieba
             TrieType * _trie;
 
             double _minWeight;
+        private:
+            unordered_set<Unicode::value_type> _userDictSingleChineseWord;
+        public:
+            bool isUserDictSingleChineseWord(const Unicode::value_type& word) const
+            {
+                return isIn(_userDictSingleChineseWord, word);
+            }
         public:
             double getMinWeight() const {return _minWeight;};
 
@@ -84,7 +91,7 @@ namespace CppJieba
                 if(userDictPath.size())
                 {
                     double maxWeight = _findMaxWeight(_nodeInfos);
-                    _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos);
+                    _loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG);
                 }
                 _shrink(_nodeInfos);
                 _trie = _creatTrie(_nodeInfos);
@@ -118,7 +125,7 @@ namespace CppJieba
                 TrieType * trie = new TrieType(words, valuePointers);
                 return trie;
             }
-            void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag,  vector<DictUnit>& nodeInfos) const
+            void _loadUserDict(const string& filePath, double defaultWeight, const string& defaultTag)
             {
                 ifstream ifs(filePath.c_str());
                 assert(ifs);
@@ -132,9 +139,13 @@ namespace CppJieba
                         LogError("line[%u:%s] illegal.", lineno, line.c_str());
                         continue;
                     }
+                    if(nodeInfo.word.size() == 1)
+                    {
+                        _userDictSingleChineseWord.insert(nodeInfo.word[0]);
+                    }
                     nodeInfo.weight = defaultWeight; 
                     nodeInfo.tag = defaultTag;
-                    nodeInfos.push_back(nodeInfo);
+                    _nodeInfos.push_back(nodeInfo);
                 }
                 LogInfo("load userdict[%s] ok. lines[%u]", filePath.c_str(), lineno);
             }
diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp
index f132b61..3ae78b9 100644
--- a/src/MPSegment.hpp
+++ b/src/MPSegment.hpp
@@ -53,6 +53,10 @@ namespace CppJieba
                 LogInfo("MPSegment init(%s) ok", dictPath.c_str());
                 return _setInitFlag(true);
             }
+            bool isUserDictSingleChineseWord(const Unicode::value_type & value) const
+            {
+                return _dictTrie.isUserDictSingleChineseWord(value);
+            }
         public:
             using SegmentBase::cut;
             virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
diff --git a/src/MixSegment.hpp b/src/MixSegment.hpp
index cc6a02a..e613c8f 100644
--- a/src/MixSegment.hpp
+++ b/src/MixSegment.hpp
@@ -56,15 +56,15 @@ namespace CppJieba
                 for (size_t i = 0, j = 0; i < words.size(); i++)
                 {
                     //if mp get a word, it's ok, put it into result
-                    if (1 != words[i].size())
+                    if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0])))
                     {
                         res.push_back(words[i]);
                         continue;
                     }
 
-                    // if mp get a single one, collect it in sequence
+                    // if MP yields a single character that is not in the user dict, collect the run of such characters
                     j = i;
-                    while (j < words.size() && words[j].size() == 1)
+                    while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0]))
                     {
                         piece.push_back(words[j][0]);
                         j++;
@@ -77,7 +77,7 @@ namespace CppJieba
                         return false;
                     }
 
-                    //put hmm result to return
+                    // append the HMM result to the output
                     for (size_t k = 0; k < hmmRes.size(); k++)
                     {
                         res.push_back(hmmRes[k]);
diff --git a/test/testdata/testlines.utf8 b/test/testdata/testlines.utf8
index 18510c8..769e0d2 100644
--- a/test/testdata/testlines.utf8
+++ b/test/testdata/testlines.utf8
@@ -5,3 +5,4 @@
 人事处女干事
 去医院做B超，编号123
 令狐冲是云计算行业的专家
+AB
diff --git a/test/testdata/userdict.utf8 b/test/testdata/userdict.utf8
index 0e77ef4..b34db3b 100644
--- a/test/testdata/userdict.utf8
+++ b/test/testdata/userdict.utf8
@@ -1,2 +1,4 @@
 云计算
 韩玉鉴赏
+A
+B