增加两条分词规则

2025-07-18 00:00:12 +08:00 · 2014-11-03 10:54:53 +08:00 · 2014-11-03 10:54:53 +08:00 · fbae0f6075
commit fbae0f6075
parent b68a76e63a
2 changed files with 89 additions and 16 deletions
--- a/src/HMMSegment.hpp
+++ b/src/HMMSegment.hpp
@ -74,10 +74,19 @@ namespace CppJieba
                            return false;
                        }
                        left = right;
-                        while(*right < 0x80 && right != end)
-                        {
-                            right++;
-                        }
+                        do {
+                            right = _sequentialLetterRule(left, end);
+                            if(right != left)
+                            {
+                                break;
+                            }
+                            right = _numbersRule(left, end);
+                            if(right != left)
+                            {
+                                break;
+                            }
+                            right ++;
+                        } while(false);
                        res.push_back(Unicode(left, right));
                        left = right;
                    }
@ -93,6 +102,50 @@ namespace CppJieba
                return true;
            }
        private:
+            // Sequential-letters rule: scans forward from `begin` over a run of
+            // ASCII letters (a-z, A-Z) and returns the iterator just past that run.
+            // Returns `begin` unchanged when the first element is not a letter or
+            // when the range [begin, end) is empty.
+            Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
+            {
+                for(; begin != end; begin ++)
+                {
+                    const Unicode::value_type ch = *begin;
+                    const bool isLower = ('a' <= ch && ch <= 'z');
+                    const bool isUpper = ('A' <= ch && ch <= 'Z');
+                    if(!isLower && !isUpper)
+                    {
+                        // first non-letter ends the run
+                        break;
+                    }
+                }
+                return begin;
+            }
+            // Numbers rule: if the range [begin, end) starts with an ASCII digit,
+            // scans forward over the following digits and '.' characters and returns
+            // the iterator just past them. Returns `begin` unchanged when the range
+            // is empty or does not start with a digit.
+            // Note: '.' is accepted anywhere after the first digit, so a trailing
+            // or repeated '.' (e.g. "1." or "1..2") is consumed as part of the match.
+            Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
+            {
+                // Check for an empty range before dereferencing: reading *begin
+                // when begin == end would be undefined behaviour.
+                if(begin == end)
+                {
+                    return begin;
+                }
+                Unicode::value_type x = *begin;
+                if('0' <= x && x <= '9')
+                {
+                    begin ++;
+                }
+                else
+                {
+                    // must start with a digit; a leading '.' is not a number
+                    return begin;
+                }
+                while(begin != end)
+                {
+                    x = *begin;
+                    if( ('0' <= x && x <= '9') || x == '.')
+                    {
+                        begin++;
+                    }
+                    else
+                    {
+                        break;
+                    }
+                }
+                return begin;
+            }
            bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const 
            {
                vector<size_t> status; 
@ -231,7 +284,6 @@ namespace CppJieba
                for(size_t j = 0; j< tmp.size(); j++)
                {
                    _startProb[j] = atof(tmp[j].c_str());
-                    //cout<<_startProb[j]<<endl;
                }

                //load _transProb
@ -250,7 +302,6 @@ namespace CppJieba
                    for(size_t j =0; j < STATUS_SUM; j++)
                    {
                        _transProb[i][j] = atof(tmp[j].c_str());
-                        //cout<<_transProb[i][j]<<endl;
                    }
                }

--- a/test/unittest/TSegments.cpp
+++ b/test/unittest/TSegments.cpp
@ -37,11 +37,23 @@ TEST(MixSegmentTest, NoUserDict)
 TEST(MixSegmentTest, UserDict)
 {
    MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
-    const char* str = "令狐冲是云计算方面的专家";
-    vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
-    string res;
-    ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
+    { // first case: pins the exact token list produced for a Chinese-only sentence
+        const char* str = "令狐冲是云计算方面的专家";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
+    }
+    { // second case: Chinese sentence containing the ASCII word "IBM" and a comma
+        const char* str = "小明先就职于IBM,后在日本京都大学深造";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        res << words;
+        print(res); // NOTE(review): looks like leftover debug output - confirm and remove
+        exit(1); // NOTE(review): presumably std::exit; it ends the process here, so the assertion below never runs - confirm this is temporary scaffolding
+        ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res); // NOTE(review): expected string is identical to the first case's although the input differs - needs the real expected tokens
+    }
    
 }

@ -97,11 +109,21 @@ TEST(MPSegmentTest, Test2)
 TEST(HMMSegmentTest, Test1)
 {
    HMMSegment segment("../dict/hmm_model.utf8");;
-    const char* str = "我来自北京邮电大学。。。学号123456";
-    const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
-    vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
-    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+    { // Chinese sentence followed by a digit run; expects "123456" to come out as a single token
+        const char* str = "我来自北京邮电大学。。。学号123456";
+        const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+    }
+    
+    { // ASCII-only input; expects the letter run, the decimal "1.2" and the integer "123" as single tokens, with each comma as its own token
+        const char* str = "IBM,1.2,123";
+        const char* res[] = {"IBM", ",", "1.2", ",", "123"};
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+    }
 }

 TEST(FullSegment, Test1)