From fbae0f60758763742cb54f54c265fc081427cc7e Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Mon, 3 Nov 2014 10:54:53 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=B8=A4=E6=9D=A1=E5=88=86?=
 =?UTF-8?q?=E8=AF=8D=E8=A7=84=E5=88=99?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/HMMSegment.hpp          | 63 +++++++++++++++++++++++++++++++++----
 test/unittest/TSegments.cpp | 42 +++++++++++++++++++------
 2 files changed, 89 insertions(+), 16 deletions(-)
diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp
index 6935958..175e405 100644
--- a/src/HMMSegment.hpp
+++ b/src/HMMSegment.hpp
@@ -74,10 +74,19 @@ namespace CppJieba
                             return false;
                         }
                         left = right;
-                        while(*right < 0x80 && right != end)
-                        {
-                            right++;
-                        }
+                        do {
+                            right = _sequentialLetterRule(left, end);
+                            if(right != left)
+                            {
+                                break;
+                            }
+                            right = _numbersRule(left, end);
+                            if(right != left)
+                            {
+                                break;
+                            }
+                            right ++;
+                        } while(false);
                         res.push_back(Unicode(left, right));
                         left = right;
                     }
@@ -93,6 +102,50 @@ namespace CppJieba
                 return true;
             }
         private:
+            // Sequential-letter rule: skips the leading run of ASCII letters [A-Za-z] in [begin, end).
+            // Returns the first position that is not a letter; this equals begin when no letter leads the range.
+            Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
+            {
+                // Advance one element at a time for as long as the current element is an ASCII letter.
+                for(; begin != end; ++begin)
+                {
+                    const Unicode::value_type unit = *begin;
+                    const bool lowerCase = ('a' <= unit && unit <= 'z');
+                    const bool upperCase = ('A' <= unit && unit <= 'Z');
+                    if(!lowerCase && !upperCase)
+                    {
+                        // Stop at the first non-letter without consuming it.
+                        break;
+                    }
+                }
+                return begin;
+            }
+            // Numbers rule: matches one digit followed by any mix of digits and '.' (e.g. "1.2", "123").
+            // Returns the position just past the match, or begin when the range is empty or does not start with a digit.
+            Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
+            {
+                // Guard the empty range: dereferencing begin when begin == end is undefined behavior.
+                if(begin == end)
+                {
+                    return begin;
+                }
+                Unicode::value_type x = *begin;
+                if(x < '0' || '9' < x)
+                {
+                    return begin;
+                }
+                begin ++;
+                while(begin != end)
+                {
+                    x = *begin;
+                    if(!(('0' <= x && x <= '9') || x == '.'))
+                    {
+                        break;
+                    }
+                    begin ++;
+                }
+                return begin;
+            }
             bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const 
             {
                 vector<size_t> status; 
@@ -231,7 +284,6 @@ namespace CppJieba
                 for(size_t j = 0; j< tmp.size(); j++)
                 {
                     _startProb[j] = atof(tmp[j].c_str());
-                    //cout<<_startProb[j]<<endl;
                 }
 
                 //load _transProb
@@ -250,7 +302,6 @@ namespace CppJieba
                     for(size_t j =0; j < STATUS_SUM; j++)
                     {
                         _transProb[i][j] = atof(tmp[j].c_str());
-                        //cout<<_transProb[i][j]<<endl;
                     }
                 }
 
diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp
index 803ea02..c59f233 100644
--- a/test/unittest/TSegments.cpp
+++ b/test/unittest/TSegments.cpp
@@ -37,11 +37,23 @@ TEST(MixSegmentTest, NoUserDict)
 TEST(MixSegmentTest, UserDict)
 {
     MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
-    const char* str = "令狐冲是云计算方面的专家";
-    vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
-    string res;
-    ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
+    {
+        const char* str = "令狐冲是云计算方面的专家";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
+    }
+    {
+        // Mixed Chinese/Latin input: the letter run "IBM" and the ASCII comma should come out as separate tokens.
+        // NOTE(review): expected list not yet confirmed by a test run against userdict.utf8 -- verify before merging.
+        const char* str = "小明先就职于IBM,后在日本京都大学深造";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        res << words;
+        ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
+    }
     
 }
 
@@ -97,11 +109,21 @@ TEST(MPSegmentTest, Test2)
 TEST(HMMSegmentTest, Test1)
 {
     HMMSegment segment("../dict/hmm_model.utf8");;
-    const char* str = "我来自北京邮电大学。。。学号123456";
-    const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
-    vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
-    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+    {   // Chinese sentence ending in full-width stops and a digit run.
+        const char* sentence = "我来自北京邮电大学。。。学号123456";
+        const char* expected[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
+        vector<string> tokens;
+        ASSERT_TRUE(segment.cut(sentence, tokens));
+        ASSERT_EQ(tokens, vector<string>(expected, expected + sizeof(expected)/sizeof(*expected)));
+    }
+    // ASCII-only input: a letter run, a decimal number and an integer, separated by commas.
+    {
+        const char* sentence = "IBM,1.2,123";
+        const char* expected[] = {"IBM", ",", "1.2", ",", "123"};
+        vector<string> tokens;
+        ASSERT_TRUE(segment.cut(sentence, tokens));
+        ASSERT_EQ(tokens, vector<string>(expected, expected + sizeof(expected)/sizeof(*expected)));
+    }
 }
 
 TEST(FullSegment, Test1)