From b68a76e63a41a3903e2b2a17a72acc139956a49f Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Sun, 26 Oct 2014 12:21:10 +0800
Subject: [PATCH 1/5] =?UTF-8?q?=E5=AE=8C=E5=96=84=E4=B8=80=E4=BA=9B?=
 =?UTF-8?q?=E6=B5=8B=E8=AF=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                       | 7 +++++++
 test/servertest/go_load_test.sh | 2 ++
 test/testdata/load_test.urls    | 1 +
 3 files changed, 10 insertions(+)
 create mode 100755 test/servertest/go_load_test.sh

diff --git a/README.md b/README.md
index a0ef574..2c2be27 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,13 @@ cmake ..
 make
 ```
 
+有兴趣的可以跑跑测试(可选):
+
+```
+./test/test.run
+./load_test
+```
+
 ## 演示
 
 ```
diff --git a/test/servertest/go_load_test.sh b/test/servertest/go_load_test.sh
new file mode 100755
index 0000000..568588b
--- /dev/null
+++ b/test/servertest/go_load_test.sh
@@ -0,0 +1,2 @@
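+# Prerequisite: install the load tool with `go get github.com/aszxqw/go_http_load`; the URL list path below is relative to the current working directory.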
+go_http_load -method=GET -get_urls="../test/testdata/load_test.urls" -loop_count=500 -goroutines=2
diff --git a/test/testdata/load_test.urls b/test/testdata/load_test.urls
index b710af2..96e2781 100644
--- a/test/testdata/load_test.urls
+++ b/test/testdata/load_test.urls
@@ -1 +1,2 @@
 http://127.0.0.1:11200/?key=南京市长江大桥
+http://127.0.0.1:11200/?key=长春市长春药店

From fbae0f60758763742cb54f54c265fc081427cc7e Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Mon, 3 Nov 2014 10:54:53 +0800
Subject: [PATCH 2/5] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=B8=A4=E6=9D=A1?=
 =?UTF-8?q?=E5=88=86=E8=AF=8D=E8=A7=84=E5=88=99?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/HMMSegment.hpp          | 63 +++++++++++++++++++++++++++++++++----
 test/unittest/TSegments.cpp | 42 +++++++++++++++++++------
 2 files changed, 89 insertions(+), 16 deletions(-)

diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp
index 6935958..175e405 100644
--- a/src/HMMSegment.hpp
+++ b/src/HMMSegment.hpp
@@ -74,10 +74,19 @@ namespace CppJieba
                             return false;
                         }
                         left = right;
-                        while(*right < 0x80 && right != end)
-                        {
-                            right++;
-                        }
+                        do {
+                            right = _sequentialLetterRule(left, end);
+                            if(right != left)
+                            {
+                                break;
+                            }
+                            right = _numbersRule(left, end);
+                            if(right != left)
+                            {
+                                break;
+                            }
+                            right ++;
+                        } while(false);
                         res.push_back(Unicode(left, right));
                         left = right;
                     }
@@ -93,6 +102,50 @@ namespace CppJieba
                 return true;
             }
         private:
+            // Sequential-letters rule: returns an iterator one past the longest run of ASCII letters (a-z, A-Z) starting at begin; returns begin unchanged if the first element is not a letter or the range is empty.
+            Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
+            {
+                Unicode::value_type x;
+                while(begin != end) // the range bound is checked before every dereference
+                {
+                    x = *begin;
+                    if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z'))
+                    {
+                        begin ++;
+                    }
+                    else
+                    {
+                        break; // first non-letter ends the run
+                    }
+                }
+                return begin;
+            }
+            // Numbers rule: if *begin is an ASCII digit, returns an iterator one past the following run of digits and '.' characters; otherwise returns begin unchanged.
+            Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
+            {
+                Unicode::value_type x = *begin; // NOTE(review): dereferenced before any begin != end check; presumably callers never pass an empty range - confirm
+                if('0' <= x && x <= '9')
+                {
+                    begin ++;
+                }
+                else
+                {
+                    return begin; // no leading digit: the rule does not apply
+                }
+                while(begin != end)
+                {
+                    x = *begin;
+                    if( ('0' <= x && x <= '9') || x == '.') // '.' is accepted anywhere after the first digit, so inputs like "1." or "1..2" are consumed whole
+                    {
+                        begin++;
+                    }
+                    else
+                    {
+                        break; // first element that is neither a digit nor '.' ends the run
+                    }
+                }
+                return begin;
+            }
             bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const 
             {
                 vector<size_t> status; 
@@ -231,7 +284,6 @@ namespace CppJieba
                 for(size_t j = 0; j< tmp.size(); j++)
                 {
                     _startProb[j] = atof(tmp[j].c_str());
-                    //cout<<_startProb[j]<<endl;
                 }
 
                 //load _transProb
@@ -250,7 +302,6 @@ namespace CppJieba
                     for(size_t j =0; j < STATUS_SUM; j++)
                     {
                         _transProb[i][j] = atof(tmp[j].c_str());
-                        //cout<<_transProb[i][j]<<endl;
                     }
                 }
 
diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp
index 803ea02..c59f233 100644
--- a/test/unittest/TSegments.cpp
+++ b/test/unittest/TSegments.cpp
@@ -37,11 +37,23 @@ TEST(MixSegmentTest, NoUserDict)
 TEST(MixSegmentTest, UserDict)
 {
     MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
-    const char* str = "令狐冲是云计算方面的专家";
-    vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
-    string res;
-    ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
+    {
+        const char* str = "令狐冲是云计算方面的专家";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
+    }
+    {
+        const char* str = "小明先就职于IBM,后在日本京都大学深造";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        res << words;
+        print(res);
+        exit(1);
+        ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res);
+    }
     
 }
 
@@ -97,11 +109,21 @@ TEST(MPSegmentTest, Test2)
 TEST(HMMSegmentTest, Test1)
 {
     HMMSegment segment("../dict/hmm_model.utf8");;
-    const char* str = "我来自北京邮电大学。。。学号123456";
-    const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
-    vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
-    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+    {
+        const char* str = "我来自北京邮电大学。。。学号123456";
+        const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+    }
+    
+    {
+        const char* str = "IBM,1.2,123";
+        const char* res[] = {"IBM", ",", "1.2", ",", "123"};
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+    }
 }
 
 TEST(FullSegment, Test1)

From 107638f7d87a0c2a7751f78b200b1897d028d22e Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Mon, 3 Nov 2014 11:19:00 +0800
Subject: [PATCH 3/5] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=B5=8B=E8=AF=95?=
 =?UTF-8?q?=E6=95=B0=E6=8D=AE=E7=AD=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 test/segment_demo.cpp        | 2 +-
 test/testdata/testlines.utf8 | 2 +-
 test/unittest/TSegments.cpp  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/segment_demo.cpp b/test/segment_demo.cpp
index 9103bbb..36d8e5d 100644
--- a/test/segment_demo.cpp
+++ b/test/segment_demo.cpp
@@ -12,7 +12,7 @@ using namespace CppJieba;
 const char * const TEST_FILE = "../test/testdata/testlines.utf8";
 const char * const JIEBA_DICT_FILE = "../dict/jieba.dict.utf8";
 const char * const HMM_DICT_FILE = "../dict/hmm_model.utf8";
-const char * const USER_DICT_FILE = "../test/testdata/userdict.utf8";
+const char * const USER_DICT_FILE = "../dict/user.dict.utf8";
 
 void cut(const ISegment& seg, const char * const filePath)
 {
diff --git a/test/testdata/testlines.utf8 b/test/testdata/testlines.utf8
index 769e0d2..eb382da 100644
--- a/test/testdata/testlines.utf8
+++ b/test/testdata/testlines.utf8
@@ -5,4 +5,4 @@
 人事处女干事
 去医院做B超，编号123
 令狐冲是云计算行业的专家
-AB
+IBM,3.14
diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp
index c59f233..811f107 100644
--- a/test/unittest/TSegments.cpp
+++ b/test/unittest/TSegments.cpp
@@ -36,7 +36,7 @@ TEST(MixSegmentTest, NoUserDict)
 }
 TEST(MixSegmentTest, UserDict)
 {
-    MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
+    MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
     {
         const char* str = "令狐冲是云计算方面的专家";
         vector<string> words;

From 471a68e08e1ab9b39336404c5b08a14ed7c29f78 Mon Sep 17 00:00:00 2001
From: wyy <wuyanyi09@gmail.com>
Date: Mon, 3 Nov 2014 11:30:45 +0800
Subject: [PATCH 4/5] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=B5=8B=E8=AF=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 test/unittest/TSegments.cpp | 40 +++++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp
index 811f107..36eb36f 100644
--- a/test/unittest/TSegments.cpp
+++ b/test/unittest/TSegments.cpp
@@ -50,11 +50,43 @@ TEST(MixSegmentTest, UserDict)
         ASSERT_TRUE(segment.cut(str, words));
         string res;
         res << words;
-        print(res);
-        exit(1);
-        ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res);
+        ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
+    }
+    {
+        const char* str = "IBM,3.14";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        res << words;
+        ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
+    }
+}
+TEST(MixSegmentTest, UserDict2) // same inputs as the UserDict test, but with a different user dictionary file
+{
+    MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8"); // test-data user dict instead of ../dict/user.dict.utf8
+    {
+        const char* str = "令狐冲是云计算方面的专家";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
+    }
+    {
+        const char* str = "小明先就职于IBM,后在日本京都大学深造";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        res << words;
+        ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"I\", \"B\", \"M\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res); // with this user dict, "IBM" is expected as three separate letters
+    }
+    {
+        const char* str = "IBM,3.14";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        res << words;
+        ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res); // letters are split, while "3.14" is still expected as a single token
     }
-    
 }
 
 TEST(MPSegmentTest, Test1)

From 7bf2bceee4d9a5abe66e683f0770a3b609257a84 Mon Sep 17 00:00:00 2001
From: Qin Wenfeng <qinwf@users.noreply.github.com>
Date: Tue, 4 Nov 2014 12:07:33 +0800
Subject: [PATCH 5/5] =?UTF-8?q?README=20=E4=B8=AD=E6=B7=BB=E5=8A=A0=20jieb?=
 =?UTF-8?q?aR?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

添加了CppJieba的R语言封装 jiebaR。
---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index 2c2be27..822dd6f 100644
--- a/README.md
+++ b/README.md
@@ -308,6 +308,10 @@ make && ./keyword.demo
 
 如果有需要在`erlang`中使用分词的话，不妨试一下[exjieba]。
 
+### jiebaR
+
+如果有需要在`R`中使用分词的话，不妨试一下[jiebaR]。
+
 ### libcppjieba
 
 [libcppjieba] 是最简单易懂的CppJieba头文件库使用示例。
@@ -344,6 +348,7 @@ https://github.com/fxsjy/jieba
 [cppjiebapy]:https://github.com/jannson/cppjiebapy
 [cppjiebapy_discussion]:https://github.com/aszxqw/cppjieba/issues/1
 [NodeJieba]:https://github.com/aszxqw/nodejieba
+[jiebaR]:https://github.com/qinwf/jiebaR
 [simhash]:https://github.com/aszxqw/simhash
 [代码详解]:https://github.com/aszxqw/cppjieba/wiki/CppJieba%E4%BB%A3%E7%A0%81%E8%AF%A6%E8%A7%A3
 [libcppjieba]:https://github.com/aszxqw/libcppjieba