Merge https://github.com/aszxqw/cppjieba

2025-07-18 00:00:12 +08:00 · 2014-11-05 11:12:33 +08:00 · 2014-11-05 11:12:33 +08:00 · c2125b5371
commit c2125b5371
parent e85a3ef8d3 a3671ab252
7 changed files with 139 additions and 19 deletions
--- a/README.md
+++ b/README.md
@ -34,6 +34,13 @@ cmake ..
 make
 ```

+有兴趣的可以跑跑测试(可选):
+
+```
+./test/test.run
+./load_test
+```
+
 ## 演示

 ```
@ -301,6 +308,10 @@ make && ./keyword.demo

 如果有需要在`erlang`中使用分词的话，不妨试一下[exjieba]。

+### jiebaR
+
+如果有需要在`R`中使用分词的话，不妨试一下[jiebaR]。
+
 ### libcppjieba

 [libcppjieba] 是最简单易懂的CppJieba头文件库使用示例。
@ -337,6 +348,7 @@ https://github.com/fxsjy/jieba
 [cppjiebapy]:https://github.com/jannson/cppjiebapy
 [cppjiebapy_discussion]:https://github.com/aszxqw/cppjieba/issues/1
 [NodeJieba]:https://github.com/aszxqw/nodejieba
+[jiebaR]:https://github.com/qinwf/jiebaR
 [simhash]:https://github.com/aszxqw/simhash
 [代码详解]:https://github.com/aszxqw/cppjieba/wiki/CppJieba%E4%BB%A3%E7%A0%81%E8%AF%A6%E8%A7%A3
 [libcppjieba]:https://github.com/aszxqw/libcppjieba
--- a/src/HMMSegment.hpp
+++ b/src/HMMSegment.hpp
@ -74,10 +74,19 @@ namespace CppJieba
                            return false;
                        }
                        left = right;
-                        while(*right < 0x80 && right != end)
-                        {
-                            right++;
-                        }
+                        do {
+                            right = _sequentialLetterRule(left, end);
+                            if(right != left)
+                            {
+                                break;
+                            }
+                            right = _numbersRule(left, end);
+                            if(right != left)
+                            {
+                                break;
+                            }
+                            right ++;
+                        } while(false);
                        res.push_back(Unicode(left, right));
                        left = right;
                    }
@ -93,6 +102,50 @@ namespace CppJieba
                return true;
            }
        private:
+            // Sequential-letter rule: starting at `begin`, skips over a run of
+            // ASCII letters (a-z, A-Z) and returns the iterator to the first
+            // element that is not a letter, or `end` if the run reaches it.
+            // Returns `begin` unchanged when the first element is not a letter.
+            Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
+            {
+                for(; begin != end; ++begin)
+                {
+                    const Unicode::value_type ch = *begin;
+                    const bool isLower = ('a' <= ch && ch <= 'z');
+                    const bool isUpper = ('A' <= ch && ch <= 'Z');
+                    if(!isLower && !isUpper)
+                    {
+                        break;
+                    }
+                }
+                return begin;
+            }
+            // Numbers rule: if the range starts with an ASCII digit, consumes that
+            // digit and every following digit or '.', and returns the iterator just
+            // past the run; otherwise returns `begin` unchanged. Note that '.' is
+            // accepted anywhere after the first digit, so a trailing '.' (as in "3.")
+            // is part of the returned run.
+            Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
+            {
+                // Guard the empty range: dereferencing `begin` when it equals `end`
+                // is undefined behaviour.
+                if(begin == end)
+                {
+                    return begin;
+                }
+                Unicode::value_type x = *begin;
+                if(x < '0' || '9' < x)
+                {
+                    // Not a digit: the rule does not apply.
+                    return begin;
+                }
+                begin ++;
+                while(begin != end)
+                {
+                    x = *begin;
+                    if(('0' <= x && x <= '9') || x == '.')
+                    {
+                        begin ++;
+                    }
+                    else
+                    {
+                        break;
+                    }
+                }
+                return begin;
+            }
            bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const 
            {
                vector<size_t> status; 
@ -231,7 +284,6 @@ namespace CppJieba
                for(size_t j = 0; j< tmp.size(); j++)
                {
                    _startProb[j] = atof(tmp[j].c_str());
-                    //cout<<_startProb[j]<<endl;
                }

                //load _transProb
@ -250,7 +302,6 @@ namespace CppJieba
                    for(size_t j =0; j < STATUS_SUM; j++)
                    {
                        _transProb[i][j] = atof(tmp[j].c_str());
-                        //cout<<_transProb[i][j]<<endl;
                    }
                }

--- a/test/segment_demo.cpp
+++ b/test/segment_demo.cpp
@ -12,7 +12,7 @@ using namespace CppJieba;
 const char * const TEST_FILE = "../test/testdata/testlines.utf8";
 const char * const JIEBA_DICT_FILE = "../dict/jieba.dict.utf8";
 const char * const HMM_DICT_FILE = "../dict/hmm_model.utf8";
-const char * const USER_DICT_FILE = "../test/testdata/userdict.utf8";
+const char * const USER_DICT_FILE = "../dict/user.dict.utf8";

 void cut(const ISegment& seg, const char * const filePath)
 {
--- a/test/servertest/go_load_test.sh
+++ b/test/servertest/go_load_test.sh
@ -0,0 +1,2 @@
+# Requires the go_http_load tool; install it with: go get github.com/aszxqw/go_http_load
+go_http_load -method=GET -get_urls="../test/testdata/load_test.urls" -loop_count=500 -goroutines=2
--- a/test/testdata/load_test.urls
+++ b/test/testdata/load_test.urls
@ -1 +1,2 @@
 http://127.0.0.1:11200/?key=南京市长江大桥
+http://127.0.0.1:11200/?key=长春市长春药店
--- a/test/testdata/testlines.utf8
+++ b/test/testdata/testlines.utf8
@ -5,4 +5,4 @@
 人事处女干事
 去医院做B超，编号123
 令狐冲是云计算行业的专家
-AB
+IBM,3.14
--- a/test/unittest/TSegments.cpp
+++ b/test/unittest/TSegments.cpp
@ -35,14 +35,58 @@ TEST(MixSegmentTest, NoUserDict)
    
 }
 TEST(MixSegmentTest, UserDict)
+{ // Exercises MixSegment constructed with ../dict/user.dict.utf8 as its third argument.
+    MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
+    { // Chinese-only sentence: six tokens are expected.
+        const char* str = "令狐冲是云计算方面的专家";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
+    }
+    { // Latin letters inside Chinese text: "IBM" is expected as one token and "," as its own token.
+        const char* str = "小明先就职于IBM,后在日本京都大学深造";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        res << words;
+        ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
+    }
+    { // Letters, punctuation and a decimal number: expects "IBM", "," and "3.14".
+        const char* str = "IBM,3.14";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        res << words;
+        ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
+    }
+}
+TEST(MixSegmentTest, UserDict2) // Same three inputs as UserDict, but with ../test/testdata/userdict.utf8 as the user dictionary.
 {
    MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
-    const char* str = "令狐冲是云计算方面的专家";
-    vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
-    string res;
-    ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
-    
+    { // Chinese-only sentence: six tokens are expected.
+        const char* str = "令狐冲是云计算方面的专家";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
+    }
+    { // Here the expected output splits "IBM" into "I", "B", "M". NOTE(review): presumably due to this dictionary's contents - confirm.
+        const char* str = "小明先就职于IBM,后在日本京都大学深造";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        res << words;
+        ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"I\", \"B\", \"M\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
+    }
+    { // "IBM" is again expected letter by letter, while "3.14" is expected as a single token.
+        const char* str = "IBM,3.14";
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        string res;
+        res << words;
+        ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
+    }
 }

 TEST(MPSegmentTest, Test1)
@ -97,11 +141,21 @@ TEST(MPSegmentTest, Test2)
 TEST(HMMSegmentTest, Test1)
 {
    HMMSegment segment("../dict/hmm_model.utf8");;
-    const char* str = "我来自北京邮电大学。。。学号123456";
-    const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
-    vector<string> words;
-    ASSERT_TRUE(segment.cut(str, words));
-    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+    { // Chinese sentence ending in a digit run: "123456" is expected as a single token.
+        const char* str = "我来自北京邮电大学。。。学号123456";
+        const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+    }
+    
+    { // ASCII-only input: a letter run, a decimal and an integer, each separated by a "," token.
+        const char* str = "IBM,1.2,123";
+        const char* res[] = {"IBM", ",", "1.2", ",", "123"};
+        vector<string> words;
+        ASSERT_TRUE(segment.cut(str, words));
+        ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+    }
 }

 TEST(FullSegment, Test1)