diff --git a/README.md b/README.md index a0ef574..822dd6f 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,13 @@ cmake .. make ``` +有兴趣的可以跑跑测试(可选): + +``` +./test/test.run +./load_test +``` + ## 演示 ``` @@ -301,6 +308,10 @@ make && ./keyword.demo 如果有需要在`erlang`中使用分词的话,不妨试一下[exjieba]。 +### jiebaR + +如果有需要在`R`中使用分词的话,不妨试一下[jiebaR]。 + ### libcppjieba [libcppjieba] 是最简单易懂的CppJieba头文件库使用示例。 @@ -337,6 +348,7 @@ https://github.com/fxsjy/jieba [cppjiebapy]:https://github.com/jannson/cppjiebapy [cppjiebapy_discussion]:https://github.com/aszxqw/cppjieba/issues/1 [NodeJieba]:https://github.com/aszxqw/nodejieba +[jiebaR]:https://github.com/qinwf/jiebaR [simhash]:https://github.com/aszxqw/simhash [代码详解]:https://github.com/aszxqw/cppjieba/wiki/CppJieba%E4%BB%A3%E7%A0%81%E8%AF%A6%E8%A7%A3 [libcppjieba]:https://github.com/aszxqw/libcppjieba diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index 6935958..175e405 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -74,10 +74,19 @@ namespace CppJieba return false; } left = right; - while(*right < 0x80 && right != end) - { - right++; - } + do { + right = _sequentialLetterRule(left, end); + if(right != left) + { + break; + } + right = _numbersRule(left, end); + if(right != left) + { + break; + } + right ++; + } while(false); res.push_back(Unicode(left, right)); left = right; } @@ -93,6 +102,50 @@ namespace CppJieba return true; } private: + // sequential letters rule + Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const + { + Unicode::value_type x; + while(begin != end) + { + x = *begin; + if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) + { + begin ++; + } + else + { + break; + } + } + return begin; + } + // + Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const + { + Unicode::value_type x = *begin; + if('0' <= x && x <= '9') + { + begin ++; + } + else + { + return begin; + } + while(begin != end) + { + x = *begin; + if( ('0' <= x && x <= '9') || x == '.') + { + begin++; + } + else + { + break; + } + } + return begin; + } bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { vector status; @@ -231,7 +284,6 @@ namespace CppJieba for(size_t j = 0; j< tmp.size(); j++) { _startProb[j] = atof(tmp[j].c_str()); - //cout<<_startProb[j]< words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words); + } + { + const char* str = "小明先就职于IBM,后在日本京都大学深造"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + res << words; + ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res); + } + { + const char* str = "IBM,3.14"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + res << words; + ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res); + } +} +TEST(MixSegmentTest, UserDict2) { MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8"); - const char* str = "令狐冲是云计算方面的专家"; - vector words; - ASSERT_TRUE(segment.cut(str, words)); - string res; - ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words); - + { + const char* str = "令狐冲是云计算方面的专家"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words); + } + { + const char* str = "小明先就职于IBM,后在日本京都大学深造"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + res << words; + ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"I\", \"B\", \"M\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res); + } + { + const char* str = "IBM,3.14"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + res << words; + ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res); + } } TEST(MPSegmentTest, Test1) @@ -97,11 +141,21 @@ TEST(MPSegmentTest, Test2) TEST(HMMSegmentTest, Test1) { HMMSegment segment("../dict/hmm_model.utf8");; - const char* str = "我来自北京邮电大学。。。学号123456"; - const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"}; - vector words; - ASSERT_TRUE(segment.cut(str, words)); - ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); + { + const char* str = "我来自北京邮电大学。。。学号123456"; + const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"}; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); + } + + { + const char* str = "IBM,1.2,123"; + const char* res[] = {"IBM", ",", "1.2", ",", "123"}; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); + } } TEST(FullSegment, Test1)