From b68a76e63a41a3903e2b2a17a72acc139956a49f Mon Sep 17 00:00:00 2001 From: wyy Date: Sun, 26 Oct 2014 12:21:10 +0800 Subject: [PATCH 1/5] =?UTF-8?q?=E5=AE=8C=E5=96=84=E4=B8=80=E4=BA=9B?= =?UTF-8?q?=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 7 +++++++ test/servertest/go_load_test.sh | 2 ++ test/testdata/load_test.urls | 1 + 3 files changed, 10 insertions(+) create mode 100755 test/servertest/go_load_test.sh diff --git a/README.md b/README.md index a0ef574..2c2be27 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,13 @@ cmake .. make ``` +有兴趣的可以跑跑测试(可选): + +``` +./test/test.run +./load_test +``` + ## 演示 ``` diff --git a/test/servertest/go_load_test.sh b/test/servertest/go_load_test.sh new file mode 100755 index 0000000..568588b --- /dev/null +++ b/test/servertest/go_load_test.sh @@ -0,0 +1,2 @@ +# go get github.com/aszxqw/go_http_load +go_http_load -method=GET -get_urls="../test/testdata/load_test.urls" -loop_count=500 -goroutines=2 diff --git a/test/testdata/load_test.urls b/test/testdata/load_test.urls index b710af2..96e2781 100644 --- a/test/testdata/load_test.urls +++ b/test/testdata/load_test.urls @@ -1 +1,2 @@ http://127.0.0.1:11200/?key=南京市长江大桥 +http://127.0.0.1:11200/?key=长春市长春药店 From fbae0f60758763742cb54f54c265fc081427cc7e Mon Sep 17 00:00:00 2001 From: wyy Date: Mon, 3 Nov 2014 10:54:53 +0800 Subject: [PATCH 2/5] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=B8=A4=E6=9D=A1?= =?UTF-8?q?=E5=88=86=E8=AF=8D=E8=A7=84=E5=88=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/HMMSegment.hpp | 63 +++++++++++++++++++++++++++++++++---- test/unittest/TSegments.cpp | 42 +++++++++++++++++++------ 2 files changed, 89 insertions(+), 16 deletions(-) diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index 6935958..175e405 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -74,10 +74,19 @@ namespace CppJieba return false; } left = right; - while(*right < 0x80 && right != end) - { - right++; - } + do { + right = _sequentialLetterRule(left, end); + if(right != left) + { + break; + } + right = _numbersRule(left, end); + if(right != left) + { + break; + } + right ++; + } while(false); res.push_back(Unicode(left, right)); left = right; } @@ -93,6 +102,50 @@ namespace CppJieba return true; } private: + // sequential letters rule + Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const + { + Unicode::value_type x; + while(begin != end) + { + x = *begin; + if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) + { + begin ++; + } + else + { + break; + } + } + return begin; + } + // + Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const + { + Unicode::value_type x = *begin; + if('0' <= x && x <= '9') + { + begin ++; + } + else + { + return begin; + } + while(begin != end) + { + x = *begin; + if( ('0' <= x && x <= '9') || x == '.') + { + begin++; + } + else + { + break; + } + } + return begin; + } bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { vector status; @@ -231,7 +284,6 @@ namespace CppJieba for(size_t j = 0; j< tmp.size(); j++) { _startProb[j] = atof(tmp[j].c_str()); - //cout<<_startProb[j]< words; - ASSERT_TRUE(segment.cut(str, words)); - string res; - ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words); + { + const char* str = "令狐冲是云计算方面的专家"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words); + } + { + const char* str = "小明先就职于IBM,后在日本京都大学深造"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + res << words; + print(res); + exit(1); + ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res); + } } @@ -97,11 +109,21 @@ TEST(MPSegmentTest, Test2) TEST(HMMSegmentTest, Test1) { HMMSegment segment("../dict/hmm_model.utf8");; - const char* str = "我来自北京邮电大学。。。学号123456"; - const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"}; - vector words; - ASSERT_TRUE(segment.cut(str, words)); - ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); + { + const char* str = "我来自北京邮电大学。。。学号123456"; + const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"}; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); + } + + { + const char* str = "IBM,1.2,123"; + const char* res[] = {"IBM", ",", "1.2", ",", "123"}; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); + } } TEST(FullSegment, Test1) From 107638f7d87a0c2a7751f78b200b1897d028d22e Mon Sep 17 00:00:00 2001 From: wyy Date: Mon, 3 Nov 2014 11:19:00 +0800 Subject: [PATCH 3/5] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E7=AD=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/segment_demo.cpp | 2 +- test/testdata/testlines.utf8 | 2 +- test/unittest/TSegments.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/segment_demo.cpp b/test/segment_demo.cpp index 9103bbb..36d8e5d 100644 --- a/test/segment_demo.cpp +++ b/test/segment_demo.cpp @@ -12,7 +12,7 @@ using namespace CppJieba; const char * const TEST_FILE = "../test/testdata/testlines.utf8"; const char * const JIEBA_DICT_FILE = "../dict/jieba.dict.utf8"; const char * const HMM_DICT_FILE = "../dict/hmm_model.utf8"; -const char * const USER_DICT_FILE = "../test/testdata/userdict.utf8"; +const char * const USER_DICT_FILE = "../dict/user.dict.utf8"; void cut(const ISegment& seg, const char * const filePath) { diff --git a/test/testdata/testlines.utf8 b/test/testdata/testlines.utf8 index 769e0d2..eb382da 100644 --- a/test/testdata/testlines.utf8 +++ b/test/testdata/testlines.utf8 @@ -5,4 +5,4 @@ 人事处女干事 去医院做B超,编号123 令狐冲是云计算行业的专家 -AB +IBM,3.14 diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp index c59f233..811f107 100644 --- a/test/unittest/TSegments.cpp +++ b/test/unittest/TSegments.cpp @@ -36,7 +36,7 @@ TEST(MixSegmentTest, NoUserDict) } TEST(MixSegmentTest, UserDict) { - MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8"); + MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8"); { const char* str = "令狐冲是云计算方面的专家"; vector words; From 471a68e08e1ab9b39336404c5b08a14ed7c29f78 Mon Sep 17 00:00:00 2001 From: wyy Date: Mon, 3 Nov 2014 11:30:45 +0800 Subject: [PATCH 4/5] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/unittest/TSegments.cpp | 40 +++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp index 811f107..36eb36f 100644 --- a/test/unittest/TSegments.cpp +++ b/test/unittest/TSegments.cpp @@ -50,11 +50,43 @@ TEST(MixSegmentTest, UserDict) ASSERT_TRUE(segment.cut(str, words)); string res; res << words; - print(res); - exit(1); - ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res); + ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res); + } + { + const char* str = "IBM,3.14"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + res << words; + ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res); + } +} +TEST(MixSegmentTest, UserDict2) +{ + MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8"); + { + const char* str = "令狐冲是云计算方面的专家"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words); + } + { + const char* str = "小明先就职于IBM,后在日本京都大学深造"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + res << words; + ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"I\", \"B\", \"M\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res); + } + { + const char* str = "IBM,3.14"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + res << words; + ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res); } - } TEST(MPSegmentTest, Test1) From 7bf2bceee4d9a5abe66e683f0770a3b609257a84 Mon Sep 17 00:00:00 2001 From: Qin Wenfeng Date: Tue, 4 Nov 2014 12:07:33 +0800 Subject: [PATCH 5/5] =?UTF-8?q?README=20=E4=B8=AD=E6=B7=BB=E5=8A=A0=20jieb?= =?UTF-8?q?aR?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加了CppJieba的R语言封装 jiebaR。 --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 2c2be27..822dd6f 100644 --- a/README.md +++ b/README.md @@ -308,6 +308,10 @@ make && ./keyword.demo 如果有需要在`erlang`中使用分词的话,不妨试一下[exjieba]。 +### jiebaR + +如果有需要在`R`中使用分词的话,不妨试一下[jiebaR]。 + ### libcppjieba [libcppjieba] 是最简单易懂的CppJieba头文件库使用示例。 @@ -344,6 +348,7 @@ https://github.com/fxsjy/jieba [cppjiebapy]:https://github.com/jannson/cppjiebapy [cppjiebapy_discussion]:https://github.com/aszxqw/cppjieba/issues/1 [NodeJieba]:https://github.com/aszxqw/nodejieba +[jiebaR]:https://github.com/qinwf/jiebaR [simhash]:https://github.com/aszxqw/simhash [代码详解]:https://github.com/aszxqw/cppjieba/wiki/CppJieba%E4%BB%A3%E7%A0%81%E8%AF%A6%E8%A7%A3 [libcppjieba]:https://github.com/aszxqw/libcppjieba