From fbae0f60758763742cb54f54c265fc081427cc7e Mon Sep 17 00:00:00 2001 From: wyy Date: Mon, 3 Nov 2014 10:54:53 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=B8=A4=E6=9D=A1=E5=88=86?= =?UTF-8?q?=E8=AF=8D=E8=A7=84=E5=88=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/HMMSegment.hpp | 63 +++++++++++++++++++++++++++++++++---- test/unittest/TSegments.cpp | 42 +++++++++++++++++++------ 2 files changed, 89 insertions(+), 16 deletions(-) diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index 6935958..175e405 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -74,10 +74,19 @@ namespace CppJieba return false; } left = right; - while(*right < 0x80 && right != end) - { - right++; - } + do { + right = _sequentialLetterRule(left, end); + if(right != left) + { + break; + } + right = _numbersRule(left, end); + if(right != left) + { + break; + } + right ++; + } while(false); res.push_back(Unicode(left, right)); left = right; } @@ -93,6 +102,50 @@ namespace CppJieba return true; } private: + // sequential letters rule + Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const + { + Unicode::value_type x; + while(begin != end) + { + x = *begin; + if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) + { + begin ++; + } + else + { + break; + } + } + return begin; + } + // + Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const + { + Unicode::value_type x = *begin; + if('0' <= x && x <= '9') + { + begin ++; + } + else + { + return begin; + } + while(begin != end) + { + x = *begin; + if( ('0' <= x && x <= '9') || x == '.') + { + begin++; + } + else + { + break; + } + } + return begin; + } bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { vector status; @@ -231,7 +284,6 @@ namespace CppJieba for(size_t j = 0; j< tmp.size(); j++) { _startProb[j] = atof(tmp[j].c_str()); - //cout<<_startProb[j]< words; - ASSERT_TRUE(segment.cut(str, words)); - string res; - ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words); + { + const char* str = "令狐冲是云计算方面的专家"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words); + } + { + const char* str = "小明先就职于IBM,后在日本京都大学深造"; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + string res; + res << words; + print(res); + exit(1); + ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res); + } } @@ -97,11 +109,21 @@ TEST(MPSegmentTest, Test2) TEST(HMMSegmentTest, Test1) { HMMSegment segment("../dict/hmm_model.utf8");; - const char* str = "我来自北京邮电大学。。。学号123456"; - const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"}; - vector words; - ASSERT_TRUE(segment.cut(str, words)); - ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); + { + const char* str = "我来自北京邮电大学。。。学号123456"; + const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"}; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); + } + + { + const char* str = "IBM,1.2,123"; + const char* res[] = {"IBM", ",", "1.2", ",", "123"}; + vector words; + ASSERT_TRUE(segment.cut(str, words)); + ASSERT_EQ(words, vector(res, res + sizeof(res)/sizeof(res[0]))); + } } TEST(FullSegment, Test1)