From c0bdef74fb7a4e99f39f7ca543085f370ec8adfa Mon Sep 17 00:00:00 2001 From: qinwf Date: Fri, 6 Feb 2015 10:19:43 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=8B=B1=E6=96=87+=E6=95=B0?= =?UTF-8?q?=E5=AD=97=E5=88=86=E8=AF=8D=E8=A7=84=E5=88=99=20qinwf/jiebaR#7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/HMMSegment.hpp | 12 ++++++++++-- test/unittest/TKeywordExtractor.cpp | 2 +- test/unittest/TSegments.cpp | 4 ++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/HMMSegment.hpp b/src/HMMSegment.hpp index 838eeef..d7c8c89 100644 --- a/src/HMMSegment.hpp +++ b/src/HMMSegment.hpp @@ -120,11 +120,19 @@ namespace CppJieba // sequential letters rule Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const { - Unicode::value_type x; + Unicode::value_type x = *begin; + if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) + { + begin ++; + } + else + { + return begin; + } while(begin != end) { x = *begin; - if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) + if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) { begin ++; } diff --git a/test/unittest/TKeywordExtractor.cpp b/test/unittest/TKeywordExtractor.cpp index 2392260..9ea203c 100644 --- a/test/unittest/TKeywordExtractor.cpp +++ b/test/unittest/TKeywordExtractor.cpp @@ -26,7 +26,7 @@ TEST(KeywordExtractorTest, Test1) size_t topN = 5; extractor.extract(s, wordweights, topN); res << wordweights; - ASSERT_EQ(res, "[\"iPhone:11.7392\", \"一部:6.47592\"]"); + ASSERT_EQ(res, "[\"iPhone6:11.7392\", \"一部:6.47592\"]"); } } diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp index dcdc5c1..54e5f64 100644 --- a/test/unittest/TSegments.cpp +++ b/test/unittest/TSegments.cpp @@ -12,8 +12,8 @@ using namespace CppJieba; TEST(MixSegmentTest, Test1) { MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");; - const char* str = "我来自北京邮电大学。。。学号123456"; - const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。", "学号", "123456"}; + const char* str = "我来自北京邮电大学。。。学号123456,用AK47"; + const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。", "学号", "123456",",","用","AK47"}; const char* str2 = "B超 T恤"; const char* res2[] = {"B超"," ", "T恤"}; vector words;