添加英文+数字分词规则 qinwf/jiebaR#7

2025-07-18 00:00:12 +08:00 · 2015-02-06 10:19:43 +08:00 · 2015-02-06 10:19:43 +08:00 · c0bdef74fb
commit c0bdef74fb
parent 10e9b32258
3 changed files with 13 additions and 5 deletions
--- a/src/HMMSegment.hpp
+++ b/src/HMMSegment.hpp
@@ -120,11 +120,19 @@ namespace CppJieba
            // sequential letters rule
            Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
            {
-                Unicode::value_type x;
+                Unicode::value_type x = *begin;
+                if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z'))
+                {
+                    begin ++;
+                }
+                else
+                {
+                    return begin;
+                }
                while(begin != end)
                {
                    x = *begin;
-                    if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z'))
+                    if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9'))
                    {
                        begin ++;
                    }
--- a/test/unittest/TKeywordExtractor.cpp
+++ b/test/unittest/TKeywordExtractor.cpp
@@ -26,7 +26,7 @@ TEST(KeywordExtractorTest, Test1)
        size_t topN = 5;
        extractor.extract(s, wordweights, topN);
        res << wordweights;
-        ASSERT_EQ(res, "[\"iPhone:11.7392\", \"一部:6.47592\"]");
+        ASSERT_EQ(res, "[\"iPhone6:11.7392\", \"一部:6.47592\"]");
    }
 }

--- a/test/unittest/TSegments.cpp
+++ b/test/unittest/TSegments.cpp
@@ -12,8 +12,8 @@ using namespace CppJieba;
 TEST(MixSegmentTest, Test1)
 {
    MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");;
-    const char* str = "我来自北京邮电大学。。。学号123456";
-    const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。", "学号", "123456"};
+    const char* str = "我来自北京邮电大学。。。学号123456，用AK47";
+    const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。", "学号", "123456","，","用","AK47"};
    const char* str2 = "B超 T恤";
    const char* res2[] = {"B超"," ", "T恤"};
    vector<string> words;