添加英文+数字分词规则 qinwf/jiebaR#7

This commit is contained in:
qinwf 2015-02-06 10:19:43 +08:00
parent 10e9b32258
commit c0bdef74fb
3 changed files with 13 additions and 5 deletions

View File

@@ -120,11 +120,19 @@ namespace CppJieba
// sequential letters rule
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
Unicode::value_type x;
Unicode::value_type x = *begin;
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z'))
{
begin ++;
}
else
{
return begin;
}
while(begin != end)
{
x = *begin;
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z'))
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9'))
{
begin ++;
}

View File

@@ -26,7 +26,7 @@ TEST(KeywordExtractorTest, Test1)
size_t topN = 5;
extractor.extract(s, wordweights, topN);
res << wordweights;
ASSERT_EQ(res, "[\"iPhone:11.7392\", \"一部:6.47592\"]");
ASSERT_EQ(res, "[\"iPhone6:11.7392\", \"一部:6.47592\"]");
}
}

View File

@@ -12,8 +12,8 @@ using namespace CppJieba;
TEST(MixSegmentTest, Test1)
{
MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");;
const char* str = "我来自北京邮电大学。。。学号123456";
const char* res[] = {"", "来自", "北京邮电大学", "","","", "学号", "123456"};
const char* str = "我来自北京邮电大学。。。学号123456用AK47";
const char* res[] = {"", "来自", "北京邮电大学", "","","", "学号", "123456","","","AK47"};
const char* str2 = "B超 T恤";
const char* res2[] = {"B超"," ", "T恤"};
vector<string> words;