mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
添加英文+数字分词规则 qinwf/jiebaR#7
This commit is contained in:
parent
10e9b32258
commit
c0bdef74fb
@ -120,11 +120,19 @@ namespace CppJieba
|
||||
// sequential letters rule
|
||||
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
|
||||
{
|
||||
Unicode::value_type x;
|
||||
Unicode::value_type x = *begin;
|
||||
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z'))
|
||||
{
|
||||
begin ++;
|
||||
}
|
||||
else
|
||||
{
|
||||
return begin;
|
||||
}
|
||||
while(begin != end)
|
||||
{
|
||||
x = *begin;
|
||||
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z'))
|
||||
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9'))
|
||||
{
|
||||
begin ++;
|
||||
}
|
||||
|
@ -26,7 +26,7 @@ TEST(KeywordExtractorTest, Test1)
|
||||
size_t topN = 5;
|
||||
extractor.extract(s, wordweights, topN);
|
||||
res << wordweights;
|
||||
ASSERT_EQ(res, "[\"iPhone:11.7392\", \"一部:6.47592\"]");
|
||||
ASSERT_EQ(res, "[\"iPhone6:11.7392\", \"一部:6.47592\"]");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -12,8 +12,8 @@ using namespace CppJieba;
|
||||
TEST(MixSegmentTest, Test1)
|
||||
{
|
||||
MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");;
|
||||
const char* str = "我来自北京邮电大学。。。学号123456";
|
||||
const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。", "学号", "123456"};
|
||||
const char* str = "我来自北京邮电大学。。。学号123456,用AK47";
|
||||
const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。", "学号", "123456",",","用","AK47"};
|
||||
const char* str2 = "B超 T恤";
|
||||
const char* res2[] = {"B超"," ", "T恤"};
|
||||
vector<string> words;
|
||||
|
Loading…
x
Reference in New Issue
Block a user