mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
添加英文+数字分词规则 qinwf/jiebaR#7
This commit is contained in:
parent
10e9b32258
commit
c0bdef74fb
@ -120,11 +120,19 @@ namespace CppJieba
|
|||||||
// sequential letters rule
|
// sequential letters rule
|
||||||
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
|
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
|
||||||
{
|
{
|
||||||
Unicode::value_type x;
|
Unicode::value_type x = *begin;
|
||||||
|
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z'))
|
||||||
|
{
|
||||||
|
begin ++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return begin;
|
||||||
|
}
|
||||||
while(begin != end)
|
while(begin != end)
|
||||||
{
|
{
|
||||||
x = *begin;
|
x = *begin;
|
||||||
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z'))
|
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9'))
|
||||||
{
|
{
|
||||||
begin ++;
|
begin ++;
|
||||||
}
|
}
|
||||||
|
@ -26,7 +26,7 @@ TEST(KeywordExtractorTest, Test1)
|
|||||||
size_t topN = 5;
|
size_t topN = 5;
|
||||||
extractor.extract(s, wordweights, topN);
|
extractor.extract(s, wordweights, topN);
|
||||||
res << wordweights;
|
res << wordweights;
|
||||||
ASSERT_EQ(res, "[\"iPhone:11.7392\", \"一部:6.47592\"]");
|
ASSERT_EQ(res, "[\"iPhone6:11.7392\", \"一部:6.47592\"]");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -12,8 +12,8 @@ using namespace CppJieba;
|
|||||||
TEST(MixSegmentTest, Test1)
|
TEST(MixSegmentTest, Test1)
|
||||||
{
|
{
|
||||||
MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");;
|
MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");;
|
||||||
const char* str = "我来自北京邮电大学。。。学号123456";
|
const char* str = "我来自北京邮电大学。。。学号123456,用AK47";
|
||||||
const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。", "学号", "123456"};
|
const char* res[] = {"我", "来自", "北京邮电大学", "。","。","。", "学号", "123456",",","用","AK47"};
|
||||||
const char* str2 = "B超 T恤";
|
const char* str2 = "B超 T恤";
|
||||||
const char* res2[] = {"B超"," ", "T恤"};
|
const char* res2[] = {"B超"," ", "T恤"};
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user