mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
增加两条分词规则
This commit is contained in:
parent
b68a76e63a
commit
fbae0f6075
@ -74,10 +74,19 @@ namespace CppJieba
|
||||
return false;
|
||||
}
|
||||
left = right;
|
||||
while(*right < 0x80 && right != end)
|
||||
{
|
||||
right++;
|
||||
}
|
||||
do {
|
||||
right = _sequentialLetterRule(left, end);
|
||||
if(right != left)
|
||||
{
|
||||
break;
|
||||
}
|
||||
right = _numbersRule(left, end);
|
||||
if(right != left)
|
||||
{
|
||||
break;
|
||||
}
|
||||
right ++;
|
||||
} while(false);
|
||||
res.push_back(Unicode(left, right));
|
||||
left = right;
|
||||
}
|
||||
@ -93,6 +102,50 @@ namespace CppJieba
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
// sequential letters rule
|
||||
// Sequential-letters rule.
//
// Scans forward from `begin` over consecutive ASCII letters ('a'-'z' or
// 'A'-'Z') and returns the iterator to the first element that is not such a
// letter, or `end` if every remaining element is one.
// When the range is empty or its first element is not a letter, `begin` is
// returned unchanged, which callers can use to detect "rule did not match".
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
    for(; begin != end; begin++)
    {
        const Unicode::value_type ch = *begin;
        const bool isLower = ('a' <= ch && ch <= 'z');
        const bool isUpper = ('A' <= ch && ch <= 'Z');
        if(!isLower && !isUpper)
        {
            // First non-letter ends the run.
            break;
        }
    }
    return begin;
}
|
||||
// numbers rule
|
||||
// Numbers rule.
//
// Returns the end of the number-like run that starts at `begin`.
// The run must open with an ASCII digit ('0'-'9'); after that first digit,
// any further digits and '.' characters are consumed. Because '.' is accepted
// anywhere after the first digit, a trailing '.' is part of the run too.
//
// Returns `begin` unchanged when the range is empty or its first element is
// not a digit, which callers can use to detect "rule did not match";
// otherwise returns the iterator one past the last consumed element.
Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
    // Guard: reading *begin on an empty range would be undefined behaviour.
    if(begin == end)
    {
        return begin;
    }

    Unicode::value_type x = *begin;
    if(!('0' <= x && x <= '9'))
    {
        // A run may not start with '.', only with a digit.
        return begin;
    }
    begin ++;

    while(begin != end)
    {
        x = *begin;
        if(('0' <= x && x <= '9') || x == '.')
        {
            begin ++;
        }
        else
        {
            break;
        }
    }
    return begin;
}
|
||||
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
||||
{
|
||||
vector<size_t> status;
|
||||
@ -231,7 +284,6 @@ namespace CppJieba
|
||||
for(size_t j = 0; j< tmp.size(); j++)
|
||||
{
|
||||
_startProb[j] = atof(tmp[j].c_str());
|
||||
//cout<<_startProb[j]<<endl;
|
||||
}
|
||||
|
||||
//load _transProb
|
||||
@ -250,7 +302,6 @@ namespace CppJieba
|
||||
for(size_t j =0; j < STATUS_SUM; j++)
|
||||
{
|
||||
_transProb[i][j] = atof(tmp[j].c_str());
|
||||
//cout<<_transProb[i][j]<<endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -37,11 +37,23 @@ TEST(MixSegmentTest, NoUserDict)
|
||||
// Checks segment.cut() on a MixSegment built from the three file paths below,
// comparing the serialized word list (res << words) with an expected string.
// NOTE(review): this listing looks like a rendered diff in which removed and
// added lines are shown together; the unscoped statements right after the
// constructor and the first braced case perform the same check — confirm
// against the real file which of the two is current.
TEST(MixSegmentTest, UserDict)
|
||||
{
|
||||
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
|
||||
const char* str = "令狐冲是云计算方面的专家";
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
string res;
|
||||
// The expected value is a bracketed, comma-separated list of quoted words.
ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
|
||||
// Case 1 (scoped): same input and expectation as the statements above.
{
|
||||
const char* str = "令狐冲是云计算方面的专家";
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
string res;
|
||||
ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
|
||||
}
|
||||
// Case 2 (scoped): input mixing Chinese text with the ASCII letters "IBM" and a comma.
{
|
||||
const char* str = "小明先就职于IBM,后在日本京都大学深造";
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
string res;
|
||||
res << words;
|
||||
// NOTE(review): print + exit(1) look like leftover debugging. Assuming exit is
// the C library exit, the process terminates here with a non-zero status and
// the ASSERT_EQ below is never reached — confirm and remove before relying on
// this case.
print(res);
|
||||
exit(1);
|
||||
// NOTE(review): this expected string is identical to case 1's even though the
// input differs — presumably a placeholder; the intended segmentation needs
// to be filled in.
ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -97,11 +109,21 @@ TEST(MPSegmentTest, Test2)
|
||||
// Checks segment.cut() on an HMMSegment built from the model path below,
// comparing the produced word vector with an expected array of tokens.
// NOTE(review): this listing looks like a rendered diff in which removed and
// added lines are shown together; the unscoped statements right after the
// constructor and the first braced case perform the same check — confirm
// against the real file which of the two is current.
TEST(HMMSegmentTest, Test1)
|
||||
{
|
||||
// NOTE(review): the second ';' is a stray empty statement.
HMMSegment segment("../dict/hmm_model.utf8");;
|
||||
const char* str = "我来自北京邮电大学。。。学号123456";
|
||||
const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
// sizeof(res)/sizeof(res[0]) is the element count of the expected array, so
// the vector is built from the whole array.
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||
// Case 1 (scoped): same input and expectation as the statements above; the
// digit run "123456" is expected back as a single token.
{
|
||||
const char* str = "我来自北京邮电大学。。。学号123456";
|
||||
const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||
}
|
||||
|
||||
// Case 2 (scoped): ASCII-only input. The letter run "IBM" and the digit/dot
// runs "1.2" and "123" are each expected as one token, with every comma as
// its own token.
{
|
||||
const char* str = "IBM,1.2,123";
|
||||
const char* res[] = {"IBM", ",", "1.2", ",", "123"};
|
||||
vector<string> words;
|
||||
ASSERT_TRUE(segment.cut(str, words));
|
||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(FullSegment, Test1)
|
||||
|
Loading…
x
Reference in New Issue
Block a user