增加两条分词规则

This commit is contained in:
wyy 2014-11-03 10:54:53 +08:00
parent b68a76e63a
commit fbae0f6075
2 changed files with 89 additions and 16 deletions

View File

@ -74,10 +74,19 @@ namespace CppJieba
return false;
}
left = right;
while(*right < 0x80 && right != end)
{
right++;
}
do {
right = _sequentialLetterRule(left, end);
if(right != left)
{
break;
}
right = _numbersRule(left, end);
if(right != left)
{
break;
}
right ++;
} while(false);
res.push_back(Unicode(left, right));
left = right;
}
@ -93,6 +102,50 @@ namespace CppJieba
return true;
}
private:
// sequential letters rule
// Scans forward from `begin` over consecutive ASCII letters ('a'-'z', 'A'-'Z')
// and returns the position of the first element that is not a letter, or `end`
// if every element is a letter. A return value equal to the incoming `begin`
// means the range does not start with a letter (or is empty).
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
    for(; begin != end; ++begin)
    {
        const Unicode::value_type ch = *begin;
        const bool isLower = ('a' <= ch && ch <= 'z');
        const bool isUpper = ('A' <= ch && ch <= 'Z');
        if(!isLower && !isUpper)
        {
            // first non-letter ends the run
            break;
        }
    }
    return begin;
}
// numbers rule
// Returns the position one past the number-like run that starts at `begin`.
// The run must start with an ASCII digit ('0'-'9'); after the first digit,
// further digits and '.' are consumed, so "1.2" is matched as one run (a
// trailing '.' directly after digits is consumed as well).
// If the range is empty or its first element is not a digit, `begin` is
// returned unchanged, which callers can compare against to detect "no match".
Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
{
    // Guard the empty range: the leading-digit check below dereferences
    // `begin`, which would be undefined behaviour when begin == end.
    if(begin == end)
    {
        return begin;
    }
    Unicode::value_type x = *begin;
    if(!('0' <= x && x <= '9'))
    {
        // does not start with a digit: no match
        return begin;
    }
    ++begin;
    while(begin != end)
    {
        x = *begin;
        if(('0' <= x && x <= '9') || x == '.')
        {
            ++begin;
        }
        else
        {
            break;
        }
    }
    return begin;
}
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{
vector<size_t> status;
@ -231,7 +284,6 @@ namespace CppJieba
for(size_t j = 0; j< tmp.size(); j++)
{
_startProb[j] = atof(tmp[j].c_str());
//cout<<_startProb[j]<<endl;
}
//load _transProb
@ -250,7 +302,6 @@ namespace CppJieba
for(size_t j =0; j < STATUS_SUM; j++)
{
_transProb[i][j] = atof(tmp[j].c_str());
//cout<<_transProb[i][j]<<endl;
}
}

View File

@ -37,11 +37,23 @@ TEST(MixSegmentTest, NoUserDict)
// Runs MixSegment::cut on a segmenter constructed with three paths (a small
// dictionary, the HMM model and testdata/userdict.utf8) and compares the
// stringified word list against an expected literal.
TEST(MixSegmentTest, UserDict)
{
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
// NOTE(review): the next five statements repeat the first scoped case below
// verbatim; they look like the pre-change lines kept by the diff rendering --
// confirm which version is the real file content.
const char* str = "令狐冲是云计算方面的专家";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string res;
ASSERT_EQ("[\"令狐冲\", \"\", \"云计算\", \"方面\", \"\", \"专家\"]", res << words);
// Case 1: sentence with no ASCII letters or digits.
{
const char* str = "令狐冲是云计算方面的专家";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string res;
ASSERT_EQ("[\"令狐冲\", \"\", \"云计算\", \"方面\", \"\", \"专家\"]", res << words);
}
// Case 2: sentence containing an ASCII letter run ("IBM") and an ASCII comma.
{
const char* str = "小明先就职于IBM,后在日本京都大学深造";
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
string res;
res << words;
// NOTE(review): print + exit(1) looks like leftover debugging. exit(1) ends
// the process here, so the assertion below is never evaluated and no later
// test in the binary runs.
print(res);
exit(1);
// NOTE(review): this expected string is identical to case 1's and does not
// correspond to the input of this case -- presumably a placeholder; replace
// with the real expected segmentation once confirmed.
ASSERT_EQ("[\"令狐冲\", \"\", \"云计算\", \"方面\", \"\", \"专家\"]", res);
}
}
@ -97,11 +109,21 @@ TEST(MPSegmentTest, Test2)
// Runs HMMSegment::cut on a segmenter constructed from the HMM model file and
// compares the resulting word vector against a fixed list of expected tokens.
TEST(HMMSegmentTest, Test1)
{
HMMSegment segment("../dict/hmm_model.utf8");;
// NOTE(review): the next five statements repeat the first scoped case below
// verbatim; they look like the pre-change lines kept by the diff rendering --
// confirm which version is the real file content.
const char* str = "我来自北京邮电大学。。。学号123456";
const char* res[] = {"我来", "自北京", "邮电大学", "", "", "", "学号", "123456"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
// Case 1: mixed text ending in a digit run; "123456" is expected as one token.
{
const char* str = "我来自北京邮电大学。。。学号123456";
const char* res[] = {"我来", "自北京", "邮电大学", "", "", "", "学号", "123456"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
// the expected vector is built from the array; sizeof(res)/sizeof(res[0]) is its element count
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
// Case 2: pure ASCII input; a letter run ("IBM"), a dotted number ("1.2") and a
// plain number ("123") are each expected as a single token, with the commas
// between them as separate tokens.
{
const char* str = "IBM,1.2,123";
const char* res[] = {"IBM", ",", "1.2", ",", "123"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
}
}
TEST(FullSegment, Test1)