修复FullSegment对于有些单字没有输出的bug

This commit is contained in:
yanyiwu 2015-08-30 13:09:37 +08:00
parent 001a69d8c6
commit 3c60c35906
2 changed files with 15 additions and 9 deletions

View File

@ -49,11 +49,14 @@ class FullSegment: public SegmentBase {
for (size_t j = 0; j < dags[i].nexts.size(); j++) {
const DictUnit* du = dags[i].nexts[j].second;
if (du == NULL) {
continue;
}
wordLen = du->word.size();
if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
res.push_back(du->word);
if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
res.push_back(Unicode(1, dags[i].rune));
}
} else {
wordLen = du->word.size();
if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
res.push_back(du->word);
}
}
maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
}

View File

@ -154,14 +154,17 @@ TEST(HMMSegmentTest, Test1) {
// Checks FullSegment::cut on two inputs: each call must succeed, and the
// resulting word list, written into `s` via `s << words`, must equal the
// expected literal.
// NOTE(review): this block comes from a rendered diff, so removed and added
// lines may be interleaved here (`str` and the first ASSERT_EQ look like the
// pre-change version) -- confirm against the actual test file.
TEST(FullSegment, Test1) {
// Segmenter is constructed from the small dictionary under test/testdata.
FullSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8");
const char* str = "我来自北京邮电大学";
vector<string> words;
ASSERT_EQ(segment.cut(str, words), true);
string s;
// Case 1: an all-Chinese sentence.
ASSERT_TRUE(segment.cut("我来自北京邮电大学", words));
// Presumably a project-provided operator<< that formats the vector into `s`
// as a bracketed, quoted list -- verify against its definition.
s << words;
// NOTE(review): the first expected element is an empty string; confirm whether
// it is meant to be the leading single character of the input.
ASSERT_EQ(s, "[\"\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]");
// Case 2: Chinese words followed by Latin letters; the expected list has each
// Latin letter as its own element.
ASSERT_TRUE(segment.cut("上市公司CEO", words));
s << words;
ASSERT_EQ(s, "[\"上市\", \"公司\", \"C\", \"E\", \"O\"]");
}
TEST(QuerySegment, Test1) {