Add fine-grained segmentation support to MPSegment.

yanyiwu 2015-08-30 01:04:30 +08:00
parent fae951a95d
commit 001a69d8c6
3 changed files with 30 additions and 14 deletions


@@ -3,7 +3,8 @@
## next version
1. Switched to an engineering-oriented, somewhat tricky Trie optimization and dropped the previous `Aho-Corasick-Automation` implementation; the code is more readable and faster.
-2. Added the hierarchical segmenter: LevelSegment
+2. Added the hierarchical segmenter: LevelSegment.
+3. Added fine-grained segmentation support to MPSegment.
## v3.1.0

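For changelog item 3, a minimal usage sketch of the fine-grained mode added by this commit. Only the three-argument `cut(sentence, words, max_word_len)` overload is introduced by this diff; the dictionary path, the include path, the `CppJieba` namespace, and the two-argument `cut` inherited from `SegmentBase` are assumptions about the surrounding project of this era.

```cpp
#include <string>
#include <vector>

#include "MPSegment.hpp"  // include path assumed; adjust to the project layout

using namespace CppJieba;  // namespace name assumed for the 3.x-era sources

int main() {
  // Dictionary path is an assumption; point it at the project's jieba.dict.utf8.
  MPSegment segment("../dict/jieba.dict.utf8");
  std::vector<std::string> words;

  // Plain MP cut (inherited two-argument overload): the longest dictionary words win.
  segment.cut("南京市长江大桥", words);
  // words: ["南京市", "长江大桥"]

  // Fine-grained cut added by this commit: candidate words are capped at 3 runes,
  // so "长江大桥" is split further.
  words.clear();
  segment.cut("南京市长江大桥", words, 3);
  // words: ["南京市", "长江", "大桥"]
  return 0;
}
```

The expected outputs in the comments are the ones asserted in the unit-test diff at the end of this commit.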

@@ -28,23 +28,35 @@ class MPSegment: public SegmentBase {
}
}
-bool isUserDictSingleChineseWord(const Rune & value) const {
-return dictTrie_->isUserDictSingleChineseWord(value);
-}
using SegmentBase::cut;
-void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
+void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& words) const {
vector<Dag> dags;
dictTrie_->find(begin, end, dags);
CalcDP(dags);
-Cut(dags, res);
+Cut(dags, words);
}
+bool cut(const string& sentence,
+vector<string>& words,
+size_t max_word_len) const {
+Unicode unicode;
+if (!TransCode::decode(sentence, unicode)) {
+return false;
+}
+vector<Unicode> unicodeWords;
+cut(unicode.begin(), unicode.end(),
+unicodeWords, max_word_len);
+words.resize(unicodeWords.size());
+for (size_t i = 0; i < words.size(); i++) {
+TransCode::encode(unicodeWords[i], words[i]);
+}
+return true;
+}
void cut(Unicode::const_iterator begin,
Unicode::const_iterator end,
-vector<Unicode>& res,
+vector<Unicode>& words,
size_t max_word_len) const {
vector<Dag> dags;
dictTrie_->find(begin,
@@ -52,12 +64,15 @@ class MPSegment: public SegmentBase {
dags,
max_word_len);
CalcDP(dags);
-Cut(dags, res);
+Cut(dags, words);
}
const DictTrie* getDictTrie() const {
return dictTrie_;
}
+bool isUserDictSingleChineseWord(const Rune & value) const {
+return dictTrie_->isUserDictSingleChineseWord(value);
+}
private:
void CalcDP(vector<Dag>& dags) const {
size_t nextPos;
@@ -89,15 +104,15 @@ class MPSegment: public SegmentBase {
}
}
void Cut(const vector<Dag>& dags,
-vector<Unicode>& res) const {
+vector<Unicode>& words) const {
size_t i = 0;
while(i < dags.size()) {
const DictUnit* p = dags[i].pInfo;
if(p) {
-res.push_back(p->word);
+words.push_back(p->word);
i += p->word.size();
} else { //single chinese word
-res.push_back(Unicode(1, dags[i].rune));
+words.push_back(Unicode(1, dags[i].rune));
i++;
}
}
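As the hunks above show, the new string-based overload is just a decode, cut, encode wrapper around the iterator overload, so a caller that already holds `Unicode` input can apply the length cap directly. The sketch below mirrors that wrapper; the helper name `FineGrainedCut` is invented for illustration, while `TransCode` and `Unicode` are the project helpers used in the diff.

```cpp
#include <string>
#include <vector>

#include "MPSegment.hpp"  // include path assumed

using namespace CppJieba;  // namespace name assumed, as above

// Mirrors what the new string overload of MPSegment::cut does internally:
// decode UTF-8 into runes, cut with a per-word length cap, encode back.
bool FineGrainedCut(const MPSegment& segment,
                    const std::string& sentence,
                    std::vector<std::string>& words,
                    size_t max_word_len) {
  Unicode unicode;
  if (!TransCode::decode(sentence, unicode)) {
    return false;  // sentence is not valid UTF-8
  }
  std::vector<Unicode> unicodeWords;
  segment.cut(unicode.begin(), unicode.end(), unicodeWords, max_word_len);
  words.resize(unicodeWords.size());
  for (size_t i = 0; i < words.size(); i++) {
    TransCode::encode(unicodeWords[i], words[i]);
  }
  return true;
}
```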


@@ -99,8 +99,8 @@ TEST(MPSegmentTest, Test1) {
ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
// MaxWordLen
-//ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
-//ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
+ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
+ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
}
//TEST(MPSegmentTest, Test2) {