增加MPSegment的细粒度分词功能。

This commit is contained in:
yanyiwu 2015-08-30 01:04:30 +08:00
parent fae951a95d
commit 001a69d8c6
3 changed files with 30 additions and 14 deletions

View File

@ -3,7 +3,8 @@
## next version ## next version
1. 使用工程上比较 tricky 的 Trie树优化办法。废弃了之前的 `Aho-Corasick-Automation` 实现,可读性更好,性能更高。 1. 使用工程上比较 tricky 的 Trie树优化办法。废弃了之前的 `Aho-Corasick-Automation` 实现,可读性更好,性能更高。
2. 新增层次分词器: LevelSegment 2. 新增层次分词器: LevelSegment 。
3. 增加MPSegment的细粒度分词功能。
## v3.1.0 ## v3.1.0

View File

@ -28,23 +28,35 @@ class MPSegment: public SegmentBase {
} }
} }
// Thin forwarder: delegates to the dictionary trie to ask whether `value`
// (a single rune) is registered as a single-character word in the user
// dictionary. Const and side-effect free.
bool isUserDictSingleChineseWord(const Rune & value) const {
return dictTrie_->isUserDictSingleChineseWord(value);
}
using SegmentBase::cut; using SegmentBase::cut;
void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const { void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& words) const {
vector<Dag> dags; vector<Dag> dags;
dictTrie_->find(begin, end, dags); dictTrie_->find(begin, end, dags);
CalcDP(dags); CalcDP(dags);
Cut(dags, res); Cut(dags, words);
}
// Fine-grained segmentation overload added in this commit: segments an
// encoded string into encoded words, limiting each word to at most
// `max_word_len` runes.
//
// sentence      : input text (presumably UTF-8 — TODO confirm against
//                 TransCode::decode's expected encoding).
// words         : output; resized to one encoded string per segmented word.
// max_word_len  : upper bound on word length, forwarded to the
//                 Unicode-iterator cut() overload.
// Returns false (leaving `words` untouched) when decoding fails,
// true otherwise.
bool cut(const string& sentence,
vector<string>& words,
size_t max_word_len) const {
Unicode unicode;
if (!TransCode::decode(sentence, unicode)) {
// Input could not be decoded into runes; report failure to caller.
return false;
}
vector<Unicode> unicodeWords;
// Delegate the actual DAG/DP segmentation to the iterator-based
// overload that honors max_word_len.
cut(unicode.begin(), unicode.end(),
unicodeWords, max_word_len);
// Re-encode each rune sequence back into a string, one per word.
words.resize(unicodeWords.size());
for (size_t i = 0; i < words.size(); i++) {
TransCode::encode(unicodeWords[i], words[i]);
}
return true;
} }
void cut(Unicode::const_iterator begin, void cut(Unicode::const_iterator begin,
Unicode::const_iterator end, Unicode::const_iterator end,
vector<Unicode>& res, vector<Unicode>& words,
size_t max_word_len) const { size_t max_word_len) const {
vector<Dag> dags; vector<Dag> dags;
dictTrie_->find(begin, dictTrie_->find(begin,
@ -52,12 +64,15 @@ class MPSegment: public SegmentBase {
dags, dags,
max_word_len); max_word_len);
CalcDP(dags); CalcDP(dags);
Cut(dags, res); Cut(dags, words);
} }
const DictTrie* getDictTrie() const { const DictTrie* getDictTrie() const {
return dictTrie_; return dictTrie_;
} }
// Thin forwarder: delegates to the dictionary trie to ask whether `value`
// (a single rune) is registered as a single-character word in the user
// dictionary. Const and side-effect free.
bool isUserDictSingleChineseWord(const Rune & value) const {
return dictTrie_->isUserDictSingleChineseWord(value);
}
private: private:
void CalcDP(vector<Dag>& dags) const { void CalcDP(vector<Dag>& dags) const {
size_t nextPos; size_t nextPos;
@ -89,15 +104,15 @@ class MPSegment: public SegmentBase {
} }
} }
void Cut(const vector<Dag>& dags, void Cut(const vector<Dag>& dags,
vector<Unicode>& res) const { vector<Unicode>& words) const {
size_t i = 0; size_t i = 0;
while(i < dags.size()) { while(i < dags.size()) {
const DictUnit* p = dags[i].pInfo; const DictUnit* p = dags[i].pInfo;
if(p) { if(p) {
res.push_back(p->word); words.push_back(p->word);
i += p->word.size(); i += p->word.size();
} else { //single chinese word } else { //single chinese word
res.push_back(Unicode(1, dags[i].rune)); words.push_back(Unicode(1, dags[i].rune));
i++; i++;
} }
} }

View File

@ -99,8 +99,8 @@ TEST(MPSegmentTest, Test1) {
ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words); ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
// MaxWordLen // MaxWordLen
//ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3)); ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
//ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words); ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
} }
//TEST(MPSegmentTest, Test2) { //TEST(MPSegmentTest, Test2) {