diff --git a/ChangeLog.md b/ChangeLog.md
index 4cd9377..1ef5413 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -3,7 +3,8 @@
 ## next version
 1. 使用工程上比较 tricky 的 Trie树优化办法。废弃了之前的 `Aho-Corasick-Automation` 实现,可读性更好,性能更高。
-2. 新增层次分词器: LevelSegment
+2. 新增层次分词器: LevelSegment 。
+3. 增加MPSegment的细粒度分词功能。
 ## v3.1.0
diff --git a/src/MPSegment.hpp b/src/MPSegment.hpp
index 919b393..4d05ff5 100644
--- a/src/MPSegment.hpp
+++ b/src/MPSegment.hpp
@@ -28,23 +28,35 @@ class MPSegment: public SegmentBase {
     }
   }
-  bool isUserDictSingleChineseWord(const Rune & value) const {
-    return dictTrie_->isUserDictSingleChineseWord(value);
-  }
-
   using SegmentBase::cut;
-  void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& res) const {
+  void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector& words) const {
     vector dags;
     dictTrie_->find(begin, end, dags);
     CalcDP(dags);
-    Cut(dags, res);
+    Cut(dags, words);
+  }
+  bool cut(const string& sentence,
+        vector& words,
+        size_t max_word_len) const {
+    Unicode unicode;
+    if (!TransCode::decode(sentence, unicode)) {
+      return false;
+    }
+    vector unicodeWords;
+    cut(unicode.begin(), unicode.end(),
+          unicodeWords, max_word_len);
+    words.resize(unicodeWords.size());
+    for (size_t i = 0; i < words.size(); i++) {
+      TransCode::encode(unicodeWords[i], words[i]);
+    }
+    return true;
   }
   void cut(Unicode::const_iterator begin,
         Unicode::const_iterator end,
-        vector& res,
+        vector& words,
         size_t max_word_len) const {
     vector dags;
     dictTrie_->find(begin,
@@ -52,12 +64,15 @@ class MPSegment: public SegmentBase {
           dags,
           max_word_len);
     CalcDP(dags);
-    Cut(dags, res);
+    Cut(dags, words);
   }
   const DictTrie* getDictTrie() const {
     return dictTrie_;
   }
+  bool isUserDictSingleChineseWord(const Rune & value) const {
+    return dictTrie_->isUserDictSingleChineseWord(value);
+  }
  private:
   void CalcDP(vector& dags) const {
     size_t nextPos;
@@ -89,15 +104,15 @@ class MPSegment: public SegmentBase {
     }
   }
   void Cut(const vector& dags,
-        vector& res) const {
+        vector& words) const {
     size_t i = 0;
     while(i < dags.size()) {
       const DictUnit* p = dags[i].pInfo;
       if(p) {
-        res.push_back(p->word);
+        words.push_back(p->word);
         i += p->word.size();
       } else { //single chinese word
-        res.push_back(Unicode(1, dags[i].rune));
+        words.push_back(Unicode(1, dags[i].rune));
         i++;
       }
     }
diff --git a/test/unittest/TSegments.cpp b/test/unittest/TSegments.cpp
index 3fff55d..655518f 100644
--- a/test/unittest/TSegments.cpp
+++ b/test/unittest/TSegments.cpp
@@ -99,8 +99,8 @@ TEST(MPSegmentTest, Test1) {
   ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
   // MaxWordLen
-  //ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
-  //ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
+  ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
+  ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
 }
 //TEST(MPSegmentTest, Test2) {