mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
增加MPSegment的细粒度分词功能。
This commit is contained in:
parent
fae951a95d
commit
001a69d8c6
@ -3,7 +3,8 @@
|
|||||||
## next version
|
## next version
|
||||||
|
|
||||||
1. 使用工程上比较 tricky 的 Trie树优化办法。废弃了之前的 `Aho-Corasick-Automation` 实现,可读性更好,性能更高。
|
1. 使用工程上比较 tricky 的 Trie树优化办法。废弃了之前的 `Aho-Corasick-Automation` 实现,可读性更好,性能更高。
|
||||||
2. 新增层次分词器: LevelSegment
|
2. 新增层次分词器: LevelSegment 。
|
||||||
|
3. 增加MPSegment的细粒度分词功能。
|
||||||
|
|
||||||
## v3.1.0
|
## v3.1.0
|
||||||
|
|
||||||
|
@ -28,23 +28,35 @@ class MPSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isUserDictSingleChineseWord(const Rune & value) const {
|
|
||||||
return dictTrie_->isUserDictSingleChineseWord(value);
|
|
||||||
}
|
|
||||||
|
|
||||||
using SegmentBase::cut;
|
using SegmentBase::cut;
|
||||||
void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
|
void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& words) const {
|
||||||
vector<Dag> dags;
|
vector<Dag> dags;
|
||||||
|
|
||||||
dictTrie_->find(begin, end, dags);
|
dictTrie_->find(begin, end, dags);
|
||||||
|
|
||||||
CalcDP(dags);
|
CalcDP(dags);
|
||||||
|
|
||||||
Cut(dags, res);
|
Cut(dags, words);
|
||||||
|
}
|
||||||
|
bool cut(const string& sentence,
|
||||||
|
vector<string>& words,
|
||||||
|
size_t max_word_len) const {
|
||||||
|
Unicode unicode;
|
||||||
|
if (!TransCode::decode(sentence, unicode)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
vector<Unicode> unicodeWords;
|
||||||
|
cut(unicode.begin(), unicode.end(),
|
||||||
|
unicodeWords, max_word_len);
|
||||||
|
words.resize(unicodeWords.size());
|
||||||
|
for (size_t i = 0; i < words.size(); i++) {
|
||||||
|
TransCode::encode(unicodeWords[i], words[i]);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
void cut(Unicode::const_iterator begin,
|
void cut(Unicode::const_iterator begin,
|
||||||
Unicode::const_iterator end,
|
Unicode::const_iterator end,
|
||||||
vector<Unicode>& res,
|
vector<Unicode>& words,
|
||||||
size_t max_word_len) const {
|
size_t max_word_len) const {
|
||||||
vector<Dag> dags;
|
vector<Dag> dags;
|
||||||
dictTrie_->find(begin,
|
dictTrie_->find(begin,
|
||||||
@ -52,12 +64,15 @@ class MPSegment: public SegmentBase {
|
|||||||
dags,
|
dags,
|
||||||
max_word_len);
|
max_word_len);
|
||||||
CalcDP(dags);
|
CalcDP(dags);
|
||||||
Cut(dags, res);
|
Cut(dags, words);
|
||||||
}
|
}
|
||||||
const DictTrie* getDictTrie() const {
|
const DictTrie* getDictTrie() const {
|
||||||
return dictTrie_;
|
return dictTrie_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool isUserDictSingleChineseWord(const Rune & value) const {
|
||||||
|
return dictTrie_->isUserDictSingleChineseWord(value);
|
||||||
|
}
|
||||||
private:
|
private:
|
||||||
void CalcDP(vector<Dag>& dags) const {
|
void CalcDP(vector<Dag>& dags) const {
|
||||||
size_t nextPos;
|
size_t nextPos;
|
||||||
@ -89,15 +104,15 @@ class MPSegment: public SegmentBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
void Cut(const vector<Dag>& dags,
|
void Cut(const vector<Dag>& dags,
|
||||||
vector<Unicode>& res) const {
|
vector<Unicode>& words) const {
|
||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
while(i < dags.size()) {
|
while(i < dags.size()) {
|
||||||
const DictUnit* p = dags[i].pInfo;
|
const DictUnit* p = dags[i].pInfo;
|
||||||
if(p) {
|
if(p) {
|
||||||
res.push_back(p->word);
|
words.push_back(p->word);
|
||||||
i += p->word.size();
|
i += p->word.size();
|
||||||
} else { //single chinese word
|
} else { //single chinese word
|
||||||
res.push_back(Unicode(1, dags[i].rune));
|
words.push_back(Unicode(1, dags[i].rune));
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -99,8 +99,8 @@ TEST(MPSegmentTest, Test1) {
|
|||||||
ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
|
ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
|
||||||
|
|
||||||
// MaxWordLen
|
// MaxWordLen
|
||||||
//ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
|
ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
|
||||||
//ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
|
ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
|
||||||
}
|
}
|
||||||
|
|
||||||
//TEST(MPSegmentTest, Test2) {
|
//TEST(MPSegmentTest, Test2) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user