mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
增加MPSegment的细粒度分词功能。
This commit is contained in:
parent
fae951a95d
commit
001a69d8c6
@ -3,7 +3,8 @@
|
||||
## next version
|
||||
|
||||
1. 使用工程上比较 tricky 的 Trie树优化办法。废弃了之前的 `Aho-Corasick-Automation` 实现,可读性更好,性能更高。
|
||||
2. 新增层次分词器: LevelSegment
|
||||
2. 新增层次分词器: LevelSegment 。
|
||||
3. 增加MPSegment的细粒度分词功能。
|
||||
|
||||
## v3.1.0
|
||||
|
||||
|
@ -28,23 +28,35 @@ class MPSegment: public SegmentBase {
|
||||
}
|
||||
}
|
||||
|
||||
bool isUserDictSingleChineseWord(const Rune & value) const {
|
||||
return dictTrie_->isUserDictSingleChineseWord(value);
|
||||
}
|
||||
|
||||
using SegmentBase::cut;
|
||||
void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const {
|
||||
void cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& words) const {
|
||||
vector<Dag> dags;
|
||||
|
||||
dictTrie_->find(begin, end, dags);
|
||||
|
||||
CalcDP(dags);
|
||||
|
||||
Cut(dags, res);
|
||||
Cut(dags, words);
|
||||
}
|
||||
bool cut(const string& sentence,
|
||||
vector<string>& words,
|
||||
size_t max_word_len) const {
|
||||
Unicode unicode;
|
||||
if (!TransCode::decode(sentence, unicode)) {
|
||||
return false;
|
||||
}
|
||||
vector<Unicode> unicodeWords;
|
||||
cut(unicode.begin(), unicode.end(),
|
||||
unicodeWords, max_word_len);
|
||||
words.resize(unicodeWords.size());
|
||||
for (size_t i = 0; i < words.size(); i++) {
|
||||
TransCode::encode(unicodeWords[i], words[i]);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
void cut(Unicode::const_iterator begin,
|
||||
Unicode::const_iterator end,
|
||||
vector<Unicode>& res,
|
||||
vector<Unicode>& words,
|
||||
size_t max_word_len) const {
|
||||
vector<Dag> dags;
|
||||
dictTrie_->find(begin,
|
||||
@ -52,12 +64,15 @@ class MPSegment: public SegmentBase {
|
||||
dags,
|
||||
max_word_len);
|
||||
CalcDP(dags);
|
||||
Cut(dags, res);
|
||||
Cut(dags, words);
|
||||
}
|
||||
const DictTrie* getDictTrie() const {
|
||||
return dictTrie_;
|
||||
}
|
||||
|
||||
bool isUserDictSingleChineseWord(const Rune & value) const {
|
||||
return dictTrie_->isUserDictSingleChineseWord(value);
|
||||
}
|
||||
private:
|
||||
void CalcDP(vector<Dag>& dags) const {
|
||||
size_t nextPos;
|
||||
@ -89,15 +104,15 @@ class MPSegment: public SegmentBase {
|
||||
}
|
||||
}
|
||||
void Cut(const vector<Dag>& dags,
|
||||
vector<Unicode>& res) const {
|
||||
vector<Unicode>& words) const {
|
||||
size_t i = 0;
|
||||
while(i < dags.size()) {
|
||||
const DictUnit* p = dags[i].pInfo;
|
||||
if(p) {
|
||||
res.push_back(p->word);
|
||||
words.push_back(p->word);
|
||||
i += p->word.size();
|
||||
} else { //single chinese word
|
||||
res.push_back(Unicode(1, dags[i].rune));
|
||||
words.push_back(Unicode(1, dags[i].rune));
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
@ -99,8 +99,8 @@ TEST(MPSegmentTest, Test1) {
|
||||
ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
|
||||
|
||||
// MaxWordLen
|
||||
//ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
|
||||
//ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
|
||||
ASSERT_TRUE(segment.cut("南京市长江大桥", words, 3));
|
||||
ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
|
||||
}
|
||||
|
||||
//TEST(MPSegmentTest, Test2) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user