cppjieba/src/LevelSegment.hpp
2015-10-08 21:13:11 +08:00

83 lines
2.1 KiB
C++

#ifndef CPPJIEBA_LEVELSEGMENT_H
#define CPPJIEBA_LEVELSEGMENT_H
#include "MPSegment.hpp"
namespace cppjieba {
class LevelSegment: public SegmentBase{
public:
LevelSegment(const string& dictPath,
const string& userDictPath = "")
: mpSeg_(dictPath, userDictPath) {
LogInfo("LevelSegment init");
}
LevelSegment(const DictTrie* dictTrie)
: mpSeg_(dictTrie) {
}
~LevelSegment() {
}
void cut(Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<pair<Unicode, size_t> >& res) const {
res.clear();
vector<Unicode> words;
vector<Unicode> smallerWords;
words.reserve(end - begin);
mpSeg_.cut(begin, end, words);
smallerWords.reserve(words.size());
res.reserve(words.size());
size_t level = 0;
while (!words.empty()) {
smallerWords.clear();
for (size_t i = 0; i < words.size(); i++) {
if (words[i].size() >= 3) {
size_t len = words[i].size() - 1;
mpSeg_.cut(words[i].begin(), words[i].end(), smallerWords, len); // buffer.push_back without clear
}
if (words[i].size() > 1) {
res.push_back(pair<Unicode, size_t>(words[i], level));
}
}
words.swap(smallerWords);
level++;
}
}
void cut(const string& sentence,
vector<pair<string, size_t> >& words) const {
words.clear();
Unicode unicode;
TransCode::decode(sentence, unicode);
vector<pair<Unicode, size_t> > unicodeWords;
cut(unicode.begin(), unicode.end(), unicodeWords);
words.resize(unicodeWords.size());
for (size_t i = 0; i < words.size(); i++) {
TransCode::encode(unicodeWords[i].first, words[i].first);
words[i].second = unicodeWords[i].second;
}
}
bool cut(const string& sentence,
vector<string>& res) const {
vector<pair<string, size_t> > words;
cut(sentence, words);
res.clear();
res.reserve(words.size());
for (size_t i = 0; i < words.size(); i++) {
res.push_back(words[i].first);
}
return true;
}
private:
MPSegment mpSeg_;
}; // class LevelSegment
} // namespace cppjieba
#endif // CPPJIEBA_LEVELSEGMENT_H