add PreFilter

This commit is contained in:
yanyiwu 2015-09-13 15:10:10 +08:00
parent 710ddacd38
commit 0542dd1cfd
5 changed files with 92 additions and 2 deletions

64
src/PreFilter.hpp Normal file
View File

@ -0,0 +1,64 @@
#ifndef CPPJIEBA_PRE_FILTER_H
#define CPPJIEBA_PRE_FILTER_H
#include "TransCode.hpp"
namespace CppJieba {
// Runes that PreFilter loads as its segment boundaries, read as Unicode code
// points: U+0020 space, U+0009 tab, U+000A line feed, U+3002 ideographic full
// stop, U+FF0C full-width comma.
const Rune SPECIAL_SYMBOL[] = {0x20u, 0x09u, 0x0Au, 0x3002u, 0xFF0Cu};
class PreFilter {
public:
struct Range {
Unicode::const_iterator begin;
Unicode::const_iterator end;
}; // struct Range
PreFilter() {
LoadSpecialSymbols();
}
~PreFilter() {
}
void Reset(const string& sentence) {
TransCode::decode(sentence, sentence_);
cursor_ = sentence_.begin();
}
bool HasNext() const {
return cursor_ != sentence_.end();
}
Range Next() {
Range range;
range.begin = cursor_;
while (cursor_ != sentence_.end()) {
if (isIn(specialSymbols_, *cursor_)) {
if (range.begin == cursor_) {
cursor_ ++;
}
range.end = cursor_;
return range;
}
cursor_ ++;
}
range.end = sentence_.end();
return range;
}
private:
Unicode sentence_;
Unicode::const_iterator cursor_;
void LoadSpecialSymbols() {
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
for(size_t i = 0; i < size; i ++) {
specialSymbols_.insert(SPECIAL_SYMBOL[i]);
}
assert(specialSymbols_.size());
}
unordered_set<Rune> specialSymbols_;
}; // class PreFilter
} // namespace CppJieba
#endif // CPPJIEBA_PRE_FILTER_H

View File

@ -23,9 +23,9 @@ class SegmentBase: public ISegment, public NonCopyable {
public:
SegmentBase() {
LoadSpecialSymbols();
};
}
virtual ~SegmentBase() {
};
}
public:
virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res) const {

View File

@ -57,6 +57,13 @@ inline Unicode decode(const string& str) {
return unicode;
}
// Batch form of encode: sizes `output` to match `input`, then encodes each
// Unicode sequence into the string at the same index via the
// single-sequence encode overload.
inline void encode(const vector<Unicode>& input, vector<string>& output) {
  const size_t count = input.size();
  output.resize(count);
  for (size_t idx = 0; idx != count; ++idx) {
    encode(input[idx], output[idx]);
  }
}
} // namespace TransCode
} // namespace CppJieba

View File

@ -23,6 +23,7 @@ ADD_EXECUTABLE(test.run
TSegments.cpp
TPosTagger.cpp
TApplication.cpp
TPreFilter.cpp
)
TARGET_LINK_LIBRARIES(gtest pthread)
TARGET_LINK_LIBRARIES(test.run gtest pthread)

View File

@ -0,0 +1,18 @@
#include "gtest/gtest.h"
#include "src/PreFilter.hpp"
using namespace CppJieba;
// Checks that PreFilter cuts a sentence at special symbols and returns each
// symbol as a range of its own between the surrounding text runs.
TEST(PreFilterTest, Test1) {
  PreFilter filter;
  // The separators are the full-width comma U+FF0C (65292 in SPECIAL_SYMBOL).
  // An ASCII ',' is not a special symbol and would leave the sentence unsplit.
  filter.Reset("你好,美丽的,世界");
  // Next() never yields an empty range while HasNext() is true, so each
  // separator shows up as a one-rune token rather than as "".
  const char* expected[] = {"你好", ",", "美丽的", ",", "世界"};
  ASSERT_TRUE(filter.HasNext());
  vector<string> words;
  while (filter.HasNext()) {
    PreFilter::Range range;
    range = filter.Next();
    // Re-encode the range so it can be compared with the string literals.
    words.push_back(TransCode::encode(range.begin, range.end));
  }
  ASSERT_EQ(vector<string>(expected, expected + sizeof(expected)/sizeof(*expected)), words);
}