mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
add PreFilter
This commit is contained in:
parent
710ddacd38
commit
0542dd1cfd
64
src/PreFilter.hpp
Normal file
64
src/PreFilter.hpp
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
#ifndef CPPJIEBA_PRE_FILTER_H
|
||||||
|
#define CPPJIEBA_PRE_FILTER_H
|
||||||
|
|
||||||
|
#include "TransCode.hpp"
|
||||||
|
|
||||||
|
namespace CppJieba {
|
||||||
|
|
||||||
|
// Code points that PreFilter loads into its special-symbol set and uses as
// split points: U+0020 space, U+0009 tab, U+000A line feed,
// U+3002 ideographic full stop, U+FF0C fullwidth comma.
const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
|
||||||
|
|
||||||
|
class PreFilter {
|
||||||
|
public:
|
||||||
|
struct Range {
|
||||||
|
Unicode::const_iterator begin;
|
||||||
|
Unicode::const_iterator end;
|
||||||
|
}; // struct Range
|
||||||
|
|
||||||
|
PreFilter() {
|
||||||
|
LoadSpecialSymbols();
|
||||||
|
}
|
||||||
|
~PreFilter() {
|
||||||
|
}
|
||||||
|
|
||||||
|
void Reset(const string& sentence) {
|
||||||
|
TransCode::decode(sentence, sentence_);
|
||||||
|
cursor_ = sentence_.begin();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool HasNext() const {
|
||||||
|
return cursor_ != sentence_.end();
|
||||||
|
}
|
||||||
|
Range Next() {
|
||||||
|
Range range;
|
||||||
|
range.begin = cursor_;
|
||||||
|
while (cursor_ != sentence_.end()) {
|
||||||
|
if (isIn(specialSymbols_, *cursor_)) {
|
||||||
|
if (range.begin == cursor_) {
|
||||||
|
cursor_ ++;
|
||||||
|
}
|
||||||
|
range.end = cursor_;
|
||||||
|
return range;
|
||||||
|
}
|
||||||
|
cursor_ ++;
|
||||||
|
}
|
||||||
|
range.end = sentence_.end();
|
||||||
|
return range;
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
Unicode sentence_;
|
||||||
|
Unicode::const_iterator cursor_;
|
||||||
|
|
||||||
|
void LoadSpecialSymbols() {
|
||||||
|
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
||||||
|
for(size_t i = 0; i < size; i ++) {
|
||||||
|
specialSymbols_.insert(SPECIAL_SYMBOL[i]);
|
||||||
|
}
|
||||||
|
assert(specialSymbols_.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
unordered_set<Rune> specialSymbols_;
|
||||||
|
}; // class PreFilter
|
||||||
|
|
||||||
|
} // namespace CppJieba
|
||||||
|
|
||||||
|
#endif // CPPJIEBA_PRE_FILTER_H
|
@ -23,9 +23,9 @@ class SegmentBase: public ISegment, public NonCopyable {
|
|||||||
public:
|
public:
|
||||||
SegmentBase() {
|
SegmentBase() {
|
||||||
LoadSpecialSymbols();
|
LoadSpecialSymbols();
|
||||||
};
|
}
|
||||||
virtual ~SegmentBase() {
|
virtual ~SegmentBase() {
|
||||||
};
|
}
|
||||||
public:
|
public:
|
||||||
virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
|
virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
|
||||||
virtual bool cut(const string& str, vector<string>& res) const {
|
virtual bool cut(const string& str, vector<string>& res) const {
|
||||||
|
@ -57,6 +57,13 @@ inline Unicode decode(const string& str) {
|
|||||||
return unicode;
|
return unicode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline void encode(const vector<Unicode>& input, vector<string>& output) {
|
||||||
|
output.resize(input.size());
|
||||||
|
for (size_t i = 0; i < output.size(); i++) {
|
||||||
|
encode(input[i], output[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace TransCode
|
} // namespace TransCode
|
||||||
} // namespace CppJieba
|
} // namespace CppJieba
|
||||||
|
|
||||||
|
@ -23,6 +23,7 @@ ADD_EXECUTABLE(test.run
|
|||||||
TSegments.cpp
|
TSegments.cpp
|
||||||
TPosTagger.cpp
|
TPosTagger.cpp
|
||||||
TApplication.cpp
|
TApplication.cpp
|
||||||
|
TPreFilter.cpp
|
||||||
)
|
)
|
||||||
TARGET_LINK_LIBRARIES(gtest pthread)
|
TARGET_LINK_LIBRARIES(gtest pthread)
|
||||||
TARGET_LINK_LIBRARIES(test.run gtest pthread)
|
TARGET_LINK_LIBRARIES(test.run gtest pthread)
|
||||||
|
18
test/unittest/TPreFilter.cpp
Normal file
18
test/unittest/TPreFilter.cpp
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
#include "gtest/gtest.h"
|
||||||
|
#include "src/PreFilter.hpp"
|
||||||
|
|
||||||
|
using namespace CppJieba;
|
||||||
|
|
||||||
|
// Feeds PreFilter a sentence containing two delimiters and checks that the
// text runs and the delimiters come back as separate ranges, in order.
TEST(PreFilterTest, Test1) {
  PreFilter filter;
  // The delimiter has to be the fullwidth comma U+FF0C (65292), which is listed
  // in SPECIAL_SYMBOL. An ASCII comma (U+002C) is not in that list, so it would
  // not split the sentence and the whole input would come back as one range.
  filter.Reset("你好，美丽的，世界");
  const char* expected[] = {"你好", "，", "美丽的", "，", "世界"};
  ASSERT_TRUE(filter.HasNext());
  vector<string> words;
  while (filter.HasNext()) {
    PreFilter::Range range;
    range = filter.Next();
    // Re-encode each range so it can be compared with the expected strings.
    words.push_back(TransCode::encode(range.begin, range.end));
  }
  ASSERT_EQ(vector<string>(expected, expected + sizeof(expected)/sizeof(*expected)), words);
}
|
Loading…
x
Reference in New Issue
Block a user