mirror of
https://github.com/yanyiwu/cppjieba.git
synced 2025-07-18 00:00:12 +08:00
use PreFilter in SegmentBase
This commit is contained in:
parent
0542dd1cfd
commit
28bcb3bf57
@ -5,7 +5,16 @@
|
|||||||
|
|
||||||
namespace CppJieba {
|
namespace CppJieba {
|
||||||
|
|
||||||
const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
|
//class PreFilterIterator {
|
||||||
|
// public:
|
||||||
|
// PreFilterIterator() {
|
||||||
|
// }
|
||||||
|
// ~PreFilterIterator() {
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// private:
|
||||||
|
// const unordered_set<Rune>& specialSymbols_;
|
||||||
|
//}; // PreFilterIterator
|
||||||
|
|
||||||
class PreFilter {
|
class PreFilter {
|
||||||
public:
|
public:
|
||||||
@ -14,17 +23,14 @@ class PreFilter {
|
|||||||
Unicode::const_iterator end;
|
Unicode::const_iterator end;
|
||||||
}; // struct Range
|
}; // struct Range
|
||||||
|
|
||||||
PreFilter() {
|
PreFilter(const unordered_set<Rune>& symbols,
|
||||||
LoadSpecialSymbols();
|
const string& sentence)
|
||||||
}
|
: symbols_(symbols) {
|
||||||
~PreFilter() {
|
|
||||||
}
|
|
||||||
|
|
||||||
void Reset(const string& sentence) {
|
|
||||||
TransCode::decode(sentence, sentence_);
|
TransCode::decode(sentence, sentence_);
|
||||||
cursor_ = sentence_.begin();
|
cursor_ = sentence_.begin();
|
||||||
}
|
}
|
||||||
|
~PreFilter() {
|
||||||
|
}
|
||||||
bool HasNext() const {
|
bool HasNext() const {
|
||||||
return cursor_ != sentence_.end();
|
return cursor_ != sentence_.end();
|
||||||
}
|
}
|
||||||
@ -32,7 +38,7 @@ class PreFilter {
|
|||||||
Range range;
|
Range range;
|
||||||
range.begin = cursor_;
|
range.begin = cursor_;
|
||||||
while (cursor_ != sentence_.end()) {
|
while (cursor_ != sentence_.end()) {
|
||||||
if (isIn(specialSymbols_, *cursor_)) {
|
if (isIn(symbols_, *cursor_)) {
|
||||||
if (range.begin == cursor_) {
|
if (range.begin == cursor_) {
|
||||||
cursor_ ++;
|
cursor_ ++;
|
||||||
}
|
}
|
||||||
@ -45,18 +51,9 @@ class PreFilter {
|
|||||||
return range;
|
return range;
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
Unicode sentence_;
|
|
||||||
Unicode::const_iterator cursor_;
|
Unicode::const_iterator cursor_;
|
||||||
|
Unicode sentence_;
|
||||||
void LoadSpecialSymbols() {
|
const unordered_set<Rune>& symbols_;
|
||||||
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
|
||||||
for(size_t i = 0; i < size; i ++) {
|
|
||||||
specialSymbols_.insert(SPECIAL_SYMBOL[i]);
|
|
||||||
}
|
|
||||||
assert(specialSymbols_.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
unordered_set<Rune> specialSymbols_;
|
|
||||||
}; // class PreFilter
|
}; // class PreFilter
|
||||||
|
|
||||||
} // namespace CppJieba
|
} // namespace CppJieba
|
||||||
|
@ -1,57 +1,56 @@
|
|||||||
#ifndef CPPJIEBA_SEGMENTBASE_H
|
#ifndef CPPJIEBA_SEGMENTBASE_H
|
||||||
#define CPPJIEBA_SEGMENTBASE_H
|
#define CPPJIEBA_SEGMENTBASE_H
|
||||||
|
|
||||||
#include "TransCode.hpp"
|
|
||||||
#include "limonp/Logger.hpp"
|
#include "limonp/Logger.hpp"
|
||||||
#include "limonp/NonCopyable.hpp"
|
#include "PreFilter.hpp"
|
||||||
#include "limonp/HandyMacro.hpp"
|
|
||||||
#include "ISegment.hpp"
|
#include "ISegment.hpp"
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
|
||||||
|
|
||||||
namespace CppJieba {
|
namespace CppJieba {
|
||||||
|
|
||||||
|
//const char* const SPECIAL_CHARS = " \t\n,。";
|
||||||
|
const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 65292u, 12290u};
|
||||||
|
|
||||||
using namespace limonp;
|
using namespace limonp;
|
||||||
|
|
||||||
//const char* const SPECIAL_CHARS = " \t\n";
|
class SegmentBase: public ISegment {
|
||||||
#ifndef CPPJIEBA_GBK
|
|
||||||
const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
|
|
||||||
#else
|
|
||||||
const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
class SegmentBase: public ISegment, public NonCopyable {
|
|
||||||
public:
|
public:
|
||||||
SegmentBase() {
|
SegmentBase() {
|
||||||
LoadSpecialSymbols();
|
|
||||||
}
|
}
|
||||||
virtual ~SegmentBase() {
|
virtual ~SegmentBase() {
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
|
virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
|
||||||
virtual bool cut(const string& str, vector<string>& res) const {
|
virtual bool cut(const string& str, vector<string>& res) const {
|
||||||
|
PreFilter pre_filter(symbols_, str);
|
||||||
|
PreFilter::Range range;
|
||||||
res.clear();
|
res.clear();
|
||||||
|
while (pre_filter.HasNext()) {
|
||||||
Unicode unicode;
|
range = pre_filter.Next();
|
||||||
unicode.reserve(str.size());
|
cut(range.begin, range.end, res);
|
||||||
|
|
||||||
TransCode::decode(str, unicode);
|
|
||||||
|
|
||||||
Unicode::const_iterator left = unicode.begin();
|
|
||||||
Unicode::const_iterator right;
|
|
||||||
|
|
||||||
for(right = unicode.begin(); right != unicode.end(); right++) {
|
|
||||||
if(isIn(specialSymbols_, *right)) {
|
|
||||||
if(left != right) {
|
|
||||||
cut(left, right, res);
|
|
||||||
}
|
|
||||||
res.resize(res.size() + 1);
|
|
||||||
TransCode::encode(right, right + 1, res.back());
|
|
||||||
left = right + 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(left != right) {
|
|
||||||
cut(left, right, res);
|
|
||||||
}
|
}
|
||||||
|
//Unicode unicode;
|
||||||
|
//unicode.reserve(str.size());
|
||||||
|
|
||||||
|
//TransCode::decode(str, unicode);
|
||||||
|
|
||||||
|
//Unicode::const_iterator left = unicode.begin();
|
||||||
|
//Unicode::const_iterator right;
|
||||||
|
|
||||||
|
//for(right = unicode.begin(); right != unicode.end(); right++) {
|
||||||
|
// if(isIn(specialSymbols_, *right)) {
|
||||||
|
// if(left != right) {
|
||||||
|
// cut(left, right, res);
|
||||||
|
// }
|
||||||
|
// res.resize(res.size() + 1);
|
||||||
|
// TransCode::encode(right, right + 1, res.back());
|
||||||
|
// left = right + 1;
|
||||||
|
// }
|
||||||
|
//}
|
||||||
|
//if(left != right) {
|
||||||
|
// cut(left, right, res);
|
||||||
|
//}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -66,19 +65,20 @@ class SegmentBase: public ISegment, public NonCopyable {
|
|||||||
res.resize(res.size() + uRes.size());
|
res.resize(res.size() + uRes.size());
|
||||||
for(size_t i = 0; i < uRes.size(); i ++, offset++) {
|
for(size_t i = 0; i < uRes.size(); i ++, offset++) {
|
||||||
TransCode::encode(uRes[i], res[offset]);
|
TransCode::encode(uRes[i], res[offset]);
|
||||||
|
cout << __FILE__ << __LINE__ << endl;
|
||||||
|
cout << res[offset] << endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
void LoadSpecialSymbols() {
|
void LoadSpecialSymbols() {
|
||||||
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
||||||
for(size_t i = 0; i < size; i ++) {
|
for(size_t i = 0; i < size; i ++) {
|
||||||
specialSymbols_.insert(SPECIAL_SYMBOL[i]);
|
symbols_.insert(SPECIAL_SYMBOL[i]);
|
||||||
}
|
}
|
||||||
assert(specialSymbols_.size());
|
assert(symbols_.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
unordered_set<Rune> specialSymbols_;
|
unordered_set<Rune> symbols_;
|
||||||
|
|
||||||
}; // class SegmentBase
|
}; // class SegmentBase
|
||||||
|
|
||||||
} // CppJieba
|
} // CppJieba
|
||||||
|
@ -4,15 +4,39 @@
|
|||||||
using namespace CppJieba;
|
using namespace CppJieba;
|
||||||
|
|
||||||
TEST(PreFilterTest, Test1) {
|
TEST(PreFilterTest, Test1) {
|
||||||
PreFilter filter;
|
unordered_set<Rune> symbol;
|
||||||
filter.Reset("你好,美丽的,世界");
|
symbol.insert(65292u); // ","
|
||||||
const char* expected[] = {"你好", ",", "美丽的", ",", "世界"};
|
symbol.insert(12290u); // "。"
|
||||||
ASSERT_TRUE(filter.HasNext());
|
string expected;
|
||||||
vector<string> words;
|
string res;
|
||||||
while (filter.HasNext()) {
|
|
||||||
PreFilter::Range range;
|
{
|
||||||
range = filter.Next();
|
PreFilter filter(symbol, "你好,美丽的,世界");
|
||||||
words.push_back(TransCode::encode(range.begin, range.end));
|
expected = "你好/,/美丽的/,/世界";
|
||||||
|
ASSERT_TRUE(filter.HasNext());
|
||||||
|
vector<string> words;
|
||||||
|
while (filter.HasNext()) {
|
||||||
|
PreFilter::Range range;
|
||||||
|
range = filter.Next();
|
||||||
|
words.push_back(TransCode::encode(range.begin, range.end));
|
||||||
|
}
|
||||||
|
res = join(words.begin(), words.end(), "/");
|
||||||
|
ASSERT_EQ(res, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
PreFilter filter(symbol, "我来自北京邮电大学。。。学号123456,用AK47");
|
||||||
|
expected = "我来自北京邮电大学/。/。/。/学号123456/,/用AK47";
|
||||||
|
ASSERT_TRUE(filter.HasNext());
|
||||||
|
vector<string> words;
|
||||||
|
while (filter.HasNext()) {
|
||||||
|
PreFilter::Range range;
|
||||||
|
range = filter.Next();
|
||||||
|
words.push_back(TransCode::encode(range.begin, range.end));
|
||||||
|
}
|
||||||
|
res = join(words.begin(), words.end(), "/");
|
||||||
|
for (size_t i = 0; i < words.size(); i++) {
|
||||||
|
}
|
||||||
|
ASSERT_EQ(res, expected);
|
||||||
}
|
}
|
||||||
ASSERT_EQ(vector<string>(expected, expected + sizeof(expected)/sizeof(*expected)), words);
|
|
||||||
}
|
}
|
||||||
|
@ -18,6 +18,7 @@ TEST(MixSegmentTest, Test1) {
|
|||||||
const char* res2[] = {"B超"," ", "T恤"};
|
const char* res2[] = {"B超"," ", "T恤"};
|
||||||
vector<string> words;
|
vector<string> words;
|
||||||
ASSERT_TRUE(segment.cut(str, words));
|
ASSERT_TRUE(segment.cut(str, words));
|
||||||
|
cout << words << endl;
|
||||||
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
|
||||||
ASSERT_TRUE(segment.cut(str2, words));
|
ASSERT_TRUE(segment.cut(str2, words));
|
||||||
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
|
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
|
||||||
|
Loading…
x
Reference in New Issue
Block a user