use PreFilter in SegmentBase

This commit is contained in:
yanyiwu 2015-09-13 16:05:17 +08:00
parent 0542dd1cfd
commit 28bcb3bf57
4 changed files with 90 additions and 68 deletions

View File

@ -5,7 +5,16 @@
namespace CppJieba {
const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
//class PreFilterIterator {
// public:
// PreFilterIterator() {
// }
// ~PreFilterIterator() {
// }
//
// private:
// const unordered_set<Rune>& specialSymbols_;
//}; // PreFilterIterator
class PreFilter {
public:
@ -14,17 +23,14 @@ class PreFilter {
Unicode::const_iterator end;
}; // struct Range
PreFilter() {
LoadSpecialSymbols();
}
~PreFilter() {
}
void Reset(const string& sentence) {
// Builds a filter over `sentence` that splits on the runes contained in `symbols`.
// `symbols` is stored by reference (symbols_ is a const reference member), so the
// caller's set must outlive this PreFilter; passing a temporary would dangle.
PreFilter(const unordered_set<Rune>& symbols,
const string& sentence)
: symbols_(symbols) {
// Decode the input string into sentence_ (a Unicode rune sequence).
// NOTE(review): the outcome of TransCode::decode is not checked here — confirm
// whether it can fail on malformed input and how that should be surfaced.
TransCode::decode(sentence, sentence_);
// Start iteration at the first decoded rune; must run after decode fills sentence_.
cursor_ = sentence_.begin();
}
~PreFilter() {
}
// Returns true while the cursor has not yet reached the end of the decoded
// sentence, i.e. there is still at least one rune left to hand out.
bool HasNext() const {
return cursor_ != sentence_.end();
}
@ -32,7 +38,7 @@ class PreFilter {
Range range;
range.begin = cursor_;
while (cursor_ != sentence_.end()) {
if (isIn(specialSymbols_, *cursor_)) {
if (isIn(symbols_, *cursor_)) {
if (range.begin == cursor_) {
cursor_ ++;
}
@ -45,18 +51,9 @@ class PreFilter {
return range;
}
private:
Unicode sentence_;
Unicode::const_iterator cursor_;
void LoadSpecialSymbols() {
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
for(size_t i = 0; i < size; i ++) {
specialSymbols_.insert(SPECIAL_SYMBOL[i]);
}
assert(specialSymbols_.size());
}
unordered_set<Rune> specialSymbols_;
Unicode sentence_;
const unordered_set<Rune>& symbols_;
}; // class PreFilter
} // namespace CppJieba

View File

@ -1,57 +1,56 @@
#ifndef CPPJIEBA_SEGMENTBASE_H
#define CPPJIEBA_SEGMENTBASE_H
#include "TransCode.hpp"
#include "limonp/Logger.hpp"
#include "limonp/NonCopyable.hpp"
#include "limonp/HandyMacro.hpp"
#include "PreFilter.hpp"
#include "ISegment.hpp"
#include <cassert>
namespace CppJieba {
//const char* const SPECIAL_CHARS = " \t\n。";
const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 65292u, 12290u};
using namespace limonp;
//const char* const SPECIAL_CHARS = " \t\n";
#ifndef CPPJIEBA_GBK
const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
#else
const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u};
#endif
class SegmentBase: public ISegment, public NonCopyable {
class SegmentBase: public ISegment {
public:
// Default constructor: loads the SPECIAL_SYMBOL runes into the member symbol set
// by calling LoadSpecialSymbols().
SegmentBase() {
LoadSpecialSymbols();
}
virtual ~SegmentBase() {
}
public:
virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const = 0;
virtual bool cut(const string& str, vector<string>& res) const {
PreFilter pre_filter(symbols_, str);
PreFilter::Range range;
res.clear();
Unicode unicode;
unicode.reserve(str.size());
TransCode::decode(str, unicode);
Unicode::const_iterator left = unicode.begin();
Unicode::const_iterator right;
for(right = unicode.begin(); right != unicode.end(); right++) {
if(isIn(specialSymbols_, *right)) {
if(left != right) {
cut(left, right, res);
}
res.resize(res.size() + 1);
TransCode::encode(right, right + 1, res.back());
left = right + 1;
}
}
if(left != right) {
cut(left, right, res);
while (pre_filter.HasNext()) {
range = pre_filter.Next();
cut(range.begin, range.end, res);
}
//Unicode unicode;
//unicode.reserve(str.size());
//TransCode::decode(str, unicode);
//Unicode::const_iterator left = unicode.begin();
//Unicode::const_iterator right;
//for(right = unicode.begin(); right != unicode.end(); right++) {
// if(isIn(specialSymbols_, *right)) {
// if(left != right) {
// cut(left, right, res);
// }
// res.resize(res.size() + 1);
// TransCode::encode(right, right + 1, res.back());
// left = right + 1;
// }
//}
//if(left != right) {
// cut(left, right, res);
//}
return true;
}
@ -66,19 +65,20 @@ class SegmentBase: public ISegment, public NonCopyable {
res.resize(res.size() + uRes.size());
for(size_t i = 0; i < uRes.size(); i ++, offset++) {
TransCode::encode(uRes[i], res[offset]);
cout << __FILE__ << __LINE__ << endl;
cout << res[offset] << endl;
}
}
private:
void LoadSpecialSymbols() {
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
for(size_t i = 0; i < size; i ++) {
specialSymbols_.insert(SPECIAL_SYMBOL[i]);
symbols_.insert(SPECIAL_SYMBOL[i]);
}
assert(specialSymbols_.size());
assert(symbols_.size());
}
unordered_set<Rune> specialSymbols_;
unordered_set<Rune> symbols_;
}; // class SegmentBase
} // CppJieba

View File

@ -4,9 +4,15 @@
using namespace CppJieba;
TEST(PreFilterTest, Test1) {
PreFilter filter;
filter.Reset("你好,美丽的,世界");
const char* expected[] = {"你好", "", "美丽的", "", "世界"};
unordered_set<Rune> symbol;
symbol.insert(65292u); // "," (U+FF0C, fullwidth comma)
symbol.insert(12290u); // "。"
string expected;
string res;
{
PreFilter filter(symbol, "你好,美丽的,世界");
expected = "你好//美丽的//世界";
ASSERT_TRUE(filter.HasNext());
vector<string> words;
while (filter.HasNext()) {
@ -14,5 +20,23 @@ TEST(PreFilterTest, Test1) {
range = filter.Next();
words.push_back(TransCode::encode(range.begin, range.end));
}
ASSERT_EQ(vector<string>(expected, expected + sizeof(expected)/sizeof(*expected)), words);
res = join(words.begin(), words.end(), "/");
ASSERT_EQ(res, expected);
}
{
PreFilter filter(symbol, "我来自北京邮电大学。。。学号123456用AK47");
expected = "我来自北京邮电大学/。/。/。/学号123456//用AK47";
ASSERT_TRUE(filter.HasNext());
vector<string> words;
while (filter.HasNext()) {
PreFilter::Range range;
range = filter.Next();
words.push_back(TransCode::encode(range.begin, range.end));
}
res = join(words.begin(), words.end(), "/");
for (size_t i = 0; i < words.size(); i++) {
}
ASSERT_EQ(res, expected);
}
}

View File

@ -18,6 +18,7 @@ TEST(MixSegmentTest, Test1) {
const char* res2[] = {"B超"," ", "T恤"};
vector<string> words;
ASSERT_TRUE(segment.cut(str, words));
cout << words << endl;
ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
ASSERT_TRUE(segment.cut(str2, words));
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));