mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
String Arithmetics: __add__ ops (#68)
* adds support for __add__ ops for string objects and literals * adds tests for string arithmetic * updates comment in numeric field resolution * adds op_type parameter for numeric_ops
This commit is contained in:
parent
33f5495352
commit
a3dd86075a
@ -68,9 +68,11 @@ class Operations:
|
||||
task = ('tail', (index.sort_field, n))
|
||||
self._tasks.append(task)
|
||||
|
||||
def arithmetic_op_fields(self, field_name, op_name, left_field, right_field):
|
||||
task = ('arithmetic_op_fields', (field_name, (op_name, (left_field, right_field))))
|
||||
|
||||
def arithmetic_op_fields(self, field_name, op_name, left_field, right_field, op_type=None):
|
||||
if op_type:
|
||||
task = ('arithmetic_op_fields', (field_name, (op_name, (left_field, right_field))), op_type)
|
||||
else:
|
||||
task = ('arithmetic_op_fields', (field_name, (op_name, (left_field, right_field))))
|
||||
# Set this as a column we want to retrieve
|
||||
self.set_field_names([field_name])
|
||||
|
||||
@ -907,101 +909,163 @@ class Operations:
|
||||
left_field = item[1][1][1][0]
|
||||
right_field = item[1][1][1][1]
|
||||
|
||||
try:
|
||||
op_type = item[2]
|
||||
except IndexError:
|
||||
op_type = None
|
||||
|
||||
# https://www.elastic.co/guide/en/elasticsearch/painless/current/painless-api-reference-shared-java-lang.html#painless-api-reference-shared-Math
|
||||
if isinstance(left_field, str) and isinstance(right_field, str):
|
||||
"""
|
||||
(if op_name = '__truediv__')
|
||||
|
||||
"script_fields": {
|
||||
"field_name": {
|
||||
"script": {
|
||||
"source": "doc[left_field].value / doc[right_field].value"
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
if op_name == '__add__':
|
||||
source = "doc['{0}'].value + doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__truediv__':
|
||||
source = "doc['{0}'].value / doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__floordiv__':
|
||||
source = "Math.floor(doc['{0}'].value / doc['{1}'].value)".format(left_field, right_field)
|
||||
elif op_name == '__pow__':
|
||||
source = "Math.pow(doc['{0}'].value, doc['{1}'].value)".format(left_field, right_field)
|
||||
elif op_name == '__mod__':
|
||||
source = "doc['{0}'].value % doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__mul__':
|
||||
source = "doc['{0}'].value * doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__sub__':
|
||||
source = "doc['{0}'].value - doc['{1}'].value".format(left_field, right_field)
|
||||
else:
|
||||
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
|
||||
if not op_type:
|
||||
if isinstance(left_field, str) and isinstance(right_field, str):
|
||||
"""
|
||||
(if op_name = '__truediv__')
|
||||
|
||||
if query_params['query_script_fields'] is None:
|
||||
query_params['query_script_fields'] = {}
|
||||
query_params['query_script_fields'][field_name] = {
|
||||
'script': {
|
||||
'source': source
|
||||
"script_fields": {
|
||||
"field_name": {
|
||||
"script": {
|
||||
"source": "doc[left_field].value / doc[right_field].value"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
elif isinstance(left_field, str) and np.issubdtype(np.dtype(type(right_field)), np.number):
|
||||
"""
|
||||
(if op_name = '__truediv__')
|
||||
"""
|
||||
if op_name == '__add__':
|
||||
source = "doc['{0}'].value + doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__truediv__':
|
||||
source = "doc['{0}'].value / doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__floordiv__':
|
||||
source = "Math.floor(doc['{0}'].value / doc['{1}'].value)".format(left_field, right_field)
|
||||
elif op_name == '__pow__':
|
||||
source = "Math.pow(doc['{0}'].value, doc['{1}'].value)".format(left_field, right_field)
|
||||
elif op_name == '__mod__':
|
||||
source = "doc['{0}'].value % doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__mul__':
|
||||
source = "doc['{0}'].value * doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__sub__':
|
||||
source = "doc['{0}'].value - doc['{1}'].value".format(left_field, right_field)
|
||||
else:
|
||||
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
|
||||
|
||||
"script_fields": {
|
||||
"field_name": {
|
||||
"script": {
|
||||
"source": "doc[left_field].value / right_field"
|
||||
}
|
||||
if query_params['query_script_fields'] is None:
|
||||
query_params['query_script_fields'] = {}
|
||||
query_params['query_script_fields'][field_name] = {
|
||||
'script': {
|
||||
'source': source
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
if op_name == '__add__':
|
||||
source = "doc['{0}'].value + {1}".format(left_field, right_field)
|
||||
elif op_name == '__truediv__':
|
||||
source = "doc['{0}'].value / {1}".format(left_field, right_field)
|
||||
elif op_name == '__floordiv__':
|
||||
source = "Math.floor(doc['{0}'].value / {1})".format(left_field, right_field)
|
||||
elif op_name == '__pow__':
|
||||
source = "Math.pow(doc['{0}'].value, {1})".format(left_field, right_field)
|
||||
elif op_name == '__mod__':
|
||||
source = "doc['{0}'].value % {1}".format(left_field, right_field)
|
||||
elif op_name == '__mul__':
|
||||
source = "doc['{0}'].value * {1}".format(left_field, right_field)
|
||||
elif op_name == '__sub__':
|
||||
source = "doc['{0}'].value - {1}".format(left_field, right_field)
|
||||
else:
|
||||
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
|
||||
elif np.issubdtype(np.dtype(type(left_field)), np.number) and isinstance(right_field, str):
|
||||
"""
|
||||
(if op_name = '__truediv__')
|
||||
elif isinstance(left_field, str) and np.issubdtype(np.dtype(type(right_field)), np.number):
|
||||
"""
|
||||
(if op_name = '__truediv__')
|
||||
|
||||
"script_fields": {
|
||||
"field_name": {
|
||||
"script": {
|
||||
"source": "left_field / doc['right_field'].value"
|
||||
}
|
||||
"script_fields": {
|
||||
"field_name": {
|
||||
"script": {
|
||||
"source": "doc[left_field].value / right_field"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
if op_name == '__add__':
|
||||
source = "{0} + doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__truediv__':
|
||||
source = "{0} / doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__floordiv__':
|
||||
source = "Math.floor({0} / doc['{1}'].value)".format(left_field, right_field)
|
||||
elif op_name == '__pow__':
|
||||
source = "Math.pow({0}, doc['{1}'].value)".format(left_field, right_field)
|
||||
elif op_name == '__mod__':
|
||||
source = "{0} % doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__mul__':
|
||||
source = "{0} * doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__sub__':
|
||||
source = "{0} - doc['{1}'].value".format(left_field, right_field)
|
||||
"""
|
||||
if op_name == '__add__':
|
||||
source = "doc['{0}'].value + {1}".format(left_field, right_field)
|
||||
elif op_name == '__truediv__':
|
||||
source = "doc['{0}'].value / {1}".format(left_field, right_field)
|
||||
elif op_name == '__floordiv__':
|
||||
source = "Math.floor(doc['{0}'].value / {1})".format(left_field, right_field)
|
||||
elif op_name == '__pow__':
|
||||
source = "Math.pow(doc['{0}'].value, {1})".format(left_field, right_field)
|
||||
elif op_name == '__mod__':
|
||||
source = "doc['{0}'].value % {1}".format(left_field, right_field)
|
||||
elif op_name == '__mul__':
|
||||
source = "doc['{0}'].value * {1}".format(left_field, right_field)
|
||||
elif op_name == '__sub__':
|
||||
source = "doc['{0}'].value - {1}".format(left_field, right_field)
|
||||
else:
|
||||
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
|
||||
elif np.issubdtype(np.dtype(type(left_field)), np.number) and isinstance(right_field, str):
|
||||
"""
|
||||
(if op_name = '__truediv__')
|
||||
|
||||
"script_fields": {
|
||||
"field_name": {
|
||||
"script": {
|
||||
"source": "left_field / doc['right_field'].value"
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
if op_name == '__add__':
|
||||
source = "{0} + doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__truediv__':
|
||||
source = "{0} / doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__floordiv__':
|
||||
source = "Math.floor({0} / doc['{1}'].value)".format(left_field, right_field)
|
||||
elif op_name == '__pow__':
|
||||
source = "Math.pow({0}, doc['{1}'].value)".format(left_field, right_field)
|
||||
elif op_name == '__mod__':
|
||||
source = "{0} % doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__mul__':
|
||||
source = "{0} * doc['{1}'].value".format(left_field, right_field)
|
||||
elif op_name == '__sub__':
|
||||
source = "{0} - doc['{1}'].value".format(left_field, right_field)
|
||||
else:
|
||||
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
|
||||
|
||||
else:
|
||||
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
|
||||
else:
|
||||
raise TypeError("Types for operation inconsistent {} {} {}", type(left_field), type(right_field), op_name)
|
||||
raise TypeError("Types for operation inconsistent {} {} {}", type(left_field), type(right_field), op_name)
|
||||
|
||||
elif op_type[0] == "string":
|
||||
# we need to check the type of string addition
|
||||
if op_type[1] == "s":
|
||||
"""
|
||||
(if op_name = '__add__')
|
||||
|
||||
"script_fields": {
|
||||
"field_name": {
|
||||
"script": {
|
||||
"source": "doc[left_field].value + doc[right_field].value"
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
if op_name == '__add__':
|
||||
source = "doc['{0}'].value + doc['{1}'].value".format(left_field, right_field)
|
||||
else:
|
||||
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
|
||||
|
||||
elif op_type[1] == "r":
|
||||
if isinstance(left_field, str) and isinstance(right_field, str):
|
||||
"""
|
||||
(if op_name = '__add__')
|
||||
|
||||
"script_fields": {
|
||||
"field_name": {
|
||||
"script": {
|
||||
"source": "doc[left_field].value + right_field"
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
if op_name == '__add__':
|
||||
source = "doc['{0}'].value + '{1}'".format(left_field, right_field)
|
||||
else:
|
||||
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
|
||||
|
||||
elif op_type[1] == 'l':
|
||||
if isinstance(left_field, str) and isinstance(right_field, str):
|
||||
"""
|
||||
(if op_name = '__add__')
|
||||
|
||||
"script_fields": {
|
||||
"field_name": {
|
||||
"script": {
|
||||
"source": "left_field + doc[right_field].value"
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
if op_name == '__add__':
|
||||
source = "'{0}' + doc['{1}'].value".format(left_field, right_field)
|
||||
else:
|
||||
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
|
||||
|
||||
if query_params['query_script_fields'] is None:
|
||||
query_params['query_script_fields'] = {}
|
||||
|
@ -517,10 +517,44 @@ class ElandQueryCompiler:
|
||||
"{0} != {1}".format(self._index_pattern, right._index_pattern)
|
||||
)
|
||||
|
||||
def arithmetic_op_fields(self, new_field_name, op, left_field, right_field):
|
||||
def check_str_arithmetics(self, right, self_field, right_field):
|
||||
"""
|
||||
In the case of string arithmetics, we need an additional check to ensure that the
|
||||
selected fields are aggregatable.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
right: ElandQueryCompiler
|
||||
The query compiler to compare self to
|
||||
|
||||
Raises
|
||||
------
|
||||
TypeError, ValueError
|
||||
If string arithmetic operations aren't possible
|
||||
"""
|
||||
|
||||
# only check compatibility if right is an ElandQueryCompiler
|
||||
# else return the raw string as the new field name
|
||||
right_agg = {right_field: right_field}
|
||||
if right:
|
||||
self.check_arithmetics(right)
|
||||
right_agg = right._mappings.aggregatable_field_names([right_field])
|
||||
|
||||
self_agg = self._mappings.aggregatable_field_names([self_field])
|
||||
|
||||
if self_agg and right_agg:
|
||||
return list(self_agg.keys())[0], list(right_agg.keys())[0]
|
||||
|
||||
else:
|
||||
raise ValueError(
|
||||
"Can not perform arithmetic operations on non aggregatable fields"
|
||||
"One of [{}, {}] is not aggregatable.".format(self.name, right.name)
|
||||
)
|
||||
|
||||
def arithmetic_op_fields(self, new_field_name, op, left_field, right_field, op_type=None):
|
||||
result = self.copy()
|
||||
|
||||
result._operations.arithmetic_op_fields(new_field_name, op, left_field, right_field)
|
||||
result._operations.arithmetic_op_fields(new_field_name, op, left_field, right_field, op_type)
|
||||
|
||||
return result
|
||||
|
||||
|
103
eland/series.py
103
eland/series.py
@ -503,8 +503,27 @@ class Series(NDFrame):
|
||||
3 176.979996
|
||||
4 82.980003
|
||||
dtype: float64
|
||||
>>> df.customer_first_name + df.customer_last_name
|
||||
0 EddieUnderwood
|
||||
1 MaryBailey
|
||||
2 GwenButler
|
||||
3 DianeChandler
|
||||
4 EddieWeber
|
||||
dtype: object
|
||||
>>> "First name: " + df.customer_first_name
|
||||
0 First name: Eddie
|
||||
1 First name: Mary
|
||||
2 First name: Gwen
|
||||
3 First name: Diane
|
||||
4 First name: Eddie
|
||||
Name: customer_first_name, dtype: object
|
||||
"""
|
||||
return self._numeric_op(right, _get_method_name())
|
||||
if self._dtype == 'object':
|
||||
op_type = ('string',)
|
||||
else:
|
||||
op_type = ('numeric',)
|
||||
|
||||
return self._numeric_op(right, _get_method_name(), op_type)
|
||||
|
||||
def __truediv__(self, right):
|
||||
"""
|
||||
@ -770,7 +789,12 @@ class Series(NDFrame):
|
||||
4 81.980003
|
||||
Name: taxful_total_price, dtype: float64
|
||||
"""
|
||||
return self._numeric_rop(left, _get_method_name())
|
||||
if self._dtype == 'object':
|
||||
op_type = ('string',)
|
||||
else:
|
||||
op_type = ('numeric',)
|
||||
|
||||
return self._numeric_rop(left, _get_method_name(), op_type)
|
||||
|
||||
def __rtruediv__(self, left):
|
||||
"""
|
||||
@ -988,7 +1012,7 @@ class Series(NDFrame):
|
||||
rsubtract = __rsub__
|
||||
rtruediv = __rtruediv__
|
||||
|
||||
def _numeric_op(self, right, method_name):
|
||||
def _numeric_op(self, right, method_name, op_type=None):
|
||||
"""
|
||||
return a op b
|
||||
|
||||
@ -1000,20 +1024,36 @@ class Series(NDFrame):
|
||||
# Check compatibility of Elasticsearch cluster
|
||||
self._query_compiler.check_arithmetics(right._query_compiler)
|
||||
|
||||
# Check compatibility of dtypes
|
||||
# either not a number?
|
||||
if not (np.issubdtype(self._dtype, np.number) and np.issubdtype(right._dtype, np.number)):
|
||||
# check left numeric series and right numeric series
|
||||
if (np.issubdtype(self._dtype, np.number) and np.issubdtype(right._dtype, np.number)):
|
||||
new_field_name = "{0}_{1}_{2}".format(self.name, method_name, right.name)
|
||||
|
||||
# Compatible, so create new Series
|
||||
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
|
||||
new_field_name, method_name, self.name, right.name))
|
||||
series.name = None
|
||||
|
||||
return series
|
||||
|
||||
# check left object series and right object series
|
||||
elif self._dtype == 'object' and right._dtype == 'object':
|
||||
new_field_name = "{0}_{1}_{2}".format(self.name, method_name, right.name)
|
||||
# our operation is between series
|
||||
op_type = op_type + tuple('s')
|
||||
# check if fields are aggregatable
|
||||
self.name, right.name = self._query_compiler.check_str_arithmetics(right._query_compiler, self.name, right.name)
|
||||
|
||||
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
|
||||
new_field_name, method_name, self.name, right.name, op_type))
|
||||
series.name = None
|
||||
|
||||
return series
|
||||
|
||||
else:
|
||||
# TODO - support limited ops on strings https://github.com/elastic/eland/issues/65
|
||||
raise TypeError("Unsupported operation: '{}' {} '{}'".format(self._dtype, method_name, right._dtype))
|
||||
|
||||
new_field_name = "{0}_{1}_{2}".format(self.name, method_name, right.name)
|
||||
|
||||
# Compatible, so create new Series
|
||||
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
|
||||
new_field_name, method_name, self.name, right.name))
|
||||
series.name = None
|
||||
|
||||
return series
|
||||
# check left number and right numeric series
|
||||
elif np.issubdtype(np.dtype(type(right)), np.number) and np.issubdtype(self._dtype, np.number):
|
||||
new_field_name = "{0}_{1}_{2}".format(self.name, method_name, str(right).replace('.', '_'))
|
||||
|
||||
@ -1025,13 +1065,30 @@ class Series(NDFrame):
|
||||
series.name = self.name
|
||||
|
||||
return series
|
||||
|
||||
# check left str series and right str
|
||||
elif isinstance(right, str) and self._dtype == 'object':
|
||||
new_field_name = "{0}_{1}_{2}".format(self.name, method_name, str(right).replace('.', '_'))
|
||||
self.name, right = self._query_compiler.check_str_arithmetics(None, self.name, right)
|
||||
# our operation is between a series and a string on the right
|
||||
op_type = op_type + tuple('r')
|
||||
# Compatible, so create new Series
|
||||
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
|
||||
new_field_name, method_name, self.name, right, op_type))
|
||||
|
||||
# truncate last occurence of '.keyword'
|
||||
new_series_name = self.name.rsplit('.keyword', 1)[0]
|
||||
series.name = new_series_name
|
||||
|
||||
return series
|
||||
|
||||
else:
|
||||
# TODO - support limited ops on strings https://github.com/elastic/eland/issues/65
|
||||
raise TypeError(
|
||||
"unsupported operand type(s) for '{}' {} '{}'".format(type(self), method_name, type(right))
|
||||
)
|
||||
|
||||
def _numeric_rop(self, left, method_name):
|
||||
def _numeric_rop(self, left, method_name, op_type=None):
|
||||
"""
|
||||
e.g. 1 + ed.Series
|
||||
"""
|
||||
@ -1051,6 +1108,22 @@ class Series(NDFrame):
|
||||
series.name = self.name
|
||||
|
||||
return series
|
||||
|
||||
elif isinstance(left, str) and self._dtype == 'object':
|
||||
new_field_name = "{0}_{1}_{2}".format(self.name, op_method_name, str(left).replace('.', '_'))
|
||||
self.name, left = self._query_compiler.check_str_arithmetics(None, self.name, left)
|
||||
# our operation is between a series and a string on the right
|
||||
op_type = op_type + tuple('l')
|
||||
# Compatible, so create new Series
|
||||
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
|
||||
new_field_name, op_method_name, left, self.name, op_type))
|
||||
|
||||
# truncate last occurence of '.keyword'
|
||||
new_series_name = self.name.rsplit('.keyword', 1)[0]
|
||||
series.name = new_series_name
|
||||
|
||||
return series
|
||||
|
||||
else:
|
||||
# TODO - support limited ops on strings https://github.com/elastic/eland/issues/65
|
||||
raise TypeError(
|
||||
|
40
eland/tests/series/test_str_arithmetics_pytest.py
Normal file
40
eland/tests/series/test_str_arithmetics_pytest.py
Normal file
@ -0,0 +1,40 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
from eland.tests.common import TestData, assert_pandas_eland_series_equal
|
||||
|
||||
|
||||
class TestSeriesArithmetics(TestData):
|
||||
|
||||
def test_invalid_add_num(self):
|
||||
with pytest.raises(TypeError):
|
||||
assert 2 + self.ed_ecommerce()['currency']
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
assert self.ed_ecommerce()['currency'] + 2
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
assert self.ed_ecommerce()['currency'] + self.ed_ecommerce()['total_quantity']
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
assert self.ed_ecommerce()['total_quantity'] + self.ed_ecommerce()['currency']
|
||||
|
||||
def test_str_add_ser(self):
|
||||
|
||||
edadd = self.ed_ecommerce()['customer_first_name'] + self.ed_ecommerce()['customer_last_name']
|
||||
pdadd = self.pd_ecommerce()['customer_first_name'] + self.pd_ecommerce()['customer_last_name']
|
||||
|
||||
assert_pandas_eland_series_equal(pdadd, edadd)
|
||||
|
||||
def test_ser_add_str(self):
|
||||
edadd = self.ed_ecommerce()['customer_first_name'] + " is the first name."
|
||||
pdadd = self.pd_ecommerce()['customer_first_name'] + " is the first name."
|
||||
|
||||
assert_pandas_eland_series_equal(pdadd, edadd)
|
||||
|
||||
def test_ser_add_ser(self):
|
||||
edadd = "The last name is: " + self.ed_ecommerce()['customer_last_name']
|
||||
pdadd = "The last name is: " + self.pd_ecommerce()['customer_last_name']
|
||||
|
||||
assert_pandas_eland_series_equal(pdadd, edadd)
|
Loading…
x
Reference in New Issue
Block a user