diff --git a/eland/operations.py b/eland/operations.py index 680de46..b888f86 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -68,9 +68,11 @@ class Operations: task = ('tail', (index.sort_field, n)) self._tasks.append(task) - def arithmetic_op_fields(self, field_name, op_name, left_field, right_field): - task = ('arithmetic_op_fields', (field_name, (op_name, (left_field, right_field)))) - + def arithmetic_op_fields(self, field_name, op_name, left_field, right_field, op_type=None): + if op_type: + task = ('arithmetic_op_fields', (field_name, (op_name, (left_field, right_field))), op_type) + else: + task = ('arithmetic_op_fields', (field_name, (op_name, (left_field, right_field)))) # Set this as a column we want to retrieve self.set_field_names([field_name]) @@ -907,101 +909,163 @@ class Operations: left_field = item[1][1][1][0] right_field = item[1][1][1][1] + try: + op_type = item[2] + except IndexError: + op_type = None + # https://www.elastic.co/guide/en/elasticsearch/painless/current/painless-api-reference-shared-java-lang.html#painless-api-reference-shared-Math - if isinstance(left_field, str) and isinstance(right_field, str): - """ - (if op_name = '__truediv__') - - "script_fields": { - "field_name": { - "script": { - "source": "doc[left_field].value / doc[right_field].value" - } - } - } - """ - if op_name == '__add__': - source = "doc['{0}'].value + doc['{1}'].value".format(left_field, right_field) - elif op_name == '__truediv__': - source = "doc['{0}'].value / doc['{1}'].value".format(left_field, right_field) - elif op_name == '__floordiv__': - source = "Math.floor(doc['{0}'].value / doc['{1}'].value)".format(left_field, right_field) - elif op_name == '__pow__': - source = "Math.pow(doc['{0}'].value, doc['{1}'].value)".format(left_field, right_field) - elif op_name == '__mod__': - source = "doc['{0}'].value % doc['{1}'].value".format(left_field, right_field) - elif op_name == '__mul__': - source = "doc['{0}'].value * doc['{1}'].value".format(left_field, right_field) - elif op_name == '__sub__': - source = "doc['{0}'].value - doc['{1}'].value".format(left_field, right_field) - else: - raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) + if not op_type: + if isinstance(left_field, str) and isinstance(right_field, str): + """ + (if op_name = '__truediv__') - if query_params['query_script_fields'] is None: - query_params['query_script_fields'] = {} - query_params['query_script_fields'][field_name] = { - 'script': { - 'source': source + "script_fields": { + "field_name": { + "script": { + "source": "doc[left_field].value / doc[right_field].value" + } + } } - } - elif isinstance(left_field, str) and np.issubdtype(np.dtype(type(right_field)), np.number): - """ - (if op_name = '__truediv__') + """ + if op_name == '__add__': + source = "doc['{0}'].value + doc['{1}'].value".format(left_field, right_field) + elif op_name == '__truediv__': + source = "doc['{0}'].value / doc['{1}'].value".format(left_field, right_field) + elif op_name == '__floordiv__': + source = "Math.floor(doc['{0}'].value / doc['{1}'].value)".format(left_field, right_field) + elif op_name == '__pow__': + source = "Math.pow(doc['{0}'].value, doc['{1}'].value)".format(left_field, right_field) + elif op_name == '__mod__': + source = "doc['{0}'].value % doc['{1}'].value".format(left_field, right_field) + elif op_name == '__mul__': + source = "doc['{0}'].value * doc['{1}'].value".format(left_field, right_field) + elif op_name == '__sub__': + source = "doc['{0}'].value - doc['{1}'].value".format(left_field, right_field) + else: + raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) - "script_fields": { - "field_name": { - "script": { - "source": "doc[left_field].value / right_field" - } + if query_params['query_script_fields'] is None: + query_params['query_script_fields'] = {} + query_params['query_script_fields'][field_name] = { + 'script': { + 'source': source + } } - } - """ - if op_name == '__add__': - source = "doc['{0}'].value + {1}".format(left_field, right_field) - elif op_name == '__truediv__': - source = "doc['{0}'].value / {1}".format(left_field, right_field) - elif op_name == '__floordiv__': - source = "Math.floor(doc['{0}'].value / {1})".format(left_field, right_field) - elif op_name == '__pow__': - source = "Math.pow(doc['{0}'].value, {1})".format(left_field, right_field) - elif op_name == '__mod__': - source = "doc['{0}'].value % {1}".format(left_field, right_field) - elif op_name == '__mul__': - source = "doc['{0}'].value * {1}".format(left_field, right_field) - elif op_name == '__sub__': - source = "doc['{0}'].value - {1}".format(left_field, right_field) - else: - raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) - elif np.issubdtype(np.dtype(type(left_field)), np.number) and isinstance(right_field, str): - """ - (if op_name = '__truediv__') + elif isinstance(left_field, str) and np.issubdtype(np.dtype(type(right_field)), np.number): + """ + (if op_name = '__truediv__') - "script_fields": { - "field_name": { - "script": { - "source": "left_field / doc['right_field'].value" - } + "script_fields": { + "field_name": { + "script": { + "source": "doc[left_field].value / right_field" + } + } } - } - """ - if op_name == '__add__': - source = "{0} + doc['{1}'].value".format(left_field, right_field) - elif op_name == '__truediv__': - source = "{0} / doc['{1}'].value".format(left_field, right_field) - elif op_name == '__floordiv__': - source = "Math.floor({0} / doc['{1}'].value)".format(left_field, right_field) - elif op_name == '__pow__': - source = "Math.pow({0}, doc['{1}'].value)".format(left_field, right_field) - elif op_name == '__mod__': - source = "{0} % doc['{1}'].value".format(left_field, right_field) - elif op_name == '__mul__': - source = "{0} * doc['{1}'].value".format(left_field, right_field) - elif op_name == '__sub__': - source = "{0} - doc['{1}'].value".format(left_field, right_field) + """ + if op_name == '__add__': + source = "doc['{0}'].value + {1}".format(left_field, right_field) + elif op_name == '__truediv__': + source = "doc['{0}'].value / {1}".format(left_field, right_field) + elif op_name == '__floordiv__': + source = "Math.floor(doc['{0}'].value / {1})".format(left_field, right_field) + elif op_name == '__pow__': + source = "Math.pow(doc['{0}'].value, {1})".format(left_field, right_field) + elif op_name == '__mod__': + source = "doc['{0}'].value % {1}".format(left_field, right_field) + elif op_name == '__mul__': + source = "doc['{0}'].value * {1}".format(left_field, right_field) + elif op_name == '__sub__': + source = "doc['{0}'].value - {1}".format(left_field, right_field) + else: + raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) + elif np.issubdtype(np.dtype(type(left_field)), np.number) and isinstance(right_field, str): + """ + (if op_name = '__truediv__') + + "script_fields": { + "field_name": { + "script": { + "source": "left_field / doc['right_field'].value" + } + } + } + """ + if op_name == '__add__': + source = "{0} + doc['{1}'].value".format(left_field, right_field) + elif op_name == '__truediv__': + source = "{0} / doc['{1}'].value".format(left_field, right_field) + elif op_name == '__floordiv__': + source = "Math.floor({0} / doc['{1}'].value)".format(left_field, right_field) + elif op_name == '__pow__': + source = "Math.pow({0}, doc['{1}'].value)".format(left_field, right_field) + elif op_name == '__mod__': + source = "{0} % doc['{1}'].value".format(left_field, right_field) + elif op_name == '__mul__': + source = "{0} * doc['{1}'].value".format(left_field, right_field) + elif op_name == '__sub__': + source = "{0} - doc['{1}'].value".format(left_field, right_field) + else: + raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) + else: - raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) - else: - raise TypeError("Types for operation inconsistent {} {} {}", type(left_field), type(right_field), op_name) + raise TypeError("Types for operation inconsistent {} {} {}", type(left_field), type(right_field), op_name) + + elif op_type[0] == "string": + # we need to check the type of string addition + if op_type[1] == "s": + """ + (if op_name = '__add__') + + "script_fields": { + "field_name": { + "script": { + "source": "doc[left_field].value + doc[right_field].value" + } + } + } + """ + if op_name == '__add__': + source = "doc['{0}'].value + doc['{1}'].value".format(left_field, right_field) + else: + raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) + + elif op_type[1] == "r": + if isinstance(left_field, str) and isinstance(right_field, str): + """ + (if op_name = '__add__') + + "script_fields": { + "field_name": { + "script": { + "source": "doc[left_field].value + right_field" + } + } + } + """ + if op_name == '__add__': + source = "doc['{0}'].value + '{1}'".format(left_field, right_field) + else: + raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) + + elif op_type[1] == 'l': + if isinstance(left_field, str) and isinstance(right_field, str): + """ + (if op_name = '__add__') + + "script_fields": { + "field_name": { + "script": { + "source": "left_field + doc[right_field].value" + } + } + } + """ + if op_name == '__add__': + source = "'{0}' + doc['{1}'].value".format(left_field, right_field) + else: + raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) if query_params['query_script_fields'] is None: query_params['query_script_fields'] = {} diff --git a/eland/query_compiler.py b/eland/query_compiler.py index 2cfc151..cc7ce6b 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -517,10 +517,44 @@ class ElandQueryCompiler: "{0} != {1}".format(self._index_pattern, right._index_pattern) ) - def arithmetic_op_fields(self, new_field_name, op, left_field, right_field): + def check_str_arithmetics(self, right, self_field, right_field): + """ + In the case of string arithmetics, we need an additional check to ensure that the + selected fields are aggregatable. + + Parameters + ---------- + right: ElandQueryCompiler + The query compiler to compare self to + + Raises + ------ + TypeError, ValueError + If string arithmetic operations aren't possible + """ + + # only check compatibility if right is an ElandQueryCompiler + # else return the raw string as the new field name + right_agg = {right_field: right_field} + if right: + self.check_arithmetics(right) + right_agg = right._mappings.aggregatable_field_names([right_field]) + + self_agg = self._mappings.aggregatable_field_names([self_field]) + + if self_agg and right_agg: + return list(self_agg.keys())[0], list(right_agg.keys())[0] + + else: + raise ValueError( + "Can not perform arithmetic operations on non aggregatable fields" + "One of [{}, {}] is not aggregatable.".format(self.name, right.name) + ) + + def arithmetic_op_fields(self, new_field_name, op, left_field, right_field, op_type=None): result = self.copy() - result._operations.arithmetic_op_fields(new_field_name, op, left_field, right_field) + result._operations.arithmetic_op_fields(new_field_name, op, left_field, right_field, op_type) return result diff --git a/eland/series.py b/eland/series.py index a9b4cac..caa64e3 100644 --- a/eland/series.py +++ b/eland/series.py @@ -503,8 +503,27 @@ class Series(NDFrame): 3 176.979996 4 82.980003 dtype: float64 + >>> df.customer_first_name + df.customer_last_name + 0 EddieUnderwood + 1 MaryBailey + 2 GwenButler + 3 DianeChandler + 4 EddieWeber + dtype: object + >>> "First name: " + df.customer_first_name + 0 First name: Eddie + 1 First name: Mary + 2 First name: Gwen + 3 First name: Diane + 4 First name: Eddie + Name: customer_first_name, dtype: object """ - return self._numeric_op(right, _get_method_name()) + if self._dtype == 'object': + op_type = ('string',) + else: + op_type = ('numeric',) + + return self._numeric_op(right, _get_method_name(), op_type) def __truediv__(self, right): """ @@ -770,7 +789,12 @@ class Series(NDFrame): 4 81.980003 Name: taxful_total_price, dtype: float64 """ - return self._numeric_rop(left, _get_method_name()) + if self._dtype == 'object': + op_type = ('string',) + else: + op_type = ('numeric',) + + return self._numeric_rop(left, _get_method_name(), op_type) def __rtruediv__(self, left): """ @@ -988,7 +1012,7 @@ class Series(NDFrame): rsubtract = __rsub__ rtruediv = __rtruediv__ - def _numeric_op(self, right, method_name): + def _numeric_op(self, right, method_name, op_type=None): """ return a op b @@ -1000,20 +1024,36 @@ class Series(NDFrame): # Check compatibility of Elasticsearch cluster self._query_compiler.check_arithmetics(right._query_compiler) - # Check compatibility of dtypes - # either not a number? - if not (np.issubdtype(self._dtype, np.number) and np.issubdtype(right._dtype, np.number)): + # check left numeric series and right numeric series + if (np.issubdtype(self._dtype, np.number) and np.issubdtype(right._dtype, np.number)): + new_field_name = "{0}_{1}_{2}".format(self.name, method_name, right.name) + + # Compatible, so create new Series + series = Series(query_compiler=self._query_compiler.arithmetic_op_fields( + new_field_name, method_name, self.name, right.name)) + series.name = None + + return series + + # check left object series and right object series + elif self._dtype == 'object' and right._dtype == 'object': + new_field_name = "{0}_{1}_{2}".format(self.name, method_name, right.name) + # our operation is between series + op_type = op_type + tuple('s') + # check if fields are aggregatable + self.name, right.name = self._query_compiler.check_str_arithmetics(right._query_compiler, self.name, right.name) + + series = Series(query_compiler=self._query_compiler.arithmetic_op_fields( + new_field_name, method_name, self.name, right.name, op_type)) + series.name = None + + return series + + else: # TODO - support limited ops on strings https://github.com/elastic/eland/issues/65 raise TypeError("Unsupported operation: '{}' {} '{}'".format(self._dtype, method_name, right._dtype)) - new_field_name = "{0}_{1}_{2}".format(self.name, method_name, right.name) - - # Compatible, so create new Series - series = Series(query_compiler=self._query_compiler.arithmetic_op_fields( - new_field_name, method_name, self.name, right.name)) - series.name = None - - return series + # check left number and right numeric series elif np.issubdtype(np.dtype(type(right)), np.number) and np.issubdtype(self._dtype, np.number): new_field_name = "{0}_{1}_{2}".format(self.name, method_name, str(right).replace('.', '_')) @@ -1025,13 +1065,30 @@ class Series(NDFrame): series.name = self.name return series + + # check left str series and right str + elif isinstance(right, str) and self._dtype == 'object': + new_field_name = "{0}_{1}_{2}".format(self.name, method_name, str(right).replace('.', '_')) + self.name, right = self._query_compiler.check_str_arithmetics(None, self.name, right) + # our operation is between a series and a string on the right + op_type = op_type + tuple('r') + # Compatible, so create new Series + series = Series(query_compiler=self._query_compiler.arithmetic_op_fields( + new_field_name, method_name, self.name, right, op_type)) + + # truncate last occurence of '.keyword' + new_series_name = self.name.rsplit('.keyword', 1)[0] + series.name = new_series_name + + return series + else: # TODO - support limited ops on strings https://github.com/elastic/eland/issues/65 raise TypeError( "unsupported operand type(s) for '{}' {} '{}'".format(type(self), method_name, type(right)) ) - def _numeric_rop(self, left, method_name): + def _numeric_rop(self, left, method_name, op_type=None): """ e.g. 1 + ed.Series """ @@ -1051,6 +1108,22 @@ class Series(NDFrame): series.name = self.name return series + + elif isinstance(left, str) and self._dtype == 'object': + new_field_name = "{0}_{1}_{2}".format(self.name, op_method_name, str(left).replace('.', '_')) + self.name, left = self._query_compiler.check_str_arithmetics(None, self.name, left) + # our operation is between a series and a string on the right + op_type = op_type + tuple('l') + # Compatible, so create new Series + series = Series(query_compiler=self._query_compiler.arithmetic_op_fields( + new_field_name, op_method_name, left, self.name, op_type)) + + # truncate last occurence of '.keyword' + new_series_name = self.name.rsplit('.keyword', 1)[0] + series.name = new_series_name + + return series + else: # TODO - support limited ops on strings https://github.com/elastic/eland/issues/65 raise TypeError( diff --git a/eland/tests/series/test_str_arithmetics_pytest.py b/eland/tests/series/test_str_arithmetics_pytest.py new file mode 100644 index 0000000..682d9ac --- /dev/null +++ b/eland/tests/series/test_str_arithmetics_pytest.py @@ -0,0 +1,40 @@ +# File called _pytest for PyCharm compatability +import pytest +import numpy as np + +from eland.tests.common import TestData, assert_pandas_eland_series_equal + + +class TestSeriesArithmetics(TestData): + + def test_invalid_add_num(self): + with pytest.raises(TypeError): + assert 2 + self.ed_ecommerce()['currency'] + + with pytest.raises(TypeError): + assert self.ed_ecommerce()['currency'] + 2 + + with pytest.raises(TypeError): + assert self.ed_ecommerce()['currency'] + self.ed_ecommerce()['total_quantity'] + + with pytest.raises(TypeError): + assert self.ed_ecommerce()['total_quantity'] + self.ed_ecommerce()['currency'] + + def test_str_add_ser(self): + + edadd = self.ed_ecommerce()['customer_first_name'] + self.ed_ecommerce()['customer_last_name'] + pdadd = self.pd_ecommerce()['customer_first_name'] + self.pd_ecommerce()['customer_last_name'] + + assert_pandas_eland_series_equal(pdadd, edadd) + + def test_ser_add_str(self): + edadd = self.ed_ecommerce()['customer_first_name'] + " is the first name." + pdadd = self.pd_ecommerce()['customer_first_name'] + " is the first name." + + assert_pandas_eland_series_equal(pdadd, edadd) + + def test_ser_add_ser(self): + edadd = "The last name is: " + self.ed_ecommerce()['customer_last_name'] + pdadd = "The last name is: " + self.pd_ecommerce()['customer_last_name'] + + assert_pandas_eland_series_equal(pdadd, edadd)