String Arithmetics: __add__ ops (#68)

* adds support for __add__ ops for string objects and literals

* adds tests for string arithmetic

* updates comment in numeric field resolution

* adds op_type parameter for numeric_ops
This commit is contained in:
Michael Hirsch 2019-11-27 10:44:17 -05:00 committed by GitHub
parent 33f5495352
commit a3dd86075a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 318 additions and 107 deletions

View File

@ -68,9 +68,11 @@ class Operations:
task = ('tail', (index.sort_field, n))
self._tasks.append(task)
def arithmetic_op_fields(self, field_name, op_name, left_field, right_field):
task = ('arithmetic_op_fields', (field_name, (op_name, (left_field, right_field))))
def arithmetic_op_fields(self, field_name, op_name, left_field, right_field, op_type=None):
if op_type:
task = ('arithmetic_op_fields', (field_name, (op_name, (left_field, right_field))), op_type)
else:
task = ('arithmetic_op_fields', (field_name, (op_name, (left_field, right_field))))
# Set this as a column we want to retrieve
self.set_field_names([field_name])
@ -907,101 +909,163 @@ class Operations:
left_field = item[1][1][1][0]
right_field = item[1][1][1][1]
try:
op_type = item[2]
except IndexError:
op_type = None
# https://www.elastic.co/guide/en/elasticsearch/painless/current/painless-api-reference-shared-java-lang.html#painless-api-reference-shared-Math
if isinstance(left_field, str) and isinstance(right_field, str):
"""
(if op_name = '__truediv__')
if not op_type:
if isinstance(left_field, str) and isinstance(right_field, str):
"""
(if op_name = '__truediv__')
"script_fields": {
"field_name": {
"script": {
"source": "doc[left_field].value / doc[right_field].value"
}
"script_fields": {
"field_name": {
"script": {
"source": "doc[left_field].value / doc[right_field].value"
}
}
}
}
"""
if op_name == '__add__':
source = "doc['{0}'].value + doc['{1}'].value".format(left_field, right_field)
elif op_name == '__truediv__':
source = "doc['{0}'].value / doc['{1}'].value".format(left_field, right_field)
elif op_name == '__floordiv__':
source = "Math.floor(doc['{0}'].value / doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__pow__':
source = "Math.pow(doc['{0}'].value, doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__mod__':
source = "doc['{0}'].value % doc['{1}'].value".format(left_field, right_field)
elif op_name == '__mul__':
source = "doc['{0}'].value * doc['{1}'].value".format(left_field, right_field)
elif op_name == '__sub__':
source = "doc['{0}'].value - doc['{1}'].value".format(left_field, right_field)
"""
if op_name == '__add__':
source = "doc['{0}'].value + doc['{1}'].value".format(left_field, right_field)
elif op_name == '__truediv__':
source = "doc['{0}'].value / doc['{1}'].value".format(left_field, right_field)
elif op_name == '__floordiv__':
source = "Math.floor(doc['{0}'].value / doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__pow__':
source = "Math.pow(doc['{0}'].value, doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__mod__':
source = "doc['{0}'].value % doc['{1}'].value".format(left_field, right_field)
elif op_name == '__mul__':
source = "doc['{0}'].value * doc['{1}'].value".format(left_field, right_field)
elif op_name == '__sub__':
source = "doc['{0}'].value - doc['{1}'].value".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {}
query_params['query_script_fields'][field_name] = {
'script': {
'source': source
}
}
elif isinstance(left_field, str) and np.issubdtype(np.dtype(type(right_field)), np.number):
"""
(if op_name = '__truediv__')
"script_fields": {
"field_name": {
"script": {
"source": "doc[left_field].value / right_field"
}
}
}
"""
if op_name == '__add__':
source = "doc['{0}'].value + {1}".format(left_field, right_field)
elif op_name == '__truediv__':
source = "doc['{0}'].value / {1}".format(left_field, right_field)
elif op_name == '__floordiv__':
source = "Math.floor(doc['{0}'].value / {1})".format(left_field, right_field)
elif op_name == '__pow__':
source = "Math.pow(doc['{0}'].value, {1})".format(left_field, right_field)
elif op_name == '__mod__':
source = "doc['{0}'].value % {1}".format(left_field, right_field)
elif op_name == '__mul__':
source = "doc['{0}'].value * {1}".format(left_field, right_field)
elif op_name == '__sub__':
source = "doc['{0}'].value - {1}".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
elif np.issubdtype(np.dtype(type(left_field)), np.number) and isinstance(right_field, str):
"""
(if op_name = '__truediv__')
"script_fields": {
"field_name": {
"script": {
"source": "left_field / doc['right_field'].value"
}
}
}
"""
if op_name == '__add__':
source = "{0} + doc['{1}'].value".format(left_field, right_field)
elif op_name == '__truediv__':
source = "{0} / doc['{1}'].value".format(left_field, right_field)
elif op_name == '__floordiv__':
source = "Math.floor({0} / doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__pow__':
source = "Math.pow({0}, doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__mod__':
source = "{0} % doc['{1}'].value".format(left_field, right_field)
elif op_name == '__mul__':
source = "{0} * doc['{1}'].value".format(left_field, right_field)
elif op_name == '__sub__':
source = "{0} - doc['{1}'].value".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
raise TypeError("Types for operation inconsistent {} {} {}", type(left_field), type(right_field), op_name)
if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {}
query_params['query_script_fields'][field_name] = {
'script': {
'source': source
}
}
elif isinstance(left_field, str) and np.issubdtype(np.dtype(type(right_field)), np.number):
"""
(if op_name = '__truediv__')
elif op_type[0] == "string":
# we need to check the type of string addition
if op_type[1] == "s":
"""
(if op_name = '__add__')
"script_fields": {
"field_name": {
"script": {
"source": "doc[left_field].value / right_field"
}
"script_fields": {
"field_name": {
"script": {
"source": "doc[left_field].value + doc[right_field].value"
}
}
}
}
"""
if op_name == '__add__':
source = "doc['{0}'].value + {1}".format(left_field, right_field)
elif op_name == '__truediv__':
source = "doc['{0}'].value / {1}".format(left_field, right_field)
elif op_name == '__floordiv__':
source = "Math.floor(doc['{0}'].value / {1})".format(left_field, right_field)
elif op_name == '__pow__':
source = "Math.pow(doc['{0}'].value, {1})".format(left_field, right_field)
elif op_name == '__mod__':
source = "doc['{0}'].value % {1}".format(left_field, right_field)
elif op_name == '__mul__':
source = "doc['{0}'].value * {1}".format(left_field, right_field)
elif op_name == '__sub__':
source = "doc['{0}'].value - {1}".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
elif np.issubdtype(np.dtype(type(left_field)), np.number) and isinstance(right_field, str):
"""
(if op_name = '__truediv__')
"""
if op_name == '__add__':
source = "doc['{0}'].value + doc['{1}'].value".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
"script_fields": {
"field_name": {
"script": {
"source": "left_field / doc['right_field'].value"
}
}
}
"""
if op_name == '__add__':
source = "{0} + doc['{1}'].value".format(left_field, right_field)
elif op_name == '__truediv__':
source = "{0} / doc['{1}'].value".format(left_field, right_field)
elif op_name == '__floordiv__':
source = "Math.floor({0} / doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__pow__':
source = "Math.pow({0}, doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__mod__':
source = "{0} % doc['{1}'].value".format(left_field, right_field)
elif op_name == '__mul__':
source = "{0} * doc['{1}'].value".format(left_field, right_field)
elif op_name == '__sub__':
source = "{0} - doc['{1}'].value".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
else:
raise TypeError("Types for operation inconsistent {} {} {}", type(left_field), type(right_field), op_name)
elif op_type[1] == "r":
if isinstance(left_field, str) and isinstance(right_field, str):
"""
(if op_name = '__add__')
"script_fields": {
"field_name": {
"script": {
"source": "doc[left_field].value + right_field"
}
}
}
"""
if op_name == '__add__':
source = "doc['{0}'].value + '{1}'".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
elif op_type[1] == 'l':
if isinstance(left_field, str) and isinstance(right_field, str):
"""
(if op_name = '__add__')
"script_fields": {
"field_name": {
"script": {
"source": "left_field + doc[right_field].value"
}
}
}
"""
if op_name == '__add__':
source = "'{0}' + doc['{1}'].value".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {}

View File

@ -517,10 +517,44 @@ class ElandQueryCompiler:
"{0} != {1}".format(self._index_pattern, right._index_pattern)
)
def arithmetic_op_fields(self, new_field_name, op, left_field, right_field):
def check_str_arithmetics(self, right, self_field, right_field):
"""
In the case of string arithmetics, we need an additional check to ensure that the
selected fields are aggregatable.
Parameters
----------
right: ElandQueryCompiler
The query compiler to compare self to
Raises
------
TypeError, ValueError
If string arithmetic operations aren't possible
"""
# only check compatibility if right is an ElandQueryCompiler
# else return the raw string as the new field name
right_agg = {right_field: right_field}
if right:
self.check_arithmetics(right)
right_agg = right._mappings.aggregatable_field_names([right_field])
self_agg = self._mappings.aggregatable_field_names([self_field])
if self_agg and right_agg:
return list(self_agg.keys())[0], list(right_agg.keys())[0]
else:
raise ValueError(
"Can not perform arithmetic operations on non aggregatable fields"
"One of [{}, {}] is not aggregatable.".format(self.name, right.name)
)
def arithmetic_op_fields(self, new_field_name, op, left_field, right_field, op_type=None):
result = self.copy()
result._operations.arithmetic_op_fields(new_field_name, op, left_field, right_field)
result._operations.arithmetic_op_fields(new_field_name, op, left_field, right_field, op_type)
return result

View File

@ -503,8 +503,27 @@ class Series(NDFrame):
3 176.979996
4 82.980003
dtype: float64
>>> df.customer_first_name + df.customer_last_name
0 EddieUnderwood
1 MaryBailey
2 GwenButler
3 DianeChandler
4 EddieWeber
dtype: object
>>> "First name: " + df.customer_first_name
0 First name: Eddie
1 First name: Mary
2 First name: Gwen
3 First name: Diane
4 First name: Eddie
Name: customer_first_name, dtype: object
"""
return self._numeric_op(right, _get_method_name())
if self._dtype == 'object':
op_type = ('string',)
else:
op_type = ('numeric',)
return self._numeric_op(right, _get_method_name(), op_type)
def __truediv__(self, right):
"""
@ -770,7 +789,12 @@ class Series(NDFrame):
4 81.980003
Name: taxful_total_price, dtype: float64
"""
return self._numeric_rop(left, _get_method_name())
if self._dtype == 'object':
op_type = ('string',)
else:
op_type = ('numeric',)
return self._numeric_rop(left, _get_method_name(), op_type)
def __rtruediv__(self, left):
"""
@ -988,7 +1012,7 @@ class Series(NDFrame):
rsubtract = __rsub__
rtruediv = __rtruediv__
def _numeric_op(self, right, method_name):
def _numeric_op(self, right, method_name, op_type=None):
"""
return a op b
@ -1000,20 +1024,36 @@ class Series(NDFrame):
# Check compatibility of Elasticsearch cluster
self._query_compiler.check_arithmetics(right._query_compiler)
# Check compatibility of dtypes
# either not a number?
if not (np.issubdtype(self._dtype, np.number) and np.issubdtype(right._dtype, np.number)):
# check left numeric series and right numeric series
if (np.issubdtype(self._dtype, np.number) and np.issubdtype(right._dtype, np.number)):
new_field_name = "{0}_{1}_{2}".format(self.name, method_name, right.name)
# Compatible, so create new Series
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
new_field_name, method_name, self.name, right.name))
series.name = None
return series
# check left object series and right object series
elif self._dtype == 'object' and right._dtype == 'object':
new_field_name = "{0}_{1}_{2}".format(self.name, method_name, right.name)
# our operation is between series
op_type = op_type + tuple('s')
# check if fields are aggregatable
self.name, right.name = self._query_compiler.check_str_arithmetics(right._query_compiler, self.name, right.name)
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
new_field_name, method_name, self.name, right.name, op_type))
series.name = None
return series
else:
# TODO - support limited ops on strings https://github.com/elastic/eland/issues/65
raise TypeError("Unsupported operation: '{}' {} '{}'".format(self._dtype, method_name, right._dtype))
new_field_name = "{0}_{1}_{2}".format(self.name, method_name, right.name)
# Compatible, so create new Series
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
new_field_name, method_name, self.name, right.name))
series.name = None
return series
# check left number and right numeric series
elif np.issubdtype(np.dtype(type(right)), np.number) and np.issubdtype(self._dtype, np.number):
new_field_name = "{0}_{1}_{2}".format(self.name, method_name, str(right).replace('.', '_'))
@ -1025,13 +1065,30 @@ class Series(NDFrame):
series.name = self.name
return series
# check left str series and right str
elif isinstance(right, str) and self._dtype == 'object':
new_field_name = "{0}_{1}_{2}".format(self.name, method_name, str(right).replace('.', '_'))
self.name, right = self._query_compiler.check_str_arithmetics(None, self.name, right)
# our operation is between a series and a string on the right
op_type = op_type + tuple('r')
# Compatible, so create new Series
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
new_field_name, method_name, self.name, right, op_type))
# truncate last occurence of '.keyword'
new_series_name = self.name.rsplit('.keyword', 1)[0]
series.name = new_series_name
return series
else:
# TODO - support limited ops on strings https://github.com/elastic/eland/issues/65
raise TypeError(
"unsupported operand type(s) for '{}' {} '{}'".format(type(self), method_name, type(right))
)
def _numeric_rop(self, left, method_name):
def _numeric_rop(self, left, method_name, op_type=None):
"""
e.g. 1 + ed.Series
"""
@ -1051,6 +1108,22 @@ class Series(NDFrame):
series.name = self.name
return series
elif isinstance(left, str) and self._dtype == 'object':
new_field_name = "{0}_{1}_{2}".format(self.name, op_method_name, str(left).replace('.', '_'))
self.name, left = self._query_compiler.check_str_arithmetics(None, self.name, left)
# our operation is between a series and a string on the right
op_type = op_type + tuple('l')
# Compatible, so create new Series
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
new_field_name, op_method_name, left, self.name, op_type))
# truncate last occurence of '.keyword'
new_series_name = self.name.rsplit('.keyword', 1)[0]
series.name = new_series_name
return series
else:
# TODO - support limited ops on strings https://github.com/elastic/eland/issues/65
raise TypeError(

View File

@ -0,0 +1,40 @@
# File called _pytest for PyCharm compatability
import pytest
import numpy as np
from eland.tests.common import TestData, assert_pandas_eland_series_equal
class TestSeriesArithmetics(TestData):
def test_invalid_add_num(self):
with pytest.raises(TypeError):
assert 2 + self.ed_ecommerce()['currency']
with pytest.raises(TypeError):
assert self.ed_ecommerce()['currency'] + 2
with pytest.raises(TypeError):
assert self.ed_ecommerce()['currency'] + self.ed_ecommerce()['total_quantity']
with pytest.raises(TypeError):
assert self.ed_ecommerce()['total_quantity'] + self.ed_ecommerce()['currency']
def test_str_add_ser(self):
edadd = self.ed_ecommerce()['customer_first_name'] + self.ed_ecommerce()['customer_last_name']
pdadd = self.pd_ecommerce()['customer_first_name'] + self.pd_ecommerce()['customer_last_name']
assert_pandas_eland_series_equal(pdadd, edadd)
def test_ser_add_str(self):
edadd = self.ed_ecommerce()['customer_first_name'] + " is the first name."
pdadd = self.pd_ecommerce()['customer_first_name'] + " is the first name."
assert_pandas_eland_series_equal(pdadd, edadd)
def test_ser_add_ser(self):
edadd = "The last name is: " + self.ed_ecommerce()['customer_last_name']
pdadd = "The last name is: " + self.pd_ecommerce()['customer_last_name']
assert_pandas_eland_series_equal(pdadd, edadd)