#  Licensed to Elasticsearch B.V. under one or more contributor
#  license agreements. See the NOTICE file distributed with
#  this work for additional information regarding copyright
#  ownership. Elasticsearch B.V. licenses this file to you under
#  the Apache License, Version 2.0 (the "License"); you may
#  not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied. See the License for the
#  specific language governing permissions and limitations
#  under the License.

import numpy as np
import pandas as pd

# File called _pytest for PyCharm compatibility
import pytest
from pandas.testing import assert_frame_equal, assert_series_equal

from tests.common import TestData, assert_almost_equal


class TestDataFrameMetrics(TestData):
    funcs = ["max", "min", "mean", "sum"]
    extended_funcs = ["median", "mad", "var", "std"]
    filter_data = [
        "AvgTicketPrice",
        "Cancelled",
        "dayOfWeek",
        "timestamp",
        "DestCountry",
    ]

    @pytest.mark.parametrize("numeric_only", [False, None])
    def test_flights_metrics(self, numeric_only):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

        for func in self.funcs:
            # Pandas v1.0 doesn't support mean() on datetime
            # Pandas and Eland don't support sum() on datetime
            if not numeric_only:
                dtype_include = (
                    [np.number, np.datetime64]
                    if func not in ("mean", "sum")
                    else [np.number]
                )
                pd_flights = pd_flights.select_dtypes(include=dtype_include)
                ed_flights = ed_flights.select_dtypes(include=dtype_include)

            pd_metric = getattr(pd_flights, func)(numeric_only=numeric_only)
            ed_metric = getattr(ed_flights, func)(numeric_only=numeric_only)

            assert_series_equal(pd_metric, ed_metric, check_dtype=False)

    def test_flights_extended_metrics(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

        # Test on reduced set of data for more consistent
        # median behaviour + better var, std test for sample vs population
        pd_flights = pd_flights[["AvgTicketPrice"]]
        ed_flights = ed_flights[["AvgTicketPrice"]]

        import logging

        logger = logging.getLogger("elasticsearch")
        logger.addHandler(logging.StreamHandler())
        logger.setLevel(logging.DEBUG)

        for func in self.extended_funcs:
            pd_metric = getattr(pd_flights, func)(
                **({"numeric_only": True} if func != "mad" else {})
            )
            ed_metric = getattr(ed_flights, func)(numeric_only=True)

            pd_value = pd_metric["AvgTicketPrice"]
            ed_value = ed_metric["AvgTicketPrice"]
            assert (ed_value * 0.9) <= pd_value <= (ed_value * 1.1)  # +/-10%

    def test_flights_extended_metrics_nan(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

        # Test on single row to test NaN behaviour of sample std/variance
        pd_flights_1 = pd_flights[pd_flights.FlightNum == "9HY9SWR"][
            ["AvgTicketPrice"]
        ]
        ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][
            ["AvgTicketPrice"]
        ]

        for func in self.extended_funcs:
            pd_metric = getattr(pd_flights_1, func)()
            ed_metric = getattr(ed_flights_1, func)(numeric_only=False)

            assert_series_equal(pd_metric, ed_metric, check_exact=False)

        # Test on zero rows to test NaN behaviour of sample std/variance
        pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
        ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]

        for func in self.extended_funcs:
            pd_metric = getattr(pd_flights_0, func)()
            ed_metric = getattr(ed_flights_0, func)(numeric_only=False)

            assert_series_equal(pd_metric, ed_metric, check_exact=False)

    def test_ecommerce_selected_non_numeric_source_fields(self):
        # None of these are numeric
        columns = [
            "category",
            "currency",
            "customer_birth_date",
            "customer_first_name",
            "user",
        ]
        pd_ecommerce = self.pd_ecommerce()[columns]
        ed_ecommerce = self.ed_ecommerce()[columns]

        for func in self.funcs:
            assert_series_equal(
                getattr(pd_ecommerce, func)(numeric_only=True),
                getattr(ed_ecommerce, func)(numeric_only=True),
                check_exact=False,
            )

    def test_ecommerce_selected_mixed_numeric_source_fields(self):
        # Some of these are numeric
        columns = [
            "category",
            "currency",
            "taxless_total_price",
            "customer_birth_date",
            "total_quantity",
            "customer_first_name",
            "user",
        ]
        pd_ecommerce = self.pd_ecommerce()[columns]
        ed_ecommerce = self.ed_ecommerce()[columns]

        for func in self.funcs:
            assert_series_equal(
                getattr(pd_ecommerce, func)(numeric_only=True),
                getattr(ed_ecommerce, func)(numeric_only=True),
                check_exact=False,
            )

    def test_ecommerce_selected_all_numeric_source_fields(self):
        # All of these are numeric
        columns = ["total_quantity", "taxful_total_price", "taxless_total_price"]

        pd_ecommerce = self.pd_ecommerce()[columns]
        ed_ecommerce = self.ed_ecommerce()[columns]

        for func in self.funcs:
            assert_series_equal(
                getattr(pd_ecommerce, func)(numeric_only=True),
                getattr(ed_ecommerce, func)(numeric_only=True),
                check_exact=False,
            )

    def test_flights_datetime_metrics_agg(self):
        ed_timestamps = self.ed_flights()[["timestamp"]]
        expected_values = {
            "max": pd.Timestamp("2018-02-11 23:50:12"),
            "min": pd.Timestamp("2018-01-01 00:00:00"),
            "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
            "sum": pd.NaT,
            "mad": pd.NaT,
            "var": pd.NaT,
            "std": pd.NaT,
            "nunique": 12236,
        }

        ed_metrics = ed_timestamps.agg(
            self.funcs + self.extended_funcs + ["nunique"], numeric_only=False
        )
        ed_metrics_dict = ed_metrics["timestamp"].to_dict()
        ed_metrics_dict.pop("median")  # Median is tested below.
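        # Compare each remaining aggregation against its expected value
        # (pd.NaT for aggregations that are not defined on datetimes).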
        for key, expected_value in expected_values.items():
            assert_almost_equal(ed_metrics_dict[key], expected_value)

    @pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"])
    def test_flights_datetime_metrics_single_agg(self, agg):
        ed_timestamps = self.ed_flights()[["timestamp"]]
        expected_values = {
            "min": pd.Timestamp("2018-01-01 00:00:00"),
            "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
            "max": pd.Timestamp("2018-02-11 23:50:12"),
            "nunique": 12236,
        }
        ed_metric = ed_timestamps.agg([agg])

        if agg == "nunique":
            # df with timestamp column should return int64
            assert ed_metric.dtypes["timestamp"] == np.int64
        else:
            # df with timestamp column should return datetime64[ns]
            assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]")
        assert_almost_equal(ed_metric["timestamp"][0], expected_values[agg])

    @pytest.mark.parametrize("agg", ["mean", "min", "max"])
    def test_flights_datetime_metrics_agg_func(self, agg):
        ed_timestamps = self.ed_flights()[["timestamp"]]
        expected_values = {
            "min": pd.Timestamp("2018-01-01 00:00:00"),
            "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
            "max": pd.Timestamp("2018-02-11 23:50:12"),
        }
        ed_metric = getattr(ed_timestamps, agg)(numeric_only=False)

        assert ed_metric.dtype == np.dtype("datetime64[ns]")
        assert_almost_equal(ed_metric[0], expected_values[agg])

    @pytest.mark.parametrize("agg", ["median", "quantile"])
    def test_flights_datetime_metrics_median_quantile(self, agg):
        ed_df = self.ed_flights_small()[["timestamp"]]

        median = ed_df.median(numeric_only=False)[0]
        assert isinstance(median, pd.Timestamp)
        assert (
            pd.to_datetime("2018-01-01 10:00:00.000")
            <= median
            <= pd.to_datetime("2018-01-01 12:00:00.000")
        )

        agg_value = ed_df.agg([agg])["timestamp"][0]
        assert isinstance(agg_value, pd.Timestamp)
        assert (
            pd.to_datetime("2018-01-01 10:00:00.000")
            <= agg_value
            <= pd.to_datetime("2018-01-01 12:00:00.000")
        )

    def test_metric_agg_keep_dtypes(self):
        # max, min and median maintain their dtypes
        df = self.ed_flights_small()[["AvgTicketPrice", "Cancelled", "dayOfWeek"]]

        assert df.min().tolist() == [131.81910705566406, False, 0]
        assert df.max().tolist() == [989.9527587890625, True, 0]
        assert df.median().tolist() == [550.276123046875, False, 0]

        all_agg = df.agg(["min", "max", "median"])
        assert all_agg.dtypes.tolist() == [
            np.dtype("float64"),
            np.dtype("bool"),
            np.dtype("int64"),
        ]
        assert all_agg.to_dict() == {
            "AvgTicketPrice": {
                "max": 989.9527587890625,
                "median": 550.276123046875,
                "min": 131.81910705566406,
            },
            "Cancelled": {"max": True, "median": False, "min": False},
            "dayOfWeek": {"max": 0, "median": 0, "min": 0},
        }

        # sum should always be the same dtype as the input,
        # except for bool where the sum of bools should be an int64.
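        # Verify both the resulting dtypes and the exact summed values.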
        sum_agg = df.agg(["sum"])
        assert sum_agg.dtypes.to_list() == [
            np.dtype("float64"),
            np.dtype("int64"),
            np.dtype("int64"),
        ]
        assert sum_agg.to_dict() == {
            "AvgTicketPrice": {"sum": 26521.624084472656},
            "Cancelled": {"sum": 6},
            "dayOfWeek": {"sum": 0},
        }

    def test_flights_numeric_only(self):
        # Check the data returned by all aggregations
        ed_flights = self.ed_flights().filter(self.filter_data)
        pd_flights = self.pd_flights().filter(self.filter_data)

        # agg with numeric_only=True returns float64 values.
        # We compare it against pandas' individual agg functions with numeric_only=True.
        filtered_aggs = self.funcs + self.extended_funcs
        agg_data = ed_flights.agg(filtered_aggs, numeric_only=True).transpose()

        for agg in filtered_aggs:
            # Explicitly check for mad because it returns NaN for bools
            if agg == "mad":
                assert np.isnan(agg_data[agg]["Cancelled"])
            else:
                assert_series_equal(
                    agg_data[agg].rename(None),
                    getattr(pd_flights, agg)(numeric_only=True).astype(float),
                    check_exact=False,
                    rtol=True,
                )

    # all single aggs return float64 for numeric_only=True
    def test_numeric_only_true_single_aggs(self):
        ed_flights = self.ed_flights().filter(self.filter_data)

        for agg in self.funcs + self.extended_funcs:
            result = getattr(ed_flights, agg)(numeric_only=True)
            assert result.dtype == np.dtype("float64")
            assert result.shape == ((3,) if agg != "mad" else (2,))

    # check dtypes and shape of min, max and median for numeric_only=False | None
    @pytest.mark.parametrize("agg", ["min", "max", "median"])
    @pytest.mark.parametrize("numeric_only", [False, None])
    def test_min_max_median_numeric_only(self, agg, numeric_only):
        ed_flights = self.ed_flights().filter(self.filter_data)

        if numeric_only is False:
            calculated_values = getattr(ed_flights, agg)(numeric_only=numeric_only)
            assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
            assert isinstance(calculated_values["Cancelled"], np.bool_)
            assert isinstance(calculated_values["dayOfWeek"], np.int64)
            assert isinstance(calculated_values["timestamp"], pd.Timestamp)
            assert np.isnan(calculated_values["DestCountry"])
            assert calculated_values.shape == (5,)
        elif numeric_only is None:
            calculated_values = getattr(ed_flights, agg)(numeric_only=numeric_only)
            assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
            assert isinstance(calculated_values["Cancelled"], np.bool_)
            assert isinstance(calculated_values["dayOfWeek"], np.int64)
            assert isinstance(calculated_values["timestamp"], pd.Timestamp)
            assert calculated_values.shape == (4,)

    # check dtypes and shape for sum
    @pytest.mark.parametrize("numeric_only", [False, None])
    def test_sum_numeric_only(self, numeric_only):
        ed_flights = self.ed_flights().filter(self.filter_data)

        if numeric_only is False:
            calculated_values = ed_flights.sum(numeric_only=numeric_only)
            assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
            assert isinstance(calculated_values["dayOfWeek"], np.int64)
            assert isinstance(calculated_values["Cancelled"], np.int64)
            assert pd.isnull(calculated_values["timestamp"])
            assert np.isnan(calculated_values["DestCountry"])
            assert calculated_values.shape == (5,)
        elif numeric_only is None:
            calculated_values = ed_flights.sum(numeric_only=numeric_only)
            dtype_list = [calculated_values[i].dtype for i in calculated_values.index]
            assert dtype_list == [
                np.dtype("float64"),
                np.dtype("int64"),
                np.dtype("int64"),
            ]
            assert calculated_values.shape == (3,)

    # check dtypes and shape for std
    @pytest.mark.parametrize("numeric_only", [False, None])
    def test_std_numeric_only(self, numeric_only):
        ed_flights = self.ed_flights().filter(self.filter_data)

        if numeric_only is False:
            calculated_values = ed_flights.std(numeric_only=numeric_only)
            assert isinstance(calculated_values["AvgTicketPrice"], float)
            assert isinstance(calculated_values["Cancelled"], float)
            assert isinstance(calculated_values["dayOfWeek"], float)
            assert pd.isnull(calculated_values["timestamp"])
            assert np.isnan(calculated_values["DestCountry"])
            assert calculated_values.shape == (5,)
        elif numeric_only is None:
            calculated_values = ed_flights.std(numeric_only=numeric_only)
            assert isinstance(calculated_values["AvgTicketPrice"], float)
            assert isinstance(calculated_values["Cancelled"], float)
            assert isinstance(calculated_values["dayOfWeek"], float)
            assert calculated_values.shape == (3,)

    # check dtypes and shape for var
    @pytest.mark.parametrize("numeric_only", [False, None])
    def test_var_numeric_only(self, numeric_only):
        ed_flights = self.ed_flights().filter(self.filter_data)

        if numeric_only is False:
            calculated_values = ed_flights.var(numeric_only=numeric_only)
            assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
            assert isinstance(calculated_values["dayOfWeek"], np.float64)
            assert isinstance(calculated_values["Cancelled"], np.float64)
            assert pd.isnull(calculated_values["timestamp"])
            assert np.isnan(calculated_values["DestCountry"])
            assert calculated_values.shape == (5,)
        elif numeric_only is None:
            calculated_values = ed_flights.var(numeric_only=numeric_only)
            assert isinstance(calculated_values["AvgTicketPrice"], float)
            assert isinstance(calculated_values["Cancelled"], float)
            assert isinstance(calculated_values["dayOfWeek"], float)
            assert calculated_values.shape == (3,)

    # check dtypes and shape for mean
    @pytest.mark.parametrize("numeric_only", [False, None])
    def test_mean_numeric_only(self, numeric_only):
        ed_flights = self.ed_flights().filter(self.filter_data)

        if numeric_only is False:
            calculated_values = ed_flights.mean(numeric_only=numeric_only)
            assert isinstance(calculated_values["AvgTicketPrice"], float)
            assert isinstance(calculated_values["dayOfWeek"], float)
            assert isinstance(calculated_values["Cancelled"], float)
            assert isinstance(calculated_values["timestamp"], pd.Timestamp)
            assert np.isnan(calculated_values["DestCountry"])
            assert calculated_values.shape == (5,)
        elif numeric_only is None:
            calculated_values = ed_flights.mean(numeric_only=numeric_only)
            assert isinstance(calculated_values["AvgTicketPrice"], float)
            assert isinstance(calculated_values["Cancelled"], float)
            assert isinstance(calculated_values["dayOfWeek"], float)
            assert isinstance(calculated_values["timestamp"], pd.Timestamp)
            assert calculated_values.shape == (4,)

    # check dtypes and shape for mad
    @pytest.mark.parametrize("numeric_only", [False, None])
    def test_mad_numeric_only(self, numeric_only):
        ed_flights = self.ed_flights().filter(self.filter_data)

        if numeric_only is False:
            calculated_values = ed_flights.mad(numeric_only=numeric_only)
            assert isinstance(calculated_values["AvgTicketPrice"], float)
            assert isinstance(calculated_values["Cancelled"], np.float64)
            assert isinstance(calculated_values["dayOfWeek"], float)
            assert pd.isnull(calculated_values["timestamp"])
            assert np.isnan(calculated_values["DestCountry"])
            assert calculated_values.shape == (5,)
        elif numeric_only is None:
            calculated_values = ed_flights.mad(numeric_only=numeric_only)
            assert isinstance(calculated_values["AvgTicketPrice"], float)
            assert isinstance(calculated_values["dayOfWeek"], float)
            assert calculated_values.shape == (2,)

    def test_aggs_count(self):
        pd_flights = self.pd_flights().filter(self.filter_data)
        ed_flights = self.ed_flights().filter(self.filter_data)

        pd_count = pd_flights.agg(["count"])
        ed_count = ed_flights.agg(["count"])

        assert_frame_equal(pd_count, ed_count)

    @pytest.mark.parametrize("numeric_only", [True, False])
    @pytest.mark.parametrize("es_size", [1, 2, 20, 100, 5000, 3000])
    def test_aggs_mode(self, es_size, numeric_only):
        # FlightNum has unique values, so we can test `fill` NaN/NaT for remaining columns
        pd_flights = self.pd_flights().filter(
            ["Cancelled", "dayOfWeek", "timestamp", "DestCountry", "FlightNum"]
        )
        ed_flights = self.ed_flights().filter(
            ["Cancelled", "dayOfWeek", "timestamp", "DestCountry", "FlightNum"]
        )

        pd_mode = pd_flights.mode(numeric_only=numeric_only)[:es_size]
        ed_mode = ed_flights.mode(numeric_only=numeric_only, es_size=es_size)

        # Skipping dtype check because eland reports the Cancelled dtype as bool
        # but pandas reports it as object
        assert_frame_equal(
            pd_mode, ed_mode, check_dtype=(False if es_size == 1 else True)
        )

    @pytest.mark.parametrize("quantiles", [[0.2, 0.5], [0, 1], [0.75, 0.2, 0.1, 0.5]])
    @pytest.mark.parametrize("numeric_only", [False, None])
    def test_flights_quantile(self, quantiles, numeric_only):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

        pd_quantile = pd_flights.filter(
            ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
        ).quantile(q=quantiles, numeric_only=numeric_only)
        ed_quantile = ed_flights.filter(
            ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
        ).quantile(q=quantiles, numeric_only=numeric_only)

        assert_frame_equal(pd_quantile, ed_quantile, check_exact=False, rtol=2)

        pd_quantile = pd_flights[["timestamp"]].quantile(
            q=quantiles, numeric_only=numeric_only
        )
        ed_quantile = ed_flights[["timestamp"]].quantile(
            q=quantiles, numeric_only=numeric_only
        )

        pd_timestamp = pd.to_numeric(pd_quantile.squeeze(), downcast="float")
        ed_timestamp = pd.to_numeric(ed_quantile.squeeze(), downcast="float")

        assert_series_equal(pd_timestamp, ed_timestamp, check_exact=False, rtol=2)

    @pytest.mark.parametrize("quantiles", [5, [2, 1], -1.5, [1.2, 0.2]])
    def test_flights_quantile_error(self, quantiles):
        ed_flights = self.ed_flights().filter(self.filter_data)

        match = f"quantile should be in range of 0 and 1, given {quantiles[0] if isinstance(quantiles, list) else quantiles}"
        with pytest.raises(ValueError, match=match):
            ed_flights[["timestamp"]].quantile(q=quantiles)

    @pytest.mark.parametrize("numeric_only", [True, False, None])
    def test_flights_agg_quantile(self, numeric_only):
        pd_flights = self.pd_flights().filter(
            ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
        )
        ed_flights = self.ed_flights().filter(
            ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
        )

        pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
        ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)

        assert_frame_equal(
            pd_quantile, ed_quantile, check_exact=False, rtol=4, check_dtype=False
        )

    def test_flights_idx_on_index(self):
        pd_flights = self.pd_flights().filter(
            ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
        )
        ed_flights = self.ed_flights().filter(
            ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
        )

        pd_idxmax = pd_flights.idxmax()
        ed_idxmax = ed_flights.idxmax()
        assert_series_equal(pd_idxmax, ed_idxmax)

        pd_idxmin = pd_flights.idxmin()
        ed_idxmin = ed_flights.idxmin()
        assert_series_equal(pd_idxmin, ed_idxmin)

    def test_flights_idx_on_columns(self):
        match = "This feature is not implemented yet for 'axis = 1'"
        with pytest.raises(NotImplementedError, match=match):
            ed_flights = self.ed_flights().filter(
                ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
            )
            ed_flights.idxmax(axis=1)