From 4d96ad39fdff5101dc7ac361be1d11aace462d90 Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" <33659563+V1NAY8@users.noreply.github.com> Date: Tue, 22 Sep 2020 21:02:27 +0530 Subject: [PATCH] Switch agg defaults to numeric_only=None --- eland/dataframe.py | 56 +++- eland/field_mappings.py | 3 + eland/ndframe.py | 301 +++++++++++++------ eland/operations.py | 121 ++++---- eland/query_compiler.py | 58 ++-- eland/tests/dataframe/test_aggs_pytest.py | 30 +- eland/tests/dataframe/test_metrics_pytest.py | 218 ++++++++++++-- eland/tests/series/test_metrics_pytest.py | 6 +- 8 files changed, 576 insertions(+), 217 deletions(-) diff --git a/eland/dataframe.py b/eland/dataframe.py index f740687..bd25a75 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -19,7 +19,7 @@ import sys import warnings from io import StringIO import re -from typing import Optional, Sequence, Union, Tuple +from typing import Optional, Sequence, Union, Tuple, List import numpy as np import pandas as pd @@ -1328,7 +1328,14 @@ class DataFrame(NDFrame): """ return self.columns - def aggregate(self, func, axis=0, *args, **kwargs): + def aggregate( + self, + func: Union[str, List[str]], + axis: int = 0, + numeric_only: Optional[bool] = None, + *args, + **kwargs, + ) -> Union[pd.Series, pd.DataFrame]: """ Aggregate using one or more operations over the specified axis. @@ -1347,8 +1354,13 @@ class DataFrame(NDFrame): Currently, we only support ``['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', 'rank', 'sem', 'skew', 'sum', 'std', 'var']`` - axis + axis: int Currently, we only support axis=0 (index) + numeric_only: {True, False, None} Default is None Controls which columns are aggregated and the datatype of the result - True: returns only numeric columns, coerced to float64; other columns are dropped. - False: returns all columns; aggregations that don't apply to a column return NaN/NaT. - None: returns all columns with their default datatype; aggregations that don't apply return NaN/NaT. 
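As a quick aside before the remaining parameters (*args and **kwargs below): the three modes only differ in which columns survive and what dtype the results take. A minimal usage sketch against the same flights demo index used in the examples further down, assuming a local Elasticsearch with the sample data loaded:

import eland as ed

# Assumes the 'flights' sample index on a local cluster, as in the examples below.
df = ed.DataFrame(
    "localhost",
    "flights",
    columns=["AvgTicketPrice", "dayOfWeek", "timestamp", "DestCountry"],
)

df.aggregate(["sum", "min"], numeric_only=True)   # numeric columns only, coerced to float64
df.aggregate(["sum", "min"], numeric_only=False)  # every column; unsupported agg/column pairs are NaN/NaT
df.aggregate(["sum", "min"])                      # numeric_only=None, the new default: columns keep their own dtypes, NaN/NaT where not supported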
*args Positional arguments to pass to `func` **kwargs @@ -1368,12 +1380,30 @@ class DataFrame(NDFrame): Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') - >>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']).astype(int) - DistanceKilometers AvgTicketPrice - sum 92616288 8204364 - min 0 100 - std 4578 266 + >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'DistanceKilometers', 'timestamp', 'DestCountry']) + >>> df.aggregate(['sum', 'min', 'std'], numeric_only=True).astype(int) + AvgTicketPrice DistanceKilometers + sum 8204364 92616288 + min 100 0 + std 266 4578 + + >>> df.aggregate(['sum', 'min', 'std'], numeric_only=True) + AvgTicketPrice DistanceKilometers + sum 8.204365e+06 9.261629e+07 + min 1.000205e+02 0.000000e+00 + std 2.664071e+02 4.578614e+03 + + >>> df.aggregate(['sum', 'min', 'std'], numeric_only=False) + AvgTicketPrice DistanceKilometers timestamp DestCountry + sum 8.204365e+06 9.261629e+07 NaT NaN + min 1.000205e+02 0.000000e+00 2018-01-01 NaN + std 2.664071e+02 4.578614e+03 NaT NaN + + >>> df.aggregate(['sum', 'min', 'std'], numeric_only=None) + AvgTicketPrice DistanceKilometers timestamp DestCountry + sum 8.204365e+06 9.261629e+07 NaT NaN + min 1.000205e+02 0.000000e+00 2018-01-01 NaN + std 2.664071e+02 4.578614e+03 NaT NaN """ axis = pd.DataFrame._get_axis_number(axis) @@ -1387,10 +1417,14 @@ class DataFrame(NDFrame): # 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique'] if isinstance(func, str): # Wrap in list - return self._query_compiler.aggs([func]).squeeze().rename(None) + return ( + self._query_compiler.aggs([func], numeric_only=numeric_only) + .squeeze() + .rename(None) + ) elif is_list_like(func): # we have a list! - return self._query_compiler.aggs(func) + return self._query_compiler.aggs(func, numeric_only=numeric_only) agg = aggregate diff --git a/eland/field_mappings.py b/eland/field_mappings.py index dd7757b..32b71c4 100644 --- a/eland/field_mappings.py +++ b/eland/field_mappings.py @@ -100,6 +100,9 @@ class Field(NamedTuple): # Cardinality works for all types # Numerics and bools work for all aggs + # Except "median_absolute_deviation" which doesn't support bool + if es_agg == "median_absolute_deviation" and self.is_bool: + return False if es_agg == "cardinality" or self.is_numeric or self.is_bool: return True # Timestamps also work for 'min', 'max' and 'avg' diff --git a/eland/ndframe.py b/eland/ndframe.py index 16a01df..6659ee2 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -17,7 +17,7 @@ import sys from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING, Tuple, Optional import pandas as pd from eland.query_compiler import QueryCompiler @@ -162,12 +162,19 @@ class NDFrame(ABC): def _es_info(self, buf): self._query_compiler.es_info(buf) - def mean(self, numeric_only: bool = True) -> pd.Series: + def mean(self, numeric_only: Optional[bool] = None) -> pd.Series: """ Return mean value for each numeric column TODO - implement remainder of pandas arguments, currently non-numerics are not supported + Parameters + ---------- + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved Returns ------- pandas.Series @@ -179,27 +186,44 @@ class NDFrame(ABC): Examples -------- - 
>>> df = ed.DataFrame('localhost', 'flights') + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.mean() - AvgTicketPrice 628.253689 - Cancelled 0.128494 - DistanceKilometers 7092.142457 - DistanceMiles 4406.853010 - FlightDelay 0.251168 - FlightDelayMin 47.335171 - FlightTimeHour 8.518797 - FlightTimeMin 511.127842 - dayOfWeek 2.835975 + AvgTicketPrice 628.254 + Cancelled 0.128494 + dayOfWeek 2.83598 + timestamp 2018-01-21 19:20:45.564438232 + dtype: object + + >>> df.mean(numeric_only=True) + AvgTicketPrice 628.253689 + Cancelled 0.128494 + dayOfWeek 2.835975 dtype: float64 + + >>> df.mean(numeric_only=False) + AvgTicketPrice 628.254 + Cancelled 0.128494 + dayOfWeek 2.83598 + timestamp 2018-01-21 19:20:45.564438232 + DestCountry NaN + dtype: object """ return self._query_compiler.mean(numeric_only=numeric_only) - def sum(self, numeric_only: bool = True) -> pd.Series: + def sum(self, numeric_only: Optional[bool] = None) -> pd.Series: """ Return sum for each numeric column TODO - implement remainder of pandas arguments, currently non-numerics are not supported + Parameters + ---------- + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + Returns ------- pandas.Series @@ -211,27 +235,43 @@ class NDFrame(ABC): Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.sum() - AvgTicketPrice 8.204365e+06 - Cancelled 1.678000e+03 - DistanceKilometers 9.261629e+07 - DistanceMiles 5.754909e+07 - FlightDelay 3.280000e+03 - FlightDelayMin 6.181500e+05 - FlightTimeHour 1.112470e+05 - FlightTimeMin 6.674818e+06 - dayOfWeek 3.703500e+04 + AvgTicketPrice 8.20436e+06 + Cancelled 1678 + dayOfWeek 37035 + dtype: object + + >>> df.sum(numeric_only=True) + AvgTicketPrice 8.204365e+06 + Cancelled 1.678000e+03 + dayOfWeek 3.703500e+04 dtype: float64 + + >>> df.sum(numeric_only=False) + AvgTicketPrice 8.20436e+06 + Cancelled 1678 + dayOfWeek 37035 + timestamp NaT + DestCountry NaN + dtype: object """ return self._query_compiler.sum(numeric_only=numeric_only) - def min(self, numeric_only: bool = True) -> pd.Series: + def min(self, numeric_only: Optional[bool] = None) -> pd.Series: """ Return the minimum value for each numeric column TODO - implement remainder of pandas arguments, currently non-numerics are not supported + Parameters + ---------- + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + Returns ------- pandas.Series @@ -243,25 +283,42 @@ class NDFrame(ABC): Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.min() - AvgTicketPrice 100.021 - Cancelled False - DistanceKilometers 0 - DistanceMiles 0 - FlightDelay False - FlightDelayMin 0 - FlightTimeHour 0 - FlightTimeMin 0 - dayOfWeek 0 + 
AvgTicketPrice 100.021 + Cancelled False + dayOfWeek 0 + timestamp 2018-01-01 00:00:00 + dtype: object + + >>> df.min(numeric_only=True) + AvgTicketPrice 100.020531 + Cancelled 0.000000 + dayOfWeek 0.000000 + dtype: float64 + + >>> df.min(numeric_only=False) + AvgTicketPrice 100.021 + Cancelled False + dayOfWeek 0 + timestamp 2018-01-01 00:00:00 + DestCountry NaN dtype: object """ return self._query_compiler.min(numeric_only=numeric_only) - def var(self, numeric_only: bool = True) -> pd.Series: + def var(self, numeric_only: Optional[bool] = None) -> pd.Series: """ Return variance for each numeric column + Parameters + ---------- + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + Returns ------- pandas.Series @@ -273,25 +330,41 @@ class NDFrame(ABC): Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') - >>> df.var() # doctest: +SKIP - AvgTicketPrice 7.096185e+04 - Cancelled 1.119831e-01 - DistanceKilometers 2.096049e+07 - DistanceMiles 8.092892e+06 - FlightDelay 1.880825e-01 - FlightDelayMin 9.359209e+03 - FlightTimeHour 3.112545e+01 - FlightTimeMin 1.120516e+05 - dayOfWeek 3.761135e+00 + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df.var() + AvgTicketPrice 70964.570234 + Cancelled 0.111987 + dayOfWeek 3.761279 dtype: float64 + + >>> df.var(numeric_only=True) + AvgTicketPrice 70964.570234 + Cancelled 0.111987 + dayOfWeek 3.761279 + dtype: float64 + + >>> df.var(numeric_only=False) + AvgTicketPrice 70964.6 + Cancelled 0.111987 + dayOfWeek 3.76128 + timestamp NaT + DestCountry NaN + dtype: object """ return self._query_compiler.var(numeric_only=numeric_only) - def std(self, numeric_only: bool = True) -> pd.Series: + def std(self, numeric_only: Optional[bool] = None) -> pd.Series: """ Return standard deviation for each numeric column + Parameters + ---------- + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + Returns ------- pandas.Series @@ -303,25 +376,41 @@ class NDFrame(ABC): Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') - >>> df.std() # doctest: +SKIP - AvgTicketPrice 266.386661 - Cancelled 0.334639 - DistanceKilometers 4578.263193 - DistanceMiles 2844.800855 - FlightDelay 0.433685 - FlightDelayMin 96.743006 - FlightTimeHour 5.579019 - FlightTimeMin 334.741135 - dayOfWeek 1.939365 + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) + >>> df.std() + AvgTicketPrice 266.407061 + Cancelled 0.334664 + dayOfWeek 1.939513 dtype: float64 + + >>> df.std(numeric_only=True) + AvgTicketPrice 266.407061 + Cancelled 0.334664 + dayOfWeek 1.939513 + dtype: float64 + + >>> df.std(numeric_only=False) + AvgTicketPrice 266.407 + Cancelled 0.334664 + dayOfWeek 1.93951 + timestamp NaT + DestCountry NaN + dtype: object """ return self._query_compiler.std(numeric_only=numeric_only) - def median(self, numeric_only: bool = True) -> pd.Series: + def median(self, numeric_only: 
Optional[bool] = None) -> pd.Series: """ Return the median value for each numeric column + Parameters + ---------- + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + Returns ------- pandas.Series @@ -333,27 +422,44 @@ class NDFrame(ABC): Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.median() # doctest: +SKIP - AvgTicketPrice 640.387285 - Cancelled 0.000000 - DistanceKilometers 7612.072403 - DistanceMiles 4729.922470 - FlightDelay 0.000000 - FlightDelayMin 0.000000 - FlightTimeHour 8.383113 - FlightTimeMin 503.148975 - dayOfWeek 3.000000 + AvgTicketPrice 640.363 + Cancelled False + dayOfWeek 3 + timestamp 2018-01-21 23:54:06.624776611 + dtype: object + + >>> df.median(numeric_only=True) # doctest: +SKIP + AvgTicketPrice 640.362667 + Cancelled 0.000000 + dayOfWeek 3.000000 dtype: float64 + + >>> df.median(numeric_only=False) # doctest: +SKIP + AvgTicketPrice 640.387 + Cancelled False + dayOfWeek 3 + timestamp 2018-01-21 23:54:06.624776611 + DestCountry NaN + dtype: object """ return self._query_compiler.median(numeric_only=numeric_only) - def max(self, numeric_only: bool = True) -> pd.Series: + def max(self, numeric_only: Optional[bool] = None) -> pd.Series: """ Return the maximum value for each numeric column TODO - implement remainder of pandas arguments, currently non-numerics are not supported + Parameters + ---------- + numeric_only: {True, False, None} Default is None + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + Returns ------- pandas.Series @@ -365,17 +471,26 @@ class NDFrame(ABC): Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.max() - AvgTicketPrice 1199.73 - Cancelled True - DistanceKilometers 19881.5 - DistanceMiles 12353.8 - FlightDelay True - FlightDelayMin 360 - FlightTimeHour 31.715 - FlightTimeMin 1902.9 - dayOfWeek 6 + AvgTicketPrice 1199.73 + Cancelled True + dayOfWeek 6 + timestamp 2018-02-11 23:50:12 + dtype: object + + >>> df.max(numeric_only=True) + AvgTicketPrice 1199.729004 + Cancelled 1.000000 + dayOfWeek 6.000000 + dtype: float64 + + >>> df.max(numeric_only=False) + AvgTicketPrice 1199.73 + Cancelled True + dayOfWeek 6 + timestamp 2018-02-11 23:50:12 + DestCountry NaN dtype: object """ return self._query_compiler.max(numeric_only=numeric_only) @@ -441,18 +556,24 @@ class NDFrame(ABC): Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') + >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df.mad() # doctest: +SKIP - AvgTicketPrice 213.368709 - Cancelled 0.000000 - DistanceKilometers 2946.168236 - DistanceMiles 1830.987236 - FlightDelay 0.000000 - FlightDelayMin 0.000000 - FlightTimeHour 3.819435 - FlightTimeMin 229.142297 - dayOfWeek 2.000000 + AvgTicketPrice 213.35497 + 
dayOfWeek 2.00000 dtype: float64 + + >>> df.mad(numeric_only=True) # doctest: +SKIP + AvgTicketPrice 213.473011 + dayOfWeek 2.000000 + dtype: float64 + + >>> df.mad(numeric_only=False) # doctest: +SKIP + AvgTicketPrice 213.484 + Cancelled NaN + dayOfWeek 2 + timestamp NaT + DestCountry NaN + dtype: object """ return self._query_compiler.mad(numeric_only=numeric_only) diff --git a/eland/operations.py b/eland/operations.py index 9728be7..cd48ef5 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -145,43 +145,28 @@ class Operations: return build_pd_series(data=counts, index=fields) - def mean(self, query_compiler, numeric_only=True): - results = self._metric_aggs(query_compiler, ["mean"], numeric_only=numeric_only) - return build_pd_series(results, index=results.keys()) - - def var(self, query_compiler, numeric_only=True): - results = self._metric_aggs(query_compiler, ["var"], numeric_only=numeric_only) - return build_pd_series(results, index=results.keys()) - - def std(self, query_compiler, numeric_only=True): - results = self._metric_aggs(query_compiler, ["std"], numeric_only=numeric_only) - return build_pd_series(results, index=results.keys()) - - def median(self, query_compiler, numeric_only=True): - results = self._metric_aggs( - query_compiler, ["median"], numeric_only=numeric_only - ) - return build_pd_series(results, index=results.keys()) - - def sum(self, query_compiler, numeric_only=True): - results = self._metric_aggs(query_compiler, ["sum"], numeric_only=numeric_only) - return build_pd_series(results, index=results.keys()) - - def max(self, query_compiler, numeric_only=True): - results = self._metric_aggs(query_compiler, ["max"], numeric_only=numeric_only) - return build_pd_series(results, index=results.keys()) - - def min(self, query_compiler, numeric_only=True): - results = self._metric_aggs(query_compiler, ["min"], numeric_only=numeric_only) - return build_pd_series(results, index=results.keys()) - - def nunique(self, query_compiler): - results = self._metric_aggs(query_compiler, ["nunique"], numeric_only=False) - return build_pd_series(results, index=results.keys()) - - def mad(self, query_compiler, numeric_only=True): - results = self._metric_aggs(query_compiler, ["mad"], numeric_only=numeric_only) - return build_pd_series(results, index=results.keys()) + def _metric_agg_series( + self, + query_compiler: "QueryCompiler", + agg: List, + numeric_only: Optional[bool] = None, + ) -> pd.Series: + results = self._metric_aggs(query_compiler, agg, numeric_only=numeric_only) + if numeric_only: + return build_pd_series(results, index=results.keys(), dtype=np.float64) + else: + # If all results are float convert into float64 + if all(isinstance(i, float) for i in results.values()): + dtype = np.float64 + # If all results are int convert into int64 + elif all(isinstance(i, int) for i in results.values()): + dtype = np.int64 + # If single result is present consider that datatype instead of object + elif len(results) <= 1: + dtype = None + else: + dtype = "object" + return build_pd_series(results, index=results.keys(), dtype=dtype) def value_counts(self, query_compiler, es_size): return self._terms_aggs(query_compiler, "terms", es_size) @@ -189,7 +174,21 @@ class Operations: def hist(self, query_compiler, bins): return self._hist_aggs(query_compiler, bins) - def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=True): + def aggs(self, query_compiler, pd_aggs, numeric_only=None) -> pd.DataFrame: + results = self._metric_aggs( + query_compiler, pd_aggs, 
numeric_only=numeric_only, is_dataframe_agg=True + ) + return pd.DataFrame( + results, index=pd_aggs, dtype=(np.float64 if numeric_only else None) + ) + + def _metric_aggs( + self, + query_compiler: "QueryCompiler", + pd_aggs, + numeric_only: Optional[bool] = None, + is_dataframe_agg: bool = False, + ) -> Dict: query_params, post_processing = self._resolve_tasks(query_compiler) size = self._size(query_params, post_processing) @@ -201,6 +200,7 @@ class Operations: results = {} fields = query_compiler._mappings.all_source_fields() if numeric_only: + # Consider if field is Int/Float/Bool fields = [field for field in fields if (field.is_numeric or field.is_bool)] body = Query(query_params.query) @@ -210,6 +210,7 @@ class Operations: for field in fields: for es_agg in es_aggs: + # NaN/NaT fields are ignored if not field.is_es_agg_compatible(es_agg): continue @@ -241,10 +242,18 @@ class Operations: for field in fields: values = [] for es_agg, pd_agg in zip(es_aggs, pd_aggs): - - # If the field and agg aren't compatible we add a NaN/NaT + # is_dataframe_agg is used to differentiate agg() and an aggregation called through .mean() + # If the field and agg aren't compatible we add a NaN/NaT for agg + # If the field and agg aren't compatible we don't add NaN/NaT for an aggregation called through .mean() if not field.is_es_agg_compatible(es_agg): - values.append(field.nan_value) + if is_dataframe_agg and not numeric_only: + values.append(field.nan_value) + elif not is_dataframe_agg and numeric_only is False: + values.append(field.nan_value) + # Explicit condition for mad to add NaN because it doesn't support bool + elif is_dataframe_agg and numeric_only: + if pd_agg == "mad": + values.append(field.nan_value) continue if isinstance(es_agg, tuple): @@ -269,7 +278,7 @@ class Operations: # All of the below calculations result in NaN if count<=1 if count <= 1: - agg_value = np.float64(np.NaN) + agg_value = np.NaN elif es_agg[1] == "std_deviation": agg_value *= count / (count - 1.0) @@ -287,8 +296,11 @@ class Operations: ]["value"] # Null usually means there were no results. - if agg_value is None: - agg_value = field.nan_value + if agg_value is None or np.isnan(agg_value): + if is_dataframe_agg and not numeric_only: + agg_value = np.NaN + elif not is_dataframe_agg and numeric_only is False: + agg_value = np.NaN # Cardinality is always either NaN or integer. elif pd_agg == "nunique": @@ -299,14 +311,21 @@ class Operations: agg_value = elasticsearch_date_to_pandas_date( agg_value, field.es_date_format ) - - # These aggregations maintain the column datatype - elif pd_agg in {"max", "min", "median"}: - agg_value = field.np_dtype.type(agg_value) + # If numeric_only is False | None then maintain column datatype + elif not numeric_only: + # we're only converting to bool for lossless aggs like min, max, and median. + if pd_agg in {"max", "min", "median", "sum"}: + # 'sum' isn't representable with bool, use int64 + if pd_agg == "sum" and field.is_bool: + agg_value = np.int64(agg_value) + else: + agg_value = field.np_dtype.type(agg_value) values.append(agg_value) - results[field.index] = values if len(values) > 1 else values[0] + # If numeric_only is True and We only have a NaN type field then we check for empty. 
+ if values: + results[field.index] = values if len(values) > 1 else values[0] return results @@ -540,10 +559,6 @@ class Operations: return es_aggs - def aggs(self, query_compiler, pd_aggs): - results = self._metric_aggs(query_compiler, pd_aggs, numeric_only=False) - return pd.DataFrame(results, index=pd_aggs) - def filter(self, query_compiler, items=None, like=None, regex=None): # This function is only called for axis='index', # DataFrame.filter(..., axis="columns") calls .drop() diff --git a/eland/query_compiler.py b/eland/query_compiler.py index ddceff5..60309f7 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -17,7 +17,7 @@ import copy from datetime import datetime -from typing import Optional, TYPE_CHECKING +from typing import Optional, TYPE_CHECKING, List import numpy as np import pandas as pd @@ -490,38 +490,56 @@ class QueryCompiler: result._operations.filter(self, items=items, like=like, regex=regex) return result - def aggs(self, func): - return self._operations.aggs(self, func) + def aggs(self, func: List[str], numeric_only: Optional[bool] = None): + return self._operations.aggs(self, func, numeric_only=numeric_only) def count(self): return self._operations.count(self) - def mean(self, numeric_only=None): - return self._operations.mean(self, numeric_only=numeric_only) + def mean(self, numeric_only: Optional[bool] = None): + return self._operations._metric_agg_series( + self, ["mean"], numeric_only=numeric_only + ) - def var(self, numeric_only=None): - return self._operations.var(self, numeric_only=numeric_only) + def var(self, numeric_only: Optional[bool] = None): + return self._operations._metric_agg_series( + self, ["var"], numeric_only=numeric_only + ) - def std(self, numeric_only=None): - return self._operations.std(self, numeric_only=numeric_only) + def std(self, numeric_only: Optional[bool] = None): + return self._operations._metric_agg_series( + self, ["std"], numeric_only=numeric_only + ) - def mad(self, numeric_only=None): - return self._operations.mad(self, numeric_only=numeric_only) + def mad(self, numeric_only: Optional[bool] = None): + return self._operations._metric_agg_series( + self, ["mad"], numeric_only=numeric_only + ) - def median(self, numeric_only=None): - return self._operations.median(self, numeric_only=numeric_only) + def median(self, numeric_only: Optional[bool] = None): + return self._operations._metric_agg_series( + self, ["median"], numeric_only=numeric_only + ) - def sum(self, numeric_only=None): - return self._operations.sum(self, numeric_only=numeric_only) + def sum(self, numeric_only: Optional[bool] = None): + return self._operations._metric_agg_series( + self, ["sum"], numeric_only=numeric_only + ) - def min(self, numeric_only=None): - return self._operations.min(self, numeric_only=numeric_only) + def min(self, numeric_only: Optional[bool] = None): + return self._operations._metric_agg_series( + self, ["min"], numeric_only=numeric_only + ) - def max(self, numeric_only=None): - return self._operations.max(self, numeric_only=numeric_only) + def max(self, numeric_only: Optional[bool] = None): + return self._operations._metric_agg_series( + self, ["max"], numeric_only=numeric_only + ) def nunique(self): - return self._operations.nunique(self) + return self._operations._metric_agg_series( + self, ["nunique"], numeric_only=False + ) def value_counts(self, es_size): return self._operations.value_counts(self, es_size) diff --git a/eland/tests/dataframe/test_aggs_pytest.py b/eland/tests/dataframe/test_aggs_pytest.py index 
330fb61..e483f47 100644 --- a/eland/tests/dataframe/test_aggs_pytest.py +++ b/eland/tests/dataframe/test_aggs_pytest.py @@ -29,7 +29,9 @@ class TestDataFrameAggs(TestData): ed_flights = self.ed_flights() pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) - ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) + ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg( + ["sum", "min"], numeric_only=True + ) # Eland returns all float values for all metric aggs, pandas can return int # TODO - investigate this more @@ -40,22 +42,22 @@ class TestDataFrameAggs(TestData): ["sum", "min", "std"] ) ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg( - ["sum", "min", "std"] + ["sum", "min", "std"], numeric_only=True ) print(pd_sum_min_std.dtypes) print(ed_sum_min_std.dtypes) - assert_frame_equal( - pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True - ) + assert_frame_equal(pd_sum_min_std, ed_sum_min_std, check_exact=False, rtol=True) def test_terms_aggs(self): pd_flights = self.pd_flights() ed_flights = self.ed_flights() pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) - ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) + ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg( + ["sum", "min"], numeric_only=True + ) # Eland returns all float values for all metric aggs, pandas can return int # TODO - investigate this more @@ -66,15 +68,13 @@ class TestDataFrameAggs(TestData): ["sum", "min", "std"] ) ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg( - ["sum", "min", "std"] + ["sum", "min", "std"], numeric_only=True ) print(pd_sum_min_std.dtypes) print(ed_sum_min_std.dtypes) - assert_frame_equal( - pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True - ) + assert_frame_equal(pd_sum_min_std, ed_sum_min_std, check_exact=False, rtol=True) def test_aggs_median_var(self): pd_ecommerce = self.pd_ecommerce() @@ -85,7 +85,7 @@ class TestDataFrameAggs(TestData): ].agg(["median", "var"]) ed_aggs = ed_ecommerce[ ["taxful_total_price", "taxless_total_price", "total_quantity"] - ].agg(["median", "var"]) + ].agg(["median", "var"], numeric_only=True) print(pd_aggs, pd_aggs.dtypes) print(ed_aggs, ed_aggs.dtypes) @@ -102,7 +102,9 @@ class TestDataFrameAggs(TestData): ed_flights = self.ed_flights() pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg(agg) - ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(agg) + ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg( + agg, numeric_only=True + ) assert_series_equal(pd_sum_min_std, ed_sum_min_std) @@ -112,7 +114,9 @@ class TestDataFrameAggs(TestData): ed_flights = self.ed_flights() pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["mean"]) - ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["mean"]) + ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg( + ["mean"], numeric_only=True + ) assert_frame_equal(pd_sum_min, ed_sum_min) diff --git a/eland/tests/dataframe/test_metrics_pytest.py b/eland/tests/dataframe/test_metrics_pytest.py index 494e42d..dbef894 100644 --- a/eland/tests/dataframe/test_metrics_pytest.py +++ b/eland/tests/dataframe/test_metrics_pytest.py @@ -16,18 +16,23 @@ # under the License. 
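The test changes above all follow one pattern: compute the aggregation in pandas and in eland, pass numeric_only=True on the eland side now that the default has changed, and compare the frames with a tolerance. A condensed sketch of that pattern (the local cluster, the eland_to_pandas round-trip, and the rtol value are assumptions that mirror the test fixtures rather than quote them):

import numpy as np
import eland as ed
from pandas.testing import assert_frame_equal

# Assumes the 'flights' sample index on a local cluster.
ed_flights = ed.DataFrame("localhost", "flights")
pd_flights = ed.eland_to_pandas(ed_flights)

pd_agg = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min", "std"])
ed_agg = ed_flights.select_dtypes(include=[np.number]).agg(
    ["sum", "min", "std"], numeric_only=True
)
# eland reports float64 for every metric agg while pandas may keep ints,
# so compare values approximately rather than exactly.
assert_frame_equal(pd_agg, ed_agg, check_exact=False, rtol=1e-3)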
# File called _pytest for PyCharm compatibility - import pytest import numpy as np import pandas as pd from pandas.testing import assert_series_equal - from eland.tests.common import TestData class TestDataFrameMetrics(TestData): funcs = ["max", "min", "mean", "sum"] extended_funcs = ["median", "mad", "var", "std"] + filter_data = [ + "AvgTicketPrice", + "Cancelled", + "dayOfWeek", + "timestamp", + "DestCountry", + ] @pytest.mark.parametrize("numeric_only", [False, None]) def test_flights_metrics(self, numeric_only): @@ -49,7 +54,7 @@ class TestDataFrameMetrics(TestData): pd_metric = getattr(pd_flights, func)(numeric_only=numeric_only) ed_metric = getattr(ed_flights, func)(numeric_only=numeric_only) - assert_series_equal(pd_metric, ed_metric) + assert_series_equal(pd_metric, ed_metric, check_dtype=False) def test_flights_extended_metrics(self): pd_flights = self.pd_flights() @@ -86,11 +91,9 @@ class TestDataFrameMetrics(TestData): for func in self.extended_funcs: pd_metric = getattr(pd_flights_1, func)() - ed_metric = getattr(ed_flights_1, func)() + ed_metric = getattr(ed_flights_1, func)(numeric_only=False) - assert_series_equal( - pd_metric, ed_metric, check_exact=False, check_less_precise=True - ) + assert_series_equal(pd_metric, ed_metric, check_exact=False) # Test on zero rows to test NaN behaviour of sample std/variance pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"][["AvgTicketPrice"]] @@ -98,11 +101,9 @@ class TestDataFrameMetrics(TestData): for func in self.extended_funcs: pd_metric = getattr(pd_flights_0, func)() - ed_metric = getattr(ed_flights_0, func)() + ed_metric = getattr(ed_flights_0, func)(numeric_only=False) - assert_series_equal( - pd_metric, ed_metric, check_exact=False, check_less_precise=True - ) + assert_series_equal(pd_metric, ed_metric, check_exact=False) def test_ecommerce_selected_non_numeric_source_fields(self): # None of these are numeric @@ -121,7 +122,7 @@ class TestDataFrameMetrics(TestData): assert_series_equal( getattr(pd_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True), - check_less_precise=True, + check_exact=False, ) def test_ecommerce_selected_mixed_numeric_source_fields(self): @@ -143,7 +144,7 @@ class TestDataFrameMetrics(TestData): assert_series_equal( getattr(pd_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True), - check_less_precise=True, + check_exact=False, ) def test_ecommerce_selected_all_numeric_source_fields(self): @@ -157,27 +158,27 @@ class TestDataFrameMetrics(TestData): assert_series_equal( getattr(pd_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True), - check_less_precise=True, + check_exact=False, ) def test_flights_datetime_metrics_agg(self): ed_timestamps = self.ed_flights()[["timestamp"]] expected_values = { - "timestamp": { - "min": pd.Timestamp("2018-01-01 00:00:00"), - "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"), - "max": pd.Timestamp("2018-02-11 23:50:12"), - "nunique": 12236, - "mad": pd.NaT, - "std": pd.NaT, - "sum": pd.NaT, - "var": pd.NaT, - } + "max": pd.Timestamp("2018-02-11 23:50:12"), + "min": pd.Timestamp("2018-01-01 00:00:00"), + "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"), + "sum": pd.NaT, + "mad": pd.NaT, + "var": pd.NaT, + "std": pd.NaT, + "nunique": 12236, } - ed_metrics = ed_timestamps.agg(self.funcs + self.extended_funcs + ["nunique"]) - ed_metrics_dict = ed_metrics.to_dict() - ed_metrics_dict["timestamp"].pop("median") # Median is tested below. 
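The expected values above capture the datetime rule this patch enforces: only min, max, mean (and median) are meaningful for a date field, everything else degrades to NaT. From the user side that looks roughly like the sketch below (flights sample index on a local cluster assumed):

import pandas as pd
import eland as ed

# Assumes the 'flights' sample index on a local cluster.
ed_timestamps = ed.DataFrame("localhost", "flights")[["timestamp"]]

metrics = ed_timestamps.agg(["min", "max", "mean", "sum", "std"], numeric_only=False)
assert isinstance(metrics["timestamp"]["min"], pd.Timestamp)  # lossless aggs stay datetimes
assert pd.isnull(metrics["timestamp"]["sum"])                 # sum/std/var/mad of a date -> NaT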
+ ed_metrics = ed_timestamps.agg( + self.funcs + self.extended_funcs + ["nunique"], numeric_only=False + ) + ed_metrics_dict = ed_metrics["timestamp"].to_dict() + ed_metrics_dict.pop("median") # Median is tested below. assert ed_metrics_dict == expected_values @pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"]) @@ -192,8 +193,10 @@ class TestDataFrameMetrics(TestData): ed_metric = ed_timestamps.agg([agg]) if agg == "nunique": + # df with timestamp column should return int64 assert ed_metric.dtypes["timestamp"] == np.int64 else: + # df with timestamp column should return datetime64[ns] assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]") assert ed_metric["timestamp"][0] == expected_values[agg] @@ -230,7 +233,7 @@ class TestDataFrameMetrics(TestData): ) def test_metric_agg_keep_dtypes(self): - # max, min, and median maintain their dtypes + # max, min and median maintain their dtypes df = self.ed_flights_small()[["AvgTicketPrice", "Cancelled", "dayOfWeek"]] assert df.min().tolist() == [131.81910705566406, False, 0] assert df.max().tolist() == [989.9527587890625, True, 0] @@ -250,3 +253,162 @@ class TestDataFrameMetrics(TestData): "Cancelled": {"max": True, "median": False, "min": False}, "dayOfWeek": {"max": 0, "median": 0, "min": 0}, } + # sum should always be the same dtype as the input, except for bool where the sum of bools should be an int64. + sum_agg = df.agg(["sum"]) + assert sum_agg.dtypes.to_list() == [ + np.dtype("float64"), + np.dtype("int64"), + np.dtype("int64"), + ] + assert sum_agg.to_dict() == { + "AvgTicketPrice": {"sum": 26521.624084472656}, + "Cancelled": {"sum": 6}, + "dayOfWeek": {"sum": 0}, + } + + def test_flights_numeric_only(self): + # All Aggregations Data Check + ed_flights = self.ed_flights().filter(self.filter_data) + pd_flights = self.pd_flights().filter(self.filter_data) + # agg => numeric_only True returns float64 values + # We compare it with individual single agg functions of pandas with numeric_only=True + filtered_aggs = self.funcs + self.extended_funcs + agg_data = ed_flights.agg(filtered_aggs, numeric_only=True).transpose() + for agg in filtered_aggs: + # Explicitly check for mad because it returns nan for bools + if agg == "mad": + assert np.isnan(agg_data[agg]["Cancelled"]) + else: + assert_series_equal( + agg_data[agg].rename(None), + getattr(pd_flights, agg)(numeric_only=True), + check_exact=False, + rtol=True, + ) + + # all single aggs return float64 for numeric_only=True + def test_numeric_only_true_single_aggs(self): + ed_flights = self.ed_flights().filter(self.filter_data) + for agg in self.funcs + self.extended_funcs: + result = getattr(ed_flights, agg)(numeric_only=True) + assert result.dtype == np.dtype("float64") + assert result.shape == ((3,) if agg != "mad" else (2,)) + + # check dtypes and shape of min, max and median for numeric_only=False | None + @pytest.mark.parametrize("agg", ["min", "max", "median"]) + @pytest.mark.parametrize("numeric_only", [False, None]) + def test_min_max_median_numeric_only(self, agg, numeric_only): + ed_flights = self.ed_flights().filter(self.filter_data) + if numeric_only is False: + calculated_values = getattr(ed_flights, agg)(numeric_only=numeric_only) + assert isinstance(calculated_values["AvgTicketPrice"], np.float64) + assert isinstance(calculated_values["Cancelled"], np.bool_) + assert isinstance(calculated_values["dayOfWeek"], np.int64) + assert isinstance(calculated_values["timestamp"], pd.Timestamp) + assert np.isnan(calculated_values["DestCountry"]) + assert 
calculated_values.shape == (5,) + elif numeric_only is None: + calculated_values = getattr(ed_flights, agg)(numeric_only=numeric_only) + assert isinstance(calculated_values["AvgTicketPrice"], np.float64) + assert isinstance(calculated_values["Cancelled"], np.bool_) + assert isinstance(calculated_values["dayOfWeek"], np.int64) + assert isinstance(calculated_values["timestamp"], pd.Timestamp) + assert calculated_values.shape == (4,) + + # check dtypes and shape for sum + @pytest.mark.parametrize("numeric_only", [False, None]) + def test_sum_numeric_only(self, numeric_only): + ed_flights = self.ed_flights().filter(self.filter_data) + if numeric_only is False: + calculated_values = ed_flights.sum(numeric_only=numeric_only) + assert isinstance(calculated_values["AvgTicketPrice"], np.float64) + assert isinstance(calculated_values["dayOfWeek"], np.int64) + assert isinstance(calculated_values["Cancelled"], np.int64) + assert pd.isnull(calculated_values["timestamp"]) + assert np.isnan(calculated_values["DestCountry"]) + assert calculated_values.shape == (5,) + elif numeric_only is None: + calculated_values = ed_flights.sum(numeric_only=numeric_only) + dtype_list = [calculated_values[i].dtype for i in calculated_values.index] + assert dtype_list == [ + np.dtype("float64"), + np.dtype("int64"), + np.dtype("int64"), + ] + assert calculated_values.shape == (3,) + + # check dtypes and shape for std + @pytest.mark.parametrize("numeric_only", [False, None]) + def test_std_numeric_only(self, numeric_only): + ed_flights = self.ed_flights().filter(self.filter_data) + if numeric_only is False: + calculated_values = ed_flights.std(numeric_only=numeric_only) + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["Cancelled"], float) + assert isinstance(calculated_values["dayOfWeek"], float) + assert pd.isnull(calculated_values["timestamp"]) + assert np.isnan(calculated_values["DestCountry"]) + assert calculated_values.shape == (5,) + elif numeric_only is None: + calculated_values = ed_flights.std(numeric_only=numeric_only) + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["Cancelled"], float) + assert isinstance(calculated_values["dayOfWeek"], float) + assert calculated_values.shape == (3,) + + # check dtypes and shape for var + @pytest.mark.parametrize("numeric_only", [False, None]) + def test_var_numeric_only(self, numeric_only): + ed_flights = self.ed_flights().filter(self.filter_data) + if numeric_only is False: + calculated_values = ed_flights.var(numeric_only=numeric_only) + assert isinstance(calculated_values["AvgTicketPrice"], np.float64) + assert isinstance(calculated_values["dayOfWeek"], np.float64) + assert isinstance(calculated_values["Cancelled"], np.float64) + assert pd.isnull(calculated_values["timestamp"]) + assert np.isnan(calculated_values["DestCountry"]) + assert calculated_values.shape == (5,) + elif numeric_only is None: + calculated_values = ed_flights.var(numeric_only=numeric_only) + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["Cancelled"], float) + assert isinstance(calculated_values["dayOfWeek"], float) + assert calculated_values.shape == (3,) + + # check dtypes and shape for mean + @pytest.mark.parametrize("numeric_only", [False, None]) + def test_mean_numeric_only(self, numeric_only): + ed_flights = self.ed_flights().filter(self.filter_data) + if numeric_only is False: + calculated_values = ed_flights.mean(numeric_only=numeric_only) 
+ assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["dayOfWeek"], float) + assert isinstance(calculated_values["Cancelled"], float) + assert isinstance(calculated_values["timestamp"], pd.Timestamp) + assert np.isnan(calculated_values["DestCountry"]) + assert calculated_values.shape == (5,) + elif numeric_only is None: + calculated_values = ed_flights.mean(numeric_only=numeric_only) + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["Cancelled"], float) + assert isinstance(calculated_values["dayOfWeek"], float) + assert isinstance(calculated_values["timestamp"], pd.Timestamp) + assert calculated_values.shape == (4,) + + # check dtypes and shape for mad + @pytest.mark.parametrize("numeric_only", [False, None]) + def test_mad_numeric_only(self, numeric_only): + ed_flights = self.ed_flights().filter(self.filter_data) + if numeric_only is False: + calculated_values = ed_flights.mad(numeric_only=numeric_only) + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["Cancelled"], np.float64) + assert isinstance(calculated_values["dayOfWeek"], float) + assert pd.isnull(calculated_values["timestamp"]) + assert np.isnan(calculated_values["DestCountry"]) + assert calculated_values.shape == (5,) + elif numeric_only is None: + calculated_values = ed_flights.mad(numeric_only=numeric_only) + assert isinstance(calculated_values["AvgTicketPrice"], float) + assert isinstance(calculated_values["dayOfWeek"], float) + assert calculated_values.shape == (2,) diff --git a/eland/tests/series/test_metrics_pytest.py b/eland/tests/series/test_metrics_pytest.py index 86a9557..d8e213d 100644 --- a/eland/tests/series/test_metrics_pytest.py +++ b/eland/tests/series/test_metrics_pytest.py @@ -72,7 +72,7 @@ class TestSeriesMetrics(TestData): if func == "nunique": # nunique never returns 'NaN' continue - ed_metric = getattr(ed_ecommerce, func)() + ed_metric = getattr(ed_ecommerce, func)(numeric_only=False) print(func, ed_metric) assert np.isnan(ed_metric) @@ -86,7 +86,9 @@ class TestSeriesMetrics(TestData): for func in self.all_funcs: pd_metric = getattr(pd_ecommerce, func)() - ed_metric = getattr(ed_ecommerce, func)() + ed_metric = getattr(ed_ecommerce, func)( + **({"numeric_only": True} if (func != "nunique") else {}) + ) self.assert_almost_equal_for_agg(func, pd_metric, ed_metric) @pytest.mark.parametrize("agg", ["mean", "min", "max"])
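Taken together, the dtype rules that the new Operations._metric_agg_series applies to single-function aggregations (mean, min, sum, ...) can be summarised with a small standalone sketch. This is plain Python mirroring the branch logic added in operations.py above, not eland API:

from typing import Any, Dict, Optional

import numpy as np
import pandas as pd

def series_dtype_for(results: Dict[str, Any], numeric_only: Optional[bool]):
    """Mirror of the dtype selection in Operations._metric_agg_series (sketch only)."""
    if numeric_only:
        # numeric_only=True always forces float64.
        return np.float64
    if all(isinstance(v, float) for v in results.values()):
        return np.float64
    if all(isinstance(v, int) for v in results.values()):
        return np.int64
    if len(results) <= 1:
        return None  # a single value keeps its own type
    return "object"  # mixed floats/bools/ints/Timestamps/NaT fall back to object

# mean(numeric_only=True) over the flights numeric columns -> float64
print(series_dtype_for({"AvgTicketPrice": 628.25, "dayOfWeek": 2.84}, True))
# min(numeric_only=None) mixes float, bool, int and Timestamp -> "object"
print(series_dtype_for(
    {"AvgTicketPrice": 100.02, "Cancelled": False, "dayOfWeek": 0,
     "timestamp": pd.Timestamp("2018-01-01")},
    None,
))

The same rules explain the shapes asserted in the tests above: numeric_only=True drops keyword and datetime columns entirely and returns float64, while numeric_only=False keeps every column and fills NaN/NaT where an aggregation does not apply.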