Switch agg defaults to numeric_only=None

This commit is contained in:
P. Sai Vinay 2020-09-22 21:02:27 +05:30 committed by GitHub
parent c86371733d
commit 4d96ad39fd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 576 additions and 217 deletions

View File

@ -19,7 +19,7 @@ import sys
import warnings import warnings
from io import StringIO from io import StringIO
import re import re
from typing import Optional, Sequence, Union, Tuple from typing import Optional, Sequence, Union, Tuple, List
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@ -1328,7 +1328,14 @@ class DataFrame(NDFrame):
""" """
return self.columns return self.columns
def aggregate(self, func, axis=0, *args, **kwargs): def aggregate(
self,
func: Union[str, List[str]],
axis: int = 0,
numeric_only: Optional[bool] = None,
*args,
**kwargs,
) -> Union[pd.Series, pd.DataFrame]:
""" """
Aggregate using one or more operations over the specified axis. Aggregate using one or more operations over the specified axis.
@ -1347,8 +1354,13 @@ class DataFrame(NDFrame):
Currently, we only support ``['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', Currently, we only support ``['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile',
'rank', 'sem', 'skew', 'sum', 'std', 'var']`` 'rank', 'sem', 'skew', 'sum', 'std', 'var']``
axis axis: int
Currently, we only support axis=0 (index) Currently, we only support axis=0 (index)
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: returns all values with float64, NaN/NaT are ignored.
- False: returns all values with float64.
- None: returns all values with default datatype.
*args *args
Positional arguments to pass to `func` Positional arguments to pass to `func`
**kwargs **kwargs
@ -1368,12 +1380,30 @@ class DataFrame(NDFrame):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights') >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'DistanceKilometers', 'timestamp', 'DestCountry'])
>>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']).astype(int) >>> df.aggregate(['sum', 'min', 'std'], numeric_only=True).astype(int)
DistanceKilometers AvgTicketPrice AvgTicketPrice DistanceKilometers
sum 92616288 8204364 sum 8204364 92616288
min 0 100 min 100 0
std 4578 266 std 266 4578
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=True)
AvgTicketPrice DistanceKilometers
sum 8.204365e+06 9.261629e+07
min 1.000205e+02 0.000000e+00
std 2.664071e+02 4.578614e+03
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=False)
AvgTicketPrice DistanceKilometers timestamp DestCountry
sum 8.204365e+06 9.261629e+07 NaT NaN
min 1.000205e+02 0.000000e+00 2018-01-01 NaN
std 2.664071e+02 4.578614e+03 NaT NaN
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=None)
AvgTicketPrice DistanceKilometers timestamp DestCountry
sum 8.204365e+06 9.261629e+07 NaT NaN
min 1.000205e+02 0.000000e+00 2018-01-01 NaN
std 2.664071e+02 4.578614e+03 NaT NaN
""" """
axis = pd.DataFrame._get_axis_number(axis) axis = pd.DataFrame._get_axis_number(axis)
@ -1387,10 +1417,14 @@ class DataFrame(NDFrame):
# 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique'] # 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique']
if isinstance(func, str): if isinstance(func, str):
# Wrap in list # Wrap in list
return self._query_compiler.aggs([func]).squeeze().rename(None) return (
self._query_compiler.aggs([func], numeric_only=numeric_only)
.squeeze()
.rename(None)
)
elif is_list_like(func): elif is_list_like(func):
# we have a list! # we have a list!
return self._query_compiler.aggs(func) return self._query_compiler.aggs(func, numeric_only=numeric_only)
agg = aggregate agg = aggregate

View File

@ -100,6 +100,9 @@ class Field(NamedTuple):
# Cardinality works for all types # Cardinality works for all types
# Numerics and bools work for all aggs # Numerics and bools work for all aggs
# Except "median_absolute_deviation" which doesn't support bool
if es_agg == "median_absolute_deviation" and self.is_bool:
return False
if es_agg == "cardinality" or self.is_numeric or self.is_bool: if es_agg == "cardinality" or self.is_numeric or self.is_bool:
return True return True
# Timestamps also work for 'min', 'max' and 'avg' # Timestamps also work for 'min', 'max' and 'avg'

View File

@ -17,7 +17,7 @@
import sys import sys
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Tuple from typing import TYPE_CHECKING, Tuple, Optional
import pandas as pd import pandas as pd
from eland.query_compiler import QueryCompiler from eland.query_compiler import QueryCompiler
@ -162,12 +162,19 @@ class NDFrame(ABC):
def _es_info(self, buf): def _es_info(self, buf):
self._query_compiler.es_info(buf) self._query_compiler.es_info(buf)
def mean(self, numeric_only: bool = True) -> pd.Series: def mean(self, numeric_only: Optional[bool] = None) -> pd.Series:
""" """
Return mean value for each numeric column Return mean value for each numeric column
TODO - implement remainder of pandas arguments, currently non-numerics are not supported TODO - implement remainder of pandas arguments, currently non-numerics are not supported
Parameters
----------
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns Returns
------- -------
pandas.Series pandas.Series
@ -179,27 +186,44 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights') >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.mean() >>> df.mean()
AvgTicketPrice 628.254
Cancelled 0.128494
dayOfWeek 2.83598
timestamp 2018-01-21 19:20:45.564438232
dtype: object
>>> df.mean(numeric_only=True)
AvgTicketPrice 628.253689 AvgTicketPrice 628.253689
Cancelled 0.128494 Cancelled 0.128494
DistanceKilometers 7092.142457
DistanceMiles 4406.853010
FlightDelay 0.251168
FlightDelayMin 47.335171
FlightTimeHour 8.518797
FlightTimeMin 511.127842
dayOfWeek 2.835975 dayOfWeek 2.835975
dtype: float64 dtype: float64
>>> df.mean(numeric_only=False)
AvgTicketPrice 628.254
Cancelled 0.128494
dayOfWeek 2.83598
timestamp 2018-01-21 19:20:45.564438232
DestCountry NaN
dtype: object
""" """
return self._query_compiler.mean(numeric_only=numeric_only) return self._query_compiler.mean(numeric_only=numeric_only)
def sum(self, numeric_only: bool = True) -> pd.Series: def sum(self, numeric_only: Optional[bool] = None) -> pd.Series:
""" """
Return sum for each numeric column Return sum for each numeric column
TODO - implement remainder of pandas arguments, currently non-numerics are not supported TODO - implement remainder of pandas arguments, currently non-numerics are not supported
Parameters
----------
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns Returns
------- -------
pandas.Series pandas.Series
@ -211,27 +235,43 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights') >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.sum() >>> df.sum()
AvgTicketPrice 8.20436e+06
Cancelled 1678
dayOfWeek 37035
dtype: object
>>> df.sum(numeric_only=True)
AvgTicketPrice 8.204365e+06 AvgTicketPrice 8.204365e+06
Cancelled 1.678000e+03 Cancelled 1.678000e+03
DistanceKilometers 9.261629e+07
DistanceMiles 5.754909e+07
FlightDelay 3.280000e+03
FlightDelayMin 6.181500e+05
FlightTimeHour 1.112470e+05
FlightTimeMin 6.674818e+06
dayOfWeek 3.703500e+04 dayOfWeek 3.703500e+04
dtype: float64 dtype: float64
>>> df.sum(numeric_only=False)
AvgTicketPrice 8.20436e+06
Cancelled 1678
dayOfWeek 37035
timestamp NaT
DestCountry NaN
dtype: object
""" """
return self._query_compiler.sum(numeric_only=numeric_only) return self._query_compiler.sum(numeric_only=numeric_only)
def min(self, numeric_only: bool = True) -> pd.Series: def min(self, numeric_only: Optional[bool] = None) -> pd.Series:
""" """
Return the minimum value for each numeric column Return the minimum value for each numeric column
TODO - implement remainder of pandas arguments, currently non-numerics are not supported TODO - implement remainder of pandas arguments, currently non-numerics are not supported
Parameters
----------
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns Returns
------- -------
pandas.Series pandas.Series
@ -243,25 +283,42 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights') >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.min() >>> df.min()
AvgTicketPrice 100.021 AvgTicketPrice 100.021
Cancelled False Cancelled False
DistanceKilometers 0
DistanceMiles 0
FlightDelay False
FlightDelayMin 0
FlightTimeHour 0
FlightTimeMin 0
dayOfWeek 0 dayOfWeek 0
timestamp 2018-01-01 00:00:00
dtype: object
>>> df.min(numeric_only=True)
AvgTicketPrice 100.020531
Cancelled 0.000000
dayOfWeek 0.000000
dtype: float64
>>> df.min(numeric_only=False)
AvgTicketPrice 100.021
Cancelled False
dayOfWeek 0
timestamp 2018-01-01 00:00:00
DestCountry NaN
dtype: object dtype: object
""" """
return self._query_compiler.min(numeric_only=numeric_only) return self._query_compiler.min(numeric_only=numeric_only)
def var(self, numeric_only: bool = True) -> pd.Series: def var(self, numeric_only: Optional[bool] = None) -> pd.Series:
""" """
Return variance for each numeric column Return variance for each numeric column
Parameters
----------
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns Returns
------- -------
pandas.Series pandas.Series
@ -273,25 +330,41 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights') >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.var() # doctest: +SKIP >>> df.var()
AvgTicketPrice 7.096185e+04 AvgTicketPrice 70964.570234
Cancelled 1.119831e-01 Cancelled 0.111987
DistanceKilometers 2.096049e+07 dayOfWeek 3.761279
DistanceMiles 8.092892e+06
FlightDelay 1.880825e-01
FlightDelayMin 9.359209e+03
FlightTimeHour 3.112545e+01
FlightTimeMin 1.120516e+05
dayOfWeek 3.761135e+00
dtype: float64 dtype: float64
>>> df.var(numeric_only=True)
AvgTicketPrice 70964.570234
Cancelled 0.111987
dayOfWeek 3.761279
dtype: float64
>>> df.var(numeric_only=False)
AvgTicketPrice 70964.6
Cancelled 0.111987
dayOfWeek 3.76128
timestamp NaT
DestCountry NaN
dtype: object
""" """
return self._query_compiler.var(numeric_only=numeric_only) return self._query_compiler.var(numeric_only=numeric_only)
def std(self, numeric_only: bool = True) -> pd.Series: def std(self, numeric_only: Optional[bool] = None) -> pd.Series:
""" """
Return standard deviation for each numeric column Return standard deviation for each numeric column
Parameters
----------
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns Returns
------- -------
pandas.Series pandas.Series
@ -303,25 +376,41 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights') >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.std() # doctest: +SKIP >>> df.std()
AvgTicketPrice 266.386661 AvgTicketPrice 266.407061
Cancelled 0.334639 Cancelled 0.334664
DistanceKilometers 4578.263193 dayOfWeek 1.939513
DistanceMiles 2844.800855
FlightDelay 0.433685
FlightDelayMin 96.743006
FlightTimeHour 5.579019
FlightTimeMin 334.741135
dayOfWeek 1.939365
dtype: float64 dtype: float64
>>> df.std(numeric_only=True)
AvgTicketPrice 266.407061
Cancelled 0.334664
dayOfWeek 1.939513
dtype: float64
>>> df.std(numeric_only=False)
AvgTicketPrice 266.407
Cancelled 0.334664
dayOfWeek 1.93951
timestamp NaT
DestCountry NaN
dtype: object
""" """
return self._query_compiler.std(numeric_only=numeric_only) return self._query_compiler.std(numeric_only=numeric_only)
def median(self, numeric_only: bool = True) -> pd.Series: def median(self, numeric_only: Optional[bool] = None) -> pd.Series:
""" """
Return the median value for each numeric column Return the median value for each numeric column
Parameters
----------
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns Returns
------- -------
pandas.Series pandas.Series
@ -333,27 +422,44 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights') >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.median() # doctest: +SKIP >>> df.median() # doctest: +SKIP
AvgTicketPrice 640.387285 AvgTicketPrice 640.363
Cancelled False
dayOfWeek 3
timestamp 2018-01-21 23:54:06.624776611
dtype: object
>>> df.median(numeric_only=True) # doctest: +SKIP
AvgTicketPrice 640.362667
Cancelled 0.000000 Cancelled 0.000000
DistanceKilometers 7612.072403
DistanceMiles 4729.922470
FlightDelay 0.000000
FlightDelayMin 0.000000
FlightTimeHour 8.383113
FlightTimeMin 503.148975
dayOfWeek 3.000000 dayOfWeek 3.000000
dtype: float64 dtype: float64
>>> df.median(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 640.387
Cancelled False
dayOfWeek 3
timestamp 2018-01-21 23:54:06.624776611
DestCountry NaN
dtype: object
""" """
return self._query_compiler.median(numeric_only=numeric_only) return self._query_compiler.median(numeric_only=numeric_only)
def max(self, numeric_only: bool = True) -> pd.Series: def max(self, numeric_only: Optional[bool] = None) -> pd.Series:
""" """
Return the maximum value for each numeric column Return the maximum value for each numeric column
TODO - implement remainder of pandas arguments, currently non-numerics are not supported TODO - implement remainder of pandas arguments, currently non-numerics are not supported
Parameters
----------
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns Returns
------- -------
pandas.Series pandas.Series
@ -365,17 +471,26 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights') >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.max() >>> df.max()
AvgTicketPrice 1199.73 AvgTicketPrice 1199.73
Cancelled True Cancelled True
DistanceKilometers 19881.5
DistanceMiles 12353.8
FlightDelay True
FlightDelayMin 360
FlightTimeHour 31.715
FlightTimeMin 1902.9
dayOfWeek 6 dayOfWeek 6
timestamp 2018-02-11 23:50:12
dtype: object
>>> df.max(numeric_only=True)
AvgTicketPrice 1199.729004
Cancelled 1.000000
dayOfWeek 6.000000
dtype: float64
>>> df.max(numeric_only=False)
AvgTicketPrice 1199.73
Cancelled True
dayOfWeek 6
timestamp 2018-02-11 23:50:12
DestCountry NaN
dtype: object dtype: object
""" """
return self._query_compiler.max(numeric_only=numeric_only) return self._query_compiler.max(numeric_only=numeric_only)
@ -441,18 +556,24 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights') >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.mad() # doctest: +SKIP >>> df.mad() # doctest: +SKIP
AvgTicketPrice 213.368709 AvgTicketPrice 213.35497
Cancelled 0.000000 dayOfWeek 2.00000
DistanceKilometers 2946.168236 dtype: float64
DistanceMiles 1830.987236
FlightDelay 0.000000 >>> df.mad(numeric_only=True) # doctest: +SKIP
FlightDelayMin 0.000000 AvgTicketPrice 213.473011
FlightTimeHour 3.819435
FlightTimeMin 229.142297
dayOfWeek 2.000000 dayOfWeek 2.000000
dtype: float64 dtype: float64
>>> df.mad(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 213.484
Cancelled NaN
dayOfWeek 2
timestamp NaT
DestCountry NaN
dtype: object
""" """
return self._query_compiler.mad(numeric_only=numeric_only) return self._query_compiler.mad(numeric_only=numeric_only)

View File

@ -145,43 +145,28 @@ class Operations:
return build_pd_series(data=counts, index=fields) return build_pd_series(data=counts, index=fields)
def mean(self, query_compiler, numeric_only=True): def _metric_agg_series(
results = self._metric_aggs(query_compiler, ["mean"], numeric_only=numeric_only) self,
return build_pd_series(results, index=results.keys()) query_compiler: "QueryCompiler",
agg: List,
def var(self, query_compiler, numeric_only=True): numeric_only: Optional[bool] = None,
results = self._metric_aggs(query_compiler, ["var"], numeric_only=numeric_only) ) -> pd.Series:
return build_pd_series(results, index=results.keys()) results = self._metric_aggs(query_compiler, agg, numeric_only=numeric_only)
if numeric_only:
def std(self, query_compiler, numeric_only=True): return build_pd_series(results, index=results.keys(), dtype=np.float64)
results = self._metric_aggs(query_compiler, ["std"], numeric_only=numeric_only) else:
return build_pd_series(results, index=results.keys()) # If all results are float convert into float64
if all(isinstance(i, float) for i in results.values()):
def median(self, query_compiler, numeric_only=True): dtype = np.float64
results = self._metric_aggs( # If all results are int convert into int64
query_compiler, ["median"], numeric_only=numeric_only elif all(isinstance(i, int) for i in results.values()):
) dtype = np.int64
return build_pd_series(results, index=results.keys()) # If single result is present consider that datatype instead of object
elif len(results) <= 1:
def sum(self, query_compiler, numeric_only=True): dtype = None
results = self._metric_aggs(query_compiler, ["sum"], numeric_only=numeric_only) else:
return build_pd_series(results, index=results.keys()) dtype = "object"
return build_pd_series(results, index=results.keys(), dtype=dtype)
def max(self, query_compiler, numeric_only=True):
results = self._metric_aggs(query_compiler, ["max"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def min(self, query_compiler, numeric_only=True):
results = self._metric_aggs(query_compiler, ["min"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def nunique(self, query_compiler):
results = self._metric_aggs(query_compiler, ["nunique"], numeric_only=False)
return build_pd_series(results, index=results.keys())
def mad(self, query_compiler, numeric_only=True):
results = self._metric_aggs(query_compiler, ["mad"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def value_counts(self, query_compiler, es_size): def value_counts(self, query_compiler, es_size):
return self._terms_aggs(query_compiler, "terms", es_size) return self._terms_aggs(query_compiler, "terms", es_size)
@ -189,7 +174,21 @@ class Operations:
def hist(self, query_compiler, bins): def hist(self, query_compiler, bins):
return self._hist_aggs(query_compiler, bins) return self._hist_aggs(query_compiler, bins)
def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=True): def aggs(self, query_compiler, pd_aggs, numeric_only=None) -> pd.DataFrame:
results = self._metric_aggs(
query_compiler, pd_aggs, numeric_only=numeric_only, is_dataframe_agg=True
)
return pd.DataFrame(
results, index=pd_aggs, dtype=(np.float64 if numeric_only else None)
)
def _metric_aggs(
self,
query_compiler: "QueryCompiler",
pd_aggs,
numeric_only: Optional[bool] = None,
is_dataframe_agg: bool = False,
) -> Dict:
query_params, post_processing = self._resolve_tasks(query_compiler) query_params, post_processing = self._resolve_tasks(query_compiler)
size = self._size(query_params, post_processing) size = self._size(query_params, post_processing)
@ -201,6 +200,7 @@ class Operations:
results = {} results = {}
fields = query_compiler._mappings.all_source_fields() fields = query_compiler._mappings.all_source_fields()
if numeric_only: if numeric_only:
# Consider if field is Int/Float/Bool
fields = [field for field in fields if (field.is_numeric or field.is_bool)] fields = [field for field in fields if (field.is_numeric or field.is_bool)]
body = Query(query_params.query) body = Query(query_params.query)
@ -210,6 +210,7 @@ class Operations:
for field in fields: for field in fields:
for es_agg in es_aggs: for es_agg in es_aggs:
# NaN/NaT fields are ignored
if not field.is_es_agg_compatible(es_agg): if not field.is_es_agg_compatible(es_agg):
continue continue
@ -241,9 +242,17 @@ class Operations:
for field in fields: for field in fields:
values = [] values = []
for es_agg, pd_agg in zip(es_aggs, pd_aggs): for es_agg, pd_agg in zip(es_aggs, pd_aggs):
# is_dataframe_agg is used to differentiate agg() and an aggregation called through .mean()
# If the field and agg aren't compatible we add a NaN/NaT # If the field and agg aren't compatible we add a NaN/NaT for agg
# If the field and agg aren't compatible we don't add NaN/NaT for an aggregation called through .mean()
if not field.is_es_agg_compatible(es_agg): if not field.is_es_agg_compatible(es_agg):
if is_dataframe_agg and not numeric_only:
values.append(field.nan_value)
elif not is_dataframe_agg and numeric_only is False:
values.append(field.nan_value)
# Explicit condition for mad to add NaN because it doesn't support bool
elif is_dataframe_agg and numeric_only:
if pd_agg == "mad":
values.append(field.nan_value) values.append(field.nan_value)
continue continue
@ -269,7 +278,7 @@ class Operations:
# All of the below calculations result in NaN if count<=1 # All of the below calculations result in NaN if count<=1
if count <= 1: if count <= 1:
agg_value = np.float64(np.NaN) agg_value = np.NaN
elif es_agg[1] == "std_deviation": elif es_agg[1] == "std_deviation":
agg_value *= count / (count - 1.0) agg_value *= count / (count - 1.0)
@ -287,8 +296,11 @@ class Operations:
]["value"] ]["value"]
# Null usually means there were no results. # Null usually means there were no results.
if agg_value is None: if agg_value is None or np.isnan(agg_value):
agg_value = field.nan_value if is_dataframe_agg and not numeric_only:
agg_value = np.NaN
elif not is_dataframe_agg and numeric_only is False:
agg_value = np.NaN
# Cardinality is always either NaN or integer. # Cardinality is always either NaN or integer.
elif pd_agg == "nunique": elif pd_agg == "nunique":
@ -299,13 +311,20 @@ class Operations:
agg_value = elasticsearch_date_to_pandas_date( agg_value = elasticsearch_date_to_pandas_date(
agg_value, field.es_date_format agg_value, field.es_date_format
) )
# If numeric_only is False | None then maintain column datatype
# These aggregations maintain the column datatype elif not numeric_only:
elif pd_agg in {"max", "min", "median"}: # we're only converting to bool for lossless aggs like min, max, and median.
if pd_agg in {"max", "min", "median", "sum"}:
# 'sum' isn't representable with bool, use int64
if pd_agg == "sum" and field.is_bool:
agg_value = np.int64(agg_value)
else:
agg_value = field.np_dtype.type(agg_value) agg_value = field.np_dtype.type(agg_value)
values.append(agg_value) values.append(agg_value)
# If numeric_only is True and We only have a NaN type field then we check for empty.
if values:
results[field.index] = values if len(values) > 1 else values[0] results[field.index] = values if len(values) > 1 else values[0]
return results return results
@ -540,10 +559,6 @@ class Operations:
return es_aggs return es_aggs
def aggs(self, query_compiler, pd_aggs):
results = self._metric_aggs(query_compiler, pd_aggs, numeric_only=False)
return pd.DataFrame(results, index=pd_aggs)
def filter(self, query_compiler, items=None, like=None, regex=None): def filter(self, query_compiler, items=None, like=None, regex=None):
# This function is only called for axis='index', # This function is only called for axis='index',
# DataFrame.filter(..., axis="columns") calls .drop() # DataFrame.filter(..., axis="columns") calls .drop()

View File

@ -17,7 +17,7 @@
import copy import copy
from datetime import datetime from datetime import datetime
from typing import Optional, TYPE_CHECKING from typing import Optional, TYPE_CHECKING, List
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@ -490,38 +490,56 @@ class QueryCompiler:
result._operations.filter(self, items=items, like=like, regex=regex) result._operations.filter(self, items=items, like=like, regex=regex)
return result return result
def aggs(self, func): def aggs(self, func: List[str], numeric_only: Optional[bool] = None):
return self._operations.aggs(self, func) return self._operations.aggs(self, func, numeric_only=numeric_only)
def count(self): def count(self):
return self._operations.count(self) return self._operations.count(self)
def mean(self, numeric_only=None): def mean(self, numeric_only: Optional[bool] = None):
return self._operations.mean(self, numeric_only=numeric_only) return self._operations._metric_agg_series(
self, ["mean"], numeric_only=numeric_only
)
def var(self, numeric_only=None): def var(self, numeric_only: Optional[bool] = None):
return self._operations.var(self, numeric_only=numeric_only) return self._operations._metric_agg_series(
self, ["var"], numeric_only=numeric_only
)
def std(self, numeric_only=None): def std(self, numeric_only: Optional[bool] = None):
return self._operations.std(self, numeric_only=numeric_only) return self._operations._metric_agg_series(
self, ["std"], numeric_only=numeric_only
)
def mad(self, numeric_only=None): def mad(self, numeric_only: Optional[bool] = None):
return self._operations.mad(self, numeric_only=numeric_only) return self._operations._metric_agg_series(
self, ["mad"], numeric_only=numeric_only
)
def median(self, numeric_only=None): def median(self, numeric_only: Optional[bool] = None):
return self._operations.median(self, numeric_only=numeric_only) return self._operations._metric_agg_series(
self, ["median"], numeric_only=numeric_only
)
def sum(self, numeric_only=None): def sum(self, numeric_only: Optional[bool] = None):
return self._operations.sum(self, numeric_only=numeric_only) return self._operations._metric_agg_series(
self, ["sum"], numeric_only=numeric_only
)
def min(self, numeric_only=None): def min(self, numeric_only: Optional[bool] = None):
return self._operations.min(self, numeric_only=numeric_only) return self._operations._metric_agg_series(
self, ["min"], numeric_only=numeric_only
)
def max(self, numeric_only=None): def max(self, numeric_only: Optional[bool] = None):
return self._operations.max(self, numeric_only=numeric_only) return self._operations._metric_agg_series(
self, ["max"], numeric_only=numeric_only
)
def nunique(self): def nunique(self):
return self._operations.nunique(self) return self._operations._metric_agg_series(
self, ["nunique"], numeric_only=False
)
def value_counts(self, es_size): def value_counts(self, es_size):
return self._operations.value_counts(self, es_size) return self._operations.value_counts(self, es_size)

View File

@ -29,7 +29,9 @@ class TestDataFrameAggs(TestData):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(
["sum", "min"], numeric_only=True
)
# Eland returns all float values for all metric aggs, pandas can return int # Eland returns all float values for all metric aggs, pandas can return int
# TODO - investigate this more # TODO - investigate this more
@ -40,22 +42,22 @@ class TestDataFrameAggs(TestData):
["sum", "min", "std"] ["sum", "min", "std"]
) )
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg( ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(
["sum", "min", "std"] ["sum", "min", "std"], numeric_only=True
) )
print(pd_sum_min_std.dtypes) print(pd_sum_min_std.dtypes)
print(ed_sum_min_std.dtypes) print(ed_sum_min_std.dtypes)
assert_frame_equal( assert_frame_equal(pd_sum_min_std, ed_sum_min_std, check_exact=False, rtol=True)
pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True
)
def test_terms_aggs(self): def test_terms_aggs(self):
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(
["sum", "min"], numeric_only=True
)
# Eland returns all float values for all metric aggs, pandas can return int # Eland returns all float values for all metric aggs, pandas can return int
# TODO - investigate this more # TODO - investigate this more
@ -66,15 +68,13 @@ class TestDataFrameAggs(TestData):
["sum", "min", "std"] ["sum", "min", "std"]
) )
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg( ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(
["sum", "min", "std"] ["sum", "min", "std"], numeric_only=True
) )
print(pd_sum_min_std.dtypes) print(pd_sum_min_std.dtypes)
print(ed_sum_min_std.dtypes) print(ed_sum_min_std.dtypes)
assert_frame_equal( assert_frame_equal(pd_sum_min_std, ed_sum_min_std, check_exact=False, rtol=True)
pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True
)
def test_aggs_median_var(self): def test_aggs_median_var(self):
pd_ecommerce = self.pd_ecommerce() pd_ecommerce = self.pd_ecommerce()
@ -85,7 +85,7 @@ class TestDataFrameAggs(TestData):
].agg(["median", "var"]) ].agg(["median", "var"])
ed_aggs = ed_ecommerce[ ed_aggs = ed_ecommerce[
["taxful_total_price", "taxless_total_price", "total_quantity"] ["taxful_total_price", "taxless_total_price", "total_quantity"]
].agg(["median", "var"]) ].agg(["median", "var"], numeric_only=True)
print(pd_aggs, pd_aggs.dtypes) print(pd_aggs, pd_aggs.dtypes)
print(ed_aggs, ed_aggs.dtypes) print(ed_aggs, ed_aggs.dtypes)
@ -102,7 +102,9 @@ class TestDataFrameAggs(TestData):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg(agg) pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg(agg)
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(agg) ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(
agg, numeric_only=True
)
assert_series_equal(pd_sum_min_std, ed_sum_min_std) assert_series_equal(pd_sum_min_std, ed_sum_min_std)
@ -112,7 +114,9 @@ class TestDataFrameAggs(TestData):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["mean"]) pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["mean"])
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["mean"]) ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(
["mean"], numeric_only=True
)
assert_frame_equal(pd_sum_min, ed_sum_min) assert_frame_equal(pd_sum_min, ed_sum_min)

View File

@ -16,18 +16,23 @@
# under the License. # under the License.
# File called _pytest for PyCharm compatibility # File called _pytest for PyCharm compatibility
import pytest import pytest
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from pandas.testing import assert_series_equal from pandas.testing import assert_series_equal
from eland.tests.common import TestData from eland.tests.common import TestData
class TestDataFrameMetrics(TestData): class TestDataFrameMetrics(TestData):
funcs = ["max", "min", "mean", "sum"] funcs = ["max", "min", "mean", "sum"]
extended_funcs = ["median", "mad", "var", "std"] extended_funcs = ["median", "mad", "var", "std"]
filter_data = [
"AvgTicketPrice",
"Cancelled",
"dayOfWeek",
"timestamp",
"DestCountry",
]
@pytest.mark.parametrize("numeric_only", [False, None]) @pytest.mark.parametrize("numeric_only", [False, None])
def test_flights_metrics(self, numeric_only): def test_flights_metrics(self, numeric_only):
@ -49,7 +54,7 @@ class TestDataFrameMetrics(TestData):
pd_metric = getattr(pd_flights, func)(numeric_only=numeric_only) pd_metric = getattr(pd_flights, func)(numeric_only=numeric_only)
ed_metric = getattr(ed_flights, func)(numeric_only=numeric_only) ed_metric = getattr(ed_flights, func)(numeric_only=numeric_only)
assert_series_equal(pd_metric, ed_metric) assert_series_equal(pd_metric, ed_metric, check_dtype=False)
def test_flights_extended_metrics(self): def test_flights_extended_metrics(self):
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
@ -86,11 +91,9 @@ class TestDataFrameMetrics(TestData):
for func in self.extended_funcs: for func in self.extended_funcs:
pd_metric = getattr(pd_flights_1, func)() pd_metric = getattr(pd_flights_1, func)()
ed_metric = getattr(ed_flights_1, func)() ed_metric = getattr(ed_flights_1, func)(numeric_only=False)
assert_series_equal( assert_series_equal(pd_metric, ed_metric, check_exact=False)
pd_metric, ed_metric, check_exact=False, check_less_precise=True
)
# Test on zero rows to test NaN behaviour of sample std/variance # Test on zero rows to test NaN behaviour of sample std/variance
pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"][["AvgTicketPrice"]] pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
@ -98,11 +101,9 @@ class TestDataFrameMetrics(TestData):
for func in self.extended_funcs: for func in self.extended_funcs:
pd_metric = getattr(pd_flights_0, func)() pd_metric = getattr(pd_flights_0, func)()
ed_metric = getattr(ed_flights_0, func)() ed_metric = getattr(ed_flights_0, func)(numeric_only=False)
assert_series_equal( assert_series_equal(pd_metric, ed_metric, check_exact=False)
pd_metric, ed_metric, check_exact=False, check_less_precise=True
)
def test_ecommerce_selected_non_numeric_source_fields(self): def test_ecommerce_selected_non_numeric_source_fields(self):
# None of these are numeric # None of these are numeric
@ -121,7 +122,7 @@ class TestDataFrameMetrics(TestData):
assert_series_equal( assert_series_equal(
getattr(pd_ecommerce, func)(numeric_only=True), getattr(pd_ecommerce, func)(numeric_only=True),
getattr(ed_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True, check_exact=False,
) )
def test_ecommerce_selected_mixed_numeric_source_fields(self): def test_ecommerce_selected_mixed_numeric_source_fields(self):
@ -143,7 +144,7 @@ class TestDataFrameMetrics(TestData):
assert_series_equal( assert_series_equal(
getattr(pd_ecommerce, func)(numeric_only=True), getattr(pd_ecommerce, func)(numeric_only=True),
getattr(ed_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True, check_exact=False,
) )
def test_ecommerce_selected_all_numeric_source_fields(self): def test_ecommerce_selected_all_numeric_source_fields(self):
@ -157,27 +158,27 @@ class TestDataFrameMetrics(TestData):
assert_series_equal( assert_series_equal(
getattr(pd_ecommerce, func)(numeric_only=True), getattr(pd_ecommerce, func)(numeric_only=True),
getattr(ed_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True, check_exact=False,
) )
def test_flights_datetime_metrics_agg(self): def test_flights_datetime_metrics_agg(self):
ed_timestamps = self.ed_flights()[["timestamp"]] ed_timestamps = self.ed_flights()[["timestamp"]]
expected_values = { expected_values = {
"timestamp": { "max": pd.Timestamp("2018-02-11 23:50:12"),
"min": pd.Timestamp("2018-01-01 00:00:00"), "min": pd.Timestamp("2018-01-01 00:00:00"),
"mean": pd.Timestamp("2018-01-21 19:20:45.564438232"), "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
"max": pd.Timestamp("2018-02-11 23:50:12"),
"nunique": 12236,
"mad": pd.NaT,
"std": pd.NaT,
"sum": pd.NaT, "sum": pd.NaT,
"mad": pd.NaT,
"var": pd.NaT, "var": pd.NaT,
} "std": pd.NaT,
"nunique": 12236,
} }
ed_metrics = ed_timestamps.agg(self.funcs + self.extended_funcs + ["nunique"]) ed_metrics = ed_timestamps.agg(
ed_metrics_dict = ed_metrics.to_dict() self.funcs + self.extended_funcs + ["nunique"], numeric_only=False
ed_metrics_dict["timestamp"].pop("median") # Median is tested below. )
ed_metrics_dict = ed_metrics["timestamp"].to_dict()
ed_metrics_dict.pop("median") # Median is tested below.
assert ed_metrics_dict == expected_values assert ed_metrics_dict == expected_values
@pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"]) @pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"])
@ -192,8 +193,10 @@ class TestDataFrameMetrics(TestData):
ed_metric = ed_timestamps.agg([agg]) ed_metric = ed_timestamps.agg([agg])
if agg == "nunique": if agg == "nunique":
# df with timestamp column should return int64
assert ed_metric.dtypes["timestamp"] == np.int64 assert ed_metric.dtypes["timestamp"] == np.int64
else: else:
# df with timestamp column should return datetime64[ns]
assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]") assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]")
assert ed_metric["timestamp"][0] == expected_values[agg] assert ed_metric["timestamp"][0] == expected_values[agg]
@ -230,7 +233,7 @@ class TestDataFrameMetrics(TestData):
) )
def test_metric_agg_keep_dtypes(self): def test_metric_agg_keep_dtypes(self):
# max, min, and median maintain their dtypes # max, min and median maintain their dtypes
df = self.ed_flights_small()[["AvgTicketPrice", "Cancelled", "dayOfWeek"]] df = self.ed_flights_small()[["AvgTicketPrice", "Cancelled", "dayOfWeek"]]
assert df.min().tolist() == [131.81910705566406, False, 0] assert df.min().tolist() == [131.81910705566406, False, 0]
assert df.max().tolist() == [989.9527587890625, True, 0] assert df.max().tolist() == [989.9527587890625, True, 0]
@ -250,3 +253,162 @@ class TestDataFrameMetrics(TestData):
"Cancelled": {"max": True, "median": False, "min": False}, "Cancelled": {"max": True, "median": False, "min": False},
"dayOfWeek": {"max": 0, "median": 0, "min": 0}, "dayOfWeek": {"max": 0, "median": 0, "min": 0},
} }
# sum should always be the same dtype as the input, except for bool where the sum of bools should be an int64.
sum_agg = df.agg(["sum"])
assert sum_agg.dtypes.to_list() == [
np.dtype("float64"),
np.dtype("int64"),
np.dtype("int64"),
]
assert sum_agg.to_dict() == {
"AvgTicketPrice": {"sum": 26521.624084472656},
"Cancelled": {"sum": 6},
"dayOfWeek": {"sum": 0},
}
def test_flights_numeric_only(self):
    """Compare eland agg(numeric_only=True) against pandas single-agg results.

    Every aggregation in funcs + extended_funcs is run once through
    ``DataFrame.agg(..., numeric_only=True)`` and checked column-by-column
    against the equivalent pandas reduction with ``numeric_only=True``.
    """
    # All Aggregations Data Check
    ed_flights = self.ed_flights().filter(self.filter_data)
    pd_flights = self.pd_flights().filter(self.filter_data)
    # agg => numeric_only True returns float64 values
    # We compare it with individual single agg functions of pandas with numeric_only=True
    filtered_aggs = self.funcs + self.extended_funcs
    agg_data = ed_flights.agg(filtered_aggs, numeric_only=True).transpose()
    for agg in filtered_aggs:
        # Explicitly check for mad because it returns nan for bools
        if agg == "mad":
            assert np.isnan(agg_data[agg]["Cancelled"])
        else:
            # BUG FIX: ``rtol`` expects a float tolerance; the previous
            # ``rtol=True`` coerced to 1.0 (100% relative tolerance), making
            # the comparison nearly vacuous. 1e-3 matches the precision the
            # deprecated ``check_less_precise=True`` used to provide.
            assert_series_equal(
                agg_data[agg].rename(None),
                getattr(pd_flights, agg)(numeric_only=True),
                check_exact=False,
                rtol=1e-3,
            )
# all single aggs return float64 for numeric_only=True
def test_numeric_only_true_single_aggs(self):
    """Every single-agg reduction yields a float64 Series under numeric_only=True."""
    flights = self.ed_flights().filter(self.filter_data)
    for func in self.funcs + self.extended_funcs:
        series = getattr(flights, func)(numeric_only=True)
        assert series.dtype == np.dtype("float64")
        # "mad" drops the bool column, leaving 2 entries instead of 3
        expected_shape = (2,) if func == "mad" else (3,)
        assert series.shape == expected_shape
# check dtypes and shape of min, max and median for numeric_only=False | None
@pytest.mark.parametrize("agg", ["min", "max", "median"])
@pytest.mark.parametrize("numeric_only", [False, None])
def test_min_max_median_numeric_only(self, agg, numeric_only):
    """min/max/median keep native dtypes; None drops the non-aggregatable column."""
    flights = self.ed_flights().filter(self.filter_data)
    result = getattr(flights, agg)(numeric_only=numeric_only)
    # Native dtypes are preserved in both modes
    assert isinstance(result["AvgTicketPrice"], np.float64)
    assert isinstance(result["Cancelled"], np.bool_)
    assert isinstance(result["dayOfWeek"], np.int64)
    assert isinstance(result["timestamp"], pd.Timestamp)
    if numeric_only is False:
        # keyword column is kept but carries no value
        assert np.isnan(result["DestCountry"])
        assert result.shape == (5,)
    elif numeric_only is None:
        # DestCountry is dropped entirely
        assert result.shape == (4,)
# check dtypes and shape for sum
@pytest.mark.parametrize("numeric_only", [False, None])
def test_sum_numeric_only(self, numeric_only):
    """sum keeps input dtypes (bools sum to int64); None drops non-numeric columns."""
    flights = self.ed_flights().filter(self.filter_data)
    result = flights.sum(numeric_only=numeric_only)
    if numeric_only is False:
        assert isinstance(result["AvgTicketPrice"], np.float64)
        assert isinstance(result["dayOfWeek"], np.int64)
        # summing a bool column yields an int64 count
        assert isinstance(result["Cancelled"], np.int64)
        # datetime and keyword columns cannot be summed
        assert pd.isnull(result["timestamp"])
        assert np.isnan(result["DestCountry"])
        assert result.shape == (5,)
    elif numeric_only is None:
        expected_dtypes = [
            np.dtype("float64"),
            np.dtype("int64"),
            np.dtype("int64"),
        ]
        actual_dtypes = [result[label].dtype for label in result.index]
        assert actual_dtypes == expected_dtypes
        assert result.shape == (3,)
# check dtypes and shape for std
@pytest.mark.parametrize("numeric_only", [False, None])
def test_std_numeric_only(self, numeric_only):
    """std returns floats for numeric columns; False keeps null non-numeric entries."""
    flights = self.ed_flights().filter(self.filter_data)
    result = flights.std(numeric_only=numeric_only)
    # numeric columns always come back as floats
    for label in ("AvgTicketPrice", "Cancelled", "dayOfWeek"):
        assert isinstance(result[label], float)
    if numeric_only is False:
        # std is undefined for datetime/keyword fields
        assert pd.isnull(result["timestamp"])
        assert np.isnan(result["DestCountry"])
        assert result.shape == (5,)
    elif numeric_only is None:
        assert result.shape == (3,)
# check dtypes and shape for var
@pytest.mark.parametrize("numeric_only", [False, None])
def test_var_numeric_only(self, numeric_only):
    """var promotes numeric columns to float64; None drops non-numeric columns."""
    flights = self.ed_flights().filter(self.filter_data)
    result = flights.var(numeric_only=numeric_only)
    if numeric_only is False:
        # variance is promoted to float64 for every numeric column
        for label in ("AvgTicketPrice", "dayOfWeek", "Cancelled"):
            assert isinstance(result[label], np.float64)
        # var is undefined for datetime/keyword fields
        assert pd.isnull(result["timestamp"])
        assert np.isnan(result["DestCountry"])
        assert result.shape == (5,)
    elif numeric_only is None:
        for label in ("AvgTicketPrice", "Cancelled", "dayOfWeek"):
            assert isinstance(result[label], float)
        assert result.shape == (3,)
# check dtypes and shape for mean
@pytest.mark.parametrize("numeric_only", [False, None])
def test_mean_numeric_only(self, numeric_only):
    """mean averages numeric columns to floats and timestamps to a Timestamp."""
    flights = self.ed_flights().filter(self.filter_data)
    result = flights.mean(numeric_only=numeric_only)
    # numeric columns average to float, the datetime column to a Timestamp
    for label in ("AvgTicketPrice", "dayOfWeek", "Cancelled"):
        assert isinstance(result[label], float)
    assert isinstance(result["timestamp"], pd.Timestamp)
    if numeric_only is False:
        # keyword column is kept but has no mean
        assert np.isnan(result["DestCountry"])
        assert result.shape == (5,)
    elif numeric_only is None:
        # DestCountry is dropped entirely
        assert result.shape == (4,)
# check dtypes and shape for mad
@pytest.mark.parametrize("numeric_only", [False, None])
def test_mad_numeric_only(self, numeric_only):
    """mad returns floats; bool mad is NaN under False, and dropped under None."""
    flights = self.ed_flights().filter(self.filter_data)
    result = flights.mad(numeric_only=numeric_only)
    if numeric_only is False:
        assert isinstance(result["AvgTicketPrice"], float)
        # mad of a bool column is NaN, still carried as float64
        assert isinstance(result["Cancelled"], np.float64)
        assert isinstance(result["dayOfWeek"], float)
        assert pd.isnull(result["timestamp"])
        assert np.isnan(result["DestCountry"])
        assert result.shape == (5,)
    elif numeric_only is None:
        # bool, datetime and keyword columns are all dropped
        assert isinstance(result["AvgTicketPrice"], float)
        assert isinstance(result["dayOfWeek"], float)
        assert result.shape == (2,)

View File

@ -72,7 +72,7 @@ class TestSeriesMetrics(TestData):
if func == "nunique": # nunique never returns 'NaN' if func == "nunique": # nunique never returns 'NaN'
continue continue
ed_metric = getattr(ed_ecommerce, func)() ed_metric = getattr(ed_ecommerce, func)(numeric_only=False)
print(func, ed_metric) print(func, ed_metric)
assert np.isnan(ed_metric) assert np.isnan(ed_metric)
@ -86,7 +86,9 @@ class TestSeriesMetrics(TestData):
for func in self.all_funcs: for func in self.all_funcs:
pd_metric = getattr(pd_ecommerce, func)() pd_metric = getattr(pd_ecommerce, func)()
ed_metric = getattr(ed_ecommerce, func)() ed_metric = getattr(ed_ecommerce, func)(
**({"numeric_only": True} if (func != "nunique") else {})
)
self.assert_almost_equal_for_agg(func, pd_metric, ed_metric) self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)
@pytest.mark.parametrize("agg", ["mean", "min", "max"]) @pytest.mark.parametrize("agg", ["mean", "min", "max"])