Switch agg defaults to numeric_only=None

This commit is contained in:
P. Sai Vinay 2020-09-22 21:02:27 +05:30 committed by GitHub
parent c86371733d
commit 4d96ad39fd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 576 additions and 217 deletions

View File

@ -19,7 +19,7 @@ import sys
import warnings
from io import StringIO
import re
from typing import Optional, Sequence, Union, Tuple
from typing import Optional, Sequence, Union, Tuple, List
import numpy as np
import pandas as pd
@ -1328,7 +1328,14 @@ class DataFrame(NDFrame):
"""
return self.columns
def aggregate(self, func, axis=0, *args, **kwargs):
def aggregate(
self,
func: Union[str, List[str]],
axis: int = 0,
numeric_only: Optional[bool] = None,
*args,
**kwargs,
) -> Union[pd.Series, pd.DataFrame]:
"""
Aggregate using one or more operations over the specified axis.
@ -1347,8 +1354,13 @@ class DataFrame(NDFrame):
Currently, we only support ``['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile',
'rank', 'sem', 'skew', 'sum', 'std', 'var']``
axis
axis: int
Currently, we only support axis=0 (index)
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: returns all values as float64, NaN/NaT values are removed.
- False: returns all values in their default datatype, NaN/NaT values are preserved.
- None: returns all values in their default datatype.
*args
Positional arguments to pass to `func`
**kwargs
@ -1368,12 +1380,30 @@ class DataFrame(NDFrame):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']).astype(int)
DistanceKilometers AvgTicketPrice
sum 92616288 8204364
min 0 100
std 4578 266
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'DistanceKilometers', 'timestamp', 'DestCountry'])
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=True).astype(int)
AvgTicketPrice DistanceKilometers
sum 8204364 92616288
min 100 0
std 266 4578
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=True)
AvgTicketPrice DistanceKilometers
sum 8.204365e+06 9.261629e+07
min 1.000205e+02 0.000000e+00
std 2.664071e+02 4.578614e+03
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=False)
AvgTicketPrice DistanceKilometers timestamp DestCountry
sum 8.204365e+06 9.261629e+07 NaT NaN
min 1.000205e+02 0.000000e+00 2018-01-01 NaN
std 2.664071e+02 4.578614e+03 NaT NaN
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=None)
AvgTicketPrice DistanceKilometers timestamp DestCountry
sum 8.204365e+06 9.261629e+07 NaT NaN
min 1.000205e+02 0.000000e+00 2018-01-01 NaN
std 2.664071e+02 4.578614e+03 NaT NaN
"""
axis = pd.DataFrame._get_axis_number(axis)
@ -1387,10 +1417,14 @@ class DataFrame(NDFrame):
# 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique']
if isinstance(func, str):
# Wrap in list
return self._query_compiler.aggs([func]).squeeze().rename(None)
return (
self._query_compiler.aggs([func], numeric_only=numeric_only)
.squeeze()
.rename(None)
)
elif is_list_like(func):
# we have a list!
return self._query_compiler.aggs(func)
return self._query_compiler.aggs(func, numeric_only=numeric_only)
agg = aggregate

View File

@ -100,6 +100,9 @@ class Field(NamedTuple):
# Cardinality works for all types
# Numerics and bools work for all aggs
# Except "median_absolute_deviation" which doesn't support bool
if es_agg == "median_absolute_deviation" and self.is_bool:
return False
if es_agg == "cardinality" or self.is_numeric or self.is_bool:
return True
# Timestamps also work for 'min', 'max' and 'avg'

View File

@ -17,7 +17,7 @@
import sys
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Tuple
from typing import TYPE_CHECKING, Tuple, Optional
import pandas as pd
from eland.query_compiler import QueryCompiler
@ -162,12 +162,19 @@ class NDFrame(ABC):
def _es_info(self, buf):
self._query_compiler.es_info(buf)
def mean(self, numeric_only: bool = True) -> pd.Series:
def mean(self, numeric_only: Optional[bool] = None) -> pd.Series:
"""
Return mean value for each numeric column
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
Parameters
----------
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.Series
@ -179,27 +186,44 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.mean()
AvgTicketPrice 628.254
Cancelled 0.128494
dayOfWeek 2.83598
timestamp 2018-01-21 19:20:45.564438232
dtype: object
>>> df.mean(numeric_only=True)
AvgTicketPrice 628.253689
Cancelled 0.128494
DistanceKilometers 7092.142457
DistanceMiles 4406.853010
FlightDelay 0.251168
FlightDelayMin 47.335171
FlightTimeHour 8.518797
FlightTimeMin 511.127842
dayOfWeek 2.835975
dtype: float64
>>> df.mean(numeric_only=False)
AvgTicketPrice 628.254
Cancelled 0.128494
dayOfWeek 2.83598
timestamp 2018-01-21 19:20:45.564438232
DestCountry NaN
dtype: object
"""
return self._query_compiler.mean(numeric_only=numeric_only)
def sum(self, numeric_only: bool = True) -> pd.Series:
def sum(self, numeric_only: Optional[bool] = None) -> pd.Series:
"""
Return sum for each numeric column
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
Parameters
----------
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.Series
@ -211,27 +235,43 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.sum()
AvgTicketPrice 8.20436e+06
Cancelled 1678
dayOfWeek 37035
dtype: object
>>> df.sum(numeric_only=True)
AvgTicketPrice 8.204365e+06
Cancelled 1.678000e+03
DistanceKilometers 9.261629e+07
DistanceMiles 5.754909e+07
FlightDelay 3.280000e+03
FlightDelayMin 6.181500e+05
FlightTimeHour 1.112470e+05
FlightTimeMin 6.674818e+06
dayOfWeek 3.703500e+04
dtype: float64
>>> df.sum(numeric_only=False)
AvgTicketPrice 8.20436e+06
Cancelled 1678
dayOfWeek 37035
timestamp NaT
DestCountry NaN
dtype: object
"""
return self._query_compiler.sum(numeric_only=numeric_only)
def min(self, numeric_only: bool = True) -> pd.Series:
def min(self, numeric_only: Optional[bool] = None) -> pd.Series:
"""
Return the minimum value for each numeric column
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
Parameters
----------
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.Series
@ -243,25 +283,42 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.min()
AvgTicketPrice 100.021
Cancelled False
DistanceKilometers 0
DistanceMiles 0
FlightDelay False
FlightDelayMin 0
FlightTimeHour 0
FlightTimeMin 0
dayOfWeek 0
timestamp 2018-01-01 00:00:00
dtype: object
>>> df.min(numeric_only=True)
AvgTicketPrice 100.020531
Cancelled 0.000000
dayOfWeek 0.000000
dtype: float64
>>> df.min(numeric_only=False)
AvgTicketPrice 100.021
Cancelled False
dayOfWeek 0
timestamp 2018-01-01 00:00:00
DestCountry NaN
dtype: object
"""
return self._query_compiler.min(numeric_only=numeric_only)
def var(self, numeric_only: bool = True) -> pd.Series:
def var(self, numeric_only: Optional[bool] = None) -> pd.Series:
"""
Return variance for each numeric column
Parameters
----------
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.Series
@ -273,25 +330,41 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.var() # doctest: +SKIP
AvgTicketPrice 7.096185e+04
Cancelled 1.119831e-01
DistanceKilometers 2.096049e+07
DistanceMiles 8.092892e+06
FlightDelay 1.880825e-01
FlightDelayMin 9.359209e+03
FlightTimeHour 3.112545e+01
FlightTimeMin 1.120516e+05
dayOfWeek 3.761135e+00
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.var()
AvgTicketPrice 70964.570234
Cancelled 0.111987
dayOfWeek 3.761279
dtype: float64
>>> df.var(numeric_only=True)
AvgTicketPrice 70964.570234
Cancelled 0.111987
dayOfWeek 3.761279
dtype: float64
>>> df.var(numeric_only=False)
AvgTicketPrice 70964.6
Cancelled 0.111987
dayOfWeek 3.76128
timestamp NaT
DestCountry NaN
dtype: object
"""
return self._query_compiler.var(numeric_only=numeric_only)
def std(self, numeric_only: bool = True) -> pd.Series:
def std(self, numeric_only: Optional[bool] = None) -> pd.Series:
"""
Return standard deviation for each numeric column
Parameters
----------
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.Series
@ -303,25 +376,41 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.std() # doctest: +SKIP
AvgTicketPrice 266.386661
Cancelled 0.334639
DistanceKilometers 4578.263193
DistanceMiles 2844.800855
FlightDelay 0.433685
FlightDelayMin 96.743006
FlightTimeHour 5.579019
FlightTimeMin 334.741135
dayOfWeek 1.939365
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.std()
AvgTicketPrice 266.407061
Cancelled 0.334664
dayOfWeek 1.939513
dtype: float64
>>> df.std(numeric_only=True)
AvgTicketPrice 266.407061
Cancelled 0.334664
dayOfWeek 1.939513
dtype: float64
>>> df.std(numeric_only=False)
AvgTicketPrice 266.407
Cancelled 0.334664
dayOfWeek 1.93951
timestamp NaT
DestCountry NaN
dtype: object
"""
return self._query_compiler.std(numeric_only=numeric_only)
def median(self, numeric_only: bool = True) -> pd.Series:
def median(self, numeric_only: Optional[bool] = None) -> pd.Series:
"""
Return the median value for each numeric column
Parameters
----------
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.Series
@ -333,27 +422,44 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.median() # doctest: +SKIP
AvgTicketPrice 640.387285
AvgTicketPrice 640.363
Cancelled False
dayOfWeek 3
timestamp 2018-01-21 23:54:06.624776611
dtype: object
>>> df.median(numeric_only=True) # doctest: +SKIP
AvgTicketPrice 640.362667
Cancelled 0.000000
DistanceKilometers 7612.072403
DistanceMiles 4729.922470
FlightDelay 0.000000
FlightDelayMin 0.000000
FlightTimeHour 8.383113
FlightTimeMin 503.148975
dayOfWeek 3.000000
dtype: float64
>>> df.median(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 640.387
Cancelled False
dayOfWeek 3
timestamp 2018-01-21 23:54:06.624776611
DestCountry NaN
dtype: object
"""
return self._query_compiler.median(numeric_only=numeric_only)
def max(self, numeric_only: bool = True) -> pd.Series:
def max(self, numeric_only: Optional[bool] = None) -> pd.Series:
"""
Return the maximum value for each numeric column
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
Parameters
----------
numeric_only: {True, False, None} Default is None
Which datatype to be returned
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.Series
@ -365,17 +471,26 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.max()
AvgTicketPrice 1199.73
Cancelled True
DistanceKilometers 19881.5
DistanceMiles 12353.8
FlightDelay True
FlightDelayMin 360
FlightTimeHour 31.715
FlightTimeMin 1902.9
dayOfWeek 6
timestamp 2018-02-11 23:50:12
dtype: object
>>> df.max(numeric_only=True)
AvgTicketPrice 1199.729004
Cancelled 1.000000
dayOfWeek 6.000000
dtype: float64
>>> df.max(numeric_only=False)
AvgTicketPrice 1199.73
Cancelled True
dayOfWeek 6
timestamp 2018-02-11 23:50:12
DestCountry NaN
dtype: object
"""
return self._query_compiler.max(numeric_only=numeric_only)
@ -441,18 +556,24 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.mad() # doctest: +SKIP
AvgTicketPrice 213.368709
Cancelled 0.000000
DistanceKilometers 2946.168236
DistanceMiles 1830.987236
FlightDelay 0.000000
FlightDelayMin 0.000000
FlightTimeHour 3.819435
FlightTimeMin 229.142297
AvgTicketPrice 213.35497
dayOfWeek 2.00000
dtype: float64
>>> df.mad(numeric_only=True) # doctest: +SKIP
AvgTicketPrice 213.473011
dayOfWeek 2.000000
dtype: float64
>>> df.mad(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 213.484
Cancelled NaN
dayOfWeek 2
timestamp NaT
DestCountry NaN
dtype: object
"""
return self._query_compiler.mad(numeric_only=numeric_only)

View File

@ -145,43 +145,28 @@ class Operations:
return build_pd_series(data=counts, index=fields)
def mean(self, query_compiler, numeric_only=True):
results = self._metric_aggs(query_compiler, ["mean"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def var(self, query_compiler, numeric_only=True):
results = self._metric_aggs(query_compiler, ["var"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def std(self, query_compiler, numeric_only=True):
results = self._metric_aggs(query_compiler, ["std"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def median(self, query_compiler, numeric_only=True):
results = self._metric_aggs(
query_compiler, ["median"], numeric_only=numeric_only
)
return build_pd_series(results, index=results.keys())
def sum(self, query_compiler, numeric_only=True):
results = self._metric_aggs(query_compiler, ["sum"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def max(self, query_compiler, numeric_only=True):
results = self._metric_aggs(query_compiler, ["max"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def min(self, query_compiler, numeric_only=True):
results = self._metric_aggs(query_compiler, ["min"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def nunique(self, query_compiler):
results = self._metric_aggs(query_compiler, ["nunique"], numeric_only=False)
return build_pd_series(results, index=results.keys())
def mad(self, query_compiler, numeric_only=True):
results = self._metric_aggs(query_compiler, ["mad"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def _metric_agg_series(
    self,
    query_compiler: "QueryCompiler",
    agg: List,
    numeric_only: Optional[bool] = None,
) -> pd.Series:
    """Run a single metric aggregation and wrap the result in a pandas Series.

    Parameters
    ----------
    query_compiler:
        Compiler holding the query/mappings for the index being aggregated.
    agg:
        List naming the pandas aggregation to run (e.g. ``["mean"]``).
    numeric_only:
        True -> coerce every value to float64.
        False/None -> keep native value types and infer a common dtype.

    Returns
    -------
    pandas.Series
    """
    results = self._metric_aggs(query_compiler, agg, numeric_only=numeric_only)
    if numeric_only:
        # numeric_only=True always yields a float64 Series.
        return build_pd_series(results, index=results.keys(), dtype=np.float64)

    values = list(results.values())
    if all(isinstance(value, float) for value in values):
        # Homogeneous floats (also the branch taken for an empty result set).
        inferred_dtype = np.float64
    elif all(isinstance(value, int) for value in values):
        inferred_dtype = np.int64
    elif len(values) <= 1:
        # A single mixed value keeps its own datatype instead of "object".
        inferred_dtype = None
    else:
        inferred_dtype = "object"
    return build_pd_series(results, index=results.keys(), dtype=inferred_dtype)
def value_counts(self, query_compiler, es_size):
return self._terms_aggs(query_compiler, "terms", es_size)
@ -189,7 +174,21 @@ class Operations:
def hist(self, query_compiler, bins):
return self._hist_aggs(query_compiler, bins)
def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=True):
def aggs(self, query_compiler, pd_aggs, numeric_only=None) -> pd.DataFrame:
    """Run several metric aggregations (DataFrame.agg) and return a DataFrame.

    Rows are the requested aggregation names, columns the aggregated fields.
    With ``numeric_only=True`` every value is coerced to float64; otherwise
    each column keeps whatever dtype pandas infers.
    """
    # is_dataframe_agg=True keeps NaN/NaT placeholders for incompatible fields.
    frame_dtype = np.float64 if numeric_only else None
    results = self._metric_aggs(
        query_compiler, pd_aggs, numeric_only=numeric_only, is_dataframe_agg=True
    )
    return pd.DataFrame(results, index=pd_aggs, dtype=frame_dtype)
def _metric_aggs(
self,
query_compiler: "QueryCompiler",
pd_aggs,
numeric_only: Optional[bool] = None,
is_dataframe_agg: bool = False,
) -> Dict:
query_params, post_processing = self._resolve_tasks(query_compiler)
size = self._size(query_params, post_processing)
@ -201,6 +200,7 @@ class Operations:
results = {}
fields = query_compiler._mappings.all_source_fields()
if numeric_only:
# Consider if field is Int/Float/Bool
fields = [field for field in fields if (field.is_numeric or field.is_bool)]
body = Query(query_params.query)
@ -210,6 +210,7 @@ class Operations:
for field in fields:
for es_agg in es_aggs:
# NaN/NaT fields are ignored
if not field.is_es_agg_compatible(es_agg):
continue
@ -241,9 +242,17 @@ class Operations:
for field in fields:
values = []
for es_agg, pd_agg in zip(es_aggs, pd_aggs):
# If the field and agg aren't compatible we add a NaN/NaT
# is_dataframe_agg differentiates agg() from a single aggregation called through e.g. .mean().
# If the field and agg aren't compatible we add a NaN/NaT placeholder for agg();
# for a single aggregation NaN/NaT is only added when numeric_only is False.
if not field.is_es_agg_compatible(es_agg):
if is_dataframe_agg and not numeric_only:
values.append(field.nan_value)
elif not is_dataframe_agg and numeric_only is False:
values.append(field.nan_value)
# Explicit condition for mad to add NaN because it doesn't support bool
elif is_dataframe_agg and numeric_only:
if pd_agg == "mad":
values.append(field.nan_value)
continue
@ -269,7 +278,7 @@ class Operations:
# All of the below calculations result in NaN if count<=1
if count <= 1:
agg_value = np.float64(np.NaN)
agg_value = np.NaN
elif es_agg[1] == "std_deviation":
agg_value *= count / (count - 1.0)
@ -287,8 +296,11 @@ class Operations:
]["value"]
# Null usually means there were no results.
if agg_value is None:
agg_value = field.nan_value
if agg_value is None or np.isnan(agg_value):
if is_dataframe_agg and not numeric_only:
agg_value = np.NaN
elif not is_dataframe_agg and numeric_only is False:
agg_value = np.NaN
# Cardinality is always either NaN or integer.
elif pd_agg == "nunique":
@ -299,13 +311,20 @@ class Operations:
agg_value = elasticsearch_date_to_pandas_date(
agg_value, field.es_date_format
)
# These aggregations maintain the column datatype
elif pd_agg in {"max", "min", "median"}:
# If numeric_only is False | None then maintain column datatype
elif not numeric_only:
# Only lossless aggs (min, max, median, sum) keep the column's original dtype.
if pd_agg in {"max", "min", "median", "sum"}:
# 'sum' isn't representable with bool, use int64
if pd_agg == "sum" and field.is_bool:
agg_value = np.int64(agg_value)
else:
agg_value = field.np_dtype.type(agg_value)
values.append(agg_value)
# If numeric_only is True and only NaN/NaT fields were requested, values may be empty.
if values:
results[field.index] = values if len(values) > 1 else values[0]
return results
@ -540,10 +559,6 @@ class Operations:
return es_aggs
def aggs(self, query_compiler, pd_aggs):
results = self._metric_aggs(query_compiler, pd_aggs, numeric_only=False)
return pd.DataFrame(results, index=pd_aggs)
def filter(self, query_compiler, items=None, like=None, regex=None):
# This function is only called for axis='index',
# DataFrame.filter(..., axis="columns") calls .drop()

View File

@ -17,7 +17,7 @@
import copy
from datetime import datetime
from typing import Optional, TYPE_CHECKING
from typing import Optional, TYPE_CHECKING, List
import numpy as np
import pandas as pd
@ -490,38 +490,56 @@ class QueryCompiler:
result._operations.filter(self, items=items, like=like, regex=regex)
return result
def aggs(self, func):
return self._operations.aggs(self, func)
def aggs(self, func: List[str], numeric_only: Optional[bool] = None) -> pd.DataFrame:
# Delegate DataFrame.agg()-style multi-function aggregation to Operations.aggs;
# numeric_only controls whether results are coerced to float64.
return self._operations.aggs(self, func, numeric_only=numeric_only)
def count(self):
return self._operations.count(self)
def mean(self, numeric_only=None):
return self._operations.mean(self, numeric_only=numeric_only)
def mean(self, numeric_only: Optional[bool] = None):
return self._operations._metric_agg_series(
self, ["mean"], numeric_only=numeric_only
)
def var(self, numeric_only=None):
return self._operations.var(self, numeric_only=numeric_only)
def var(self, numeric_only: Optional[bool] = None):
return self._operations._metric_agg_series(
self, ["var"], numeric_only=numeric_only
)
def std(self, numeric_only=None):
return self._operations.std(self, numeric_only=numeric_only)
def std(self, numeric_only: Optional[bool] = None):
return self._operations._metric_agg_series(
self, ["std"], numeric_only=numeric_only
)
def mad(self, numeric_only=None):
return self._operations.mad(self, numeric_only=numeric_only)
def mad(self, numeric_only: Optional[bool] = None):
return self._operations._metric_agg_series(
self, ["mad"], numeric_only=numeric_only
)
def median(self, numeric_only=None):
return self._operations.median(self, numeric_only=numeric_only)
def median(self, numeric_only: Optional[bool] = None):
return self._operations._metric_agg_series(
self, ["median"], numeric_only=numeric_only
)
def sum(self, numeric_only=None):
return self._operations.sum(self, numeric_only=numeric_only)
def sum(self, numeric_only: Optional[bool] = None):
return self._operations._metric_agg_series(
self, ["sum"], numeric_only=numeric_only
)
def min(self, numeric_only=None):
return self._operations.min(self, numeric_only=numeric_only)
def min(self, numeric_only: Optional[bool] = None):
return self._operations._metric_agg_series(
self, ["min"], numeric_only=numeric_only
)
def max(self, numeric_only=None):
return self._operations.max(self, numeric_only=numeric_only)
def max(self, numeric_only: Optional[bool] = None):
return self._operations._metric_agg_series(
self, ["max"], numeric_only=numeric_only
)
def nunique(self):
return self._operations.nunique(self)
return self._operations._metric_agg_series(
self, ["nunique"], numeric_only=False
)
def value_counts(self, es_size):
return self._operations.value_counts(self, es_size)

View File

@ -29,7 +29,9 @@ class TestDataFrameAggs(TestData):
ed_flights = self.ed_flights()
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(
["sum", "min"], numeric_only=True
)
# Eland returns all float values for all metric aggs, pandas can return int
# TODO - investigate this more
@ -40,22 +42,22 @@ class TestDataFrameAggs(TestData):
["sum", "min", "std"]
)
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(
["sum", "min", "std"]
["sum", "min", "std"], numeric_only=True
)
print(pd_sum_min_std.dtypes)
print(ed_sum_min_std.dtypes)
assert_frame_equal(
pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True
)
assert_frame_equal(pd_sum_min_std, ed_sum_min_std, check_exact=False, rtol=True)
def test_terms_aggs(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(
["sum", "min"], numeric_only=True
)
# Eland returns all float values for all metric aggs, pandas can return int
# TODO - investigate this more
@ -66,15 +68,13 @@ class TestDataFrameAggs(TestData):
["sum", "min", "std"]
)
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(
["sum", "min", "std"]
["sum", "min", "std"], numeric_only=True
)
print(pd_sum_min_std.dtypes)
print(ed_sum_min_std.dtypes)
assert_frame_equal(
pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True
)
assert_frame_equal(pd_sum_min_std, ed_sum_min_std, check_exact=False, rtol=True)
def test_aggs_median_var(self):
pd_ecommerce = self.pd_ecommerce()
@ -85,7 +85,7 @@ class TestDataFrameAggs(TestData):
].agg(["median", "var"])
ed_aggs = ed_ecommerce[
["taxful_total_price", "taxless_total_price", "total_quantity"]
].agg(["median", "var"])
].agg(["median", "var"], numeric_only=True)
print(pd_aggs, pd_aggs.dtypes)
print(ed_aggs, ed_aggs.dtypes)
@ -102,7 +102,9 @@ class TestDataFrameAggs(TestData):
ed_flights = self.ed_flights()
pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg(agg)
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(agg)
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(
agg, numeric_only=True
)
assert_series_equal(pd_sum_min_std, ed_sum_min_std)
@ -112,7 +114,9 @@ class TestDataFrameAggs(TestData):
ed_flights = self.ed_flights()
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["mean"])
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["mean"])
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(
["mean"], numeric_only=True
)
assert_frame_equal(pd_sum_min, ed_sum_min)

View File

@ -16,18 +16,23 @@
# under the License.
# File called _pytest for PyCharm compatibility
import pytest
import numpy as np
import pandas as pd
from pandas.testing import assert_series_equal
from eland.tests.common import TestData
class TestDataFrameMetrics(TestData):
funcs = ["max", "min", "mean", "sum"]
extended_funcs = ["median", "mad", "var", "std"]
filter_data = [
"AvgTicketPrice",
"Cancelled",
"dayOfWeek",
"timestamp",
"DestCountry",
]
@pytest.mark.parametrize("numeric_only", [False, None])
def test_flights_metrics(self, numeric_only):
@ -49,7 +54,7 @@ class TestDataFrameMetrics(TestData):
pd_metric = getattr(pd_flights, func)(numeric_only=numeric_only)
ed_metric = getattr(ed_flights, func)(numeric_only=numeric_only)
assert_series_equal(pd_metric, ed_metric)
assert_series_equal(pd_metric, ed_metric, check_dtype=False)
def test_flights_extended_metrics(self):
pd_flights = self.pd_flights()
@ -86,11 +91,9 @@ class TestDataFrameMetrics(TestData):
for func in self.extended_funcs:
pd_metric = getattr(pd_flights_1, func)()
ed_metric = getattr(ed_flights_1, func)()
ed_metric = getattr(ed_flights_1, func)(numeric_only=False)
assert_series_equal(
pd_metric, ed_metric, check_exact=False, check_less_precise=True
)
assert_series_equal(pd_metric, ed_metric, check_exact=False)
# Test on zero rows to test NaN behaviour of sample std/variance
pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
@ -98,11 +101,9 @@ class TestDataFrameMetrics(TestData):
for func in self.extended_funcs:
pd_metric = getattr(pd_flights_0, func)()
ed_metric = getattr(ed_flights_0, func)()
ed_metric = getattr(ed_flights_0, func)(numeric_only=False)
assert_series_equal(
pd_metric, ed_metric, check_exact=False, check_less_precise=True
)
assert_series_equal(pd_metric, ed_metric, check_exact=False)
def test_ecommerce_selected_non_numeric_source_fields(self):
# None of these are numeric
@ -121,7 +122,7 @@ class TestDataFrameMetrics(TestData):
assert_series_equal(
getattr(pd_ecommerce, func)(numeric_only=True),
getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True,
check_exact=False,
)
def test_ecommerce_selected_mixed_numeric_source_fields(self):
@ -143,7 +144,7 @@ class TestDataFrameMetrics(TestData):
assert_series_equal(
getattr(pd_ecommerce, func)(numeric_only=True),
getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True,
check_exact=False,
)
def test_ecommerce_selected_all_numeric_source_fields(self):
@ -157,27 +158,27 @@ class TestDataFrameMetrics(TestData):
assert_series_equal(
getattr(pd_ecommerce, func)(numeric_only=True),
getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True,
check_exact=False,
)
def test_flights_datetime_metrics_agg(self):
ed_timestamps = self.ed_flights()[["timestamp"]]
expected_values = {
"timestamp": {
"max": pd.Timestamp("2018-02-11 23:50:12"),
"min": pd.Timestamp("2018-01-01 00:00:00"),
"mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
"max": pd.Timestamp("2018-02-11 23:50:12"),
"nunique": 12236,
"mad": pd.NaT,
"std": pd.NaT,
"sum": pd.NaT,
"mad": pd.NaT,
"var": pd.NaT,
}
"std": pd.NaT,
"nunique": 12236,
}
ed_metrics = ed_timestamps.agg(self.funcs + self.extended_funcs + ["nunique"])
ed_metrics_dict = ed_metrics.to_dict()
ed_metrics_dict["timestamp"].pop("median") # Median is tested below.
ed_metrics = ed_timestamps.agg(
self.funcs + self.extended_funcs + ["nunique"], numeric_only=False
)
ed_metrics_dict = ed_metrics["timestamp"].to_dict()
ed_metrics_dict.pop("median") # Median is tested below.
assert ed_metrics_dict == expected_values
@pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"])
@ -192,8 +193,10 @@ class TestDataFrameMetrics(TestData):
ed_metric = ed_timestamps.agg([agg])
if agg == "nunique":
# df with timestamp column should return int64
assert ed_metric.dtypes["timestamp"] == np.int64
else:
# df with timestamp column should return datetime64[ns]
assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]")
assert ed_metric["timestamp"][0] == expected_values[agg]
@ -230,7 +233,7 @@ class TestDataFrameMetrics(TestData):
)
def test_metric_agg_keep_dtypes(self):
# max, min, and median maintain their dtypes
# max, min and median maintain their dtypes
df = self.ed_flights_small()[["AvgTicketPrice", "Cancelled", "dayOfWeek"]]
assert df.min().tolist() == [131.81910705566406, False, 0]
assert df.max().tolist() == [989.9527587890625, True, 0]
@ -250,3 +253,162 @@ class TestDataFrameMetrics(TestData):
"Cancelled": {"max": True, "median": False, "min": False},
"dayOfWeek": {"max": 0, "median": 0, "min": 0},
}
# sum should always be the same dtype as the input, except for bool where the sum of bools should be an int64.
sum_agg = df.agg(["sum"])
assert sum_agg.dtypes.to_list() == [
np.dtype("float64"),
np.dtype("int64"),
np.dtype("int64"),
]
assert sum_agg.to_dict() == {
"AvgTicketPrice": {"sum": 26521.624084472656},
"Cancelled": {"sum": 6},
"dayOfWeek": {"sum": 0},
}
def test_flights_numeric_only(self):
# All Aggregations Data Check
ed_flights = self.ed_flights().filter(self.filter_data)
pd_flights = self.pd_flights().filter(self.filter_data)
# agg => numeric_only True returns float64 values
# We compare it with individual single agg functions of pandas with numeric_only=True
filtered_aggs = self.funcs + self.extended_funcs
agg_data = ed_flights.agg(filtered_aggs, numeric_only=True).transpose()
for agg in filtered_aggs:
# Explicitly check for mad because it returns nan for bools
if agg == "mad":
assert np.isnan(agg_data[agg]["Cancelled"])
else:
assert_series_equal(
agg_data[agg].rename(None),
getattr(pd_flights, agg)(numeric_only=True),
check_exact=False,
rtol=True,
)
# all single aggs return float64 for numeric_only=True
def test_numeric_only_true_single_aggs(self):
ed_flights = self.ed_flights().filter(self.filter_data)
for agg in self.funcs + self.extended_funcs:
result = getattr(ed_flights, agg)(numeric_only=True)
assert result.dtype == np.dtype("float64")
assert result.shape == ((3,) if agg != "mad" else (2,))
# check dtypes and shape of min, max and median for numeric_only=False | None
@pytest.mark.parametrize("agg", ["min", "max", "median"])
@pytest.mark.parametrize("numeric_only", [False, None])
def test_min_max_median_numeric_only(self, agg, numeric_only):
ed_flights = self.ed_flights().filter(self.filter_data)
if numeric_only is False:
calculated_values = getattr(ed_flights, agg)(numeric_only=numeric_only)
assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
assert isinstance(calculated_values["Cancelled"], np.bool_)
assert isinstance(calculated_values["dayOfWeek"], np.int64)
assert isinstance(calculated_values["timestamp"], pd.Timestamp)
assert np.isnan(calculated_values["DestCountry"])
assert calculated_values.shape == (5,)
elif numeric_only is None:
calculated_values = getattr(ed_flights, agg)(numeric_only=numeric_only)
assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
assert isinstance(calculated_values["Cancelled"], np.bool_)
assert isinstance(calculated_values["dayOfWeek"], np.int64)
assert isinstance(calculated_values["timestamp"], pd.Timestamp)
assert calculated_values.shape == (4,)
# check dtypes and shape for sum
@pytest.mark.parametrize("numeric_only", [False, None])
def test_sum_numeric_only(self, numeric_only):
ed_flights = self.ed_flights().filter(self.filter_data)
if numeric_only is False:
calculated_values = ed_flights.sum(numeric_only=numeric_only)
assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
assert isinstance(calculated_values["dayOfWeek"], np.int64)
assert isinstance(calculated_values["Cancelled"], np.int64)
assert pd.isnull(calculated_values["timestamp"])
assert np.isnan(calculated_values["DestCountry"])
assert calculated_values.shape == (5,)
elif numeric_only is None:
calculated_values = ed_flights.sum(numeric_only=numeric_only)
dtype_list = [calculated_values[i].dtype for i in calculated_values.index]
assert dtype_list == [
np.dtype("float64"),
np.dtype("int64"),
np.dtype("int64"),
]
assert calculated_values.shape == (3,)
# check dtypes and shape for std
@pytest.mark.parametrize("numeric_only", [False, None])
def test_std_numeric_only(self, numeric_only):
ed_flights = self.ed_flights().filter(self.filter_data)
if numeric_only is False:
calculated_values = ed_flights.std(numeric_only=numeric_only)
assert isinstance(calculated_values["AvgTicketPrice"], float)
assert isinstance(calculated_values["Cancelled"], float)
assert isinstance(calculated_values["dayOfWeek"], float)
assert pd.isnull(calculated_values["timestamp"])
assert np.isnan(calculated_values["DestCountry"])
assert calculated_values.shape == (5,)
elif numeric_only is None:
calculated_values = ed_flights.std(numeric_only=numeric_only)
assert isinstance(calculated_values["AvgTicketPrice"], float)
assert isinstance(calculated_values["Cancelled"], float)
assert isinstance(calculated_values["dayOfWeek"], float)
assert calculated_values.shape == (3,)
# check dtypes and shape for var
@pytest.mark.parametrize("numeric_only", [False, None])
def test_var_numeric_only(self, numeric_only):
ed_flights = self.ed_flights().filter(self.filter_data)
if numeric_only is False:
calculated_values = ed_flights.var(numeric_only=numeric_only)
assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
assert isinstance(calculated_values["dayOfWeek"], np.float64)
assert isinstance(calculated_values["Cancelled"], np.float64)
assert pd.isnull(calculated_values["timestamp"])
assert np.isnan(calculated_values["DestCountry"])
assert calculated_values.shape == (5,)
elif numeric_only is None:
calculated_values = ed_flights.var(numeric_only=numeric_only)
assert isinstance(calculated_values["AvgTicketPrice"], float)
assert isinstance(calculated_values["Cancelled"], float)
assert isinstance(calculated_values["dayOfWeek"], float)
assert calculated_values.shape == (3,)
# check dtypes and shape for mean
@pytest.mark.parametrize("numeric_only", [False, None])
def test_mean_numeric_only(self, numeric_only):
ed_flights = self.ed_flights().filter(self.filter_data)
if numeric_only is False:
calculated_values = ed_flights.mean(numeric_only=numeric_only)
assert isinstance(calculated_values["AvgTicketPrice"], float)
assert isinstance(calculated_values["dayOfWeek"], float)
assert isinstance(calculated_values["Cancelled"], float)
assert isinstance(calculated_values["timestamp"], pd.Timestamp)
assert np.isnan(calculated_values["DestCountry"])
assert calculated_values.shape == (5,)
elif numeric_only is None:
calculated_values = ed_flights.mean(numeric_only=numeric_only)
assert isinstance(calculated_values["AvgTicketPrice"], float)
assert isinstance(calculated_values["Cancelled"], float)
assert isinstance(calculated_values["dayOfWeek"], float)
assert isinstance(calculated_values["timestamp"], pd.Timestamp)
assert calculated_values.shape == (4,)
# check dtypes and shape for mad
@pytest.mark.parametrize("numeric_only", [False, None])
def test_mad_numeric_only(self, numeric_only):
ed_flights = self.ed_flights().filter(self.filter_data)
if numeric_only is False:
calculated_values = ed_flights.mad(numeric_only=numeric_only)
assert isinstance(calculated_values["AvgTicketPrice"], float)
assert isinstance(calculated_values["Cancelled"], np.float64)
assert isinstance(calculated_values["dayOfWeek"], float)
assert pd.isnull(calculated_values["timestamp"])
assert np.isnan(calculated_values["DestCountry"])
assert calculated_values.shape == (5,)
elif numeric_only is None:
calculated_values = ed_flights.mad(numeric_only=numeric_only)
assert isinstance(calculated_values["AvgTicketPrice"], float)
assert isinstance(calculated_values["dayOfWeek"], float)
assert calculated_values.shape == (2,)

View File

@ -72,7 +72,7 @@ class TestSeriesMetrics(TestData):
if func == "nunique": # nunique never returns 'NaN'
continue
ed_metric = getattr(ed_ecommerce, func)()
ed_metric = getattr(ed_ecommerce, func)(numeric_only=False)
print(func, ed_metric)
assert np.isnan(ed_metric)
@ -86,7 +86,9 @@ class TestSeriesMetrics(TestData):
for func in self.all_funcs:
pd_metric = getattr(pd_ecommerce, func)()
ed_metric = getattr(ed_ecommerce, func)()
ed_metric = getattr(ed_ecommerce, func)(
**({"numeric_only": True} if (func != "nunique") else {})
)
self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)
@pytest.mark.parametrize("agg", ["mean", "min", "max"])