mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Switch agg defaults to numeric_only=None
This commit is contained in:
parent
c86371733d
commit
4d96ad39fd
@ -19,7 +19,7 @@ import sys
|
||||
import warnings
|
||||
from io import StringIO
|
||||
import re
|
||||
from typing import Optional, Sequence, Union, Tuple
|
||||
from typing import Optional, Sequence, Union, Tuple, List
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
@ -1328,7 +1328,14 @@ class DataFrame(NDFrame):
|
||||
"""
|
||||
return self.columns
|
||||
|
||||
def aggregate(self, func, axis=0, *args, **kwargs):
|
||||
def aggregate(
|
||||
self,
|
||||
func: Union[str, List[str]],
|
||||
axis: int = 0,
|
||||
numeric_only: Optional[bool] = None,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> Union[pd.Series, pd.DataFrame]:
|
||||
"""
|
||||
Aggregate using one or more operations over the specified axis.
|
||||
|
||||
@ -1347,8 +1354,13 @@ class DataFrame(NDFrame):
|
||||
|
||||
Currently, we only support ``['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile',
|
||||
'rank', 'sem', 'skew', 'sum', 'std', 'var']``
|
||||
axis
|
||||
axis: int
|
||||
Currently, we only support axis=0 (index)
|
||||
numeric_only: {True, False, None} Default is None
|
||||
Which datatype to be returned
|
||||
- True: returns all values with float64, NaN/NaT are ignored.
|
||||
- False: returns all values with float64.
|
||||
- None: returns all values with default datatype.
|
||||
*args
|
||||
Positional arguments to pass to `func`
|
||||
**kwargs
|
||||
@ -1368,12 +1380,30 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']).astype(int)
|
||||
DistanceKilometers AvgTicketPrice
|
||||
sum 92616288 8204364
|
||||
min 0 100
|
||||
std 4578 266
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'DistanceKilometers', 'timestamp', 'DestCountry'])
|
||||
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=True).astype(int)
|
||||
AvgTicketPrice DistanceKilometers
|
||||
sum 8204364 92616288
|
||||
min 100 0
|
||||
std 266 4578
|
||||
|
||||
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=True)
|
||||
AvgTicketPrice DistanceKilometers
|
||||
sum 8.204365e+06 9.261629e+07
|
||||
min 1.000205e+02 0.000000e+00
|
||||
std 2.664071e+02 4.578614e+03
|
||||
|
||||
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=False)
|
||||
AvgTicketPrice DistanceKilometers timestamp DestCountry
|
||||
sum 8.204365e+06 9.261629e+07 NaT NaN
|
||||
min 1.000205e+02 0.000000e+00 2018-01-01 NaN
|
||||
std 2.664071e+02 4.578614e+03 NaT NaN
|
||||
|
||||
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=None)
|
||||
AvgTicketPrice DistanceKilometers timestamp DestCountry
|
||||
sum 8.204365e+06 9.261629e+07 NaT NaN
|
||||
min 1.000205e+02 0.000000e+00 2018-01-01 NaN
|
||||
std 2.664071e+02 4.578614e+03 NaT NaN
|
||||
"""
|
||||
axis = pd.DataFrame._get_axis_number(axis)
|
||||
|
||||
@ -1387,10 +1417,14 @@ class DataFrame(NDFrame):
|
||||
# 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique']
|
||||
if isinstance(func, str):
|
||||
# Wrap in list
|
||||
return self._query_compiler.aggs([func]).squeeze().rename(None)
|
||||
return (
|
||||
self._query_compiler.aggs([func], numeric_only=numeric_only)
|
||||
.squeeze()
|
||||
.rename(None)
|
||||
)
|
||||
elif is_list_like(func):
|
||||
# we have a list!
|
||||
return self._query_compiler.aggs(func)
|
||||
return self._query_compiler.aggs(func, numeric_only=numeric_only)
|
||||
|
||||
agg = aggregate
|
||||
|
||||
|
@ -100,6 +100,9 @@ class Field(NamedTuple):
|
||||
|
||||
# Cardinality works for all types
|
||||
# Numerics and bools work for all aggs
|
||||
# Except "median_absolute_deviation" which doesn't support bool
|
||||
if es_agg == "median_absolute_deviation" and self.is_bool:
|
||||
return False
|
||||
if es_agg == "cardinality" or self.is_numeric or self.is_bool:
|
||||
return True
|
||||
# Timestamps also work for 'min', 'max' and 'avg'
|
||||
|
301
eland/ndframe.py
301
eland/ndframe.py
@ -17,7 +17,7 @@
|
||||
|
||||
import sys
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import TYPE_CHECKING, Tuple
|
||||
from typing import TYPE_CHECKING, Tuple, Optional
|
||||
import pandas as pd
|
||||
from eland.query_compiler import QueryCompiler
|
||||
|
||||
@ -162,12 +162,19 @@ class NDFrame(ABC):
|
||||
def _es_info(self, buf):
|
||||
self._query_compiler.es_info(buf)
|
||||
|
||||
def mean(self, numeric_only: bool = True) -> pd.Series:
|
||||
def mean(self, numeric_only: Optional[bool] = None) -> pd.Series:
|
||||
"""
|
||||
Return mean value for each numeric column
|
||||
|
||||
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
|
||||
|
||||
Parameters
|
||||
----------
|
||||
numeric_only: {True, False, None} Default is None
|
||||
Which datatype to be returned
|
||||
- True: Returns all values as float64, NaN/NaT values are removed
|
||||
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
@ -179,27 +186,44 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.mean()
|
||||
AvgTicketPrice 628.253689
|
||||
Cancelled 0.128494
|
||||
DistanceKilometers 7092.142457
|
||||
DistanceMiles 4406.853010
|
||||
FlightDelay 0.251168
|
||||
FlightDelayMin 47.335171
|
||||
FlightTimeHour 8.518797
|
||||
FlightTimeMin 511.127842
|
||||
dayOfWeek 2.835975
|
||||
AvgTicketPrice 628.254
|
||||
Cancelled 0.128494
|
||||
dayOfWeek 2.83598
|
||||
timestamp 2018-01-21 19:20:45.564438232
|
||||
dtype: object
|
||||
|
||||
>>> df.mean(numeric_only=True)
|
||||
AvgTicketPrice 628.253689
|
||||
Cancelled 0.128494
|
||||
dayOfWeek 2.835975
|
||||
dtype: float64
|
||||
|
||||
>>> df.mean(numeric_only=False)
|
||||
AvgTicketPrice 628.254
|
||||
Cancelled 0.128494
|
||||
dayOfWeek 2.83598
|
||||
timestamp 2018-01-21 19:20:45.564438232
|
||||
DestCountry NaN
|
||||
dtype: object
|
||||
"""
|
||||
return self._query_compiler.mean(numeric_only=numeric_only)
|
||||
|
||||
def sum(self, numeric_only: bool = True) -> pd.Series:
|
||||
def sum(self, numeric_only: Optional[bool] = None) -> pd.Series:
|
||||
"""
|
||||
Return sum for each numeric column
|
||||
|
||||
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
|
||||
|
||||
Parameters
|
||||
----------
|
||||
numeric_only: {True, False, None} Default is None
|
||||
Which datatype to be returned
|
||||
- True: Returns all values as float64, NaN/NaT values are removed
|
||||
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
@ -211,27 +235,43 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.sum()
|
||||
AvgTicketPrice 8.204365e+06
|
||||
Cancelled 1.678000e+03
|
||||
DistanceKilometers 9.261629e+07
|
||||
DistanceMiles 5.754909e+07
|
||||
FlightDelay 3.280000e+03
|
||||
FlightDelayMin 6.181500e+05
|
||||
FlightTimeHour 1.112470e+05
|
||||
FlightTimeMin 6.674818e+06
|
||||
dayOfWeek 3.703500e+04
|
||||
AvgTicketPrice 8.20436e+06
|
||||
Cancelled 1678
|
||||
dayOfWeek 37035
|
||||
dtype: object
|
||||
|
||||
>>> df.sum(numeric_only=True)
|
||||
AvgTicketPrice 8.204365e+06
|
||||
Cancelled 1.678000e+03
|
||||
dayOfWeek 3.703500e+04
|
||||
dtype: float64
|
||||
|
||||
>>> df.sum(numeric_only=False)
|
||||
AvgTicketPrice 8.20436e+06
|
||||
Cancelled 1678
|
||||
dayOfWeek 37035
|
||||
timestamp NaT
|
||||
DestCountry NaN
|
||||
dtype: object
|
||||
"""
|
||||
return self._query_compiler.sum(numeric_only=numeric_only)
|
||||
|
||||
def min(self, numeric_only: bool = True) -> pd.Series:
|
||||
def min(self, numeric_only: Optional[bool] = None) -> pd.Series:
|
||||
"""
|
||||
Return the minimum value for each numeric column
|
||||
|
||||
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
|
||||
|
||||
Parameters
|
||||
----------
|
||||
numeric_only: {True, False, None} Default is None
|
||||
Which datatype to be returned
|
||||
- True: Returns all values as float64, NaN/NaT values are removed
|
||||
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
@ -243,25 +283,42 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.min()
|
||||
AvgTicketPrice 100.021
|
||||
Cancelled False
|
||||
DistanceKilometers 0
|
||||
DistanceMiles 0
|
||||
FlightDelay False
|
||||
FlightDelayMin 0
|
||||
FlightTimeHour 0
|
||||
FlightTimeMin 0
|
||||
dayOfWeek 0
|
||||
AvgTicketPrice 100.021
|
||||
Cancelled False
|
||||
dayOfWeek 0
|
||||
timestamp 2018-01-01 00:00:00
|
||||
dtype: object
|
||||
|
||||
>>> df.min(numeric_only=True)
|
||||
AvgTicketPrice 100.020531
|
||||
Cancelled 0.000000
|
||||
dayOfWeek 0.000000
|
||||
dtype: float64
|
||||
|
||||
>>> df.min(numeric_only=False)
|
||||
AvgTicketPrice 100.021
|
||||
Cancelled False
|
||||
dayOfWeek 0
|
||||
timestamp 2018-01-01 00:00:00
|
||||
DestCountry NaN
|
||||
dtype: object
|
||||
"""
|
||||
return self._query_compiler.min(numeric_only=numeric_only)
|
||||
|
||||
def var(self, numeric_only: bool = True) -> pd.Series:
|
||||
def var(self, numeric_only: Optional[bool] = None) -> pd.Series:
|
||||
"""
|
||||
Return variance for each numeric column
|
||||
|
||||
Parameters
|
||||
----------
|
||||
numeric_only: {True, False, None} Default is None
|
||||
Which datatype to be returned
|
||||
- True: Returns all values as float64, NaN/NaT values are removed
|
||||
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
@ -273,25 +330,41 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df.var() # doctest: +SKIP
|
||||
AvgTicketPrice 7.096185e+04
|
||||
Cancelled 1.119831e-01
|
||||
DistanceKilometers 2.096049e+07
|
||||
DistanceMiles 8.092892e+06
|
||||
FlightDelay 1.880825e-01
|
||||
FlightDelayMin 9.359209e+03
|
||||
FlightTimeHour 3.112545e+01
|
||||
FlightTimeMin 1.120516e+05
|
||||
dayOfWeek 3.761135e+00
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.var()
|
||||
AvgTicketPrice 70964.570234
|
||||
Cancelled 0.111987
|
||||
dayOfWeek 3.761279
|
||||
dtype: float64
|
||||
|
||||
>>> df.var(numeric_only=True)
|
||||
AvgTicketPrice 70964.570234
|
||||
Cancelled 0.111987
|
||||
dayOfWeek 3.761279
|
||||
dtype: float64
|
||||
|
||||
>>> df.var(numeric_only=False)
|
||||
AvgTicketPrice 70964.6
|
||||
Cancelled 0.111987
|
||||
dayOfWeek 3.76128
|
||||
timestamp NaT
|
||||
DestCountry NaN
|
||||
dtype: object
|
||||
"""
|
||||
return self._query_compiler.var(numeric_only=numeric_only)
|
||||
|
||||
def std(self, numeric_only: bool = True) -> pd.Series:
|
||||
def std(self, numeric_only: Optional[bool] = None) -> pd.Series:
|
||||
"""
|
||||
Return standard deviation for each numeric column
|
||||
|
||||
Parameters
|
||||
----------
|
||||
numeric_only: {True, False, None} Default is None
|
||||
Which datatype to be returned
|
||||
- True: Returns all values as float64, NaN/NaT values are removed
|
||||
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
@ -303,25 +376,41 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df.std() # doctest: +SKIP
|
||||
AvgTicketPrice 266.386661
|
||||
Cancelled 0.334639
|
||||
DistanceKilometers 4578.263193
|
||||
DistanceMiles 2844.800855
|
||||
FlightDelay 0.433685
|
||||
FlightDelayMin 96.743006
|
||||
FlightTimeHour 5.579019
|
||||
FlightTimeMin 334.741135
|
||||
dayOfWeek 1.939365
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.std()
|
||||
AvgTicketPrice 266.407061
|
||||
Cancelled 0.334664
|
||||
dayOfWeek 1.939513
|
||||
dtype: float64
|
||||
|
||||
>>> df.std(numeric_only=True)
|
||||
AvgTicketPrice 266.407061
|
||||
Cancelled 0.334664
|
||||
dayOfWeek 1.939513
|
||||
dtype: float64
|
||||
|
||||
>>> df.std(numeric_only=False)
|
||||
AvgTicketPrice 266.407
|
||||
Cancelled 0.334664
|
||||
dayOfWeek 1.93951
|
||||
timestamp NaT
|
||||
DestCountry NaN
|
||||
dtype: object
|
||||
"""
|
||||
return self._query_compiler.std(numeric_only=numeric_only)
|
||||
|
||||
def median(self, numeric_only: bool = True) -> pd.Series:
|
||||
def median(self, numeric_only: Optional[bool] = None) -> pd.Series:
|
||||
"""
|
||||
Return the median value for each numeric column
|
||||
|
||||
Parameters
|
||||
----------
|
||||
numeric_only: {True, False, None} Default is None
|
||||
Which datatype to be returned
|
||||
- True: Returns all values as float64, NaN/NaT values are removed
|
||||
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
@ -333,27 +422,44 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.median() # doctest: +SKIP
|
||||
AvgTicketPrice 640.387285
|
||||
Cancelled 0.000000
|
||||
DistanceKilometers 7612.072403
|
||||
DistanceMiles 4729.922470
|
||||
FlightDelay 0.000000
|
||||
FlightDelayMin 0.000000
|
||||
FlightTimeHour 8.383113
|
||||
FlightTimeMin 503.148975
|
||||
dayOfWeek 3.000000
|
||||
AvgTicketPrice 640.363
|
||||
Cancelled False
|
||||
dayOfWeek 3
|
||||
timestamp 2018-01-21 23:54:06.624776611
|
||||
dtype: object
|
||||
|
||||
>>> df.median(numeric_only=True) # doctest: +SKIP
|
||||
AvgTicketPrice 640.362667
|
||||
Cancelled 0.000000
|
||||
dayOfWeek 3.000000
|
||||
dtype: float64
|
||||
|
||||
>>> df.median(numeric_only=False) # doctest: +SKIP
|
||||
AvgTicketPrice 640.387
|
||||
Cancelled False
|
||||
dayOfWeek 3
|
||||
timestamp 2018-01-21 23:54:06.624776611
|
||||
DestCountry NaN
|
||||
dtype: object
|
||||
"""
|
||||
return self._query_compiler.median(numeric_only=numeric_only)
|
||||
|
||||
def max(self, numeric_only: bool = True) -> pd.Series:
|
||||
def max(self, numeric_only: Optional[bool] = None) -> pd.Series:
|
||||
"""
|
||||
Return the maximum value for each numeric column
|
||||
|
||||
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
|
||||
|
||||
Parameters
|
||||
----------
|
||||
numeric_only: {True, False, None} Default is None
|
||||
Which datatype to be returned
|
||||
- True: Returns all values as float64, NaN/NaT values are removed
|
||||
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.Series
|
||||
@ -365,17 +471,26 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.max()
|
||||
AvgTicketPrice 1199.73
|
||||
Cancelled True
|
||||
DistanceKilometers 19881.5
|
||||
DistanceMiles 12353.8
|
||||
FlightDelay True
|
||||
FlightDelayMin 360
|
||||
FlightTimeHour 31.715
|
||||
FlightTimeMin 1902.9
|
||||
dayOfWeek 6
|
||||
AvgTicketPrice 1199.73
|
||||
Cancelled True
|
||||
dayOfWeek 6
|
||||
timestamp 2018-02-11 23:50:12
|
||||
dtype: object
|
||||
|
||||
>>> df.max(numeric_only=True)
|
||||
AvgTicketPrice 1199.729004
|
||||
Cancelled 1.000000
|
||||
dayOfWeek 6.000000
|
||||
dtype: float64
|
||||
|
||||
>>> df.max(numeric_only=False)
|
||||
AvgTicketPrice 1199.73
|
||||
Cancelled True
|
||||
dayOfWeek 6
|
||||
timestamp 2018-02-11 23:50:12
|
||||
DestCountry NaN
|
||||
dtype: object
|
||||
"""
|
||||
return self._query_compiler.max(numeric_only=numeric_only)
|
||||
@ -441,18 +556,24 @@ class NDFrame(ABC):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.mad() # doctest: +SKIP
|
||||
AvgTicketPrice 213.368709
|
||||
Cancelled 0.000000
|
||||
DistanceKilometers 2946.168236
|
||||
DistanceMiles 1830.987236
|
||||
FlightDelay 0.000000
|
||||
FlightDelayMin 0.000000
|
||||
FlightTimeHour 3.819435
|
||||
FlightTimeMin 229.142297
|
||||
dayOfWeek 2.000000
|
||||
AvgTicketPrice 213.35497
|
||||
dayOfWeek 2.00000
|
||||
dtype: float64
|
||||
|
||||
>>> df.mad(numeric_only=True) # doctest: +SKIP
|
||||
AvgTicketPrice 213.473011
|
||||
dayOfWeek 2.000000
|
||||
dtype: float64
|
||||
|
||||
>>> df.mad(numeric_only=False) # doctest: +SKIP
|
||||
AvgTicketPrice 213.484
|
||||
Cancelled NaN
|
||||
dayOfWeek 2
|
||||
timestamp NaT
|
||||
DestCountry NaN
|
||||
dtype: object
|
||||
"""
|
||||
return self._query_compiler.mad(numeric_only=numeric_only)
|
||||
|
||||
|
@ -145,43 +145,28 @@ class Operations:
|
||||
|
||||
return build_pd_series(data=counts, index=fields)
|
||||
|
||||
def mean(self, query_compiler, numeric_only=True):
|
||||
results = self._metric_aggs(query_compiler, ["mean"], numeric_only=numeric_only)
|
||||
return build_pd_series(results, index=results.keys())
|
||||
|
||||
def var(self, query_compiler, numeric_only=True):
|
||||
results = self._metric_aggs(query_compiler, ["var"], numeric_only=numeric_only)
|
||||
return build_pd_series(results, index=results.keys())
|
||||
|
||||
def std(self, query_compiler, numeric_only=True):
|
||||
results = self._metric_aggs(query_compiler, ["std"], numeric_only=numeric_only)
|
||||
return build_pd_series(results, index=results.keys())
|
||||
|
||||
def median(self, query_compiler, numeric_only=True):
|
||||
results = self._metric_aggs(
|
||||
query_compiler, ["median"], numeric_only=numeric_only
|
||||
)
|
||||
return build_pd_series(results, index=results.keys())
|
||||
|
||||
def sum(self, query_compiler, numeric_only=True):
|
||||
results = self._metric_aggs(query_compiler, ["sum"], numeric_only=numeric_only)
|
||||
return build_pd_series(results, index=results.keys())
|
||||
|
||||
def max(self, query_compiler, numeric_only=True):
|
||||
results = self._metric_aggs(query_compiler, ["max"], numeric_only=numeric_only)
|
||||
return build_pd_series(results, index=results.keys())
|
||||
|
||||
def min(self, query_compiler, numeric_only=True):
|
||||
results = self._metric_aggs(query_compiler, ["min"], numeric_only=numeric_only)
|
||||
return build_pd_series(results, index=results.keys())
|
||||
|
||||
def nunique(self, query_compiler):
|
||||
results = self._metric_aggs(query_compiler, ["nunique"], numeric_only=False)
|
||||
return build_pd_series(results, index=results.keys())
|
||||
|
||||
def mad(self, query_compiler, numeric_only=True):
|
||||
results = self._metric_aggs(query_compiler, ["mad"], numeric_only=numeric_only)
|
||||
return build_pd_series(results, index=results.keys())
|
||||
def _metric_agg_series(
|
||||
self,
|
||||
query_compiler: "QueryCompiler",
|
||||
agg: List,
|
||||
numeric_only: Optional[bool] = None,
|
||||
) -> pd.Series:
|
||||
results = self._metric_aggs(query_compiler, agg, numeric_only=numeric_only)
|
||||
if numeric_only:
|
||||
return build_pd_series(results, index=results.keys(), dtype=np.float64)
|
||||
else:
|
||||
# If all results are float convert into float64
|
||||
if all(isinstance(i, float) for i in results.values()):
|
||||
dtype = np.float64
|
||||
# If all results are int convert into int64
|
||||
elif all(isinstance(i, int) for i in results.values()):
|
||||
dtype = np.int64
|
||||
# If single result is present consider that datatype instead of object
|
||||
elif len(results) <= 1:
|
||||
dtype = None
|
||||
else:
|
||||
dtype = "object"
|
||||
return build_pd_series(results, index=results.keys(), dtype=dtype)
|
||||
|
||||
def value_counts(self, query_compiler, es_size):
|
||||
return self._terms_aggs(query_compiler, "terms", es_size)
|
||||
@ -189,7 +174,21 @@ class Operations:
|
||||
def hist(self, query_compiler, bins):
|
||||
return self._hist_aggs(query_compiler, bins)
|
||||
|
||||
def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=True):
|
||||
def aggs(self, query_compiler, pd_aggs, numeric_only=None) -> pd.DataFrame:
|
||||
results = self._metric_aggs(
|
||||
query_compiler, pd_aggs, numeric_only=numeric_only, is_dataframe_agg=True
|
||||
)
|
||||
return pd.DataFrame(
|
||||
results, index=pd_aggs, dtype=(np.float64 if numeric_only else None)
|
||||
)
|
||||
|
||||
def _metric_aggs(
|
||||
self,
|
||||
query_compiler: "QueryCompiler",
|
||||
pd_aggs,
|
||||
numeric_only: Optional[bool] = None,
|
||||
is_dataframe_agg: bool = False,
|
||||
) -> Dict:
|
||||
query_params, post_processing = self._resolve_tasks(query_compiler)
|
||||
|
||||
size = self._size(query_params, post_processing)
|
||||
@ -201,6 +200,7 @@ class Operations:
|
||||
results = {}
|
||||
fields = query_compiler._mappings.all_source_fields()
|
||||
if numeric_only:
|
||||
# Consider if field is Int/Float/Bool
|
||||
fields = [field for field in fields if (field.is_numeric or field.is_bool)]
|
||||
|
||||
body = Query(query_params.query)
|
||||
@ -210,6 +210,7 @@ class Operations:
|
||||
|
||||
for field in fields:
|
||||
for es_agg in es_aggs:
|
||||
# NaN/NaT fields are ignored
|
||||
if not field.is_es_agg_compatible(es_agg):
|
||||
continue
|
||||
|
||||
@ -241,10 +242,18 @@ class Operations:
|
||||
for field in fields:
|
||||
values = []
|
||||
for es_agg, pd_agg in zip(es_aggs, pd_aggs):
|
||||
|
||||
# If the field and agg aren't compatible we add a NaN/NaT
|
||||
# is_dataframe_agg is used to differentiate agg() and an aggregation called through .mean()
|
||||
# If the field and agg aren't compatible we add a NaN/NaT for agg
|
||||
# If the field and agg aren't compatible we don't add NaN/NaT for an aggregation called through .mean()
|
||||
if not field.is_es_agg_compatible(es_agg):
|
||||
values.append(field.nan_value)
|
||||
if is_dataframe_agg and not numeric_only:
|
||||
values.append(field.nan_value)
|
||||
elif not is_dataframe_agg and numeric_only is False:
|
||||
values.append(field.nan_value)
|
||||
# Explicit condition for mad to add NaN because it doesn't support bool
|
||||
elif is_dataframe_agg and numeric_only:
|
||||
if pd_agg == "mad":
|
||||
values.append(field.nan_value)
|
||||
continue
|
||||
|
||||
if isinstance(es_agg, tuple):
|
||||
@ -269,7 +278,7 @@ class Operations:
|
||||
|
||||
# All of the below calculations result in NaN if count<=1
|
||||
if count <= 1:
|
||||
agg_value = np.float64(np.NaN)
|
||||
agg_value = np.NaN
|
||||
|
||||
elif es_agg[1] == "std_deviation":
|
||||
agg_value *= count / (count - 1.0)
|
||||
@ -287,8 +296,11 @@ class Operations:
|
||||
]["value"]
|
||||
|
||||
# Null usually means there were no results.
|
||||
if agg_value is None:
|
||||
agg_value = field.nan_value
|
||||
if agg_value is None or np.isnan(agg_value):
|
||||
if is_dataframe_agg and not numeric_only:
|
||||
agg_value = np.NaN
|
||||
elif not is_dataframe_agg and numeric_only is False:
|
||||
agg_value = np.NaN
|
||||
|
||||
# Cardinality is always either NaN or integer.
|
||||
elif pd_agg == "nunique":
|
||||
@ -299,14 +311,21 @@ class Operations:
|
||||
agg_value = elasticsearch_date_to_pandas_date(
|
||||
agg_value, field.es_date_format
|
||||
)
|
||||
|
||||
# These aggregations maintain the column datatype
|
||||
elif pd_agg in {"max", "min", "median"}:
|
||||
agg_value = field.np_dtype.type(agg_value)
|
||||
# If numeric_only is False | None then maintain column datatype
|
||||
elif not numeric_only:
|
||||
# we're only converting to bool for lossless aggs like min, max, and median.
|
||||
if pd_agg in {"max", "min", "median", "sum"}:
|
||||
# 'sum' isn't representable with bool, use int64
|
||||
if pd_agg == "sum" and field.is_bool:
|
||||
agg_value = np.int64(agg_value)
|
||||
else:
|
||||
agg_value = field.np_dtype.type(agg_value)
|
||||
|
||||
values.append(agg_value)
|
||||
|
||||
results[field.index] = values if len(values) > 1 else values[0]
|
||||
# If numeric_only is True and We only have a NaN type field then we check for empty.
|
||||
if values:
|
||||
results[field.index] = values if len(values) > 1 else values[0]
|
||||
|
||||
return results
|
||||
|
||||
@ -540,10 +559,6 @@ class Operations:
|
||||
|
||||
return es_aggs
|
||||
|
||||
def aggs(self, query_compiler, pd_aggs):
|
||||
results = self._metric_aggs(query_compiler, pd_aggs, numeric_only=False)
|
||||
return pd.DataFrame(results, index=pd_aggs)
|
||||
|
||||
def filter(self, query_compiler, items=None, like=None, regex=None):
|
||||
# This function is only called for axis='index',
|
||||
# DataFrame.filter(..., axis="columns") calls .drop()
|
||||
|
@ -17,7 +17,7 @@
|
||||
|
||||
import copy
|
||||
from datetime import datetime
|
||||
from typing import Optional, TYPE_CHECKING
|
||||
from typing import Optional, TYPE_CHECKING, List
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
@ -490,38 +490,56 @@ class QueryCompiler:
|
||||
result._operations.filter(self, items=items, like=like, regex=regex)
|
||||
return result
|
||||
|
||||
def aggs(self, func):
|
||||
return self._operations.aggs(self, func)
|
||||
def aggs(self, func: List[str], numeric_only: Optional[bool] = None):
|
||||
return self._operations.aggs(self, func, numeric_only=numeric_only)
|
||||
|
||||
def count(self):
|
||||
return self._operations.count(self)
|
||||
|
||||
def mean(self, numeric_only=None):
|
||||
return self._operations.mean(self, numeric_only=numeric_only)
|
||||
def mean(self, numeric_only: Optional[bool] = None):
|
||||
return self._operations._metric_agg_series(
|
||||
self, ["mean"], numeric_only=numeric_only
|
||||
)
|
||||
|
||||
def var(self, numeric_only=None):
|
||||
return self._operations.var(self, numeric_only=numeric_only)
|
||||
def var(self, numeric_only: Optional[bool] = None):
|
||||
return self._operations._metric_agg_series(
|
||||
self, ["var"], numeric_only=numeric_only
|
||||
)
|
||||
|
||||
def std(self, numeric_only=None):
|
||||
return self._operations.std(self, numeric_only=numeric_only)
|
||||
def std(self, numeric_only: Optional[bool] = None):
|
||||
return self._operations._metric_agg_series(
|
||||
self, ["std"], numeric_only=numeric_only
|
||||
)
|
||||
|
||||
def mad(self, numeric_only=None):
|
||||
return self._operations.mad(self, numeric_only=numeric_only)
|
||||
def mad(self, numeric_only: Optional[bool] = None):
|
||||
return self._operations._metric_agg_series(
|
||||
self, ["mad"], numeric_only=numeric_only
|
||||
)
|
||||
|
||||
def median(self, numeric_only=None):
|
||||
return self._operations.median(self, numeric_only=numeric_only)
|
||||
def median(self, numeric_only: Optional[bool] = None):
|
||||
return self._operations._metric_agg_series(
|
||||
self, ["median"], numeric_only=numeric_only
|
||||
)
|
||||
|
||||
def sum(self, numeric_only=None):
|
||||
return self._operations.sum(self, numeric_only=numeric_only)
|
||||
def sum(self, numeric_only: Optional[bool] = None):
|
||||
return self._operations._metric_agg_series(
|
||||
self, ["sum"], numeric_only=numeric_only
|
||||
)
|
||||
|
||||
def min(self, numeric_only=None):
|
||||
return self._operations.min(self, numeric_only=numeric_only)
|
||||
def min(self, numeric_only: Optional[bool] = None):
|
||||
return self._operations._metric_agg_series(
|
||||
self, ["min"], numeric_only=numeric_only
|
||||
)
|
||||
|
||||
def max(self, numeric_only=None):
|
||||
return self._operations.max(self, numeric_only=numeric_only)
|
||||
def max(self, numeric_only: Optional[bool] = None):
|
||||
return self._operations._metric_agg_series(
|
||||
self, ["max"], numeric_only=numeric_only
|
||||
)
|
||||
|
||||
def nunique(self):
|
||||
return self._operations.nunique(self)
|
||||
return self._operations._metric_agg_series(
|
||||
self, ["nunique"], numeric_only=False
|
||||
)
|
||||
|
||||
def value_counts(self, es_size):
|
||||
return self._operations.value_counts(self, es_size)
|
||||
|
@ -29,7 +29,9 @@ class TestDataFrameAggs(TestData):
|
||||
ed_flights = self.ed_flights()
|
||||
|
||||
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
|
||||
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
|
||||
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(
|
||||
["sum", "min"], numeric_only=True
|
||||
)
|
||||
|
||||
# Eland returns all float values for all metric aggs, pandas can return int
|
||||
# TODO - investigate this more
|
||||
@ -40,22 +42,22 @@ class TestDataFrameAggs(TestData):
|
||||
["sum", "min", "std"]
|
||||
)
|
||||
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(
|
||||
["sum", "min", "std"]
|
||||
["sum", "min", "std"], numeric_only=True
|
||||
)
|
||||
|
||||
print(pd_sum_min_std.dtypes)
|
||||
print(ed_sum_min_std.dtypes)
|
||||
|
||||
assert_frame_equal(
|
||||
pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True
|
||||
)
|
||||
assert_frame_equal(pd_sum_min_std, ed_sum_min_std, check_exact=False, rtol=True)
|
||||
|
||||
def test_terms_aggs(self):
|
||||
pd_flights = self.pd_flights()
|
||||
ed_flights = self.ed_flights()
|
||||
|
||||
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
|
||||
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
|
||||
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(
|
||||
["sum", "min"], numeric_only=True
|
||||
)
|
||||
|
||||
# Eland returns all float values for all metric aggs, pandas can return int
|
||||
# TODO - investigate this more
|
||||
@ -66,15 +68,13 @@ class TestDataFrameAggs(TestData):
|
||||
["sum", "min", "std"]
|
||||
)
|
||||
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(
|
||||
["sum", "min", "std"]
|
||||
["sum", "min", "std"], numeric_only=True
|
||||
)
|
||||
|
||||
print(pd_sum_min_std.dtypes)
|
||||
print(ed_sum_min_std.dtypes)
|
||||
|
||||
assert_frame_equal(
|
||||
pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True
|
||||
)
|
||||
assert_frame_equal(pd_sum_min_std, ed_sum_min_std, check_exact=False, rtol=True)
|
||||
|
||||
def test_aggs_median_var(self):
|
||||
pd_ecommerce = self.pd_ecommerce()
|
||||
@ -85,7 +85,7 @@ class TestDataFrameAggs(TestData):
|
||||
].agg(["median", "var"])
|
||||
ed_aggs = ed_ecommerce[
|
||||
["taxful_total_price", "taxless_total_price", "total_quantity"]
|
||||
].agg(["median", "var"])
|
||||
].agg(["median", "var"], numeric_only=True)
|
||||
|
||||
print(pd_aggs, pd_aggs.dtypes)
|
||||
print(ed_aggs, ed_aggs.dtypes)
|
||||
@ -102,7 +102,9 @@ class TestDataFrameAggs(TestData):
|
||||
ed_flights = self.ed_flights()
|
||||
|
||||
pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg(agg)
|
||||
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(agg)
|
||||
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(
|
||||
agg, numeric_only=True
|
||||
)
|
||||
|
||||
assert_series_equal(pd_sum_min_std, ed_sum_min_std)
|
||||
|
||||
@ -112,7 +114,9 @@ class TestDataFrameAggs(TestData):
|
||||
ed_flights = self.ed_flights()
|
||||
|
||||
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["mean"])
|
||||
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["mean"])
|
||||
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(
|
||||
["mean"], numeric_only=True
|
||||
)
|
||||
|
||||
assert_frame_equal(pd_sum_min, ed_sum_min)
|
||||
|
||||
|
@ -16,18 +16,23 @@
|
||||
# under the License.
|
||||
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameMetrics(TestData):
|
||||
funcs = ["max", "min", "mean", "sum"]
|
||||
extended_funcs = ["median", "mad", "var", "std"]
|
||||
filter_data = [
|
||||
"AvgTicketPrice",
|
||||
"Cancelled",
|
||||
"dayOfWeek",
|
||||
"timestamp",
|
||||
"DestCountry",
|
||||
]
|
||||
|
||||
@pytest.mark.parametrize("numeric_only", [False, None])
|
||||
def test_flights_metrics(self, numeric_only):
|
||||
@ -49,7 +54,7 @@ class TestDataFrameMetrics(TestData):
|
||||
pd_metric = getattr(pd_flights, func)(numeric_only=numeric_only)
|
||||
ed_metric = getattr(ed_flights, func)(numeric_only=numeric_only)
|
||||
|
||||
assert_series_equal(pd_metric, ed_metric)
|
||||
assert_series_equal(pd_metric, ed_metric, check_dtype=False)
|
||||
|
||||
def test_flights_extended_metrics(self):
|
||||
pd_flights = self.pd_flights()
|
||||
@ -86,11 +91,9 @@ class TestDataFrameMetrics(TestData):
|
||||
|
||||
for func in self.extended_funcs:
|
||||
pd_metric = getattr(pd_flights_1, func)()
|
||||
ed_metric = getattr(ed_flights_1, func)()
|
||||
ed_metric = getattr(ed_flights_1, func)(numeric_only=False)
|
||||
|
||||
assert_series_equal(
|
||||
pd_metric, ed_metric, check_exact=False, check_less_precise=True
|
||||
)
|
||||
assert_series_equal(pd_metric, ed_metric, check_exact=False)
|
||||
|
||||
# Test on zero rows to test NaN behaviour of sample std/variance
|
||||
pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
|
||||
@ -98,11 +101,9 @@ class TestDataFrameMetrics(TestData):
|
||||
|
||||
for func in self.extended_funcs:
|
||||
pd_metric = getattr(pd_flights_0, func)()
|
||||
ed_metric = getattr(ed_flights_0, func)()
|
||||
ed_metric = getattr(ed_flights_0, func)(numeric_only=False)
|
||||
|
||||
assert_series_equal(
|
||||
pd_metric, ed_metric, check_exact=False, check_less_precise=True
|
||||
)
|
||||
assert_series_equal(pd_metric, ed_metric, check_exact=False)
|
||||
|
||||
def test_ecommerce_selected_non_numeric_source_fields(self):
|
||||
# None of these are numeric
|
||||
@ -121,7 +122,7 @@ class TestDataFrameMetrics(TestData):
|
||||
assert_series_equal(
|
||||
getattr(pd_ecommerce, func)(numeric_only=True),
|
||||
getattr(ed_ecommerce, func)(numeric_only=True),
|
||||
check_less_precise=True,
|
||||
check_exact=False,
|
||||
)
|
||||
|
||||
def test_ecommerce_selected_mixed_numeric_source_fields(self):
|
||||
@ -143,7 +144,7 @@ class TestDataFrameMetrics(TestData):
|
||||
assert_series_equal(
|
||||
getattr(pd_ecommerce, func)(numeric_only=True),
|
||||
getattr(ed_ecommerce, func)(numeric_only=True),
|
||||
check_less_precise=True,
|
||||
check_exact=False,
|
||||
)
|
||||
|
||||
def test_ecommerce_selected_all_numeric_source_fields(self):
|
||||
@ -157,27 +158,27 @@ class TestDataFrameMetrics(TestData):
|
||||
assert_series_equal(
|
||||
getattr(pd_ecommerce, func)(numeric_only=True),
|
||||
getattr(ed_ecommerce, func)(numeric_only=True),
|
||||
check_less_precise=True,
|
||||
check_exact=False,
|
||||
)
|
||||
|
||||
def test_flights_datetime_metrics_agg(self):
|
||||
ed_timestamps = self.ed_flights()[["timestamp"]]
|
||||
expected_values = {
|
||||
"timestamp": {
|
||||
"min": pd.Timestamp("2018-01-01 00:00:00"),
|
||||
"mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
|
||||
"max": pd.Timestamp("2018-02-11 23:50:12"),
|
||||
"nunique": 12236,
|
||||
"mad": pd.NaT,
|
||||
"std": pd.NaT,
|
||||
"sum": pd.NaT,
|
||||
"var": pd.NaT,
|
||||
}
|
||||
"max": pd.Timestamp("2018-02-11 23:50:12"),
|
||||
"min": pd.Timestamp("2018-01-01 00:00:00"),
|
||||
"mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
|
||||
"sum": pd.NaT,
|
||||
"mad": pd.NaT,
|
||||
"var": pd.NaT,
|
||||
"std": pd.NaT,
|
||||
"nunique": 12236,
|
||||
}
|
||||
|
||||
ed_metrics = ed_timestamps.agg(self.funcs + self.extended_funcs + ["nunique"])
|
||||
ed_metrics_dict = ed_metrics.to_dict()
|
||||
ed_metrics_dict["timestamp"].pop("median") # Median is tested below.
|
||||
ed_metrics = ed_timestamps.agg(
|
||||
self.funcs + self.extended_funcs + ["nunique"], numeric_only=False
|
||||
)
|
||||
ed_metrics_dict = ed_metrics["timestamp"].to_dict()
|
||||
ed_metrics_dict.pop("median") # Median is tested below.
|
||||
assert ed_metrics_dict == expected_values
|
||||
|
||||
@pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"])
|
||||
@ -192,8 +193,10 @@ class TestDataFrameMetrics(TestData):
|
||||
ed_metric = ed_timestamps.agg([agg])
|
||||
|
||||
if agg == "nunique":
|
||||
# df with timestamp column should return int64
|
||||
assert ed_metric.dtypes["timestamp"] == np.int64
|
||||
else:
|
||||
# df with timestamp column should return datetime64[ns]
|
||||
assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]")
|
||||
assert ed_metric["timestamp"][0] == expected_values[agg]
|
||||
|
||||
@ -230,7 +233,7 @@ class TestDataFrameMetrics(TestData):
|
||||
)
|
||||
|
||||
def test_metric_agg_keep_dtypes(self):
|
||||
# max, min, and median maintain their dtypes
|
||||
# max, min and median maintain their dtypes
|
||||
df = self.ed_flights_small()[["AvgTicketPrice", "Cancelled", "dayOfWeek"]]
|
||||
assert df.min().tolist() == [131.81910705566406, False, 0]
|
||||
assert df.max().tolist() == [989.9527587890625, True, 0]
|
||||
@ -250,3 +253,162 @@ class TestDataFrameMetrics(TestData):
|
||||
"Cancelled": {"max": True, "median": False, "min": False},
|
||||
"dayOfWeek": {"max": 0, "median": 0, "min": 0},
|
||||
}
|
||||
# sum should always be the same dtype as the input, except for bool where the sum of bools should be an int64.
|
||||
sum_agg = df.agg(["sum"])
|
||||
assert sum_agg.dtypes.to_list() == [
|
||||
np.dtype("float64"),
|
||||
np.dtype("int64"),
|
||||
np.dtype("int64"),
|
||||
]
|
||||
assert sum_agg.to_dict() == {
|
||||
"AvgTicketPrice": {"sum": 26521.624084472656},
|
||||
"Cancelled": {"sum": 6},
|
||||
"dayOfWeek": {"sum": 0},
|
||||
}
|
||||
|
||||
def test_flights_numeric_only(self):
|
||||
# All Aggregations Data Check
|
||||
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||
pd_flights = self.pd_flights().filter(self.filter_data)
|
||||
# agg => numeric_only True returns float64 values
|
||||
# We compare it with individual single agg functions of pandas with numeric_only=True
|
||||
filtered_aggs = self.funcs + self.extended_funcs
|
||||
agg_data = ed_flights.agg(filtered_aggs, numeric_only=True).transpose()
|
||||
for agg in filtered_aggs:
|
||||
# Explicitly check for mad because it returns nan for bools
|
||||
if agg == "mad":
|
||||
assert np.isnan(agg_data[agg]["Cancelled"])
|
||||
else:
|
||||
assert_series_equal(
|
||||
agg_data[agg].rename(None),
|
||||
getattr(pd_flights, agg)(numeric_only=True),
|
||||
check_exact=False,
|
||||
rtol=True,
|
||||
)
|
||||
|
||||
# all single aggs return float64 for numeric_only=True
|
||||
def test_numeric_only_true_single_aggs(self):
|
||||
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||
for agg in self.funcs + self.extended_funcs:
|
||||
result = getattr(ed_flights, agg)(numeric_only=True)
|
||||
assert result.dtype == np.dtype("float64")
|
||||
assert result.shape == ((3,) if agg != "mad" else (2,))
|
||||
|
||||
# check dtypes and shape of min, max and median for numeric_only=False | None
|
||||
@pytest.mark.parametrize("agg", ["min", "max", "median"])
|
||||
@pytest.mark.parametrize("numeric_only", [False, None])
|
||||
def test_min_max_median_numeric_only(self, agg, numeric_only):
|
||||
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||
if numeric_only is False:
|
||||
calculated_values = getattr(ed_flights, agg)(numeric_only=numeric_only)
|
||||
assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
|
||||
assert isinstance(calculated_values["Cancelled"], np.bool_)
|
||||
assert isinstance(calculated_values["dayOfWeek"], np.int64)
|
||||
assert isinstance(calculated_values["timestamp"], pd.Timestamp)
|
||||
assert np.isnan(calculated_values["DestCountry"])
|
||||
assert calculated_values.shape == (5,)
|
||||
elif numeric_only is None:
|
||||
calculated_values = getattr(ed_flights, agg)(numeric_only=numeric_only)
|
||||
assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
|
||||
assert isinstance(calculated_values["Cancelled"], np.bool_)
|
||||
assert isinstance(calculated_values["dayOfWeek"], np.int64)
|
||||
assert isinstance(calculated_values["timestamp"], pd.Timestamp)
|
||||
assert calculated_values.shape == (4,)
|
||||
|
||||
# check dtypes and shape for sum
|
||||
@pytest.mark.parametrize("numeric_only", [False, None])
|
||||
def test_sum_numeric_only(self, numeric_only):
|
||||
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||
if numeric_only is False:
|
||||
calculated_values = ed_flights.sum(numeric_only=numeric_only)
|
||||
assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
|
||||
assert isinstance(calculated_values["dayOfWeek"], np.int64)
|
||||
assert isinstance(calculated_values["Cancelled"], np.int64)
|
||||
assert pd.isnull(calculated_values["timestamp"])
|
||||
assert np.isnan(calculated_values["DestCountry"])
|
||||
assert calculated_values.shape == (5,)
|
||||
elif numeric_only is None:
|
||||
calculated_values = ed_flights.sum(numeric_only=numeric_only)
|
||||
dtype_list = [calculated_values[i].dtype for i in calculated_values.index]
|
||||
assert dtype_list == [
|
||||
np.dtype("float64"),
|
||||
np.dtype("int64"),
|
||||
np.dtype("int64"),
|
||||
]
|
||||
assert calculated_values.shape == (3,)
|
||||
|
||||
# check dtypes and shape for std
|
||||
@pytest.mark.parametrize("numeric_only", [False, None])
|
||||
def test_std_numeric_only(self, numeric_only):
|
||||
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||
if numeric_only is False:
|
||||
calculated_values = ed_flights.std(numeric_only=numeric_only)
|
||||
assert isinstance(calculated_values["AvgTicketPrice"], float)
|
||||
assert isinstance(calculated_values["Cancelled"], float)
|
||||
assert isinstance(calculated_values["dayOfWeek"], float)
|
||||
assert pd.isnull(calculated_values["timestamp"])
|
||||
assert np.isnan(calculated_values["DestCountry"])
|
||||
assert calculated_values.shape == (5,)
|
||||
elif numeric_only is None:
|
||||
calculated_values = ed_flights.std(numeric_only=numeric_only)
|
||||
assert isinstance(calculated_values["AvgTicketPrice"], float)
|
||||
assert isinstance(calculated_values["Cancelled"], float)
|
||||
assert isinstance(calculated_values["dayOfWeek"], float)
|
||||
assert calculated_values.shape == (3,)
|
||||
|
||||
# check dtypes and shape for var
|
||||
@pytest.mark.parametrize("numeric_only", [False, None])
|
||||
def test_var_numeric_only(self, numeric_only):
|
||||
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||
if numeric_only is False:
|
||||
calculated_values = ed_flights.var(numeric_only=numeric_only)
|
||||
assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
|
||||
assert isinstance(calculated_values["dayOfWeek"], np.float64)
|
||||
assert isinstance(calculated_values["Cancelled"], np.float64)
|
||||
assert pd.isnull(calculated_values["timestamp"])
|
||||
assert np.isnan(calculated_values["DestCountry"])
|
||||
assert calculated_values.shape == (5,)
|
||||
elif numeric_only is None:
|
||||
calculated_values = ed_flights.var(numeric_only=numeric_only)
|
||||
assert isinstance(calculated_values["AvgTicketPrice"], float)
|
||||
assert isinstance(calculated_values["Cancelled"], float)
|
||||
assert isinstance(calculated_values["dayOfWeek"], float)
|
||||
assert calculated_values.shape == (3,)
|
||||
|
||||
# check dtypes and shape for mean
|
||||
@pytest.mark.parametrize("numeric_only", [False, None])
|
||||
def test_mean_numeric_only(self, numeric_only):
|
||||
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||
if numeric_only is False:
|
||||
calculated_values = ed_flights.mean(numeric_only=numeric_only)
|
||||
assert isinstance(calculated_values["AvgTicketPrice"], float)
|
||||
assert isinstance(calculated_values["dayOfWeek"], float)
|
||||
assert isinstance(calculated_values["Cancelled"], float)
|
||||
assert isinstance(calculated_values["timestamp"], pd.Timestamp)
|
||||
assert np.isnan(calculated_values["DestCountry"])
|
||||
assert calculated_values.shape == (5,)
|
||||
elif numeric_only is None:
|
||||
calculated_values = ed_flights.mean(numeric_only=numeric_only)
|
||||
assert isinstance(calculated_values["AvgTicketPrice"], float)
|
||||
assert isinstance(calculated_values["Cancelled"], float)
|
||||
assert isinstance(calculated_values["dayOfWeek"], float)
|
||||
assert isinstance(calculated_values["timestamp"], pd.Timestamp)
|
||||
assert calculated_values.shape == (4,)
|
||||
|
||||
# check dtypes and shape for mad
|
||||
@pytest.mark.parametrize("numeric_only", [False, None])
|
||||
def test_mad_numeric_only(self, numeric_only):
|
||||
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||
if numeric_only is False:
|
||||
calculated_values = ed_flights.mad(numeric_only=numeric_only)
|
||||
assert isinstance(calculated_values["AvgTicketPrice"], float)
|
||||
assert isinstance(calculated_values["Cancelled"], np.float64)
|
||||
assert isinstance(calculated_values["dayOfWeek"], float)
|
||||
assert pd.isnull(calculated_values["timestamp"])
|
||||
assert np.isnan(calculated_values["DestCountry"])
|
||||
assert calculated_values.shape == (5,)
|
||||
elif numeric_only is None:
|
||||
calculated_values = ed_flights.mad(numeric_only=numeric_only)
|
||||
assert isinstance(calculated_values["AvgTicketPrice"], float)
|
||||
assert isinstance(calculated_values["dayOfWeek"], float)
|
||||
assert calculated_values.shape == (2,)
|
||||
|
@ -72,7 +72,7 @@ class TestSeriesMetrics(TestData):
|
||||
if func == "nunique": # nunique never returns 'NaN'
|
||||
continue
|
||||
|
||||
ed_metric = getattr(ed_ecommerce, func)()
|
||||
ed_metric = getattr(ed_ecommerce, func)(numeric_only=False)
|
||||
print(func, ed_metric)
|
||||
assert np.isnan(ed_metric)
|
||||
|
||||
@ -86,7 +86,9 @@ class TestSeriesMetrics(TestData):
|
||||
|
||||
for func in self.all_funcs:
|
||||
pd_metric = getattr(pd_ecommerce, func)()
|
||||
ed_metric = getattr(ed_ecommerce, func)()
|
||||
ed_metric = getattr(ed_ecommerce, func)(
|
||||
**({"numeric_only": True} if (func != "nunique") else {})
|
||||
)
|
||||
self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)
|
||||
|
||||
@pytest.mark.parametrize("agg", ["mean", "min", "max"])
|
||||
|
Loading…
x
Reference in New Issue
Block a user