mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Switch agg defaults to numeric_only=None
This commit is contained in:
parent
c86371733d
commit
4d96ad39fd
@ -19,7 +19,7 @@ import sys
|
|||||||
import warnings
|
import warnings
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
import re
|
import re
|
||||||
from typing import Optional, Sequence, Union, Tuple
|
from typing import Optional, Sequence, Union, Tuple, List
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@ -1328,7 +1328,14 @@ class DataFrame(NDFrame):
|
|||||||
"""
|
"""
|
||||||
return self.columns
|
return self.columns
|
||||||
|
|
||||||
def aggregate(self, func, axis=0, *args, **kwargs):
|
def aggregate(
|
||||||
|
self,
|
||||||
|
func: Union[str, List[str]],
|
||||||
|
axis: int = 0,
|
||||||
|
numeric_only: Optional[bool] = None,
|
||||||
|
*args,
|
||||||
|
**kwargs,
|
||||||
|
) -> Union[pd.Series, pd.DataFrame]:
|
||||||
"""
|
"""
|
||||||
Aggregate using one or more operations over the specified axis.
|
Aggregate using one or more operations over the specified axis.
|
||||||
|
|
||||||
@ -1347,8 +1354,13 @@ class DataFrame(NDFrame):
|
|||||||
|
|
||||||
Currently, we only support ``['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile',
|
Currently, we only support ``['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile',
|
||||||
'rank', 'sem', 'skew', 'sum', 'std', 'var']``
|
'rank', 'sem', 'skew', 'sum', 'std', 'var']``
|
||||||
axis
|
axis: int
|
||||||
Currently, we only support axis=0 (index)
|
Currently, we only support axis=0 (index)
|
||||||
|
numeric_only: {True, False, None} Default is None
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: returns all values with float64, NaN/NaT are ignored.
|
||||||
|
- False: returns all values with default datatype.
|
||||||
|
- None: returns all values with default datatype.
|
||||||
*args
|
*args
|
||||||
Positional arguments to pass to `func`
|
Positional arguments to pass to `func`
|
||||||
**kwargs
|
**kwargs
|
||||||
@ -1368,12 +1380,30 @@ class DataFrame(NDFrame):
|
|||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> df = ed.DataFrame('localhost', 'flights')
|
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'DistanceKilometers', 'timestamp', 'DestCountry'])
|
||||||
>>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']).astype(int)
|
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=True).astype(int)
|
||||||
DistanceKilometers AvgTicketPrice
|
AvgTicketPrice DistanceKilometers
|
||||||
sum 92616288 8204364
|
sum 8204364 92616288
|
||||||
min 0 100
|
min 100 0
|
||||||
std 4578 266
|
std 266 4578
|
||||||
|
|
||||||
|
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=True)
|
||||||
|
AvgTicketPrice DistanceKilometers
|
||||||
|
sum 8.204365e+06 9.261629e+07
|
||||||
|
min 1.000205e+02 0.000000e+00
|
||||||
|
std 2.664071e+02 4.578614e+03
|
||||||
|
|
||||||
|
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=False)
|
||||||
|
AvgTicketPrice DistanceKilometers timestamp DestCountry
|
||||||
|
sum 8.204365e+06 9.261629e+07 NaT NaN
|
||||||
|
min 1.000205e+02 0.000000e+00 2018-01-01 NaN
|
||||||
|
std 2.664071e+02 4.578614e+03 NaT NaN
|
||||||
|
|
||||||
|
>>> df.aggregate(['sum', 'min', 'std'], numeric_only=None)
|
||||||
|
AvgTicketPrice DistanceKilometers timestamp DestCountry
|
||||||
|
sum 8.204365e+06 9.261629e+07 NaT NaN
|
||||||
|
min 1.000205e+02 0.000000e+00 2018-01-01 NaN
|
||||||
|
std 2.664071e+02 4.578614e+03 NaT NaN
|
||||||
"""
|
"""
|
||||||
axis = pd.DataFrame._get_axis_number(axis)
|
axis = pd.DataFrame._get_axis_number(axis)
|
||||||
|
|
||||||
@ -1387,10 +1417,14 @@ class DataFrame(NDFrame):
|
|||||||
# 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique']
|
# 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique']
|
||||||
if isinstance(func, str):
|
if isinstance(func, str):
|
||||||
# Wrap in list
|
# Wrap in list
|
||||||
return self._query_compiler.aggs([func]).squeeze().rename(None)
|
return (
|
||||||
|
self._query_compiler.aggs([func], numeric_only=numeric_only)
|
||||||
|
.squeeze()
|
||||||
|
.rename(None)
|
||||||
|
)
|
||||||
elif is_list_like(func):
|
elif is_list_like(func):
|
||||||
# we have a list!
|
# we have a list!
|
||||||
return self._query_compiler.aggs(func)
|
return self._query_compiler.aggs(func, numeric_only=numeric_only)
|
||||||
|
|
||||||
agg = aggregate
|
agg = aggregate
|
||||||
|
|
||||||
|
@ -100,6 +100,9 @@ class Field(NamedTuple):
|
|||||||
|
|
||||||
# Cardinality works for all types
|
# Cardinality works for all types
|
||||||
# Numerics and bools work for all aggs
|
# Numerics and bools work for all aggs
|
||||||
|
# Except "median_absolute_deviation" which doesn't support bool
|
||||||
|
if es_agg == "median_absolute_deviation" and self.is_bool:
|
||||||
|
return False
|
||||||
if es_agg == "cardinality" or self.is_numeric or self.is_bool:
|
if es_agg == "cardinality" or self.is_numeric or self.is_bool:
|
||||||
return True
|
return True
|
||||||
# Timestamps also work for 'min', 'max' and 'avg'
|
# Timestamps also work for 'min', 'max' and 'avg'
|
||||||
|
271
eland/ndframe.py
271
eland/ndframe.py
@ -17,7 +17,7 @@
|
|||||||
|
|
||||||
import sys
|
import sys
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import TYPE_CHECKING, Tuple
|
from typing import TYPE_CHECKING, Tuple, Optional
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from eland.query_compiler import QueryCompiler
|
from eland.query_compiler import QueryCompiler
|
||||||
|
|
||||||
@ -162,12 +162,19 @@ class NDFrame(ABC):
|
|||||||
def _es_info(self, buf):
|
def _es_info(self, buf):
|
||||||
self._query_compiler.es_info(buf)
|
self._query_compiler.es_info(buf)
|
||||||
|
|
||||||
def mean(self, numeric_only: bool = True) -> pd.Series:
|
def mean(self, numeric_only: Optional[bool] = None) -> pd.Series:
|
||||||
"""
|
"""
|
||||||
Return mean value for each numeric column
|
Return mean value for each numeric column
|
||||||
|
|
||||||
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
|
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is None
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
pandas.Series
|
pandas.Series
|
||||||
@ -179,27 +186,44 @@ class NDFrame(ABC):
|
|||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> df = ed.DataFrame('localhost', 'flights')
|
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||||
>>> df.mean()
|
>>> df.mean()
|
||||||
|
AvgTicketPrice 628.254
|
||||||
|
Cancelled 0.128494
|
||||||
|
dayOfWeek 2.83598
|
||||||
|
timestamp 2018-01-21 19:20:45.564438232
|
||||||
|
dtype: object
|
||||||
|
|
||||||
|
>>> df.mean(numeric_only=True)
|
||||||
AvgTicketPrice 628.253689
|
AvgTicketPrice 628.253689
|
||||||
Cancelled 0.128494
|
Cancelled 0.128494
|
||||||
DistanceKilometers 7092.142457
|
|
||||||
DistanceMiles 4406.853010
|
|
||||||
FlightDelay 0.251168
|
|
||||||
FlightDelayMin 47.335171
|
|
||||||
FlightTimeHour 8.518797
|
|
||||||
FlightTimeMin 511.127842
|
|
||||||
dayOfWeek 2.835975
|
dayOfWeek 2.835975
|
||||||
dtype: float64
|
dtype: float64
|
||||||
|
|
||||||
|
>>> df.mean(numeric_only=False)
|
||||||
|
AvgTicketPrice 628.254
|
||||||
|
Cancelled 0.128494
|
||||||
|
dayOfWeek 2.83598
|
||||||
|
timestamp 2018-01-21 19:20:45.564438232
|
||||||
|
DestCountry NaN
|
||||||
|
dtype: object
|
||||||
"""
|
"""
|
||||||
return self._query_compiler.mean(numeric_only=numeric_only)
|
return self._query_compiler.mean(numeric_only=numeric_only)
|
||||||
|
|
||||||
def sum(self, numeric_only: bool = True) -> pd.Series:
|
def sum(self, numeric_only: Optional[bool] = None) -> pd.Series:
|
||||||
"""
|
"""
|
||||||
Return sum for each numeric column
|
Return sum for each numeric column
|
||||||
|
|
||||||
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
|
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is None
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
pandas.Series
|
pandas.Series
|
||||||
@ -211,27 +235,43 @@ class NDFrame(ABC):
|
|||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> df = ed.DataFrame('localhost', 'flights')
|
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||||
>>> df.sum()
|
>>> df.sum()
|
||||||
|
AvgTicketPrice 8.20436e+06
|
||||||
|
Cancelled 1678
|
||||||
|
dayOfWeek 37035
|
||||||
|
dtype: object
|
||||||
|
|
||||||
|
>>> df.sum(numeric_only=True)
|
||||||
AvgTicketPrice 8.204365e+06
|
AvgTicketPrice 8.204365e+06
|
||||||
Cancelled 1.678000e+03
|
Cancelled 1.678000e+03
|
||||||
DistanceKilometers 9.261629e+07
|
|
||||||
DistanceMiles 5.754909e+07
|
|
||||||
FlightDelay 3.280000e+03
|
|
||||||
FlightDelayMin 6.181500e+05
|
|
||||||
FlightTimeHour 1.112470e+05
|
|
||||||
FlightTimeMin 6.674818e+06
|
|
||||||
dayOfWeek 3.703500e+04
|
dayOfWeek 3.703500e+04
|
||||||
dtype: float64
|
dtype: float64
|
||||||
|
|
||||||
|
>>> df.sum(numeric_only=False)
|
||||||
|
AvgTicketPrice 8.20436e+06
|
||||||
|
Cancelled 1678
|
||||||
|
dayOfWeek 37035
|
||||||
|
timestamp NaT
|
||||||
|
DestCountry NaN
|
||||||
|
dtype: object
|
||||||
"""
|
"""
|
||||||
return self._query_compiler.sum(numeric_only=numeric_only)
|
return self._query_compiler.sum(numeric_only=numeric_only)
|
||||||
|
|
||||||
def min(self, numeric_only: bool = True) -> pd.Series:
|
def min(self, numeric_only: Optional[bool] = None) -> pd.Series:
|
||||||
"""
|
"""
|
||||||
Return the minimum value for each numeric column
|
Return the minimum value for each numeric column
|
||||||
|
|
||||||
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
|
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is None
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
pandas.Series
|
pandas.Series
|
||||||
@ -243,25 +283,42 @@ class NDFrame(ABC):
|
|||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> df = ed.DataFrame('localhost', 'flights')
|
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||||
>>> df.min()
|
>>> df.min()
|
||||||
AvgTicketPrice 100.021
|
AvgTicketPrice 100.021
|
||||||
Cancelled False
|
Cancelled False
|
||||||
DistanceKilometers 0
|
|
||||||
DistanceMiles 0
|
|
||||||
FlightDelay False
|
|
||||||
FlightDelayMin 0
|
|
||||||
FlightTimeHour 0
|
|
||||||
FlightTimeMin 0
|
|
||||||
dayOfWeek 0
|
dayOfWeek 0
|
||||||
|
timestamp 2018-01-01 00:00:00
|
||||||
|
dtype: object
|
||||||
|
|
||||||
|
>>> df.min(numeric_only=True)
|
||||||
|
AvgTicketPrice 100.020531
|
||||||
|
Cancelled 0.000000
|
||||||
|
dayOfWeek 0.000000
|
||||||
|
dtype: float64
|
||||||
|
|
||||||
|
>>> df.min(numeric_only=False)
|
||||||
|
AvgTicketPrice 100.021
|
||||||
|
Cancelled False
|
||||||
|
dayOfWeek 0
|
||||||
|
timestamp 2018-01-01 00:00:00
|
||||||
|
DestCountry NaN
|
||||||
dtype: object
|
dtype: object
|
||||||
"""
|
"""
|
||||||
return self._query_compiler.min(numeric_only=numeric_only)
|
return self._query_compiler.min(numeric_only=numeric_only)
|
||||||
|
|
||||||
def var(self, numeric_only: bool = True) -> pd.Series:
|
def var(self, numeric_only: Optional[bool] = None) -> pd.Series:
|
||||||
"""
|
"""
|
||||||
Return variance for each numeric column
|
Return variance for each numeric column
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is None
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
pandas.Series
|
pandas.Series
|
||||||
@ -273,25 +330,41 @@ class NDFrame(ABC):
|
|||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> df = ed.DataFrame('localhost', 'flights')
|
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||||
>>> df.var() # doctest: +SKIP
|
>>> df.var()
|
||||||
AvgTicketPrice 7.096185e+04
|
AvgTicketPrice 70964.570234
|
||||||
Cancelled 1.119831e-01
|
Cancelled 0.111987
|
||||||
DistanceKilometers 2.096049e+07
|
dayOfWeek 3.761279
|
||||||
DistanceMiles 8.092892e+06
|
|
||||||
FlightDelay 1.880825e-01
|
|
||||||
FlightDelayMin 9.359209e+03
|
|
||||||
FlightTimeHour 3.112545e+01
|
|
||||||
FlightTimeMin 1.120516e+05
|
|
||||||
dayOfWeek 3.761135e+00
|
|
||||||
dtype: float64
|
dtype: float64
|
||||||
|
|
||||||
|
>>> df.var(numeric_only=True)
|
||||||
|
AvgTicketPrice 70964.570234
|
||||||
|
Cancelled 0.111987
|
||||||
|
dayOfWeek 3.761279
|
||||||
|
dtype: float64
|
||||||
|
|
||||||
|
>>> df.var(numeric_only=False)
|
||||||
|
AvgTicketPrice 70964.6
|
||||||
|
Cancelled 0.111987
|
||||||
|
dayOfWeek 3.76128
|
||||||
|
timestamp NaT
|
||||||
|
DestCountry NaN
|
||||||
|
dtype: object
|
||||||
"""
|
"""
|
||||||
return self._query_compiler.var(numeric_only=numeric_only)
|
return self._query_compiler.var(numeric_only=numeric_only)
|
||||||
|
|
||||||
def std(self, numeric_only: bool = True) -> pd.Series:
|
def std(self, numeric_only: Optional[bool] = None) -> pd.Series:
|
||||||
"""
|
"""
|
||||||
Return standard deviation for each numeric column
|
Return standard deviation for each numeric column
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is None
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
pandas.Series
|
pandas.Series
|
||||||
@ -303,25 +376,41 @@ class NDFrame(ABC):
|
|||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> df = ed.DataFrame('localhost', 'flights')
|
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||||
>>> df.std() # doctest: +SKIP
|
>>> df.std()
|
||||||
AvgTicketPrice 266.386661
|
AvgTicketPrice 266.407061
|
||||||
Cancelled 0.334639
|
Cancelled 0.334664
|
||||||
DistanceKilometers 4578.263193
|
dayOfWeek 1.939513
|
||||||
DistanceMiles 2844.800855
|
|
||||||
FlightDelay 0.433685
|
|
||||||
FlightDelayMin 96.743006
|
|
||||||
FlightTimeHour 5.579019
|
|
||||||
FlightTimeMin 334.741135
|
|
||||||
dayOfWeek 1.939365
|
|
||||||
dtype: float64
|
dtype: float64
|
||||||
|
|
||||||
|
>>> df.std(numeric_only=True)
|
||||||
|
AvgTicketPrice 266.407061
|
||||||
|
Cancelled 0.334664
|
||||||
|
dayOfWeek 1.939513
|
||||||
|
dtype: float64
|
||||||
|
|
||||||
|
>>> df.std(numeric_only=False)
|
||||||
|
AvgTicketPrice 266.407
|
||||||
|
Cancelled 0.334664
|
||||||
|
dayOfWeek 1.93951
|
||||||
|
timestamp NaT
|
||||||
|
DestCountry NaN
|
||||||
|
dtype: object
|
||||||
"""
|
"""
|
||||||
return self._query_compiler.std(numeric_only=numeric_only)
|
return self._query_compiler.std(numeric_only=numeric_only)
|
||||||
|
|
||||||
def median(self, numeric_only: bool = True) -> pd.Series:
|
def median(self, numeric_only: Optional[bool] = None) -> pd.Series:
|
||||||
"""
|
"""
|
||||||
Return the median value for each numeric column
|
Return the median value for each numeric column
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is None
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
pandas.Series
|
pandas.Series
|
||||||
@ -333,27 +422,44 @@ class NDFrame(ABC):
|
|||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> df = ed.DataFrame('localhost', 'flights')
|
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||||
>>> df.median() # doctest: +SKIP
|
>>> df.median() # doctest: +SKIP
|
||||||
AvgTicketPrice 640.387285
|
AvgTicketPrice 640.363
|
||||||
|
Cancelled False
|
||||||
|
dayOfWeek 3
|
||||||
|
timestamp 2018-01-21 23:54:06.624776611
|
||||||
|
dtype: object
|
||||||
|
|
||||||
|
>>> df.median(numeric_only=True) # doctest: +SKIP
|
||||||
|
AvgTicketPrice 640.362667
|
||||||
Cancelled 0.000000
|
Cancelled 0.000000
|
||||||
DistanceKilometers 7612.072403
|
|
||||||
DistanceMiles 4729.922470
|
|
||||||
FlightDelay 0.000000
|
|
||||||
FlightDelayMin 0.000000
|
|
||||||
FlightTimeHour 8.383113
|
|
||||||
FlightTimeMin 503.148975
|
|
||||||
dayOfWeek 3.000000
|
dayOfWeek 3.000000
|
||||||
dtype: float64
|
dtype: float64
|
||||||
|
|
||||||
|
>>> df.median(numeric_only=False) # doctest: +SKIP
|
||||||
|
AvgTicketPrice 640.387
|
||||||
|
Cancelled False
|
||||||
|
dayOfWeek 3
|
||||||
|
timestamp 2018-01-21 23:54:06.624776611
|
||||||
|
DestCountry NaN
|
||||||
|
dtype: object
|
||||||
"""
|
"""
|
||||||
return self._query_compiler.median(numeric_only=numeric_only)
|
return self._query_compiler.median(numeric_only=numeric_only)
|
||||||
|
|
||||||
def max(self, numeric_only: bool = True) -> pd.Series:
|
def max(self, numeric_only: Optional[bool] = None) -> pd.Series:
|
||||||
"""
|
"""
|
||||||
Return the maximum value for each numeric column
|
Return the maximum value for each numeric column
|
||||||
|
|
||||||
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
|
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is None
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
pandas.Series
|
pandas.Series
|
||||||
@ -365,17 +471,26 @@ class NDFrame(ABC):
|
|||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> df = ed.DataFrame('localhost', 'flights')
|
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||||
>>> df.max()
|
>>> df.max()
|
||||||
AvgTicketPrice 1199.73
|
AvgTicketPrice 1199.73
|
||||||
Cancelled True
|
Cancelled True
|
||||||
DistanceKilometers 19881.5
|
|
||||||
DistanceMiles 12353.8
|
|
||||||
FlightDelay True
|
|
||||||
FlightDelayMin 360
|
|
||||||
FlightTimeHour 31.715
|
|
||||||
FlightTimeMin 1902.9
|
|
||||||
dayOfWeek 6
|
dayOfWeek 6
|
||||||
|
timestamp 2018-02-11 23:50:12
|
||||||
|
dtype: object
|
||||||
|
|
||||||
|
>>> df.max(numeric_only=True)
|
||||||
|
AvgTicketPrice 1199.729004
|
||||||
|
Cancelled 1.000000
|
||||||
|
dayOfWeek 6.000000
|
||||||
|
dtype: float64
|
||||||
|
|
||||||
|
>>> df.max(numeric_only=False)
|
||||||
|
AvgTicketPrice 1199.73
|
||||||
|
Cancelled True
|
||||||
|
dayOfWeek 6
|
||||||
|
timestamp 2018-02-11 23:50:12
|
||||||
|
DestCountry NaN
|
||||||
dtype: object
|
dtype: object
|
||||||
"""
|
"""
|
||||||
return self._query_compiler.max(numeric_only=numeric_only)
|
return self._query_compiler.max(numeric_only=numeric_only)
|
||||||
@ -441,18 +556,24 @@ class NDFrame(ABC):
|
|||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> df = ed.DataFrame('localhost', 'flights')
|
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||||
>>> df.mad() # doctest: +SKIP
|
>>> df.mad() # doctest: +SKIP
|
||||||
AvgTicketPrice 213.368709
|
AvgTicketPrice 213.35497
|
||||||
Cancelled 0.000000
|
dayOfWeek 2.00000
|
||||||
DistanceKilometers 2946.168236
|
dtype: float64
|
||||||
DistanceMiles 1830.987236
|
|
||||||
FlightDelay 0.000000
|
>>> df.mad(numeric_only=True) # doctest: +SKIP
|
||||||
FlightDelayMin 0.000000
|
AvgTicketPrice 213.473011
|
||||||
FlightTimeHour 3.819435
|
|
||||||
FlightTimeMin 229.142297
|
|
||||||
dayOfWeek 2.000000
|
dayOfWeek 2.000000
|
||||||
dtype: float64
|
dtype: float64
|
||||||
|
|
||||||
|
>>> df.mad(numeric_only=False) # doctest: +SKIP
|
||||||
|
AvgTicketPrice 213.484
|
||||||
|
Cancelled NaN
|
||||||
|
dayOfWeek 2
|
||||||
|
timestamp NaT
|
||||||
|
DestCountry NaN
|
||||||
|
dtype: object
|
||||||
"""
|
"""
|
||||||
return self._query_compiler.mad(numeric_only=numeric_only)
|
return self._query_compiler.mad(numeric_only=numeric_only)
|
||||||
|
|
||||||
|
@ -145,43 +145,28 @@ class Operations:
|
|||||||
|
|
||||||
return build_pd_series(data=counts, index=fields)
|
return build_pd_series(data=counts, index=fields)
|
||||||
|
|
||||||
def mean(self, query_compiler, numeric_only=True):
|
def _metric_agg_series(
|
||||||
results = self._metric_aggs(query_compiler, ["mean"], numeric_only=numeric_only)
|
self,
|
||||||
return build_pd_series(results, index=results.keys())
|
query_compiler: "QueryCompiler",
|
||||||
|
agg: List,
|
||||||
def var(self, query_compiler, numeric_only=True):
|
numeric_only: Optional[bool] = None,
|
||||||
results = self._metric_aggs(query_compiler, ["var"], numeric_only=numeric_only)
|
) -> pd.Series:
|
||||||
return build_pd_series(results, index=results.keys())
|
results = self._metric_aggs(query_compiler, agg, numeric_only=numeric_only)
|
||||||
|
if numeric_only:
|
||||||
def std(self, query_compiler, numeric_only=True):
|
return build_pd_series(results, index=results.keys(), dtype=np.float64)
|
||||||
results = self._metric_aggs(query_compiler, ["std"], numeric_only=numeric_only)
|
else:
|
||||||
return build_pd_series(results, index=results.keys())
|
# If all results are float convert into float64
|
||||||
|
if all(isinstance(i, float) for i in results.values()):
|
||||||
def median(self, query_compiler, numeric_only=True):
|
dtype = np.float64
|
||||||
results = self._metric_aggs(
|
# If all results are int convert into int64
|
||||||
query_compiler, ["median"], numeric_only=numeric_only
|
elif all(isinstance(i, int) for i in results.values()):
|
||||||
)
|
dtype = np.int64
|
||||||
return build_pd_series(results, index=results.keys())
|
# If single result is present consider that datatype instead of object
|
||||||
|
elif len(results) <= 1:
|
||||||
def sum(self, query_compiler, numeric_only=True):
|
dtype = None
|
||||||
results = self._metric_aggs(query_compiler, ["sum"], numeric_only=numeric_only)
|
else:
|
||||||
return build_pd_series(results, index=results.keys())
|
dtype = "object"
|
||||||
|
return build_pd_series(results, index=results.keys(), dtype=dtype)
|
||||||
def max(self, query_compiler, numeric_only=True):
|
|
||||||
results = self._metric_aggs(query_compiler, ["max"], numeric_only=numeric_only)
|
|
||||||
return build_pd_series(results, index=results.keys())
|
|
||||||
|
|
||||||
def min(self, query_compiler, numeric_only=True):
|
|
||||||
results = self._metric_aggs(query_compiler, ["min"], numeric_only=numeric_only)
|
|
||||||
return build_pd_series(results, index=results.keys())
|
|
||||||
|
|
||||||
def nunique(self, query_compiler):
|
|
||||||
results = self._metric_aggs(query_compiler, ["nunique"], numeric_only=False)
|
|
||||||
return build_pd_series(results, index=results.keys())
|
|
||||||
|
|
||||||
def mad(self, query_compiler, numeric_only=True):
|
|
||||||
results = self._metric_aggs(query_compiler, ["mad"], numeric_only=numeric_only)
|
|
||||||
return build_pd_series(results, index=results.keys())
|
|
||||||
|
|
||||||
def value_counts(self, query_compiler, es_size):
|
def value_counts(self, query_compiler, es_size):
|
||||||
return self._terms_aggs(query_compiler, "terms", es_size)
|
return self._terms_aggs(query_compiler, "terms", es_size)
|
||||||
@ -189,7 +174,21 @@ class Operations:
|
|||||||
def hist(self, query_compiler, bins):
|
def hist(self, query_compiler, bins):
|
||||||
return self._hist_aggs(query_compiler, bins)
|
return self._hist_aggs(query_compiler, bins)
|
||||||
|
|
||||||
def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=True):
|
def aggs(self, query_compiler, pd_aggs, numeric_only=None) -> pd.DataFrame:
|
||||||
|
results = self._metric_aggs(
|
||||||
|
query_compiler, pd_aggs, numeric_only=numeric_only, is_dataframe_agg=True
|
||||||
|
)
|
||||||
|
return pd.DataFrame(
|
||||||
|
results, index=pd_aggs, dtype=(np.float64 if numeric_only else None)
|
||||||
|
)
|
||||||
|
|
||||||
|
def _metric_aggs(
|
||||||
|
self,
|
||||||
|
query_compiler: "QueryCompiler",
|
||||||
|
pd_aggs,
|
||||||
|
numeric_only: Optional[bool] = None,
|
||||||
|
is_dataframe_agg: bool = False,
|
||||||
|
) -> Dict:
|
||||||
query_params, post_processing = self._resolve_tasks(query_compiler)
|
query_params, post_processing = self._resolve_tasks(query_compiler)
|
||||||
|
|
||||||
size = self._size(query_params, post_processing)
|
size = self._size(query_params, post_processing)
|
||||||
@ -201,6 +200,7 @@ class Operations:
|
|||||||
results = {}
|
results = {}
|
||||||
fields = query_compiler._mappings.all_source_fields()
|
fields = query_compiler._mappings.all_source_fields()
|
||||||
if numeric_only:
|
if numeric_only:
|
||||||
|
# Consider if field is Int/Float/Bool
|
||||||
fields = [field for field in fields if (field.is_numeric or field.is_bool)]
|
fields = [field for field in fields if (field.is_numeric or field.is_bool)]
|
||||||
|
|
||||||
body = Query(query_params.query)
|
body = Query(query_params.query)
|
||||||
@ -210,6 +210,7 @@ class Operations:
|
|||||||
|
|
||||||
for field in fields:
|
for field in fields:
|
||||||
for es_agg in es_aggs:
|
for es_agg in es_aggs:
|
||||||
|
# NaN/NaT fields are ignored
|
||||||
if not field.is_es_agg_compatible(es_agg):
|
if not field.is_es_agg_compatible(es_agg):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -241,9 +242,17 @@ class Operations:
|
|||||||
for field in fields:
|
for field in fields:
|
||||||
values = []
|
values = []
|
||||||
for es_agg, pd_agg in zip(es_aggs, pd_aggs):
|
for es_agg, pd_agg in zip(es_aggs, pd_aggs):
|
||||||
|
# is_dataframe_agg is used to differentiate agg() and an aggregation called through .mean()
|
||||||
# If the field and agg aren't compatible we add a NaN/NaT
|
# If the field and agg aren't compatible we add a NaN/NaT for agg
|
||||||
|
# For an aggregation called through .mean() a NaN/NaT is only added when numeric_only is False
|
||||||
if not field.is_es_agg_compatible(es_agg):
|
if not field.is_es_agg_compatible(es_agg):
|
||||||
|
if is_dataframe_agg and not numeric_only:
|
||||||
|
values.append(field.nan_value)
|
||||||
|
elif not is_dataframe_agg and numeric_only is False:
|
||||||
|
values.append(field.nan_value)
|
||||||
|
# Explicit condition for mad to add NaN because it doesn't support bool
|
||||||
|
elif is_dataframe_agg and numeric_only:
|
||||||
|
if pd_agg == "mad":
|
||||||
values.append(field.nan_value)
|
values.append(field.nan_value)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -269,7 +278,7 @@ class Operations:
|
|||||||
|
|
||||||
# All of the below calculations result in NaN if count<=1
|
# All of the below calculations result in NaN if count<=1
|
||||||
if count <= 1:
|
if count <= 1:
|
||||||
agg_value = np.float64(np.NaN)
|
agg_value = np.NaN
|
||||||
|
|
||||||
elif es_agg[1] == "std_deviation":
|
elif es_agg[1] == "std_deviation":
|
||||||
agg_value *= count / (count - 1.0)
|
agg_value *= count / (count - 1.0)
|
||||||
@ -287,8 +296,11 @@ class Operations:
|
|||||||
]["value"]
|
]["value"]
|
||||||
|
|
||||||
# Null usually means there were no results.
|
# Null usually means there were no results.
|
||||||
if agg_value is None:
|
if agg_value is None or np.isnan(agg_value):
|
||||||
agg_value = field.nan_value
|
if is_dataframe_agg and not numeric_only:
|
||||||
|
agg_value = np.NaN
|
||||||
|
elif not is_dataframe_agg and numeric_only is False:
|
||||||
|
agg_value = np.NaN
|
||||||
|
|
||||||
# Cardinality is always either NaN or integer.
|
# Cardinality is always either NaN or integer.
|
||||||
elif pd_agg == "nunique":
|
elif pd_agg == "nunique":
|
||||||
@ -299,13 +311,20 @@ class Operations:
|
|||||||
agg_value = elasticsearch_date_to_pandas_date(
|
agg_value = elasticsearch_date_to_pandas_date(
|
||||||
agg_value, field.es_date_format
|
agg_value, field.es_date_format
|
||||||
)
|
)
|
||||||
|
# If numeric_only is False or None then maintain column datatype
|
||||||
# These aggregations maintain the column datatype
|
elif not numeric_only:
|
||||||
elif pd_agg in {"max", "min", "median"}:
|
# we're only converting to bool for lossless aggs like min, max, and median.
|
||||||
|
if pd_agg in {"max", "min", "median", "sum"}:
|
||||||
|
# 'sum' isn't representable with bool, use int64
|
||||||
|
if pd_agg == "sum" and field.is_bool:
|
||||||
|
agg_value = np.int64(agg_value)
|
||||||
|
else:
|
||||||
agg_value = field.np_dtype.type(agg_value)
|
agg_value = field.np_dtype.type(agg_value)
|
||||||
|
|
||||||
values.append(agg_value)
|
values.append(agg_value)
|
||||||
|
|
||||||
|
# An incompatible field may have had no value appended (e.g. numeric_only=True), so only store non-empty results.
|
||||||
|
if values:
|
||||||
results[field.index] = values if len(values) > 1 else values[0]
|
results[field.index] = values if len(values) > 1 else values[0]
|
||||||
|
|
||||||
return results
|
return results
|
||||||
@ -540,10 +559,6 @@ class Operations:
|
|||||||
|
|
||||||
return es_aggs
|
return es_aggs
|
||||||
|
|
||||||
def aggs(self, query_compiler, pd_aggs):
|
|
||||||
results = self._metric_aggs(query_compiler, pd_aggs, numeric_only=False)
|
|
||||||
return pd.DataFrame(results, index=pd_aggs)
|
|
||||||
|
|
||||||
def filter(self, query_compiler, items=None, like=None, regex=None):
|
def filter(self, query_compiler, items=None, like=None, regex=None):
|
||||||
# This function is only called for axis='index',
|
# This function is only called for axis='index',
|
||||||
# DataFrame.filter(..., axis="columns") calls .drop()
|
# DataFrame.filter(..., axis="columns") calls .drop()
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
|
|
||||||
import copy
|
import copy
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional, TYPE_CHECKING
|
from typing import Optional, TYPE_CHECKING, List
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@ -490,38 +490,56 @@ class QueryCompiler:
|
|||||||
result._operations.filter(self, items=items, like=like, regex=regex)
|
result._operations.filter(self, items=items, like=like, regex=regex)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def aggs(self, func):
|
def aggs(self, func: List[str], numeric_only: Optional[bool] = None):
|
||||||
return self._operations.aggs(self, func)
|
return self._operations.aggs(self, func, numeric_only=numeric_only)
|
||||||
|
|
||||||
def count(self):
|
def count(self):
|
||||||
return self._operations.count(self)
|
return self._operations.count(self)
|
||||||
|
|
||||||
def mean(self, numeric_only=None):
|
def mean(self, numeric_only: Optional[bool] = None):
|
||||||
return self._operations.mean(self, numeric_only=numeric_only)
|
return self._operations._metric_agg_series(
|
||||||
|
self, ["mean"], numeric_only=numeric_only
|
||||||
|
)
|
||||||
|
|
||||||
def var(self, numeric_only=None):
|
def var(self, numeric_only: Optional[bool] = None):
|
||||||
return self._operations.var(self, numeric_only=numeric_only)
|
return self._operations._metric_agg_series(
|
||||||
|
self, ["var"], numeric_only=numeric_only
|
||||||
|
)
|
||||||
|
|
||||||
def std(self, numeric_only=None):
|
def std(self, numeric_only: Optional[bool] = None):
|
||||||
return self._operations.std(self, numeric_only=numeric_only)
|
return self._operations._metric_agg_series(
|
||||||
|
self, ["std"], numeric_only=numeric_only
|
||||||
|
)
|
||||||
|
|
||||||
def mad(self, numeric_only=None):
|
def mad(self, numeric_only: Optional[bool] = None):
|
||||||
return self._operations.mad(self, numeric_only=numeric_only)
|
return self._operations._metric_agg_series(
|
||||||
|
self, ["mad"], numeric_only=numeric_only
|
||||||
|
)
|
||||||
|
|
||||||
def median(self, numeric_only=None):
|
def median(self, numeric_only: Optional[bool] = None):
|
||||||
return self._operations.median(self, numeric_only=numeric_only)
|
return self._operations._metric_agg_series(
|
||||||
|
self, ["median"], numeric_only=numeric_only
|
||||||
|
)
|
||||||
|
|
||||||
def sum(self, numeric_only=None):
|
def sum(self, numeric_only: Optional[bool] = None):
|
||||||
return self._operations.sum(self, numeric_only=numeric_only)
|
return self._operations._metric_agg_series(
|
||||||
|
self, ["sum"], numeric_only=numeric_only
|
||||||
|
)
|
||||||
|
|
||||||
def min(self, numeric_only=None):
|
def min(self, numeric_only: Optional[bool] = None):
|
||||||
return self._operations.min(self, numeric_only=numeric_only)
|
return self._operations._metric_agg_series(
|
||||||
|
self, ["min"], numeric_only=numeric_only
|
||||||
|
)
|
||||||
|
|
||||||
def max(self, numeric_only=None):
|
def max(self, numeric_only: Optional[bool] = None):
|
||||||
return self._operations.max(self, numeric_only=numeric_only)
|
return self._operations._metric_agg_series(
|
||||||
|
self, ["max"], numeric_only=numeric_only
|
||||||
|
)
|
||||||
|
|
||||||
def nunique(self):
|
def nunique(self):
|
||||||
return self._operations.nunique(self)
|
return self._operations._metric_agg_series(
|
||||||
|
self, ["nunique"], numeric_only=False
|
||||||
|
)
|
||||||
|
|
||||||
def value_counts(self, es_size):
|
def value_counts(self, es_size):
|
||||||
return self._operations.value_counts(self, es_size)
|
return self._operations.value_counts(self, es_size)
|
||||||
|
@ -29,7 +29,9 @@ class TestDataFrameAggs(TestData):
|
|||||||
ed_flights = self.ed_flights()
|
ed_flights = self.ed_flights()
|
||||||
|
|
||||||
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
|
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
|
||||||
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
|
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(
|
||||||
|
["sum", "min"], numeric_only=True
|
||||||
|
)
|
||||||
|
|
||||||
# Eland returns all float values for all metric aggs, pandas can return int
|
# Eland returns all float values for all metric aggs, pandas can return int
|
||||||
# TODO - investigate this more
|
# TODO - investigate this more
|
||||||
@ -40,22 +42,22 @@ class TestDataFrameAggs(TestData):
|
|||||||
["sum", "min", "std"]
|
["sum", "min", "std"]
|
||||||
)
|
)
|
||||||
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(
|
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(
|
||||||
["sum", "min", "std"]
|
["sum", "min", "std"], numeric_only=True
|
||||||
)
|
)
|
||||||
|
|
||||||
print(pd_sum_min_std.dtypes)
|
print(pd_sum_min_std.dtypes)
|
||||||
print(ed_sum_min_std.dtypes)
|
print(ed_sum_min_std.dtypes)
|
||||||
|
|
||||||
assert_frame_equal(
|
assert_frame_equal(pd_sum_min_std, ed_sum_min_std, check_exact=False, rtol=True)
|
||||||
pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_terms_aggs(self):
|
def test_terms_aggs(self):
|
||||||
pd_flights = self.pd_flights()
|
pd_flights = self.pd_flights()
|
||||||
ed_flights = self.ed_flights()
|
ed_flights = self.ed_flights()
|
||||||
|
|
||||||
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
|
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
|
||||||
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["sum", "min"])
|
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(
|
||||||
|
["sum", "min"], numeric_only=True
|
||||||
|
)
|
||||||
|
|
||||||
# Eland returns all float values for all metric aggs, pandas can return int
|
# Eland returns all float values for all metric aggs, pandas can return int
|
||||||
# TODO - investigate this more
|
# TODO - investigate this more
|
||||||
@ -66,15 +68,13 @@ class TestDataFrameAggs(TestData):
|
|||||||
["sum", "min", "std"]
|
["sum", "min", "std"]
|
||||||
)
|
)
|
||||||
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(
|
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(
|
||||||
["sum", "min", "std"]
|
["sum", "min", "std"], numeric_only=True
|
||||||
)
|
)
|
||||||
|
|
||||||
print(pd_sum_min_std.dtypes)
|
print(pd_sum_min_std.dtypes)
|
||||||
print(ed_sum_min_std.dtypes)
|
print(ed_sum_min_std.dtypes)
|
||||||
|
|
||||||
assert_frame_equal(
|
assert_frame_equal(pd_sum_min_std, ed_sum_min_std, check_exact=False, rtol=True)
|
||||||
pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_aggs_median_var(self):
|
def test_aggs_median_var(self):
|
||||||
pd_ecommerce = self.pd_ecommerce()
|
pd_ecommerce = self.pd_ecommerce()
|
||||||
@ -85,7 +85,7 @@ class TestDataFrameAggs(TestData):
|
|||||||
].agg(["median", "var"])
|
].agg(["median", "var"])
|
||||||
ed_aggs = ed_ecommerce[
|
ed_aggs = ed_ecommerce[
|
||||||
["taxful_total_price", "taxless_total_price", "total_quantity"]
|
["taxful_total_price", "taxless_total_price", "total_quantity"]
|
||||||
].agg(["median", "var"])
|
].agg(["median", "var"], numeric_only=True)
|
||||||
|
|
||||||
print(pd_aggs, pd_aggs.dtypes)
|
print(pd_aggs, pd_aggs.dtypes)
|
||||||
print(ed_aggs, ed_aggs.dtypes)
|
print(ed_aggs, ed_aggs.dtypes)
|
||||||
@ -102,7 +102,9 @@ class TestDataFrameAggs(TestData):
|
|||||||
ed_flights = self.ed_flights()
|
ed_flights = self.ed_flights()
|
||||||
|
|
||||||
pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg(agg)
|
pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg(agg)
|
||||||
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(agg)
|
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(
|
||||||
|
agg, numeric_only=True
|
||||||
|
)
|
||||||
|
|
||||||
assert_series_equal(pd_sum_min_std, ed_sum_min_std)
|
assert_series_equal(pd_sum_min_std, ed_sum_min_std)
|
||||||
|
|
||||||
@ -112,7 +114,9 @@ class TestDataFrameAggs(TestData):
|
|||||||
ed_flights = self.ed_flights()
|
ed_flights = self.ed_flights()
|
||||||
|
|
||||||
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["mean"])
|
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["mean"])
|
||||||
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["mean"])
|
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(
|
||||||
|
["mean"], numeric_only=True
|
||||||
|
)
|
||||||
|
|
||||||
assert_frame_equal(pd_sum_min, ed_sum_min)
|
assert_frame_equal(pd_sum_min, ed_sum_min)
|
||||||
|
|
||||||
|
@ -16,18 +16,23 @@
|
|||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
# File called _pytest for PyCharm compatibility
|
# File called _pytest for PyCharm compatibility
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas.testing import assert_series_equal
|
from pandas.testing import assert_series_equal
|
||||||
|
|
||||||
from eland.tests.common import TestData
|
from eland.tests.common import TestData
|
||||||
|
|
||||||
|
|
||||||
class TestDataFrameMetrics(TestData):
|
class TestDataFrameMetrics(TestData):
|
||||||
funcs = ["max", "min", "mean", "sum"]
|
funcs = ["max", "min", "mean", "sum"]
|
||||||
extended_funcs = ["median", "mad", "var", "std"]
|
extended_funcs = ["median", "mad", "var", "std"]
|
||||||
|
filter_data = [
|
||||||
|
"AvgTicketPrice",
|
||||||
|
"Cancelled",
|
||||||
|
"dayOfWeek",
|
||||||
|
"timestamp",
|
||||||
|
"DestCountry",
|
||||||
|
]
|
||||||
|
|
||||||
@pytest.mark.parametrize("numeric_only", [False, None])
|
@pytest.mark.parametrize("numeric_only", [False, None])
|
||||||
def test_flights_metrics(self, numeric_only):
|
def test_flights_metrics(self, numeric_only):
|
||||||
@ -49,7 +54,7 @@ class TestDataFrameMetrics(TestData):
|
|||||||
pd_metric = getattr(pd_flights, func)(numeric_only=numeric_only)
|
pd_metric = getattr(pd_flights, func)(numeric_only=numeric_only)
|
||||||
ed_metric = getattr(ed_flights, func)(numeric_only=numeric_only)
|
ed_metric = getattr(ed_flights, func)(numeric_only=numeric_only)
|
||||||
|
|
||||||
assert_series_equal(pd_metric, ed_metric)
|
assert_series_equal(pd_metric, ed_metric, check_dtype=False)
|
||||||
|
|
||||||
def test_flights_extended_metrics(self):
|
def test_flights_extended_metrics(self):
|
||||||
pd_flights = self.pd_flights()
|
pd_flights = self.pd_flights()
|
||||||
@ -86,11 +91,9 @@ class TestDataFrameMetrics(TestData):
|
|||||||
|
|
||||||
for func in self.extended_funcs:
|
for func in self.extended_funcs:
|
||||||
pd_metric = getattr(pd_flights_1, func)()
|
pd_metric = getattr(pd_flights_1, func)()
|
||||||
ed_metric = getattr(ed_flights_1, func)()
|
ed_metric = getattr(ed_flights_1, func)(numeric_only=False)
|
||||||
|
|
||||||
assert_series_equal(
|
assert_series_equal(pd_metric, ed_metric, check_exact=False)
|
||||||
pd_metric, ed_metric, check_exact=False, check_less_precise=True
|
|
||||||
)
|
|
||||||
|
|
||||||
# Test on zero rows to test NaN behaviour of sample std/variance
|
# Test on zero rows to test NaN behaviour of sample std/variance
|
||||||
pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
|
pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
|
||||||
@ -98,11 +101,9 @@ class TestDataFrameMetrics(TestData):
|
|||||||
|
|
||||||
for func in self.extended_funcs:
|
for func in self.extended_funcs:
|
||||||
pd_metric = getattr(pd_flights_0, func)()
|
pd_metric = getattr(pd_flights_0, func)()
|
||||||
ed_metric = getattr(ed_flights_0, func)()
|
ed_metric = getattr(ed_flights_0, func)(numeric_only=False)
|
||||||
|
|
||||||
assert_series_equal(
|
assert_series_equal(pd_metric, ed_metric, check_exact=False)
|
||||||
pd_metric, ed_metric, check_exact=False, check_less_precise=True
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_ecommerce_selected_non_numeric_source_fields(self):
|
def test_ecommerce_selected_non_numeric_source_fields(self):
|
||||||
# None of these are numeric
|
# None of these are numeric
|
||||||
@ -121,7 +122,7 @@ class TestDataFrameMetrics(TestData):
|
|||||||
assert_series_equal(
|
assert_series_equal(
|
||||||
getattr(pd_ecommerce, func)(numeric_only=True),
|
getattr(pd_ecommerce, func)(numeric_only=True),
|
||||||
getattr(ed_ecommerce, func)(numeric_only=True),
|
getattr(ed_ecommerce, func)(numeric_only=True),
|
||||||
check_less_precise=True,
|
check_exact=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_ecommerce_selected_mixed_numeric_source_fields(self):
|
def test_ecommerce_selected_mixed_numeric_source_fields(self):
|
||||||
@ -143,7 +144,7 @@ class TestDataFrameMetrics(TestData):
|
|||||||
assert_series_equal(
|
assert_series_equal(
|
||||||
getattr(pd_ecommerce, func)(numeric_only=True),
|
getattr(pd_ecommerce, func)(numeric_only=True),
|
||||||
getattr(ed_ecommerce, func)(numeric_only=True),
|
getattr(ed_ecommerce, func)(numeric_only=True),
|
||||||
check_less_precise=True,
|
check_exact=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_ecommerce_selected_all_numeric_source_fields(self):
|
def test_ecommerce_selected_all_numeric_source_fields(self):
|
||||||
@ -157,27 +158,27 @@ class TestDataFrameMetrics(TestData):
|
|||||||
assert_series_equal(
|
assert_series_equal(
|
||||||
getattr(pd_ecommerce, func)(numeric_only=True),
|
getattr(pd_ecommerce, func)(numeric_only=True),
|
||||||
getattr(ed_ecommerce, func)(numeric_only=True),
|
getattr(ed_ecommerce, func)(numeric_only=True),
|
||||||
check_less_precise=True,
|
check_exact=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_flights_datetime_metrics_agg(self):
|
def test_flights_datetime_metrics_agg(self):
|
||||||
ed_timestamps = self.ed_flights()[["timestamp"]]
|
ed_timestamps = self.ed_flights()[["timestamp"]]
|
||||||
expected_values = {
|
expected_values = {
|
||||||
"timestamp": {
|
"max": pd.Timestamp("2018-02-11 23:50:12"),
|
||||||
"min": pd.Timestamp("2018-01-01 00:00:00"),
|
"min": pd.Timestamp("2018-01-01 00:00:00"),
|
||||||
"mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
|
"mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
|
||||||
"max": pd.Timestamp("2018-02-11 23:50:12"),
|
|
||||||
"nunique": 12236,
|
|
||||||
"mad": pd.NaT,
|
|
||||||
"std": pd.NaT,
|
|
||||||
"sum": pd.NaT,
|
"sum": pd.NaT,
|
||||||
|
"mad": pd.NaT,
|
||||||
"var": pd.NaT,
|
"var": pd.NaT,
|
||||||
}
|
"std": pd.NaT,
|
||||||
|
"nunique": 12236,
|
||||||
}
|
}
|
||||||
|
|
||||||
ed_metrics = ed_timestamps.agg(self.funcs + self.extended_funcs + ["nunique"])
|
ed_metrics = ed_timestamps.agg(
|
||||||
ed_metrics_dict = ed_metrics.to_dict()
|
self.funcs + self.extended_funcs + ["nunique"], numeric_only=False
|
||||||
ed_metrics_dict["timestamp"].pop("median") # Median is tested below.
|
)
|
||||||
|
ed_metrics_dict = ed_metrics["timestamp"].to_dict()
|
||||||
|
ed_metrics_dict.pop("median") # Median is tested below.
|
||||||
assert ed_metrics_dict == expected_values
|
assert ed_metrics_dict == expected_values
|
||||||
|
|
||||||
@pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"])
|
@pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"])
|
||||||
@ -192,8 +193,10 @@ class TestDataFrameMetrics(TestData):
|
|||||||
ed_metric = ed_timestamps.agg([agg])
|
ed_metric = ed_timestamps.agg([agg])
|
||||||
|
|
||||||
if agg == "nunique":
|
if agg == "nunique":
|
||||||
|
# df with timestamp column should return int64
|
||||||
assert ed_metric.dtypes["timestamp"] == np.int64
|
assert ed_metric.dtypes["timestamp"] == np.int64
|
||||||
else:
|
else:
|
||||||
|
# df with timestamp column should return datetime64[ns]
|
||||||
assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]")
|
assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]")
|
||||||
assert ed_metric["timestamp"][0] == expected_values[agg]
|
assert ed_metric["timestamp"][0] == expected_values[agg]
|
||||||
|
|
||||||
@ -230,7 +233,7 @@ class TestDataFrameMetrics(TestData):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def test_metric_agg_keep_dtypes(self):
|
def test_metric_agg_keep_dtypes(self):
|
||||||
# max, min, and median maintain their dtypes
|
# max, min and median maintain their dtypes
|
||||||
df = self.ed_flights_small()[["AvgTicketPrice", "Cancelled", "dayOfWeek"]]
|
df = self.ed_flights_small()[["AvgTicketPrice", "Cancelled", "dayOfWeek"]]
|
||||||
assert df.min().tolist() == [131.81910705566406, False, 0]
|
assert df.min().tolist() == [131.81910705566406, False, 0]
|
||||||
assert df.max().tolist() == [989.9527587890625, True, 0]
|
assert df.max().tolist() == [989.9527587890625, True, 0]
|
||||||
@ -250,3 +253,162 @@ class TestDataFrameMetrics(TestData):
|
|||||||
"Cancelled": {"max": True, "median": False, "min": False},
|
"Cancelled": {"max": True, "median": False, "min": False},
|
||||||
"dayOfWeek": {"max": 0, "median": 0, "min": 0},
|
"dayOfWeek": {"max": 0, "median": 0, "min": 0},
|
||||||
}
|
}
|
||||||
|
# sum should always be the same dtype as the input, except for bool where the sum of bools should be an int64.
|
||||||
|
sum_agg = df.agg(["sum"])
|
||||||
|
assert sum_agg.dtypes.to_list() == [
|
||||||
|
np.dtype("float64"),
|
||||||
|
np.dtype("int64"),
|
||||||
|
np.dtype("int64"),
|
||||||
|
]
|
||||||
|
assert sum_agg.to_dict() == {
|
||||||
|
"AvgTicketPrice": {"sum": 26521.624084472656},
|
||||||
|
"Cancelled": {"sum": 6},
|
||||||
|
"dayOfWeek": {"sum": 0},
|
||||||
|
}
|
||||||
|
|
||||||
|
def test_flights_numeric_only(self):
|
||||||
|
# All Aggregations Data Check
|
||||||
|
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||||
|
pd_flights = self.pd_flights().filter(self.filter_data)
|
||||||
|
# agg => numeric_only True returns float64 values
|
||||||
|
# We compare it with individual single agg functions of pandas with numeric_only=True
|
||||||
|
filtered_aggs = self.funcs + self.extended_funcs
|
||||||
|
agg_data = ed_flights.agg(filtered_aggs, numeric_only=True).transpose()
|
||||||
|
for agg in filtered_aggs:
|
||||||
|
# Explicitly check for mad because it returns nan for bools
|
||||||
|
if agg == "mad":
|
||||||
|
assert np.isnan(agg_data[agg]["Cancelled"])
|
||||||
|
else:
|
||||||
|
assert_series_equal(
|
||||||
|
agg_data[agg].rename(None),
|
||||||
|
getattr(pd_flights, agg)(numeric_only=True),
|
||||||
|
check_exact=False,
|
||||||
|
rtol=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# all single aggs return float64 for numeric_only=True
|
||||||
|
def test_numeric_only_true_single_aggs(self):
|
||||||
|
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||||
|
for agg in self.funcs + self.extended_funcs:
|
||||||
|
result = getattr(ed_flights, agg)(numeric_only=True)
|
||||||
|
assert result.dtype == np.dtype("float64")
|
||||||
|
assert result.shape == ((3,) if agg != "mad" else (2,))
|
||||||
|
|
||||||
|
# check dtypes and shape of min, max and median for numeric_only=False | None
|
||||||
|
@pytest.mark.parametrize("agg", ["min", "max", "median"])
|
||||||
|
@pytest.mark.parametrize("numeric_only", [False, None])
|
||||||
|
def test_min_max_median_numeric_only(self, agg, numeric_only):
|
||||||
|
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||||
|
if numeric_only is False:
|
||||||
|
calculated_values = getattr(ed_flights, agg)(numeric_only=numeric_only)
|
||||||
|
assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
|
||||||
|
assert isinstance(calculated_values["Cancelled"], np.bool_)
|
||||||
|
assert isinstance(calculated_values["dayOfWeek"], np.int64)
|
||||||
|
assert isinstance(calculated_values["timestamp"], pd.Timestamp)
|
||||||
|
assert np.isnan(calculated_values["DestCountry"])
|
||||||
|
assert calculated_values.shape == (5,)
|
||||||
|
elif numeric_only is None:
|
||||||
|
calculated_values = getattr(ed_flights, agg)(numeric_only=numeric_only)
|
||||||
|
assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
|
||||||
|
assert isinstance(calculated_values["Cancelled"], np.bool_)
|
||||||
|
assert isinstance(calculated_values["dayOfWeek"], np.int64)
|
||||||
|
assert isinstance(calculated_values["timestamp"], pd.Timestamp)
|
||||||
|
assert calculated_values.shape == (4,)
|
||||||
|
|
||||||
|
# check dtypes and shape for sum
|
||||||
|
@pytest.mark.parametrize("numeric_only", [False, None])
|
||||||
|
def test_sum_numeric_only(self, numeric_only):
|
||||||
|
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||||
|
if numeric_only is False:
|
||||||
|
calculated_values = ed_flights.sum(numeric_only=numeric_only)
|
||||||
|
assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
|
||||||
|
assert isinstance(calculated_values["dayOfWeek"], np.int64)
|
||||||
|
assert isinstance(calculated_values["Cancelled"], np.int64)
|
||||||
|
assert pd.isnull(calculated_values["timestamp"])
|
||||||
|
assert np.isnan(calculated_values["DestCountry"])
|
||||||
|
assert calculated_values.shape == (5,)
|
||||||
|
elif numeric_only is None:
|
||||||
|
calculated_values = ed_flights.sum(numeric_only=numeric_only)
|
||||||
|
dtype_list = [calculated_values[i].dtype for i in calculated_values.index]
|
||||||
|
assert dtype_list == [
|
||||||
|
np.dtype("float64"),
|
||||||
|
np.dtype("int64"),
|
||||||
|
np.dtype("int64"),
|
||||||
|
]
|
||||||
|
assert calculated_values.shape == (3,)
|
||||||
|
|
||||||
|
# check dtypes and shape for std
|
||||||
|
@pytest.mark.parametrize("numeric_only", [False, None])
|
||||||
|
def test_std_numeric_only(self, numeric_only):
|
||||||
|
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||||
|
if numeric_only is False:
|
||||||
|
calculated_values = ed_flights.std(numeric_only=numeric_only)
|
||||||
|
assert isinstance(calculated_values["AvgTicketPrice"], float)
|
||||||
|
assert isinstance(calculated_values["Cancelled"], float)
|
||||||
|
assert isinstance(calculated_values["dayOfWeek"], float)
|
||||||
|
assert pd.isnull(calculated_values["timestamp"])
|
||||||
|
assert np.isnan(calculated_values["DestCountry"])
|
||||||
|
assert calculated_values.shape == (5,)
|
||||||
|
elif numeric_only is None:
|
||||||
|
calculated_values = ed_flights.std(numeric_only=numeric_only)
|
||||||
|
assert isinstance(calculated_values["AvgTicketPrice"], float)
|
||||||
|
assert isinstance(calculated_values["Cancelled"], float)
|
||||||
|
assert isinstance(calculated_values["dayOfWeek"], float)
|
||||||
|
assert calculated_values.shape == (3,)
|
||||||
|
|
||||||
|
# check dtypes and shape for var
|
||||||
|
@pytest.mark.parametrize("numeric_only", [False, None])
|
||||||
|
def test_var_numeric_only(self, numeric_only):
|
||||||
|
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||||
|
if numeric_only is False:
|
||||||
|
calculated_values = ed_flights.var(numeric_only=numeric_only)
|
||||||
|
assert isinstance(calculated_values["AvgTicketPrice"], np.float64)
|
||||||
|
assert isinstance(calculated_values["dayOfWeek"], np.float64)
|
||||||
|
assert isinstance(calculated_values["Cancelled"], np.float64)
|
||||||
|
assert pd.isnull(calculated_values["timestamp"])
|
||||||
|
assert np.isnan(calculated_values["DestCountry"])
|
||||||
|
assert calculated_values.shape == (5,)
|
||||||
|
elif numeric_only is None:
|
||||||
|
calculated_values = ed_flights.var(numeric_only=numeric_only)
|
||||||
|
assert isinstance(calculated_values["AvgTicketPrice"], float)
|
||||||
|
assert isinstance(calculated_values["Cancelled"], float)
|
||||||
|
assert isinstance(calculated_values["dayOfWeek"], float)
|
||||||
|
assert calculated_values.shape == (3,)
|
||||||
|
|
||||||
|
# check dtypes and shape for mean
|
||||||
|
@pytest.mark.parametrize("numeric_only", [False, None])
|
||||||
|
def test_mean_numeric_only(self, numeric_only):
|
||||||
|
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||||
|
if numeric_only is False:
|
||||||
|
calculated_values = ed_flights.mean(numeric_only=numeric_only)
|
||||||
|
assert isinstance(calculated_values["AvgTicketPrice"], float)
|
||||||
|
assert isinstance(calculated_values["dayOfWeek"], float)
|
||||||
|
assert isinstance(calculated_values["Cancelled"], float)
|
||||||
|
assert isinstance(calculated_values["timestamp"], pd.Timestamp)
|
||||||
|
assert np.isnan(calculated_values["DestCountry"])
|
||||||
|
assert calculated_values.shape == (5,)
|
||||||
|
elif numeric_only is None:
|
||||||
|
calculated_values = ed_flights.mean(numeric_only=numeric_only)
|
||||||
|
assert isinstance(calculated_values["AvgTicketPrice"], float)
|
||||||
|
assert isinstance(calculated_values["Cancelled"], float)
|
||||||
|
assert isinstance(calculated_values["dayOfWeek"], float)
|
||||||
|
assert isinstance(calculated_values["timestamp"], pd.Timestamp)
|
||||||
|
assert calculated_values.shape == (4,)
|
||||||
|
|
||||||
|
# check dtypes and shape for mad
|
||||||
|
@pytest.mark.parametrize("numeric_only", [False, None])
|
||||||
|
def test_mad_numeric_only(self, numeric_only):
|
||||||
|
ed_flights = self.ed_flights().filter(self.filter_data)
|
||||||
|
if numeric_only is False:
|
||||||
|
calculated_values = ed_flights.mad(numeric_only=numeric_only)
|
||||||
|
assert isinstance(calculated_values["AvgTicketPrice"], float)
|
||||||
|
assert isinstance(calculated_values["Cancelled"], np.float64)
|
||||||
|
assert isinstance(calculated_values["dayOfWeek"], float)
|
||||||
|
assert pd.isnull(calculated_values["timestamp"])
|
||||||
|
assert np.isnan(calculated_values["DestCountry"])
|
||||||
|
assert calculated_values.shape == (5,)
|
||||||
|
elif numeric_only is None:
|
||||||
|
calculated_values = ed_flights.mad(numeric_only=numeric_only)
|
||||||
|
assert isinstance(calculated_values["AvgTicketPrice"], float)
|
||||||
|
assert isinstance(calculated_values["dayOfWeek"], float)
|
||||||
|
assert calculated_values.shape == (2,)
|
||||||
|
@ -72,7 +72,7 @@ class TestSeriesMetrics(TestData):
|
|||||||
if func == "nunique": # nunique never returns 'NaN'
|
if func == "nunique": # nunique never returns 'NaN'
|
||||||
continue
|
continue
|
||||||
|
|
||||||
ed_metric = getattr(ed_ecommerce, func)()
|
ed_metric = getattr(ed_ecommerce, func)(numeric_only=False)
|
||||||
print(func, ed_metric)
|
print(func, ed_metric)
|
||||||
assert np.isnan(ed_metric)
|
assert np.isnan(ed_metric)
|
||||||
|
|
||||||
@ -86,7 +86,9 @@ class TestSeriesMetrics(TestData):
|
|||||||
|
|
||||||
for func in self.all_funcs:
|
for func in self.all_funcs:
|
||||||
pd_metric = getattr(pd_ecommerce, func)()
|
pd_metric = getattr(pd_ecommerce, func)()
|
||||||
ed_metric = getattr(ed_ecommerce, func)()
|
ed_metric = getattr(ed_ecommerce, func)(
|
||||||
|
**({"numeric_only": True} if (func != "nunique") else {})
|
||||||
|
)
|
||||||
self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)
|
self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)
|
||||||
|
|
||||||
@pytest.mark.parametrize("agg", ["mean", "min", "max"])
|
@pytest.mark.parametrize("agg", ["mean", "min", "max"])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user