Fix Series.median(), support median() for datetimes

This commit is contained in:
Seth Michael Larson 2020-08-12 15:28:47 -05:00 committed by Seth Michael Larson
parent a709ed589d
commit 535ed9b334
4 changed files with 73 additions and 11 deletions

View File

@ -91,15 +91,19 @@ class Field(NamedTuple):
return np.dtype(self.pd_dtype) return np.dtype(self.pd_dtype)
def is_es_agg_compatible(self, es_agg) -> bool: def is_es_agg_compatible(self, es_agg) -> bool:
# Unpack the actual aggregation if this is 'extended_stats' # Unpack the actual aggregation if this is 'extended_stats/percentiles'
if isinstance(es_agg, tuple): if isinstance(es_agg, tuple):
if es_agg[0] == "extended_stats":
es_agg = es_agg[1] es_agg = es_agg[1]
elif es_agg[0] == "percentiles":
es_agg = "percentiles"
# Cardinality works for all types # Cardinality works for all types
# Numerics and bools work for all aggs # Numerics and bools work for all aggs
if es_agg == "cardinality" or self.is_numeric or self.is_bool: if es_agg == "cardinality" or self.is_numeric or self.is_bool:
return True return True
# Timestamps also work for 'min', 'max' and 'avg' # Timestamps also work for 'min', 'max' and 'avg'
if es_agg in {"min", "max", "avg"} and self.is_timestamp: if es_agg in {"min", "max", "avg", "percentiles"} and self.is_timestamp:
return True return True
return False return False

View File

@ -1255,7 +1255,7 @@ class Series(NDFrame):
Returns Returns
------- -------
float float
max value mean value
See Also See Also
-------- --------
@ -1270,6 +1270,30 @@ class Series(NDFrame):
results = super().mean(numeric_only=numeric_only) results = super().mean(numeric_only=numeric_only)
return results.squeeze() return results.squeeze()
def median(self, numeric_only=None):
    """
    Return the median of the Series values

    The aggregation itself is delegated to the parent class; its result
    is then passed through ``squeeze()`` before being returned.

    TODO - implement remainder of pandas arguments, currently non-numerics are not supported

    NOTE(review): the datetime median tests added alongside this method
    expect a ``pd.Timestamp`` for a timestamp field, so the "float" return
    type and the "non-numerics" remark above may be out of date - confirm.

    Parameters
    ----------
    numeric_only
        Forwarded unchanged to the parent class ``median``.

    Returns
    -------
    float
        median value

    See Also
    --------
    :pandas_api_docs:`pandas.Series.median`

    Examples
    --------
    >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
    >>> int(s.median())
    640
    """
    return super().median(numeric_only=numeric_only).squeeze()
def min(self, numeric_only=None): def min(self, numeric_only=None):
""" """
Return the minimum of the Series values Return the minimum of the Series values
@ -1279,7 +1303,7 @@ class Series(NDFrame):
Returns Returns
------- -------
float float
max value min value
See Also See Also
-------- --------
@ -1303,7 +1327,7 @@ class Series(NDFrame):
Returns Returns
------- -------
float float
max value sum of all values
See Also See Also
-------- --------

View File

@ -167,19 +167,20 @@ class TestDataFrameMetrics(TestData):
"min": pd.Timestamp("2018-01-01 00:00:00"), "min": pd.Timestamp("2018-01-01 00:00:00"),
"mean": pd.Timestamp("2018-01-21 19:20:45.564438232"), "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
"max": pd.Timestamp("2018-02-11 23:50:12"), "max": pd.Timestamp("2018-02-11 23:50:12"),
"nunique": 12236,
"mad": pd.NaT, "mad": pd.NaT,
"median": pd.NaT,
"std": pd.NaT, "std": pd.NaT,
"sum": pd.NaT, "sum": pd.NaT,
"var": pd.NaT, "var": pd.NaT,
"nunique": 12236,
} }
} }
ed_metrics = ed_timestamps.agg(self.funcs + self.extended_funcs + ["nunique"]) ed_metrics = ed_timestamps.agg(self.funcs + self.extended_funcs + ["nunique"])
assert ed_metrics.to_dict() == expected_values ed_metrics_dict = ed_metrics.to_dict()
ed_metrics_dict["timestamp"].pop("median") # Median is tested below.
assert ed_metrics_dict == expected_values
@pytest.mark.parametrize("agg", ["mean", "min", "max"]) @pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"])
def test_flights_datetime_metrics_single_agg(self, agg): def test_flights_datetime_metrics_single_agg(self, agg):
ed_timestamps = self.ed_flights()[["timestamp"]] ed_timestamps = self.ed_flights()[["timestamp"]]
expected_values = { expected_values = {
@ -190,6 +191,9 @@ class TestDataFrameMetrics(TestData):
} }
ed_metric = ed_timestamps.agg([agg]) ed_metric = ed_timestamps.agg([agg])
if agg == "nunique":
assert ed_metric.dtypes["timestamp"] == np.int64
else:
assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]") assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]")
assert ed_metric["timestamp"][0] == expected_values[agg] assert ed_metric["timestamp"][0] == expected_values[agg]
@ -205,3 +209,22 @@ class TestDataFrameMetrics(TestData):
assert ed_metric.dtype == np.dtype("datetime64[ns]") assert ed_metric.dtype == np.dtype("datetime64[ns]")
assert ed_metric[0] == expected_values[agg] assert ed_metric[0] == expected_values[agg]
def test_flights_datetime_metrics_median(self):
    """
    Check the median of the 'timestamp' field on the small flights frame.

    Two entry points are exercised: ``DataFrame.median(numeric_only=False)``
    and ``DataFrame.agg(["median"])``. Only a window is asserted rather than
    an exact value, and each result must be a ``pd.Timestamp``.
    """
    ed_df = self.ed_flights_small()[["timestamp"]]

    # Shared acceptance window for both code paths.
    earliest = pd.to_datetime("2018-01-01 10:00:00.000")
    latest = pd.to_datetime("2018-01-01 12:00:00.000")

    # Path 1: direct median() call.
    median = ed_df.median(numeric_only=False)[0]
    assert isinstance(median, pd.Timestamp)
    assert earliest <= median <= latest

    # Path 2: median requested through agg(). This previously asked for
    # "mean", so the agg() median path was never actually tested.
    median = ed_df.agg(["median"])["timestamp"][0]
    assert isinstance(median, pd.Timestamp)
    assert earliest <= median <= latest

View File

@ -100,3 +100,14 @@ class TestSeriesMetrics(TestData):
ed_metric = getattr(ed_timestamps, agg)() ed_metric = getattr(ed_timestamps, agg)()
assert ed_metric == expected_values[agg] assert ed_metric == expected_values[agg]
def test_flights_datetime_median_metric(self):
    """
    Series.median() on the small flights 'timestamp' field must return a
    ``pd.Timestamp`` that falls inside the asserted two-hour window.
    """
    window_start = pd.to_datetime("2018-01-01 10:00:00.000")
    window_end = pd.to_datetime("2018-01-01 12:00:00.000")

    result = self.ed_flights_small()["timestamp"].median()

    assert isinstance(result, pd.Timestamp)
    assert window_start <= result <= window_end