mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Fix Series.median(), support median() for datetimes
This commit is contained in:
parent
a709ed589d
commit
535ed9b334
@ -91,15 +91,19 @@ class Field(NamedTuple):
|
||||
return np.dtype(self.pd_dtype)
|
||||
|
||||
def is_es_agg_compatible(self, es_agg) -> bool:
    """
    Return whether this field can be used with the given aggregation

    Parameters
    ----------
    es_agg: str or tuple
        Either the name of the aggregation, or a tuple whose first item is
        "extended_stats" (the second item is then the actual aggregation)
        or "percentiles".

    Returns
    -------
    bool
        True if the aggregation is accepted for this field, False otherwise
    """
    # Unpack the actual aggregation if this is 'extended_stats/percentiles'
    if isinstance(es_agg, tuple):
        if es_agg[0] == "extended_stats":
            es_agg = es_agg[1]
        elif es_agg[0] == "percentiles":
            es_agg = "percentiles"

    # Cardinality works for all types
    # Numerics and bools work for all aggs
    if es_agg == "cardinality" or self.is_numeric or self.is_bool:
        return True
    # Timestamps also work for 'min', 'max', 'avg' and 'percentiles'
    if es_agg in {"min", "max", "avg", "percentiles"} and self.is_timestamp:
        return True
    return False
|
||||
|
||||
|
@ -1255,7 +1255,7 @@ class Series(NDFrame):
|
||||
Returns
|
||||
-------
|
||||
float
|
||||
max value
|
||||
mean value
|
||||
|
||||
See Also
|
||||
--------
|
||||
@ -1270,6 +1270,30 @@ class Series(NDFrame):
|
||||
results = super().mean(numeric_only=numeric_only)
|
||||
return results.squeeze()
|
||||
|
||||
def median(self, numeric_only=None):
    """
    Return the median of the Series values

    TODO - implement remainder of pandas arguments, currently non-numerics
    other than datetimes are not supported

    Returns
    -------
    float or pandas.Timestamp
        median value (a timestamp when the Series holds datetimes)

    See Also
    --------
    :pandas_api_docs:`pandas.Series.median`

    Examples
    --------
    >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
    >>> int(s.median())
    640
    """
    results = super().median(numeric_only=numeric_only)
    # Collapse the single-entry result of the parent call to a scalar
    return results.squeeze()
|
||||
|
||||
def min(self, numeric_only=None):
|
||||
"""
|
||||
Return the minimum of the Series values
|
||||
@ -1279,7 +1303,7 @@ class Series(NDFrame):
|
||||
Returns
|
||||
-------
|
||||
float
|
||||
max value
|
||||
min value
|
||||
|
||||
See Also
|
||||
--------
|
||||
@ -1303,7 +1327,7 @@ class Series(NDFrame):
|
||||
Returns
|
||||
-------
|
||||
float
|
||||
max value
|
||||
sum of all values
|
||||
|
||||
See Also
|
||||
--------
|
||||
|
@ -167,19 +167,20 @@ class TestDataFrameMetrics(TestData):
|
||||
"min": pd.Timestamp("2018-01-01 00:00:00"),
|
||||
"mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
|
||||
"max": pd.Timestamp("2018-02-11 23:50:12"),
|
||||
"nunique": 12236,
|
||||
"mad": pd.NaT,
|
||||
"median": pd.NaT,
|
||||
"std": pd.NaT,
|
||||
"sum": pd.NaT,
|
||||
"var": pd.NaT,
|
||||
"nunique": 12236,
|
||||
}
|
||||
}
|
||||
|
||||
ed_metrics = ed_timestamps.agg(self.funcs + self.extended_funcs + ["nunique"])
|
||||
assert ed_metrics.to_dict() == expected_values
|
||||
ed_metrics_dict = ed_metrics.to_dict()
|
||||
ed_metrics_dict["timestamp"].pop("median") # Median is tested below.
|
||||
assert ed_metrics_dict == expected_values
|
||||
|
||||
@pytest.mark.parametrize("agg", ["mean", "min", "max"])
|
||||
@pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"])
|
||||
def test_flights_datetime_metrics_single_agg(self, agg):
|
||||
ed_timestamps = self.ed_flights()[["timestamp"]]
|
||||
expected_values = {
|
||||
@ -190,6 +191,9 @@ class TestDataFrameMetrics(TestData):
|
||||
}
|
||||
ed_metric = ed_timestamps.agg([agg])
|
||||
|
||||
if agg == "nunique":
|
||||
assert ed_metric.dtypes["timestamp"] == np.int64
|
||||
else:
|
||||
assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]")
|
||||
assert ed_metric["timestamp"][0] == expected_values[agg]
|
||||
|
||||
@ -205,3 +209,22 @@ class TestDataFrameMetrics(TestData):
|
||||
|
||||
assert ed_metric.dtype == np.dtype("datetime64[ns]")
|
||||
assert ed_metric[0] == expected_values[agg]
|
||||
|
||||
def test_flights_datetime_metrics_median(self):
    """
    Median of a datetime column must be a Timestamp inside the expected
    window, both through DataFrame.median() and through DataFrame.agg().
    Only a range is asserted rather than an exact value.
    """
    ed_df = self.ed_flights_small()[["timestamp"]]

    median = ed_df.median(numeric_only=False)[0]
    assert isinstance(median, pd.Timestamp)
    assert (
        pd.to_datetime("2018-01-01 10:00:00.000")
        <= median
        <= pd.to_datetime("2018-01-01 12:00:00.000")
    )

    # Request "median" (not "mean") so the agg() path is the one under test
    median = ed_df.agg(["median"])["timestamp"][0]
    assert isinstance(median, pd.Timestamp)
    assert (
        pd.to_datetime("2018-01-01 10:00:00.000")
        <= median
        <= pd.to_datetime("2018-01-01 12:00:00.000")
    )
|
||||
|
@ -100,3 +100,14 @@ class TestSeriesMetrics(TestData):
|
||||
ed_metric = getattr(ed_timestamps, agg)()
|
||||
|
||||
assert ed_metric == expected_values[agg]
|
||||
|
||||
def test_flights_datetime_median_metric(self):
    """
    Series.median() on the 'timestamp' column must yield a Timestamp
    that falls inside the expected two-hour window.
    """
    timestamps = self.ed_flights_small()["timestamp"]

    result = timestamps.median()
    assert isinstance(result, pd.Timestamp)

    # Only a window is checked, not an exact value
    window_start = pd.to_datetime("2018-01-01 10:00:00.000")
    window_end = pd.to_datetime("2018-01-01 12:00:00.000")
    assert window_start <= result <= window_end
|
||||
|
Loading…
x
Reference in New Issue
Block a user