diff --git a/eland/field_mappings.py b/eland/field_mappings.py index 18c6484..67b9855 100644 --- a/eland/field_mappings.py +++ b/eland/field_mappings.py @@ -91,15 +91,19 @@ class Field(NamedTuple): return np.dtype(self.pd_dtype) def is_es_agg_compatible(self, es_agg) -> bool: - # Unpack the actual aggregation if this is 'extended_stats' + # Unpack the actual aggregation if this is 'extended_stats/percentiles' if isinstance(es_agg, tuple): - es_agg = es_agg[1] + if es_agg[0] == "extended_stats": + es_agg = es_agg[1] + elif es_agg[0] == "percentiles": + es_agg = "percentiles" + # Cardinality works for all types # Numerics and bools work for all aggs if es_agg == "cardinality" or self.is_numeric or self.is_bool: return True # Timestamps also work for 'min', 'max' and 'avg' - if es_agg in {"min", "max", "avg"} and self.is_timestamp: + if es_agg in {"min", "max", "avg", "percentiles"} and self.is_timestamp: return True return False diff --git a/eland/series.py b/eland/series.py index 011bd0c..9623a54 100644 --- a/eland/series.py +++ b/eland/series.py @@ -1255,7 +1255,7 @@ class Series(NDFrame): Returns ------- float - max value + mean value See Also -------- @@ -1270,6 +1270,30 @@ class Series(NDFrame): results = super().mean(numeric_only=numeric_only) return results.squeeze() + def median(self, numeric_only=None): + """ + Return the median of the Series values + + TODO - implement remainder of pandas arguments, currently non-numerics are not supported + + Returns + ------- + float + median value + + See Also + -------- + :pandas_api_docs:`pandas.Series.median` + + Examples + -------- + >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice'] + >>> int(s.median()) + 640 + """ + results = super().median(numeric_only=numeric_only) + return results.squeeze() + def min(self, numeric_only=None): """ Return the minimum of the Series values @@ -1279,7 +1303,7 @@ class Series(NDFrame): Returns ------- float - max value + min value See Also -------- @@ -1303,7 +1327,7 @@ 
class Series(NDFrame): Returns ------- float - max value + sum of all values See Also -------- diff --git a/eland/tests/dataframe/test_metrics_pytest.py b/eland/tests/dataframe/test_metrics_pytest.py index 702e9f4..4a1c826 100644 --- a/eland/tests/dataframe/test_metrics_pytest.py +++ b/eland/tests/dataframe/test_metrics_pytest.py @@ -167,19 +167,20 @@ class TestDataFrameMetrics(TestData): "min": pd.Timestamp("2018-01-01 00:00:00"), "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"), "max": pd.Timestamp("2018-02-11 23:50:12"), + "nunique": 12236, "mad": pd.NaT, - "median": pd.NaT, "std": pd.NaT, "sum": pd.NaT, "var": pd.NaT, - "nunique": 12236, } } ed_metrics = ed_timestamps.agg(self.funcs + self.extended_funcs + ["nunique"]) - assert ed_metrics.to_dict() == expected_values + ed_metrics_dict = ed_metrics.to_dict() + ed_metrics_dict["timestamp"].pop("median") # Median is tested below. + assert ed_metrics_dict == expected_values - @pytest.mark.parametrize("agg", ["mean", "min", "max"]) + @pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"]) def test_flights_datetime_metrics_single_agg(self, agg): ed_timestamps = self.ed_flights()[["timestamp"]] expected_values = { @@ -190,7 +191,10 @@ class TestDataFrameMetrics(TestData): } ed_metric = ed_timestamps.agg([agg]) - assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]") + if agg == "nunique": + assert ed_metric.dtypes["timestamp"] == np.int64 + else: + assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]") assert ed_metric["timestamp"][0] == expected_values[agg] @pytest.mark.parametrize("agg", ["mean", "min", "max"]) @@ -205,3 +209,22 @@ class TestDataFrameMetrics(TestData): assert ed_metric.dtype == np.dtype("datetime64[ns]") assert ed_metric[0] == expected_values[agg] + + def test_flights_datetime_metrics_median(self): + ed_df = self.ed_flights_small()[["timestamp"]] + + median = ed_df.median(numeric_only=False)[0] + assert isinstance(median, pd.Timestamp) + assert ( + 
pd.to_datetime("2018-01-01 10:00:00.000") + <= median + <= pd.to_datetime("2018-01-01 12:00:00.000") + ) + + median = ed_df.agg(["median"])["timestamp"][0] + assert isinstance(median, pd.Timestamp) + assert ( + pd.to_datetime("2018-01-01 10:00:00.000") + <= median + <= pd.to_datetime("2018-01-01 12:00:00.000") + ) diff --git a/eland/tests/series/test_metrics_pytest.py b/eland/tests/series/test_metrics_pytest.py index d24a796..86a9557 100644 --- a/eland/tests/series/test_metrics_pytest.py +++ b/eland/tests/series/test_metrics_pytest.py @@ -100,3 +100,14 @@ class TestSeriesMetrics(TestData): ed_metric = getattr(ed_timestamps, agg)() assert ed_metric == expected_values[agg] + + def test_flights_datetime_median_metric(self): + ed_series = self.ed_flights_small()["timestamp"] + + median = ed_series.median() + assert isinstance(median, pd.Timestamp) + assert ( + pd.to_datetime("2018-01-01 10:00:00.000") + <= median + <= pd.to_datetime("2018-01-01 12:00:00.000") + )