Fix Series.describe(), median agg dtype

2025-07-11 00:02:14 +08:00 · 2020-08-17 09:28:30 -05:00 · 2020-08-17 09:28:30 -05:00 · 5bf205a1e0
commit 5bf205a1e0
parent f5b37e643c
5 changed files with 103 additions and 6 deletions
--- a/docs/source/reference/supported_apis.rst
+++ b/docs/source/reference/supported_apis.rst
@ -714,7 +714,7 @@ script instead of being modified manually.
 +---------------------------------------+------------+
 | ``ed.Series.dropna()``                | No         |
 +---------------------------------------+------------+
-| ``ed.Series.dtype``                   | No         |
+| ``ed.Series.dtype``                   | **Yes**    |
 +---------------------------------------+------------+
 | ``ed.Series.dtypes``                  | **Yes**    |
 +---------------------------------------+------------+
--- a/eland/operations.py
+++ b/eland/operations.py
@ -301,7 +301,7 @@ class Operations:
                    )
                # These aggregations maintain the column datatype
-                elif pd_agg in ("max", "min"):
+                elif pd_agg in {"max", "min", "median"}:
                    agg_value = field.np_dtype.type(agg_value)
                values.append(agg_value)
--- a/eland/series.py
+++ b/eland/series.py
@ -425,7 +425,7 @@ class Series(NDFrame):
        return self._query_compiler.to_pandas(show_progress=show_progress)[self.name]
    @property
-    def _dtype(self) -> np.dtype:
+    def dtype(self) -> np.dtype:
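        # Public accessor (same name as pandas.Series.dtype): returns the first entry of the query compiler's dtypes.
        # NOTE(review): this property used to be private because a public `dtype` broke the query eval implementation - confirm that no longer applies.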
        return self._query_compiler.dtypes[0]
@ -1192,7 +1192,7 @@ class Series(NDFrame):
            self._query_compiler.check_arithmetics(right._query_compiler)
            right_object = ArithmeticSeries(
-                right._query_compiler, right.name, right._dtype
+                right._query_compiler, right.name, right.dtype
            )
            display_name = None
        elif np.issubdtype(np.dtype(type(right)), np.number):
@ -1204,11 +1204,11 @@ class Series(NDFrame):
        else:
            raise TypeError(
                f"unsupported operation type(s) [{method_name!r}] "
-                f"for operands ['{type(self)}' with dtype '{self._dtype}', "
+                f"for operands ['{type(self)}' with dtype '{self.dtype}', "
                f"'{type(right).__name__}']"
            )
-        left_object = ArithmeticSeries(self._query_compiler, self.name, self._dtype)
+        left_object = ArithmeticSeries(self._query_compiler, self.name, self.dtype)
        left_object.arithmetic_operation(method_name, right_object)
        series = Series(
@ -1430,6 +1430,41 @@ class Series(NDFrame):
        results = super().mad(numeric_only=numeric_only)
        return results.squeeze()
    def describe(self) -> pd.Series:
        """
        Summarize this Series with descriptive statistics.

        The computation is delegated to the parent class's ``describe()``;
        its result is squeezed before being returned.

        TODO - add additional arguments (currently only numeric values are supported)

        Returns
        -------
        pandas.Series
            Summary information

        See Also
        --------
        :pandas_api_docs:`pandas.Series.describe`

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights')
        >>> df.AvgTicketPrice.describe() # ignoring percentiles as they don't generate consistent results
        count    13059.000000
        mean       628.253689
        std        266.386661
        min        100.020531
        ...
        ...
        ...
        max       1199.729004
        Name: AvgTicketPrice, dtype: float64
        """
        # The parent implementation produces the summary; squeeze() reduces it
        # to the object declared in the return annotation.
        summary = super().describe()
        return summary.squeeze()
    # def values TODO - not implemented as causes current implementation of query to fail
    def to_numpy(self):
--- a/eland/tests/dataframe/test_metrics_pytest.py
+++ b/eland/tests/dataframe/test_metrics_pytest.py
@ -228,3 +228,25 @@ class TestDataFrameMetrics(TestData):
            <= median
            <= pd.to_datetime("2018-01-01 12:00:00.000")
        )
    def test_metric_agg_keep_dtypes(self):
        """Check that min, max and median keep each column's dtype."""
        column_names = ("AvgTicketPrice", "Cancelled", "dayOfWeek")
        flights = self.ed_flights_small()[list(column_names)]

        # Expected per-column results, listed in the same order as column_names.
        expected_min = [131.81910705566406, False, 0]
        expected_max = [989.9527587890625, True, 0]
        expected_median = [550.276123046875, False, 0]

        # Each metric requested on its own.
        assert flights.min().tolist() == expected_min
        assert flights.max().tolist() == expected_max
        assert flights.median().tolist() == expected_median

        # All three metrics requested together: one column per field, one row
        # per metric.
        combined = flights.agg(["min", "max", "median"])

        expected_dtypes = [np.dtype(name) for name in ("float64", "bool", "int64")]
        assert combined.dtypes.tolist() == expected_dtypes

        expected_table = {
            column: {"max": highest, "median": middle, "min": lowest}
            for column, lowest, highest, middle in zip(
                column_names, expected_min, expected_max, expected_median
            )
        }
        assert combined.to_dict() == expected_table
--- a/eland/tests/series/test_describe_pytest.py
+++ b/eland/tests/series/test_describe_pytest.py
@ -0,0 +1,40 @@
 #  Licensed to Elasticsearch B.V. under one or more contributor
 #  license agreements. See the NOTICE file distributed with
 #  this work for additional information regarding copyright
 #  ownership. Elasticsearch B.V. licenses this file to you under
 #  the Apache License, Version 2.0 (the "License"); you may
 #  not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 # 	http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
 import pandas as pd
 from eland.tests.common import TestData, assert_series_equal
 class TestSeriesDescribe(TestData):
    """Compares ``describe()`` on the eland flights Series with pandas."""

    def test_series_describe(self):
        """The eland summary must agree with pandas in type, shape, dtype and index.

        Values are compared only for the non-percentile rows, with a relative
        tolerance.
        """
        eland_flights = self.ed_flights_small()
        pandas_flights = self.pd_flights_small()

        actual = eland_flights.AvgTicketPrice.describe()
        expected = pandas_flights.AvgTicketPrice.describe()

        # Structural checks: same container type, size, dtype and row labels.
        assert isinstance(actual, pd.Series)
        assert actual.shape == expected.shape
        assert actual.dtype == expected.dtype
        assert actual.index.equals(expected.index)

        # Percentile calculations vary for Elasticsearch, so only the
        # remaining rows are compared by value.
        compared_rows = ["count", "mean", "std", "min", "max"]
        assert_series_equal(
            actual[compared_rows],
            expected[compared_rows],
            rtol=0.2,
        )