Fix Series.describe(), median agg dtype

2025-07-11 00:02:14 +08:00 · 2020-08-17 09:28:30 -05:00 · 2020-08-17 09:28:30 -05:00 · 5bf205a1e0
commit 5bf205a1e0
parent f5b37e643c
5 changed files with 103 additions and 6 deletions
--- a/docs/source/reference/supported_apis.rst
+++ b/docs/source/reference/supported_apis.rst
@ -714,7 +714,7 @@ script instead of being modified manually.
 +---------------------------------------+------------+
 | ``ed.Series.dropna()``                | No         |
 +---------------------------------------+------------+
-| ``ed.Series.dtype``                   | No         |
+| ``ed.Series.dtype``                   | **Yes**    |
 +---------------------------------------+------------+
 | ``ed.Series.dtypes``                  | **Yes**    |
 +---------------------------------------+------------+
--- a/eland/operations.py
+++ b/eland/operations.py
@ -301,7 +301,7 @@ class Operations:
                    )

                # These aggregations maintain the column datatype
-                elif pd_agg in ("max", "min"):
+                elif pd_agg in {"max", "min", "median"}:
                    agg_value = field.np_dtype.type(agg_value)

                values.append(agg_value)
--- a/eland/series.py
+++ b/eland/series.py
@ -425,7 +425,7 @@ class Series(NDFrame):
        return self._query_compiler.to_pandas(show_progress=show_progress)[self.name]

    @property
-    def _dtype(self) -> np.dtype:
+    def dtype(self) -> np.dtype:
        # NOTE(review): now public; was kept private (`_dtype`) because a public `dtype` broke the query eval implementation - confirm query eval still works
        return self._query_compiler.dtypes[0]

@ -1192,7 +1192,7 @@ class Series(NDFrame):
            self._query_compiler.check_arithmetics(right._query_compiler)

            right_object = ArithmeticSeries(
-                right._query_compiler, right.name, right._dtype
+                right._query_compiler, right.name, right.dtype
            )
            display_name = None
        elif np.issubdtype(np.dtype(type(right)), np.number):
@ -1204,11 +1204,11 @@ class Series(NDFrame):
        else:
            raise TypeError(
                f"unsupported operation type(s) [{method_name!r}] "
-                f"for operands ['{type(self)}' with dtype '{self._dtype}', "
+                f"for operands ['{type(self)}' with dtype '{self.dtype}', "
                f"'{type(right).__name__}']"
            )

-        left_object = ArithmeticSeries(self._query_compiler, self.name, self._dtype)
+        left_object = ArithmeticSeries(self._query_compiler, self.name, self.dtype)
        left_object.arithmetic_operation(method_name, right_object)

        series = Series(
@ -1430,6 +1430,41 @@ class Series(NDFrame):
        results = super().mad(numeric_only=numeric_only)
        return results.squeeze()

+    def describe(self) -> pd.Series:
+        """
+        Generate descriptive statistics that summarize the central tendency, dispersion and shape of a
+        dataset’s distribution, excluding NaN values.
+
+        Only numeric series are currently supported; unlike pandas.Series.describe, this method
+        accepts no additional arguments and always returns the default numeric summary.
+
+        TODO - add additional arguments (currently only numeric values are supported)
+
+        Returns
+        -------
+        pandas.Series:
+            Summary information
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.Series.describe`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df.AvgTicketPrice.describe() # ignoring percentiles as they don't generate consistent results
+        count    13059.000000
+        mean       628.253689
+        std        266.386661
+        min        100.020531
+        ...
+        ...
+        ...
+        max       1199.729004
+        Name: AvgTicketPrice, dtype: float64
+        """
+        return super().describe().squeeze()
+
    # def values TODO - not implemented as causes current implementation of query to fail

    def to_numpy(self):
--- a/eland/tests/dataframe/test_metrics_pytest.py
+++ b/eland/tests/dataframe/test_metrics_pytest.py
@ -228,3 +228,25 @@ class TestDataFrameMetrics(TestData):
            <= median
            <= pd.to_datetime("2018-01-01 12:00:00.000")
        )
+
+    def test_metric_agg_keep_dtypes(self):
+        # max, min, and median maintain their dtypes
+        df = self.ed_flights_small()[["AvgTicketPrice", "Cancelled", "dayOfWeek"]]
+        assert df.min().tolist() == [131.81910705566406, False, 0]
+        assert df.max().tolist() == [989.9527587890625, True, 0]
+        assert df.median().tolist() == [550.276123046875, False, 0]
+        all_agg = df.agg(["min", "max", "median"])
+        assert all_agg.dtypes.tolist() == [
+            np.dtype("float64"),
+            np.dtype("bool"),
+            np.dtype("int64"),
+        ]
+        assert all_agg.to_dict() == {
+            "AvgTicketPrice": {
+                "max": 989.9527587890625,
+                "median": 550.276123046875,
+                "min": 131.81910705566406,
+            },
+            "Cancelled": {"max": True, "median": False, "min": False},
+            "dayOfWeek": {"max": 0, "median": 0, "min": 0},
+        }
--- a/eland/tests/series/test_describe_pytest.py
+++ b/eland/tests/series/test_describe_pytest.py
@ -0,0 +1,40 @@
+#  Licensed to Elasticsearch B.V. under one or more contributor
+#  license agreements. See the NOTICE file distributed with
+#  this work for additional information regarding copyright
+#  ownership. Elasticsearch B.V. licenses this file to you under
+#  the Apache License, Version 2.0 (the "License"); you may
+#  not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+# 	http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+import pandas as pd
+from eland.tests.common import TestData, assert_series_equal
+
+
+class TestSeriesDescribe(TestData):
+    def test_series_describe(self):
+        ed_df = self.ed_flights_small()
+        pd_df = self.pd_flights_small()
+
+        ed_desc = ed_df.AvgTicketPrice.describe()
+        pd_desc = pd_df.AvgTicketPrice.describe()
+
+        assert isinstance(ed_desc, pd.Series)
+        assert ed_desc.shape == pd_desc.shape
+        assert ed_desc.dtype == pd_desc.dtype
+        assert ed_desc.index.equals(pd_desc.index)
+
+        # Percentile calculations vary in Elasticsearch, so only the other statistics are compared
+        assert_series_equal(
+            ed_desc[["count", "mean", "std", "min", "max"]],
+            pd_desc[["count", "mean", "std", "min", "max"]],
+            rtol=0.2,
+        )