Optimize df.describe() to use aggregations instead of own query

2025-07-24 00:00:39 +08:00 · 2021-06-22 21:59:54 +05:30 · 2021-06-22 21:59:54 +05:30 · ac2efb5863
commit ac2efb5863
parent 5fe32a24df
4 changed files with 24 additions and 47 deletions
--- a/eland/ndframe.py
+++ b/eland/ndframe.py
@@ -628,8 +628,8 @@ class NDFrame(ABC):
        Examples
        --------
-        >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin'])
+        >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin']) # ignoring percentiles
-        >>> df.describe() # ignoring percentiles as they don't generate consistent results
+        >>> df.describe() # doctest: +SKIP
               AvgTicketPrice  FlightDelayMin
        count    13059.000000    13059.000000
        mean       628.253689       47.335171
--- a/eland/operations.py
+++ b/eland/operations.py
@@ -1081,52 +1081,26 @@ class Operations:
                f"Can not count field matches if size is set {size}"
            )
-        numeric_source_fields = query_compiler._mappings.numeric_source_fields()
+        df1 = self.aggs(
-
+            query_compiler=query_compiler,
-        # for each field we compute:
+            pd_aggs=["count", "mean", "std", "min", "max"],
-        # count, mean, std, min, 25%, 50%, 75%, max
+            numeric_only=True,
-        body = Query(query_params.query)
+        )
-
+        df2 = self.quantile(
-        for field in numeric_source_fields:
+            query_compiler=query_compiler,
-            body.metric_aggs("extended_stats_" + field, "extended_stats", field)
+            pd_aggs=["quantile"],
-            body.metric_aggs("percentiles_" + field, "percentiles", field)
+            quantiles=[0.25, 0.5, 0.75],
-
+            is_dataframe=True,
-        response = query_compiler._client.search(
+            numeric_only=True,
            index=query_compiler._index_pattern, size=0, body=body.to_search_body()
        )
-        results = {}
+        # Convert [.25,.5,.75] to ["25%", "50%", "75%"]
        df2 = df2.set_index([["25%", "50%", "75%"]])
-        for field in numeric_source_fields:
+        return pd.concat([df1, df2]).reindex(
-            values = list()
+            ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
            values.append(response["aggregations"]["extended_stats_" + field]["count"])
            values.append(response["aggregations"]["extended_stats_" + field]["avg"])
            values.append(
                response["aggregations"]["extended_stats_" + field]["std_deviation"]
            )
            values.append(response["aggregations"]["extended_stats_" + field]["min"])
            values.append(
                response["aggregations"]["percentiles_" + field]["values"]["25.0"]
            )
            values.append(
                response["aggregations"]["percentiles_" + field]["values"]["50.0"]
            )
            values.append(
                response["aggregations"]["percentiles_" + field]["values"]["75.0"]
            )
            values.append(response["aggregations"]["extended_stats_" + field]["max"])
            # if not None
            if values.count(None) < len(values):
                results[field] = values
        df = pd.DataFrame(
            data=results,
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        )
        return df
    def to_pandas(self, query_compiler, show_progress=False):
        class PandasDataFrameCollector:
            def __init__(self, show_progress):
--- a/eland/series.py
+++ b/eland/series.py
@@ -1269,7 +1269,7 @@ class Series(NDFrame):
        3    2
        4    2
        Name: total_quantity, dtype: int64
-        >>> np.int(2) ** df.total_quantity
+        >>> np.int_(2) ** df.total_quantity
        0    4.0
        1    4.0
        2    4.0
@@ -1627,8 +1627,8 @@ class Series(NDFrame):
        Examples
        --------
-        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df = ed.DataFrame('localhost', 'flights') # ignoring percentiles as they don't generate consistent results
-        >>> df.AvgTicketPrice.describe() # ignoring percentiles as they don't generate consistent results
+        >>> df.AvgTicketPrice.describe()  # doctest: +SKIP
        count    13059.000000
        mean       628.253689
        std        266.386661
--- a/tests/dataframe/test_describe_pytest.py
+++ b/tests/dataframe/test_describe_pytest.py
@@ -28,7 +28,10 @@ class TestDataFrameDescribe(TestData):
        ed_flights = self.ed_flights()
        pd_describe = pd_flights.describe()
-        ed_describe = ed_flights.describe()
+        # We remove bool columns to match pandas output
        ed_describe = ed_flights.describe().drop(
            ["Cancelled", "FlightDelay"], axis="columns"
        )
        assert_frame_equal(
            pd_describe.drop(["25%", "50%", "75%"], axis="index"),