Optimize df.describe() to use aggregations instead of its own query

2025-07-24 00:00:39 +08:00 · 2021-06-22 21:59:54 +05:30 · 2021-06-22 21:59:54 +05:30 · ac2efb5863
commit ac2efb5863
parent 5fe32a24df
4 changed files with 24 additions and 47 deletions
--- a/eland/ndframe.py
+++ b/eland/ndframe.py
@@ -628,8 +628,8 @@ class NDFrame(ABC):

        Examples
        --------
-        >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin'])
-        >>> df.describe() # ignoring percentiles as they don't generate consistent results
+        >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin']) # ignoring percentiles
+        >>> df.describe() # doctest: +SKIP
               AvgTicketPrice  FlightDelayMin
        count    13059.000000    13059.000000
        mean       628.253689       47.335171
--- a/eland/operations.py
+++ b/eland/operations.py
@@ -1081,51 +1081,25 @@ class Operations:
                f"Can not count field matches if size is set {size}"
            )

-        numeric_source_fields = query_compiler._mappings.numeric_source_fields()
-
-        # for each field we compute:
-        # count, mean, std, min, 25%, 50%, 75%, max
-        body = Query(query_params.query)
-
-        for field in numeric_source_fields:
-            body.metric_aggs("extended_stats_" + field, "extended_stats", field)
-            body.metric_aggs("percentiles_" + field, "percentiles", field)
-
-        response = query_compiler._client.search(
-            index=query_compiler._index_pattern, size=0, body=body.to_search_body()
+        df1 = self.aggs(
+            query_compiler=query_compiler,
+            pd_aggs=["count", "mean", "std", "min", "max"],
+            numeric_only=True,
+        )
+        df2 = self.quantile(
+            query_compiler=query_compiler,
+            pd_aggs=["quantile"],
+            quantiles=[0.25, 0.5, 0.75],
+            is_dataframe=True,
+            numeric_only=True,
        )

-        results = {}
+        # Convert [.25,.5,.75] to ["25%", "50%", "75%"]
+        df2 = df2.set_index([["25%", "50%", "75%"]])

-        for field in numeric_source_fields:
-            values = list()
-            values.append(response["aggregations"]["extended_stats_" + field]["count"])
-            values.append(response["aggregations"]["extended_stats_" + field]["avg"])
-            values.append(
-                response["aggregations"]["extended_stats_" + field]["std_deviation"]
+        return pd.concat([df1, df2]).reindex(
+            ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
        )
-            values.append(response["aggregations"]["extended_stats_" + field]["min"])
-            values.append(
-                response["aggregations"]["percentiles_" + field]["values"]["25.0"]
-            )
-            values.append(
-                response["aggregations"]["percentiles_" + field]["values"]["50.0"]
-            )
-            values.append(
-                response["aggregations"]["percentiles_" + field]["values"]["75.0"]
-            )
-            values.append(response["aggregations"]["extended_stats_" + field]["max"])
-
-            # if not None
-            if values.count(None) < len(values):
-                results[field] = values
-
-        df = pd.DataFrame(
-            data=results,
-            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
-        )
-
-        return df

    def to_pandas(self, query_compiler, show_progress=False):
        class PandasDataFrameCollector:
--- a/eland/series.py
+++ b/eland/series.py
@@ -1269,7 +1269,7 @@ class Series(NDFrame):
        3    2
        4    2
        Name: total_quantity, dtype: int64
-        >>> np.int(2) ** df.total_quantity
+        >>> np.int_(2) ** df.total_quantity
        0    4.0
        1    4.0
        2    4.0
@@ -1627,8 +1627,8 @@ class Series(NDFrame):

        Examples
        --------
-        >>> df = ed.DataFrame('localhost', 'flights')
-        >>> df.AvgTicketPrice.describe() # ignoring percentiles as they don't generate consistent results
+        >>> df = ed.DataFrame('localhost', 'flights') # ignoring percentiles as they don't generate consistent results
+        >>> df.AvgTicketPrice.describe()  # doctest: +SKIP
        count    13059.000000
        mean       628.253689
        std        266.386661
--- a/tests/dataframe/test_describe_pytest.py
+++ b/tests/dataframe/test_describe_pytest.py
@@ -28,7 +28,10 @@ class TestDataFrameDescribe(TestData):
        ed_flights = self.ed_flights()

        pd_describe = pd_flights.describe()
-        ed_describe = ed_flights.describe()
+        # We remove bool columns to match pandas output
+        ed_describe = ed_flights.describe().drop(
+            ["Cancelled", "FlightDelay"], axis="columns"
+        )

        assert_frame_equal(
            pd_describe.drop(["25%", "50%", "75%"], axis="index"),