Add quantile() to DataFrameGroupBy

2025-07-24 00:00:39 +08:00 · 2021-06-22 21:24:33 +05:30 · 2021-06-22 21:24:33 +05:30 · 5fe32a24df
commit 5fe32a24df
parent 7e8520a8ef
6 changed files with 182 additions and 25 deletions
--- a/docs/sphinx/reference/api/eland.groupby.DataFrameGroupBy.quantile.rst
+++ b/docs/sphinx/reference/api/eland.groupby.DataFrameGroupBy.quantile.rst
@ -0,0 +1,6 @@
 eland.groupby.DataFrameGroupBy.quantile
 =======================================
 .. currentmodule:: eland.groupby
 .. automethod:: DataFrameGroupBy.quantile
--- a/docs/sphinx/reference/dataframe.rst
+++ b/docs/sphinx/reference/dataframe.rst
@ -75,6 +75,7 @@ Function Application, GroupBy & Window
   DataFrameGroupBy.std
   DataFrameGroupBy.sum
   DataFrameGroupBy.var
   DataFrameGroupBy.quantile
   GroupBy
 .. currentmodule:: eland
--- a/eland/groupby.py
+++ b/eland/groupby.py
@ -503,6 +503,86 @@ class DataFrameGroupBy(GroupBy):
            numeric_only=False,
        )
    def quantile(
        self, q: Union[int, float, List[int], List[float]] = 0.5
    ) -> "pd.DataFrame":
        """
        Compute the requested quantile(s) of every aggregated column, per group.

        The call is forwarded to the query compiler's ``aggs_groupby`` with the
        single pandas aggregation ``"quantile"`` and ``numeric_only=True``.

        Parameters
        ----------
        q:
            float or array like, default 0.5
            Quantile(s) to compute; each value must satisfy 0 <= q <= 1.

        Returns
        -------
        pandas.DataFrame
            One quantile value per group and aggregated column. As the second
            example below shows, passing several quantiles adds the quantile
            as an extra, unnamed index level.

        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.quantile`

        Examples
        --------
        >>> ed_df = ed.DataFrame('localhost', 'flights')
        >>> ed_flights = ed_df.filter(["AvgTicketPrice", "FlightDelayMin", "dayOfWeek", "timestamp"])
        >>> ed_flights.groupby(["dayOfWeek", "Cancelled"]).quantile() # doctest: +SKIP
                             AvgTicketPrice  FlightDelayMin
        dayOfWeek Cancelled
        0         False          572.290384             0.0
                  True           578.140564             0.0
        1         False          567.980560             0.0
                  True           582.618713             0.0
        2         False          590.170986             0.0
                  True           579.811890             0.0
        3         False          574.131340             0.0
                  True           572.852264             0.0
        4         False          591.533699             0.0
                  True           582.877014             0.0
        5         False          791.622625             0.0
                  True           793.362946             0.0
        6         False          817.378523             0.0
                  True           766.855530             0.0

        >>> ed_flights.groupby(["dayOfWeek", "Cancelled"]).quantile(q=[.2, .5]) # doctest: +SKIP
                                 AvgTicketPrice  FlightDelayMin
        dayOfWeek Cancelled
        0         False     0.2      319.925979             0.0
                            0.5      572.290384             0.0
                  True      0.2      325.704562             0.0
                            0.5      578.140564             0.0
        1         False     0.2      327.311007             0.0
                            0.5      567.980560             0.0
                  True      0.2      336.839572             0.0
                            0.5      582.618713             0.0
        2         False     0.2      332.323011             0.0
                            0.5      590.170986             0.0
                  True      0.2      314.472537             0.0
                            0.5      579.811890             0.0
        3         False     0.2      327.652659             0.0
                            0.5      574.131340             0.0
                  True      0.2      298.483032             0.0
                            0.5      572.852264             0.0
        4         False     0.2      314.290205             0.0
                            0.5      591.533699             0.0
                  True      0.2      325.024850             0.0
                            0.5      582.877014             0.0
        5         False     0.2      567.362137             0.0
                            0.5      791.622625             0.0
                  True      0.2      568.323944             0.0
                            0.5      793.362946             0.0
        6         False     0.2      568.489746             0.0
                            0.5      817.378523             0.0
                  True      0.2      523.890680             0.0
                            0.5      766.855530             0.0
        """
        # `q` is handed over untouched: a scalar or a list is accepted here and
        # interpreted further down the call chain.
        grouped_quantiles = self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["quantile"],
            quantiles=q,
            numeric_only=True,
        )
        return grouped_quantiles
    def aggregate(
        self, func: Union[str, List[str]], numeric_only: Optional[bool] = False
    ) -> "pd.DataFrame":
--- a/eland/operations.py
+++ b/eland/operations.py
@ -493,6 +493,7 @@ class Operations:
        numeric_only: Optional[bool],
        percentiles: Optional[List[float]] = None,
        is_dataframe_agg: bool = False,
        is_groupby: bool = False,
    ) -> Dict[str, List[Any]]:
        """
        This method unpacks metric aggregations JSON response.
@ -554,7 +555,9 @@ class Operations:
                            agg_value = agg_value["50.0"]
                        # Currently Pandas does the same
                        # If we call quantile it returns the same result as of median.
-                        elif pd_agg == "quantile" and is_dataframe_agg:
+                        elif (
                            pd_agg == "quantile" and is_dataframe_agg and not is_groupby
                        ):
                            agg_value = agg_value["50.0"]
                        else:
                            # Maintain order of percentiles
@ -668,8 +671,9 @@ class Operations:
            # If numeric_only is True and We only have a NaN type field then we check for empty.
            if values:
                results[field.column] = values if len(values) > 1 else values[0]
-            # This only runs when df.quantile() or series.quantile() is called
+            # This only runs when df.quantile() or series.quantile() or
-            if percentile_values and not is_dataframe_agg:
+            # quantile from groupby is called
            if percentile_values:
                results[f"{field.column}"] = percentile_values
        return results
@ -682,19 +686,6 @@ class Operations:
        is_dataframe: bool = True,
        numeric_only: Optional[bool] = True,
    ) -> Union[pd.DataFrame, pd.Series]:
        # To verify if quantile range falls between 0 to 1
        def quantile_to_percentile(quantile: Any) -> float:
            if isinstance(quantile, (int, float)):
                quantile = float(quantile)
                if quantile > 1 or quantile < 0:
                    raise ValueError(
                        f"quantile should be in range of 0 and 1, given {quantile}"
                    )
            else:
                raise TypeError("quantile should be of type int or float")
            # quantile * 100 = percentile
            # return float(...) because min(1.0) gives 1
            return float(min(100, max(0, quantile * 100)))
        percentiles = [
            quantile_to_percentile(x)
@ -730,6 +721,7 @@ class Operations:
        by: List[str],
        pd_aggs: List[str],
        dropna: bool = True,
        quantiles: Optional[List[float]] = None,
        is_dataframe_agg: bool = False,
        numeric_only: Optional[bool] = True,
    ) -> pd.DataFrame:
@ -751,6 +743,8 @@ class Operations:
            Know if groupby with aggregation or single agg is called.
        numeric_only:
            return either numeric values or NaN/NaT
        quantiles:
            List of quantiles when 'quantile' agg is called. Otherwise it is None
        Returns
        -------
@ -779,8 +773,19 @@ class Operations:
        # To return for creating multi-index on columns
        headers = [agg_field.column for agg_field in agg_fields]
        percentiles: Optional[List[str]] = None
        if quantiles:
            percentiles = [
                quantile_to_percentile(x)
                for x in (
                    (quantiles,)
                    if not isinstance(quantiles, (list, tuple))
                    else quantiles
                )
            ]
        # Convert pandas aggs to ES equivalent
-        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
+        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs=pd_aggs, percentiles=percentiles)
        # Construct Query
        for by_field in by_fields:
@ -804,6 +809,13 @@ class Operations:
                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
                if isinstance(es_agg, tuple):
                    if es_agg[0] == "percentiles":
                        body.percentile_agg(
                            name=f"{es_agg[0]}_{agg_field.es_field_name}",
                            field=agg_field.es_field_name,
                            percents=es_agg[1],
                        )
                    else:
                        body.metric_aggs(
                            f"{es_agg[0]}_{agg_field.es_field_name}",
                            es_agg[0],
@ -832,7 +844,12 @@ class Operations:
                    if by_field.is_timestamp and isinstance(bucket_key, int):
                        bucket_key = pd.to_datetime(bucket_key, unit="ms")
-                    results[by_field.column].append(bucket_key)
+                    if pd_aggs == ["quantile"] and len(percentiles) > 1:
                        bucket_key = [bucket_key] * len(percentiles)
                    results[by_field.column].extend(
                        bucket_key if isinstance(bucket_key, list) else [bucket_key]
                    )
                agg_calculation = self._unpack_metric_aggs(
                    fields=agg_fields,
@ -840,19 +857,32 @@ class Operations:
                    pd_aggs=pd_aggs,
                    response={"aggregations": bucket},
                    numeric_only=numeric_only,
                    percentiles=percentiles,
                    # We set 'True' here because we want the value
                    # unpacking to always be in 'dataframe' mode.
                    is_dataframe_agg=True,
                    is_groupby=True,
                )
                # to construct index with quantiles
                if pd_aggs == ["quantile"] and len(percentiles) > 1:
                    results[None].extend([i / 100 for i in percentiles])
                # Process the calculated agg values to response
                for key, value in agg_calculation.items():
                    if not isinstance(value, list):
                        results[key].append(value)
                        continue
                    elif isinstance(value, list) and pd_aggs == ["quantile"]:
                        results[f"{key}_{pd_aggs[0]}"].extend(value)
                    else:
                        for pd_agg, val in zip(pd_aggs, value):
                            results[f"{key}_{pd_agg}"].append(val)
        # Just to maintain Output same as pandas with empty header.
        if pd_aggs == ["quantile"] and len(percentiles) > 1:
            by = by + [None]
        agg_df = pd.DataFrame(results).set_index(by).sort_index()
        if is_dataframe_agg:
@ -1408,3 +1438,18 @@ class Operations:
    def update_query(self, boolean_filter):
        task = BooleanFilterTask(boolean_filter)
        self._tasks.append(task)
 def quantile_to_percentile(quantile: Union[int, float]) -> float:
    """
    Validate a quantile and convert it to the equivalent percentile.

    Parameters
    ----------
    quantile:
        int or float in the closed range [0, 1].

    Returns
    -------
    float
        ``quantile * 100``, always as a ``float`` in the range [0.0, 100.0].

    Raises
    ------
    TypeError
        If ``quantile`` is not an ``int`` or ``float``.
    ValueError
        If ``quantile`` lies outside [0, 1] or is NaN.
    """
    if not isinstance(quantile, (int, float)):
        raise TypeError("quantile should be of type int or float")

    quantile = float(quantile)
    # Expressed as a positive range test on purpose: every comparison with NaN
    # is False, so "quantile > 1 or quantile < 0" would let NaN through and the
    # clamp below would then silently turn it into percentile 0.0.
    if not 0 <= quantile <= 1:
        raise ValueError(
            f"quantile should be in range of 0 and 1, given {quantile}"
        )

    # quantile * 100 = percentile
    # The clamp normalises -0.0 to 0.0; float(...) is required because
    # min()/max() can hand back the int bound (min(100, 100.0) gives 100).
    return float(min(100, max(0, quantile * 100)))
--- a/eland/query_compiler.py
+++ b/eland/query_compiler.py
@ -673,11 +673,13 @@ class QueryCompiler:
        dropna: bool = True,
        is_dataframe_agg: bool = False,
        numeric_only: Optional[bool] = True,
        quantiles: Union[int, float, List[int], List[float], None] = None,
    ) -> pd.DataFrame:
        return self._operations.aggs_groupby(
            self,
            by=by,
            pd_aggs=pd_aggs,
            quantiles=quantiles,
            dropna=dropna,
            is_dataframe_agg=is_dataframe_agg,
            numeric_only=numeric_only,
--- a/tests/dataframe/test_groupby_pytest.py
+++ b/tests/dataframe/test_groupby_pytest.py
@ -230,3 +230,26 @@ class TestGroupbyDataFrame(TestData):
        match = "Currently mode is not supported for groupby"
        with pytest.raises(NotImplementedError, match=match):
            ed_flights.groupby("Cancelled").mode()
    @pytest.mark.parametrize("dropna", [True, False])
    @pytest.mark.parametrize(
        ["func", "args"],
        [
            ("quantile", ()),
            ("quantile", (0.55,)),
            ("quantile", ([0.2, 0.4, 0.6, 0.8],)),
        ],
    )
    @pytest.mark.parametrize("columns", ["Cancelled", ["dayOfWeek", "Cancelled"]])
    def test_groupby_aggs_quantile(self, dropna, columns, func, args):
        """
        Compare groupby quantile results from eland against pandas.

        Runs with the default quantile, a single explicit quantile and a list
        of quantiles, grouping by one column and by two columns, with
        ``dropna`` both enabled and disabled.
        """
        # Same column selection on both sides so the frames are comparable.
        pandas_frame = self.pd_flights().filter(self.filter_data)
        eland_frame = self.ed_flights().filter(self.filter_data)

        # Resolve the aggregation by name on each grouped object, then call it
        # with the parametrized positional arguments.
        pandas_agg = getattr(pandas_frame.groupby(columns, dropna=dropna), func)
        expected = pandas_agg(*args)
        eland_agg = getattr(eland_frame.groupby(columns, dropna=dropna), func)
        actual = eland_agg(*args)

        # Only values are compared; dtypes are covered by the aggs tests.
        # NOTE(review): rtol=2 is a very loose relative tolerance - confirm it
        # is intentional.
        assert_frame_equal(
            expected,
            actual,
            check_exact=False,
            check_dtype=False,
            rtol=2,
        )