Document DataFrame.groupby() methods

2025-07-24 00:00:39 +08:00 · 2020-10-27 10:10:57 -05:00 · 2020-10-27 10:10:57 -05:00 · ae70f03df3
commit ae70f03df3
parent 475e0f41ef
22 changed files with 696 additions and 185 deletions
--- a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.agg.rst
+++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.agg.rst
@ -0,0 +1,6 @@
 eland.groupby.DataFrameGroupBy.agg
 ==================================
 .. currentmodule:: eland.groupby
 .. automethod:: DataFrameGroupBy.agg
--- a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.aggregate.rst
+++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.aggregate.rst
@ -0,0 +1,6 @@
 eland.groupby.DataFrameGroupBy.aggregate
 ========================================
 .. currentmodule:: eland.groupby
 .. automethod:: DataFrameGroupBy.aggregate
--- a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.count.rst
+++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.count.rst
@ -0,0 +1,6 @@
 eland.groupby.DataFrameGroupBy.count
 ====================================
 .. currentmodule:: eland.groupby
 .. automethod:: DataFrameGroupBy.count
--- a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.mad.rst
+++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.mad.rst
@ -0,0 +1,6 @@
 eland.groupby.DataFrameGroupBy.mad
 ==================================
 .. currentmodule:: eland.groupby
 .. automethod:: DataFrameGroupBy.mad
--- a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.max.rst
+++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.max.rst
@ -0,0 +1,6 @@
 eland.groupby.DataFrameGroupBy.max
 ==================================
 .. currentmodule:: eland.groupby
 .. automethod:: DataFrameGroupBy.max
--- a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.mean.rst
+++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.mean.rst
@ -0,0 +1,6 @@
 eland.groupby.DataFrameGroupBy.mean
 ===================================
 .. currentmodule:: eland.groupby
 .. automethod:: DataFrameGroupBy.mean
--- a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.median.rst
+++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.median.rst
@ -0,0 +1,6 @@
 eland.groupby.DataFrameGroupBy.median
 =====================================
 .. currentmodule:: eland.groupby
 .. automethod:: DataFrameGroupBy.median
--- a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.min.rst
+++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.min.rst
@ -0,0 +1,6 @@
 eland.groupby.DataFrameGroupBy.min
 ==================================
 .. currentmodule:: eland.groupby
 .. automethod:: DataFrameGroupBy.min
--- a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.nunique.rst
+++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.nunique.rst
@ -0,0 +1,6 @@
 eland.groupby.DataFrameGroupBy.nunique
 ======================================
 .. currentmodule:: eland.groupby
 .. automethod:: DataFrameGroupBy.nunique
--- a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.rst
+++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.rst
@ -0,0 +1,15 @@
 eland.groupby.DataFrameGroupBy
 ==============================
 .. currentmodule:: eland.groupby
 .. autoclass:: DataFrameGroupBy
 ..
   HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.
   .. autosummary::
      :toctree:
      DataFrame.abs
      DataFrame.add
--- a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.std.rst
+++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.std.rst
@ -0,0 +1,6 @@
 eland.groupby.DataFrameGroupBy.std
 ==================================
 .. currentmodule:: eland.groupby
 .. automethod:: DataFrameGroupBy.std
--- a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.sum.rst
+++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.sum.rst
@ -0,0 +1,6 @@
 eland.groupby.DataFrameGroupBy.sum
 ==================================
 .. currentmodule:: eland.groupby
 .. automethod:: DataFrameGroupBy.sum
--- a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.var.rst
+++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.var.rst
@ -0,0 +1,6 @@
 eland.groupby.DataFrameGroupBy.var
 ==================================
 .. currentmodule:: eland.groupby
 .. automethod:: DataFrameGroupBy.var
--- a/docs/source/reference/api/eland.groupby.GroupBy.rst
+++ b/docs/source/reference/api/eland.groupby.GroupBy.rst
@ -0,0 +1,15 @@
 eland.groupby.GroupBy
 =====================
 .. currentmodule:: eland.groupby
 .. autoclass:: GroupBy
 ..
   HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.
   .. autosummary::
      :toctree:
      DataFrame.abs
      DataFrame.add
--- a/docs/source/reference/dataframe.rst
+++ b/docs/source/reference/dataframe.rst
@ -48,6 +48,28 @@ Function Application, GroupBy & Window
   DataFrame.aggregate
   DataFrame.groupby
 .. currentmodule:: eland.groupby
 .. autosummary::
   :toctree: api/
   DataFrameGroupBy
   DataFrameGroupBy.agg
   DataFrameGroupBy.aggregate
   DataFrameGroupBy.count
   DataFrameGroupByGroupBy.mad
   DataFrameGroupByGroupBy.max
   DataFrameGroupByGroupBy.mean
   DataFrameGroupByGroupBy.median
   DataFrameGroupByGroupBy.min
   DataFrameGroupByGroupBy.nunique
   DataFrameGroupByGroupBy.std
   DataFrameGroupByGroupBy.sum
   DataFrameGroupByGroupBy.var
   GroupBy
 .. currentmodule:: eland
 .. _api.dataframe.stats:
 Computations / Descriptive Stats
--- a/docs/source/reference/general_utility_functions.rst
+++ b/docs/source/reference/general_utility_functions.rst
@ -1,7 +1,7 @@
 .. _api.general_utility_functions:
 =========================
-General utility functions
+General Utility Functions
 =========================
 .. currentmodule:: eland
--- a/docs/source/reference/supported_apis.rst
+++ b/docs/source/reference/supported_apis.rst
@ -170,7 +170,7 @@ script instead of being modified manually.
 +---------------------------------------+------------+
 | ``ed.DataFrame.get()``                | **Yes**    |
 +---------------------------------------+------------+
-| ``ed.DataFrame.groupby()``            | No         |
+| ``ed.DataFrame.groupby()``            | **Yes**    |
 +---------------------------------------+------------+
 | ``ed.DataFrame.gt()``                 | No         |
 +---------------------------------------+------------+
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@ -36,7 +36,7 @@ from pandas.util._validators import validate_bool_kwarg
 import eland.plotting as gfx
 from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
 from eland.filter import BooleanFilter
-from eland.groupby import GroupByDataFrame
+from eland.groupby import DataFrameGroupBy
 from eland.ndframe import NDFrame
 from eland.series import Series
 from eland.utils import deprecated_api, is_valid_attr_name
@ -1433,7 +1433,7 @@ class DataFrame(NDFrame):
    def groupby(
        self, by: Optional[Union[str, List[str]]] = None, dropna: bool = True
-    ) -> "GroupByDataFrame":
+    ) -> "DataFrameGroupBy":
        """
        Used to perform groupby operations
@ -1448,7 +1448,7 @@ class DataFrame(NDFrame):
        Returns
        -------
-        GroupByDataFrame
+        eland.groupby.DataFrameGroupBy
        See Also
        --------
@ -1520,7 +1520,7 @@ class DataFrame(NDFrame):
                    f"Requested columns {repr(remaining_columns)[1:-1]} not in the DataFrame"
                )
-        return GroupByDataFrame(
+        return DataFrameGroupBy(
            by=by, query_compiler=self._query_compiler.copy(), dropna=dropna
        )
--- a/eland/groupby.py
+++ b/eland/groupby.py
@ -15,7 +15,7 @@
 #  specific language governing permissions and limitations
 #  under the License.
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List, Optional, Union
 from eland.query_compiler import QueryCompiler
@ -25,16 +25,7 @@ if TYPE_CHECKING:
 class GroupBy:
    """
-    Base class for calls to X.groupby([...])
+    Base class for calls to :py:func:`eland.DataFrame.groupby`
    Parameters
    ----------
    by:
        List of columns to groupby
    query_compiler:
        Query compiler object
    dropna:
        default is true, drop None/NaT/NaN values while grouping
    """
    def __init__(
@ -47,7 +38,56 @@ class GroupBy:
        self._dropna: bool = dropna
        self._by: List[str] = by
 class DataFrameGroupBy(GroupBy):
    """
    This holds all the groupby methods for :py:func:`eland.DataFrame.groupby`
    """
    def mean(self, numeric_only: bool = True) -> "pd.DataFrame":
        """
        Compute the mean value for each group.
        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
        Returns
        -------
        pandas.DataFrame
            mean value for each numeric column of each group
        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.mean`
        Examples
        --------
        >>> df = ed.DataFrame(
        ...   "localhost", "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek                     timestamp
        DestCountry
        AE               605.132970   0.152174   2.695652 2018-01-21 16:58:07.891304443
        AR               674.827252   0.147541   2.744262 2018-01-21 22:18:06.593442627
        AT               646.650530   0.175066   2.872679 2018-01-21 15:54:42.469496094
        AU               669.558832   0.129808   2.843750 2018-01-22 02:28:39.199519287
        CA               648.747109   0.134534   2.951271 2018-01-22 14:40:47.165254150
        ...                     ...        ...        ...                           ...
        RU               662.994963   0.131258   2.832206 2018-01-21 07:11:16.534506104
        SE               660.612988   0.149020   2.682353 2018-01-22 07:48:23.447058838
        TR               485.253247   0.100000   1.900000 2018-01-16 16:02:33.000000000
        US               595.774391   0.125315   2.753900 2018-01-21 16:55:04.456970215
        ZA               643.053057   0.148410   2.766784 2018-01-22 15:17:56.141342773
        <BLANKLINE>
        [32 rows x 4 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["mean"],
@ -56,6 +96,49 @@ class GroupBy:
        )
    def var(self, numeric_only: bool = True) -> "pd.DataFrame":
        """
        Compute the variance value for each group.
        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
        Returns
        -------
        pandas.DataFrame
            variance value for each numeric column of each group
        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.var`
        Examples
        --------
        >>> df = ed.DataFrame(
        ...   "localhost", "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").var() # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek
        DestCountry
        AE             75789.979090   0.130443   3.950549
        AR             59683.055316   0.125979   3.783429
        AT             65726.669676   0.144610   4.090013
        AU             65088.483446   0.113094   3.833562
        CA             68149.950516   0.116496   3.688139
        ...                     ...        ...        ...
        RU             67305.277617   0.114107   3.852666
        SE             53740.570338   0.127062   3.942132
        TR             61245.521047   0.094868   4.100420
        US             74349.939410   0.109638   3.758700
        ZA             62920.072901   0.126608   3.775609
        <BLANKLINE>
        [32 rows x 3 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["var"],
@ -64,6 +147,49 @@ class GroupBy:
        )
    def std(self, numeric_only: bool = True) -> "pd.DataFrame":
        """
        Compute the standard deviation value for each group.
        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
        Returns
        -------
        pandas.DataFrame
            standard deviation value for each numeric column of each group
        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.std`
        Examples
        --------
        >>> df = ed.DataFrame(
        ...   "localhost", "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").std() # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek
        DestCountry
        AE               279.875500   0.367171   2.020634
        AR               244.903626   0.355811   1.949901
        AT               256.883342   0.381035   2.026411
        AU               255.585377   0.336902   1.961486
        CA               261.263054   0.341587   1.921980
        ...                     ...        ...        ...
        RU               259.696213   0.338140   1.964815
        SE               232.504297   0.357510   1.991340
        TR               267.827572   0.333333   2.191454
        US               272.774819   0.331242   1.939469
        ZA               251.505568   0.356766   1.948258
        <BLANKLINE>
        [32 rows x 3 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["std"],
@ -72,6 +198,49 @@ class GroupBy:
        )
    def mad(self, numeric_only: bool = True) -> "pd.DataFrame":
        """
        Compute the median absolute deviation value for each group.
        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
        Returns
        -------
        pandas.DataFrame
            median absolute deviation value for each numeric column of each group
        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.mad`
        Examples
        --------
        >>> df = ed.DataFrame(
        ...   "localhost", "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").mad() # doctest: +SKIP
                     AvgTicketPrice  Cancelled  dayOfWeek
        DestCountry
        AE               233.697174        NaN        1.5
        AR               189.250061        NaN        2.0
        AT               195.823669        NaN        2.0
        AU               202.539764        NaN        2.0
        CA               203.344696        NaN        2.0
        ...                     ...        ...        ...
        RU               206.431702        NaN        2.0
        SE               178.658447        NaN        2.0
        TR               221.863434        NaN        1.0
        US               228.461365        NaN        2.0
        ZA               192.162842        NaN        2.0
        <BLANKLINE>
        [32 rows x 3 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["mad"],
@ -80,6 +249,49 @@ class GroupBy:
        )
    def median(self, numeric_only: bool = True) -> "pd.DataFrame":
        """
        Compute the median value for each group.
        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
        Returns
        -------
        pandas.DataFrame
            median absolute deviation value for each numeric column of each group
        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.median`
        Examples
        --------
        >>> df = ed.DataFrame(
        ...   "localhost", "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").median(numeric_only=False) # doctest: +SKIP
                     AvgTicketPrice  Cancelled  dayOfWeek               timestamp
        DestCountry
        AE               585.720490      False          2 2018-01-19 23:56:44.000
        AR               678.447433      False          3 2018-01-22 10:18:50.000
        AT               659.715592      False          3 2018-01-20 20:40:10.000
        AU               689.241348      False          3 2018-01-22 18:46:11.000
        CA               663.516057      False          3 2018-01-22 21:35:09.500
        ...                     ...        ...        ...                     ...
        RU               670.714956      False          3 2018-01-20 16:48:16.000
        SE               680.111084      False          3 2018-01-22 20:53:44.000
        TR               441.681122      False          1 2018-01-13 23:17:27.000
        US               600.591525      False          3 2018-01-22 04:09:50.000
        ZA               633.935425      False          3 2018-01-23 17:42:57.000
        <BLANKLINE>
        [32 rows x 4 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["median"],
@ -88,6 +300,49 @@ class GroupBy:
        )
    def sum(self, numeric_only: bool = True) -> "pd.DataFrame":
        """
        Compute the sum value for each group.
        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
        Returns
        -------
        pandas.DataFrame
            sum value for each numeric column of each group
        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.sum`
        Examples
        --------
        >>> df = ed.DataFrame(
        ...   "localhost", "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").sum() # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek
        DestCountry
        AE             2.783612e+04        7.0      124.0
        AR             2.058223e+05       45.0      837.0
        AT             2.437872e+05       66.0     1083.0
        AU             2.785365e+05       54.0     1183.0
        CA             6.124173e+05      127.0     2786.0
        ...                     ...        ...        ...
        RU             4.899533e+05       97.0     2093.0
        SE             1.684563e+05       38.0      684.0
        TR             4.852532e+03        1.0       19.0
        US             1.183804e+06      249.0     5472.0
        ZA             1.819840e+05       42.0      783.0
        <BLANKLINE>
        [32 rows x 3 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["sum"],
@ -96,6 +351,49 @@ class GroupBy:
        )
    def min(self, numeric_only: bool = True) -> "pd.DataFrame":
        """
        Compute the min value for each group.
        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
        Returns
        -------
        pandas.DataFrame
            min value for each numeric column of each group
        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.min`
        Examples
        --------
        >>> df = ed.DataFrame(
        ...   "localhost", "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").min(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek           timestamp
        DestCountry
        AE               110.799911      False          0 2018-01-01 19:31:30
        AR               125.589394      False          0 2018-01-01 01:30:47
        AT               100.020531      False          0 2018-01-01 05:24:19
        AU               102.294312      False          0 2018-01-01 00:00:00
        CA               100.557251      False          0 2018-01-01 00:44:08
        ...                     ...        ...        ...                 ...
        RU               101.004005      False          0 2018-01-01 01:01:51
        SE               102.877190      False          0 2018-01-01 04:09:38
        TR               142.876465      False          0 2018-01-01 06:45:17
        US               100.145966      False          0 2018-01-01 00:06:27
        ZA               102.002663      False          0 2018-01-01 06:44:44
        <BLANKLINE>
        [32 rows x 4 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["min"],
@ -104,6 +402,49 @@ class GroupBy:
        )
    def max(self, numeric_only: bool = True) -> "pd.DataFrame":
        """
        Compute the max value for each group.
        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
        Returns
        -------
        pandas.DataFrame
            max value for each numeric column of each group
        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.max`
        Examples
        --------
        >>> df = ed.DataFrame(
        ...   "localhost", "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").max(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek           timestamp
        DestCountry
        AE              1126.148682       True          6 2018-02-11 04:11:14
        AR              1199.642822       True          6 2018-02-11 17:09:05
        AT              1181.835815       True          6 2018-02-11 23:12:33
        AU              1197.632690       True          6 2018-02-11 21:39:01
        CA              1198.852539       True          6 2018-02-11 23:04:08
        ...                     ...        ...        ...                 ...
        RU              1196.742310       True          6 2018-02-11 20:03:31
        SE              1198.621582       True          6 2018-02-11 22:06:14
        TR               855.935547       True          6 2018-02-04 01:59:23
        US              1199.729004       True          6 2018-02-11 23:27:00
        ZA              1196.186157       True          6 2018-02-11 23:29:45
        <BLANKLINE>
        [32 rows x 4 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["max"],
@ -112,6 +453,49 @@ class GroupBy:
        )
    def nunique(self) -> "pd.DataFrame":
        """
        Compute the nunique value for each group.
        Parameters
        ----------
        numeric_only: {True, False, None} Default is True
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
        Returns
        -------
        pandas.DataFrame
            nunique value for each numeric column of each group
        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.nunique`
        Examples
        --------
        >>> df = ed.DataFrame(
        ...   "localhost", "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").nunique() # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek
        DestCountry
        AE                       46          2          7
        AR                      305          2          7
        AT                      377          2          7
        AU                      416          2          7
        CA                      944          2          7
        ...                     ...        ...        ...
        RU                      739          2          7
        SE                      255          2          7
        TR                       10          2          5
        US                     1987          2          7
        ZA                      283          2          7
        <BLANKLINE>
        [32 rows x 3 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
            pd_aggs=["nunique"],
@ -119,22 +503,9 @@ class GroupBy:
            numeric_only=False,
        )
-
+    def aggregate(
-class GroupByDataFrame(GroupBy):
+        self, func: Union[str, List[str]], numeric_only: Optional[bool] = False
-    """
+    ) -> "pd.DataFrame":
    This holds all the groupby methods for DataFrame
    Parameters
    ----------
    by:
        List of columns to groupby
    query_compiler:
        Query compiler object
    dropna:
        default is true, drop None/NaT/NaN values while grouping
    """
    def aggregate(self, func: List[str], numeric_only: bool = False) -> "pd.DataFrame":
        """
        Used to groupby and aggregate
@ -155,8 +526,36 @@ class GroupByDataFrame(GroupBy):
        Returns
        -------
-            A Pandas DataFrame
+        pandas.DataFrame
            aggregation value for each numeric column of each group
        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.aggregate`
        Examples
        --------
        >>> df = ed.DataFrame(
        ...   "localhost", "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").aggregate(["min", "max"]) # doctest: +NORMALIZE_WHITESPACE
                    AvgTicketPrice               ... dayOfWeek
                               min          max  ...       min max
        DestCountry                              ...
        AE              110.799911  1126.148682  ...         0   6
        AR              125.589394  1199.642822  ...         0   6
        AT              100.020531  1181.835815  ...         0   6
        AU              102.294312  1197.632690  ...         0   6
        CA              100.557251  1198.852539  ...         0   6
        ...                    ...          ...  ...       ...  ..
        RU              101.004005  1196.742310  ...         0   6
        SE              102.877190  1198.621582  ...         0   6
        TR              142.876465   855.935547  ...         0   6
        US              100.145966  1199.729004  ...         0   6
        ZA              102.002663  1196.186157  ...         0   6
        <BLANKLINE>
        [32 rows x 6 columns]
        """
        # Controls whether a MultiIndex is used for the
        # columns of the result DataFrame.
@ -177,12 +576,39 @@ class GroupByDataFrame(GroupBy):
    def count(self) -> "pd.DataFrame":
        """
-        Used to groupby and count
+        Compute the count value for each group.
        Returns
        -------
-            A Pandas DataFrame
+        pandas.DataFrame
            nunique value for each numeric column of each group
        See Also
        --------
        :pandas_api_docs:`pandas.core.groupby.GroupBy.count`
        Examples
        --------
        >>> df = ed.DataFrame(
        ...   "localhost", "flights",
        ...   columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
        ... )
        >>> df.groupby("DestCountry").count() # doctest: +NORMALIZE_WHITESPACE
                     AvgTicketPrice  Cancelled  dayOfWeek
        DestCountry
        AE                       46         46         46
        AR                      305        305        305
        AT                      377        377        377
        AU                      416        416        416
        CA                      944        944        944
        ...                     ...        ...        ...
        RU                      739        739        739
        SE                      255        255        255
        TR                       10         10         10
        US                     1987       1987       1987
        ZA                      283        283        283
        <BLANKLINE>
        [32 rows x 3 columns]
        """
        return self._query_compiler.aggs_groupby(
            by=self._by,
--- a/eland/operations.py
+++ b/eland/operations.py
@ -545,7 +545,7 @@ class Operations:
        pd_aggs: List[str],
        dropna: bool = True,
        is_dataframe_agg: bool = False,
-        numeric_only: bool = True,
+        numeric_only: Optional[bool] = True,
    ) -> pd.DataFrame:
        """
        This method is used to construct groupby aggregation dataframe
@ -570,15 +570,98 @@ class Operations:
        -------
            A dataframe which consists groupby data
        """
-        headers, results = self._groupby_aggs(
+        query_params, post_processing = self._resolve_tasks(query_compiler)
-            query_compiler,
+
-            by=by,
+        size = self._size(query_params, post_processing)
-            pd_aggs=pd_aggs,
+        if size is not None:
-            dropna=dropna,
+            raise NotImplementedError(
-            is_dataframe_agg=is_dataframe_agg,
+                f"Can not count field matches if size is set {size}"
-            numeric_only=numeric_only,
+            )
        by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by)
        # Used defaultdict to avoid initialization of columns with lists
        results: Dict[str, List[Any]] = defaultdict(list)
        if numeric_only:
            agg_fields = [
                field for field in agg_fields if (field.is_numeric or field.is_bool)
            ]
        body = Query(query_params.query)
        # To return for creating multi-index on columns
        headers = [agg_field.column for agg_field in agg_fields]
        # Convert pandas aggs to ES equivalent
        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
        # Construct Query
        for by_field in by_fields:
            # groupby fields will be term aggregations
            body.composite_agg_bucket_terms(
                name=f"groupby_{by_field.column}",
                field=by_field.aggregatable_es_field_name,
            )
        for agg_field in agg_fields:
            for es_agg in es_aggs:
                # Skip if the field isn't compatible or if the agg is
                # 'value_count' as this value is pulled from bucket.doc_count.
                if not agg_field.is_es_agg_compatible(es_agg):
                    continue
                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
                if isinstance(es_agg, tuple):
                    body.metric_aggs(
                        f"{es_agg[0]}_{agg_field.es_field_name}",
                        es_agg[0],
                        agg_field.aggregatable_es_field_name,
                    )
                else:
                    body.metric_aggs(
                        f"{es_agg}_{agg_field.es_field_name}",
                        es_agg,
                        agg_field.aggregatable_es_field_name,
                    )
        # Composite aggregation
        body.composite_agg_start(
            size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
        )
        for buckets in self.bucket_generator(query_compiler, body):
            # We recieve response row-wise
            for bucket in buckets:
                # groupby columns are added to result same way they are returned
                for by_field in by_fields:
                    bucket_key = bucket["key"][f"groupby_{by_field.column}"]
                    # Datetimes always come back as integers, convert to pd.Timestamp()
                    if by_field.is_timestamp and isinstance(bucket_key, int):
                        bucket_key = pd.to_datetime(bucket_key, unit="ms")
                    results[by_field.column].append(bucket_key)
                agg_calculation = self._unpack_metric_aggs(
                    fields=agg_fields,
                    es_aggs=es_aggs,
                    pd_aggs=pd_aggs,
                    response={"aggregations": bucket},
                    numeric_only=numeric_only,
                    # We set 'True' here because we want the value
                    # unpacking to always be in 'dataframe' mode.
                    is_dataframe_agg=True,
                )
                # Process the calculated agg values to response
                for key, value in agg_calculation.items():
                    if not isinstance(value, list):
                        results[key].append(value)
                        continue
                    for pd_agg, val in zip(pd_aggs, value):
                        results[f"{key}_{pd_agg}"].append(val)
        agg_df = pd.DataFrame(results).set_index(by)
        if is_dataframe_agg:
@ -636,146 +719,6 @@ class Operations:
            else:
                return composite_buckets["buckets"]
    def _groupby_aggs(
        self,
        query_compiler: "QueryCompiler",
        by: List[str],
        pd_aggs: List[str],
        dropna: bool = True,
        is_dataframe_agg: bool = False,
        numeric_only: bool = True,
    ) -> Tuple[List[str], Dict[str, Any]]:
        """
        This method is used to calculate groupby aggregations
        Parameters
        ----------
        query_compiler:
            A Query compiler
        by:
            a list of columns on which groupby operations have to be performed
        pd_aggs:
            a list of aggregations to be performed
        dropna:
            Drop None values if True.
            TODO Not yet implemented
        is_dataframe_agg:
            Know if multi aggregation or single agg is called.
        numeric_only:
            return either numeric values or NaN/NaT
        Returns
        -------
        headers: columns on which MultiIndex has to be applied
        response: dictionary of groupby aggregated values
        """
        query_params, post_processing = self._resolve_tasks(query_compiler)
        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}"
            )
        by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by)
        # Used defaultdict to avoid initialization of columns with lists
        response: Dict[str, List[Any]] = defaultdict(list)
        if numeric_only:
            agg_fields = [
                field for field in agg_fields if (field.is_numeric or field.is_bool)
            ]
        body = Query(query_params.query)
        # To return for creating multi-index on columns
        headers = [field.column for field in agg_fields]
        # Convert pandas aggs to ES equivalent
        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
        # pd_agg 'count' is handled via 'doc_count' from buckets
        using_pd_agg_count = "count" in pd_aggs
        # Construct Query
        for by_field in by_fields:
            # groupby fields will be term aggregations
            body.composite_agg_bucket_terms(
                name=f"groupby_{by_field.column}",
                field=by_field.aggregatable_es_field_name,
            )
        for agg_field in agg_fields:
            for es_agg in es_aggs:
                # Skip if the field isn't compatible or if the agg is
                # 'value_count' as this value is pulled from bucket.doc_count.
                if (
                    not agg_field.is_es_agg_compatible(es_agg)
                    or es_agg == "value_count"
                ):
                    continue
                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
                if isinstance(es_agg, tuple):
                    body.metric_aggs(
                        f"{es_agg[0]}_{agg_field.es_field_name}",
                        es_agg[0],
                        agg_field.aggregatable_es_field_name,
                    )
                else:
                    body.metric_aggs(
                        f"{es_agg}_{agg_field.es_field_name}",
                        es_agg,
                        agg_field.aggregatable_es_field_name,
                    )
        # Composite aggregation
        body.composite_agg_start(
            size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
        )
        for buckets in self.bucket_generator(query_compiler, body):
            # We recieve response row-wise
            for bucket in buckets:
                # groupby columns are added to result same way they are returned
                for by_field in by_fields:
                    bucket_key = bucket["key"][f"groupby_{by_field.column}"]
                    # Datetimes always come back as integers, convert to pd.Timestamp()
                    if by_field.is_timestamp and isinstance(bucket_key, int):
                        bucket_key = pd.to_datetime(bucket_key, unit="ms")
                    response[by_field.column].append(bucket_key)
                # Put 'doc_count' from bucket into each 'agg_field'
                # to be extracted from _unpack_metric_aggs()
                if using_pd_agg_count:
                    doc_count = bucket["doc_count"]
                    for agg_field in agg_fields:
                        bucket[f"value_count_{agg_field.es_field_name}"] = {
                            "value": doc_count
                        }
                agg_calculation = self._unpack_metric_aggs(
                    fields=agg_fields,
                    es_aggs=es_aggs,
                    pd_aggs=pd_aggs,
                    response={"aggregations": bucket},
                    numeric_only=numeric_only,
                    is_dataframe_agg=is_dataframe_agg,
                )
                # Process the calculated agg values to response
                for key, value in agg_calculation.items():
                    if not isinstance(value, list):
                        response[key].append(value)
                        continue
                    for pd_agg, val in zip(pd_aggs, value):
                        response[f"{key}_{pd_agg}"].append(val)
        return headers, response
    @staticmethod
    def _map_pd_aggs_to_es_aggs(pd_aggs):
        """
--- a/eland/query_compiler.py
+++ b/eland/query_compiler.py
@ -556,7 +556,7 @@ class QueryCompiler:
        pd_aggs: List[str],
        dropna: bool = True,
        is_dataframe_agg: bool = False,
-        numeric_only: bool = True,
+        numeric_only: Optional[bool] = True,
    ) -> pd.DataFrame:
        return self._operations.aggs_groupby(
            self,
--- a/eland/tests/dataframe/test_groupby_pytest.py
+++ b/eland/tests/dataframe/test_groupby_pytest.py
@ -176,3 +176,21 @@ class TestGroupbyDataFrame(TestData):
        assert_index_equal(pd_count.index, ed_count.index)
        assert_frame_equal(pd_count, ed_count)
        assert_series_equal(pd_count.dtypes, ed_count.dtypes)
    def test_groupby_dataframe_mad(self):
        pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"])
        ed_flights = self.ed_flights().filter(self.filter_data + ["DestCountry"])
        pd_mad = pd_flights.groupby("DestCountry").mad()
        ed_mad = ed_flights.groupby("DestCountry").mad()
        assert_index_equal(pd_mad.columns, ed_mad.columns)
        assert_index_equal(pd_mad.index, ed_mad.index)
        assert_series_equal(pd_mad.dtypes, ed_mad.dtypes)
        pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"])
        ed_min_mad = ed_flights.groupby("DestCountry").aggregate(["min", "mad"])
        assert_index_equal(pd_min_mad.columns, ed_min_mad.columns)
        assert_index_equal(pd_min_mad.index, ed_min_mad.index)
        assert_series_equal(pd_min_mad.dtypes, ed_min_mad.dtypes)