Document DataFrame.groupby() methods

Seth Michael Larson 2020-10-27 10:10:57 -05:00 committed by GitHub
parent 475e0f41ef
commit ae70f03df3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 696 additions and 185 deletions
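
In use, the API this commit documents looks like the following minimal sketch, which assumes a local Elasticsearch node with the example "flights" index loaded (the same assumption the doctests below make):

import eland as ed

# Wrap the "flights" index on a local cluster (assumed to exist,
# matching the doctests in this commit).
df = ed.DataFrame("localhost", "flights")

# DataFrame.groupby() returns an eland.groupby.DataFrameGroupBy whose
# methods push each aggregation down to Elasticsearch.
grouped = df.groupby("DestCountry")
print(grouped.mean(numeric_only=True))    # single aggregation
print(grouped.aggregate(["min", "max"]))  # multiple aggregations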

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.agg
==================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.agg

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.aggregate
========================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.aggregate

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.count
====================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.count

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.mad
==================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.mad

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.max
==================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.max

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.mean
===================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.mean

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.median
=====================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.median

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.min
==================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.min

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.nunique
======================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.nunique

View File

@ -0,0 +1,15 @@
eland.groupby.DataFrameGroupBy
==============================
.. currentmodule:: eland.groupby
.. autoclass:: DataFrameGroupBy
..
HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.
.. autosummary::
:toctree:
DataFrameGroupBy.agg
DataFrameGroupBy.aggregate
DataFrameGroupBy.count
DataFrameGroupBy.mad
DataFrameGroupBy.max
DataFrameGroupBy.mean
DataFrameGroupBy.median
DataFrameGroupBy.min
DataFrameGroupBy.nunique
DataFrameGroupBy.std
DataFrameGroupBy.sum
DataFrameGroupBy.var

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.std
==================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.std

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.sum
==================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.sum

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.var
==================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.var

View File

@ -0,0 +1,15 @@
eland.groupby.GroupBy
=====================
.. currentmodule:: eland.groupby
.. autoclass:: GroupBy
..
HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.
.. autosummary::
:toctree:
DataFrameGroupBy.agg
DataFrameGroupBy.aggregate
DataFrameGroupBy.count
DataFrameGroupBy.mad
DataFrameGroupBy.max
DataFrameGroupBy.mean
DataFrameGroupBy.median
DataFrameGroupBy.min
DataFrameGroupBy.nunique
DataFrameGroupBy.std
DataFrameGroupBy.sum
DataFrameGroupBy.var

View File

@ -48,6 +48,28 @@ Function Application, GroupBy & Window
DataFrame.aggregate
DataFrame.groupby
.. currentmodule:: eland.groupby
.. autosummary::
:toctree: api/
DataFrameGroupBy
DataFrameGroupBy.agg
DataFrameGroupBy.aggregate
DataFrameGroupBy.count
DataFrameGroupBy.mad
DataFrameGroupBy.max
DataFrameGroupBy.mean
DataFrameGroupBy.median
DataFrameGroupBy.min
DataFrameGroupBy.nunique
DataFrameGroupBy.std
DataFrameGroupBy.sum
DataFrameGroupBy.var
GroupBy
.. currentmodule:: eland
.. _api.dataframe.stats:
Computations / Descriptive Stats

View File

@ -1,7 +1,7 @@
.. _api.general_utility_functions:
=========================
General utility functions
General Utility Functions
=========================
.. currentmodule:: eland

View File

@ -170,7 +170,7 @@ script instead of being modified manually.
+---------------------------------------+------------+
| ``ed.DataFrame.get()`` | **Yes** |
+---------------------------------------+------------+
| ``ed.DataFrame.groupby()`` | No |
| ``ed.DataFrame.groupby()`` | **Yes** |
+---------------------------------------+------------+
| ``ed.DataFrame.gt()`` | No |
+---------------------------------------+------------+

View File

@ -36,7 +36,7 @@ from pandas.util._validators import validate_bool_kwarg
import eland.plotting as gfx
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
from eland.filter import BooleanFilter
from eland.groupby import GroupByDataFrame
from eland.groupby import DataFrameGroupBy
from eland.ndframe import NDFrame
from eland.series import Series
from eland.utils import deprecated_api, is_valid_attr_name
@ -1433,7 +1433,7 @@ class DataFrame(NDFrame):
def groupby(
self, by: Optional[Union[str, List[str]]] = None, dropna: bool = True
) -> "GroupByDataFrame":
) -> "DataFrameGroupBy":
"""
Used to perform groupby operations
@ -1448,7 +1448,7 @@ class DataFrame(NDFrame):
Returns
-------
GroupByDataFrame
eland.groupby.DataFrameGroupBy
See Also
--------
@ -1520,7 +1520,7 @@ class DataFrame(NDFrame):
f"Requested columns {repr(remaining_columns)[1:-1]} not in the DataFrame"
)
return GroupByDataFrame(
return DataFrameGroupBy(
by=by, query_compiler=self._query_compiler.copy(), dropna=dropna
)
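As a quick sanity check of the rename (a sketch under the same local-cluster assumption as the doctests):

import eland as ed
from eland.groupby import DataFrameGroupBy

df = ed.DataFrame("localhost", "flights")

# Only the class name changed; groupby() itself behaves as before.
grouped = df.groupby(by=["DestCountry"], dropna=True)
assert isinstance(grouped, DataFrameGroupBy)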

View File

@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
from typing import TYPE_CHECKING, List
from typing import TYPE_CHECKING, List, Optional, Union
from eland.query_compiler import QueryCompiler
@ -25,16 +25,7 @@ if TYPE_CHECKING:
class GroupBy:
"""
Base class for calls to X.groupby([...])
Parameters
----------
by:
List of columns to groupby
query_compiler:
Query compiler object
dropna:
default is true, drop None/NaT/NaN values while grouping
Base class for calls to :py:func:`eland.DataFrame.groupby`
"""
def __init__(
@ -47,7 +38,56 @@ class GroupBy:
self._dropna: bool = dropna
self._by: List[str] = by
class DataFrameGroupBy(GroupBy):
"""
This holds all the groupby methods for :py:func:`eland.DataFrame.groupby`
"""
def mean(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the mean value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
mean value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.mean`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek timestamp
DestCountry
AE 605.132970 0.152174 2.695652 2018-01-21 16:58:07.891304443
AR 674.827252 0.147541 2.744262 2018-01-21 22:18:06.593442627
AT 646.650530 0.175066 2.872679 2018-01-21 15:54:42.469496094
AU 669.558832 0.129808 2.843750 2018-01-22 02:28:39.199519287
CA 648.747109 0.134534 2.951271 2018-01-22 14:40:47.165254150
... ... ... ... ...
RU 662.994963 0.131258 2.832206 2018-01-21 07:11:16.534506104
SE 660.612988 0.149020 2.682353 2018-01-22 07:48:23.447058838
TR 485.253247 0.100000 1.900000 2018-01-16 16:02:33.000000000
US 595.774391 0.125315 2.753900 2018-01-21 16:55:04.456970215
ZA 643.053057 0.148410 2.766784 2018-01-22 15:17:56.141342773
<BLANKLINE>
[32 rows x 4 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["mean"],
@ -56,6 +96,49 @@ class GroupBy:
)
def var(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the variance value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
variance value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.var`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").var() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 75789.979090 0.130443 3.950549
AR 59683.055316 0.125979 3.783429
AT 65726.669676 0.144610 4.090013
AU 65088.483446 0.113094 3.833562
CA 68149.950516 0.116496 3.688139
... ... ... ...
RU 67305.277617 0.114107 3.852666
SE 53740.570338 0.127062 3.942132
TR 61245.521047 0.094868 4.100420
US 74349.939410 0.109638 3.758700
ZA 62920.072901 0.126608 3.775609
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["var"],
@ -64,6 +147,49 @@ class GroupBy:
)
def std(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the standard deviation value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
standard deviation value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.std`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").std() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 279.875500 0.367171 2.020634
AR 244.903626 0.355811 1.949901
AT 256.883342 0.381035 2.026411
AU 255.585377 0.336902 1.961486
CA 261.263054 0.341587 1.921980
... ... ... ...
RU 259.696213 0.338140 1.964815
SE 232.504297 0.357510 1.991340
TR 267.827572 0.333333 2.191454
US 272.774819 0.331242 1.939469
ZA 251.505568 0.356766 1.948258
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["std"],
@ -72,6 +198,49 @@ class GroupBy:
)
def mad(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the median absolute deviation value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
median absolute deviation value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.mad`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").mad() # doctest: +SKIP
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 233.697174 NaN 1.5
AR 189.250061 NaN 2.0
AT 195.823669 NaN 2.0
AU 202.539764 NaN 2.0
CA 203.344696 NaN 2.0
... ... ... ...
RU 206.431702 NaN 2.0
SE 178.658447 NaN 2.0
TR 221.863434 NaN 1.0
US 228.461365 NaN 2.0
ZA 192.162842 NaN 2.0
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["mad"],
@ -80,6 +249,49 @@ class GroupBy:
)
def median(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the median value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
median value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.median`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").median(numeric_only=False) # doctest: +SKIP
AvgTicketPrice Cancelled dayOfWeek timestamp
DestCountry
AE 585.720490 False 2 2018-01-19 23:56:44.000
AR 678.447433 False 3 2018-01-22 10:18:50.000
AT 659.715592 False 3 2018-01-20 20:40:10.000
AU 689.241348 False 3 2018-01-22 18:46:11.000
CA 663.516057 False 3 2018-01-22 21:35:09.500
... ... ... ... ...
RU 670.714956 False 3 2018-01-20 16:48:16.000
SE 680.111084 False 3 2018-01-22 20:53:44.000
TR 441.681122 False 1 2018-01-13 23:17:27.000
US 600.591525 False 3 2018-01-22 04:09:50.000
ZA 633.935425 False 3 2018-01-23 17:42:57.000
<BLANKLINE>
[32 rows x 4 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["median"],
@ -88,6 +300,49 @@ class GroupBy:
)
def sum(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the sum value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
sum value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.sum`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").sum() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 2.783612e+04 7.0 124.0
AR 2.058223e+05 45.0 837.0
AT 2.437872e+05 66.0 1083.0
AU 2.785365e+05 54.0 1183.0
CA 6.124173e+05 127.0 2786.0
... ... ... ...
RU 4.899533e+05 97.0 2093.0
SE 1.684563e+05 38.0 684.0
TR 4.852532e+03 1.0 19.0
US 1.183804e+06 249.0 5472.0
ZA 1.819840e+05 42.0 783.0
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["sum"],
@ -96,6 +351,49 @@ class GroupBy:
)
def min(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the min value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
min value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.min`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").min(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek timestamp
DestCountry
AE 110.799911 False 0 2018-01-01 19:31:30
AR 125.589394 False 0 2018-01-01 01:30:47
AT 100.020531 False 0 2018-01-01 05:24:19
AU 102.294312 False 0 2018-01-01 00:00:00
CA 100.557251 False 0 2018-01-01 00:44:08
... ... ... ... ...
RU 101.004005 False 0 2018-01-01 01:01:51
SE 102.877190 False 0 2018-01-01 04:09:38
TR 142.876465 False 0 2018-01-01 06:45:17
US 100.145966 False 0 2018-01-01 00:06:27
ZA 102.002663 False 0 2018-01-01 06:44:44
<BLANKLINE>
[32 rows x 4 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["min"],
@ -104,6 +402,49 @@ class GroupBy:
)
def max(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the max value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
max value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.max`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").max(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek timestamp
DestCountry
AE 1126.148682 True 6 2018-02-11 04:11:14
AR 1199.642822 True 6 2018-02-11 17:09:05
AT 1181.835815 True 6 2018-02-11 23:12:33
AU 1197.632690 True 6 2018-02-11 21:39:01
CA 1198.852539 True 6 2018-02-11 23:04:08
... ... ... ... ...
RU 1196.742310 True 6 2018-02-11 20:03:31
SE 1198.621582 True 6 2018-02-11 22:06:14
TR 855.935547 True 6 2018-02-04 01:59:23
US 1199.729004 True 6 2018-02-11 23:27:00
ZA 1196.186157 True 6 2018-02-11 23:29:45
<BLANKLINE>
[32 rows x 4 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["max"],
@ -112,6 +453,49 @@ class GroupBy:
)
def nunique(self) -> "pd.DataFrame":
"""
Compute the number of unique values for each group.
Returns
-------
pandas.DataFrame
Number of unique values for each column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.nunique`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").nunique() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 46 2 7
AR 305 2 7
AT 377 2 7
AU 416 2 7
CA 944 2 7
... ... ... ...
RU 739 2 7
SE 255 2 7
TR 10 2 5
US 1987 2 7
ZA 283 2 7
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["nunique"],
@ -119,22 +503,9 @@ class GroupBy:
numeric_only=False,
)
class GroupByDataFrame(GroupBy):
"""
This holds all the groupby methods for DataFrame
Parameters
----------
by:
List of columns to groupby
query_compiler:
Query compiler object
dropna:
default is true, drop None/NaT/NaN values while grouping
"""
def aggregate(self, func: List[str], numeric_only: bool = False) -> "pd.DataFrame":
def aggregate(
self, func: Union[str, List[str]], numeric_only: Optional[bool] = False
) -> "pd.DataFrame":
"""
Used to groupby and aggregate
@ -155,8 +526,36 @@ class GroupByDataFrame(GroupBy):
Returns
-------
A Pandas DataFrame
pandas.DataFrame
aggregation value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.aggregate`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").aggregate(["min", "max"]) # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice ... dayOfWeek
min max ... min max
DestCountry ...
AE 110.799911 1126.148682 ... 0 6
AR 125.589394 1199.642822 ... 0 6
AT 100.020531 1181.835815 ... 0 6
AU 102.294312 1197.632690 ... 0 6
CA 100.557251 1198.852539 ... 0 6
... ... ... ... ... ..
RU 101.004005 1196.742310 ... 0 6
SE 102.877190 1198.621582 ... 0 6
TR 142.876465 855.935547 ... 0 6
US 100.145966 1199.729004 ... 0 6
ZA 102.002663 1196.186157 ... 0 6
<BLANKLINE>
[32 rows x 6 columns]
"""
# Controls whether a MultiIndex is used for the
# columns of the result DataFrame.
@ -177,12 +576,39 @@ class GroupByDataFrame(GroupBy):
def count(self) -> "pd.DataFrame":
"""
Used to groupby and count
Compute the count value for each group.
Returns
-------
A Pandas DataFrame
pandas.DataFrame
count value for each column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.count`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").count() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 46 46 46
AR 305 305 305
AT 377 377 377
AU 416 416 416
CA 944 944 944
... ... ... ...
RU 739 739 739
SE 255 255 255
TR 10 10 10
US 1987 1987 1987
ZA 283 283 283
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,

View File

@ -545,7 +545,7 @@ class Operations:
pd_aggs: List[str],
dropna: bool = True,
is_dataframe_agg: bool = False,
numeric_only: bool = True,
numeric_only: Optional[bool] = True,
) -> pd.DataFrame:
"""
This method is used to construct a groupby aggregation dataframe
@ -570,15 +570,98 @@ class Operations:
-------
A dataframe which consists of the groupby data
"""
headers, results = self._groupby_aggs(
query_compiler,
by=by,
pd_aggs=pd_aggs,
dropna=dropna,
is_dataframe_agg=is_dataframe_agg,
numeric_only=numeric_only,
query_params, post_processing = self._resolve_tasks(query_compiler)
size = self._size(query_params, post_processing)
if size is not None:
raise NotImplementedError(
f"Can not count field matches if size is set {size}"
)
by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by)
# Use defaultdict(list) to avoid initializing each result column by hand
results: Dict[str, List[Any]] = defaultdict(list)
if numeric_only:
agg_fields = [
field for field in agg_fields if (field.is_numeric or field.is_bool)
]
body = Query(query_params.query)
# To return for creating multi-index on columns
headers = [agg_field.column for agg_field in agg_fields]
# Convert pandas aggs to ES equivalent
es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
# Construct Query
for by_field in by_fields:
# groupby fields will be term aggregations
body.composite_agg_bucket_terms(
name=f"groupby_{by_field.column}",
field=by_field.aggregatable_es_field_name,
)
for agg_field in agg_fields:
for es_agg in es_aggs:
# Skip if the field isn't compatible or if the agg is
# 'value_count' as this value is pulled from bucket.doc_count.
if not agg_field.is_es_agg_compatible(es_agg):
continue
# If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
if isinstance(es_agg, tuple):
body.metric_aggs(
f"{es_agg[0]}_{agg_field.es_field_name}",
es_agg[0],
agg_field.aggregatable_es_field_name,
)
else:
body.metric_aggs(
f"{es_agg}_{agg_field.es_field_name}",
es_agg,
agg_field.aggregatable_es_field_name,
)
# Composite aggregation
body.composite_agg_start(
size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
)
for buckets in self.bucket_generator(query_compiler, body):
# We receive the response row-wise
for bucket in buckets:
# groupby columns are added to result same way they are returned
for by_field in by_fields:
bucket_key = bucket["key"][f"groupby_{by_field.column}"]
# Datetimes always come back as integers, convert to pd.Timestamp()
if by_field.is_timestamp and isinstance(bucket_key, int):
bucket_key = pd.to_datetime(bucket_key, unit="ms")
results[by_field.column].append(bucket_key)
agg_calculation = self._unpack_metric_aggs(
fields=agg_fields,
es_aggs=es_aggs,
pd_aggs=pd_aggs,
response={"aggregations": bucket},
numeric_only=numeric_only,
# We set 'True' here because we want the value
# unpacking to always be in 'dataframe' mode.
is_dataframe_agg=True,
)
# Merge the calculated agg values into the results
for key, value in agg_calculation.items():
if not isinstance(value, list):
results[key].append(value)
continue
for pd_agg, val in zip(pd_aggs, value):
results[f"{key}_{pd_agg}"].append(val)
agg_df = pd.DataFrame(results).set_index(by)
if is_dataframe_agg:
@ -636,146 +719,6 @@ class Operations:
else:
return composite_buckets["buckets"]
def _groupby_aggs(
self,
query_compiler: "QueryCompiler",
by: List[str],
pd_aggs: List[str],
dropna: bool = True,
is_dataframe_agg: bool = False,
numeric_only: bool = True,
) -> Tuple[List[str], Dict[str, Any]]:
"""
This method is used to calculate groupby aggregations
Parameters
----------
query_compiler:
A Query compiler
by:
a list of columns on which groupby operations have to be performed
pd_aggs:
a list of aggregations to be performed
dropna:
Drop None values if True.
TODO Not yet implemented
is_dataframe_agg:
Know if multi aggregation or single agg is called.
numeric_only:
return either numeric values or NaN/NaT
Returns
-------
headers: columns on which MultiIndex has to be applied
response: dictionary of groupby aggregated values
"""
query_params, post_processing = self._resolve_tasks(query_compiler)
size = self._size(query_params, post_processing)
if size is not None:
raise NotImplementedError(
f"Can not count field matches if size is set {size}"
)
by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by)
# Used defaultdict to avoid initialization of columns with lists
response: Dict[str, List[Any]] = defaultdict(list)
if numeric_only:
agg_fields = [
field for field in agg_fields if (field.is_numeric or field.is_bool)
]
body = Query(query_params.query)
# To return for creating multi-index on columns
headers = [field.column for field in agg_fields]
# Convert pandas aggs to ES equivalent
es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
# pd_agg 'count' is handled via 'doc_count' from buckets
using_pd_agg_count = "count" in pd_aggs
# Construct Query
for by_field in by_fields:
# groupby fields will be term aggregations
body.composite_agg_bucket_terms(
name=f"groupby_{by_field.column}",
field=by_field.aggregatable_es_field_name,
)
for agg_field in agg_fields:
for es_agg in es_aggs:
# Skip if the field isn't compatible or if the agg is
# 'value_count' as this value is pulled from bucket.doc_count.
if (
not agg_field.is_es_agg_compatible(es_agg)
or es_agg == "value_count"
):
continue
# If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
if isinstance(es_agg, tuple):
body.metric_aggs(
f"{es_agg[0]}_{agg_field.es_field_name}",
es_agg[0],
agg_field.aggregatable_es_field_name,
)
else:
body.metric_aggs(
f"{es_agg}_{agg_field.es_field_name}",
es_agg,
agg_field.aggregatable_es_field_name,
)
# Composite aggregation
body.composite_agg_start(
size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
)
for buckets in self.bucket_generator(query_compiler, body):
# We receive the response row-wise
for bucket in buckets:
# groupby columns are added to result same way they are returned
for by_field in by_fields:
bucket_key = bucket["key"][f"groupby_{by_field.column}"]
# Datetimes always come back as integers, convert to pd.Timestamp()
if by_field.is_timestamp and isinstance(bucket_key, int):
bucket_key = pd.to_datetime(bucket_key, unit="ms")
response[by_field.column].append(bucket_key)
# Put 'doc_count' from bucket into each 'agg_field'
# to be extracted from _unpack_metric_aggs()
if using_pd_agg_count:
doc_count = bucket["doc_count"]
for agg_field in agg_fields:
bucket[f"value_count_{agg_field.es_field_name}"] = {
"value": doc_count
}
agg_calculation = self._unpack_metric_aggs(
fields=agg_fields,
es_aggs=es_aggs,
pd_aggs=pd_aggs,
response={"aggregations": bucket},
numeric_only=numeric_only,
is_dataframe_agg=is_dataframe_agg,
)
# Process the calculated agg values to response
for key, value in agg_calculation.items():
if not isinstance(value, list):
response[key].append(value)
continue
for pd_agg, val in zip(pd_aggs, value):
response[f"{key}_{pd_agg}"].append(val)
return headers, response
@staticmethod
def _map_pd_aggs_to_es_aggs(pd_aggs):
"""

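For orientation, the composite aggregation body that the inlined aggs_groupby assembles for a call like df.groupby("DestCountry").mean() looks roughly as follows. This is a hand-written sketch, not the literal output of the Query helper, and the page size shown is an assumed value for DEFAULT_PAGINATION_SIZE:

# Names follow the f-strings above: "groupby_{column}" for composite
# sources and "{es_agg}_{es_field_name}" for metric sub-aggregations.
es_body = {
    "size": 0,
    "aggs": {
        "groupby_buckets": {
            "composite": {
                "size": 1000,  # DEFAULT_PAGINATION_SIZE (assumed value)
                "sources": [
                    {"groupby_DestCountry": {"terms": {"field": "DestCountry"}}},
                ],
            },
            "aggs": {
                # pandas "mean" maps to the Elasticsearch "avg" metric
                "avg_AvgTicketPrice": {"avg": {"field": "AvgTicketPrice"}},
                "avg_dayOfWeek": {"avg": {"field": "dayOfWeek"}},
            },
        }
    },
}
# bucket_generator() then pages through "groupby_buckets", and each
# bucket becomes one row: the composite keys supply the index values
# (epoch millis are converted to pd.Timestamp) and the metric values
# fill the aggregated columns.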
View File

@ -556,7 +556,7 @@ class QueryCompiler:
pd_aggs: List[str],
dropna: bool = True,
is_dataframe_agg: bool = False,
numeric_only: bool = True,
numeric_only: Optional[bool] = True,
) -> pd.DataFrame:
return self._operations.aggs_groupby(
self,

View File

@ -176,3 +176,21 @@ class TestGroupbyDataFrame(TestData):
assert_index_equal(pd_count.index, ed_count.index)
assert_frame_equal(pd_count, ed_count)
assert_series_equal(pd_count.dtypes, ed_count.dtypes)
def test_groupby_dataframe_mad(self):
pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"])
ed_flights = self.ed_flights().filter(self.filter_data + ["DestCountry"])
pd_mad = pd_flights.groupby("DestCountry").mad()
ed_mad = ed_flights.groupby("DestCountry").mad()
assert_index_equal(pd_mad.columns, ed_mad.columns)
assert_index_equal(pd_mad.index, ed_mad.index)
assert_series_equal(pd_mad.dtypes, ed_mad.dtypes)
pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"])
ed_min_mad = ed_flights.groupby("DestCountry").aggregate(["min", "mad"])
assert_index_equal(pd_min_mad.columns, ed_min_mad.columns)
assert_index_equal(pd_min_mad.index, ed_min_mad.index)
assert_series_equal(pd_min_mad.dtypes, ed_min_mad.dtypes)