From ae70f03df3009f25026dd53de79d3bee512b7cc2 Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Tue, 27 Oct 2020 10:10:57 -0500 Subject: [PATCH] Document DataFrame.groupby() methods --- .../eland.groupby.DataFrameGroupBy.agg.rst | 6 + ...and.groupby.DataFrameGroupBy.aggregate.rst | 6 + .../eland.groupby.DataFrameGroupBy.count.rst | 6 + .../eland.groupby.DataFrameGroupBy.mad.rst | 6 + .../eland.groupby.DataFrameGroupBy.max.rst | 6 + .../eland.groupby.DataFrameGroupBy.mean.rst | 6 + .../eland.groupby.DataFrameGroupBy.median.rst | 6 + .../eland.groupby.DataFrameGroupBy.min.rst | 6 + ...eland.groupby.DataFrameGroupBy.nunique.rst | 6 + .../api/eland.groupby.DataFrameGroupBy.rst | 15 + .../eland.groupby.DataFrameGroupBy.std.rst | 6 + .../eland.groupby.DataFrameGroupBy.sum.rst | 6 + .../eland.groupby.DataFrameGroupBy.var.rst | 6 + .../reference/api/eland.groupby.GroupBy.rst | 15 + docs/source/reference/dataframe.rst | 22 + .../reference/general_utility_functions.rst | 2 +- docs/source/reference/supported_apis.rst | 2 +- eland/dataframe.py | 8 +- eland/groupby.py | 486 ++++++++++++++++-- eland/operations.py | 239 ++++----- eland/query_compiler.py | 2 +- eland/tests/dataframe/test_groupby_pytest.py | 18 + 22 files changed, 696 insertions(+), 185 deletions(-) create mode 100644 docs/source/reference/api/eland.groupby.DataFrameGroupBy.agg.rst create mode 100644 docs/source/reference/api/eland.groupby.DataFrameGroupBy.aggregate.rst create mode 100644 docs/source/reference/api/eland.groupby.DataFrameGroupBy.count.rst create mode 100644 docs/source/reference/api/eland.groupby.DataFrameGroupBy.mad.rst create mode 100644 docs/source/reference/api/eland.groupby.DataFrameGroupBy.max.rst create mode 100644 docs/source/reference/api/eland.groupby.DataFrameGroupBy.mean.rst create mode 100644 docs/source/reference/api/eland.groupby.DataFrameGroupBy.median.rst create mode 100644 docs/source/reference/api/eland.groupby.DataFrameGroupBy.min.rst create mode 100644 
docs/source/reference/api/eland.groupby.DataFrameGroupBy.nunique.rst create mode 100644 docs/source/reference/api/eland.groupby.DataFrameGroupBy.rst create mode 100644 docs/source/reference/api/eland.groupby.DataFrameGroupBy.std.rst create mode 100644 docs/source/reference/api/eland.groupby.DataFrameGroupBy.sum.rst create mode 100644 docs/source/reference/api/eland.groupby.DataFrameGroupBy.var.rst create mode 100644 docs/source/reference/api/eland.groupby.GroupBy.rst diff --git a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.agg.rst b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.agg.rst new file mode 100644 index 0000000..954dcb8 --- /dev/null +++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.agg.rst @@ -0,0 +1,6 @@ +eland.groupby.DataFrameGroupBy.agg +================================== + +.. currentmodule:: eland.groupby + +.. automethod:: DataFrameGroupBy.agg diff --git a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.aggregate.rst b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.aggregate.rst new file mode 100644 index 0000000..c27bc37 --- /dev/null +++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.aggregate.rst @@ -0,0 +1,6 @@ +eland.groupby.DataFrameGroupBy.aggregate +======================================== + +.. currentmodule:: eland.groupby + +.. automethod:: DataFrameGroupBy.aggregate diff --git a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.count.rst b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.count.rst new file mode 100644 index 0000000..06b2210 --- /dev/null +++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.count.rst @@ -0,0 +1,6 @@ +eland.groupby.DataFrameGroupBy.count +==================================== + +.. currentmodule:: eland.groupby + +.. 
automethod:: DataFrameGroupBy.count diff --git a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.mad.rst b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.mad.rst new file mode 100644 index 0000000..d7e0063 --- /dev/null +++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.mad.rst @@ -0,0 +1,6 @@ +eland.groupby.DataFrameGroupBy.mad +================================== + +.. currentmodule:: eland.groupby + +.. automethod:: DataFrameGroupBy.mad diff --git a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.max.rst b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.max.rst new file mode 100644 index 0000000..f0b1d41 --- /dev/null +++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.max.rst @@ -0,0 +1,6 @@ +eland.groupby.DataFrameGroupBy.max +================================== + +.. currentmodule:: eland.groupby + +.. automethod:: DataFrameGroupBy.max diff --git a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.mean.rst b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.mean.rst new file mode 100644 index 0000000..97ac3b2 --- /dev/null +++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.mean.rst @@ -0,0 +1,6 @@ +eland.groupby.DataFrameGroupBy.mean +=================================== + +.. currentmodule:: eland.groupby + +.. automethod:: DataFrameGroupBy.mean diff --git a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.median.rst b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.median.rst new file mode 100644 index 0000000..651c4f2 --- /dev/null +++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.median.rst @@ -0,0 +1,6 @@ +eland.groupby.DataFrameGroupBy.median +===================================== + +.. currentmodule:: eland.groupby + +.. 
automethod:: DataFrameGroupBy.median diff --git a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.min.rst b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.min.rst new file mode 100644 index 0000000..f0aabd5 --- /dev/null +++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.min.rst @@ -0,0 +1,6 @@ +eland.groupby.DataFrameGroupBy.min +================================== + +.. currentmodule:: eland.groupby + +.. automethod:: DataFrameGroupBy.min diff --git a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.nunique.rst b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.nunique.rst new file mode 100644 index 0000000..613574b --- /dev/null +++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.nunique.rst @@ -0,0 +1,6 @@ +eland.groupby.DataFrameGroupBy.nunique +====================================== + +.. currentmodule:: eland.groupby + +.. automethod:: DataFrameGroupBy.nunique diff --git a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.rst b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.rst new file mode 100644 index 0000000..858c005 --- /dev/null +++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.rst @@ -0,0 +1,15 @@ +eland.groupby.DataFrameGroupBy +============================== + +.. currentmodule:: eland.groupby + +.. autoclass:: DataFrameGroupBy + + +.. + HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + + DataFrame.abs + DataFrame.add diff --git a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.std.rst b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.std.rst new file mode 100644 index 0000000..4f92370 --- /dev/null +++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.std.rst @@ -0,0 +1,6 @@ +eland.groupby.DataFrameGroupBy.std +================================== + +.. currentmodule:: eland.groupby + +.. 
automethod:: DataFrameGroupBy.std diff --git a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.sum.rst b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.sum.rst new file mode 100644 index 0000000..56fb8cf --- /dev/null +++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.sum.rst @@ -0,0 +1,6 @@ +eland.groupby.DataFrameGroupBy.sum +================================== + +.. currentmodule:: eland.groupby + +.. automethod:: DataFrameGroupBy.sum diff --git a/docs/source/reference/api/eland.groupby.DataFrameGroupBy.var.rst b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.var.rst new file mode 100644 index 0000000..cac346a --- /dev/null +++ b/docs/source/reference/api/eland.groupby.DataFrameGroupBy.var.rst @@ -0,0 +1,6 @@ +eland.groupby.DataFrameGroupBy.var +================================== + +.. currentmodule:: eland.groupby + +.. automethod:: DataFrameGroupBy.var diff --git a/docs/source/reference/api/eland.groupby.GroupBy.rst b/docs/source/reference/api/eland.groupby.GroupBy.rst new file mode 100644 index 0000000..fc9872e --- /dev/null +++ b/docs/source/reference/api/eland.groupby.GroupBy.rst @@ -0,0 +1,15 @@ +eland.groupby.GroupBy +===================== + +.. currentmodule:: eland.groupby + +.. autoclass:: GroupBy + + +.. + HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + + DataFrame.abs + DataFrame.add diff --git a/docs/source/reference/dataframe.rst b/docs/source/reference/dataframe.rst index cc05497..e21142d 100644 --- a/docs/source/reference/dataframe.rst +++ b/docs/source/reference/dataframe.rst @@ -48,6 +48,28 @@ Function Application, GroupBy & Window DataFrame.aggregate DataFrame.groupby +.. currentmodule:: eland.groupby + +.. 
autosummary:: + :toctree: api/ + + DataFrameGroupBy + DataFrameGroupBy.agg + DataFrameGroupBy.aggregate + DataFrameGroupBy.count + DataFrameGroupBy.mad + DataFrameGroupBy.max + DataFrameGroupBy.mean + DataFrameGroupBy.median + DataFrameGroupBy.min + DataFrameGroupBy.nunique + DataFrameGroupBy.std + DataFrameGroupBy.sum + DataFrameGroupBy.var + GroupBy + +.. currentmodule:: eland + +.. _api.dataframe.stats: Computations / Descriptive Stats diff --git a/docs/source/reference/general_utility_functions.rst b/docs/source/reference/general_utility_functions.rst index 030a740..561934c 100644 --- a/docs/source/reference/general_utility_functions.rst +++ b/docs/source/reference/general_utility_functions.rst @@ -1,7 +1,7 @@ .. _api.general_utility_functions: ========================= -General utility functions +General Utility Functions ========================= .. currentmodule:: eland diff --git a/docs/source/reference/supported_apis.rst b/docs/source/reference/supported_apis.rst index 25c80a8..ee7773a 100644 --- a/docs/source/reference/supported_apis.rst +++ b/docs/source/reference/supported_apis.rst @@ -170,7 +170,7 @@ script instead of being modified manually. 
+---------------------------------------+------------+ | ``ed.DataFrame.get()`` | **Yes** | +---------------------------------------+------------+ -| ``ed.DataFrame.groupby()`` | No | +| ``ed.DataFrame.groupby()`` | **Yes** | +---------------------------------------+------------+ | ``ed.DataFrame.gt()`` | No | +---------------------------------------+------------+ diff --git a/eland/dataframe.py b/eland/dataframe.py index 87a60dc..550c7e4 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -36,7 +36,7 @@ from pandas.util._validators import validate_bool_kwarg import eland.plotting as gfx from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter from eland.filter import BooleanFilter -from eland.groupby import GroupByDataFrame +from eland.groupby import DataFrameGroupBy from eland.ndframe import NDFrame from eland.series import Series from eland.utils import deprecated_api, is_valid_attr_name @@ -1433,7 +1433,7 @@ class DataFrame(NDFrame): def groupby( self, by: Optional[Union[str, List[str]]] = None, dropna: bool = True - ) -> "GroupByDataFrame": + ) -> "DataFrameGroupBy": """ Used to perform groupby operations @@ -1448,7 +1448,7 @@ class DataFrame(NDFrame): Returns ------- - GroupByDataFrame + eland.groupby.DataFrameGroupBy See Also -------- @@ -1520,7 +1520,7 @@ class DataFrame(NDFrame): f"Requested columns {repr(remaining_columns)[1:-1]} not in the DataFrame" ) - return GroupByDataFrame( + return DataFrameGroupBy( by=by, query_compiler=self._query_compiler.copy(), dropna=dropna ) diff --git a/eland/groupby.py b/eland/groupby.py index 97f9661..71eee10 100644 --- a/eland/groupby.py +++ b/eland/groupby.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-from typing import TYPE_CHECKING, List +from typing import TYPE_CHECKING, List, Optional, Union from eland.query_compiler import QueryCompiler @@ -25,16 +25,7 @@ if TYPE_CHECKING: class GroupBy: """ - Base class for calls to X.groupby([...]) - - Parameters - ---------- - by: - List of columns to groupby - query_compiler: - Query compiler object - dropna: - default is true, drop None/NaT/NaN values while grouping + Base class for calls to :py:func:`eland.DataFrame.groupby` """ def __init__( @@ -47,7 +38,56 @@ class GroupBy: self._dropna: bool = dropna self._by: List[str] = by + +class DataFrameGroupBy(GroupBy): + """ + This holds all the groupby methods for :py:func:`eland.DataFrame.groupby` + """ + def mean(self, numeric_only: bool = True) -> "pd.DataFrame": + """ + Compute the mean value for each group. + + Parameters + ---------- + numeric_only: {True, False, None} Default is True + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + + Returns + ------- + pandas.DataFrame + mean value for each numeric column of each group + + See Also + -------- + :pandas_api_docs:`pandas.core.groupby.GroupBy.mean` + + Examples + -------- + >>> df = ed.DataFrame( + ... "localhost", "flights", + ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"] + ... ) + >>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE + AvgTicketPrice Cancelled dayOfWeek timestamp + DestCountry + AE 605.132970 0.152174 2.695652 2018-01-21 16:58:07.891304443 + AR 674.827252 0.147541 2.744262 2018-01-21 22:18:06.593442627 + AT 646.650530 0.175066 2.872679 2018-01-21 15:54:42.469496094 + AU 669.558832 0.129808 2.843750 2018-01-22 02:28:39.199519287 + CA 648.747109 0.134534 2.951271 2018-01-22 14:40:47.165254150 + ... ... ... ... 
... + RU 662.994963 0.131258 2.832206 2018-01-21 07:11:16.534506104 + SE 660.612988 0.149020 2.682353 2018-01-22 07:48:23.447058838 + TR 485.253247 0.100000 1.900000 2018-01-16 16:02:33.000000000 + US 595.774391 0.125315 2.753900 2018-01-21 16:55:04.456970215 + ZA 643.053057 0.148410 2.766784 2018-01-22 15:17:56.141342773 + + [32 rows x 4 columns] + """ return self._query_compiler.aggs_groupby( by=self._by, pd_aggs=["mean"], @@ -56,6 +96,49 @@ class GroupBy: ) def var(self, numeric_only: bool = True) -> "pd.DataFrame": + """ + Compute the variance value for each group. + + Parameters + ---------- + numeric_only: {True, False, None} Default is True + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + + Returns + ------- + pandas.DataFrame + variance value for each numeric column of each group + + See Also + -------- + :pandas_api_docs:`pandas.core.groupby.GroupBy.var` + + Examples + -------- + >>> df = ed.DataFrame( + ... "localhost", "flights", + ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"] + ... ) + >>> df.groupby("DestCountry").var() # doctest: +NORMALIZE_WHITESPACE + AvgTicketPrice Cancelled dayOfWeek + DestCountry + AE 75789.979090 0.130443 3.950549 + AR 59683.055316 0.125979 3.783429 + AT 65726.669676 0.144610 4.090013 + AU 65088.483446 0.113094 3.833562 + CA 68149.950516 0.116496 3.688139 + ... ... ... ... 
+ RU 67305.277617 0.114107 3.852666 + SE 53740.570338 0.127062 3.942132 + TR 61245.521047 0.094868 4.100420 + US 74349.939410 0.109638 3.758700 + ZA 62920.072901 0.126608 3.775609 + + [32 rows x 3 columns] + """ return self._query_compiler.aggs_groupby( by=self._by, pd_aggs=["var"], @@ -64,6 +147,49 @@ class GroupBy: ) def std(self, numeric_only: bool = True) -> "pd.DataFrame": + """ + Compute the standard deviation value for each group. + + Parameters + ---------- + numeric_only: {True, False, None} Default is True + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + + Returns + ------- + pandas.DataFrame + standard deviation value for each numeric column of each group + + See Also + -------- + :pandas_api_docs:`pandas.core.groupby.GroupBy.std` + + Examples + -------- + >>> df = ed.DataFrame( + ... "localhost", "flights", + ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"] + ... ) + >>> df.groupby("DestCountry").std() # doctest: +NORMALIZE_WHITESPACE + AvgTicketPrice Cancelled dayOfWeek + DestCountry + AE 279.875500 0.367171 2.020634 + AR 244.903626 0.355811 1.949901 + AT 256.883342 0.381035 2.026411 + AU 255.585377 0.336902 1.961486 + CA 261.263054 0.341587 1.921980 + ... ... ... ... + RU 259.696213 0.338140 1.964815 + SE 232.504297 0.357510 1.991340 + TR 267.827572 0.333333 2.191454 + US 272.774819 0.331242 1.939469 + ZA 251.505568 0.356766 1.948258 + + [32 rows x 3 columns] + """ return self._query_compiler.aggs_groupby( by=self._by, pd_aggs=["std"], @@ -72,6 +198,49 @@ class GroupBy: ) def mad(self, numeric_only: bool = True) -> "pd.DataFrame": + """ + Compute the median absolute deviation value for each group. 
+ + Parameters + ---------- + numeric_only: {True, False, None} Default is True + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + + Returns + ------- + pandas.DataFrame + median absolute deviation value for each numeric column of each group + + See Also + -------- + :pandas_api_docs:`pandas.core.groupby.GroupBy.mad` + + Examples + -------- + >>> df = ed.DataFrame( + ... "localhost", "flights", + ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"] + ... ) + >>> df.groupby("DestCountry").mad() # doctest: +SKIP + AvgTicketPrice Cancelled dayOfWeek + DestCountry + AE 233.697174 NaN 1.5 + AR 189.250061 NaN 2.0 + AT 195.823669 NaN 2.0 + AU 202.539764 NaN 2.0 + CA 203.344696 NaN 2.0 + ... ... ... ... + RU 206.431702 NaN 2.0 + SE 178.658447 NaN 2.0 + TR 221.863434 NaN 1.0 + US 228.461365 NaN 2.0 + ZA 192.162842 NaN 2.0 + + [32 rows x 3 columns] + """ return self._query_compiler.aggs_groupby( by=self._by, pd_aggs=["mad"], @@ -80,6 +249,49 @@ class GroupBy: ) def median(self, numeric_only: bool = True) -> "pd.DataFrame": + """ + Compute the median value for each group. + + Parameters + ---------- + numeric_only: {True, False, None} Default is True + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + + Returns + ------- + pandas.DataFrame + median value for each numeric column of each group + + See Also + -------- + :pandas_api_docs:`pandas.core.groupby.GroupBy.median` + + Examples + -------- + >>> df = ed.DataFrame( + ... "localhost", "flights", + ... 
columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"] + ... ) + >>> df.groupby("DestCountry").median(numeric_only=False) # doctest: +SKIP + AvgTicketPrice Cancelled dayOfWeek timestamp + DestCountry + AE 585.720490 False 2 2018-01-19 23:56:44.000 + AR 678.447433 False 3 2018-01-22 10:18:50.000 + AT 659.715592 False 3 2018-01-20 20:40:10.000 + AU 689.241348 False 3 2018-01-22 18:46:11.000 + CA 663.516057 False 3 2018-01-22 21:35:09.500 + ... ... ... ... ... + RU 670.714956 False 3 2018-01-20 16:48:16.000 + SE 680.111084 False 3 2018-01-22 20:53:44.000 + TR 441.681122 False 1 2018-01-13 23:17:27.000 + US 600.591525 False 3 2018-01-22 04:09:50.000 + ZA 633.935425 False 3 2018-01-23 17:42:57.000 + + [32 rows x 4 columns] + """ return self._query_compiler.aggs_groupby( by=self._by, pd_aggs=["median"], @@ -88,6 +300,49 @@ class GroupBy: ) def sum(self, numeric_only: bool = True) -> "pd.DataFrame": + """ + Compute the sum value for each group. + + Parameters + ---------- + numeric_only: {True, False, None} Default is True + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + + Returns + ------- + pandas.DataFrame + sum value for each numeric column of each group + + See Also + -------- + :pandas_api_docs:`pandas.core.groupby.GroupBy.sum` + + Examples + -------- + >>> df = ed.DataFrame( + ... "localhost", "flights", + ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"] + ... ) + >>> df.groupby("DestCountry").sum() # doctest: +NORMALIZE_WHITESPACE + AvgTicketPrice Cancelled dayOfWeek + DestCountry + AE 2.783612e+04 7.0 124.0 + AR 2.058223e+05 45.0 837.0 + AT 2.437872e+05 66.0 1083.0 + AU 2.785365e+05 54.0 1183.0 + CA 6.124173e+05 127.0 2786.0 + ... ... ... ... 
+ RU 4.899533e+05 97.0 2093.0 + SE 1.684563e+05 38.0 684.0 + TR 4.852532e+03 1.0 19.0 + US 1.183804e+06 249.0 5472.0 + ZA 1.819840e+05 42.0 783.0 + + [32 rows x 3 columns] + """ return self._query_compiler.aggs_groupby( by=self._by, pd_aggs=["sum"], @@ -96,6 +351,49 @@ class GroupBy: ) def min(self, numeric_only: bool = True) -> "pd.DataFrame": + """ + Compute the min value for each group. + + Parameters + ---------- + numeric_only: {True, False, None} Default is True + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + + Returns + ------- + pandas.DataFrame + min value for each numeric column of each group + + See Also + -------- + :pandas_api_docs:`pandas.core.groupby.GroupBy.min` + + Examples + -------- + >>> df = ed.DataFrame( + ... "localhost", "flights", + ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"] + ... ) + >>> df.groupby("DestCountry").min(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE + AvgTicketPrice Cancelled dayOfWeek timestamp + DestCountry + AE 110.799911 False 0 2018-01-01 19:31:30 + AR 125.589394 False 0 2018-01-01 01:30:47 + AT 100.020531 False 0 2018-01-01 05:24:19 + AU 102.294312 False 0 2018-01-01 00:00:00 + CA 100.557251 False 0 2018-01-01 00:44:08 + ... ... ... ... ... + RU 101.004005 False 0 2018-01-01 01:01:51 + SE 102.877190 False 0 2018-01-01 04:09:38 + TR 142.876465 False 0 2018-01-01 06:45:17 + US 100.145966 False 0 2018-01-01 00:06:27 + ZA 102.002663 False 0 2018-01-01 06:44:44 + + [32 rows x 4 columns] + """ return self._query_compiler.aggs_groupby( by=self._by, pd_aggs=["min"], @@ -104,6 +402,49 @@ class GroupBy: ) def max(self, numeric_only: bool = True) -> "pd.DataFrame": + """ + Compute the max value for each group. 
+ + Parameters + ---------- + numeric_only: {True, False, None} Default is True + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + + Returns + ------- + pandas.DataFrame + max value for each numeric column of each group + + See Also + -------- + :pandas_api_docs:`pandas.core.groupby.GroupBy.max` + + Examples + -------- + >>> df = ed.DataFrame( + ... "localhost", "flights", + ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"] + ... ) + >>> df.groupby("DestCountry").max(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE + AvgTicketPrice Cancelled dayOfWeek timestamp + DestCountry + AE 1126.148682 True 6 2018-02-11 04:11:14 + AR 1199.642822 True 6 2018-02-11 17:09:05 + AT 1181.835815 True 6 2018-02-11 23:12:33 + AU 1197.632690 True 6 2018-02-11 21:39:01 + CA 1198.852539 True 6 2018-02-11 23:04:08 + ... ... ... ... ... + RU 1196.742310 True 6 2018-02-11 20:03:31 + SE 1198.621582 True 6 2018-02-11 22:06:14 + TR 855.935547 True 6 2018-02-04 01:59:23 + US 1199.729004 True 6 2018-02-11 23:27:00 + ZA 1196.186157 True 6 2018-02-11 23:29:45 + + [32 rows x 4 columns] + """ return self._query_compiler.aggs_groupby( by=self._by, pd_aggs=["max"], @@ -112,6 +453,49 @@ class GroupBy: ) def nunique(self) -> "pd.DataFrame": + """ + Compute the nunique value for each group. 
+ + Parameters + ---------- + numeric_only: {True, False, None} Default is True + Which datatype to be returned + - True: Returns all values as float64, NaN/NaT values are removed + - None: Returns all values as the same dtype where possible, NaN/NaT are removed + - False: Returns all values as the same dtype where possible, NaN/NaT are preserved + + Returns + ------- + pandas.DataFrame + nunique value for each numeric column of each group + + See Also + -------- + :pandas_api_docs:`pandas.core.groupby.GroupBy.nunique` + + Examples + -------- + >>> df = ed.DataFrame( + ... "localhost", "flights", + ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"] + ... ) + >>> df.groupby("DestCountry").nunique() # doctest: +NORMALIZE_WHITESPACE + AvgTicketPrice Cancelled dayOfWeek + DestCountry + AE 46 2 7 + AR 305 2 7 + AT 377 2 7 + AU 416 2 7 + CA 944 2 7 + ... ... ... ... + RU 739 2 7 + SE 255 2 7 + TR 10 2 5 + US 1987 2 7 + ZA 283 2 7 + + [32 rows x 3 columns] + """ return self._query_compiler.aggs_groupby( by=self._by, pd_aggs=["nunique"], @@ -119,22 +503,9 @@ class GroupBy: numeric_only=False, ) - -class GroupByDataFrame(GroupBy): - """ - This holds all the groupby methods for DataFrame - - Parameters - ---------- - by: - List of columns to groupby - query_compiler: - Query compiler object - dropna: - default is true, drop None/NaT/NaN values while grouping - """ - - def aggregate(self, func: List[str], numeric_only: bool = False) -> "pd.DataFrame": + def aggregate( + self, func: Union[str, List[str]], numeric_only: Optional[bool] = False + ) -> "pd.DataFrame": """ Used to groupby and aggregate @@ -155,8 +526,36 @@ class GroupByDataFrame(GroupBy): Returns ------- - A Pandas DataFrame + pandas.DataFrame + aggregation value for each numeric column of each group + See Also + -------- + :pandas_api_docs:`pandas.core.groupby.GroupBy.aggregate` + + Examples + -------- + >>> df = ed.DataFrame( + ... "localhost", "flights", + ... 
columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"] + ... ) + >>> df.groupby("DestCountry").aggregate(["min", "max"]) # doctest: +NORMALIZE_WHITESPACE + AvgTicketPrice ... dayOfWeek + min max ... min max + DestCountry ... + AE 110.799911 1126.148682 ... 0 6 + AR 125.589394 1199.642822 ... 0 6 + AT 100.020531 1181.835815 ... 0 6 + AU 102.294312 1197.632690 ... 0 6 + CA 100.557251 1198.852539 ... 0 6 + ... ... ... ... ... .. + RU 101.004005 1196.742310 ... 0 6 + SE 102.877190 1198.621582 ... 0 6 + TR 142.876465 855.935547 ... 0 6 + US 100.145966 1199.729004 ... 0 6 + ZA 102.002663 1196.186157 ... 0 6 + + [32 rows x 6 columns] """ # Controls whether a MultiIndex is used for the # columns of the result DataFrame. @@ -177,12 +576,39 @@ class GroupByDataFrame(GroupBy): def count(self) -> "pd.DataFrame": """ - Used to groupby and count + Compute the count value for each group. Returns ------- - A Pandas DataFrame + pandas.DataFrame + count value for each numeric column of each group + See Also + -------- + :pandas_api_docs:`pandas.core.groupby.GroupBy.count` + + Examples + -------- + >>> df = ed.DataFrame( + ... "localhost", "flights", + ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"] + ... ) + >>> df.groupby("DestCountry").count() # doctest: +NORMALIZE_WHITESPACE + AvgTicketPrice Cancelled dayOfWeek + DestCountry + AE 46 46 46 + AR 305 305 305 + AT 377 377 377 + AU 416 416 416 + CA 944 944 944 + ... ... ... ... 
+ RU 739 739 739 + SE 255 255 255 + TR 10 10 10 + US 1987 1987 1987 + ZA 283 283 283 + + [32 rows x 3 columns] """ return self._query_compiler.aggs_groupby( by=self._by, diff --git a/eland/operations.py b/eland/operations.py index acb93c9..909ea0c 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -545,7 +545,7 @@ class Operations: pd_aggs: List[str], dropna: bool = True, is_dataframe_agg: bool = False, - numeric_only: bool = True, + numeric_only: Optional[bool] = True, ) -> pd.DataFrame: """ This method is used to construct groupby aggregation dataframe @@ -570,15 +570,98 @@ class Operations: ------- A dataframe which consists groupby data """ - headers, results = self._groupby_aggs( - query_compiler, - by=by, - pd_aggs=pd_aggs, - dropna=dropna, - is_dataframe_agg=is_dataframe_agg, - numeric_only=numeric_only, + query_params, post_processing = self._resolve_tasks(query_compiler) + + size = self._size(query_params, post_processing) + if size is not None: + raise NotImplementedError( + f"Can not count field matches if size is set {size}" + ) + + by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by) + + # Used defaultdict to avoid initialization of columns with lists + results: Dict[str, List[Any]] = defaultdict(list) + + if numeric_only: + agg_fields = [ + field for field in agg_fields if (field.is_numeric or field.is_bool) + ] + + body = Query(query_params.query) + + # To return for creating multi-index on columns + headers = [agg_field.column for agg_field in agg_fields] + + # Convert pandas aggs to ES equivalent + es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs) + + # Construct Query + for by_field in by_fields: + # groupby fields will be term aggregations + body.composite_agg_bucket_terms( + name=f"groupby_{by_field.column}", + field=by_field.aggregatable_es_field_name, + ) + + for agg_field in agg_fields: + for es_agg in es_aggs: + # Skip if the field isn't compatible or if the agg is + # 'value_count' as this value is pulled 
from bucket.doc_count. + if not agg_field.is_es_agg_compatible(es_agg): + continue + + # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call + if isinstance(es_agg, tuple): + body.metric_aggs( + f"{es_agg[0]}_{agg_field.es_field_name}", + es_agg[0], + agg_field.aggregatable_es_field_name, + ) + else: + body.metric_aggs( + f"{es_agg}_{agg_field.es_field_name}", + es_agg, + agg_field.aggregatable_es_field_name, + ) + + # Composite aggregation + body.composite_agg_start( + size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna ) + for buckets in self.bucket_generator(query_compiler, body): + # We receive response row-wise + for bucket in buckets: + # groupby columns are added to result same way they are returned + for by_field in by_fields: + bucket_key = bucket["key"][f"groupby_{by_field.column}"] + + # Datetimes always come back as integers, convert to pd.Timestamp() + if by_field.is_timestamp and isinstance(bucket_key, int): + bucket_key = pd.to_datetime(bucket_key, unit="ms") + + results[by_field.column].append(bucket_key) + + agg_calculation = self._unpack_metric_aggs( + fields=agg_fields, + es_aggs=es_aggs, + pd_aggs=pd_aggs, + response={"aggregations": bucket}, + numeric_only=numeric_only, + # We set 'True' here because we want the value + # unpacking to always be in 'dataframe' mode.
+ is_dataframe_agg=True, + ) + + # Process the calculated agg values to response + for key, value in agg_calculation.items(): + if not isinstance(value, list): + results[key].append(value) + continue + for pd_agg, val in zip(pd_aggs, value): + results[f"{key}_{pd_agg}"].append(val) + agg_df = pd.DataFrame(results).set_index(by) if is_dataframe_agg: @@ -636,146 +719,6 @@ class Operations: else: return composite_buckets["buckets"] - def _groupby_aggs( - self, - query_compiler: "QueryCompiler", - by: List[str], - pd_aggs: List[str], - dropna: bool = True, - is_dataframe_agg: bool = False, - numeric_only: bool = True, - ) -> Tuple[List[str], Dict[str, Any]]: - """ - This method is used to calculate groupby aggregations - - Parameters - ---------- - query_compiler: - A Query compiler - by: - a list of columns on which groupby operations have to be performed - pd_aggs: - a list of aggregations to be performed - dropna: - Drop None values if True. - TODO Not yet implemented - is_dataframe_agg: - Know if multi aggregation or single agg is called. 
- numeric_only: - return either numeric values or NaN/NaT - - Returns - ------- - headers: columns on which MultiIndex has to be applied - response: dictionary of groupby aggregated values - """ - query_params, post_processing = self._resolve_tasks(query_compiler) - - size = self._size(query_params, post_processing) - if size is not None: - raise NotImplementedError( - f"Can not count field matches if size is set {size}" - ) - - by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by) - - # Used defaultdict to avoid initialization of columns with lists - response: Dict[str, List[Any]] = defaultdict(list) - - if numeric_only: - agg_fields = [ - field for field in agg_fields if (field.is_numeric or field.is_bool) - ] - - body = Query(query_params.query) - - # To return for creating multi-index on columns - headers = [field.column for field in agg_fields] - - # Convert pandas aggs to ES equivalent - es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs) - - # pd_agg 'count' is handled via 'doc_count' from buckets - using_pd_agg_count = "count" in pd_aggs - - # Construct Query - for by_field in by_fields: - # groupby fields will be term aggregations - body.composite_agg_bucket_terms( - name=f"groupby_{by_field.column}", - field=by_field.aggregatable_es_field_name, - ) - - for agg_field in agg_fields: - for es_agg in es_aggs: - # Skip if the field isn't compatible or if the agg is - # 'value_count' as this value is pulled from bucket.doc_count. - if ( - not agg_field.is_es_agg_compatible(es_agg) - or es_agg == "value_count" - ): - continue - - # If we have multiple 'extended_stats' etc. 
here we simply NOOP on 2nd call - if isinstance(es_agg, tuple): - body.metric_aggs( - f"{es_agg[0]}_{agg_field.es_field_name}", - es_agg[0], - agg_field.aggregatable_es_field_name, - ) - else: - body.metric_aggs( - f"{es_agg}_{agg_field.es_field_name}", - es_agg, - agg_field.aggregatable_es_field_name, - ) - - # Composite aggregation - body.composite_agg_start( - size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna - ) - - for buckets in self.bucket_generator(query_compiler, body): - # We recieve response row-wise - for bucket in buckets: - # groupby columns are added to result same way they are returned - for by_field in by_fields: - bucket_key = bucket["key"][f"groupby_{by_field.column}"] - - # Datetimes always come back as integers, convert to pd.Timestamp() - if by_field.is_timestamp and isinstance(bucket_key, int): - bucket_key = pd.to_datetime(bucket_key, unit="ms") - - response[by_field.column].append(bucket_key) - - # Put 'doc_count' from bucket into each 'agg_field' - # to be extracted from _unpack_metric_aggs() - if using_pd_agg_count: - doc_count = bucket["doc_count"] - for agg_field in agg_fields: - bucket[f"value_count_{agg_field.es_field_name}"] = { - "value": doc_count - } - - agg_calculation = self._unpack_metric_aggs( - fields=agg_fields, - es_aggs=es_aggs, - pd_aggs=pd_aggs, - response={"aggregations": bucket}, - numeric_only=numeric_only, - is_dataframe_agg=is_dataframe_agg, - ) - - # Process the calculated agg values to response - for key, value in agg_calculation.items(): - if not isinstance(value, list): - response[key].append(value) - continue - for pd_agg, val in zip(pd_aggs, value): - response[f"{key}_{pd_agg}"].append(val) - - return headers, response - @staticmethod def _map_pd_aggs_to_es_aggs(pd_aggs): """ diff --git a/eland/query_compiler.py b/eland/query_compiler.py index c1395fd..85b2c66 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -556,7 +556,7 @@ class QueryCompiler: pd_aggs: List[str], 
dropna: bool = True, is_dataframe_agg: bool = False, - numeric_only: bool = True, + numeric_only: Optional[bool] = True, ) -> pd.DataFrame: return self._operations.aggs_groupby( self, diff --git a/eland/tests/dataframe/test_groupby_pytest.py b/eland/tests/dataframe/test_groupby_pytest.py index 8027caf..dd0afad 100644 --- a/eland/tests/dataframe/test_groupby_pytest.py +++ b/eland/tests/dataframe/test_groupby_pytest.py @@ -176,3 +176,21 @@ class TestGroupbyDataFrame(TestData): assert_index_equal(pd_count.index, ed_count.index) assert_frame_equal(pd_count, ed_count) assert_series_equal(pd_count.dtypes, ed_count.dtypes) + + def test_groupby_dataframe_mad(self): + pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"]) + ed_flights = self.ed_flights().filter(self.filter_data + ["DestCountry"]) + + pd_mad = pd_flights.groupby("DestCountry").mad() + ed_mad = ed_flights.groupby("DestCountry").mad() + + assert_index_equal(pd_mad.columns, ed_mad.columns) + assert_index_equal(pd_mad.index, ed_mad.index) + assert_series_equal(pd_mad.dtypes, ed_mad.dtypes) + + pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"]) + ed_min_mad = ed_flights.groupby("DestCountry").aggregate(["min", "mad"]) + + assert_index_equal(pd_min_mad.columns, ed_min_mad.columns) + assert_index_equal(pd_min_mad.index, ed_min_mad.index) + assert_series_equal(pd_min_mad.dtypes, ed_min_mad.dtypes)