Document DataFrame.groupby() methods

Seth Michael Larson 2020-10-27 10:10:57 -05:00 committed by GitHub
parent 475e0f41ef
commit ae70f03df3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 696 additions and 185 deletions
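
In use, the API this commit documents looks like the following minimal sketch, which assumes a local Elasticsearch node with the example "flights" index loaded (the same assumption the doctests below make):

import eland as ed

# Wrap the "flights" index on a local cluster (assumed to exist,
# matching the doctests in this commit).
df = ed.DataFrame("localhost", "flights")

# DataFrame.groupby() returns an eland.groupby.DataFrameGroupBy whose
# methods push each aggregation down to Elasticsearch.
grouped = df.groupby("DestCountry")
print(grouped.mean(numeric_only=True))    # single aggregation
print(grouped.aggregate(["min", "max"]))  # multiple aggregations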

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.agg
==================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.agg

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.aggregate
========================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.aggregate

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.count
====================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.count

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.mad
==================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.mad

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.max
==================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.max

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.mean
===================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.mean

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.median
=====================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.median

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.min
==================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.min

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.nunique
======================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.nunique

View File

@ -0,0 +1,15 @@
eland.groupby.DataFrameGroupBy
==============================
.. currentmodule:: eland.groupby
.. autoclass:: DataFrameGroupBy
..
HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.
.. autosummary::
:toctree:
DataFrameGroupBy.agg
DataFrameGroupBy.aggregate
DataFrameGroupBy.count
DataFrameGroupBy.mad
DataFrameGroupBy.max
DataFrameGroupBy.mean
DataFrameGroupBy.median
DataFrameGroupBy.min
DataFrameGroupBy.nunique
DataFrameGroupBy.std
DataFrameGroupBy.sum
DataFrameGroupBy.var

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.std
==================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.std

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.sum
==================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.sum

View File

@ -0,0 +1,6 @@
eland.groupby.DataFrameGroupBy.var
==================================
.. currentmodule:: eland.groupby
.. automethod:: DataFrameGroupBy.var

View File

@ -0,0 +1,15 @@
eland.groupby.GroupBy
=====================
.. currentmodule:: eland.groupby
.. autoclass:: GroupBy
..
HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.
.. autosummary::
:toctree:
DataFrameGroupBy.agg
DataFrameGroupBy.aggregate
DataFrameGroupBy.count
DataFrameGroupBy.mad
DataFrameGroupBy.max
DataFrameGroupBy.mean
DataFrameGroupBy.median
DataFrameGroupBy.min
DataFrameGroupBy.nunique
DataFrameGroupBy.std
DataFrameGroupBy.sum
DataFrameGroupBy.var

View File

@ -48,6 +48,28 @@ Function Application, GroupBy & Window
DataFrame.aggregate
DataFrame.groupby
.. currentmodule:: eland.groupby
.. autosummary::
:toctree: api/
DataFrameGroupBy
DataFrameGroupBy.agg
DataFrameGroupBy.aggregate
DataFrameGroupBy.count
DataFrameGroupBy.mad
DataFrameGroupBy.max
DataFrameGroupBy.mean
DataFrameGroupBy.median
DataFrameGroupBy.min
DataFrameGroupBy.nunique
DataFrameGroupBy.std
DataFrameGroupBy.sum
DataFrameGroupBy.var
GroupBy
.. currentmodule:: eland
.. _api.dataframe.stats:
Computations / Descriptive Stats

View File

@ -1,7 +1,7 @@
.. _api.general_utility_functions:
=========================
General utility functions
General Utility Functions
=========================
.. currentmodule:: eland

View File

@ -170,7 +170,7 @@ script instead of being modified manually.
+---------------------------------------+------------+
| ``ed.DataFrame.get()`` | **Yes** |
+---------------------------------------+------------+
| ``ed.DataFrame.groupby()`` | No |
| ``ed.DataFrame.groupby()`` | **Yes** |
+---------------------------------------+------------+
| ``ed.DataFrame.gt()`` | No |
+---------------------------------------+------------+

View File

@ -36,7 +36,7 @@ from pandas.util._validators import validate_bool_kwarg
import eland.plotting as gfx
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
from eland.filter import BooleanFilter
from eland.groupby import GroupByDataFrame
from eland.groupby import DataFrameGroupBy
from eland.ndframe import NDFrame
from eland.series import Series
from eland.utils import deprecated_api, is_valid_attr_name
@ -1433,7 +1433,7 @@ class DataFrame(NDFrame):
def groupby(
self, by: Optional[Union[str, List[str]]] = None, dropna: bool = True
) -> "GroupByDataFrame":
) -> "DataFrameGroupBy":
"""
Used to perform groupby operations
@ -1448,7 +1448,7 @@ class DataFrame(NDFrame):
Returns
-------
GroupByDataFrame
eland.groupby.DataFrameGroupBy
See Also
--------
@ -1520,7 +1520,7 @@ class DataFrame(NDFrame):
f"Requested columns {repr(remaining_columns)[1:-1]} not in the DataFrame"
)
return GroupByDataFrame(
return DataFrameGroupBy(
by=by, query_compiler=self._query_compiler.copy(), dropna=dropna
)
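As a quick sanity check of the rename (a sketch under the same local-cluster assumption as the doctests):

import eland as ed
from eland.groupby import DataFrameGroupBy

df = ed.DataFrame("localhost", "flights")

# Only the class name changed; groupby() itself behaves as before.
grouped = df.groupby(by=["DestCountry"], dropna=True)
assert isinstance(grouped, DataFrameGroupBy)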

View File

@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
from typing import TYPE_CHECKING, List
from typing import TYPE_CHECKING, List, Optional, Union
from eland.query_compiler import QueryCompiler
@ -25,16 +25,7 @@ if TYPE_CHECKING:
class GroupBy:
"""
Base class for calls to X.groupby([...])
Parameters
----------
by:
List of columns to groupby
query_compiler:
Query compiler object
dropna:
default is true, drop None/NaT/NaN values while grouping
Base class for calls to :py:func:`eland.DataFrame.groupby`
"""
def __init__(
@ -47,7 +38,56 @@ class GroupBy:
self._dropna: bool = dropna
self._by: List[str] = by
class DataFrameGroupBy(GroupBy):
"""
This holds all the groupby methods for :py:func:`eland.DataFrame.groupby`
"""
def mean(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the mean value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
mean value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.mean`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek timestamp
DestCountry
AE 605.132970 0.152174 2.695652 2018-01-21 16:58:07.891304443
AR 674.827252 0.147541 2.744262 2018-01-21 22:18:06.593442627
AT 646.650530 0.175066 2.872679 2018-01-21 15:54:42.469496094
AU 669.558832 0.129808 2.843750 2018-01-22 02:28:39.199519287
CA 648.747109 0.134534 2.951271 2018-01-22 14:40:47.165254150
... ... ... ... ...
RU 662.994963 0.131258 2.832206 2018-01-21 07:11:16.534506104
SE 660.612988 0.149020 2.682353 2018-01-22 07:48:23.447058838
TR 485.253247 0.100000 1.900000 2018-01-16 16:02:33.000000000
US 595.774391 0.125315 2.753900 2018-01-21 16:55:04.456970215
ZA 643.053057 0.148410 2.766784 2018-01-22 15:17:56.141342773
<BLANKLINE>
[32 rows x 4 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["mean"],
@ -56,6 +96,49 @@ class GroupBy:
)
def var(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the variance value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
variance value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.var`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").var() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 75789.979090 0.130443 3.950549
AR 59683.055316 0.125979 3.783429
AT 65726.669676 0.144610 4.090013
AU 65088.483446 0.113094 3.833562
CA 68149.950516 0.116496 3.688139
... ... ... ...
RU 67305.277617 0.114107 3.852666
SE 53740.570338 0.127062 3.942132
TR 61245.521047 0.094868 4.100420
US 74349.939410 0.109638 3.758700
ZA 62920.072901 0.126608 3.775609
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["var"],
@ -64,6 +147,49 @@ class GroupBy:
)
def std(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the standard deviation value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
standard deviation value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.std`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").std() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 279.875500 0.367171 2.020634
AR 244.903626 0.355811 1.949901
AT 256.883342 0.381035 2.026411
AU 255.585377 0.336902 1.961486
CA 261.263054 0.341587 1.921980
... ... ... ...
RU 259.696213 0.338140 1.964815
SE 232.504297 0.357510 1.991340
TR 267.827572 0.333333 2.191454
US 272.774819 0.331242 1.939469
ZA 251.505568 0.356766 1.948258
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["std"],
@ -72,6 +198,49 @@ class GroupBy:
)
def mad(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the median absolute deviation value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
median absolute deviation value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.mad`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").mad() # doctest: +SKIP
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 233.697174 NaN 1.5
AR 189.250061 NaN 2.0
AT 195.823669 NaN 2.0
AU 202.539764 NaN 2.0
CA 203.344696 NaN 2.0
... ... ... ...
RU 206.431702 NaN 2.0
SE 178.658447 NaN 2.0
TR 221.863434 NaN 1.0
US 228.461365 NaN 2.0
ZA 192.162842 NaN 2.0
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["mad"],
@ -80,6 +249,49 @@ class GroupBy:
)
def median(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the median value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
median value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.median`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").median(numeric_only=False) # doctest: +SKIP
AvgTicketPrice Cancelled dayOfWeek timestamp
DestCountry
AE 585.720490 False 2 2018-01-19 23:56:44.000
AR 678.447433 False 3 2018-01-22 10:18:50.000
AT 659.715592 False 3 2018-01-20 20:40:10.000
AU 689.241348 False 3 2018-01-22 18:46:11.000
CA 663.516057 False 3 2018-01-22 21:35:09.500
... ... ... ... ...
RU 670.714956 False 3 2018-01-20 16:48:16.000
SE 680.111084 False 3 2018-01-22 20:53:44.000
TR 441.681122 False 1 2018-01-13 23:17:27.000
US 600.591525 False 3 2018-01-22 04:09:50.000
ZA 633.935425 False 3 2018-01-23 17:42:57.000
<BLANKLINE>
[32 rows x 4 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["median"],
@ -88,6 +300,49 @@ class GroupBy:
)
def sum(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the sum value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
sum value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.sum`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").sum() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 2.783612e+04 7.0 124.0
AR 2.058223e+05 45.0 837.0
AT 2.437872e+05 66.0 1083.0
AU 2.785365e+05 54.0 1183.0
CA 6.124173e+05 127.0 2786.0
... ... ... ...
RU 4.899533e+05 97.0 2093.0
SE 1.684563e+05 38.0 684.0
TR 4.852532e+03 1.0 19.0
US 1.183804e+06 249.0 5472.0
ZA 1.819840e+05 42.0 783.0
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["sum"],
@ -96,6 +351,49 @@ class GroupBy:
)
def min(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the min value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
min value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.min`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").min(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek timestamp
DestCountry
AE 110.799911 False 0 2018-01-01 19:31:30
AR 125.589394 False 0 2018-01-01 01:30:47
AT 100.020531 False 0 2018-01-01 05:24:19
AU 102.294312 False 0 2018-01-01 00:00:00
CA 100.557251 False 0 2018-01-01 00:44:08
... ... ... ... ...
RU 101.004005 False 0 2018-01-01 01:01:51
SE 102.877190 False 0 2018-01-01 04:09:38
TR 142.876465 False 0 2018-01-01 06:45:17
US 100.145966 False 0 2018-01-01 00:06:27
ZA 102.002663 False 0 2018-01-01 06:44:44
<BLANKLINE>
[32 rows x 4 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["min"],
@ -104,6 +402,49 @@ class GroupBy:
)
def max(self, numeric_only: Optional[bool] = True) -> "pd.DataFrame":
"""
Compute the max value for each group.
Parameters
----------
numeric_only: {True, False, None}, default True
Controls the dtypes returned and how missing values are handled
- True: Returns all values as float64, NaN/NaT values are removed
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
Returns
-------
pandas.DataFrame
max value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.max`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").max(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek timestamp
DestCountry
AE 1126.148682 True 6 2018-02-11 04:11:14
AR 1199.642822 True 6 2018-02-11 17:09:05
AT 1181.835815 True 6 2018-02-11 23:12:33
AU 1197.632690 True 6 2018-02-11 21:39:01
CA 1198.852539 True 6 2018-02-11 23:04:08
... ... ... ... ...
RU 1196.742310 True 6 2018-02-11 20:03:31
SE 1198.621582 True 6 2018-02-11 22:06:14
TR 855.935547 True 6 2018-02-04 01:59:23
US 1199.729004 True 6 2018-02-11 23:27:00
ZA 1196.186157 True 6 2018-02-11 23:29:45
<BLANKLINE>
[32 rows x 4 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["max"],
@ -112,6 +453,49 @@ class GroupBy:
)
def nunique(self) -> "pd.DataFrame":
"""
Compute the number of unique values for each group.
Returns
-------
pandas.DataFrame
Number of unique values for each column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.nunique`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").nunique() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 46 2 7
AR 305 2 7
AT 377 2 7
AU 416 2 7
CA 944 2 7
... ... ... ...
RU 739 2 7
SE 255 2 7
TR 10 2 5
US 1987 2 7
ZA 283 2 7
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["nunique"],
@ -119,22 +503,9 @@ class GroupBy:
numeric_only=False,
)
class GroupByDataFrame(GroupBy):
"""
This holds all the groupby methods for DataFrame
Parameters
----------
by:
List of columns to groupby
query_compiler:
Query compiler object
dropna:
default is true, drop None/NaT/NaN values while grouping
"""
def aggregate(self, func: List[str], numeric_only: bool = False) -> "pd.DataFrame":
def aggregate(
self, func: Union[str, List[str]], numeric_only: Optional[bool] = False
) -> "pd.DataFrame":
"""
Used to groupby and aggregate
@ -155,8 +526,36 @@ class GroupByDataFrame(GroupBy):
Returns
-------
A Pandas DataFrame
pandas.DataFrame
aggregation value for each numeric column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.aggregate`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").aggregate(["min", "max"]) # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice ... dayOfWeek
min max ... min max
DestCountry ...
AE 110.799911 1126.148682 ... 0 6
AR 125.589394 1199.642822 ... 0 6
AT 100.020531 1181.835815 ... 0 6
AU 102.294312 1197.632690 ... 0 6
CA 100.557251 1198.852539 ... 0 6
... ... ... ... ... ..
RU 101.004005 1196.742310 ... 0 6
SE 102.877190 1198.621582 ... 0 6
TR 142.876465 855.935547 ... 0 6
US 100.145966 1199.729004 ... 0 6
ZA 102.002663 1196.186157 ... 0 6
<BLANKLINE>
[32 rows x 6 columns]
"""
# Controls whether a MultiIndex is used for the
# columns of the result DataFrame.
@ -177,12 +576,39 @@ class GroupByDataFrame(GroupBy):
def count(self) -> "pd.DataFrame":
"""
Used to groupby and count
Compute the count value for each group.
Returns
-------
A Pandas DataFrame
pandas.DataFrame
count value for each column of each group
See Also
--------
:pandas_api_docs:`pandas.core.groupby.GroupBy.count`
Examples
--------
>>> df = ed.DataFrame(
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
... )
>>> df.groupby("DestCountry").count() # doctest: +NORMALIZE_WHITESPACE
AvgTicketPrice Cancelled dayOfWeek
DestCountry
AE 46 46 46
AR 305 305 305
AT 377 377 377
AU 416 416 416
CA 944 944 944
... ... ... ...
RU 739 739 739
SE 255 255 255
TR 10 10 10
US 1987 1987 1987
ZA 283 283 283
<BLANKLINE>
[32 rows x 3 columns]
"""
return self._query_compiler.aggs_groupby(
by=self._by,

View File

@ -545,7 +545,7 @@ class Operations:
pd_aggs: List[str],
dropna: bool = True,
is_dataframe_agg: bool = False,
numeric_only: bool = True,
numeric_only: Optional[bool] = True,
) -> pd.DataFrame:
"""
This method is used to construct a groupby aggregation dataframe
@ -570,15 +570,98 @@ class Operations:
-------
A dataframe which consists of the groupby data
"""
headers, results = self._groupby_aggs(
query_compiler,
by=by,
pd_aggs=pd_aggs,
dropna=dropna,
is_dataframe_agg=is_dataframe_agg,
numeric_only=numeric_only,
query_params, post_processing = self._resolve_tasks(query_compiler)
size = self._size(query_params, post_processing)
if size is not None:
raise NotImplementedError(
f"Can not count field matches if size is set {size}"
)
by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by)
# Use defaultdict(list) to avoid initializing each result column by hand
results: Dict[str, List[Any]] = defaultdict(list)
if numeric_only:
agg_fields = [
field for field in agg_fields if (field.is_numeric or field.is_bool)
]
body = Query(query_params.query)
# To return for creating multi-index on columns
headers = [agg_field.column for agg_field in agg_fields]
# Convert pandas aggs to ES equivalent
es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
# Construct Query
for by_field in by_fields:
# groupby fields will be term aggregations
body.composite_agg_bucket_terms(
name=f"groupby_{by_field.column}",
field=by_field.aggregatable_es_field_name,
)
for agg_field in agg_fields:
for es_agg in es_aggs:
# Skip if the field isn't compatible or if the agg is
# 'value_count' as this value is pulled from bucket.doc_count.
if not agg_field.is_es_agg_compatible(es_agg):
continue
# If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
if isinstance(es_agg, tuple):
body.metric_aggs(
f"{es_agg[0]}_{agg_field.es_field_name}",
es_agg[0],
agg_field.aggregatable_es_field_name,
)
else:
body.metric_aggs(
f"{es_agg}_{agg_field.es_field_name}",
es_agg,
agg_field.aggregatable_es_field_name,
)
# Composite aggregation
body.composite_agg_start(
size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
)
for buckets in self.bucket_generator(query_compiler, body):
# We receive the response row-wise
for bucket in buckets:
# groupby columns are added to result same way they are returned
for by_field in by_fields:
bucket_key = bucket["key"][f"groupby_{by_field.column}"]
# Datetimes always come back as integers, convert to pd.Timestamp()
if by_field.is_timestamp and isinstance(bucket_key, int):
bucket_key = pd.to_datetime(bucket_key, unit="ms")
results[by_field.column].append(bucket_key)
agg_calculation = self._unpack_metric_aggs(
fields=agg_fields,
es_aggs=es_aggs,
pd_aggs=pd_aggs,
response={"aggregations": bucket},
numeric_only=numeric_only,
# We set 'True' here because we want the value
# unpacking to always be in 'dataframe' mode.
is_dataframe_agg=True,
)
# Merge the calculated agg values into the results
for key, value in agg_calculation.items():
if not isinstance(value, list):
results[key].append(value)
continue
for pd_agg, val in zip(pd_aggs, value):
results[f"{key}_{pd_agg}"].append(val)
agg_df = pd.DataFrame(results).set_index(by)
if is_dataframe_agg:
@ -636,146 +719,6 @@ class Operations:
else:
return composite_buckets["buckets"]
def _groupby_aggs(
self,
query_compiler: "QueryCompiler",
by: List[str],
pd_aggs: List[str],
dropna: bool = True,
is_dataframe_agg: bool = False,
numeric_only: bool = True,
) -> Tuple[List[str], Dict[str, Any]]:
"""
This method is used to calculate groupby aggregations
Parameters
----------
query_compiler:
A Query compiler
by:
a list of columns on which groupby operations have to be performed
pd_aggs:
a list of aggregations to be performed
dropna:
Drop None values if True.
TODO Not yet implemented
is_dataframe_agg:
Know if multi aggregation or single agg is called.
numeric_only:
return either numeric values or NaN/NaT
Returns
-------
headers: columns on which MultiIndex has to be applied
response: dictionary of groupby aggregated values
"""
query_params, post_processing = self._resolve_tasks(query_compiler)
size = self._size(query_params, post_processing)
if size is not None:
raise NotImplementedError(
f"Can not count field matches if size is set {size}"
)
by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by)
# Used defaultdict to avoid initialization of columns with lists
response: Dict[str, List[Any]] = defaultdict(list)
if numeric_only:
agg_fields = [
field for field in agg_fields if (field.is_numeric or field.is_bool)
]
body = Query(query_params.query)
# To return for creating multi-index on columns
headers = [field.column for field in agg_fields]
# Convert pandas aggs to ES equivalent
es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
# pd_agg 'count' is handled via 'doc_count' from buckets
using_pd_agg_count = "count" in pd_aggs
# Construct Query
for by_field in by_fields:
# groupby fields will be term aggregations
body.composite_agg_bucket_terms(
name=f"groupby_{by_field.column}",
field=by_field.aggregatable_es_field_name,
)
for agg_field in agg_fields:
for es_agg in es_aggs:
# Skip if the field isn't compatible or if the agg is
# 'value_count' as this value is pulled from bucket.doc_count.
if (
not agg_field.is_es_agg_compatible(es_agg)
or es_agg == "value_count"
):
continue
# If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
if isinstance(es_agg, tuple):
body.metric_aggs(
f"{es_agg[0]}_{agg_field.es_field_name}",
es_agg[0],
agg_field.aggregatable_es_field_name,
)
else:
body.metric_aggs(
f"{es_agg}_{agg_field.es_field_name}",
es_agg,
agg_field.aggregatable_es_field_name,
)
# Composite aggregation
body.composite_agg_start(
size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
)
for buckets in self.bucket_generator(query_compiler, body):
# We receive the response row-wise
for bucket in buckets:
# groupby columns are added to result same way they are returned
for by_field in by_fields:
bucket_key = bucket["key"][f"groupby_{by_field.column}"]
# Datetimes always come back as integers, convert to pd.Timestamp()
if by_field.is_timestamp and isinstance(bucket_key, int):
bucket_key = pd.to_datetime(bucket_key, unit="ms")
response[by_field.column].append(bucket_key)
# Put 'doc_count' from bucket into each 'agg_field'
# to be extracted from _unpack_metric_aggs()
if using_pd_agg_count:
doc_count = bucket["doc_count"]
for agg_field in agg_fields:
bucket[f"value_count_{agg_field.es_field_name}"] = {
"value": doc_count
}
agg_calculation = self._unpack_metric_aggs(
fields=agg_fields,
es_aggs=es_aggs,
pd_aggs=pd_aggs,
response={"aggregations": bucket},
numeric_only=numeric_only,
is_dataframe_agg=is_dataframe_agg,
)
# Process the calculated agg values to response
for key, value in agg_calculation.items():
if not isinstance(value, list):
response[key].append(value)
continue
for pd_agg, val in zip(pd_aggs, value):
response[f"{key}_{pd_agg}"].append(val)
return headers, response
@staticmethod
def _map_pd_aggs_to_es_aggs(pd_aggs):
"""

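For orientation, the composite aggregation body that the inlined aggs_groupby assembles for a call like df.groupby("DestCountry").mean() looks roughly as follows. This is a hand-written sketch, not the literal output of the Query helper, and the page size shown is an assumed value for DEFAULT_PAGINATION_SIZE:

# Names follow the f-strings above: "groupby_{column}" for composite
# sources and "{es_agg}_{es_field_name}" for metric sub-aggregations.
es_body = {
    "size": 0,
    "aggs": {
        "groupby_buckets": {
            "composite": {
                "size": 1000,  # DEFAULT_PAGINATION_SIZE (assumed value)
                "sources": [
                    {"groupby_DestCountry": {"terms": {"field": "DestCountry"}}},
                ],
            },
            "aggs": {
                # pandas "mean" maps to the Elasticsearch "avg" metric
                "avg_AvgTicketPrice": {"avg": {"field": "AvgTicketPrice"}},
                "avg_dayOfWeek": {"avg": {"field": "dayOfWeek"}},
            },
        }
    },
}
# bucket_generator() then pages through "groupby_buckets", and each
# bucket becomes one row: the composite keys supply the index values
# (epoch millis are converted to pd.Timestamp) and the metric values
# fill the aggregated columns.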
View File

@ -556,7 +556,7 @@ class QueryCompiler:
pd_aggs: List[str],
dropna: bool = True,
is_dataframe_agg: bool = False,
numeric_only: bool = True,
numeric_only: Optional[bool] = True,
) -> pd.DataFrame:
return self._operations.aggs_groupby(
self,

View File

@ -176,3 +176,21 @@ class TestGroupbyDataFrame(TestData):
assert_index_equal(pd_count.index, ed_count.index)
assert_frame_equal(pd_count, ed_count)
assert_series_equal(pd_count.dtypes, ed_count.dtypes)
def test_groupby_dataframe_mad(self):
pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"])
ed_flights = self.ed_flights().filter(self.filter_data + ["DestCountry"])
pd_mad = pd_flights.groupby("DestCountry").mad()
ed_mad = ed_flights.groupby("DestCountry").mad()
assert_index_equal(pd_mad.columns, ed_mad.columns)
assert_index_equal(pd_mad.index, ed_mad.index)
assert_series_equal(pd_mad.dtypes, ed_mad.dtypes)
pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"])
ed_min_mad = ed_flights.groupby("DestCountry").aggregate(["min", "mad"])
assert_index_equal(pd_min_mad.columns, ed_min_mad.columns)
assert_index_equal(pd_min_mad.index, ed_min_mad.index)
assert_series_equal(pd_min_mad.dtypes, ed_min_mad.dtypes)