mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Document DataFrame.groupby() methods
This commit is contained in:
parent
475e0f41ef
commit
ae70f03df3
@ -0,0 +1,6 @@
|
|||||||
|
eland.groupby.DataFrameGroupBy.agg
|
||||||
|
==================================
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. automethod:: DataFrameGroupBy.agg
|
@ -0,0 +1,6 @@
|
|||||||
|
eland.groupby.DataFrameGroupBy.aggregate
|
||||||
|
========================================
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. automethod:: DataFrameGroupBy.aggregate
|
@ -0,0 +1,6 @@
|
|||||||
|
eland.groupby.DataFrameGroupBy.count
|
||||||
|
====================================
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. automethod:: DataFrameGroupBy.count
|
@ -0,0 +1,6 @@
|
|||||||
|
eland.groupby.DataFrameGroupBy.mad
|
||||||
|
==================================
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. automethod:: DataFrameGroupBy.mad
|
@ -0,0 +1,6 @@
|
|||||||
|
eland.groupby.DataFrameGroupBy.max
|
||||||
|
==================================
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. automethod:: DataFrameGroupBy.max
|
@ -0,0 +1,6 @@
|
|||||||
|
eland.groupby.DataFrameGroupBy.mean
|
||||||
|
===================================
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. automethod:: DataFrameGroupBy.mean
|
@ -0,0 +1,6 @@
|
|||||||
|
eland.groupby.DataFrameGroupBy.median
|
||||||
|
=====================================
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. automethod:: DataFrameGroupBy.median
|
@ -0,0 +1,6 @@
|
|||||||
|
eland.groupby.DataFrameGroupBy.min
|
||||||
|
==================================
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. automethod:: DataFrameGroupBy.min
|
@ -0,0 +1,6 @@
|
|||||||
|
eland.groupby.DataFrameGroupBy.nunique
|
||||||
|
======================================
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. automethod:: DataFrameGroupBy.nunique
|
15
docs/source/reference/api/eland.groupby.DataFrameGroupBy.rst
Normal file
15
docs/source/reference/api/eland.groupby.DataFrameGroupBy.rst
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
eland.groupby.DataFrameGroupBy
|
||||||
|
==============================
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. autoclass:: DataFrameGroupBy
|
||||||
|
|
||||||
|
|
||||||
|
..
|
||||||
|
HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.
|
||||||
|
.. autosummary::
|
||||||
|
:toctree:
|
||||||
|
|
||||||
|
DataFrame.abs
|
||||||
|
DataFrame.add
|
@ -0,0 +1,6 @@
|
|||||||
|
eland.groupby.DataFrameGroupBy.std
|
||||||
|
==================================
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. automethod:: DataFrameGroupBy.std
|
@ -0,0 +1,6 @@
|
|||||||
|
eland.groupby.DataFrameGroupBy.sum
|
||||||
|
==================================
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. automethod:: DataFrameGroupBy.sum
|
@ -0,0 +1,6 @@
|
|||||||
|
eland.groupby.DataFrameGroupBy.var
|
||||||
|
==================================
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. automethod:: DataFrameGroupBy.var
|
15
docs/source/reference/api/eland.groupby.GroupBy.rst
Normal file
15
docs/source/reference/api/eland.groupby.GroupBy.rst
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
eland.groupby.GroupBy
|
||||||
|
=====================
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. autoclass:: GroupBy
|
||||||
|
|
||||||
|
|
||||||
|
..
|
||||||
|
HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.
|
||||||
|
.. autosummary::
|
||||||
|
:toctree:
|
||||||
|
|
||||||
|
DataFrame.abs
|
||||||
|
DataFrame.add
|
@ -48,6 +48,28 @@ Function Application, GroupBy & Window
|
|||||||
DataFrame.aggregate
|
DataFrame.aggregate
|
||||||
DataFrame.groupby
|
DataFrame.groupby
|
||||||
|
|
||||||
|
.. currentmodule:: eland.groupby
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: api/
|
||||||
|
|
||||||
|
DataFrameGroupBy
|
||||||
|
DataFrameGroupBy.agg
|
||||||
|
DataFrameGroupBy.aggregate
|
||||||
|
DataFrameGroupBy.count
|
||||||
|
DataFrameGroupByGroupBy.mad
|
||||||
|
DataFrameGroupByGroupBy.max
|
||||||
|
DataFrameGroupByGroupBy.mean
|
||||||
|
DataFrameGroupByGroupBy.median
|
||||||
|
DataFrameGroupByGroupBy.min
|
||||||
|
DataFrameGroupByGroupBy.nunique
|
||||||
|
DataFrameGroupByGroupBy.std
|
||||||
|
DataFrameGroupByGroupBy.sum
|
||||||
|
DataFrameGroupByGroupBy.var
|
||||||
|
GroupBy
|
||||||
|
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
.. _api.dataframe.stats:
|
.. _api.dataframe.stats:
|
||||||
|
|
||||||
Computations / Descriptive Stats
|
Computations / Descriptive Stats
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
.. _api.general_utility_functions:
|
.. _api.general_utility_functions:
|
||||||
|
|
||||||
=========================
|
=========================
|
||||||
General utility functions
|
General Utility Functions
|
||||||
=========================
|
=========================
|
||||||
.. currentmodule:: eland
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
@ -170,7 +170,7 @@ script instead of being modified manually.
|
|||||||
+---------------------------------------+------------+
|
+---------------------------------------+------------+
|
||||||
| ``ed.DataFrame.get()`` | **Yes** |
|
| ``ed.DataFrame.get()`` | **Yes** |
|
||||||
+---------------------------------------+------------+
|
+---------------------------------------+------------+
|
||||||
| ``ed.DataFrame.groupby()`` | No |
|
| ``ed.DataFrame.groupby()`` | **Yes** |
|
||||||
+---------------------------------------+------------+
|
+---------------------------------------+------------+
|
||||||
| ``ed.DataFrame.gt()`` | No |
|
| ``ed.DataFrame.gt()`` | No |
|
||||||
+---------------------------------------+------------+
|
+---------------------------------------+------------+
|
||||||
|
@ -36,7 +36,7 @@ from pandas.util._validators import validate_bool_kwarg
|
|||||||
import eland.plotting as gfx
|
import eland.plotting as gfx
|
||||||
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
|
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
|
||||||
from eland.filter import BooleanFilter
|
from eland.filter import BooleanFilter
|
||||||
from eland.groupby import GroupByDataFrame
|
from eland.groupby import DataFrameGroupBy
|
||||||
from eland.ndframe import NDFrame
|
from eland.ndframe import NDFrame
|
||||||
from eland.series import Series
|
from eland.series import Series
|
||||||
from eland.utils import deprecated_api, is_valid_attr_name
|
from eland.utils import deprecated_api, is_valid_attr_name
|
||||||
@ -1433,7 +1433,7 @@ class DataFrame(NDFrame):
|
|||||||
|
|
||||||
def groupby(
|
def groupby(
|
||||||
self, by: Optional[Union[str, List[str]]] = None, dropna: bool = True
|
self, by: Optional[Union[str, List[str]]] = None, dropna: bool = True
|
||||||
) -> "GroupByDataFrame":
|
) -> "DataFrameGroupBy":
|
||||||
"""
|
"""
|
||||||
Used to perform groupby operations
|
Used to perform groupby operations
|
||||||
|
|
||||||
@ -1448,7 +1448,7 @@ class DataFrame(NDFrame):
|
|||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
GroupByDataFrame
|
eland.groupby.DataFrameGroupBy
|
||||||
|
|
||||||
See Also
|
See Also
|
||||||
--------
|
--------
|
||||||
@ -1520,7 +1520,7 @@ class DataFrame(NDFrame):
|
|||||||
f"Requested columns {repr(remaining_columns)[1:-1]} not in the DataFrame"
|
f"Requested columns {repr(remaining_columns)[1:-1]} not in the DataFrame"
|
||||||
)
|
)
|
||||||
|
|
||||||
return GroupByDataFrame(
|
return DataFrameGroupBy(
|
||||||
by=by, query_compiler=self._query_compiler.copy(), dropna=dropna
|
by=by, query_compiler=self._query_compiler.copy(), dropna=dropna
|
||||||
)
|
)
|
||||||
|
|
||||||
|
486
eland/groupby.py
486
eland/groupby.py
@ -15,7 +15,7 @@
|
|||||||
# specific language governing permissions and limitations
|
# specific language governing permissions and limitations
|
||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
from typing import TYPE_CHECKING, List
|
from typing import TYPE_CHECKING, List, Optional, Union
|
||||||
|
|
||||||
from eland.query_compiler import QueryCompiler
|
from eland.query_compiler import QueryCompiler
|
||||||
|
|
||||||
@ -25,16 +25,7 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
class GroupBy:
|
class GroupBy:
|
||||||
"""
|
"""
|
||||||
Base class for calls to X.groupby([...])
|
Base class for calls to :py:func:`eland.DataFrame.groupby`
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
by:
|
|
||||||
List of columns to groupby
|
|
||||||
query_compiler:
|
|
||||||
Query compiler object
|
|
||||||
dropna:
|
|
||||||
default is true, drop None/NaT/NaN values while grouping
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -47,7 +38,56 @@ class GroupBy:
|
|||||||
self._dropna: bool = dropna
|
self._dropna: bool = dropna
|
||||||
self._by: List[str] = by
|
self._by: List[str] = by
|
||||||
|
|
||||||
|
|
||||||
|
class DataFrameGroupBy(GroupBy):
|
||||||
|
"""
|
||||||
|
This holds all the groupby methods for :py:func:`eland.DataFrame.groupby`
|
||||||
|
"""
|
||||||
|
|
||||||
def mean(self, numeric_only: bool = True) -> "pd.DataFrame":
|
def mean(self, numeric_only: bool = True) -> "pd.DataFrame":
|
||||||
|
"""
|
||||||
|
Compute the mean value for each group.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is True
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pandas.DataFrame
|
||||||
|
mean value for each numeric column of each group
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.mean`
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame(
|
||||||
|
... "localhost", "flights",
|
||||||
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
||||||
|
... )
|
||||||
|
>>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
|
||||||
|
AvgTicketPrice Cancelled dayOfWeek timestamp
|
||||||
|
DestCountry
|
||||||
|
AE 605.132970 0.152174 2.695652 2018-01-21 16:58:07.891304443
|
||||||
|
AR 674.827252 0.147541 2.744262 2018-01-21 22:18:06.593442627
|
||||||
|
AT 646.650530 0.175066 2.872679 2018-01-21 15:54:42.469496094
|
||||||
|
AU 669.558832 0.129808 2.843750 2018-01-22 02:28:39.199519287
|
||||||
|
CA 648.747109 0.134534 2.951271 2018-01-22 14:40:47.165254150
|
||||||
|
... ... ... ... ...
|
||||||
|
RU 662.994963 0.131258 2.832206 2018-01-21 07:11:16.534506104
|
||||||
|
SE 660.612988 0.149020 2.682353 2018-01-22 07:48:23.447058838
|
||||||
|
TR 485.253247 0.100000 1.900000 2018-01-16 16:02:33.000000000
|
||||||
|
US 595.774391 0.125315 2.753900 2018-01-21 16:55:04.456970215
|
||||||
|
ZA 643.053057 0.148410 2.766784 2018-01-22 15:17:56.141342773
|
||||||
|
<BLANKLINE>
|
||||||
|
[32 rows x 4 columns]
|
||||||
|
"""
|
||||||
return self._query_compiler.aggs_groupby(
|
return self._query_compiler.aggs_groupby(
|
||||||
by=self._by,
|
by=self._by,
|
||||||
pd_aggs=["mean"],
|
pd_aggs=["mean"],
|
||||||
@ -56,6 +96,49 @@ class GroupBy:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def var(self, numeric_only: bool = True) -> "pd.DataFrame":
|
def var(self, numeric_only: bool = True) -> "pd.DataFrame":
|
||||||
|
"""
|
||||||
|
Compute the variance value for each group.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is True
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pandas.DataFrame
|
||||||
|
variance value for each numeric column of each group
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.var`
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame(
|
||||||
|
... "localhost", "flights",
|
||||||
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
||||||
|
... )
|
||||||
|
>>> df.groupby("DestCountry").var() # doctest: +NORMALIZE_WHITESPACE
|
||||||
|
AvgTicketPrice Cancelled dayOfWeek
|
||||||
|
DestCountry
|
||||||
|
AE 75789.979090 0.130443 3.950549
|
||||||
|
AR 59683.055316 0.125979 3.783429
|
||||||
|
AT 65726.669676 0.144610 4.090013
|
||||||
|
AU 65088.483446 0.113094 3.833562
|
||||||
|
CA 68149.950516 0.116496 3.688139
|
||||||
|
... ... ... ...
|
||||||
|
RU 67305.277617 0.114107 3.852666
|
||||||
|
SE 53740.570338 0.127062 3.942132
|
||||||
|
TR 61245.521047 0.094868 4.100420
|
||||||
|
US 74349.939410 0.109638 3.758700
|
||||||
|
ZA 62920.072901 0.126608 3.775609
|
||||||
|
<BLANKLINE>
|
||||||
|
[32 rows x 3 columns]
|
||||||
|
"""
|
||||||
return self._query_compiler.aggs_groupby(
|
return self._query_compiler.aggs_groupby(
|
||||||
by=self._by,
|
by=self._by,
|
||||||
pd_aggs=["var"],
|
pd_aggs=["var"],
|
||||||
@ -64,6 +147,49 @@ class GroupBy:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def std(self, numeric_only: bool = True) -> "pd.DataFrame":
|
def std(self, numeric_only: bool = True) -> "pd.DataFrame":
|
||||||
|
"""
|
||||||
|
Compute the standard deviation value for each group.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is True
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pandas.DataFrame
|
||||||
|
standard deviation value for each numeric column of each group
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.std`
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame(
|
||||||
|
... "localhost", "flights",
|
||||||
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
||||||
|
... )
|
||||||
|
>>> df.groupby("DestCountry").std() # doctest: +NORMALIZE_WHITESPACE
|
||||||
|
AvgTicketPrice Cancelled dayOfWeek
|
||||||
|
DestCountry
|
||||||
|
AE 279.875500 0.367171 2.020634
|
||||||
|
AR 244.903626 0.355811 1.949901
|
||||||
|
AT 256.883342 0.381035 2.026411
|
||||||
|
AU 255.585377 0.336902 1.961486
|
||||||
|
CA 261.263054 0.341587 1.921980
|
||||||
|
... ... ... ...
|
||||||
|
RU 259.696213 0.338140 1.964815
|
||||||
|
SE 232.504297 0.357510 1.991340
|
||||||
|
TR 267.827572 0.333333 2.191454
|
||||||
|
US 272.774819 0.331242 1.939469
|
||||||
|
ZA 251.505568 0.356766 1.948258
|
||||||
|
<BLANKLINE>
|
||||||
|
[32 rows x 3 columns]
|
||||||
|
"""
|
||||||
return self._query_compiler.aggs_groupby(
|
return self._query_compiler.aggs_groupby(
|
||||||
by=self._by,
|
by=self._by,
|
||||||
pd_aggs=["std"],
|
pd_aggs=["std"],
|
||||||
@ -72,6 +198,49 @@ class GroupBy:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def mad(self, numeric_only: bool = True) -> "pd.DataFrame":
|
def mad(self, numeric_only: bool = True) -> "pd.DataFrame":
|
||||||
|
"""
|
||||||
|
Compute the median absolute deviation value for each group.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is True
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pandas.DataFrame
|
||||||
|
median absolute deviation value for each numeric column of each group
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.mad`
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame(
|
||||||
|
... "localhost", "flights",
|
||||||
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
||||||
|
... )
|
||||||
|
>>> df.groupby("DestCountry").mad() # doctest: +SKIP
|
||||||
|
AvgTicketPrice Cancelled dayOfWeek
|
||||||
|
DestCountry
|
||||||
|
AE 233.697174 NaN 1.5
|
||||||
|
AR 189.250061 NaN 2.0
|
||||||
|
AT 195.823669 NaN 2.0
|
||||||
|
AU 202.539764 NaN 2.0
|
||||||
|
CA 203.344696 NaN 2.0
|
||||||
|
... ... ... ...
|
||||||
|
RU 206.431702 NaN 2.0
|
||||||
|
SE 178.658447 NaN 2.0
|
||||||
|
TR 221.863434 NaN 1.0
|
||||||
|
US 228.461365 NaN 2.0
|
||||||
|
ZA 192.162842 NaN 2.0
|
||||||
|
<BLANKLINE>
|
||||||
|
[32 rows x 3 columns]
|
||||||
|
"""
|
||||||
return self._query_compiler.aggs_groupby(
|
return self._query_compiler.aggs_groupby(
|
||||||
by=self._by,
|
by=self._by,
|
||||||
pd_aggs=["mad"],
|
pd_aggs=["mad"],
|
||||||
@ -80,6 +249,49 @@ class GroupBy:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def median(self, numeric_only: bool = True) -> "pd.DataFrame":
|
def median(self, numeric_only: bool = True) -> "pd.DataFrame":
|
||||||
|
"""
|
||||||
|
Compute the median value for each group.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is True
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pandas.DataFrame
|
||||||
|
median absolute deviation value for each numeric column of each group
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.median`
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame(
|
||||||
|
... "localhost", "flights",
|
||||||
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
||||||
|
... )
|
||||||
|
>>> df.groupby("DestCountry").median(numeric_only=False) # doctest: +SKIP
|
||||||
|
AvgTicketPrice Cancelled dayOfWeek timestamp
|
||||||
|
DestCountry
|
||||||
|
AE 585.720490 False 2 2018-01-19 23:56:44.000
|
||||||
|
AR 678.447433 False 3 2018-01-22 10:18:50.000
|
||||||
|
AT 659.715592 False 3 2018-01-20 20:40:10.000
|
||||||
|
AU 689.241348 False 3 2018-01-22 18:46:11.000
|
||||||
|
CA 663.516057 False 3 2018-01-22 21:35:09.500
|
||||||
|
... ... ... ... ...
|
||||||
|
RU 670.714956 False 3 2018-01-20 16:48:16.000
|
||||||
|
SE 680.111084 False 3 2018-01-22 20:53:44.000
|
||||||
|
TR 441.681122 False 1 2018-01-13 23:17:27.000
|
||||||
|
US 600.591525 False 3 2018-01-22 04:09:50.000
|
||||||
|
ZA 633.935425 False 3 2018-01-23 17:42:57.000
|
||||||
|
<BLANKLINE>
|
||||||
|
[32 rows x 4 columns]
|
||||||
|
"""
|
||||||
return self._query_compiler.aggs_groupby(
|
return self._query_compiler.aggs_groupby(
|
||||||
by=self._by,
|
by=self._by,
|
||||||
pd_aggs=["median"],
|
pd_aggs=["median"],
|
||||||
@ -88,6 +300,49 @@ class GroupBy:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def sum(self, numeric_only: bool = True) -> "pd.DataFrame":
|
def sum(self, numeric_only: bool = True) -> "pd.DataFrame":
|
||||||
|
"""
|
||||||
|
Compute the sum value for each group.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is True
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pandas.DataFrame
|
||||||
|
sum value for each numeric column of each group
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.sum`
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame(
|
||||||
|
... "localhost", "flights",
|
||||||
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
||||||
|
... )
|
||||||
|
>>> df.groupby("DestCountry").sum() # doctest: +NORMALIZE_WHITESPACE
|
||||||
|
AvgTicketPrice Cancelled dayOfWeek
|
||||||
|
DestCountry
|
||||||
|
AE 2.783612e+04 7.0 124.0
|
||||||
|
AR 2.058223e+05 45.0 837.0
|
||||||
|
AT 2.437872e+05 66.0 1083.0
|
||||||
|
AU 2.785365e+05 54.0 1183.0
|
||||||
|
CA 6.124173e+05 127.0 2786.0
|
||||||
|
... ... ... ...
|
||||||
|
RU 4.899533e+05 97.0 2093.0
|
||||||
|
SE 1.684563e+05 38.0 684.0
|
||||||
|
TR 4.852532e+03 1.0 19.0
|
||||||
|
US 1.183804e+06 249.0 5472.0
|
||||||
|
ZA 1.819840e+05 42.0 783.0
|
||||||
|
<BLANKLINE>
|
||||||
|
[32 rows x 3 columns]
|
||||||
|
"""
|
||||||
return self._query_compiler.aggs_groupby(
|
return self._query_compiler.aggs_groupby(
|
||||||
by=self._by,
|
by=self._by,
|
||||||
pd_aggs=["sum"],
|
pd_aggs=["sum"],
|
||||||
@ -96,6 +351,49 @@ class GroupBy:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def min(self, numeric_only: bool = True) -> "pd.DataFrame":
|
def min(self, numeric_only: bool = True) -> "pd.DataFrame":
|
||||||
|
"""
|
||||||
|
Compute the min value for each group.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is True
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pandas.DataFrame
|
||||||
|
min value for each numeric column of each group
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.min`
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame(
|
||||||
|
... "localhost", "flights",
|
||||||
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
||||||
|
... )
|
||||||
|
>>> df.groupby("DestCountry").min(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
|
||||||
|
AvgTicketPrice Cancelled dayOfWeek timestamp
|
||||||
|
DestCountry
|
||||||
|
AE 110.799911 False 0 2018-01-01 19:31:30
|
||||||
|
AR 125.589394 False 0 2018-01-01 01:30:47
|
||||||
|
AT 100.020531 False 0 2018-01-01 05:24:19
|
||||||
|
AU 102.294312 False 0 2018-01-01 00:00:00
|
||||||
|
CA 100.557251 False 0 2018-01-01 00:44:08
|
||||||
|
... ... ... ... ...
|
||||||
|
RU 101.004005 False 0 2018-01-01 01:01:51
|
||||||
|
SE 102.877190 False 0 2018-01-01 04:09:38
|
||||||
|
TR 142.876465 False 0 2018-01-01 06:45:17
|
||||||
|
US 100.145966 False 0 2018-01-01 00:06:27
|
||||||
|
ZA 102.002663 False 0 2018-01-01 06:44:44
|
||||||
|
<BLANKLINE>
|
||||||
|
[32 rows x 4 columns]
|
||||||
|
"""
|
||||||
return self._query_compiler.aggs_groupby(
|
return self._query_compiler.aggs_groupby(
|
||||||
by=self._by,
|
by=self._by,
|
||||||
pd_aggs=["min"],
|
pd_aggs=["min"],
|
||||||
@ -104,6 +402,49 @@ class GroupBy:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def max(self, numeric_only: bool = True) -> "pd.DataFrame":
|
def max(self, numeric_only: bool = True) -> "pd.DataFrame":
|
||||||
|
"""
|
||||||
|
Compute the max value for each group.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is True
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pandas.DataFrame
|
||||||
|
max value for each numeric column of each group
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.max`
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame(
|
||||||
|
... "localhost", "flights",
|
||||||
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
||||||
|
... )
|
||||||
|
>>> df.groupby("DestCountry").max(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
|
||||||
|
AvgTicketPrice Cancelled dayOfWeek timestamp
|
||||||
|
DestCountry
|
||||||
|
AE 1126.148682 True 6 2018-02-11 04:11:14
|
||||||
|
AR 1199.642822 True 6 2018-02-11 17:09:05
|
||||||
|
AT 1181.835815 True 6 2018-02-11 23:12:33
|
||||||
|
AU 1197.632690 True 6 2018-02-11 21:39:01
|
||||||
|
CA 1198.852539 True 6 2018-02-11 23:04:08
|
||||||
|
... ... ... ... ...
|
||||||
|
RU 1196.742310 True 6 2018-02-11 20:03:31
|
||||||
|
SE 1198.621582 True 6 2018-02-11 22:06:14
|
||||||
|
TR 855.935547 True 6 2018-02-04 01:59:23
|
||||||
|
US 1199.729004 True 6 2018-02-11 23:27:00
|
||||||
|
ZA 1196.186157 True 6 2018-02-11 23:29:45
|
||||||
|
<BLANKLINE>
|
||||||
|
[32 rows x 4 columns]
|
||||||
|
"""
|
||||||
return self._query_compiler.aggs_groupby(
|
return self._query_compiler.aggs_groupby(
|
||||||
by=self._by,
|
by=self._by,
|
||||||
pd_aggs=["max"],
|
pd_aggs=["max"],
|
||||||
@ -112,6 +453,49 @@ class GroupBy:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def nunique(self) -> "pd.DataFrame":
|
def nunique(self) -> "pd.DataFrame":
|
||||||
|
"""
|
||||||
|
Compute the nunique value for each group.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
numeric_only: {True, False, None} Default is True
|
||||||
|
Which datatype to be returned
|
||||||
|
- True: Returns all values as float64, NaN/NaT values are removed
|
||||||
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
||||||
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pandas.DataFrame
|
||||||
|
nunique value for each numeric column of each group
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.nunique`
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame(
|
||||||
|
... "localhost", "flights",
|
||||||
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
||||||
|
... )
|
||||||
|
>>> df.groupby("DestCountry").nunique() # doctest: +NORMALIZE_WHITESPACE
|
||||||
|
AvgTicketPrice Cancelled dayOfWeek
|
||||||
|
DestCountry
|
||||||
|
AE 46 2 7
|
||||||
|
AR 305 2 7
|
||||||
|
AT 377 2 7
|
||||||
|
AU 416 2 7
|
||||||
|
CA 944 2 7
|
||||||
|
... ... ... ...
|
||||||
|
RU 739 2 7
|
||||||
|
SE 255 2 7
|
||||||
|
TR 10 2 5
|
||||||
|
US 1987 2 7
|
||||||
|
ZA 283 2 7
|
||||||
|
<BLANKLINE>
|
||||||
|
[32 rows x 3 columns]
|
||||||
|
"""
|
||||||
return self._query_compiler.aggs_groupby(
|
return self._query_compiler.aggs_groupby(
|
||||||
by=self._by,
|
by=self._by,
|
||||||
pd_aggs=["nunique"],
|
pd_aggs=["nunique"],
|
||||||
@ -119,22 +503,9 @@ class GroupBy:
|
|||||||
numeric_only=False,
|
numeric_only=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def aggregate(
|
||||||
class GroupByDataFrame(GroupBy):
|
self, func: Union[str, List[str]], numeric_only: Optional[bool] = False
|
||||||
"""
|
) -> "pd.DataFrame":
|
||||||
This holds all the groupby methods for DataFrame
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
by:
|
|
||||||
List of columns to groupby
|
|
||||||
query_compiler:
|
|
||||||
Query compiler object
|
|
||||||
dropna:
|
|
||||||
default is true, drop None/NaT/NaN values while grouping
|
|
||||||
"""
|
|
||||||
|
|
||||||
def aggregate(self, func: List[str], numeric_only: bool = False) -> "pd.DataFrame":
|
|
||||||
"""
|
"""
|
||||||
Used to groupby and aggregate
|
Used to groupby and aggregate
|
||||||
|
|
||||||
@ -155,8 +526,36 @@ class GroupByDataFrame(GroupBy):
|
|||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
A Pandas DataFrame
|
pandas.DataFrame
|
||||||
|
aggregation value for each numeric column of each group
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.aggregate`
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame(
|
||||||
|
... "localhost", "flights",
|
||||||
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
||||||
|
... )
|
||||||
|
>>> df.groupby("DestCountry").aggregate(["min", "max"]) # doctest: +NORMALIZE_WHITESPACE
|
||||||
|
AvgTicketPrice ... dayOfWeek
|
||||||
|
min max ... min max
|
||||||
|
DestCountry ...
|
||||||
|
AE 110.799911 1126.148682 ... 0 6
|
||||||
|
AR 125.589394 1199.642822 ... 0 6
|
||||||
|
AT 100.020531 1181.835815 ... 0 6
|
||||||
|
AU 102.294312 1197.632690 ... 0 6
|
||||||
|
CA 100.557251 1198.852539 ... 0 6
|
||||||
|
... ... ... ... ... ..
|
||||||
|
RU 101.004005 1196.742310 ... 0 6
|
||||||
|
SE 102.877190 1198.621582 ... 0 6
|
||||||
|
TR 142.876465 855.935547 ... 0 6
|
||||||
|
US 100.145966 1199.729004 ... 0 6
|
||||||
|
ZA 102.002663 1196.186157 ... 0 6
|
||||||
|
<BLANKLINE>
|
||||||
|
[32 rows x 6 columns]
|
||||||
"""
|
"""
|
||||||
# Controls whether a MultiIndex is used for the
|
# Controls whether a MultiIndex is used for the
|
||||||
# columns of the result DataFrame.
|
# columns of the result DataFrame.
|
||||||
@ -177,12 +576,39 @@ class GroupByDataFrame(GroupBy):
|
|||||||
|
|
||||||
def count(self) -> "pd.DataFrame":
|
def count(self) -> "pd.DataFrame":
|
||||||
"""
|
"""
|
||||||
Used to groupby and count
|
Compute the count value for each group.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
A Pandas DataFrame
|
pandas.DataFrame
|
||||||
|
nunique value for each numeric column of each group
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.count`
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame(
|
||||||
|
... "localhost", "flights",
|
||||||
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
||||||
|
... )
|
||||||
|
>>> df.groupby("DestCountry").count() # doctest: +NORMALIZE_WHITESPACE
|
||||||
|
AvgTicketPrice Cancelled dayOfWeek
|
||||||
|
DestCountry
|
||||||
|
AE 46 46 46
|
||||||
|
AR 305 305 305
|
||||||
|
AT 377 377 377
|
||||||
|
AU 416 416 416
|
||||||
|
CA 944 944 944
|
||||||
|
... ... ... ...
|
||||||
|
RU 739 739 739
|
||||||
|
SE 255 255 255
|
||||||
|
TR 10 10 10
|
||||||
|
US 1987 1987 1987
|
||||||
|
ZA 283 283 283
|
||||||
|
<BLANKLINE>
|
||||||
|
[32 rows x 3 columns]
|
||||||
"""
|
"""
|
||||||
return self._query_compiler.aggs_groupby(
|
return self._query_compiler.aggs_groupby(
|
||||||
by=self._by,
|
by=self._by,
|
||||||
|
@ -545,7 +545,7 @@ class Operations:
|
|||||||
pd_aggs: List[str],
|
pd_aggs: List[str],
|
||||||
dropna: bool = True,
|
dropna: bool = True,
|
||||||
is_dataframe_agg: bool = False,
|
is_dataframe_agg: bool = False,
|
||||||
numeric_only: bool = True,
|
numeric_only: Optional[bool] = True,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
This method is used to construct groupby aggregation dataframe
|
This method is used to construct groupby aggregation dataframe
|
||||||
@ -570,15 +570,98 @@ class Operations:
|
|||||||
-------
|
-------
|
||||||
A dataframe which consists groupby data
|
A dataframe which consists groupby data
|
||||||
"""
|
"""
|
||||||
headers, results = self._groupby_aggs(
|
query_params, post_processing = self._resolve_tasks(query_compiler)
|
||||||
query_compiler,
|
|
||||||
by=by,
|
size = self._size(query_params, post_processing)
|
||||||
pd_aggs=pd_aggs,
|
if size is not None:
|
||||||
dropna=dropna,
|
raise NotImplementedError(
|
||||||
is_dataframe_agg=is_dataframe_agg,
|
f"Can not count field matches if size is set {size}"
|
||||||
numeric_only=numeric_only,
|
)
|
||||||
|
|
||||||
|
by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by)
|
||||||
|
|
||||||
|
# Used defaultdict to avoid initialization of columns with lists
|
||||||
|
results: Dict[str, List[Any]] = defaultdict(list)
|
||||||
|
|
||||||
|
if numeric_only:
|
||||||
|
agg_fields = [
|
||||||
|
field for field in agg_fields if (field.is_numeric or field.is_bool)
|
||||||
|
]
|
||||||
|
|
||||||
|
body = Query(query_params.query)
|
||||||
|
|
||||||
|
# To return for creating multi-index on columns
|
||||||
|
headers = [agg_field.column for agg_field in agg_fields]
|
||||||
|
|
||||||
|
# Convert pandas aggs to ES equivalent
|
||||||
|
es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
|
||||||
|
|
||||||
|
# Construct Query
|
||||||
|
for by_field in by_fields:
|
||||||
|
# groupby fields will be term aggregations
|
||||||
|
body.composite_agg_bucket_terms(
|
||||||
|
name=f"groupby_{by_field.column}",
|
||||||
|
field=by_field.aggregatable_es_field_name,
|
||||||
|
)
|
||||||
|
|
||||||
|
for agg_field in agg_fields:
|
||||||
|
for es_agg in es_aggs:
|
||||||
|
# Skip if the field isn't compatible or if the agg is
|
||||||
|
# 'value_count' as this value is pulled from bucket.doc_count.
|
||||||
|
if not agg_field.is_es_agg_compatible(es_agg):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
|
||||||
|
if isinstance(es_agg, tuple):
|
||||||
|
body.metric_aggs(
|
||||||
|
f"{es_agg[0]}_{agg_field.es_field_name}",
|
||||||
|
es_agg[0],
|
||||||
|
agg_field.aggregatable_es_field_name,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
body.metric_aggs(
|
||||||
|
f"{es_agg}_{agg_field.es_field_name}",
|
||||||
|
es_agg,
|
||||||
|
agg_field.aggregatable_es_field_name,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Composite aggregation
|
||||||
|
body.composite_agg_start(
|
||||||
|
size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
|
||||||
)
|
)
|
||||||
|
|
||||||
|
for buckets in self.bucket_generator(query_compiler, body):
|
||||||
|
# We recieve response row-wise
|
||||||
|
for bucket in buckets:
|
||||||
|
# groupby columns are added to result same way they are returned
|
||||||
|
for by_field in by_fields:
|
||||||
|
bucket_key = bucket["key"][f"groupby_{by_field.column}"]
|
||||||
|
|
||||||
|
# Datetimes always come back as integers, convert to pd.Timestamp()
|
||||||
|
if by_field.is_timestamp and isinstance(bucket_key, int):
|
||||||
|
bucket_key = pd.to_datetime(bucket_key, unit="ms")
|
||||||
|
|
||||||
|
results[by_field.column].append(bucket_key)
|
||||||
|
|
||||||
|
agg_calculation = self._unpack_metric_aggs(
|
||||||
|
fields=agg_fields,
|
||||||
|
es_aggs=es_aggs,
|
||||||
|
pd_aggs=pd_aggs,
|
||||||
|
response={"aggregations": bucket},
|
||||||
|
numeric_only=numeric_only,
|
||||||
|
# We set 'True' here because we want the value
|
||||||
|
# unpacking to always be in 'dataframe' mode.
|
||||||
|
is_dataframe_agg=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process the calculated agg values to response
|
||||||
|
for key, value in agg_calculation.items():
|
||||||
|
if not isinstance(value, list):
|
||||||
|
results[key].append(value)
|
||||||
|
continue
|
||||||
|
for pd_agg, val in zip(pd_aggs, value):
|
||||||
|
results[f"{key}_{pd_agg}"].append(val)
|
||||||
|
|
||||||
agg_df = pd.DataFrame(results).set_index(by)
|
agg_df = pd.DataFrame(results).set_index(by)
|
||||||
|
|
||||||
if is_dataframe_agg:
|
if is_dataframe_agg:
|
||||||
@ -636,146 +719,6 @@ class Operations:
|
|||||||
else:
|
else:
|
||||||
return composite_buckets["buckets"]
|
return composite_buckets["buckets"]
|
||||||
|
|
||||||
def _groupby_aggs(
|
|
||||||
self,
|
|
||||||
query_compiler: "QueryCompiler",
|
|
||||||
by: List[str],
|
|
||||||
pd_aggs: List[str],
|
|
||||||
dropna: bool = True,
|
|
||||||
is_dataframe_agg: bool = False,
|
|
||||||
numeric_only: bool = True,
|
|
||||||
) -> Tuple[List[str], Dict[str, Any]]:
|
|
||||||
"""
|
|
||||||
This method is used to calculate groupby aggregations
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
query_compiler:
|
|
||||||
A Query compiler
|
|
||||||
by:
|
|
||||||
a list of columns on which groupby operations have to be performed
|
|
||||||
pd_aggs:
|
|
||||||
a list of aggregations to be performed
|
|
||||||
dropna:
|
|
||||||
Drop None values if True.
|
|
||||||
TODO Not yet implemented
|
|
||||||
is_dataframe_agg:
|
|
||||||
Know if multi aggregation or single agg is called.
|
|
||||||
numeric_only:
|
|
||||||
return either numeric values or NaN/NaT
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
headers: columns on which MultiIndex has to be applied
|
|
||||||
response: dictionary of groupby aggregated values
|
|
||||||
"""
|
|
||||||
query_params, post_processing = self._resolve_tasks(query_compiler)
|
|
||||||
|
|
||||||
size = self._size(query_params, post_processing)
|
|
||||||
if size is not None:
|
|
||||||
raise NotImplementedError(
|
|
||||||
f"Can not count field matches if size is set {size}"
|
|
||||||
)
|
|
||||||
|
|
||||||
by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by)
|
|
||||||
|
|
||||||
# Used defaultdict to avoid initialization of columns with lists
|
|
||||||
response: Dict[str, List[Any]] = defaultdict(list)
|
|
||||||
|
|
||||||
if numeric_only:
|
|
||||||
agg_fields = [
|
|
||||||
field for field in agg_fields if (field.is_numeric or field.is_bool)
|
|
||||||
]
|
|
||||||
|
|
||||||
body = Query(query_params.query)
|
|
||||||
|
|
||||||
# To return for creating multi-index on columns
|
|
||||||
headers = [field.column for field in agg_fields]
|
|
||||||
|
|
||||||
# Convert pandas aggs to ES equivalent
|
|
||||||
es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
|
|
||||||
|
|
||||||
# pd_agg 'count' is handled via 'doc_count' from buckets
|
|
||||||
using_pd_agg_count = "count" in pd_aggs
|
|
||||||
|
|
||||||
# Construct Query
|
|
||||||
for by_field in by_fields:
|
|
||||||
# groupby fields will be term aggregations
|
|
||||||
body.composite_agg_bucket_terms(
|
|
||||||
name=f"groupby_{by_field.column}",
|
|
||||||
field=by_field.aggregatable_es_field_name,
|
|
||||||
)
|
|
||||||
|
|
||||||
for agg_field in agg_fields:
|
|
||||||
for es_agg in es_aggs:
|
|
||||||
# Skip if the field isn't compatible or if the agg is
|
|
||||||
# 'value_count' as this value is pulled from bucket.doc_count.
|
|
||||||
if (
|
|
||||||
not agg_field.is_es_agg_compatible(es_agg)
|
|
||||||
or es_agg == "value_count"
|
|
||||||
):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
|
|
||||||
if isinstance(es_agg, tuple):
|
|
||||||
body.metric_aggs(
|
|
||||||
f"{es_agg[0]}_{agg_field.es_field_name}",
|
|
||||||
es_agg[0],
|
|
||||||
agg_field.aggregatable_es_field_name,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
body.metric_aggs(
|
|
||||||
f"{es_agg}_{agg_field.es_field_name}",
|
|
||||||
es_agg,
|
|
||||||
agg_field.aggregatable_es_field_name,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Composite aggregation
|
|
||||||
body.composite_agg_start(
|
|
||||||
size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
|
|
||||||
)
|
|
||||||
|
|
||||||
for buckets in self.bucket_generator(query_compiler, body):
|
|
||||||
# We recieve response row-wise
|
|
||||||
for bucket in buckets:
|
|
||||||
# groupby columns are added to result same way they are returned
|
|
||||||
for by_field in by_fields:
|
|
||||||
bucket_key = bucket["key"][f"groupby_{by_field.column}"]
|
|
||||||
|
|
||||||
# Datetimes always come back as integers, convert to pd.Timestamp()
|
|
||||||
if by_field.is_timestamp and isinstance(bucket_key, int):
|
|
||||||
bucket_key = pd.to_datetime(bucket_key, unit="ms")
|
|
||||||
|
|
||||||
response[by_field.column].append(bucket_key)
|
|
||||||
|
|
||||||
# Put 'doc_count' from bucket into each 'agg_field'
|
|
||||||
# to be extracted from _unpack_metric_aggs()
|
|
||||||
if using_pd_agg_count:
|
|
||||||
doc_count = bucket["doc_count"]
|
|
||||||
for agg_field in agg_fields:
|
|
||||||
bucket[f"value_count_{agg_field.es_field_name}"] = {
|
|
||||||
"value": doc_count
|
|
||||||
}
|
|
||||||
|
|
||||||
agg_calculation = self._unpack_metric_aggs(
|
|
||||||
fields=agg_fields,
|
|
||||||
es_aggs=es_aggs,
|
|
||||||
pd_aggs=pd_aggs,
|
|
||||||
response={"aggregations": bucket},
|
|
||||||
numeric_only=numeric_only,
|
|
||||||
is_dataframe_agg=is_dataframe_agg,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Process the calculated agg values to response
|
|
||||||
for key, value in agg_calculation.items():
|
|
||||||
if not isinstance(value, list):
|
|
||||||
response[key].append(value)
|
|
||||||
continue
|
|
||||||
for pd_agg, val in zip(pd_aggs, value):
|
|
||||||
response[f"{key}_{pd_agg}"].append(val)
|
|
||||||
|
|
||||||
return headers, response
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _map_pd_aggs_to_es_aggs(pd_aggs):
|
def _map_pd_aggs_to_es_aggs(pd_aggs):
|
||||||
"""
|
"""
|
||||||
|
@ -556,7 +556,7 @@ class QueryCompiler:
|
|||||||
pd_aggs: List[str],
|
pd_aggs: List[str],
|
||||||
dropna: bool = True,
|
dropna: bool = True,
|
||||||
is_dataframe_agg: bool = False,
|
is_dataframe_agg: bool = False,
|
||||||
numeric_only: bool = True,
|
numeric_only: Optional[bool] = True,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
return self._operations.aggs_groupby(
|
return self._operations.aggs_groupby(
|
||||||
self,
|
self,
|
||||||
|
@ -176,3 +176,21 @@ class TestGroupbyDataFrame(TestData):
|
|||||||
assert_index_equal(pd_count.index, ed_count.index)
|
assert_index_equal(pd_count.index, ed_count.index)
|
||||||
assert_frame_equal(pd_count, ed_count)
|
assert_frame_equal(pd_count, ed_count)
|
||||||
assert_series_equal(pd_count.dtypes, ed_count.dtypes)
|
assert_series_equal(pd_count.dtypes, ed_count.dtypes)
|
||||||
|
|
||||||
|
def test_groupby_dataframe_mad(self):
|
||||||
|
pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"])
|
||||||
|
ed_flights = self.ed_flights().filter(self.filter_data + ["DestCountry"])
|
||||||
|
|
||||||
|
pd_mad = pd_flights.groupby("DestCountry").mad()
|
||||||
|
ed_mad = ed_flights.groupby("DestCountry").mad()
|
||||||
|
|
||||||
|
assert_index_equal(pd_mad.columns, ed_mad.columns)
|
||||||
|
assert_index_equal(pd_mad.index, ed_mad.index)
|
||||||
|
assert_series_equal(pd_mad.dtypes, ed_mad.dtypes)
|
||||||
|
|
||||||
|
pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"])
|
||||||
|
ed_min_mad = ed_flights.groupby("DestCountry").aggregate(["min", "mad"])
|
||||||
|
|
||||||
|
assert_index_equal(pd_min_mad.columns, ed_min_mad.columns)
|
||||||
|
assert_index_equal(pd_min_mad.index, ed_min_mad.index)
|
||||||
|
assert_series_equal(pd_min_mad.dtypes, ed_min_mad.dtypes)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user