mirror of https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00

Document DataFrame.groupby() methods

This commit is contained in:
parent 475e0f41ef
commit ae70f03df3
@@ -0,0 +1,6 @@
+eland.groupby.DataFrameGroupBy.agg
+==================================
+
+.. currentmodule:: eland.groupby
+
+.. automethod:: DataFrameGroupBy.agg
@@ -0,0 +1,6 @@
+eland.groupby.DataFrameGroupBy.aggregate
+========================================
+
+.. currentmodule:: eland.groupby
+
+.. automethod:: DataFrameGroupBy.aggregate
@@ -0,0 +1,6 @@
+eland.groupby.DataFrameGroupBy.count
+====================================
+
+.. currentmodule:: eland.groupby
+
+.. automethod:: DataFrameGroupBy.count
@@ -0,0 +1,6 @@
+eland.groupby.DataFrameGroupBy.mad
+==================================
+
+.. currentmodule:: eland.groupby
+
+.. automethod:: DataFrameGroupBy.mad
@@ -0,0 +1,6 @@
+eland.groupby.DataFrameGroupBy.max
+==================================
+
+.. currentmodule:: eland.groupby
+
+.. automethod:: DataFrameGroupBy.max
@@ -0,0 +1,6 @@
+eland.groupby.DataFrameGroupBy.mean
+===================================
+
+.. currentmodule:: eland.groupby
+
+.. automethod:: DataFrameGroupBy.mean
@@ -0,0 +1,6 @@
+eland.groupby.DataFrameGroupBy.median
+=====================================
+
+.. currentmodule:: eland.groupby
+
+.. automethod:: DataFrameGroupBy.median
@@ -0,0 +1,6 @@
+eland.groupby.DataFrameGroupBy.min
+==================================
+
+.. currentmodule:: eland.groupby
+
+.. automethod:: DataFrameGroupBy.min
@@ -0,0 +1,6 @@
+eland.groupby.DataFrameGroupBy.nunique
+======================================
+
+.. currentmodule:: eland.groupby
+
+.. automethod:: DataFrameGroupBy.nunique
15 docs/source/reference/api/eland.groupby.DataFrameGroupBy.rst Normal file
@@ -0,0 +1,15 @@
+eland.groupby.DataFrameGroupBy
+==============================
+
+.. currentmodule:: eland.groupby
+
+.. autoclass:: DataFrameGroupBy
+
+
+..
+    HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.
+    .. autosummary::
+        :toctree:
+
+        DataFrame.abs
+        DataFrame.add
@@ -0,0 +1,6 @@
+eland.groupby.DataFrameGroupBy.std
+==================================
+
+.. currentmodule:: eland.groupby
+
+.. automethod:: DataFrameGroupBy.std
@@ -0,0 +1,6 @@
+eland.groupby.DataFrameGroupBy.sum
+==================================
+
+.. currentmodule:: eland.groupby
+
+.. automethod:: DataFrameGroupBy.sum
@@ -0,0 +1,6 @@
+eland.groupby.DataFrameGroupBy.var
+==================================
+
+.. currentmodule:: eland.groupby
+
+.. automethod:: DataFrameGroupBy.var
15 docs/source/reference/api/eland.groupby.GroupBy.rst Normal file
@@ -0,0 +1,15 @@
+eland.groupby.GroupBy
+=====================
+
+.. currentmodule:: eland.groupby
+
+.. autoclass:: GroupBy
+
+
+..
+    HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.
+    .. autosummary::
+        :toctree:
+
+        DataFrame.abs
+        DataFrame.add
@@ -48,6 +48,28 @@ Function Application, GroupBy & Window
    DataFrame.aggregate
    DataFrame.groupby
+
+.. currentmodule:: eland.groupby
+
+.. autosummary::
+   :toctree: api/
+
+   DataFrameGroupBy
+   DataFrameGroupBy.agg
+   DataFrameGroupBy.aggregate
+   DataFrameGroupBy.count
+   DataFrameGroupBy.mad
+   DataFrameGroupBy.max
+   DataFrameGroupBy.mean
+   DataFrameGroupBy.median
+   DataFrameGroupBy.min
+   DataFrameGroupBy.nunique
+   DataFrameGroupBy.std
+   DataFrameGroupBy.sum
+   DataFrameGroupBy.var
+   GroupBy
+
+.. currentmodule:: eland
 
 .. _api.dataframe.stats:
 
 Computations / Descriptive Stats
@@ -1,7 +1,7 @@
 .. _api.general_utility_functions:
 
 =========================
-General utility functions
+General Utility Functions
 =========================
 .. currentmodule:: eland
 
@@ -170,7 +170,7 @@ script instead of being modified manually.
 +---------------------------------------+------------+
 | ``ed.DataFrame.get()``                | **Yes**    |
 +---------------------------------------+------------+
-| ``ed.DataFrame.groupby()``            | No         |
+| ``ed.DataFrame.groupby()``            | **Yes**    |
 +---------------------------------------+------------+
 | ``ed.DataFrame.gt()``                 | No         |
 +---------------------------------------+------------+
@@ -36,7 +36,7 @@ from pandas.util._validators import validate_bool_kwarg
 import eland.plotting as gfx
 from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
 from eland.filter import BooleanFilter
-from eland.groupby import GroupByDataFrame
+from eland.groupby import DataFrameGroupBy
 from eland.ndframe import NDFrame
 from eland.series import Series
 from eland.utils import deprecated_api, is_valid_attr_name
@@ -1433,7 +1433,7 @@ class DataFrame(NDFrame):
 
     def groupby(
         self, by: Optional[Union[str, List[str]]] = None, dropna: bool = True
-    ) -> "GroupByDataFrame":
+    ) -> "DataFrameGroupBy":
         """
         Used to perform groupby operations
 
@@ -1448,7 +1448,7 @@ class DataFrame(NDFrame):
 
         Returns
         -------
-        GroupByDataFrame
+        eland.groupby.DataFrameGroupBy
 
         See Also
         --------
@@ -1520,7 +1520,7 @@ class DataFrame(NDFrame):
                 f"Requested columns {repr(remaining_columns)[1:-1]} not in the DataFrame"
             )
 
-        return GroupByDataFrame(
+        return DataFrameGroupBy(
             by=by, query_compiler=self._query_compiler.copy(), dropna=dropna
         )
 
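A minimal usage sketch of the renamed return type, assuming a local Elasticsearch cluster with the demo "flights" index loaded (host and index mirror the docstring examples in this commit, not anything new):

    import eland as ed

    df = ed.DataFrame("localhost", "flights")

    # DataFrame.groupby() now returns an eland.groupby.DataFrameGroupBy
    grouped = df.groupby("DestCountry")

    # Aggregations are pushed down to Elasticsearch and come back
    # as ordinary pandas DataFrames
    print(grouped.mean(numeric_only=True).head())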
486 eland/groupby.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List, Optional, Union
 
 from eland.query_compiler import QueryCompiler
 
@@ -25,16 +25,7 @@ if TYPE_CHECKING:
 
 class GroupBy:
     """
-    Base class for calls to X.groupby([...])
-
-    Parameters
-    ----------
-    by:
-        List of columns to groupby
-    query_compiler:
-        Query compiler object
-    dropna:
-        default is true, drop None/NaT/NaN values while grouping
+    Base class for calls to :py:func:`eland.DataFrame.groupby`
     """
 
     def __init__(
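The resulting class hierarchy, sketched from the pieces of this diff (GroupBy keeps only the shared state set in __init__; DataFrameGroupBy, introduced just below, carries the aggregation methods):

    class GroupBy:                    # stores self._by, self._dropna and a
        ...                           # copy of the query compiler

    class DataFrameGroupBy(GroupBy):  # mean/var/std/mad/median/sum/min/max/
        ...                           # nunique plus aggregate() and count()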
@@ -47,7 +38,56 @@ class GroupBy:
         self._dropna: bool = dropna
         self._by: List[str] = by
 
+
+class DataFrameGroupBy(GroupBy):
+    """
+    This holds all the groupby methods for :py:func:`eland.DataFrame.groupby`
+    """
+
     def mean(self, numeric_only: bool = True) -> "pd.DataFrame":
+        """
+        Compute the mean value for each group.
+
+        Parameters
+        ----------
+        numeric_only: {True, False, None}, default is True
+            Which datatypes to return
+            - True: Returns all values as float64, NaN/NaT values are removed
+            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
+            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
+
+        Returns
+        -------
+        pandas.DataFrame
+            mean value for each numeric column of each group
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.core.groupby.GroupBy.mean`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame(
+        ...     "localhost", "flights",
+        ...     columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
+        ... )
+        >>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
+                     AvgTicketPrice  Cancelled  dayOfWeek                     timestamp
+        DestCountry
+        AE               605.132970   0.152174   2.695652 2018-01-21 16:58:07.891304443
+        AR               674.827252   0.147541   2.744262 2018-01-21 22:18:06.593442627
+        AT               646.650530   0.175066   2.872679 2018-01-21 15:54:42.469496094
+        AU               669.558832   0.129808   2.843750 2018-01-22 02:28:39.199519287
+        CA               648.747109   0.134534   2.951271 2018-01-22 14:40:47.165254150
+        ...                     ...        ...        ...                           ...
+        RU               662.994963   0.131258   2.832206 2018-01-21 07:11:16.534506104
+        SE               660.612988   0.149020   2.682353 2018-01-22 07:48:23.447058838
+        TR               485.253247   0.100000   1.900000 2018-01-16 16:02:33.000000000
+        US               595.774391   0.125315   2.753900 2018-01-21 16:55:04.456970215
+        ZA               643.053057   0.148410   2.766784 2018-01-22 15:17:56.141342773
+        <BLANKLINE>
+        [32 rows x 4 columns]
+        """
         return self._query_compiler.aggs_groupby(
             by=self._by,
             pd_aggs=["mean"],
@@ -56,6 +96,49 @@ class GroupBy:
         )
 
     def var(self, numeric_only: bool = True) -> "pd.DataFrame":
+        """
+        Compute the variance value for each group.
+
+        Parameters
+        ----------
+        numeric_only: {True, False, None}, default is True
+            Which datatypes to return
+            - True: Returns all values as float64, NaN/NaT values are removed
+            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
+            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
+
+        Returns
+        -------
+        pandas.DataFrame
+            variance value for each numeric column of each group
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.core.groupby.GroupBy.var`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame(
+        ...     "localhost", "flights",
+        ...     columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
+        ... )
+        >>> df.groupby("DestCountry").var() # doctest: +NORMALIZE_WHITESPACE
+                     AvgTicketPrice  Cancelled  dayOfWeek
+        DestCountry
+        AE             75789.979090   0.130443   3.950549
+        AR             59683.055316   0.125979   3.783429
+        AT             65726.669676   0.144610   4.090013
+        AU             65088.483446   0.113094   3.833562
+        CA             68149.950516   0.116496   3.688139
+        ...                     ...        ...        ...
+        RU             67305.277617   0.114107   3.852666
+        SE             53740.570338   0.127062   3.942132
+        TR             61245.521047   0.094868   4.100420
+        US             74349.939410   0.109638   3.758700
+        ZA             62920.072901   0.126608   3.775609
+        <BLANKLINE>
+        [32 rows x 3 columns]
+        """
         return self._query_compiler.aggs_groupby(
             by=self._by,
             pd_aggs=["var"],
@@ -64,6 +147,49 @@ class GroupBy:
         )
 
     def std(self, numeric_only: bool = True) -> "pd.DataFrame":
+        """
+        Compute the standard deviation value for each group.
+
+        Parameters
+        ----------
+        numeric_only: {True, False, None}, default is True
+            Which datatypes to return
+            - True: Returns all values as float64, NaN/NaT values are removed
+            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
+            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
+
+        Returns
+        -------
+        pandas.DataFrame
+            standard deviation value for each numeric column of each group
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.core.groupby.GroupBy.std`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame(
+        ...     "localhost", "flights",
+        ...     columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
+        ... )
+        >>> df.groupby("DestCountry").std() # doctest: +NORMALIZE_WHITESPACE
+                     AvgTicketPrice  Cancelled  dayOfWeek
+        DestCountry
+        AE               279.875500   0.367171   2.020634
+        AR               244.903626   0.355811   1.949901
+        AT               256.883342   0.381035   2.026411
+        AU               255.585377   0.336902   1.961486
+        CA               261.263054   0.341587   1.921980
+        ...                     ...        ...        ...
+        RU               259.696213   0.338140   1.964815
+        SE               232.504297   0.357510   1.991340
+        TR               267.827572   0.333333   2.191454
+        US               272.774819   0.331242   1.939469
+        ZA               251.505568   0.356766   1.948258
+        <BLANKLINE>
+        [32 rows x 3 columns]
+        """
         return self._query_compiler.aggs_groupby(
             by=self._by,
             pd_aggs=["std"],
@@ -72,6 +198,49 @@ class GroupBy:
         )
 
     def mad(self, numeric_only: bool = True) -> "pd.DataFrame":
+        """
+        Compute the median absolute deviation value for each group.
+
+        Parameters
+        ----------
+        numeric_only: {True, False, None}, default is True
+            Which datatypes to return
+            - True: Returns all values as float64, NaN/NaT values are removed
+            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
+            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
+
+        Returns
+        -------
+        pandas.DataFrame
+            median absolute deviation value for each numeric column of each group
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.core.groupby.GroupBy.mad`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame(
+        ...     "localhost", "flights",
+        ...     columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
+        ... )
+        >>> df.groupby("DestCountry").mad() # doctest: +SKIP
+                     AvgTicketPrice  Cancelled  dayOfWeek
+        DestCountry
+        AE               233.697174        NaN        1.5
+        AR               189.250061        NaN        2.0
+        AT               195.823669        NaN        2.0
+        AU               202.539764        NaN        2.0
+        CA               203.344696        NaN        2.0
+        ...                     ...        ...        ...
+        RU               206.431702        NaN        2.0
+        SE               178.658447        NaN        2.0
+        TR               221.863434        NaN        1.0
+        US               228.461365        NaN        2.0
+        ZA               192.162842        NaN        2.0
+        <BLANKLINE>
+        [32 rows x 3 columns]
+        """
         return self._query_compiler.aggs_groupby(
             by=self._by,
             pd_aggs=["mad"],
@@ -80,6 +249,49 @@ class GroupBy:
         )
 
     def median(self, numeric_only: bool = True) -> "pd.DataFrame":
+        """
+        Compute the median value for each group.
+
+        Parameters
+        ----------
+        numeric_only: {True, False, None}, default is True
+            Which datatypes to return
+            - True: Returns all values as float64, NaN/NaT values are removed
+            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
+            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
+
+        Returns
+        -------
+        pandas.DataFrame
+            median value for each numeric column of each group
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.core.groupby.GroupBy.median`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame(
+        ...     "localhost", "flights",
+        ...     columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
+        ... )
+        >>> df.groupby("DestCountry").median(numeric_only=False) # doctest: +SKIP
+                     AvgTicketPrice  Cancelled  dayOfWeek               timestamp
+        DestCountry
+        AE               585.720490      False          2 2018-01-19 23:56:44.000
+        AR               678.447433      False          3 2018-01-22 10:18:50.000
+        AT               659.715592      False          3 2018-01-20 20:40:10.000
+        AU               689.241348      False          3 2018-01-22 18:46:11.000
+        CA               663.516057      False          3 2018-01-22 21:35:09.500
+        ...                     ...        ...        ...                     ...
+        RU               670.714956      False          3 2018-01-20 16:48:16.000
+        SE               680.111084      False          3 2018-01-22 20:53:44.000
+        TR               441.681122      False          1 2018-01-13 23:17:27.000
+        US               600.591525      False          3 2018-01-22 04:09:50.000
+        ZA               633.935425      False          3 2018-01-23 17:42:57.000
+        <BLANKLINE>
+        [32 rows x 4 columns]
+        """
         return self._query_compiler.aggs_groupby(
             by=self._by,
             pd_aggs=["median"],
@@ -88,6 +300,49 @@ class GroupBy:
         )
 
     def sum(self, numeric_only: bool = True) -> "pd.DataFrame":
+        """
+        Compute the sum value for each group.
+
+        Parameters
+        ----------
+        numeric_only: {True, False, None}, default is True
+            Which datatypes to return
+            - True: Returns all values as float64, NaN/NaT values are removed
+            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
+            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
+
+        Returns
+        -------
+        pandas.DataFrame
+            sum value for each numeric column of each group
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.core.groupby.GroupBy.sum`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame(
+        ...     "localhost", "flights",
+        ...     columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
+        ... )
+        >>> df.groupby("DestCountry").sum() # doctest: +NORMALIZE_WHITESPACE
+                     AvgTicketPrice  Cancelled  dayOfWeek
+        DestCountry
+        AE             2.783612e+04        7.0      124.0
+        AR             2.058223e+05       45.0      837.0
+        AT             2.437872e+05       66.0     1083.0
+        AU             2.785365e+05       54.0     1183.0
+        CA             6.124173e+05      127.0     2786.0
+        ...                     ...        ...        ...
+        RU             4.899533e+05       97.0     2093.0
+        SE             1.684563e+05       38.0      684.0
+        TR             4.852532e+03        1.0       19.0
+        US             1.183804e+06      249.0     5472.0
+        ZA             1.819840e+05       42.0      783.0
+        <BLANKLINE>
+        [32 rows x 3 columns]
+        """
         return self._query_compiler.aggs_groupby(
             by=self._by,
             pd_aggs=["sum"],
@@ -96,6 +351,49 @@ class GroupBy:
         )
 
     def min(self, numeric_only: bool = True) -> "pd.DataFrame":
+        """
+        Compute the min value for each group.
+
+        Parameters
+        ----------
+        numeric_only: {True, False, None}, default is True
+            Which datatypes to return
+            - True: Returns all values as float64, NaN/NaT values are removed
+            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
+            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
+
+        Returns
+        -------
+        pandas.DataFrame
+            min value for each numeric column of each group
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.core.groupby.GroupBy.min`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame(
+        ...     "localhost", "flights",
+        ...     columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
+        ... )
+        >>> df.groupby("DestCountry").min(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
+                     AvgTicketPrice  Cancelled  dayOfWeek           timestamp
+        DestCountry
+        AE               110.799911      False          0 2018-01-01 19:31:30
+        AR               125.589394      False          0 2018-01-01 01:30:47
+        AT               100.020531      False          0 2018-01-01 05:24:19
+        AU               102.294312      False          0 2018-01-01 00:00:00
+        CA               100.557251      False          0 2018-01-01 00:44:08
+        ...                     ...        ...        ...                 ...
+        RU               101.004005      False          0 2018-01-01 01:01:51
+        SE               102.877190      False          0 2018-01-01 04:09:38
+        TR               142.876465      False          0 2018-01-01 06:45:17
+        US               100.145966      False          0 2018-01-01 00:06:27
+        ZA               102.002663      False          0 2018-01-01 06:44:44
+        <BLANKLINE>
+        [32 rows x 4 columns]
+        """
         return self._query_compiler.aggs_groupby(
             by=self._by,
             pd_aggs=["min"],
@@ -104,6 +402,49 @@ class GroupBy:
         )
 
     def max(self, numeric_only: bool = True) -> "pd.DataFrame":
+        """
+        Compute the max value for each group.
+
+        Parameters
+        ----------
+        numeric_only: {True, False, None}, default is True
+            Which datatypes to return
+            - True: Returns all values as float64, NaN/NaT values are removed
+            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
+            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved
+
+        Returns
+        -------
+        pandas.DataFrame
+            max value for each numeric column of each group
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.core.groupby.GroupBy.max`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame(
+        ...     "localhost", "flights",
+        ...     columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
+        ... )
+        >>> df.groupby("DestCountry").max(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
+                     AvgTicketPrice  Cancelled  dayOfWeek           timestamp
+        DestCountry
+        AE              1126.148682       True          6 2018-02-11 04:11:14
+        AR              1199.642822       True          6 2018-02-11 17:09:05
+        AT              1181.835815       True          6 2018-02-11 23:12:33
+        AU              1197.632690       True          6 2018-02-11 21:39:01
+        CA              1198.852539       True          6 2018-02-11 23:04:08
+        ...                     ...        ...        ...                 ...
+        RU              1196.742310       True          6 2018-02-11 20:03:31
+        SE              1198.621582       True          6 2018-02-11 22:06:14
+        TR               855.935547       True          6 2018-02-04 01:59:23
+        US              1199.729004       True          6 2018-02-11 23:27:00
+        ZA              1196.186157       True          6 2018-02-11 23:29:45
+        <BLANKLINE>
+        [32 rows x 4 columns]
+        """
         return self._query_compiler.aggs_groupby(
             by=self._by,
             pd_aggs=["max"],
@@ -112,6 +453,49 @@ class GroupBy:
         )
 
     def nunique(self) -> "pd.DataFrame":
+        """
+        Compute the number of unique values for each group.
+
+        Returns
+        -------
+        pandas.DataFrame
+            number of unique values for each column of each group
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.core.groupby.GroupBy.nunique`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame(
+        ...     "localhost", "flights",
+        ...     columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
+        ... )
+        >>> df.groupby("DestCountry").nunique() # doctest: +NORMALIZE_WHITESPACE
+                     AvgTicketPrice  Cancelled  dayOfWeek
+        DestCountry
+        AE                       46          2          7
+        AR                      305          2          7
+        AT                      377          2          7
+        AU                      416          2          7
+        CA                      944          2          7
+        ...                     ...        ...        ...
+        RU                      739          2          7
+        SE                      255          2          7
+        TR                       10          2          5
+        US                     1987          2          7
+        ZA                      283          2          7
+        <BLANKLINE>
+        [32 rows x 3 columns]
+        """
        return self._query_compiler.aggs_groupby(
             by=self._by,
             pd_aggs=["nunique"],
@@ -119,22 +503,9 @@ class GroupBy:
             numeric_only=False,
         )
 
-
-class GroupByDataFrame(GroupBy):
-    """
-    This holds all the groupby methods for DataFrame
-
-    Parameters
-    ----------
-    by:
-        List of columns to groupby
-    query_compiler:
-        Query compiler object
-    dropna:
-        default is true, drop None/NaT/NaN values while grouping
-    """
-
-    def aggregate(self, func: List[str], numeric_only: bool = False) -> "pd.DataFrame":
+    def aggregate(
+        self, func: Union[str, List[str]], numeric_only: Optional[bool] = False
+    ) -> "pd.DataFrame":
         """
         Used to groupby and aggregate
 
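With the widened annotation, both call forms below are valid; the single-string form is the new addition (same flights DataFrame as in the surrounding docstring examples):

    df.groupby("DestCountry").aggregate("min")           # func as a str
    df.groupby("DestCountry").aggregate(["min", "max"])  # func as a List[str]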
@@ -155,8 +526,36 @@ class GroupByDataFrame(GroupBy):
 
         Returns
         -------
-        A Pandas DataFrame
+        pandas.DataFrame
+            aggregation value for each numeric column of each group
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.core.groupby.GroupBy.aggregate`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame(
+        ...     "localhost", "flights",
+        ...     columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
+        ... )
+        >>> df.groupby("DestCountry").aggregate(["min", "max"]) # doctest: +NORMALIZE_WHITESPACE
+                    AvgTicketPrice              ... dayOfWeek
+                               min          max ...       min max
+        DestCountry                             ...
+        AE              110.799911  1126.148682 ...         0   6
+        AR              125.589394  1199.642822 ...         0   6
+        AT              100.020531  1181.835815 ...         0   6
+        AU              102.294312  1197.632690 ...         0   6
+        CA              100.557251  1198.852539 ...         0   6
+        ...                    ...          ... ...       ...  ..
+        RU              101.004005  1196.742310 ...         0   6
+        SE              102.877190  1198.621582 ...         0   6
+        TR              142.876465   855.935547 ...         0   6
+        US              100.145966  1199.729004 ...         0   6
+        ZA              102.002663  1196.186157 ...         0   6
+        <BLANKLINE>
+        [32 rows x 6 columns]
         """
         # Controls whether a MultiIndex is used for the
         # columns of the result DataFrame.
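The MultiIndex behaviour referenced in that comment mirrors plain pandas: a list of functions produces (column, agg) MultiIndex columns, while a single function keeps flat columns. A pandas-only illustration (not code from this commit):

    import pandas as pd

    pdf = pd.DataFrame({"g": ["a", "a", "b"], "x": [1.0, 2.0, 3.0]})

    # list of aggs -> MultiIndex([('x', 'min'), ('x', 'max')], ...)
    print(pdf.groupby("g").aggregate(["min", "max"]).columns)

    # single agg -> Index(['x'])
    print(pdf.groupby("g").aggregate("min").columns)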
@@ -177,12 +576,39 @@ class GroupByDataFrame(GroupBy):
 
     def count(self) -> "pd.DataFrame":
         """
-        Used to groupby and count
+        Compute the count value for each group.
 
         Returns
         -------
-        A Pandas DataFrame
+        pandas.DataFrame
+            count value for each column of each group
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.core.groupby.GroupBy.count`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame(
+        ...     "localhost", "flights",
+        ...     columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
+        ... )
+        >>> df.groupby("DestCountry").count() # doctest: +NORMALIZE_WHITESPACE
+                     AvgTicketPrice  Cancelled  dayOfWeek
+        DestCountry
+        AE                       46         46         46
+        AR                      305        305        305
+        AT                      377        377        377
+        AU                      416        416        416
+        CA                      944        944        944
+        ...                     ...        ...        ...
+        RU                      739        739        739
+        SE                      255        255        255
+        TR                       10         10         10
+        US                     1987       1987       1987
+        ZA                      283        283        283
+        <BLANKLINE>
+        [32 rows x 3 columns]
         """
         return self._query_compiler.aggs_groupby(
             by=self._by,
@@ -545,7 +545,7 @@ class Operations:
         pd_aggs: List[str],
         dropna: bool = True,
         is_dataframe_agg: bool = False,
-        numeric_only: bool = True,
+        numeric_only: Optional[bool] = True,
     ) -> pd.DataFrame:
         """
         This method is used to construct a groupby aggregation dataframe
@@ -570,15 +570,98 @@
         -------
         A dataframe which consists of groupby data
         """
-        headers, results = self._groupby_aggs(
-            query_compiler,
-            by=by,
-            pd_aggs=pd_aggs,
-            dropna=dropna,
-            is_dataframe_agg=is_dataframe_agg,
-            numeric_only=numeric_only,
-        )
+        query_params, post_processing = self._resolve_tasks(query_compiler)
+
+        size = self._size(query_params, post_processing)
+        if size is not None:
+            raise NotImplementedError(
+                f"Can not count field matches if size is set {size}"
+            )
+
+        by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by)
+
+        # Used defaultdict to avoid initialization of columns with lists
+        results: Dict[str, List[Any]] = defaultdict(list)
+
+        if numeric_only:
+            agg_fields = [
+                field for field in agg_fields if (field.is_numeric or field.is_bool)
+            ]
+
+        body = Query(query_params.query)
+
+        # To return for creating multi-index on columns
+        headers = [agg_field.column for agg_field in agg_fields]
+
+        # Convert pandas aggs to ES equivalent
+        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
+
+        # Construct Query
+        for by_field in by_fields:
+            # groupby fields will be term aggregations
+            body.composite_agg_bucket_terms(
+                name=f"groupby_{by_field.column}",
+                field=by_field.aggregatable_es_field_name,
+            )
+
+        for agg_field in agg_fields:
+            for es_agg in es_aggs:
+                # Skip if the field isn't compatible or if the agg is
+                # 'value_count' as this value is pulled from bucket.doc_count.
+                if not agg_field.is_es_agg_compatible(es_agg):
+                    continue
+
+                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
+                if isinstance(es_agg, tuple):
+                    body.metric_aggs(
+                        f"{es_agg[0]}_{agg_field.es_field_name}",
+                        es_agg[0],
+                        agg_field.aggregatable_es_field_name,
+                    )
+                else:
+                    body.metric_aggs(
+                        f"{es_agg}_{agg_field.es_field_name}",
+                        es_agg,
+                        agg_field.aggregatable_es_field_name,
+                    )
+
+        # Composite aggregation
+        body.composite_agg_start(
+            size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
+        )
+
+        for buckets in self.bucket_generator(query_compiler, body):
+            # We receive the response row-wise
+            for bucket in buckets:
+                # groupby columns are added to result same way they are returned
+                for by_field in by_fields:
+                    bucket_key = bucket["key"][f"groupby_{by_field.column}"]
+
+                    # Datetimes always come back as integers, convert to pd.Timestamp()
+                    if by_field.is_timestamp and isinstance(bucket_key, int):
+                        bucket_key = pd.to_datetime(bucket_key, unit="ms")
+
+                    results[by_field.column].append(bucket_key)
+
+                agg_calculation = self._unpack_metric_aggs(
+                    fields=agg_fields,
+                    es_aggs=es_aggs,
+                    pd_aggs=pd_aggs,
+                    response={"aggregations": bucket},
+                    numeric_only=numeric_only,
+                    # We set 'True' here because we want the value
+                    # unpacking to always be in 'dataframe' mode.
+                    is_dataframe_agg=True,
+                )
+
+                # Process the calculated agg values to response
+                for key, value in agg_calculation.items():
+                    if not isinstance(value, list):
+                        results[key].append(value)
+                        continue
+                    for pd_agg, val in zip(pd_aggs, value):
+                        results[f"{key}_{pd_agg}"].append(val)
+
+        agg_df = pd.DataFrame(results).set_index(by)
 
         if is_dataframe_agg:
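For orientation, the loop above assembles an Elasticsearch composite aggregation: one terms source per groupby column plus one metric sub-aggregation per (field, agg) pair, paginated by re-issuing the request with the previous response's "after" key. A hand-written sketch of the request body this produces (field names and sizes are illustrative, not taken from the diff):

    es_body = {
        "size": 0,
        "aggs": {
            "groupby_buckets": {
                "composite": {
                    "size": 1000,  # DEFAULT_PAGINATION_SIZE stands in here
                    "sources": [
                        {"groupby_DestCountry": {"terms": {"field": "DestCountry"}}}
                    ],
                },
                "aggs": {
                    "avg_AvgTicketPrice": {"avg": {"field": "AvgTicketPrice"}}
                },
            }
        },
    }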
@@ -636,146 +719,6 @@
         else:
             return composite_buckets["buckets"]
 
-    def _groupby_aggs(
-        self,
-        query_compiler: "QueryCompiler",
-        by: List[str],
-        pd_aggs: List[str],
-        dropna: bool = True,
-        is_dataframe_agg: bool = False,
-        numeric_only: bool = True,
-    ) -> Tuple[List[str], Dict[str, Any]]:
-        """
-        This method is used to calculate groupby aggregations
-
-        Parameters
-        ----------
-        query_compiler:
-            A Query compiler
-        by:
-            a list of columns on which groupby operations have to be performed
-        pd_aggs:
-            a list of aggregations to be performed
-        dropna:
-            Drop None values if True.
-            TODO Not yet implemented
-        is_dataframe_agg:
-            Know if multi aggregation or single agg is called.
-        numeric_only:
-            return either numeric values or NaN/NaT
-
-        Returns
-        -------
-        headers: columns on which MultiIndex has to be applied
-        response: dictionary of groupby aggregated values
-        """
-        query_params, post_processing = self._resolve_tasks(query_compiler)
-
-        size = self._size(query_params, post_processing)
-        if size is not None:
-            raise NotImplementedError(
-                f"Can not count field matches if size is set {size}"
-            )
-
-        by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by)
-
-        # Used defaultdict to avoid initialization of columns with lists
-        response: Dict[str, List[Any]] = defaultdict(list)
-
-        if numeric_only:
-            agg_fields = [
-                field for field in agg_fields if (field.is_numeric or field.is_bool)
-            ]
-
-        body = Query(query_params.query)
-
-        # To return for creating multi-index on columns
-        headers = [field.column for field in agg_fields]
-
-        # Convert pandas aggs to ES equivalent
-        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
-
-        # pd_agg 'count' is handled via 'doc_count' from buckets
-        using_pd_agg_count = "count" in pd_aggs
-
-        # Construct Query
-        for by_field in by_fields:
-            # groupby fields will be term aggregations
-            body.composite_agg_bucket_terms(
-                name=f"groupby_{by_field.column}",
-                field=by_field.aggregatable_es_field_name,
-            )
-
-        for agg_field in agg_fields:
-            for es_agg in es_aggs:
-                # Skip if the field isn't compatible or if the agg is
-                # 'value_count' as this value is pulled from bucket.doc_count.
-                if (
-                    not agg_field.is_es_agg_compatible(es_agg)
-                    or es_agg == "value_count"
-                ):
-                    continue
-
-                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
-                if isinstance(es_agg, tuple):
-                    body.metric_aggs(
-                        f"{es_agg[0]}_{agg_field.es_field_name}",
-                        es_agg[0],
-                        agg_field.aggregatable_es_field_name,
-                    )
-                else:
-                    body.metric_aggs(
-                        f"{es_agg}_{agg_field.es_field_name}",
-                        es_agg,
-                        agg_field.aggregatable_es_field_name,
-                    )
-
-        # Composite aggregation
-        body.composite_agg_start(
-            size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
-        )
-
-        for buckets in self.bucket_generator(query_compiler, body):
-            # We receive the response row-wise
-            for bucket in buckets:
-                # groupby columns are added to result same way they are returned
-                for by_field in by_fields:
-                    bucket_key = bucket["key"][f"groupby_{by_field.column}"]
-
-                    # Datetimes always come back as integers, convert to pd.Timestamp()
-                    if by_field.is_timestamp and isinstance(bucket_key, int):
-                        bucket_key = pd.to_datetime(bucket_key, unit="ms")
-
-                    response[by_field.column].append(bucket_key)
-
-                # Put 'doc_count' from bucket into each 'agg_field'
-                # to be extracted from _unpack_metric_aggs()
-                if using_pd_agg_count:
-                    doc_count = bucket["doc_count"]
-                    for agg_field in agg_fields:
-                        bucket[f"value_count_{agg_field.es_field_name}"] = {
-                            "value": doc_count
-                        }
-
-                agg_calculation = self._unpack_metric_aggs(
-                    fields=agg_fields,
-                    es_aggs=es_aggs,
-                    pd_aggs=pd_aggs,
-                    response={"aggregations": bucket},
-                    numeric_only=numeric_only,
-                    is_dataframe_agg=is_dataframe_agg,
-                )
-
-                # Process the calculated agg values to response
-                for key, value in agg_calculation.items():
-                    if not isinstance(value, list):
-                        response[key].append(value)
-                        continue
-                    for pd_agg, val in zip(pd_aggs, value):
-                        response[f"{key}_{pd_agg}"].append(val)
-
-        return headers, response
 
     @staticmethod
     def _map_pd_aggs_to_es_aggs(pd_aggs):
         """
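_map_pd_aggs_to_es_aggs is cut off here, but its job is the pandas-name to Elasticsearch-aggregation-name translation used above. A hedged sketch of the kind of table involved (illustrative only; the real function also folds several pandas aggs into shared "extended_stats" tuples, which is why callers check isinstance(es_agg, tuple)):

    # Not the exact mapping eland uses; the Elasticsearch agg names are real.
    pd_to_es = {
        "count": "value_count",
        "mean": "avg",
        "sum": "sum",
        "min": "min",
        "max": "max",
        "nunique": "cardinality",
        "mad": "median_absolute_deviation",
    }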
@@ -556,7 +556,7 @@ class QueryCompiler:
         pd_aggs: List[str],
         dropna: bool = True,
         is_dataframe_agg: bool = False,
-        numeric_only: bool = True,
+        numeric_only: Optional[bool] = True,
     ) -> pd.DataFrame:
         return self._operations.aggs_groupby(
             self,
@@ -176,3 +176,21 @@ class TestGroupbyDataFrame(TestData):
         assert_index_equal(pd_count.index, ed_count.index)
         assert_frame_equal(pd_count, ed_count)
         assert_series_equal(pd_count.dtypes, ed_count.dtypes)
+
+    def test_groupby_dataframe_mad(self):
+        pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"])
+        ed_flights = self.ed_flights().filter(self.filter_data + ["DestCountry"])
+
+        pd_mad = pd_flights.groupby("DestCountry").mad()
+        ed_mad = ed_flights.groupby("DestCountry").mad()
+
+        assert_index_equal(pd_mad.columns, ed_mad.columns)
+        assert_index_equal(pd_mad.index, ed_mad.index)
+        assert_series_equal(pd_mad.dtypes, ed_mad.dtypes)
+
+        pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"])
+        ed_min_mad = ed_flights.groupby("DestCountry").aggregate(["min", "mad"])
+
+        assert_index_equal(pd_min_mad.columns, ed_min_mad.columns)
+        assert_index_equal(pd_min_mad.index, ed_min_mad.index)
+        assert_series_equal(pd_min_mad.dtypes, ed_min_mad.dtypes)
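The same comparison the test makes can be reproduced interactively, assuming the test fixtures' "flights" index is available on a local cluster (setup is assumed, not part of the diff):

    import eland as ed

    ed_flights = ed.DataFrame("localhost", "flights")

    # Median absolute deviation per destination country, one row per group
    print(ed_flights.groupby("DestCountry").mad().head())

    # The aggregate() path exercised by the second half of the test
    print(ed_flights.groupby("DestCountry").aggregate(["min", "mad"]).head())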