mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
703 lines
28 KiB
Python
703 lines
28 KiB
Python
# Licensed to Elasticsearch B.V. under one or more contributor
|
|
# license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright
|
|
# ownership. Elasticsearch B.V. licenses this file to you under
|
|
# the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
from typing import TYPE_CHECKING, List, Optional, Union
|
|
|
|
from eland.query_compiler import QueryCompiler
|
|
|
|
if TYPE_CHECKING:
|
|
import pandas as pd # type: ignore
|
|
|
|
|
|
class GroupBy:
|
|
"""
|
|
Base class for calls to :py:func:`eland.DataFrame.groupby`
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
by: List[str],
|
|
query_compiler: "QueryCompiler",
|
|
dropna: bool = True,
|
|
) -> None:
|
|
self._query_compiler: "QueryCompiler" = QueryCompiler(to_copy=query_compiler)
|
|
self._dropna: bool = dropna
|
|
self._by: List[str] = by
|
|
|
|
|
|
class DataFrameGroupBy(GroupBy):
|
|
"""
|
|
This holds all the groupby methods for :py:func:`eland.DataFrame.groupby`
|
|
"""
|
|
|
|
def mean(self, numeric_only: bool = True) -> "pd.DataFrame":
|
|
"""
|
|
Compute the mean value for each group.
|
|
|
|
Parameters
|
|
----------
|
|
numeric_only: {True, False, None} Default is True
|
|
Which datatype to be returned
|
|
- True: Returns all values as float64, NaN/NaT values are removed
|
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
|
|
|
Returns
|
|
-------
|
|
pandas.DataFrame
|
|
mean value for each numeric column of each group
|
|
|
|
See Also
|
|
--------
|
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.mean`
|
|
|
|
Examples
|
|
--------
|
|
>>> df = ed.DataFrame(
|
|
... "localhost", "flights",
|
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
|
... )
|
|
>>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +SKIP
|
|
AvgTicketPrice Cancelled dayOfWeek timestamp
|
|
DestCountry
|
|
AE 605.132970 0.152174 2.695652 2018-01-21 16:58:07.891304443
|
|
AR 674.827252 0.147541 2.744262 2018-01-21 22:18:06.593442627
|
|
AT 646.650530 0.175066 2.872679 2018-01-21 15:54:42.469496094
|
|
AU 669.558832 0.129808 2.843750 2018-01-22 02:28:39.199519287
|
|
CA 648.747109 0.134534 2.951271 2018-01-22 14:40:47.165254150
|
|
... ... ... ... ...
|
|
RU 662.994963 0.131258 2.832206 2018-01-21 07:11:16.534506104
|
|
SE 660.612988 0.149020 2.682353 2018-01-22 07:48:23.447058838
|
|
TR 485.253247 0.100000 1.900000 2018-01-16 16:02:33.000000000
|
|
US 595.774391 0.125315 2.753900 2018-01-21 16:55:04.456970215
|
|
ZA 643.053057 0.148410 2.766784 2018-01-22 15:17:56.141342773
|
|
<BLANKLINE>
|
|
[32 rows x 4 columns]
|
|
"""
|
|
return self._query_compiler.aggs_groupby(
|
|
by=self._by,
|
|
pd_aggs=["mean"],
|
|
dropna=self._dropna,
|
|
numeric_only=numeric_only,
|
|
)
|
|
|
|
def var(self, numeric_only: bool = True) -> "pd.DataFrame":
|
|
"""
|
|
Compute the variance value for each group.
|
|
|
|
Parameters
|
|
----------
|
|
numeric_only: {True, False, None} Default is True
|
|
Which datatype to be returned
|
|
- True: Returns all values as float64, NaN/NaT values are removed
|
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
|
|
|
Returns
|
|
-------
|
|
pandas.DataFrame
|
|
variance value for each numeric column of each group
|
|
|
|
See Also
|
|
--------
|
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.var`
|
|
|
|
Examples
|
|
--------
|
|
>>> df = ed.DataFrame(
|
|
... "localhost", "flights",
|
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
|
... )
|
|
>>> df.groupby("DestCountry").var() # doctest: +NORMALIZE_WHITESPACE
|
|
AvgTicketPrice Cancelled dayOfWeek
|
|
DestCountry
|
|
AE 75789.979090 0.130443 3.950549
|
|
AR 59683.055316 0.125979 3.783429
|
|
AT 65726.669676 0.144610 4.090013
|
|
AU 65088.483446 0.113094 3.833562
|
|
CA 68149.950516 0.116496 3.688139
|
|
... ... ... ...
|
|
RU 67305.277617 0.114107 3.852666
|
|
SE 53740.570338 0.127062 3.942132
|
|
TR 61245.521047 0.094868 4.100420
|
|
US 74349.939410 0.109638 3.758700
|
|
ZA 62920.072901 0.126608 3.775609
|
|
<BLANKLINE>
|
|
[32 rows x 3 columns]
|
|
"""
|
|
return self._query_compiler.aggs_groupby(
|
|
by=self._by,
|
|
pd_aggs=["var"],
|
|
dropna=self._dropna,
|
|
numeric_only=numeric_only,
|
|
)
|
|
|
|
def std(self, numeric_only: bool = True) -> "pd.DataFrame":
|
|
"""
|
|
Compute the standard deviation value for each group.
|
|
|
|
Parameters
|
|
----------
|
|
numeric_only: {True, False, None} Default is True
|
|
Which datatype to be returned
|
|
- True: Returns all values as float64, NaN/NaT values are removed
|
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
|
|
|
Returns
|
|
-------
|
|
pandas.DataFrame
|
|
standard deviation value for each numeric column of each group
|
|
|
|
See Also
|
|
--------
|
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.std`
|
|
|
|
Examples
|
|
--------
|
|
>>> df = ed.DataFrame(
|
|
... "localhost", "flights",
|
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
|
... )
|
|
>>> df.groupby("DestCountry").std() # doctest: +NORMALIZE_WHITESPACE
|
|
AvgTicketPrice Cancelled dayOfWeek
|
|
DestCountry
|
|
AE 279.875500 0.367171 2.020634
|
|
AR 244.903626 0.355811 1.949901
|
|
AT 256.883342 0.381035 2.026411
|
|
AU 255.585377 0.336902 1.961486
|
|
CA 261.263054 0.341587 1.921980
|
|
... ... ... ...
|
|
RU 259.696213 0.338140 1.964815
|
|
SE 232.504297 0.357510 1.991340
|
|
TR 267.827572 0.333333 2.191454
|
|
US 272.774819 0.331242 1.939469
|
|
ZA 251.505568 0.356766 1.948258
|
|
<BLANKLINE>
|
|
[32 rows x 3 columns]
|
|
"""
|
|
return self._query_compiler.aggs_groupby(
|
|
by=self._by,
|
|
pd_aggs=["std"],
|
|
dropna=self._dropna,
|
|
numeric_only=numeric_only,
|
|
)
|
|
|
|
def mad(self, numeric_only: bool = True) -> "pd.DataFrame":
|
|
"""
|
|
Compute the median absolute deviation value for each group.
|
|
|
|
Parameters
|
|
----------
|
|
numeric_only: {True, False, None} Default is True
|
|
Which datatype to be returned
|
|
- True: Returns all values as float64, NaN/NaT values are removed
|
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
|
|
|
Returns
|
|
-------
|
|
pandas.DataFrame
|
|
median absolute deviation value for each numeric column of each group
|
|
|
|
See Also
|
|
--------
|
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.mad`
|
|
|
|
Examples
|
|
--------
|
|
>>> df = ed.DataFrame(
|
|
... "localhost", "flights",
|
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
|
... )
|
|
>>> df.groupby("DestCountry").mad() # doctest: +SKIP
|
|
AvgTicketPrice Cancelled dayOfWeek
|
|
DestCountry
|
|
AE 233.697174 NaN 1.5
|
|
AR 189.250061 NaN 2.0
|
|
AT 195.823669 NaN 2.0
|
|
AU 202.539764 NaN 2.0
|
|
CA 203.344696 NaN 2.0
|
|
... ... ... ...
|
|
RU 206.431702 NaN 2.0
|
|
SE 178.658447 NaN 2.0
|
|
TR 221.863434 NaN 1.0
|
|
US 228.461365 NaN 2.0
|
|
ZA 192.162842 NaN 2.0
|
|
<BLANKLINE>
|
|
[32 rows x 3 columns]
|
|
"""
|
|
return self._query_compiler.aggs_groupby(
|
|
by=self._by,
|
|
pd_aggs=["mad"],
|
|
dropna=self._dropna,
|
|
numeric_only=numeric_only,
|
|
)
|
|
|
|
def median(self, numeric_only: bool = True) -> "pd.DataFrame":
|
|
"""
|
|
Compute the median value for each group.
|
|
|
|
Parameters
|
|
----------
|
|
numeric_only: {True, False, None} Default is True
|
|
Which datatype to be returned
|
|
- True: Returns all values as float64, NaN/NaT values are removed
|
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
|
|
|
Returns
|
|
-------
|
|
pandas.DataFrame
|
|
median absolute deviation value for each numeric column of each group
|
|
|
|
See Also
|
|
--------
|
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.median`
|
|
|
|
Examples
|
|
--------
|
|
>>> df = ed.DataFrame(
|
|
... "localhost", "flights",
|
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
|
... )
|
|
>>> df.groupby("DestCountry").median(numeric_only=False) # doctest: +SKIP
|
|
AvgTicketPrice Cancelled dayOfWeek timestamp
|
|
DestCountry
|
|
AE 585.720490 False 2 2018-01-19 23:56:44.000
|
|
AR 678.447433 False 3 2018-01-22 10:18:50.000
|
|
AT 659.715592 False 3 2018-01-20 20:40:10.000
|
|
AU 689.241348 False 3 2018-01-22 18:46:11.000
|
|
CA 663.516057 False 3 2018-01-22 21:35:09.500
|
|
... ... ... ... ...
|
|
RU 670.714956 False 3 2018-01-20 16:48:16.000
|
|
SE 680.111084 False 3 2018-01-22 20:53:44.000
|
|
TR 441.681122 False 1 2018-01-13 23:17:27.000
|
|
US 600.591525 False 3 2018-01-22 04:09:50.000
|
|
ZA 633.935425 False 3 2018-01-23 17:42:57.000
|
|
<BLANKLINE>
|
|
[32 rows x 4 columns]
|
|
"""
|
|
return self._query_compiler.aggs_groupby(
|
|
by=self._by,
|
|
pd_aggs=["median"],
|
|
dropna=self._dropna,
|
|
numeric_only=numeric_only,
|
|
)
|
|
|
|
def sum(self, numeric_only: bool = True) -> "pd.DataFrame":
|
|
"""
|
|
Compute the sum value for each group.
|
|
|
|
Parameters
|
|
----------
|
|
numeric_only: {True, False, None} Default is True
|
|
Which datatype to be returned
|
|
- True: Returns all values as float64, NaN/NaT values are removed
|
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
|
|
|
Returns
|
|
-------
|
|
pandas.DataFrame
|
|
sum value for each numeric column of each group
|
|
|
|
See Also
|
|
--------
|
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.sum`
|
|
|
|
Examples
|
|
--------
|
|
>>> df = ed.DataFrame(
|
|
... "localhost", "flights",
|
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
|
... )
|
|
>>> df.groupby("DestCountry").sum() # doctest: +NORMALIZE_WHITESPACE
|
|
AvgTicketPrice Cancelled dayOfWeek
|
|
DestCountry
|
|
AE 2.783612e+04 7.0 124.0
|
|
AR 2.058223e+05 45.0 837.0
|
|
AT 2.437872e+05 66.0 1083.0
|
|
AU 2.785365e+05 54.0 1183.0
|
|
CA 6.124173e+05 127.0 2786.0
|
|
... ... ... ...
|
|
RU 4.899533e+05 97.0 2093.0
|
|
SE 1.684563e+05 38.0 684.0
|
|
TR 4.852532e+03 1.0 19.0
|
|
US 1.183804e+06 249.0 5472.0
|
|
ZA 1.819840e+05 42.0 783.0
|
|
<BLANKLINE>
|
|
[32 rows x 3 columns]
|
|
"""
|
|
return self._query_compiler.aggs_groupby(
|
|
by=self._by,
|
|
pd_aggs=["sum"],
|
|
dropna=self._dropna,
|
|
numeric_only=numeric_only,
|
|
)
|
|
|
|
def min(self, numeric_only: bool = True) -> "pd.DataFrame":
|
|
"""
|
|
Compute the min value for each group.
|
|
|
|
Parameters
|
|
----------
|
|
numeric_only: {True, False, None} Default is True
|
|
Which datatype to be returned
|
|
- True: Returns all values as float64, NaN/NaT values are removed
|
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
|
|
|
Returns
|
|
-------
|
|
pandas.DataFrame
|
|
min value for each numeric column of each group
|
|
|
|
See Also
|
|
--------
|
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.min`
|
|
|
|
Examples
|
|
--------
|
|
>>> df = ed.DataFrame(
|
|
... "localhost", "flights",
|
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
|
... )
|
|
>>> df.groupby("DestCountry").min(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
|
|
AvgTicketPrice Cancelled dayOfWeek timestamp
|
|
DestCountry
|
|
AE 110.799911 False 0 2018-01-01 19:31:30
|
|
AR 125.589394 False 0 2018-01-01 01:30:47
|
|
AT 100.020531 False 0 2018-01-01 05:24:19
|
|
AU 102.294312 False 0 2018-01-01 00:00:00
|
|
CA 100.557251 False 0 2018-01-01 00:44:08
|
|
... ... ... ... ...
|
|
RU 101.004005 False 0 2018-01-01 01:01:51
|
|
SE 102.877190 False 0 2018-01-01 04:09:38
|
|
TR 142.876465 False 0 2018-01-01 06:45:17
|
|
US 100.145966 False 0 2018-01-01 00:06:27
|
|
ZA 102.002663 False 0 2018-01-01 06:44:44
|
|
<BLANKLINE>
|
|
[32 rows x 4 columns]
|
|
"""
|
|
return self._query_compiler.aggs_groupby(
|
|
by=self._by,
|
|
pd_aggs=["min"],
|
|
dropna=self._dropna,
|
|
numeric_only=numeric_only,
|
|
)
|
|
|
|
def max(self, numeric_only: bool = True) -> "pd.DataFrame":
|
|
"""
|
|
Compute the max value for each group.
|
|
|
|
Parameters
|
|
----------
|
|
numeric_only: {True, False, None} Default is True
|
|
Which datatype to be returned
|
|
- True: Returns all values as float64, NaN/NaT values are removed
|
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
|
|
|
Returns
|
|
-------
|
|
pandas.DataFrame
|
|
max value for each numeric column of each group
|
|
|
|
See Also
|
|
--------
|
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.max`
|
|
|
|
Examples
|
|
--------
|
|
>>> df = ed.DataFrame(
|
|
... "localhost", "flights",
|
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
|
... )
|
|
>>> df.groupby("DestCountry").max(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
|
|
AvgTicketPrice Cancelled dayOfWeek timestamp
|
|
DestCountry
|
|
AE 1126.148682 True 6 2018-02-11 04:11:14
|
|
AR 1199.642822 True 6 2018-02-11 17:09:05
|
|
AT 1181.835815 True 6 2018-02-11 23:12:33
|
|
AU 1197.632690 True 6 2018-02-11 21:39:01
|
|
CA 1198.852539 True 6 2018-02-11 23:04:08
|
|
... ... ... ... ...
|
|
RU 1196.742310 True 6 2018-02-11 20:03:31
|
|
SE 1198.621582 True 6 2018-02-11 22:06:14
|
|
TR 855.935547 True 6 2018-02-04 01:59:23
|
|
US 1199.729004 True 6 2018-02-11 23:27:00
|
|
ZA 1196.186157 True 6 2018-02-11 23:29:45
|
|
<BLANKLINE>
|
|
[32 rows x 4 columns]
|
|
"""
|
|
return self._query_compiler.aggs_groupby(
|
|
by=self._by,
|
|
pd_aggs=["max"],
|
|
dropna=self._dropna,
|
|
numeric_only=numeric_only,
|
|
)
|
|
|
|
def nunique(self) -> "pd.DataFrame":
|
|
"""
|
|
Compute the nunique value for each group.
|
|
|
|
Parameters
|
|
----------
|
|
numeric_only: {True, False, None} Default is True
|
|
Which datatype to be returned
|
|
- True: Returns all values as float64, NaN/NaT values are removed
|
|
- None: Returns all values as the same dtype where possible, NaN/NaT are removed
|
|
- False: Returns all values as the same dtype where possible, NaN/NaT are preserved
|
|
|
|
Returns
|
|
-------
|
|
pandas.DataFrame
|
|
nunique value for each numeric column of each group
|
|
|
|
See Also
|
|
--------
|
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.nunique`
|
|
|
|
Examples
|
|
--------
|
|
>>> df = ed.DataFrame(
|
|
... "localhost", "flights",
|
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
|
... )
|
|
>>> df.groupby("DestCountry").nunique() # doctest: +NORMALIZE_WHITESPACE
|
|
AvgTicketPrice Cancelled dayOfWeek
|
|
DestCountry
|
|
AE 46 2 7
|
|
AR 305 2 7
|
|
AT 377 2 7
|
|
AU 416 2 7
|
|
CA 944 2 7
|
|
... ... ... ...
|
|
RU 739 2 7
|
|
SE 255 2 7
|
|
TR 10 2 5
|
|
US 1987 2 7
|
|
ZA 283 2 7
|
|
<BLANKLINE>
|
|
[32 rows x 3 columns]
|
|
"""
|
|
return self._query_compiler.aggs_groupby(
|
|
by=self._by,
|
|
pd_aggs=["nunique"],
|
|
dropna=self._dropna,
|
|
numeric_only=False,
|
|
)
|
|
|
|
def quantile(
|
|
self, q: Union[int, float, List[int], List[float]] = 0.5
|
|
) -> "pd.DataFrame":
|
|
"""
|
|
Used to groupby and calculate quantile for a given DataFrame.
|
|
|
|
Parameters
|
|
----------
|
|
q:
|
|
float or array like, default 0.5
|
|
Value between 0 <= q <= 1, the quantile(s) to compute.
|
|
|
|
Returns
|
|
-------
|
|
pandas.DataFrame
|
|
quantile value for each grouped column
|
|
|
|
See Also
|
|
--------
|
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.quantile`
|
|
|
|
Examples
|
|
--------
|
|
>>> ed_df = ed.DataFrame('localhost', 'flights')
|
|
>>> ed_flights = ed_df.filter(["AvgTicketPrice", "FlightDelayMin", "dayOfWeek", "timestamp"])
|
|
>>> ed_flights.groupby(["dayOfWeek", "Cancelled"]).quantile() # doctest: +SKIP
|
|
AvgTicketPrice FlightDelayMin
|
|
dayOfWeek Cancelled
|
|
0 False 572.290384 0.0
|
|
True 578.140564 0.0
|
|
1 False 567.980560 0.0
|
|
True 582.618713 0.0
|
|
2 False 590.170986 0.0
|
|
True 579.811890 0.0
|
|
3 False 574.131340 0.0
|
|
True 572.852264 0.0
|
|
4 False 591.533699 0.0
|
|
True 582.877014 0.0
|
|
5 False 791.622625 0.0
|
|
True 793.362946 0.0
|
|
6 False 817.378523 0.0
|
|
True 766.855530 0.0
|
|
|
|
>>> ed_flights.groupby(["dayOfWeek", "Cancelled"]).quantile(q=[.2, .5]) # doctest: +SKIP
|
|
AvgTicketPrice FlightDelayMin
|
|
dayOfWeek Cancelled
|
|
0 False 0.2 319.925979 0.0
|
|
0.5 572.290384 0.0
|
|
True 0.2 325.704562 0.0
|
|
0.5 578.140564 0.0
|
|
1 False 0.2 327.311007 0.0
|
|
0.5 567.980560 0.0
|
|
True 0.2 336.839572 0.0
|
|
0.5 582.618713 0.0
|
|
2 False 0.2 332.323011 0.0
|
|
0.5 590.170986 0.0
|
|
True 0.2 314.472537 0.0
|
|
0.5 579.811890 0.0
|
|
3 False 0.2 327.652659 0.0
|
|
0.5 574.131340 0.0
|
|
True 0.2 298.483032 0.0
|
|
0.5 572.852264 0.0
|
|
4 False 0.2 314.290205 0.0
|
|
0.5 591.533699 0.0
|
|
True 0.2 325.024850 0.0
|
|
0.5 582.877014 0.0
|
|
5 False 0.2 567.362137 0.0
|
|
0.5 791.622625 0.0
|
|
True 0.2 568.323944 0.0
|
|
0.5 793.362946 0.0
|
|
6 False 0.2 568.489746 0.0
|
|
0.5 817.378523 0.0
|
|
True 0.2 523.890680 0.0
|
|
0.5 766.855530 0.0
|
|
|
|
"""
|
|
return self._query_compiler.aggs_groupby(
|
|
by=self._by, pd_aggs=["quantile"], quantiles=q, numeric_only=True
|
|
)
|
|
|
|
def aggregate(
|
|
self, func: Union[str, List[str]], numeric_only: Optional[bool] = False
|
|
) -> "pd.DataFrame":
|
|
"""
|
|
Used to groupby and aggregate
|
|
|
|
Parameters
|
|
----------
|
|
func:
|
|
Functions to use for aggregating the data.
|
|
|
|
Accepted combinations are:
|
|
- function
|
|
- list of functions
|
|
|
|
numeric_only: {True, False, None} Default is None
|
|
Which datatype to be returned
|
|
- True: returns all values with float64, NaN/NaT are ignored.
|
|
- False: returns all values with float64.
|
|
- None: returns all values with default datatype.
|
|
|
|
Returns
|
|
-------
|
|
pandas.DataFrame
|
|
aggregation value for each numeric column of each group
|
|
|
|
See Also
|
|
--------
|
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.aggregate`
|
|
|
|
Examples
|
|
--------
|
|
>>> df = ed.DataFrame(
|
|
... "localhost", "flights",
|
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
|
... )
|
|
>>> df.groupby("DestCountry").aggregate(["min", "max"]) # doctest: +NORMALIZE_WHITESPACE
|
|
AvgTicketPrice ... dayOfWeek
|
|
min max ... min max
|
|
DestCountry ...
|
|
AE 110.799911 1126.148682 ... 0 6
|
|
AR 125.589394 1199.642822 ... 0 6
|
|
AT 100.020531 1181.835815 ... 0 6
|
|
AU 102.294312 1197.632690 ... 0 6
|
|
CA 100.557251 1198.852539 ... 0 6
|
|
... ... ... ... ... ..
|
|
RU 101.004005 1196.742310 ... 0 6
|
|
SE 102.877190 1198.621582 ... 0 6
|
|
TR 142.876465 855.935547 ... 0 6
|
|
US 100.145966 1199.729004 ... 0 6
|
|
ZA 102.002663 1196.186157 ... 0 6
|
|
<BLANKLINE>
|
|
[32 rows x 6 columns]
|
|
"""
|
|
# Controls whether a MultiIndex is used for the
|
|
# columns of the result DataFrame.
|
|
is_dataframe_agg = True
|
|
if isinstance(func, str):
|
|
func = [func]
|
|
is_dataframe_agg = False
|
|
|
|
return self._query_compiler.aggs_groupby(
|
|
by=self._by,
|
|
pd_aggs=func,
|
|
dropna=self._dropna,
|
|
numeric_only=numeric_only,
|
|
is_dataframe_agg=is_dataframe_agg,
|
|
)
|
|
|
|
agg = aggregate
|
|
|
|
def count(self) -> "pd.DataFrame":
|
|
"""
|
|
Compute the count value for each group.
|
|
|
|
Returns
|
|
-------
|
|
pandas.DataFrame
|
|
nunique value for each numeric column of each group
|
|
|
|
See Also
|
|
--------
|
|
:pandas_api_docs:`pandas.core.groupby.GroupBy.count`
|
|
|
|
Examples
|
|
--------
|
|
>>> df = ed.DataFrame(
|
|
... "localhost", "flights",
|
|
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "DestCountry"]
|
|
... )
|
|
>>> df.groupby("DestCountry").count() # doctest: +NORMALIZE_WHITESPACE
|
|
AvgTicketPrice Cancelled dayOfWeek
|
|
DestCountry
|
|
AE 46 46 46
|
|
AR 305 305 305
|
|
AT 377 377 377
|
|
AU 416 416 416
|
|
CA 944 944 944
|
|
... ... ... ...
|
|
RU 739 739 739
|
|
SE 255 255 255
|
|
TR 10 10 10
|
|
US 1987 1987 1987
|
|
ZA 283 283 283
|
|
<BLANKLINE>
|
|
[32 rows x 3 columns]
|
|
"""
|
|
return self._query_compiler.aggs_groupby(
|
|
by=self._by,
|
|
pd_aggs=["count"],
|
|
dropna=self._dropna,
|
|
numeric_only=False,
|
|
is_dataframe_agg=False,
|
|
)
|
|
|
|
def mode(self) -> None:
|
|
raise NotImplementedError("Currently mode is not supported for groupby")
|