Add NDFrame.median() aggregation

This commit is contained in:
Daniel Mesejo-León 2020-04-13 15:48:39 +02:00 committed by GitHub
parent 7a1c636e56
commit e8f307d2e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 104 additions and 7 deletions

View File

@ -252,11 +252,95 @@ class NDFrame(ABC):
return self._query_compiler.min(numeric_only=numeric_only)
def var(self, numeric_only=True):
    """
    Return variance for each numeric column

    The computation is delegated to this object's query compiler.

    Parameters
    ----------
    numeric_only: bool, default True
        Forwarded unchanged to the query compiler.

    Returns
    -------
    pandas.Series
        The value of the variance for each numeric column

    See Also
    --------
    :pandas_api_docs:`pandas.DataFrame.var`

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'flights')
    >>> df.var()  # doctest: +SKIP
    AvgTicketPrice        7.096185e+04
    Cancelled             1.119831e-01
    DistanceKilometers    2.096049e+07
    DistanceMiles         8.092892e+06
    FlightDelay           1.880825e-01
    FlightDelayMin        9.359209e+03
    FlightTimeHour        3.112545e+01
    FlightTimeMin         1.120516e+05
    dayOfWeek             3.761135e+00
    dtype: float64
    """
    compiler = self._query_compiler
    return compiler.var(numeric_only=numeric_only)
def std(self, numeric_only=True):
    """
    Return standard deviation for each numeric column

    The computation is delegated to this object's query compiler.

    Parameters
    ----------
    numeric_only: bool, default True
        Forwarded unchanged to the query compiler.

    Returns
    -------
    pandas.Series
        The value of the standard deviation for each numeric column

    See Also
    --------
    :pandas_api_docs:`pandas.DataFrame.std`

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'flights')
    >>> df.std()  # doctest: +SKIP
    AvgTicketPrice         266.386661
    Cancelled                0.334639
    DistanceKilometers    4578.263193
    DistanceMiles         2844.800855
    FlightDelay              0.433685
    FlightDelayMin          96.743006
    FlightTimeHour           5.579019
    FlightTimeMin          334.741135
    dayOfWeek                1.939365
    dtype: float64
    """
    compiler = self._query_compiler
    return compiler.std(numeric_only=numeric_only)
def median(self, numeric_only=True):
    """
    Return the median value for each numeric column

    The computation is delegated to this object's query compiler.

    Parameters
    ----------
    numeric_only: bool, default True
        Forwarded unchanged to the query compiler.

    Returns
    -------
    pandas.Series
        median value for each numeric column

    See Also
    --------
    :pandas_api_docs:`pandas.DataFrame.median`

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'flights')
    >>> df.median()  # doctest: +SKIP
    AvgTicketPrice         640.387285
    Cancelled                0.000000
    DistanceKilometers    7612.072403
    DistanceMiles         4729.922470
    FlightDelay              0.000000
    FlightDelayMin           0.000000
    FlightTimeHour           8.383113
    FlightTimeMin          503.148975
    dayOfWeek                3.000000
    dtype: float64
    """
    compiler = self._query_compiler
    return compiler.median(numeric_only=numeric_only)
def max(self, numeric_only=True):
"""
Return the maximum value for each numeric column

View File

@ -135,6 +135,11 @@ class Operations:
numeric_only=numeric_only,
)
def median(self, query_compiler, numeric_only=True):
    """
    Compute the median through the shared metric-aggregation helper.

    Parameters
    ----------
    query_compiler:
        Passed through unchanged to ``_metric_aggs``.
    numeric_only: bool, default True
        Passed through unchanged to ``_metric_aggs``.

    Returns
    -------
    Whatever ``_metric_aggs`` returns for this request.
    """
    # The median is requested as the "50.0" entry of a "percentiles"
    # aggregation; the tuple form carries both the aggregation name and
    # the key to read back from its result.
    median_agg = ("percentiles", "50.0")
    return self._metric_aggs(query_compiler, median_agg, numeric_only=numeric_only)
def sum(self, query_compiler, numeric_only=True):
    """
    Compute the sum through the shared metric-aggregation helper.

    Parameters
    ----------
    query_compiler:
        Passed through unchanged to ``_metric_aggs``.
    numeric_only: bool, default True
        Passed through unchanged to ``_metric_aggs``.

    Returns
    -------
    Whatever ``_metric_aggs`` returns for the ``"sum"`` aggregation.
    """
    aggregation = "sum"
    return self._metric_aggs(query_compiler, aggregation, numeric_only=numeric_only)
@ -275,9 +280,14 @@ class Operations:
)
else:
if isinstance(func, tuple):
results[field] = response["aggregations"][
func[0] + "_" + field
][func[1]]
if func[0] == "percentiles":
results[field] = response["aggregations"][
"percentiles_" + field
]["values"]["50.0"]
else:
results[field] = response["aggregations"][
func[0] + "_" + field
][func[1]]
else:
results[field] = response["aggregations"][field]["value"]

View File

@ -469,6 +469,9 @@ class QueryCompiler:
def std(self, numeric_only=None):
    """
    Delegate the standard-deviation computation to ``self._operations``.

    This compiler passes itself as the first argument and forwards
    ``numeric_only`` unchanged as a keyword argument.
    """
    operations = self._operations
    return operations.std(self, numeric_only=numeric_only)
def median(self, numeric_only=None):
    """
    Delegate the median computation to ``self._operations``.

    This compiler passes itself as the first argument and forwards
    ``numeric_only`` unchanged as a keyword argument.
    """
    operations = self._operations
    return operations.median(self, numeric_only=numeric_only)
def sum(self, numeric_only=None):
    """
    Delegate the sum computation to ``self._operations``.

    This compiler passes itself as the first argument and forwards
    ``numeric_only`` unchanged as a keyword argument.
    """
    operations = self._operations
    return operations.sum(self, numeric_only=numeric_only)

View File

@ -12,16 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# File called _pytest for PyCharm compatability
# File called _pytest for PyCharm compatibility
from pandas.util.testing import assert_series_equal, assert_almost_equal
from pandas.testing import assert_series_equal
from eland.tests.common import TestData
class TestDataFrameMetrics(TestData):
funcs = ["max", "min", "mean", "sum"]
extended_funcs = ["var", "std"]
extended_funcs = ["var", "std", "median"]
def test_flights_metrics(self):
pd_flights = self.pd_flights()
@ -41,7 +41,7 @@ class TestDataFrameMetrics(TestData):
pd_metric = getattr(pd_flights, func)(numeric_only=True)
ed_metric = getattr(ed_flights, func)(numeric_only=True)
assert_almost_equal(pd_metric, ed_metric, check_less_precise=True)
assert_series_equal(pd_metric, ed_metric, check_less_precise=True)
def test_ecommerce_selected_non_numeric_source_fields(self):
# None of these are numeric