Fix Series.describe(), median agg dtype

This commit is contained in:
Seth Michael Larson 2020-08-17 09:28:30 -05:00 committed by GitHub
parent f5b37e643c
commit 5bf205a1e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 103 additions and 6 deletions

View File

@ -714,7 +714,7 @@ script instead of being modified manually.
+---------------------------------------+------------+
| ``ed.Series.dropna()`` | No |
+---------------------------------------+------------+
| ``ed.Series.dtype`` | No |
| ``ed.Series.dtype`` | **Yes** |
+---------------------------------------+------------+
| ``ed.Series.dtypes`` | **Yes** |
+---------------------------------------+------------+

View File

@ -301,7 +301,7 @@ class Operations:
)
# These aggregations maintain the column datatype
elif pd_agg in ("max", "min"):
elif pd_agg in {"max", "min", "median"}:
agg_value = field.np_dtype.type(agg_value)
values.append(agg_value)

View File

@ -425,7 +425,7 @@ class Series(NDFrame):
return self._query_compiler.to_pandas(show_progress=show_progress)[self.name]
@property
def _dtype(self) -> np.dtype:
def dtype(self) -> np.dtype:
# NOTE(review): this property used to be private (_dtype) because a public dtype broke the query eval implementation - confirm that is no longer the case
return self._query_compiler.dtypes[0]
@ -1192,7 +1192,7 @@ class Series(NDFrame):
self._query_compiler.check_arithmetics(right._query_compiler)
right_object = ArithmeticSeries(
right._query_compiler, right.name, right._dtype
right._query_compiler, right.name, right.dtype
)
display_name = None
elif np.issubdtype(np.dtype(type(right)), np.number):
@ -1204,11 +1204,11 @@ class Series(NDFrame):
else:
raise TypeError(
f"unsupported operation type(s) [{method_name!r}] "
f"for operands ['{type(self)}' with dtype '{self._dtype}', "
f"for operands ['{type(self)}' with dtype '{self.dtype}', "
f"'{type(right).__name__}']"
)
left_object = ArithmeticSeries(self._query_compiler, self.name, self._dtype)
left_object = ArithmeticSeries(self._query_compiler, self.name, self.dtype)
left_object.arithmetic_operation(method_name, right_object)
series = Series(
@ -1430,6 +1430,41 @@ class Series(NDFrame):
results = super().mad(numeric_only=numeric_only)
return results.squeeze()
def describe(self) -> pd.Series:
    """
    Summarize the central tendency, dispersion and shape of this series'
    distribution, excluding NaN values.

    The statistics are produced by the parent class' ``describe()`` and the
    result is squeezed so that a ``pandas.Series`` is returned.

    TODO - add additional arguments (currently only numeric values are supported)

    Returns
    -------
    pandas.Series:
        Summary information

    See Also
    --------
    :pandas_api_docs:`pandas.Series.describe`

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'flights')
    >>> df.AvgTicketPrice.describe()  # ignoring percentiles as they don't generate consistent results
    count    13059.000000
    mean       628.253689
    std        266.386661
    min        100.020531
    ...
    ...
    ...
    max       1199.729004
    Name: AvgTicketPrice, dtype: float64
    """
    # The parent implementation computes the statistics; squeeze() collapses
    # the single-column result down to a Series.
    summary = super().describe()
    return summary.squeeze()
# def values TODO - not implemented as it causes the current implementation of query to fail
def to_numpy(self):

View File

@ -228,3 +228,25 @@ class TestDataFrameMetrics(TestData):
<= median
<= pd.to_datetime("2018-01-01 12:00:00.000")
)
def test_metric_agg_keep_dtypes(self):
    """Check that min, max and median keep the dtype of each source column."""
    columns = ["AvgTicketPrice", "Cancelled", "dayOfWeek"]
    flights = self.ed_flights_small()[columns]

    # Each metric on its own: one float, one bool and one int column.
    min_values = flights.min().tolist()
    assert min_values == [131.81910705566406, False, 0]

    max_values = flights.max().tolist()
    assert max_values == [989.9527587890625, True, 0]

    median_values = flights.median().tolist()
    assert median_values == [550.276123046875, False, 0]

    # All three metrics requested together through agg().
    combined = flights.agg(["min", "max", "median"])

    expected_dtypes = [np.dtype(name) for name in ("float64", "bool", "int64")]
    assert combined.dtypes.tolist() == expected_dtypes

    expected_values = {
        "AvgTicketPrice": {
            "min": 131.81910705566406,
            "max": 989.9527587890625,
            "median": 550.276123046875,
        },
        "Cancelled": {"min": False, "max": True, "median": False},
        "dayOfWeek": {"min": 0, "max": 0, "median": 0},
    }
    assert combined.to_dict() == expected_values

View File

@ -0,0 +1,40 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pandas as pd
from eland.tests.common import TestData, assert_series_equal
class TestSeriesDescribe(TestData):
    """Compare ``describe()`` on an eland series with the pandas result."""

    def test_series_describe(self):
        # Statistics that are compared value-by-value; percentiles are left
        # out because their calculation varies for Elasticsearch.
        comparable_stats = ["count", "mean", "std", "min", "max"]

        ed_flights = self.ed_flights_small()
        pd_flights = self.pd_flights_small()

        ed_summary = ed_flights.AvgTicketPrice.describe()
        pd_summary = pd_flights.AvgTicketPrice.describe()

        # The overall structure must match pandas exactly.
        assert isinstance(ed_summary, pd.Series)
        assert ed_summary.shape == pd_summary.shape
        assert ed_summary.dtype == pd_summary.dtype
        assert ed_summary.index.equals(pd_summary.index)

        # The values only need to agree within a relative tolerance.
        assert_series_equal(
            ed_summary[comparable_stats],
            pd_summary[comparable_stats],
            rtol=0.2,
        )