mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Fix Series.describe(), median agg dtype
This commit is contained in:
parent
f5b37e643c
commit
5bf205a1e0
@ -714,7 +714,7 @@ script instead of being modified manually.
|
|||||||
+---------------------------------------+------------+
|
+---------------------------------------+------------+
|
||||||
| ``ed.Series.dropna()`` | No |
|
| ``ed.Series.dropna()`` | No |
|
||||||
+---------------------------------------+------------+
|
+---------------------------------------+------------+
|
||||||
| ``ed.Series.dtype`` | No |
|
| ``ed.Series.dtype`` | **Yes** |
|
||||||
+---------------------------------------+------------+
|
+---------------------------------------+------------+
|
||||||
| ``ed.Series.dtypes`` | **Yes** |
|
| ``ed.Series.dtypes`` | **Yes** |
|
||||||
+---------------------------------------+------------+
|
+---------------------------------------+------------+
|
||||||
|
@ -301,7 +301,7 @@ class Operations:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# These aggregations maintain the column datatype
|
# These aggregations maintain the column datatype
|
||||||
elif pd_agg in ("max", "min"):
|
elif pd_agg in {"max", "min", "median"}:
|
||||||
agg_value = field.np_dtype.type(agg_value)
|
agg_value = field.np_dtype.type(agg_value)
|
||||||
|
|
||||||
values.append(agg_value)
|
values.append(agg_value)
|
||||||
|
@ -425,7 +425,7 @@ class Series(NDFrame):
|
|||||||
return self._query_compiler.to_pandas(show_progress=show_progress)[self.name]
|
return self._query_compiler.to_pandas(show_progress=show_progress)[self.name]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def _dtype(self) -> np.dtype:
|
def dtype(self) -> np.dtype:
|
||||||
# DO NOT MAKE PUBLIC (i.e. def dtype) as this breaks query eval implementation
|
# NOTE(review): this was kept private (as `_dtype`) because a public `dtype` broke the query eval implementation; it is now public - confirm query eval still works
|
||||||
return self._query_compiler.dtypes[0]
|
return self._query_compiler.dtypes[0]
|
||||||
|
|
||||||
@ -1192,7 +1192,7 @@ class Series(NDFrame):
|
|||||||
self._query_compiler.check_arithmetics(right._query_compiler)
|
self._query_compiler.check_arithmetics(right._query_compiler)
|
||||||
|
|
||||||
right_object = ArithmeticSeries(
|
right_object = ArithmeticSeries(
|
||||||
right._query_compiler, right.name, right._dtype
|
right._query_compiler, right.name, right.dtype
|
||||||
)
|
)
|
||||||
display_name = None
|
display_name = None
|
||||||
elif np.issubdtype(np.dtype(type(right)), np.number):
|
elif np.issubdtype(np.dtype(type(right)), np.number):
|
||||||
@ -1204,11 +1204,11 @@ class Series(NDFrame):
|
|||||||
else:
|
else:
|
||||||
raise TypeError(
|
raise TypeError(
|
||||||
f"unsupported operation type(s) [{method_name!r}] "
|
f"unsupported operation type(s) [{method_name!r}] "
|
||||||
f"for operands ['{type(self)}' with dtype '{self._dtype}', "
|
f"for operands ['{type(self)}' with dtype '{self.dtype}', "
|
||||||
f"'{type(right).__name__}']"
|
f"'{type(right).__name__}']"
|
||||||
)
|
)
|
||||||
|
|
||||||
left_object = ArithmeticSeries(self._query_compiler, self.name, self._dtype)
|
left_object = ArithmeticSeries(self._query_compiler, self.name, self.dtype)
|
||||||
left_object.arithmetic_operation(method_name, right_object)
|
left_object.arithmetic_operation(method_name, right_object)
|
||||||
|
|
||||||
series = Series(
|
series = Series(
|
||||||
@ -1430,6 +1430,41 @@ class Series(NDFrame):
|
|||||||
results = super().mad(numeric_only=numeric_only)
|
results = super().mad(numeric_only=numeric_only)
|
||||||
return results.squeeze()
|
return results.squeeze()
|
||||||
|
|
||||||
|
def describe(self) -> pd.Series:
|
||||||
|
"""
|
||||||
|
Generate descriptive statistics that summarize the central tendency, dispersion and shape of a
|
||||||
|
dataset’s distribution, excluding NaN values.
|
||||||
|
|
||||||
|
Analyzes both numeric and object series, as well as DataFrame column sets of mixed data types.
|
||||||
|
The output will vary depending on what is provided. Refer to the notes below for more detail.
|
||||||
|
|
||||||
|
TODO - add additional arguments (currently only numeric values are supported)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pandas.Series:
|
||||||
|
Summary information
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
:pandas_api_docs:`pandas.Series.describe`
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame('localhost', 'flights')
|
||||||
|
>>> df.AvgTicketPrice.describe() # ignoring percentiles as they don't generate consistent results
|
||||||
|
count 13059.000000
|
||||||
|
mean 628.253689
|
||||||
|
std 266.386661
|
||||||
|
min 100.020531
|
||||||
|
...
|
||||||
|
...
|
||||||
|
...
|
||||||
|
max 1199.729004
|
||||||
|
Name: AvgTicketPrice, dtype: float64
|
||||||
|
"""
|
||||||
|
return super().describe().squeeze()
|
||||||
|
|
||||||
# def values TODO - not implemented as causes current implementation of query to fail
|
# def values TODO - not implemented as causes current implementation of query to fail
|
||||||
|
|
||||||
def to_numpy(self):
|
def to_numpy(self):
|
||||||
|
@ -228,3 +228,25 @@ class TestDataFrameMetrics(TestData):
|
|||||||
<= median
|
<= median
|
||||||
<= pd.to_datetime("2018-01-01 12:00:00.000")
|
<= pd.to_datetime("2018-01-01 12:00:00.000")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_metric_agg_keep_dtypes(self):
|
||||||
|
# max, min, and median maintain their dtypes
|
||||||
|
df = self.ed_flights_small()[["AvgTicketPrice", "Cancelled", "dayOfWeek"]]
|
||||||
|
assert df.min().tolist() == [131.81910705566406, False, 0]
|
||||||
|
assert df.max().tolist() == [989.9527587890625, True, 0]
|
||||||
|
assert df.median().tolist() == [550.276123046875, False, 0]
|
||||||
|
all_agg = df.agg(["min", "max", "median"])
|
||||||
|
assert all_agg.dtypes.tolist() == [
|
||||||
|
np.dtype("float64"),
|
||||||
|
np.dtype("bool"),
|
||||||
|
np.dtype("int64"),
|
||||||
|
]
|
||||||
|
assert all_agg.to_dict() == {
|
||||||
|
"AvgTicketPrice": {
|
||||||
|
"max": 989.9527587890625,
|
||||||
|
"median": 550.276123046875,
|
||||||
|
"min": 131.81910705566406,
|
||||||
|
},
|
||||||
|
"Cancelled": {"max": True, "median": False, "min": False},
|
||||||
|
"dayOfWeek": {"max": 0, "median": 0, "min": 0},
|
||||||
|
}
|
||||||
|
40
eland/tests/series/test_describe_pytest.py
Normal file
40
eland/tests/series/test_describe_pytest.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
# Licensed to Elasticsearch B.V. under one or more contributor
|
||||||
|
# license agreements. See the NOTICE file distributed with
|
||||||
|
# this work for additional information regarding copyright
|
||||||
|
# ownership. Elasticsearch B.V. licenses this file to you under
|
||||||
|
# the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing,
|
||||||
|
# software distributed under the License is distributed on an
|
||||||
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, either express or implied. See the License for the
|
||||||
|
# specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from eland.tests.common import TestData, assert_series_equal
|
||||||
|
|
||||||
|
|
||||||
|
class TestSeriesDescribe(TestData):
|
||||||
|
def test_series_describe(self):
|
||||||
|
ed_df = self.ed_flights_small()
|
||||||
|
pd_df = self.pd_flights_small()
|
||||||
|
|
||||||
|
ed_desc = ed_df.AvgTicketPrice.describe()
|
||||||
|
pd_desc = pd_df.AvgTicketPrice.describe()
|
||||||
|
|
||||||
|
assert isinstance(ed_desc, pd.Series)
|
||||||
|
assert ed_desc.shape == pd_desc.shape
|
||||||
|
assert ed_desc.dtype == pd_desc.dtype
|
||||||
|
assert ed_desc.index.equals(pd_desc.index)
|
||||||
|
|
||||||
|
# Percentile calculations vary for Elasticsearch
|
||||||
|
assert_series_equal(
|
||||||
|
ed_desc[["count", "mean", "std", "min", "max"]],
|
||||||
|
pd_desc[["count", "mean", "std", "min", "max"]],
|
||||||
|
rtol=0.2,
|
||||||
|
)
|
Loading…
x
Reference in New Issue
Block a user