diff --git a/eland/ndframe.py b/eland/ndframe.py index 8a50b63..320ffa4 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -628,8 +628,8 @@ class NDFrame(ABC): Examples -------- - >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin']) - >>> df.describe() # ignoring percentiles as they don't generate consistent results + >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin']) # ignoring percentiles + >>> df.describe() # doctest: +SKIP AvgTicketPrice FlightDelayMin count 13059.000000 13059.000000 mean 628.253689 47.335171 diff --git a/eland/operations.py b/eland/operations.py index da4b5c2..3a71e63 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -1081,52 +1081,26 @@ class Operations: f"Can not count field matches if size is set {size}" ) - numeric_source_fields = query_compiler._mappings.numeric_source_fields() - - # for each field we compute: - # count, mean, std, min, 25%, 50%, 75%, max - body = Query(query_params.query) - - for field in numeric_source_fields: - body.metric_aggs("extended_stats_" + field, "extended_stats", field) - body.metric_aggs("percentiles_" + field, "percentiles", field) - - response = query_compiler._client.search( - index=query_compiler._index_pattern, size=0, body=body.to_search_body() + df1 = self.aggs( + query_compiler=query_compiler, + pd_aggs=["count", "mean", "std", "min", "max"], + numeric_only=True, + ) + df2 = self.quantile( + query_compiler=query_compiler, + pd_aggs=["quantile"], + quantiles=[0.25, 0.5, 0.75], + is_dataframe=True, + numeric_only=True, ) - results = {} + # Convert [.25,.5,.75] to ["25%", "50%", "75%"] + df2 = df2.set_index([["25%", "50%", "75%"]]) - for field in numeric_source_fields: - values = list() - values.append(response["aggregations"]["extended_stats_" + field]["count"]) - values.append(response["aggregations"]["extended_stats_" + field]["avg"]) - values.append( - response["aggregations"]["extended_stats_" + field]["std_deviation"] - ) - values.append(response["aggregations"]["extended_stats_" + field]["min"]) - values.append( - response["aggregations"]["percentiles_" + field]["values"]["25.0"] - ) - values.append( - response["aggregations"]["percentiles_" + field]["values"]["50.0"] - ) - values.append( - response["aggregations"]["percentiles_" + field]["values"]["75.0"] - ) - values.append(response["aggregations"]["extended_stats_" + field]["max"]) - - # if not None - if values.count(None) < len(values): - results[field] = values - - df = pd.DataFrame( - data=results, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + return pd.concat([df1, df2]).reindex( + ["count", "mean", "std", "min", "25%", "50%", "75%", "max"] ) - return df - def to_pandas(self, query_compiler, show_progress=False): class PandasDataFrameCollector: def __init__(self, show_progress): diff --git a/eland/series.py b/eland/series.py index 43125d5..7abfd72 100644 --- a/eland/series.py +++ b/eland/series.py @@ -1269,7 +1269,7 @@ class Series(NDFrame): 3 2 4 2 Name: total_quantity, dtype: int64 - >>> np.int(2) ** df.total_quantity + >>> np.int_(2) ** df.total_quantity 0 4.0 1 4.0 2 4.0 @@ -1627,8 +1627,8 @@ class Series(NDFrame): Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') - >>> df.AvgTicketPrice.describe() # ignoring percentiles as they don't generate consistent results + >>> df = ed.DataFrame('localhost', 'flights') # ignoring percentiles as they don't generate consistent results + >>> df.AvgTicketPrice.describe() # doctest: +SKIP count 13059.000000 mean 628.253689 std 266.386661 diff --git a/tests/dataframe/test_describe_pytest.py b/tests/dataframe/test_describe_pytest.py index b7d655c..a0f585b 100644 --- a/tests/dataframe/test_describe_pytest.py +++ b/tests/dataframe/test_describe_pytest.py @@ -28,7 +28,10 @@ class TestDataFrameDescribe(TestData): ed_flights = self.ed_flights() pd_describe = pd_flights.describe() - ed_describe = ed_flights.describe() + # We remove bool columns to match pandas output + ed_describe = ed_flights.describe().drop( + ["Cancelled", "FlightDelay"], axis="columns" + ) assert_frame_equal( pd_describe.drop(["25%", "50%", "75%"], axis="index"),