From aa9d60e7e7c49513d19f8964db29f4161eeb4a60 Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" <33659563+V1NAY8@users.noreply.github.com> Date: Wed, 21 Apr 2021 18:54:52 +0530 Subject: [PATCH] Add sort order to groupby dropna=False (#322) * Add sort order to groupby dropna=False * Fix rebase --- eland/operations.py | 2 +- tests/dataframe/test_groupby_pytest.py | 54 ++++++++++++++++++++------ 2 files changed, 43 insertions(+), 13 deletions(-) diff --git a/eland/operations.py b/eland/operations.py index c261551..98ba583 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -763,7 +763,7 @@ class Operations: for pd_agg, val in zip(pd_aggs, value): results[f"{key}_{pd_agg}"].append(val) - agg_df = pd.DataFrame(results).set_index(by) + agg_df = pd.DataFrame(results).set_index(by).sort_index() if is_dataframe_agg: # Convert header columns to MultiIndex diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py index 1ba1b81..f3188c1 100644 --- a/tests/dataframe/test_groupby_pytest.py +++ b/tests/dataframe/test_groupby_pytest.py @@ -31,18 +31,26 @@ class TestGroupbyDataFrame(TestData): "Cancelled", "dayOfWeek", ] + ecommerce_filter_data = [ + "total_quantity", + "geoip.region_name", + "day_of_week", + "total_unique_products", + "taxful_total_price", + ] + @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize("numeric_only", [True]) - def test_groupby_aggregate(self, numeric_only): + def test_groupby_aggregate(self, numeric_only, dropna): # TODO Add tests for numeric_only=False for aggs # when we support aggregations on text fields pd_flights = self.pd_flights().filter(self.filter_data) ed_flights = self.ed_flights().filter(self.filter_data) - pd_groupby = pd_flights.groupby("Cancelled").agg( + pd_groupby = pd_flights.groupby("Cancelled", dropna=dropna).agg( self.funcs, numeric_only=numeric_only ) - ed_groupby = ed_flights.groupby("Cancelled").agg( + ed_groupby = ed_flights.groupby("Cancelled", dropna=dropna).agg( self.funcs, numeric_only=numeric_only ) @@ -60,29 +68,37 @@ class TestGroupbyDataFrame(TestData): # checking only values because dtypes are checked in aggs tests assert_frame_equal(pd_groupby, ed_groupby, check_exact=False, check_dtype=False) + @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize("pd_agg", ["max", "min", "mean", "sum", "median"]) - def test_groupby_aggs_numeric_only_true(self, pd_agg): + def test_groupby_aggs_numeric_only_true(self, pd_agg, dropna): # Pandas has numeric_only applicable for the above aggs with groupby only. pd_flights = self.pd_flights().filter(self.filter_data) ed_flights = self.ed_flights().filter(self.filter_data) - pd_groupby = getattr(pd_flights.groupby("Cancelled"), pd_agg)(numeric_only=True) - ed_groupby = getattr(ed_flights.groupby("Cancelled"), pd_agg)(numeric_only=True) + pd_groupby = getattr(pd_flights.groupby("Cancelled", dropna=dropna), pd_agg)( + numeric_only=True + ) + ed_groupby = getattr(ed_flights.groupby("Cancelled", dropna=dropna), pd_agg)( + numeric_only=True + ) # checking only values because dtypes are checked in aggs tests assert_frame_equal( pd_groupby, ed_groupby, check_exact=False, check_dtype=False, rtol=2 ) + @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize("pd_agg", ["mad", "var", "std"]) - def test_groupby_aggs_mad_var_std(self, pd_agg): + def test_groupby_aggs_mad_var_std(self, pd_agg, dropna): # For these aggs pandas doesn't support numeric_only pd_flights = self.pd_flights().filter(self.filter_data) ed_flights = self.ed_flights().filter(self.filter_data) - pd_groupby = getattr(pd_flights.groupby("Cancelled"), pd_agg)() - ed_groupby = getattr(ed_flights.groupby("Cancelled"), pd_agg)(numeric_only=True) + pd_groupby = getattr(pd_flights.groupby("Cancelled", dropna=dropna), pd_agg)() + ed_groupby = getattr(ed_flights.groupby("Cancelled", dropna=dropna), pd_agg)( + numeric_only=True + ) # checking only values because dtypes are checked in aggs tests assert_frame_equal( @@ -151,9 +167,23 @@ class TestGroupbyDataFrame(TestData): assert pd_groupby.index.dtype == ed_groupby.index.dtype assert list(pd_groupby.columns) == list(ed_groupby.columns) - def test_groupby_dropna(self): - # TODO Add tests once dropna is implemeted - pass + @pytest.mark.parametrize("dropna", [True, False]) + @pytest.mark.parametrize("groupby", ["geoip.region_name", "day_of_week"]) + @pytest.mark.parametrize("func", ["min", "max", "mean"]) + def test_groupby_dropna(self, dropna, func, groupby): + pd_ecommerce = self.pd_ecommerce().filter(self.ecommerce_filter_data) + ed_ecommerce = self.ed_ecommerce().filter(self.ecommerce_filter_data) + + pd_groupby = getattr(pd_ecommerce.groupby(groupby, dropna=dropna), func)( + numeric_only=True + ) + ed_groupby = getattr(ed_ecommerce.groupby(groupby, dropna=dropna), func)( + numeric_only=True + ) + + assert_index_equal(pd_groupby.columns, ed_groupby.columns) + assert_index_equal(pd_groupby.index, ed_groupby.index) + assert_frame_equal(pd_groupby, ed_groupby, check_dtype=False) @pytest.mark.parametrize("groupby", ["dayOfWeek", ["dayOfWeek", "Cancelled"]]) @pytest.mark.parametrize(