From 2854eecbf43fb07f82582ff7f1e2769dff39a8dc Mon Sep 17 00:00:00 2001 From: Stephen Dodson Date: Tue, 19 Nov 2019 12:01:53 +0000 Subject: [PATCH] https://github.com/elastic/eland/issues/50 fix. Root cause was incorrect filtering of numeric values by columns. Added more tests + fixed a doctest. --- eland/mappings.py | 37 ++++----- eland/ndframe.py | 14 ++-- eland/operations.py | 2 + eland/tests/dataframe/test_metrics_pytest.py | 33 ++++++++ .../test_numeric_source_fields_pytest.py | 77 +++++++++++++++++++ 5 files changed, 135 insertions(+), 28 deletions(-) create mode 100644 eland/tests/mappings/test_numeric_source_fields_pytest.py diff --git a/eland/mappings.py b/eland/mappings.py index 5a62bfb..0ddea31 100644 --- a/eland/mappings.py +++ b/eland/mappings.py @@ -453,28 +453,23 @@ class Mappings: numeric_source_fields: list of str List of source fields where pd_dtype == (int64 or float64 or bool) """ - if columns is not None: - if include_bool == True: - return self._mappings_capabilities[(self._mappings_capabilities._source == True) & - ((self._mappings_capabilities.pd_dtype == 'int64') | - (self._mappings_capabilities.pd_dtype == 'float64') | - (self._mappings_capabilities.pd_dtype == 'bool'))].reindex( - columns).index.tolist() - else: - return self._mappings_capabilities[(self._mappings_capabilities._source == True) & - ((self._mappings_capabilities.pd_dtype == 'int64') | - (self._mappings_capabilities.pd_dtype == 'float64'))].reindex( - columns).index.tolist() + if include_bool == True: + df = self._mappings_capabilities[(self._mappings_capabilities._source == True) & + ((self._mappings_capabilities.pd_dtype == 'int64') | + (self._mappings_capabilities.pd_dtype == 'float64') | + (self._mappings_capabilities.pd_dtype == 'bool'))] else: - if include_bool == True: - return self._mappings_capabilities[(self._mappings_capabilities._source == True) & - ((self._mappings_capabilities.pd_dtype == 'int64') | - (self._mappings_capabilities.pd_dtype == 'float64') | - (self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist() - else: - return self._mappings_capabilities[(self._mappings_capabilities._source == True) & - ((self._mappings_capabilities.pd_dtype == 'int64') | - (self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist() + df = self._mappings_capabilities[(self._mappings_capabilities._source == True) & + ((self._mappings_capabilities.pd_dtype == 'int64') | + (self._mappings_capabilities.pd_dtype == 'float64'))] + # if columns exists, filter index with columns + if columns is not None: + # reindex adds NA for non-existing columns (non-numeric), so drop these after reindex + df = df.reindex(columns) + df.dropna(inplace=True) + + # return as list + return df.index.to_list() def source_fields(self): """ diff --git a/eland/ndframe.py b/eland/ndframe.py index a196af8..31a2c40 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -488,16 +488,16 @@ class NDFrame: Examples -------- - >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelay']) + >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin']) >>> df.describe() # ignoring percentiles as they don't generate consistent results - AvgTicketPrice FlightDelay - count 13059.000000 13059.000000 - mean 628.253689 0.251168 - std 266.386661 0.433685 - min 100.020531 0.000000 + AvgTicketPrice FlightDelayMin + count 13059.000000 13059.000000 + mean 628.253689 47.335171 + std 266.386661 96.743006 + min 100.020531 0.000000 ... ... ... - max 1199.729004 1.000000 + max 1199.729004 360.000000 """ return self._query_compiler.describe() diff --git a/eland/operations.py b/eland/operations.py index aa1aea5..0fd3617 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -151,6 +151,8 @@ class Operations: for field in numeric_source_fields: body.metric_aggs(field, func, field) + #print(body.to_search_body(), columns, numeric_source_fields) + response = query_compiler._client.search( index=query_compiler._index_pattern, size=0, diff --git a/eland/tests/dataframe/test_metrics_pytest.py b/eland/tests/dataframe/test_metrics_pytest.py index 997a323..c43edea 100644 --- a/eland/tests/dataframe/test_metrics_pytest.py +++ b/eland/tests/dataframe/test_metrics_pytest.py @@ -4,6 +4,8 @@ from pandas.util.testing import assert_series_equal from eland.tests.common import TestData +import eland as ed + class TestDataFrameMetrics(TestData): @@ -42,3 +44,34 @@ class TestDataFrameMetrics(TestData): ed_max = ed_flights.max(numeric_only=True) assert_series_equal(pd_max, ed_max) + + def test_ecommerce_selected_non_numeric_source_fields_max(self): + # None of these are numeric + columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user'] + + pd_ecommerce = self.pd_ecommerce()[columns] + ed_ecommerce = self.ed_ecommerce()[columns] + + assert_series_equal(pd_ecommerce.max(numeric_only=True), ed_ecommerce.max(numeric_only=True)) + + def test_ecommerce_selected_mixed_numeric_source_fields_max(self): + # Some of these are numeric + columns = ['category', 'currency', 'taxless_total_price', 'customer_birth_date', + 'total_quantity', 'customer_first_name', 'user'] + + pd_ecommerce = self.pd_ecommerce()[columns] + ed_ecommerce = self.ed_ecommerce()[columns] + + assert_series_equal(pd_ecommerce.max(numeric_only=True), ed_ecommerce.max(numeric_only=True), + check_less_precise=True) + + + def test_ecommerce_selected_all_numeric_source_fields_max(self): + # All of these are numeric + columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price'] + + pd_ecommerce = self.pd_ecommerce()[columns] + ed_ecommerce = self.ed_ecommerce()[columns] + + assert_series_equal(pd_ecommerce.max(numeric_only=True), ed_ecommerce.max(numeric_only=True), + check_less_precise=True) diff --git a/eland/tests/mappings/test_numeric_source_fields_pytest.py b/eland/tests/mappings/test_numeric_source_fields_pytest.py new file mode 100644 index 0000000..9611a1f --- /dev/null +++ b/eland/tests/mappings/test_numeric_source_fields_pytest.py @@ -0,0 +1,77 @@ +# File called _pytest for PyCharm compatability + +import numpy as np + +from pandas.util.testing import assert_series_equal + +from eland.tests.common import TestData + + +class TestMappingsNumericSourceFields(TestData): + + def test_flights_numeric_source_fields(self): + ed_flights = self.ed_flights() + pd_flights = self.pd_flights() + + ed_numeric = ed_flights._query_compiler._mappings.numeric_source_fields(columns=None, include_bool=False) + pd_numeric = pd_flights.select_dtypes(include=np.number) + + assert pd_numeric.columns.to_list() == ed_numeric + + def test_ecommerce_selected_non_numeric_source_fields(self): + columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user'] + """ + Note: non of there are numeric + category object + currency object + customer_birth_date datetime64[ns] + customer_first_name object + user object + """ + + ed_ecommerce = self.ed_ecommerce()[columns] + pd_ecommerce = self.pd_ecommerce()[columns] + + ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(columns=columns, include_bool=False) + pd_numeric = pd_ecommerce.select_dtypes(include=np.number) + + assert pd_numeric.columns.to_list() == ed_numeric + + def test_ecommerce_selected_mixed_numeric_source_fields(self): + columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'total_quantity', 'user'] + + """ + Note: one is numeric + category object + currency object + customer_birth_date datetime64[ns] + customer_first_name object + total_quantity int64 + user object + """ + + ed_ecommerce = self.ed_ecommerce()[columns] + pd_ecommerce = self.pd_ecommerce()[columns] + + ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(columns=columns, include_bool=False) + pd_numeric = pd_ecommerce.select_dtypes(include=np.number) + + assert pd_numeric.columns.to_list() == ed_numeric + + def test_ecommerce_selected_all_numeric_source_fields(self): + columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price'] + + """ + Note: all are numeric + total_quantity int64 + taxful_total_price float64 + taxless_total_price float64 + """ + + ed_ecommerce = self.ed_ecommerce()[columns] + pd_ecommerce = self.pd_ecommerce()[columns] + + ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(columns=columns, include_bool=False) + pd_numeric = pd_ecommerce.select_dtypes(include=np.number) + + assert pd_numeric.columns.to_list() == ed_numeric