Root cause was incorrect filtering of numeric values by columns.
Added more tests + fixed a doctest.
This commit is contained in:
Stephen Dodson 2019-11-19 12:01:53 +00:00
parent 9b4fe40305
commit 2854eecbf4
5 changed files with 135 additions and 28 deletions

View File

@ -453,28 +453,23 @@ class Mappings:
numeric_source_fields: list of str numeric_source_fields: list of str
List of source fields where pd_dtype == (int64 or float64 or bool) List of source fields where pd_dtype == (int64 or float64 or bool)
""" """
if include_bool == True:
df = self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))]
else:
df = self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64'))]
# if columns exists, filter index with columns
if columns is not None: if columns is not None:
if include_bool == True: # reindex adds NA for non-existing columns (non-numeric), so drop these after reindex
return self._mappings_capabilities[(self._mappings_capabilities._source == True) & df = df.reindex(columns)
((self._mappings_capabilities.pd_dtype == 'int64') | df.dropna(inplace=True)
(self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))].reindex( # return as list
columns).index.tolist() return df.index.to_list()
else:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64'))].reindex(
columns).index.tolist()
else:
if include_bool == True:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist()
else:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
def source_fields(self): def source_fields(self):
""" """

View File

@ -488,16 +488,16 @@ class NDFrame:
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelay']) >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin'])
>>> df.describe() # ignoring percentiles as they don't generate consistent results >>> df.describe() # ignoring percentiles as they don't generate consistent results
AvgTicketPrice FlightDelay AvgTicketPrice FlightDelayMin
count 13059.000000 13059.000000 count 13059.000000 13059.000000
mean 628.253689 0.251168 mean 628.253689 47.335171
std 266.386661 0.433685 std 266.386661 96.743006
min 100.020531 0.000000 min 100.020531 0.000000
... ...
... ...
... ...
max 1199.729004 1.000000 max 1199.729004 360.000000
""" """
return self._query_compiler.describe() return self._query_compiler.describe()

View File

@ -151,6 +151,8 @@ class Operations:
for field in numeric_source_fields: for field in numeric_source_fields:
body.metric_aggs(field, func, field) body.metric_aggs(field, func, field)
#print(body.to_search_body(), columns, numeric_source_fields)
response = query_compiler._client.search( response = query_compiler._client.search(
index=query_compiler._index_pattern, index=query_compiler._index_pattern,
size=0, size=0,

View File

@ -4,6 +4,8 @@ from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData from eland.tests.common import TestData
import eland as ed
class TestDataFrameMetrics(TestData): class TestDataFrameMetrics(TestData):
@ -42,3 +44,34 @@ class TestDataFrameMetrics(TestData):
ed_max = ed_flights.max(numeric_only=True) ed_max = ed_flights.max(numeric_only=True)
assert_series_equal(pd_max, ed_max) assert_series_equal(pd_max, ed_max)
def test_ecommerce_selected_non_numeric_source_fields_max(self):
    """Compare eland and pandas ``max(numeric_only=True)`` when no selected field is numeric."""
    # Every field in this selection is non-numeric
    non_numeric = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user']

    expected = self.pd_ecommerce()[non_numeric].max(numeric_only=True)
    actual = self.ed_ecommerce()[non_numeric].max(numeric_only=True)

    assert_series_equal(expected, actual)
def test_ecommerce_selected_mixed_numeric_source_fields_max(self):
    """Compare eland and pandas ``max(numeric_only=True)`` on a mixed numeric/non-numeric selection."""
    # Only a subset of the selected fields is numeric
    mixed = ['category', 'currency', 'taxless_total_price', 'customer_birth_date',
             'total_quantity', 'customer_first_name', 'user']

    expected = self.pd_ecommerce()[mixed].max(numeric_only=True)
    actual = self.ed_ecommerce()[mixed].max(numeric_only=True)

    # Loose comparison: the two sides are not required to match to full float precision
    assert_series_equal(expected, actual, check_less_precise=True)
def test_ecommerce_selected_all_numeric_source_fields_max(self):
    """Compare eland and pandas ``max(numeric_only=True)`` when every selected field is numeric."""
    # Every field in this selection is numeric
    all_numeric = ['total_quantity', 'taxful_total_price', 'taxless_total_price']

    expected = self.pd_ecommerce()[all_numeric].max(numeric_only=True)
    actual = self.ed_ecommerce()[all_numeric].max(numeric_only=True)

    # Loose comparison: the two sides are not required to match to full float precision
    assert_series_equal(expected, actual, check_less_precise=True)

View File

@ -0,0 +1,77 @@
# File called _pytest for PyCharm compatibility
import numpy as np
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
class TestMappingsNumericSourceFields(TestData):
    """Check that ``numeric_source_fields`` agrees with pandas' numeric column selection.

    Each test builds the same selection on the eland and pandas side and
    compares the list of numeric field names (bools excluded).
    """

    @staticmethod
    def _assert_numeric_source_fields_match(ed_df, pd_df, columns):
        """Assert eland reports the same numeric fields as pandas.

        Parameters
        ----------
        ed_df:
            eland frame whose mappings are queried.
        pd_df:
            pandas frame holding the same selection of columns.
        columns: list of str or None
            Column filter forwarded to ``numeric_source_fields``;
            ``None`` means no filtering by column.
        """
        mappings = ed_df._query_compiler._mappings
        ed_numeric = mappings.numeric_source_fields(columns=columns, include_bool=False)
        pd_numeric = pd_df.select_dtypes(include=np.number).columns.to_list()

        assert pd_numeric == ed_numeric

    def test_flights_numeric_source_fields(self):
        """All flights fields, no column filter."""
        self._assert_numeric_source_fields_match(self.ed_flights(), self.pd_flights(), None)

    def test_ecommerce_selected_non_numeric_source_fields(self):
        """Selection in which no field is numeric."""
        # Note: none of these are numeric
        #   category               object
        #   currency               object
        #   customer_birth_date    datetime64[ns]
        #   customer_first_name    object
        #   user                   object
        columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user']

        self._assert_numeric_source_fields_match(self.ed_ecommerce()[columns],
                                                 self.pd_ecommerce()[columns],
                                                 columns)

    def test_ecommerce_selected_mixed_numeric_source_fields(self):
        """Selection in which exactly one field is numeric."""
        # Note: one is numeric
        #   category               object
        #   currency               object
        #   customer_birth_date    datetime64[ns]
        #   customer_first_name    object
        #   total_quantity         int64
        #   user                   object
        columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'total_quantity', 'user']

        self._assert_numeric_source_fields_match(self.ed_ecommerce()[columns],
                                                 self.pd_ecommerce()[columns],
                                                 columns)

    def test_ecommerce_selected_all_numeric_source_fields(self):
        """Selection in which every field is numeric."""
        # Note: all are numeric
        #   total_quantity         int64
        #   taxful_total_price     float64
        #   taxless_total_price    float64
        columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price']

        self._assert_numeric_source_fields_match(self.ed_ecommerce()[columns],
                                                 self.pd_ecommerce()[columns],
                                                 columns)