mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Root cause was incorrect filtering of numeric values by columns. Added more tests + fixed a doctest.
This commit is contained in:
parent
9b4fe40305
commit
2854eecbf4
@ -453,28 +453,23 @@ class Mappings:
|
||||
numeric_source_fields: list of str
|
||||
List of source fields where pd_dtype == (int64 or float64 or bool)
|
||||
"""
|
||||
if include_bool == True:
|
||||
df = self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'bool'))]
|
||||
else:
|
||||
df = self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64'))]
|
||||
# if columns exists, filter index with columns
|
||||
if columns is not None:
|
||||
if include_bool == True:
|
||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'bool'))].reindex(
|
||||
columns).index.tolist()
|
||||
else:
|
||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64'))].reindex(
|
||||
columns).index.tolist()
|
||||
else:
|
||||
if include_bool == True:
|
||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist()
|
||||
else:
|
||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
|
||||
# reindex adds NA for non-existing columns (non-numeric), so drop these after reindex
|
||||
df = df.reindex(columns)
|
||||
df.dropna(inplace=True)
|
||||
|
||||
# return as list
|
||||
return df.index.to_list()
|
||||
|
||||
def source_fields(self):
|
||||
"""
|
||||
|
@ -488,16 +488,16 @@ class NDFrame:
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelay'])
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin'])
|
||||
>>> df.describe() # ignoring percentiles as they don't generate consistent results
|
||||
AvgTicketPrice FlightDelay
|
||||
AvgTicketPrice FlightDelayMin
|
||||
count 13059.000000 13059.000000
|
||||
mean 628.253689 0.251168
|
||||
std 266.386661 0.433685
|
||||
mean 628.253689 47.335171
|
||||
std 266.386661 96.743006
|
||||
min 100.020531 0.000000
|
||||
...
|
||||
...
|
||||
...
|
||||
max 1199.729004 1.000000
|
||||
max 1199.729004 360.000000
|
||||
"""
|
||||
return self._query_compiler.describe()
|
||||
|
@ -151,6 +151,8 @@ class Operations:
|
||||
for field in numeric_source_fields:
|
||||
body.metric_aggs(field, func, field)
|
||||
|
||||
#print(body.to_search_body(), columns, numeric_source_fields)
|
||||
|
||||
response = query_compiler._client.search(
|
||||
index=query_compiler._index_pattern,
|
||||
size=0,
|
||||
|
@ -4,6 +4,8 @@ from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
import eland as ed
|
||||
|
||||
|
||||
class TestDataFrameMetrics(TestData):
|
||||
|
||||
@ -42,3 +44,34 @@ class TestDataFrameMetrics(TestData):
|
||||
ed_max = ed_flights.max(numeric_only=True)
|
||||
|
||||
assert_series_equal(pd_max, ed_max)
|
||||
|
||||
def test_ecommerce_selected_non_numeric_source_fields_max(self):
    """Compare max(numeric_only=True) against pandas when no selected column is numeric."""
    non_numeric = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user']

    # Build both frames restricted to the same (entirely non-numeric) columns
    pd_frame = self.pd_ecommerce()[non_numeric]
    ed_frame = self.ed_ecommerce()[non_numeric]

    expected_max = pd_frame.max(numeric_only=True)
    actual_max = ed_frame.max(numeric_only=True)

    assert_series_equal(expected_max, actual_max)
|
||||
|
||||
def test_ecommerce_selected_mixed_numeric_source_fields_max(self):
    """Compare max(numeric_only=True) against pandas when only some selected columns are numeric."""
    mixed = [
        'category', 'currency', 'taxless_total_price', 'customer_birth_date',
        'total_quantity', 'customer_first_name', 'user',
    ]

    # Build both frames restricted to the same mixed numeric / non-numeric columns
    pd_frame = self.pd_ecommerce()[mixed]
    ed_frame = self.ed_ecommerce()[mixed]

    expected_max = pd_frame.max(numeric_only=True)
    actual_max = ed_frame.max(numeric_only=True)

    assert_series_equal(expected_max, actual_max, check_less_precise=True)
|
||||
|
||||
|
||||
def test_ecommerce_selected_all_numeric_source_fields_max(self):
    """Compare max(numeric_only=True) against pandas when every selected column is numeric."""
    all_numeric = ['total_quantity', 'taxful_total_price', 'taxless_total_price']

    # Build both frames restricted to the same (entirely numeric) columns
    pd_frame = self.pd_ecommerce()[all_numeric]
    ed_frame = self.ed_ecommerce()[all_numeric]

    expected_max = pd_frame.max(numeric_only=True)
    actual_max = ed_frame.max(numeric_only=True)

    assert_series_equal(expected_max, actual_max, check_less_precise=True)
|
||||
|
77
eland/tests/mappings/test_numeric_source_fields_pytest.py
Normal file
77
eland/tests/mappings/test_numeric_source_fields_pytest.py
Normal file
@ -0,0 +1,77 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestMappingsNumericSourceFields(TestData):
    """Check Mappings.numeric_source_fields against the numeric columns pandas selects."""

    @staticmethod
    def _assert_numeric_fields_match(ed_df, pd_df, columns):
        """Assert eland and pandas agree on the numeric fields, including their order.

        Parameters
        ----------
        ed_df:
            eland DataFrame whose mappings are queried
        pd_df:
            pandas DataFrame holding the same columns
        columns: list of str or None
            Value forwarded as the ``columns`` argument of numeric_source_fields
        """
        ed_numeric = ed_df._query_compiler._mappings.numeric_source_fields(columns=columns, include_bool=False)
        pd_numeric = pd_df.select_dtypes(include=np.number).columns.to_list()

        assert pd_numeric == ed_numeric

    def test_flights_numeric_source_fields(self):
        """With columns=None the numeric fields match pandas' numeric columns for flights."""
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()

        self._assert_numeric_fields_match(ed_flights, pd_flights, columns=None)

    def test_ecommerce_selected_non_numeric_source_fields(self):
        """
        Note: none of these are numeric
        category                       object
        currency                       object
        customer_birth_date    datetime64[ns]
        customer_first_name            object
        user                           object
        """
        columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user']

        ed_ecommerce = self.ed_ecommerce()[columns]
        pd_ecommerce = self.pd_ecommerce()[columns]

        self._assert_numeric_fields_match(ed_ecommerce, pd_ecommerce, columns=columns)

    def test_ecommerce_selected_mixed_numeric_source_fields(self):
        """
        Note: one is numeric
        category                       object
        currency                       object
        customer_birth_date    datetime64[ns]
        customer_first_name            object
        total_quantity                  int64
        user                           object
        """
        columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'total_quantity', 'user']

        ed_ecommerce = self.ed_ecommerce()[columns]
        pd_ecommerce = self.pd_ecommerce()[columns]

        self._assert_numeric_fields_match(ed_ecommerce, pd_ecommerce, columns=columns)

    def test_ecommerce_selected_all_numeric_source_fields(self):
        """
        Note: all are numeric
        total_quantity           int64
        taxful_total_price     float64
        taxless_total_price    float64
        """
        columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price']

        ed_ecommerce = self.ed_ecommerce()[columns]
        pd_ecommerce = self.pd_ecommerce()[columns]

        self._assert_numeric_fields_match(ed_ecommerce, pd_ecommerce, columns=columns)
|
Loading…
x
Reference in New Issue
Block a user