mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Root cause was incorrect filtering of numeric values by columns. Added more tests + fixed a doctest.
This commit is contained in:
parent
9b4fe40305
commit
2854eecbf4
@ -453,28 +453,23 @@ class Mappings:
|
|||||||
numeric_source_fields: list of str
|
numeric_source_fields: list of str
|
||||||
List of source fields where pd_dtype == (int64 or float64 or bool)
|
List of source fields where pd_dtype == (int64 or float64 or bool)
|
||||||
"""
|
"""
|
||||||
|
if include_bool == True:
|
||||||
|
df = self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||||
|
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||||
|
(self._mappings_capabilities.pd_dtype == 'float64') |
|
||||||
|
(self._mappings_capabilities.pd_dtype == 'bool'))]
|
||||||
|
else:
|
||||||
|
df = self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||||
|
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||||
|
(self._mappings_capabilities.pd_dtype == 'float64'))]
|
||||||
|
# if columns exists, filter index with columns
|
||||||
if columns is not None:
|
if columns is not None:
|
||||||
if include_bool == True:
|
# reindex adds NA for non-existing columns (non-numeric), so drop these after reindex
|
||||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
df = df.reindex(columns)
|
||||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
df.dropna(inplace=True)
|
||||||
(self._mappings_capabilities.pd_dtype == 'float64') |
|
|
||||||
(self._mappings_capabilities.pd_dtype == 'bool'))].reindex(
|
# return as list
|
||||||
columns).index.tolist()
|
return df.index.to_list()
|
||||||
else:
|
|
||||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
|
||||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
|
||||||
(self._mappings_capabilities.pd_dtype == 'float64'))].reindex(
|
|
||||||
columns).index.tolist()
|
|
||||||
else:
|
|
||||||
if include_bool == True:
|
|
||||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
|
||||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
|
||||||
(self._mappings_capabilities.pd_dtype == 'float64') |
|
|
||||||
(self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist()
|
|
||||||
else:
|
|
||||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
|
||||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
|
||||||
(self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
|
|
||||||
|
|
||||||
def source_fields(self):
|
def source_fields(self):
|
||||||
"""
|
"""
|
||||||
|
@ -488,16 +488,16 @@ class NDFrame:
|
|||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelay'])
|
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin'])
|
||||||
>>> df.describe() # ignoring percentiles as they don't generate consistent results
|
>>> df.describe() # ignoring percentiles as they don't generate consistent results
|
||||||
AvgTicketPrice FlightDelay
|
AvgTicketPrice FlightDelayMin
|
||||||
count 13059.000000 13059.000000
|
count 13059.000000 13059.000000
|
||||||
mean 628.253689 0.251168
|
mean 628.253689 47.335171
|
||||||
std 266.386661 0.433685
|
std 266.386661 96.743006
|
||||||
min 100.020531 0.000000
|
min 100.020531 0.000000
|
||||||
...
|
...
|
||||||
...
|
...
|
||||||
...
|
...
|
||||||
max 1199.729004 1.000000
|
max 1199.729004 360.000000
|
||||||
"""
|
"""
|
||||||
return self._query_compiler.describe()
|
return self._query_compiler.describe()
|
||||||
|
@ -151,6 +151,8 @@ class Operations:
|
|||||||
for field in numeric_source_fields:
|
for field in numeric_source_fields:
|
||||||
body.metric_aggs(field, func, field)
|
body.metric_aggs(field, func, field)
|
||||||
|
|
||||||
|
#print(body.to_search_body(), columns, numeric_source_fields)
|
||||||
|
|
||||||
response = query_compiler._client.search(
|
response = query_compiler._client.search(
|
||||||
index=query_compiler._index_pattern,
|
index=query_compiler._index_pattern,
|
||||||
size=0,
|
size=0,
|
||||||
|
@ -4,6 +4,8 @@ from pandas.util.testing import assert_series_equal
|
|||||||
|
|
||||||
from eland.tests.common import TestData
|
from eland.tests.common import TestData
|
||||||
|
|
||||||
|
import eland as ed
|
||||||
|
|
||||||
|
|
||||||
class TestDataFrameMetrics(TestData):
|
class TestDataFrameMetrics(TestData):
|
||||||
|
|
||||||
@ -42,3 +44,34 @@ class TestDataFrameMetrics(TestData):
|
|||||||
ed_max = ed_flights.max(numeric_only=True)
|
ed_max = ed_flights.max(numeric_only=True)
|
||||||
|
|
||||||
assert_series_equal(pd_max, ed_max)
|
assert_series_equal(pd_max, ed_max)
|
||||||
|
|
||||||
|
def test_ecommerce_selected_non_numeric_source_fields_max(self):
|
||||||
|
# None of these are numeric
|
||||||
|
columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user']
|
||||||
|
|
||||||
|
pd_ecommerce = self.pd_ecommerce()[columns]
|
||||||
|
ed_ecommerce = self.ed_ecommerce()[columns]
|
||||||
|
|
||||||
|
assert_series_equal(pd_ecommerce.max(numeric_only=True), ed_ecommerce.max(numeric_only=True))
|
||||||
|
|
||||||
|
def test_ecommerce_selected_mixed_numeric_source_fields_max(self):
|
||||||
|
# Some of these are numeric
|
||||||
|
columns = ['category', 'currency', 'taxless_total_price', 'customer_birth_date',
|
||||||
|
'total_quantity', 'customer_first_name', 'user']
|
||||||
|
|
||||||
|
pd_ecommerce = self.pd_ecommerce()[columns]
|
||||||
|
ed_ecommerce = self.ed_ecommerce()[columns]
|
||||||
|
|
||||||
|
assert_series_equal(pd_ecommerce.max(numeric_only=True), ed_ecommerce.max(numeric_only=True),
|
||||||
|
check_less_precise=True)
|
||||||
|
|
||||||
|
|
||||||
|
def test_ecommerce_selected_all_numeric_source_fields_max(self):
|
||||||
|
# All of these are numeric
|
||||||
|
columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price']
|
||||||
|
|
||||||
|
pd_ecommerce = self.pd_ecommerce()[columns]
|
||||||
|
ed_ecommerce = self.ed_ecommerce()[columns]
|
||||||
|
|
||||||
|
assert_series_equal(pd_ecommerce.max(numeric_only=True), ed_ecommerce.max(numeric_only=True),
|
||||||
|
check_less_precise=True)
|
||||||
|
77
eland/tests/mappings/test_numeric_source_fields_pytest.py
Normal file
77
eland/tests/mappings/test_numeric_source_fields_pytest.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
# File called _pytest for PyCharm compatibility
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from pandas.util.testing import assert_series_equal
|
||||||
|
|
||||||
|
from eland.tests.common import TestData
|
||||||
|
|
||||||
|
|
||||||
|
class TestMappingsNumericSourceFields(TestData):
|
||||||
|
|
||||||
|
def test_flights_numeric_source_fields(self):
|
||||||
|
ed_flights = self.ed_flights()
|
||||||
|
pd_flights = self.pd_flights()
|
||||||
|
|
||||||
|
ed_numeric = ed_flights._query_compiler._mappings.numeric_source_fields(columns=None, include_bool=False)
|
||||||
|
pd_numeric = pd_flights.select_dtypes(include=np.number)
|
||||||
|
|
||||||
|
assert pd_numeric.columns.to_list() == ed_numeric
|
||||||
|
|
||||||
|
def test_ecommerce_selected_non_numeric_source_fields(self):
|
||||||
|
columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user']
|
||||||
|
"""
|
||||||
|
Note: none of these are numeric
|
||||||
|
category object
|
||||||
|
currency object
|
||||||
|
customer_birth_date datetime64[ns]
|
||||||
|
customer_first_name object
|
||||||
|
user object
|
||||||
|
"""
|
||||||
|
|
||||||
|
ed_ecommerce = self.ed_ecommerce()[columns]
|
||||||
|
pd_ecommerce = self.pd_ecommerce()[columns]
|
||||||
|
|
||||||
|
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(columns=columns, include_bool=False)
|
||||||
|
pd_numeric = pd_ecommerce.select_dtypes(include=np.number)
|
||||||
|
|
||||||
|
assert pd_numeric.columns.to_list() == ed_numeric
|
||||||
|
|
||||||
|
def test_ecommerce_selected_mixed_numeric_source_fields(self):
|
||||||
|
columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'total_quantity', 'user']
|
||||||
|
|
||||||
|
"""
|
||||||
|
Note: one is numeric
|
||||||
|
category object
|
||||||
|
currency object
|
||||||
|
customer_birth_date datetime64[ns]
|
||||||
|
customer_first_name object
|
||||||
|
total_quantity int64
|
||||||
|
user object
|
||||||
|
"""
|
||||||
|
|
||||||
|
ed_ecommerce = self.ed_ecommerce()[columns]
|
||||||
|
pd_ecommerce = self.pd_ecommerce()[columns]
|
||||||
|
|
||||||
|
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(columns=columns, include_bool=False)
|
||||||
|
pd_numeric = pd_ecommerce.select_dtypes(include=np.number)
|
||||||
|
|
||||||
|
assert pd_numeric.columns.to_list() == ed_numeric
|
||||||
|
|
||||||
|
def test_ecommerce_selected_all_numeric_source_fields(self):
|
||||||
|
columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price']
|
||||||
|
|
||||||
|
"""
|
||||||
|
Note: all are numeric
|
||||||
|
total_quantity int64
|
||||||
|
taxful_total_price float64
|
||||||
|
taxless_total_price float64
|
||||||
|
"""
|
||||||
|
|
||||||
|
ed_ecommerce = self.ed_ecommerce()[columns]
|
||||||
|
pd_ecommerce = self.pd_ecommerce()[columns]
|
||||||
|
|
||||||
|
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(columns=columns, include_bool=False)
|
||||||
|
pd_numeric = pd_ecommerce.select_dtypes(include=np.number)
|
||||||
|
|
||||||
|
assert pd_numeric.columns.to_list() == ed_numeric
|
Loading…
x
Reference in New Issue
Block a user