eland/tests/conftest.py
Bart Broere 75c57b0775
Support Pandas 2 (#742)
* Fix test setup to match pandas 2.0 demands

* Use the now deprecated _append method

(Better solution might exist)

* Deal with numeric_only being removed in metrics test

* Skip mad metric for other pandas versions

* Account for differences between pandas versions in describe methods

* Run black

* Check Pandas version first

* Mirror behaviour of installed Pandas version when running value_counts

* Allow passing arguments to the individual asserters

* Fix for method _construct_axes_from_arguments no longer existing

* Skip mad metric if it does not exist

* Account for pandas 2.0 timestamp default behaviour

* Deal with empty vs other inferred data types

* Account for default datetime precision change

* Run Black

* Solution for differences in inferred_type only

* Fix csv and json issues

* Skip two doctests

* Passing a set as indexer is no longer allowed

* Don't validate output where it differs between Pandas versions in the environment

* Update test matrix and packaging metadata

* Update version of Python in the docs

* Update Python version in demo notebook

* Match noxfile

* Symmetry

* Fix trailing comma in JSON

* Revert some changes in setup.py to fix building the documentation

* Revert "Revert some changes in setup.py to fix building the documentation"

This reverts commit ea9879753129d8d8390b3cbbce57155a8b4fb346.

* Use PANDAS_VERSION from eland.common

* Still skip the doctest, but make the output pandas 2 instead of 1

* Still skip doctest, but switch to pandas 2 output

* Prepare for pandas 3

* Reference the right column

* Ignore output in tests but switch to pandas 2 output

* Add line comment about NBVAL_IGNORE_OUTPUT

* Restore missing line and add stderr cell

* Use non-private method instead

* Fix indentation and parameter issues

* If index is not specified, and pandas 1 is present, set it to True

From pandas 2 and upwards, index is set to None by default

* Run black

* Newer version of black might have different opinions?

* Add line comment

* Remove unused import

* Add reason for ignore statement

* Add reason for skip

---------

Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>
2025-02-04 17:43:43 +04:00

167 lines
5.3 KiB
Python

# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import inspect
import pandas as pd
import pytest
import eland as ed
from .common import (
TestData,
_ed_ecommerce,
_ed_flights,
_ed_flights_small,
_pd_ecommerce,
_pd_flights,
_pd_flights_small,
assert_frame_equal,
assert_pandas_eland_frame_equal,
assert_pandas_eland_series_equal,
assert_series_equal,
)
class SymmetricAPIChecker:
    """Proxy that mirrors every operation on an eland object and a pandas object.

    Attribute access, item access and method calls made on the checker are
    forwarded to both ``self.ed`` and ``self.pd``. Raised exceptions are
    compared for every operation; return values are additionally compared
    for attribute access and method calls. A mismatch surfaces as an
    ``AssertionError``.
    """

    def __init__(self, ed_obj, pd_obj):
        # ``ed`` is the eland-side object, ``pd`` the pandas-side reference.
        self.ed = ed_obj
        self.pd = pd_obj

    def load_dataset(self, dataset):
        """Replace the wrapped pair with one of the module-level datasets.

        ``dataset`` must be ``"flights"``, ``"flights_small"`` or
        ``"ecommerce"``; any other value raises ``ValueError``. The pandas
        side is copied so the shared module-level frame is not handed out
        directly.
        """
        if dataset == "flights":
            self.ed = _ed_flights
            self.pd = _pd_flights.copy()
        elif dataset == "flights_small":
            self.ed = _ed_flights_small
            self.pd = _pd_flights_small.copy()
        elif dataset == "ecommerce":
            self.ed = _ed_ecommerce
            self.pd = _pd_ecommerce.copy()
        else:
            raise ValueError(f"Unknown dataset {dataset!r}")

    def return_value_checker(self, func_name):
        """Returns a function which wraps the requested function
        and checks the return value when that function is inevitably
        called.

        The returned callable invokes ``func_name`` on both wrapped objects
        with the same arguments, compares exceptions and results, and
        returns either a new ``SymmetricAPIChecker`` (when eland produced a
        DataFrame/Series) or the pandas result.
        """

        def f(*args, **kwargs):
            ed_exc = None
            try:
                ed_obj = getattr(self.ed, func_name)(*args, **kwargs)
            except Exception as e:
                ed_exc = e

            pd_exc = None
            try:
                if func_name == "to_pandas":
                    # pandas objects have no to_pandas(); the pandas side
                    # is already the expected result.
                    pd_obj = self.pd
                else:
                    pd_obj = getattr(self.pd, func_name)(*args, **kwargs)
            except Exception as e:
                pd_exc = e

            # Raises if either side raised, so below this line both
            # ``ed_obj`` and ``pd_obj`` are guaranteed to be bound.
            self.check_exception(ed_exc, pd_exc)

            try:
                self.check_values(ed_obj, pd_obj)
            except AssertionError as e:
                # This is an attribute we allow to differ when comparing zero-length objects
                tolerated = (
                    'Attribute "inferred_type" are different' in repr(e)
                    and len(ed_obj) == 0
                    and len(pd_obj) == 0
                )
                if not tolerated:
                    # Any other mismatch is a genuine failure and must not
                    # be swallowed by this handler.
                    raise
                self.check_values(ed_obj, pd_obj, check_index_type=False)

            if isinstance(ed_obj, (ed.DataFrame, ed.Series)):
                return SymmetricAPIChecker(ed_obj, pd_obj)
            return pd_obj

        return f

    def check_values(self, ed_obj, pd_obj, **kwargs):
        """Checks that any two values coming from eland and pandas are equal

        ``kwargs`` are passed through to the frame/series asserters only;
        the ``pd.Index`` and plain-value branches ignore them.
        """
        if isinstance(ed_obj, ed.DataFrame):
            assert_pandas_eland_frame_equal(pd_obj, ed_obj, **kwargs)
        elif isinstance(ed_obj, ed.Series):
            assert_pandas_eland_series_equal(pd_obj, ed_obj, **kwargs)
        elif isinstance(ed_obj, pd.DataFrame):
            assert_frame_equal(ed_obj, pd_obj, **kwargs)
        elif isinstance(ed_obj, pd.Series):
            assert_series_equal(ed_obj, pd_obj, **kwargs)
        elif isinstance(ed_obj, pd.Index):
            assert ed_obj.equals(pd_obj)
        else:
            assert ed_obj == pd_obj

    def check_exception(self, ed_exc, pd_exc):
        """Checks that either an exception was raised or not from both eland and pandas

        Fails with ``AssertionError`` when only one side raised, or when the
        eland exception is not an instance of the pandas exception's type.
        When both raised compatibly, the pandas exception is re-raised.
        """
        assert (ed_exc is None) == (pd_exc is None) and isinstance(
            ed_exc, type(pd_exc)
        ), f"eland raised {ed_exc!r} but pandas raised {pd_exc!r}"
        if pd_exc is not None:
            raise pd_exc

    def __getitem__(self, item):
        """Index both wrapped objects with ``item`` and compare exceptions.

        A ``SymmetricAPIChecker`` used as the key is unwrapped so each side
        is indexed with its own object.
        """
        if isinstance(item, SymmetricAPIChecker):
            pd_item = item.pd
            ed_item = item.ed
        else:
            pd_item = ed_item = item

        ed_exc = None
        pd_exc = None
        try:
            pd_obj = self.pd[pd_item]
        except Exception as e:
            pd_exc = e
        try:
            ed_obj = self.ed[ed_item]
        except Exception as e:
            ed_exc = e

        self.check_exception(ed_exc, pd_exc)
        if isinstance(ed_obj, (ed.DataFrame, ed.Series)):
            return SymmetricAPIChecker(ed_obj, pd_obj)
        return pd_obj

    def __getattr__(self, item):
        """Forward attribute access to both wrapped objects.

        Functions and methods are returned wrapped by
        ``return_value_checker``; other attributes are compared immediately.
        """
        if item in ("ed", "pd"):
            # __getattr__ only runs for these names when __init__ never ran
            # (e.g. copy/pickle build the instance via __new__). Reading
            # self.pd below would then recurse forever, so fail cleanly.
            raise AttributeError(item)
        if item == "to_pandas":
            return self.return_value_checker("to_pandas")

        pd_obj = getattr(self.pd, item)
        if inspect.isfunction(pd_obj) or inspect.ismethod(pd_obj):
            return self.return_value_checker(item)
        ed_obj = getattr(self.ed, item)

        self.check_values(ed_obj, pd_obj)

        if isinstance(ed_obj, (ed.DataFrame, ed.Series)):
            return SymmetricAPIChecker(ed_obj, pd_obj)
        return pd_obj
@pytest.fixture(scope="function")
def df():
    """Function-scoped checker pairing the small flights eland and pandas frames.

    The pandas frame is copied before being wrapped, so the checker never
    holds the module-level frame itself.
    """
    pandas_flights = _pd_flights_small.copy()
    return SymmetricAPIChecker(ed_obj=_ed_flights_small, pd_obj=pandas_flights)
@pytest.fixture(scope="session")
def testdata():
    """Session-scoped fixture returning a ``TestData`` instance."""
    shared_testdata = TestData()
    return shared_testdata