diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index db44b60..b42a6d0 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -29,11 +29,16 @@ steps: machineType: "n2-standard-4" env: PYTHON_VERSION: "{{ matrix.python }}" - PANDAS_VERSION: '1.5.0' + PANDAS_VERSION: "{{ matrix.pandas }}" TEST_SUITE: "xpack" ELASTICSEARCH_VERSION: "{{ matrix.stack }}" matrix: setup: + # Python and pandas versions need to be added to the nox configuration too + # (in the decorators of the test method in noxfile.py) + pandas: + - '1.5.0' + - '2.2.3' python: - '3.12' - '3.11' diff --git a/docs/sphinx/examples/demo_notebook.ipynb b/docs/sphinx/examples/demo_notebook.ipynb index 64abb6a..9321680 100644 --- a/docs/sphinx/examples/demo_notebook.ipynb +++ b/docs/sphinx/examples/demo_notebook.ipynb @@ -24,7 +24,7 @@ "\n", "For this example, you will need:\n", "\n", - "- Python 3.8 or later\n", + "- Python 3.9 or later\n", "- An Elastic deployment\n", " - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration))\n", "\n", diff --git a/eland/dataframe.py b/eland/dataframe.py index 7c5f954..6818423 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -34,7 +34,7 @@ from pandas.io.formats.printing import pprint_thing # type: ignore from pandas.util._validators import validate_bool_kwarg # type: ignore import eland.plotting as gfx -from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter +from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter from eland.filter import BooleanFilter from eland.groupby import DataFrameGroupBy from eland.ndframe import NDFrame @@ -411,9 +411,7 @@ class DataFrame(NDFrame): axis = pd.DataFrame._get_axis_name(axis) axes = {axis: labels} elif index is not None or columns is not None: - axes, _ = pd.DataFrame()._construct_axes_from_arguments( - (index, columns), {} - ) + axes = {"columns": columns, "index": index} else: raise ValueError( "Need to specify at least one of 'labels', 'index' or 'columns'" @@ -1361,7 +1359,7 @@ class DataFrame(NDFrame): default_handler=None, lines=False, compression="infer", - index=True, + index=None, indent=None, storage_options=None, ): @@ -1376,6 +1374,8 @@ class DataFrame(NDFrame): -------- :pandas_api_docs:`pandas.DataFrame.to_json` """ + if index is None and PANDAS_VERSION[0] == 1: + index = True # switch to the pandas 1 default kwargs = { "path_or_buf": path_or_buf, "orient": orient, diff --git a/eland/etl.py b/eland/etl.py index df1c5ed..5938746 100644 --- a/eland/etl.py +++ b/eland/etl.py @@ -16,6 +16,7 @@ # under the License. import csv +import warnings from collections import deque from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union @@ -110,15 +111,15 @@ def pandas_to_eland( 2 3.141 1 ... 3 Long text - to be indexed as es type text [3 rows x 8 columns] - >>> pd_df.dtypes - A float64 - B int64 - C object - D datetime64[ns] - E float64 - F bool - G int64 - H object + >>> pd_df.dtypes # doctest skip required for pandas < 2 # doctest: +SKIP + A float64 + B int64 + C object + D datetime64[s] + E float64 + F bool + G int64 + H object dtype: object Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`. @@ -307,9 +308,9 @@ def csv_to_eland( # type: ignore names=None, index_col=None, usecols=None, - squeeze=False, + squeeze=None, prefix=None, - mangle_dupe_cols=True, + mangle_dupe_cols=None, # General Parsing Configuration dtype=None, engine=None, @@ -357,6 +358,7 @@ def csv_to_eland( # type: ignore low_memory: bool = _DEFAULT_LOW_MEMORY, memory_map=False, float_precision=None, + **extra_kwargs, ) -> "DataFrame": """ Read a comma-separated values (csv) file into eland.DataFrame (i.e. an Elasticsearch index). @@ -485,7 +487,6 @@ def csv_to_eland( # type: ignore "usecols": usecols, "verbose": verbose, "encoding": encoding, - "squeeze": squeeze, "memory_map": memory_map, "float_precision": float_precision, "na_filter": na_filter, @@ -494,9 +495,9 @@ def csv_to_eland( # type: ignore "error_bad_lines": error_bad_lines, "on_bad_lines": on_bad_lines, "low_memory": low_memory, - "mangle_dupe_cols": mangle_dupe_cols, "infer_datetime_format": infer_datetime_format, "skip_blank_lines": skip_blank_lines, + **extra_kwargs, } if chunksize is None: @@ -525,6 +526,18 @@ def csv_to_eland( # type: ignore kwargs.pop("on_bad_lines") + if "squeeze" in kwargs: + kwargs.pop("squeeze") + warnings.warn( + "This argument no longer works, use .squeeze('columns') on your DataFrame instead" + ) + + if "mangle_dupe_cols" in kwargs: + kwargs.pop("mangle_dupe_cols") + warnings.warn( + "The mangle_dupe_cols argument no longer works. Furthermore, " + "duplicate columns will automatically get a number suffix." + ) # read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch) reader = pd.read_csv(filepath_or_buffer, **kwargs) diff --git a/eland/field_mappings.py b/eland/field_mappings.py index 94cd2e6..37dbe78 100644 --- a/eland/field_mappings.py +++ b/eland/field_mappings.py @@ -712,8 +712,11 @@ class FieldMappings: capabilities, orient="index", columns=FieldMappings.column_labels ) - self._mappings_capabilities = self._mappings_capabilities.append( - capability_matrix_row + self._mappings_capabilities = pd.concat( + [ + self._mappings_capabilities, + capability_matrix_row, + ] ) def numeric_source_fields(self) -> List[str]: diff --git a/eland/ml/exporters/es_gb_models.py b/eland/ml/exporters/es_gb_models.py index 4f1e197..cd50b35 100644 --- a/eland/ml/exporters/es_gb_models.py +++ b/eland/ml/exporters/es_gb_models.py @@ -187,7 +187,7 @@ class ESGradientBoostingModel(ABC): if field_name in feature_names and field_name not in input_field_names: input_field_names.add(field_name) - return feature_names, input_field_names + return feature_names, list(input_field_names) @property def preprocessors(self) -> List[Any]: diff --git a/eland/series.py b/eland/series.py index d78fe8b..2463436 100644 --- a/eland/series.py +++ b/eland/series.py @@ -40,11 +40,12 @@ from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union import numpy as np import pandas as pd # type: ignore +from pandas.core.indexes.frozen import FrozenList from pandas.io.common import _expand_user, stringify_path # type: ignore import eland.plotting from eland.arithmetics import ArithmeticNumber, ArithmeticSeries, ArithmeticString -from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter +from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter from eland.filter import ( BooleanFilter, Equal, @@ -292,18 +293,26 @@ class Series(NDFrame): Examples -------- >>> df = ed.DataFrame('http://localhost:9200', 'flights') - >>> df['Carrier'].value_counts() + >>> df['Carrier'].value_counts() # doctest: +SKIP + Carrier Logstash Airways 3331 JetBeats 3274 Kibana Airlines 3234 ES-Air 3220 - Name: Carrier, dtype: int64 + Name: count, dtype: int64 """ if not isinstance(es_size, int): raise TypeError("es_size must be a positive integer.") elif es_size <= 0: raise ValueError("es_size must be a positive integer.") - return self._query_compiler.value_counts(es_size) + value_counts = self._query_compiler.value_counts(es_size) + # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#value-counts-sets-the-resulting-name-to-count + if PANDAS_VERSION[0] == 2: + value_counts.name = "count" + value_counts.index.names = FrozenList([self.es_field_name]) + value_counts.index.name = self.es_field_name + + return value_counts # dtype not implemented for Series as causes query to fail # in pandas.core.computation.ops.Term.type diff --git a/noxfile.py b/noxfile.py index 3632496..3a390ba 100644 --- a/noxfile.py +++ b/noxfile.py @@ -96,7 +96,7 @@ def lint(session): @nox.session(python=["3.9", "3.10", "3.11", "3.12"]) -@nox.parametrize("pandas_version", ["1.5.0"]) +@nox.parametrize("pandas_version", ["1.5.0", "2.2.3"]) def test(session, pandas_version: str): session.install("-r", "requirements-dev.txt") session.install(".") diff --git a/setup.py b/setup.py index 4ac65d1..65dc1f2 100644 --- a/setup.py +++ b/setup.py @@ -87,7 +87,7 @@ setup( packages=find_packages(include=["eland", "eland.*"]), install_requires=[ "elasticsearch>=8.3,<9", - "pandas>=1.5,<2", + "pandas>=1.5,<3", "matplotlib>=3.6", "numpy>=1.2.0,<2", "packaging", diff --git a/tests/common.py b/tests/common.py index 3e8619b..f95e33e 100644 --- a/tests/common.py +++ b/tests/common.py @@ -24,6 +24,7 @@ import pandas as pd from pandas.testing import assert_frame_equal, assert_series_equal import eland as ed +from eland.common import PANDAS_VERSION ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -45,7 +46,10 @@ with gzip.open(FLIGHTS_FILE_NAME) as f: _pd_flights = pd.DataFrame.from_records(flight_records).reindex( _ed_flights.columns, axis=1 ) -_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"]) +if PANDAS_VERSION[0] >= 2: + _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"], format="mixed") +else: + _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"]) # Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list _pd_flights["Cities"] = _pd_flights.apply( lambda x: list(sorted([x["OriginCityName"], x["DestCityName"]])), axis=1 @@ -62,7 +66,7 @@ _pd_ecommerce["products.created_on"] = _pd_ecommerce["products.created_on"].appl ) _pd_ecommerce.insert(2, "customer_birth_date", None) _pd_ecommerce.index = _pd_ecommerce.index.map(str) # make index 'object' not int -_pd_ecommerce["customer_birth_date"].astype("datetime64") +_pd_ecommerce["customer_birth_date"].astype("datetime64[ns]") _ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME) diff --git a/tests/conftest.py b/tests/conftest.py index 6626373..7b8ae9f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -77,7 +77,16 @@ class SymmetricAPIChecker: pd_exc = e self.check_exception(ed_exc, pd_exc) - self.check_values(ed_obj, pd_obj) + try: + self.check_values(ed_obj, pd_obj) + except AssertionError as e: + # This is an attribute we allow to differ when comparing zero-length objects + if ( + 'Attribute "inferred_type" are different' in repr(e) + and len(ed_obj) == 0 + and len(pd_obj) == 0 + ): + self.check_values(ed_obj, pd_obj, check_index_type=False) if isinstance(ed_obj, (ed.DataFrame, ed.Series)): return SymmetricAPIChecker(ed_obj, pd_obj) @@ -85,16 +94,16 @@ class SymmetricAPIChecker: return f - def check_values(self, ed_obj, pd_obj): + def check_values(self, ed_obj, pd_obj, **kwargs): """Checks that any two values coming from eland and pandas are equal""" if isinstance(ed_obj, ed.DataFrame): - assert_pandas_eland_frame_equal(pd_obj, ed_obj) + assert_pandas_eland_frame_equal(pd_obj, ed_obj, **kwargs) elif isinstance(ed_obj, ed.Series): - assert_pandas_eland_series_equal(pd_obj, ed_obj) + assert_pandas_eland_series_equal(pd_obj, ed_obj, **kwargs) elif isinstance(ed_obj, pd.DataFrame): - assert_frame_equal(ed_obj, pd_obj) + assert_frame_equal(ed_obj, pd_obj, **kwargs) elif isinstance(ed_obj, pd.Series): - assert_series_equal(ed_obj, pd_obj) + assert_series_equal(ed_obj, pd_obj, **kwargs) elif isinstance(ed_obj, pd.Index): assert ed_obj.equals(pd_obj) else: diff --git a/tests/dataframe/test_datetime_pytest.py b/tests/dataframe/test_datetime_pytest.py index adc8d80..1cabc2b 100644 --- a/tests/dataframe/test_datetime_pytest.py +++ b/tests/dataframe/test_datetime_pytest.py @@ -87,6 +87,8 @@ class TestDataFrameDateTime(TestData): }, index=["0", "1", "2"], ) + # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution + df["D"] = df["D"].astype("datetime64[ns]") expected_mappings = { "mappings": { diff --git a/tests/dataframe/test_describe_pytest.py b/tests/dataframe/test_describe_pytest.py index a0f585b..e3e1e02 100644 --- a/tests/dataframe/test_describe_pytest.py +++ b/tests/dataframe/test_describe_pytest.py @@ -33,9 +33,17 @@ class TestDataFrameDescribe(TestData): ["Cancelled", "FlightDelay"], axis="columns" ) + # Pandas >= 2 calculates aggregations such as min and max for timestamps too + # This could be implemented in eland, but as of yet this is not the case + # We therefore remove it before the comparison + if "timestamp" in pd_describe.columns: + pd_describe = pd_describe.drop(["timestamp"], axis="columns") + + # Pandas >= 2 orders the aggregations differently than Pandas < 2 + # A sort_index is applied so tests will succeed in both environments assert_frame_equal( - pd_describe.drop(["25%", "50%", "75%"], axis="index"), - ed_describe.drop(["25%", "50%", "75%"], axis="index"), + pd_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(), + ed_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(), check_exact=False, rtol=True, ) diff --git a/tests/dataframe/test_head_tail_pytest.py b/tests/dataframe/test_head_tail_pytest.py index 2f4a702..3923771 100644 --- a/tests/dataframe/test_head_tail_pytest.py +++ b/tests/dataframe/test_head_tail_pytest.py @@ -99,7 +99,7 @@ class TestDataFrameHeadTail(TestData): ed_head_0 = ed_flights.head(0) pd_head_0 = pd_flights.head(0) - assert_pandas_eland_frame_equal(pd_head_0, ed_head_0) + assert_pandas_eland_frame_equal(pd_head_0, ed_head_0, check_index_type=False) def test_doc_test_tail(self): df = self.ed_flights() diff --git a/tests/dataframe/test_metrics_pytest.py b/tests/dataframe/test_metrics_pytest.py index 9477629..9f33dc9 100644 --- a/tests/dataframe/test_metrics_pytest.py +++ b/tests/dataframe/test_metrics_pytest.py @@ -22,6 +22,7 @@ import pandas as pd import pytest from pandas.testing import assert_frame_equal, assert_series_equal +from eland.common import PANDAS_VERSION from tests.common import TestData, assert_almost_equal @@ -74,6 +75,8 @@ class TestDataFrameMetrics(TestData): logger.setLevel(logging.DEBUG) for func in self.extended_funcs: + if PANDAS_VERSION[0] >= 2 and func == "mad": + continue pd_metric = getattr(pd_flights, func)( **({"numeric_only": True} if func != "mad" else {}) ) @@ -92,6 +95,8 @@ class TestDataFrameMetrics(TestData): ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]] for func in self.extended_funcs: + if PANDAS_VERSION[0] >= 2 and func == "mad": + continue pd_metric = getattr(pd_flights_1, func)() ed_metric = getattr(ed_flights_1, func)(numeric_only=False) @@ -102,6 +107,8 @@ class TestDataFrameMetrics(TestData): ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]] for func in self.extended_funcs: + if PANDAS_VERSION[0] >= 2 and func == "mad": + continue pd_metric = getattr(pd_flights_0, func)() ed_metric = getattr(ed_flights_0, func)(numeric_only=False) @@ -491,8 +498,13 @@ class TestDataFrameMetrics(TestData): ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"] ) - pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only) - ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only) + if PANDAS_VERSION[0] == 1: + pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only) + ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only) + + else: # numeric_only is no longer available for pandas > 2 + pd_quantile = pd_flights.agg(["quantile", "min"]) + ed_quantile = ed_flights.agg(["quantile", "min"]) assert_frame_equal( pd_quantile, ed_quantile, check_exact=False, rtol=4, check_dtype=False diff --git a/tests/dataframe/test_utils_pytest.py b/tests/dataframe/test_utils_pytest.py index 5cdb591..7fb5892 100644 --- a/tests/dataframe/test_utils_pytest.py +++ b/tests/dataframe/test_utils_pytest.py @@ -69,6 +69,12 @@ class TestDataFrameUtils(TestData): ) ed_df_head = ed_df.head() + # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution + df["D"] = df["D"].astype("datetime64[ns]") + df["H"] = ( + df["H"].dt.tz_localize(None).astype("datetime64[ns]").dt.tz_localize("UTC") + ) + assert_pandas_eland_frame_equal(df, ed_df_head) ES_TEST_CLIENT.indices.delete(index=index_name) diff --git a/tests/notebook/test_demo_notebook.ipynb b/tests/notebook/test_demo_notebook.ipynb index a3b42d0..d0b9ccf 100644 --- a/tests/notebook/test_demo_notebook.ipynb +++ b/tests/notebook/test_demo_notebook.ipynb @@ -1647,6 +1647,14 @@ "execution_count": 32, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/code/eland/.nox/test-3-12-pandas_version-2-2-3/lib/python3.12/site-packages/eland/series.py:464: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " return self._query_compiler.dtypes[0]\n" + ] + }, { "data": { "text/html": [ @@ -1792,6 +1800,9 @@ } ], "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "# The ignore statement above is because of output difference between Pandas 1 and 2\n", + "# and can be removed once Pandas 1 support is dropped\n", "ed_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled == True')" ] }, @@ -2377,8 +2388,8 @@ " AvgTicketPrice\n", " DistanceKilometers\n", " ...\n", - " FlightTimeMin\n", " dayOfWeek\n", + " timestamp\n", " \n", " \n", " \n", @@ -2388,23 +2399,15 @@ " 13059.000000\n", " ...\n", " 13059.000000\n", - " 13059.000000\n", + " 13059\n", " \n", " \n", " mean\n", " 628.253689\n", " 7092.142455\n", " ...\n", - " 511.127842\n", " 2.835975\n", - " \n", - " \n", - " std\n", - " 266.396861\n", - " 4578.438497\n", - " ...\n", - " 334.753952\n", - " 1.939439\n", + " 2018-01-21 19:20:45.564438016\n", " \n", " \n", " min\n", @@ -2412,57 +2415,65 @@ " 0.000000\n", " ...\n", " 0.000000\n", - " 0.000000\n", + " 2018-01-01 00:00:00\n", " \n", " \n", " 25%\n", " 409.893816\n", " 2459.705673\n", " ...\n", - " 252.333192\n", " 1.000000\n", + " 2018-01-11 05:16:25.500000\n", " \n", " \n", " 50%\n", " 640.556668\n", " 7610.330866\n", " ...\n", - " 503.045170\n", " 3.000000\n", + " 2018-01-22 00:32:11\n", " \n", " \n", " 75%\n", " 842.185470\n", " 9736.637600\n", " ...\n", - " 720.416036\n", " 4.000000\n", + " 2018-02-01 04:51:18\n", " \n", " \n", " max\n", " 1199.729053\n", " 19881.482315\n", " ...\n", - " 1902.902032\n", " 6.000000\n", + " 2018-02-11 23:50:12\n", + " \n", + " \n", + " std\n", + " 266.396861\n", + " 4578.438497\n", + " ...\n", + " 1.939439\n", + " NaN\n", " \n", " \n", "\n", - "

8 rows × 7 columns

\n", + "

8 rows × 8 columns

\n", "" ], "text/plain": [ - " AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n", - "count 13059.000000 13059.000000 ... 13059.000000 13059.000000\n", - "mean 628.253689 7092.142455 ... 511.127842 2.835975\n", - "std 266.396861 4578.438497 ... 334.753952 1.939439\n", - "min 100.020528 0.000000 ... 0.000000 0.000000\n", - "25% 409.893816 2459.705673 ... 252.333192 1.000000\n", - "50% 640.556668 7610.330866 ... 503.045170 3.000000\n", - "75% 842.185470 9736.637600 ... 720.416036 4.000000\n", - "max 1199.729053 19881.482315 ... 1902.902032 6.000000\n", + " AvgTicketPrice DistanceKilometers ... dayOfWeek timestamp\n", + "count 13059.000000 13059.000000 ... 13059.000000 13059\n", + "mean 628.253689 7092.142455 ... 2.835975 2018-01-21 19:20:45.564438016\n", + "min 100.020528 0.000000 ... 0.000000 2018-01-01 00:00:00\n", + "25% 409.893816 2459.705673 ... 1.000000 2018-01-11 05:16:25.500000\n", + "50% 640.556668 7610.330866 ... 3.000000 2018-01-22 00:32:11\n", + "75% 842.185470 9736.637600 ... 4.000000 2018-02-01 04:51:18\n", + "max 1199.729053 19881.482315 ... 6.000000 2018-02-11 23:50:12\n", + "std 266.396861 4578.438497 ... 1.939439 NaN\n", "\n", - "[8 rows x 7 columns]" + "[8 rows x 8 columns]" ] }, "execution_count": 39, @@ -2471,6 +2482,8 @@ } ], "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "# Once support for pandas <2 is dropped, this and the line above can be removed\n", "pd_flights.describe()" ] }, diff --git a/tests/series/test_filter_pytest.py b/tests/series/test_filter_pytest.py index 948f6d0..0e51201 100644 --- a/tests/series/test_filter_pytest.py +++ b/tests/series/test_filter_pytest.py @@ -58,7 +58,9 @@ class TestSeriesFilter(TestData): ed_ser = ed_flights_small.filter(items=items, axis=0) pd_ser = pd_flights_small.filter(items=items, axis=0) - assert_pandas_eland_series_equal(pd_ser, ed_ser) + # For an empty Series, eland will say the datatype it knows from the Elastic index + # Pandas however will state empty as the datatype + assert_pandas_eland_series_equal(pd_ser, ed_ser, check_index_type=False) def test_flights_filter_index_like_and_regex(self): ed_flights_small = self.ed_flights_small()["FlightDelayType"] diff --git a/tests/series/test_metrics_pytest.py b/tests/series/test_metrics_pytest.py index 1fde9ed..4ab7eac 100644 --- a/tests/series/test_metrics_pytest.py +++ b/tests/series/test_metrics_pytest.py @@ -24,6 +24,7 @@ import pandas as pd import pytest from pandas.testing import assert_series_equal +from eland.common import PANDAS_VERSION from tests.common import TestData, assert_almost_equal @@ -42,6 +43,8 @@ class TestSeriesMetrics(TestData): ed_flights = self.ed_flights()["AvgTicketPrice"] for func in self.all_funcs: + if PANDAS_VERSION[0] >= 2 and func == "mad": + continue pd_metric = getattr(pd_flights, func)() ed_metric = getattr(ed_flights, func)() @@ -87,6 +90,8 @@ class TestSeriesMetrics(TestData): ed_ecommerce = self.ed_ecommerce()[column] for func in self.all_funcs: + if PANDAS_VERSION[0] >= 2 and func == "mad": + continue pd_metric = getattr(pd_ecommerce, func)() ed_metric = getattr(ed_ecommerce, func)( **({"numeric_only": True} if (func != "nunique") else {})