[Backport 8.x] Fixes for Pandas 2 support (#758)

* Support Pandas 2 (#742)

* Fix test setup to match pandas 2.0 demands

* Use the now deprecated _append method

(Better solution might exist)

* Deal with numeric_only being removed in metrics test

* Skip mad metric for other pandas versions

* Account for differences between pandas versions in describe methods

* Run black

* Check Pandas version first

* Mirror behaviour of installed Pandas version when running value_counts

* Allow passing arguments to the individual asserters

* Fix for method _construct_axes_from_arguments no longer existing

* Skip mad metric if it does not exist

* Account for pandas 2.0 timestamp default behaviour

* Deal with empty vs other inferred data types

* Account for default datetime precision change

* Run Black

* Solution for differences in inferred_type only

* Fix csv and json issues

* Skip two doctests

* Passing a set as indexer is no longer allowed

* Don't validate output where it differs between Pandas versions in the environment

* Update test matrix and packaging metadata

* Update version of Python in the docs

* Update Python version in demo notebook

* Match noxfile

* Symmetry

* Fix trailing comma in JSON

* Revert some changes in setup.py to fix building the documentation

* Revert "Revert some changes in setup.py to fix building the documentation"

This reverts commit ea9879753129d8d8390b3cbbce57155a8b4fb346.

* Use PANDAS_VERSION from eland.common

* Still skip the doctest, but make the output pandas 2 instead of 1

* Still skip doctest, but switch to pandas 2 output

* Prepare for pandas 3

* Reference the right column

* Ignore output in tests but switch to pandas 2 output

* Add line comment about NBVAL_IGNORE_OUTPUT

* Restore missing line and add stderr cell

* Use non-private method instead

* Fix indentation and parameter issues

* If index is not specified, and pandas 1 is present, set it to True

From pandas 2 onwards, index defaults to None

* Run black

* Newer version of black might have different opinions?

* Add line comment

* Remove unused import

* Add reason for ignore statement

* Add reason for skip

---------

Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>
(cherry picked from commit 75c57b077532c459a9490613cbf7b37215c27fae)

* Return input_field_names as list as required by Pandas 2

---------

Co-authored-by: Bart Broere <mail@bartbroere.eu>
Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>
This commit is contained in:
github-actions[bot] 2025-02-13 14:16:49 +04:00 committed by GitHub
parent d50436b01c
commit af20ef9063
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 161 additions and 70 deletions

View File

@ -29,11 +29,16 @@ steps:
machineType: "n2-standard-4"
env:
PYTHON_VERSION: "{{ matrix.python }}"
PANDAS_VERSION: '1.5.0'
PANDAS_VERSION: "{{ matrix.pandas }}"
TEST_SUITE: "xpack"
ELASTICSEARCH_VERSION: "{{ matrix.stack }}"
matrix:
setup:
# Python and pandas versions need to be added to the nox configuration too
# (in the decorators of the test method in noxfile.py)
pandas:
- '1.5.0'
- '2.2.3'
python:
- '3.12'
- '3.11'

View File

@ -24,7 +24,7 @@
"\n",
"For this example, you will need:\n",
"\n",
"- Python 3.8 or later\n",
"- Python 3.9 or later\n",
"- An Elastic deployment\n",
" - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration))\n",
"\n",

View File

@ -34,7 +34,7 @@ from pandas.io.formats.printing import pprint_thing # type: ignore
from pandas.util._validators import validate_bool_kwarg # type: ignore
import eland.plotting as gfx
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
from eland.filter import BooleanFilter
from eland.groupby import DataFrameGroupBy
from eland.ndframe import NDFrame
@ -411,9 +411,7 @@ class DataFrame(NDFrame):
axis = pd.DataFrame._get_axis_name(axis)
axes = {axis: labels}
elif index is not None or columns is not None:
axes, _ = pd.DataFrame()._construct_axes_from_arguments(
(index, columns), {}
)
axes = {"columns": columns, "index": index}
else:
raise ValueError(
"Need to specify at least one of 'labels', 'index' or 'columns'"
@ -1361,7 +1359,7 @@ class DataFrame(NDFrame):
default_handler=None,
lines=False,
compression="infer",
index=True,
index=None,
indent=None,
storage_options=None,
):
@ -1376,6 +1374,8 @@ class DataFrame(NDFrame):
--------
:pandas_api_docs:`pandas.DataFrame.to_json`
"""
if index is None and PANDAS_VERSION[0] == 1:
index = True # switch to the pandas 1 default
kwargs = {
"path_or_buf": path_or_buf,
"orient": orient,

View File

@ -16,6 +16,7 @@
# under the License.
import csv
import warnings
from collections import deque
from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union
@ -110,15 +111,15 @@ def pandas_to_eland(
2 3.141 1 ... 3 Long text - to be indexed as es type text
<BLANKLINE>
[3 rows x 8 columns]
>>> pd_df.dtypes
A float64
B int64
C object
D datetime64[ns]
E float64
F bool
G int64
H object
>>> pd_df.dtypes # doctest skip required for pandas < 2 # doctest: +SKIP
A float64
B int64
C object
D datetime64[s]
E float64
F bool
G int64
H object
dtype: object
Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`.
@ -307,9 +308,9 @@ def csv_to_eland( # type: ignore
names=None,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
prefix=None,
mangle_dupe_cols=True,
mangle_dupe_cols=None,
# General Parsing Configuration
dtype=None,
engine=None,
@ -357,6 +358,7 @@ def csv_to_eland( # type: ignore
low_memory: bool = _DEFAULT_LOW_MEMORY,
memory_map=False,
float_precision=None,
**extra_kwargs,
) -> "DataFrame":
"""
Read a comma-separated values (csv) file into eland.DataFrame (i.e. an Elasticsearch index).
@ -485,7 +487,6 @@ def csv_to_eland( # type: ignore
"usecols": usecols,
"verbose": verbose,
"encoding": encoding,
"squeeze": squeeze,
"memory_map": memory_map,
"float_precision": float_precision,
"na_filter": na_filter,
@ -494,9 +495,9 @@ def csv_to_eland( # type: ignore
"error_bad_lines": error_bad_lines,
"on_bad_lines": on_bad_lines,
"low_memory": low_memory,
"mangle_dupe_cols": mangle_dupe_cols,
"infer_datetime_format": infer_datetime_format,
"skip_blank_lines": skip_blank_lines,
**extra_kwargs,
}
if chunksize is None:
@ -525,6 +526,18 @@ def csv_to_eland( # type: ignore
kwargs.pop("on_bad_lines")
if "squeeze" in kwargs:
kwargs.pop("squeeze")
warnings.warn(
"This argument no longer works, use .squeeze('columns') on your DataFrame instead"
)
if "mangle_dupe_cols" in kwargs:
kwargs.pop("mangle_dupe_cols")
warnings.warn(
"The mangle_dupe_cols argument no longer works. Furthermore, "
"duplicate columns will automatically get a number suffix."
)
# read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch)
reader = pd.read_csv(filepath_or_buffer, **kwargs)

View File

@ -712,8 +712,11 @@ class FieldMappings:
capabilities, orient="index", columns=FieldMappings.column_labels
)
self._mappings_capabilities = self._mappings_capabilities.append(
capability_matrix_row
self._mappings_capabilities = pd.concat(
[
self._mappings_capabilities,
capability_matrix_row,
]
)
def numeric_source_fields(self) -> List[str]:

View File

@ -187,7 +187,7 @@ class ESGradientBoostingModel(ABC):
if field_name in feature_names and field_name not in input_field_names:
input_field_names.add(field_name)
return feature_names, input_field_names
return feature_names, list(input_field_names)
@property
def preprocessors(self) -> List[Any]:

View File

@ -40,11 +40,12 @@ from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union
import numpy as np
import pandas as pd # type: ignore
from pandas.core.indexes.frozen import FrozenList
from pandas.io.common import _expand_user, stringify_path # type: ignore
import eland.plotting
from eland.arithmetics import ArithmeticNumber, ArithmeticSeries, ArithmeticString
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
from eland.filter import (
BooleanFilter,
Equal,
@ -292,18 +293,26 @@ class Series(NDFrame):
Examples
--------
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
>>> df['Carrier'].value_counts()
>>> df['Carrier'].value_counts() # doctest: +SKIP
Carrier
Logstash Airways 3331
JetBeats 3274
Kibana Airlines 3234
ES-Air 3220
Name: Carrier, dtype: int64
Name: count, dtype: int64
"""
if not isinstance(es_size, int):
raise TypeError("es_size must be a positive integer.")
elif es_size <= 0:
raise ValueError("es_size must be a positive integer.")
return self._query_compiler.value_counts(es_size)
value_counts = self._query_compiler.value_counts(es_size)
# https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#value-counts-sets-the-resulting-name-to-count
if PANDAS_VERSION[0] == 2:
value_counts.name = "count"
value_counts.index.names = FrozenList([self.es_field_name])
value_counts.index.name = self.es_field_name
return value_counts
# dtype not implemented for Series as causes query to fail
# in pandas.core.computation.ops.Term.type

View File

@ -96,7 +96,7 @@ def lint(session):
@nox.session(python=["3.9", "3.10", "3.11", "3.12"])
@nox.parametrize("pandas_version", ["1.5.0"])
@nox.parametrize("pandas_version", ["1.5.0", "2.2.3"])
def test(session, pandas_version: str):
session.install("-r", "requirements-dev.txt")
session.install(".")

View File

@ -87,7 +87,7 @@ setup(
packages=find_packages(include=["eland", "eland.*"]),
install_requires=[
"elasticsearch>=8.3,<9",
"pandas>=1.5,<2",
"pandas>=1.5,<3",
"matplotlib>=3.6",
"numpy>=1.2.0,<2",
"packaging",

View File

@ -24,6 +24,7 @@ import pandas as pd
from pandas.testing import assert_frame_equal, assert_series_equal
import eland as ed
from eland.common import PANDAS_VERSION
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
@ -45,7 +46,10 @@ with gzip.open(FLIGHTS_FILE_NAME) as f:
_pd_flights = pd.DataFrame.from_records(flight_records).reindex(
_ed_flights.columns, axis=1
)
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
if PANDAS_VERSION[0] >= 2:
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"], format="mixed")
else:
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
# Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list
_pd_flights["Cities"] = _pd_flights.apply(
lambda x: list(sorted([x["OriginCityName"], x["DestCityName"]])), axis=1
@ -62,7 +66,7 @@ _pd_ecommerce["products.created_on"] = _pd_ecommerce["products.created_on"].appl
)
_pd_ecommerce.insert(2, "customer_birth_date", None)
_pd_ecommerce.index = _pd_ecommerce.index.map(str) # make index 'object' not int
_pd_ecommerce["customer_birth_date"].astype("datetime64")
_pd_ecommerce["customer_birth_date"].astype("datetime64[ns]")
_ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)

View File

@ -77,7 +77,16 @@ class SymmetricAPIChecker:
pd_exc = e
self.check_exception(ed_exc, pd_exc)
self.check_values(ed_obj, pd_obj)
try:
self.check_values(ed_obj, pd_obj)
except AssertionError as e:
# This is an attribute we allow to differ when comparing zero-length objects
if (
'Attribute "inferred_type" are different' in repr(e)
and len(ed_obj) == 0
and len(pd_obj) == 0
):
self.check_values(ed_obj, pd_obj, check_index_type=False)
if isinstance(ed_obj, (ed.DataFrame, ed.Series)):
return SymmetricAPIChecker(ed_obj, pd_obj)
@ -85,16 +94,16 @@ class SymmetricAPIChecker:
return f
def check_values(self, ed_obj, pd_obj):
def check_values(self, ed_obj, pd_obj, **kwargs):
"""Checks that any two values coming from eland and pandas are equal"""
if isinstance(ed_obj, ed.DataFrame):
assert_pandas_eland_frame_equal(pd_obj, ed_obj)
assert_pandas_eland_frame_equal(pd_obj, ed_obj, **kwargs)
elif isinstance(ed_obj, ed.Series):
assert_pandas_eland_series_equal(pd_obj, ed_obj)
assert_pandas_eland_series_equal(pd_obj, ed_obj, **kwargs)
elif isinstance(ed_obj, pd.DataFrame):
assert_frame_equal(ed_obj, pd_obj)
assert_frame_equal(ed_obj, pd_obj, **kwargs)
elif isinstance(ed_obj, pd.Series):
assert_series_equal(ed_obj, pd_obj)
assert_series_equal(ed_obj, pd_obj, **kwargs)
elif isinstance(ed_obj, pd.Index):
assert ed_obj.equals(pd_obj)
else:

View File

@ -87,6 +87,8 @@ class TestDataFrameDateTime(TestData):
},
index=["0", "1", "2"],
)
# https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
df["D"] = df["D"].astype("datetime64[ns]")
expected_mappings = {
"mappings": {

View File

@ -33,9 +33,17 @@ class TestDataFrameDescribe(TestData):
["Cancelled", "FlightDelay"], axis="columns"
)
# Pandas >= 2 calculates aggregations such as min and max for timestamps too
# This could be implemented in eland, but as of yet this is not the case
# We therefore remove it before the comparison
if "timestamp" in pd_describe.columns:
pd_describe = pd_describe.drop(["timestamp"], axis="columns")
# Pandas >= 2 orders the aggregations differently than Pandas < 2
# A sort_index is applied so tests will succeed in both environments
assert_frame_equal(
pd_describe.drop(["25%", "50%", "75%"], axis="index"),
ed_describe.drop(["25%", "50%", "75%"], axis="index"),
pd_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
ed_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
check_exact=False,
rtol=True,
)

View File

@ -99,7 +99,7 @@ class TestDataFrameHeadTail(TestData):
ed_head_0 = ed_flights.head(0)
pd_head_0 = pd_flights.head(0)
assert_pandas_eland_frame_equal(pd_head_0, ed_head_0)
assert_pandas_eland_frame_equal(pd_head_0, ed_head_0, check_index_type=False)
def test_doc_test_tail(self):
df = self.ed_flights()

View File

@ -22,6 +22,7 @@ import pandas as pd
import pytest
from pandas.testing import assert_frame_equal, assert_series_equal
from eland.common import PANDAS_VERSION
from tests.common import TestData, assert_almost_equal
@ -74,6 +75,8 @@ class TestDataFrameMetrics(TestData):
logger.setLevel(logging.DEBUG)
for func in self.extended_funcs:
if PANDAS_VERSION[0] >= 2 and func == "mad":
continue
pd_metric = getattr(pd_flights, func)(
**({"numeric_only": True} if func != "mad" else {})
)
@ -92,6 +95,8 @@ class TestDataFrameMetrics(TestData):
ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]
for func in self.extended_funcs:
if PANDAS_VERSION[0] >= 2 and func == "mad":
continue
pd_metric = getattr(pd_flights_1, func)()
ed_metric = getattr(ed_flights_1, func)(numeric_only=False)
@ -102,6 +107,8 @@ class TestDataFrameMetrics(TestData):
ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
for func in self.extended_funcs:
if PANDAS_VERSION[0] >= 2 and func == "mad":
continue
pd_metric = getattr(pd_flights_0, func)()
ed_metric = getattr(ed_flights_0, func)(numeric_only=False)
@ -491,8 +498,13 @@ class TestDataFrameMetrics(TestData):
["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
)
pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
if PANDAS_VERSION[0] == 1:
pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
else: # numeric_only is no longer available for pandas >= 2
pd_quantile = pd_flights.agg(["quantile", "min"])
ed_quantile = ed_flights.agg(["quantile", "min"])
assert_frame_equal(
pd_quantile, ed_quantile, check_exact=False, rtol=4, check_dtype=False

View File

@ -69,6 +69,12 @@ class TestDataFrameUtils(TestData):
)
ed_df_head = ed_df.head()
# https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
df["D"] = df["D"].astype("datetime64[ns]")
df["H"] = (
df["H"].dt.tz_localize(None).astype("datetime64[ns]").dt.tz_localize("UTC")
)
assert_pandas_eland_frame_equal(df, ed_df_head)
ES_TEST_CLIENT.indices.delete(index=index_name)

View File

@ -1647,6 +1647,14 @@
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/code/eland/.nox/test-3-12-pandas_version-2-2-3/lib/python3.12/site-packages/eland/series.py:464: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" return self._query_compiler.dtypes[0]\n"
]
},
{
"data": {
"text/html": [
@ -1792,6 +1800,9 @@
}
],
"source": [
"# NBVAL_IGNORE_OUTPUT\n",
"# The ignore statement above is because of output difference between Pandas 1 and 2\n",
"# and can be removed once Pandas 1 support is dropped\n",
"ed_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled == True')"
]
},
@ -2377,8 +2388,8 @@
" <th>AvgTicketPrice</th>\n",
" <th>DistanceKilometers</th>\n",
" <th>...</th>\n",
" <th>FlightTimeMin</th>\n",
" <th>dayOfWeek</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
@ -2388,23 +2399,15 @@
" <td>13059.000000</td>\n",
" <td>...</td>\n",
" <td>13059.000000</td>\n",
" <td>13059.000000</td>\n",
" <td>13059</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>628.253689</td>\n",
" <td>7092.142455</td>\n",
" <td>...</td>\n",
" <td>511.127842</td>\n",
" <td>2.835975</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>266.396861</td>\n",
" <td>4578.438497</td>\n",
" <td>...</td>\n",
" <td>334.753952</td>\n",
" <td>1.939439</td>\n",
" <td>2018-01-21 19:20:45.564438016</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
@ -2412,57 +2415,65 @@
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>2018-01-01 00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>409.893816</td>\n",
" <td>2459.705673</td>\n",
" <td>...</td>\n",
" <td>252.333192</td>\n",
" <td>1.000000</td>\n",
" <td>2018-01-11 05:16:25.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>640.556668</td>\n",
" <td>7610.330866</td>\n",
" <td>...</td>\n",
" <td>503.045170</td>\n",
" <td>3.000000</td>\n",
" <td>2018-01-22 00:32:11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>842.185470</td>\n",
" <td>9736.637600</td>\n",
" <td>...</td>\n",
" <td>720.416036</td>\n",
" <td>4.000000</td>\n",
" <td>2018-02-01 04:51:18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1199.729053</td>\n",
" <td>19881.482315</td>\n",
" <td>...</td>\n",
" <td>1902.902032</td>\n",
" <td>6.000000</td>\n",
" <td>2018-02-11 23:50:12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>266.396861</td>\n",
" <td>4578.438497</td>\n",
" <td>...</td>\n",
" <td>1.939439</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows × 7 columns</p>\n",
"<p>8 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n",
"count 13059.000000 13059.000000 ... 13059.000000 13059.000000\n",
"mean 628.253689 7092.142455 ... 511.127842 2.835975\n",
"std 266.396861 4578.438497 ... 334.753952 1.939439\n",
"min 100.020528 0.000000 ... 0.000000 0.000000\n",
"25% 409.893816 2459.705673 ... 252.333192 1.000000\n",
"50% 640.556668 7610.330866 ... 503.045170 3.000000\n",
"75% 842.185470 9736.637600 ... 720.416036 4.000000\n",
"max 1199.729053 19881.482315 ... 1902.902032 6.000000\n",
" AvgTicketPrice DistanceKilometers ... dayOfWeek timestamp\n",
"count 13059.000000 13059.000000 ... 13059.000000 13059\n",
"mean 628.253689 7092.142455 ... 2.835975 2018-01-21 19:20:45.564438016\n",
"min 100.020528 0.000000 ... 0.000000 2018-01-01 00:00:00\n",
"25% 409.893816 2459.705673 ... 1.000000 2018-01-11 05:16:25.500000\n",
"50% 640.556668 7610.330866 ... 3.000000 2018-01-22 00:32:11\n",
"75% 842.185470 9736.637600 ... 4.000000 2018-02-01 04:51:18\n",
"max 1199.729053 19881.482315 ... 6.000000 2018-02-11 23:50:12\n",
"std 266.396861 4578.438497 ... 1.939439 NaN\n",
"\n",
"[8 rows x 7 columns]"
"[8 rows x 8 columns]"
]
},
"execution_count": 39,
@ -2471,6 +2482,8 @@
}
],
"source": [
"# NBVAL_IGNORE_OUTPUT\n",
"# Once support for pandas <2 is dropped, this and the line above can be removed\n",
"pd_flights.describe()"
]
},

View File

@ -58,7 +58,9 @@ class TestSeriesFilter(TestData):
ed_ser = ed_flights_small.filter(items=items, axis=0)
pd_ser = pd_flights_small.filter(items=items, axis=0)
assert_pandas_eland_series_equal(pd_ser, ed_ser)
# For an empty Series, eland reports the datatype it knows from the Elasticsearch index,
# whereas pandas reports the datatype as empty
assert_pandas_eland_series_equal(pd_ser, ed_ser, check_index_type=False)
def test_flights_filter_index_like_and_regex(self):
ed_flights_small = self.ed_flights_small()["FlightDelayType"]

View File

@ -24,6 +24,7 @@ import pandas as pd
import pytest
from pandas.testing import assert_series_equal
from eland.common import PANDAS_VERSION
from tests.common import TestData, assert_almost_equal
@ -42,6 +43,8 @@ class TestSeriesMetrics(TestData):
ed_flights = self.ed_flights()["AvgTicketPrice"]
for func in self.all_funcs:
if PANDAS_VERSION[0] >= 2 and func == "mad":
continue
pd_metric = getattr(pd_flights, func)()
ed_metric = getattr(ed_flights, func)()
@ -87,6 +90,8 @@ class TestSeriesMetrics(TestData):
ed_ecommerce = self.ed_ecommerce()[column]
for func in self.all_funcs:
if PANDAS_VERSION[0] >= 2 and func == "mad":
continue
pd_metric = getattr(pd_ecommerce, func)()
ed_metric = getattr(ed_ecommerce, func)(
**({"numeric_only": True} if (func != "nunique") else {})