[Backport 8.x] Fixes for Pandas 2 support (#758)

* Support Pandas 2 (#742)

* Fix test setup to match pandas 2.0 demands

* Use the now deprecated _append method

(Better solution might exist)

* Deal with numeric_only being removed in metrics test

* Skip mad metric for other pandas versions

* Account for differences between pandas versions in describe methods

* Run black

* Check Pandas version first

* Mirror behaviour of installed Pandas version when running value_counts

* Allow passing arguments to the individual asserters

* Fix for method _construct_axes_from_arguments no longer existing

* Skip mad metric if it does not exist

* Account for pandas 2.0 timestamp default behaviour

* Deal with empty vs other inferred data types

* Account for default datetime precision change

* Run Black

* Solution for differences in inferred_type only

* Fix csv and json issues

* Skip two doctests

* Passing a set as indexer is no longer allowed

* Don't validate output where it differs between Pandas versions in the environment

* Update test matrix and packaging metadata

* Update version of Python in the docs

* Update Python version in demo notebook

* Match noxfile

* Symmetry

* Fix trailing comma in JSON

* Revert some changes in setup.py to fix building the documentation

* Revert "Revert some changes in setup.py to fix building the documentation"

This reverts commit ea9879753129d8d8390b3cbbce57155a8b4fb346.

* Use PANDAS_VERSION from eland.common

* Still skip the doctest, but make the output pandas 2 instead of 1

* Still skip doctest, but switch to pandas 2 output

* Prepare for pandas 3

* Reference the right column

* Ignore output in tests but switch to pandas 2 output

* Add line comment about NBVAL_IGNORE_OUTPUT

* Restore missing line and add stderr cell

* Use non-private method instead

* Fix indentation and parameter issues

* If index is not specified, and pandas 1 is present, set it to True

From pandas 2 and upwards, index is set to None by default

* Run black

* Newer version of black might have different opinions?

* Add line comment

* Remove unused import

* Add reason for ignore statement

* Add reason for skip

---------

Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>
(cherry picked from commit 75c57b077532c459a9490613cbf7b37215c27fae)

* Return input_field_names as list as required by Pandas 2

---------

Co-authored-by: Bart Broere <mail@bartbroere.eu>
Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>
This commit is contained in:
github-actions[bot] 2025-02-13 14:16:49 +04:00 committed by GitHub
parent d50436b01c
commit af20ef9063
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 161 additions and 70 deletions

View File

@ -29,11 +29,16 @@ steps:
machineType: "n2-standard-4" machineType: "n2-standard-4"
env: env:
PYTHON_VERSION: "{{ matrix.python }}" PYTHON_VERSION: "{{ matrix.python }}"
PANDAS_VERSION: '1.5.0' PANDAS_VERSION: "{{ matrix.pandas }}"
TEST_SUITE: "xpack" TEST_SUITE: "xpack"
ELASTICSEARCH_VERSION: "{{ matrix.stack }}" ELASTICSEARCH_VERSION: "{{ matrix.stack }}"
matrix: matrix:
setup: setup:
# Python and pandas versions need to be added to the nox configuration too
# (in the decorators of the test method in noxfile.py)
pandas:
- '1.5.0'
- '2.2.3'
python: python:
- '3.12' - '3.12'
- '3.11' - '3.11'

View File

@ -24,7 +24,7 @@
"\n", "\n",
"For this example, you will need:\n", "For this example, you will need:\n",
"\n", "\n",
"- Python 3.8 or later\n", "- Python 3.9 or later\n",
"- An Elastic deployment\n", "- An Elastic deployment\n",
" - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration))\n", " - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration))\n",
"\n", "\n",

View File

@ -34,7 +34,7 @@ from pandas.io.formats.printing import pprint_thing # type: ignore
from pandas.util._validators import validate_bool_kwarg # type: ignore from pandas.util._validators import validate_bool_kwarg # type: ignore
import eland.plotting as gfx import eland.plotting as gfx
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
from eland.filter import BooleanFilter from eland.filter import BooleanFilter
from eland.groupby import DataFrameGroupBy from eland.groupby import DataFrameGroupBy
from eland.ndframe import NDFrame from eland.ndframe import NDFrame
@ -411,9 +411,7 @@ class DataFrame(NDFrame):
axis = pd.DataFrame._get_axis_name(axis) axis = pd.DataFrame._get_axis_name(axis)
axes = {axis: labels} axes = {axis: labels}
elif index is not None or columns is not None: elif index is not None or columns is not None:
axes, _ = pd.DataFrame()._construct_axes_from_arguments( axes = {"columns": columns, "index": index}
(index, columns), {}
)
else: else:
raise ValueError( raise ValueError(
"Need to specify at least one of 'labels', 'index' or 'columns'" "Need to specify at least one of 'labels', 'index' or 'columns'"
@ -1361,7 +1359,7 @@ class DataFrame(NDFrame):
default_handler=None, default_handler=None,
lines=False, lines=False,
compression="infer", compression="infer",
index=True, index=None,
indent=None, indent=None,
storage_options=None, storage_options=None,
): ):
@ -1376,6 +1374,8 @@ class DataFrame(NDFrame):
-------- --------
:pandas_api_docs:`pandas.DataFrame.to_json` :pandas_api_docs:`pandas.DataFrame.to_json`
""" """
if index is None and PANDAS_VERSION[0] == 1:
index = True # switch to the pandas 1 default
kwargs = { kwargs = {
"path_or_buf": path_or_buf, "path_or_buf": path_or_buf,
"orient": orient, "orient": orient,

View File

@ -16,6 +16,7 @@
# under the License. # under the License.
import csv import csv
import warnings
from collections import deque from collections import deque
from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union
@ -110,11 +111,11 @@ def pandas_to_eland(
2 3.141 1 ... 3 Long text - to be indexed as es type text 2 3.141 1 ... 3 Long text - to be indexed as es type text
<BLANKLINE> <BLANKLINE>
[3 rows x 8 columns] [3 rows x 8 columns]
>>> pd_df.dtypes >>> pd_df.dtypes # doctest skip required for pandas < 2 # doctest: +SKIP
A float64 A float64
B int64 B int64
C object C object
D datetime64[ns] D datetime64[s]
E float64 E float64
F bool F bool
G int64 G int64
@ -307,9 +308,9 @@ def csv_to_eland( # type: ignore
names=None, names=None,
index_col=None, index_col=None,
usecols=None, usecols=None,
squeeze=False, squeeze=None,
prefix=None, prefix=None,
mangle_dupe_cols=True, mangle_dupe_cols=None,
# General Parsing Configuration # General Parsing Configuration
dtype=None, dtype=None,
engine=None, engine=None,
@ -357,6 +358,7 @@ def csv_to_eland( # type: ignore
low_memory: bool = _DEFAULT_LOW_MEMORY, low_memory: bool = _DEFAULT_LOW_MEMORY,
memory_map=False, memory_map=False,
float_precision=None, float_precision=None,
**extra_kwargs,
) -> "DataFrame": ) -> "DataFrame":
""" """
Read a comma-separated values (csv) file into eland.DataFrame (i.e. an Elasticsearch index). Read a comma-separated values (csv) file into eland.DataFrame (i.e. an Elasticsearch index).
@ -485,7 +487,6 @@ def csv_to_eland( # type: ignore
"usecols": usecols, "usecols": usecols,
"verbose": verbose, "verbose": verbose,
"encoding": encoding, "encoding": encoding,
"squeeze": squeeze,
"memory_map": memory_map, "memory_map": memory_map,
"float_precision": float_precision, "float_precision": float_precision,
"na_filter": na_filter, "na_filter": na_filter,
@ -494,9 +495,9 @@ def csv_to_eland( # type: ignore
"error_bad_lines": error_bad_lines, "error_bad_lines": error_bad_lines,
"on_bad_lines": on_bad_lines, "on_bad_lines": on_bad_lines,
"low_memory": low_memory, "low_memory": low_memory,
"mangle_dupe_cols": mangle_dupe_cols,
"infer_datetime_format": infer_datetime_format, "infer_datetime_format": infer_datetime_format,
"skip_blank_lines": skip_blank_lines, "skip_blank_lines": skip_blank_lines,
**extra_kwargs,
} }
if chunksize is None: if chunksize is None:
@ -525,6 +526,18 @@ def csv_to_eland( # type: ignore
kwargs.pop("on_bad_lines") kwargs.pop("on_bad_lines")
if "squeeze" in kwargs:
kwargs.pop("squeeze")
warnings.warn(
"This argument no longer works, use .squeeze('columns') on your DataFrame instead"
)
if "mangle_dupe_cols" in kwargs:
kwargs.pop("mangle_dupe_cols")
warnings.warn(
"The mangle_dupe_cols argument no longer works. Furthermore, "
"duplicate columns will automatically get a number suffix."
)
# read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch) # read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch)
reader = pd.read_csv(filepath_or_buffer, **kwargs) reader = pd.read_csv(filepath_or_buffer, **kwargs)

View File

@ -712,8 +712,11 @@ class FieldMappings:
capabilities, orient="index", columns=FieldMappings.column_labels capabilities, orient="index", columns=FieldMappings.column_labels
) )
self._mappings_capabilities = self._mappings_capabilities.append( self._mappings_capabilities = pd.concat(
capability_matrix_row [
self._mappings_capabilities,
capability_matrix_row,
]
) )
def numeric_source_fields(self) -> List[str]: def numeric_source_fields(self) -> List[str]:

View File

@ -187,7 +187,7 @@ class ESGradientBoostingModel(ABC):
if field_name in feature_names and field_name not in input_field_names: if field_name in feature_names and field_name not in input_field_names:
input_field_names.add(field_name) input_field_names.add(field_name)
return feature_names, input_field_names return feature_names, list(input_field_names)
@property @property
def preprocessors(self) -> List[Any]: def preprocessors(self) -> List[Any]:

View File

@ -40,11 +40,12 @@ from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union
import numpy as np import numpy as np
import pandas as pd # type: ignore import pandas as pd # type: ignore
from pandas.core.indexes.frozen import FrozenList
from pandas.io.common import _expand_user, stringify_path # type: ignore from pandas.io.common import _expand_user, stringify_path # type: ignore
import eland.plotting import eland.plotting
from eland.arithmetics import ArithmeticNumber, ArithmeticSeries, ArithmeticString from eland.arithmetics import ArithmeticNumber, ArithmeticSeries, ArithmeticString
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
from eland.filter import ( from eland.filter import (
BooleanFilter, BooleanFilter,
Equal, Equal,
@ -292,18 +293,26 @@ class Series(NDFrame):
Examples Examples
-------- --------
>>> df = ed.DataFrame('http://localhost:9200', 'flights') >>> df = ed.DataFrame('http://localhost:9200', 'flights')
>>> df['Carrier'].value_counts() >>> df['Carrier'].value_counts() # doctest: +SKIP
Carrier
Logstash Airways 3331 Logstash Airways 3331
JetBeats 3274 JetBeats 3274
Kibana Airlines 3234 Kibana Airlines 3234
ES-Air 3220 ES-Air 3220
Name: Carrier, dtype: int64 Name: count, dtype: int64
""" """
if not isinstance(es_size, int): if not isinstance(es_size, int):
raise TypeError("es_size must be a positive integer.") raise TypeError("es_size must be a positive integer.")
elif es_size <= 0: elif es_size <= 0:
raise ValueError("es_size must be a positive integer.") raise ValueError("es_size must be a positive integer.")
return self._query_compiler.value_counts(es_size) value_counts = self._query_compiler.value_counts(es_size)
# https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#value-counts-sets-the-resulting-name-to-count
if PANDAS_VERSION[0] == 2:
value_counts.name = "count"
value_counts.index.names = FrozenList([self.es_field_name])
value_counts.index.name = self.es_field_name
return value_counts
# dtype not implemented for Series as causes query to fail # dtype not implemented for Series as causes query to fail
# in pandas.core.computation.ops.Term.type # in pandas.core.computation.ops.Term.type

View File

@ -96,7 +96,7 @@ def lint(session):
@nox.session(python=["3.9", "3.10", "3.11", "3.12"]) @nox.session(python=["3.9", "3.10", "3.11", "3.12"])
@nox.parametrize("pandas_version", ["1.5.0"]) @nox.parametrize("pandas_version", ["1.5.0", "2.2.3"])
def test(session, pandas_version: str): def test(session, pandas_version: str):
session.install("-r", "requirements-dev.txt") session.install("-r", "requirements-dev.txt")
session.install(".") session.install(".")

View File

@ -87,7 +87,7 @@ setup(
packages=find_packages(include=["eland", "eland.*"]), packages=find_packages(include=["eland", "eland.*"]),
install_requires=[ install_requires=[
"elasticsearch>=8.3,<9", "elasticsearch>=8.3,<9",
"pandas>=1.5,<2", "pandas>=1.5,<3",
"matplotlib>=3.6", "matplotlib>=3.6",
"numpy>=1.2.0,<2", "numpy>=1.2.0,<2",
"packaging", "packaging",

View File

@ -24,6 +24,7 @@ import pandas as pd
from pandas.testing import assert_frame_equal, assert_series_equal from pandas.testing import assert_frame_equal, assert_series_equal
import eland as ed import eland as ed
from eland.common import PANDAS_VERSION
ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
@ -45,6 +46,9 @@ with gzip.open(FLIGHTS_FILE_NAME) as f:
_pd_flights = pd.DataFrame.from_records(flight_records).reindex( _pd_flights = pd.DataFrame.from_records(flight_records).reindex(
_ed_flights.columns, axis=1 _ed_flights.columns, axis=1
) )
if PANDAS_VERSION[0] >= 2:
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"], format="mixed")
else:
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"]) _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
# Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list # Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list
_pd_flights["Cities"] = _pd_flights.apply( _pd_flights["Cities"] = _pd_flights.apply(
@ -62,7 +66,7 @@ _pd_ecommerce["products.created_on"] = _pd_ecommerce["products.created_on"].appl
) )
_pd_ecommerce.insert(2, "customer_birth_date", None) _pd_ecommerce.insert(2, "customer_birth_date", None)
_pd_ecommerce.index = _pd_ecommerce.index.map(str) # make index 'object' not int _pd_ecommerce.index = _pd_ecommerce.index.map(str) # make index 'object' not int
_pd_ecommerce["customer_birth_date"].astype("datetime64") _pd_ecommerce["customer_birth_date"].astype("datetime64[ns]")
_ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME) _ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)

View File

@ -77,7 +77,16 @@ class SymmetricAPIChecker:
pd_exc = e pd_exc = e
self.check_exception(ed_exc, pd_exc) self.check_exception(ed_exc, pd_exc)
try:
self.check_values(ed_obj, pd_obj) self.check_values(ed_obj, pd_obj)
except AssertionError as e:
# This is an attribute we allow to differ when comparing zero-length objects
if (
'Attribute "inferred_type" are different' in repr(e)
and len(ed_obj) == 0
and len(pd_obj) == 0
):
self.check_values(ed_obj, pd_obj, check_index_type=False)
if isinstance(ed_obj, (ed.DataFrame, ed.Series)): if isinstance(ed_obj, (ed.DataFrame, ed.Series)):
return SymmetricAPIChecker(ed_obj, pd_obj) return SymmetricAPIChecker(ed_obj, pd_obj)
@ -85,16 +94,16 @@ class SymmetricAPIChecker:
return f return f
def check_values(self, ed_obj, pd_obj): def check_values(self, ed_obj, pd_obj, **kwargs):
"""Checks that any two values coming from eland and pandas are equal""" """Checks that any two values coming from eland and pandas are equal"""
if isinstance(ed_obj, ed.DataFrame): if isinstance(ed_obj, ed.DataFrame):
assert_pandas_eland_frame_equal(pd_obj, ed_obj) assert_pandas_eland_frame_equal(pd_obj, ed_obj, **kwargs)
elif isinstance(ed_obj, ed.Series): elif isinstance(ed_obj, ed.Series):
assert_pandas_eland_series_equal(pd_obj, ed_obj) assert_pandas_eland_series_equal(pd_obj, ed_obj, **kwargs)
elif isinstance(ed_obj, pd.DataFrame): elif isinstance(ed_obj, pd.DataFrame):
assert_frame_equal(ed_obj, pd_obj) assert_frame_equal(ed_obj, pd_obj, **kwargs)
elif isinstance(ed_obj, pd.Series): elif isinstance(ed_obj, pd.Series):
assert_series_equal(ed_obj, pd_obj) assert_series_equal(ed_obj, pd_obj, **kwargs)
elif isinstance(ed_obj, pd.Index): elif isinstance(ed_obj, pd.Index):
assert ed_obj.equals(pd_obj) assert ed_obj.equals(pd_obj)
else: else:

View File

@ -87,6 +87,8 @@ class TestDataFrameDateTime(TestData):
}, },
index=["0", "1", "2"], index=["0", "1", "2"],
) )
# https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
df["D"] = df["D"].astype("datetime64[ns]")
expected_mappings = { expected_mappings = {
"mappings": { "mappings": {

View File

@ -33,9 +33,17 @@ class TestDataFrameDescribe(TestData):
["Cancelled", "FlightDelay"], axis="columns" ["Cancelled", "FlightDelay"], axis="columns"
) )
# Pandas >= 2 calculates aggregations such as min and max for timestamps too
# This could be implemented in eland, but as of yet this is not the case
# We therefore remove it before the comparison
if "timestamp" in pd_describe.columns:
pd_describe = pd_describe.drop(["timestamp"], axis="columns")
# Pandas >= 2 orders the aggregations differently than Pandas < 2
# A sort_index is applied so tests will succeed in both environments
assert_frame_equal( assert_frame_equal(
pd_describe.drop(["25%", "50%", "75%"], axis="index"), pd_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
ed_describe.drop(["25%", "50%", "75%"], axis="index"), ed_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
check_exact=False, check_exact=False,
rtol=True, rtol=True,
) )

View File

@ -99,7 +99,7 @@ class TestDataFrameHeadTail(TestData):
ed_head_0 = ed_flights.head(0) ed_head_0 = ed_flights.head(0)
pd_head_0 = pd_flights.head(0) pd_head_0 = pd_flights.head(0)
assert_pandas_eland_frame_equal(pd_head_0, ed_head_0) assert_pandas_eland_frame_equal(pd_head_0, ed_head_0, check_index_type=False)
def test_doc_test_tail(self): def test_doc_test_tail(self):
df = self.ed_flights() df = self.ed_flights()

View File

@ -22,6 +22,7 @@ import pandas as pd
import pytest import pytest
from pandas.testing import assert_frame_equal, assert_series_equal from pandas.testing import assert_frame_equal, assert_series_equal
from eland.common import PANDAS_VERSION
from tests.common import TestData, assert_almost_equal from tests.common import TestData, assert_almost_equal
@ -74,6 +75,8 @@ class TestDataFrameMetrics(TestData):
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
for func in self.extended_funcs: for func in self.extended_funcs:
if PANDAS_VERSION[0] >= 2 and func == "mad":
continue
pd_metric = getattr(pd_flights, func)( pd_metric = getattr(pd_flights, func)(
**({"numeric_only": True} if func != "mad" else {}) **({"numeric_only": True} if func != "mad" else {})
) )
@ -92,6 +95,8 @@ class TestDataFrameMetrics(TestData):
ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]] ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]
for func in self.extended_funcs: for func in self.extended_funcs:
if PANDAS_VERSION[0] >= 2 and func == "mad":
continue
pd_metric = getattr(pd_flights_1, func)() pd_metric = getattr(pd_flights_1, func)()
ed_metric = getattr(ed_flights_1, func)(numeric_only=False) ed_metric = getattr(ed_flights_1, func)(numeric_only=False)
@ -102,6 +107,8 @@ class TestDataFrameMetrics(TestData):
ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]] ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
for func in self.extended_funcs: for func in self.extended_funcs:
if PANDAS_VERSION[0] >= 2 and func == "mad":
continue
pd_metric = getattr(pd_flights_0, func)() pd_metric = getattr(pd_flights_0, func)()
ed_metric = getattr(ed_flights_0, func)(numeric_only=False) ed_metric = getattr(ed_flights_0, func)(numeric_only=False)
@ -491,9 +498,14 @@ class TestDataFrameMetrics(TestData):
["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"] ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
) )
if PANDAS_VERSION[0] == 1:
pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only) pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only) ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
else: # numeric_only is no longer available for pandas >= 2
pd_quantile = pd_flights.agg(["quantile", "min"])
ed_quantile = ed_flights.agg(["quantile", "min"])
assert_frame_equal( assert_frame_equal(
pd_quantile, ed_quantile, check_exact=False, rtol=4, check_dtype=False pd_quantile, ed_quantile, check_exact=False, rtol=4, check_dtype=False
) )

View File

@ -69,6 +69,12 @@ class TestDataFrameUtils(TestData):
) )
ed_df_head = ed_df.head() ed_df_head = ed_df.head()
# https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
df["D"] = df["D"].astype("datetime64[ns]")
df["H"] = (
df["H"].dt.tz_localize(None).astype("datetime64[ns]").dt.tz_localize("UTC")
)
assert_pandas_eland_frame_equal(df, ed_df_head) assert_pandas_eland_frame_equal(df, ed_df_head)
ES_TEST_CLIENT.indices.delete(index=index_name) ES_TEST_CLIENT.indices.delete(index=index_name)

View File

@ -1647,6 +1647,14 @@
"execution_count": 32, "execution_count": 32,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/code/eland/.nox/test-3-12-pandas_version-2-2-3/lib/python3.12/site-packages/eland/series.py:464: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" return self._query_compiler.dtypes[0]\n"
]
},
{ {
"data": { "data": {
"text/html": [ "text/html": [
@ -1792,6 +1800,9 @@
} }
], ],
"source": [ "source": [
"# NBVAL_IGNORE_OUTPUT\n",
"# The ignore statement above is because of output difference between Pandas 1 and 2\n",
"# and can be removed once Pandas 1 support is dropped\n",
"ed_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled == True')" "ed_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled == True')"
] ]
}, },
@ -2377,8 +2388,8 @@
" <th>AvgTicketPrice</th>\n", " <th>AvgTicketPrice</th>\n",
" <th>DistanceKilometers</th>\n", " <th>DistanceKilometers</th>\n",
" <th>...</th>\n", " <th>...</th>\n",
" <th>FlightTimeMin</th>\n",
" <th>dayOfWeek</th>\n", " <th>dayOfWeek</th>\n",
" <th>timestamp</th>\n",
" </tr>\n", " </tr>\n",
" </thead>\n", " </thead>\n",
" <tbody>\n", " <tbody>\n",
@ -2388,23 +2399,15 @@
" <td>13059.000000</td>\n", " <td>13059.000000</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>13059.000000</td>\n", " <td>13059.000000</td>\n",
" <td>13059.000000</td>\n", " <td>13059</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>mean</th>\n", " <th>mean</th>\n",
" <td>628.253689</td>\n", " <td>628.253689</td>\n",
" <td>7092.142455</td>\n", " <td>7092.142455</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>511.127842</td>\n",
" <td>2.835975</td>\n", " <td>2.835975</td>\n",
" </tr>\n", " <td>2018-01-21 19:20:45.564438016</td>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>266.396861</td>\n",
" <td>4578.438497</td>\n",
" <td>...</td>\n",
" <td>334.753952</td>\n",
" <td>1.939439</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>min</th>\n", " <th>min</th>\n",
@ -2412,57 +2415,65 @@
" <td>0.000000</td>\n", " <td>0.000000</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>0.000000</td>\n", " <td>0.000000</td>\n",
" <td>0.000000</td>\n", " <td>2018-01-01 00:00:00</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>25%</th>\n", " <th>25%</th>\n",
" <td>409.893816</td>\n", " <td>409.893816</td>\n",
" <td>2459.705673</td>\n", " <td>2459.705673</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>252.333192</td>\n",
" <td>1.000000</td>\n", " <td>1.000000</td>\n",
" <td>2018-01-11 05:16:25.500000</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>50%</th>\n", " <th>50%</th>\n",
" <td>640.556668</td>\n", " <td>640.556668</td>\n",
" <td>7610.330866</td>\n", " <td>7610.330866</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>503.045170</td>\n",
" <td>3.000000</td>\n", " <td>3.000000</td>\n",
" <td>2018-01-22 00:32:11</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>75%</th>\n", " <th>75%</th>\n",
" <td>842.185470</td>\n", " <td>842.185470</td>\n",
" <td>9736.637600</td>\n", " <td>9736.637600</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>720.416036</td>\n",
" <td>4.000000</td>\n", " <td>4.000000</td>\n",
" <td>2018-02-01 04:51:18</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>max</th>\n", " <th>max</th>\n",
" <td>1199.729053</td>\n", " <td>1199.729053</td>\n",
" <td>19881.482315</td>\n", " <td>19881.482315</td>\n",
" <td>...</td>\n", " <td>...</td>\n",
" <td>1902.902032</td>\n",
" <td>6.000000</td>\n", " <td>6.000000</td>\n",
" <td>2018-02-11 23:50:12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>266.396861</td>\n",
" <td>4578.438497</td>\n",
" <td>...</td>\n",
" <td>1.939439</td>\n",
" <td>NaN</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
"<p>8 rows × 7 columns</p>\n", "<p>8 rows × 8 columns</p>\n",
"</div>" "</div>"
], ],
"text/plain": [ "text/plain": [
" AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n", " AvgTicketPrice DistanceKilometers ... dayOfWeek timestamp\n",
"count 13059.000000 13059.000000 ... 13059.000000 13059.000000\n", "count 13059.000000 13059.000000 ... 13059.000000 13059\n",
"mean 628.253689 7092.142455 ... 511.127842 2.835975\n", "mean 628.253689 7092.142455 ... 2.835975 2018-01-21 19:20:45.564438016\n",
"std 266.396861 4578.438497 ... 334.753952 1.939439\n", "min 100.020528 0.000000 ... 0.000000 2018-01-01 00:00:00\n",
"min 100.020528 0.000000 ... 0.000000 0.000000\n", "25% 409.893816 2459.705673 ... 1.000000 2018-01-11 05:16:25.500000\n",
"25% 409.893816 2459.705673 ... 252.333192 1.000000\n", "50% 640.556668 7610.330866 ... 3.000000 2018-01-22 00:32:11\n",
"50% 640.556668 7610.330866 ... 503.045170 3.000000\n", "75% 842.185470 9736.637600 ... 4.000000 2018-02-01 04:51:18\n",
"75% 842.185470 9736.637600 ... 720.416036 4.000000\n", "max 1199.729053 19881.482315 ... 6.000000 2018-02-11 23:50:12\n",
"max 1199.729053 19881.482315 ... 1902.902032 6.000000\n", "std 266.396861 4578.438497 ... 1.939439 NaN\n",
"\n", "\n",
"[8 rows x 7 columns]" "[8 rows x 8 columns]"
] ]
}, },
"execution_count": 39, "execution_count": 39,
@ -2471,6 +2482,8 @@
} }
], ],
"source": [ "source": [
"# NBVAL_IGNORE_OUTPUT\n",
"# Once support for pandas <2 is dropped, this and the line above can be removed\n",
"pd_flights.describe()" "pd_flights.describe()"
] ]
}, },

View File

@ -58,7 +58,9 @@ class TestSeriesFilter(TestData):
ed_ser = ed_flights_small.filter(items=items, axis=0) ed_ser = ed_flights_small.filter(items=items, axis=0)
pd_ser = pd_flights_small.filter(items=items, axis=0) pd_ser = pd_flights_small.filter(items=items, axis=0)
assert_pandas_eland_series_equal(pd_ser, ed_ser) # For an empty Series, eland will say the datatype it knows from the Elastic index
# Pandas however will state empty as the datatype
assert_pandas_eland_series_equal(pd_ser, ed_ser, check_index_type=False)
def test_flights_filter_index_like_and_regex(self): def test_flights_filter_index_like_and_regex(self):
ed_flights_small = self.ed_flights_small()["FlightDelayType"] ed_flights_small = self.ed_flights_small()["FlightDelayType"]

View File

@ -24,6 +24,7 @@ import pandas as pd
import pytest import pytest
from pandas.testing import assert_series_equal from pandas.testing import assert_series_equal
from eland.common import PANDAS_VERSION
from tests.common import TestData, assert_almost_equal from tests.common import TestData, assert_almost_equal
@ -42,6 +43,8 @@ class TestSeriesMetrics(TestData):
ed_flights = self.ed_flights()["AvgTicketPrice"] ed_flights = self.ed_flights()["AvgTicketPrice"]
for func in self.all_funcs: for func in self.all_funcs:
if PANDAS_VERSION[0] >= 2 and func == "mad":
continue
pd_metric = getattr(pd_flights, func)() pd_metric = getattr(pd_flights, func)()
ed_metric = getattr(ed_flights, func)() ed_metric = getattr(ed_flights, func)()
@ -87,6 +90,8 @@ class TestSeriesMetrics(TestData):
ed_ecommerce = self.ed_ecommerce()[column] ed_ecommerce = self.ed_ecommerce()[column]
for func in self.all_funcs: for func in self.all_funcs:
if PANDAS_VERSION[0] >= 2 and func == "mad":
continue
pd_metric = getattr(pd_ecommerce, func)() pd_metric = getattr(pd_ecommerce, func)()
ed_metric = getattr(ed_ecommerce, func)( ed_metric = getattr(ed_ecommerce, func)(
**({"numeric_only": True} if (func != "nunique") else {}) **({"numeric_only": True} if (func != "nunique") else {})