[Backport 8.x] Fixes for Pandas 2 support (#758)

* Support Pandas 2 (#742) * Fix test setup to match pandas 2.0 demands * Use the now deprecated _append method (Better solution might exist) * Deal with numeric_only being removed in metrics test * Skip mad metric for other pandas versions * Account for differences between pandas versions in describe methods * Run black * Check Pandas version first * Mirror behaviour of installed Pandas version when running value_counts * Allow passing arguments to the individual asserters * Fix for method _construct_axes_from_arguments no longer existing * Skip mad metric if it does not exist * Account for pandas 2.0 timestamp default behaviour * Deal with empty vs other inferred data types * Account for default datetime precision change * Run Black * Solution for differences in inferred_type only * Fix csv and json issues * Skip two doctests * Passing a set as indexer is no longer allowed * Don't validate output where it differs between Pandas versions in the environment * Update test matrix and packaging metadata * Update version of Python in the docs * Update Python version in demo notebook * Match noxfile * Symmetry * Fix trailing comma in JSON * Revert some changes in setup.py to fix building the documentation * Revert "Revert some changes in setup.py to fix building the documentation" This reverts commit ea9879753129d8d8390b3cbbce57155a8b4fb346. * Use PANDAS_VERSION from eland.common * Still skip the doctest, but make the output pandas 2 instead of 1 * Still skip doctest, but switch to pandas 2 output * Prepare for pandas 3 * Reference the right column * Ignore output in tests but switch to pandas 2 output * Add line comment about NBVAL_IGNORE_OUTPUT * Restore missing line and add stderr cell * Use non-private method instead * Fix indentation and parameter issues * If index is not specified, and pandas 1 is present, set it to True From pandas 2 and upwards, index is set to None by default * Run black * Newer version of black might have different opinions? 
* Add line comment * Remove unused import * Add reason for ignore statement * Add reason for skip --------- Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co> (cherry picked from commit 75c57b077532c459a9490613cbf7b37215c27fae) * Return input_field_names as list as required by Pandas 2 --------- Co-authored-by: Bart Broere <mail@bartbroere.eu> Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>
2025-07-11 00:02:14 +08:00 · 2025-02-13 14:16:49 +04:00 · 2025-02-13 14:16:49 +04:00 · af20ef9063
commit af20ef9063
parent d50436b01c
19 changed files with 161 additions and 70 deletions
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@ -29,11 +29,16 @@ steps:
      machineType: "n2-standard-4"
    env:
      PYTHON_VERSION: "{{ matrix.python }}"
-      PANDAS_VERSION: '1.5.0'
+      PANDAS_VERSION: "{{ matrix.pandas }}"
      TEST_SUITE: "xpack"
      ELASTICSEARCH_VERSION: "{{ matrix.stack }}"
    matrix:
      setup:
        # Python and pandas versions need to be added to the nox configuration too
        # (in the decorators of the test method in noxfile.py)
        pandas:
          - '1.5.0'
          - '2.2.3'
        python:
          - '3.12'
          - '3.11'
--- a/docs/sphinx/examples/demo_notebook.ipynb
+++ b/docs/sphinx/examples/demo_notebook.ipynb
@ -24,7 +24,7 @@
        "\n",
        "For this example, you will need:\n",
        "\n",
-        "- Python 3.8 or later\n",
+        "- Python 3.9 or later\n",
        "- An Elastic deployment\n",
        "  - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration))\n",
        "\n",
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@ -34,7 +34,7 @@ from pandas.io.formats.printing import pprint_thing  # type: ignore
 from pandas.util._validators import validate_bool_kwarg  # type: ignore
 import eland.plotting as gfx
-from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
+from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
 from eland.filter import BooleanFilter
 from eland.groupby import DataFrameGroupBy
 from eland.ndframe import NDFrame
@ -411,9 +411,7 @@ class DataFrame(NDFrame):
            axis = pd.DataFrame._get_axis_name(axis)
            axes = {axis: labels}
        elif index is not None or columns is not None:
-            axes, _ = pd.DataFrame()._construct_axes_from_arguments(
+            axes = {"columns": columns, "index": index}
                (index, columns), {}
            )
        else:
            raise ValueError(
                "Need to specify at least one of 'labels', 'index' or 'columns'"
@ -1361,7 +1359,7 @@ class DataFrame(NDFrame):
        default_handler=None,
        lines=False,
        compression="infer",
-        index=True,
+        index=None,
        indent=None,
        storage_options=None,
    ):
@ -1376,6 +1374,8 @@ class DataFrame(NDFrame):
        --------
        :pandas_api_docs:`pandas.DataFrame.to_json`
        """
        if index is None and PANDAS_VERSION[0] == 1:
            index = True  # switch to the pandas 1 default
        kwargs = {
            "path_or_buf": path_or_buf,
            "orient": orient,
--- a/eland/etl.py
+++ b/eland/etl.py
@ -16,6 +16,7 @@
 #  under the License.
 import csv
 import warnings
 from collections import deque
 from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union
@ -110,11 +111,11 @@ def pandas_to_eland(
    2  3.141  1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
    [3 rows x 8 columns]
-    >>> pd_df.dtypes
+    >>> pd_df.dtypes  # doctest skip required for pandas < 2  # doctest: +SKIP
    A          float64
    B            int64
    C           object
-    D    datetime64[ns]
+    D    datetime64[s]
    E          float64
    F             bool
    G            int64
@ -307,9 +308,9 @@ def csv_to_eland(  # type: ignore
    names=None,
    index_col=None,
    usecols=None,
-    squeeze=False,
+    squeeze=None,
    prefix=None,
-    mangle_dupe_cols=True,
+    mangle_dupe_cols=None,
    # General Parsing Configuration
    dtype=None,
    engine=None,
@ -357,6 +358,7 @@ def csv_to_eland(  # type: ignore
    low_memory: bool = _DEFAULT_LOW_MEMORY,
    memory_map=False,
    float_precision=None,
    **extra_kwargs,
 ) -> "DataFrame":
    """
    Read a comma-separated values (csv) file into eland.DataFrame (i.e. an Elasticsearch index).
@ -485,7 +487,6 @@ def csv_to_eland(  # type: ignore
        "usecols": usecols,
        "verbose": verbose,
        "encoding": encoding,
        "squeeze": squeeze,
        "memory_map": memory_map,
        "float_precision": float_precision,
        "na_filter": na_filter,
@ -494,9 +495,9 @@ def csv_to_eland(  # type: ignore
        "error_bad_lines": error_bad_lines,
        "on_bad_lines": on_bad_lines,
        "low_memory": low_memory,
        "mangle_dupe_cols": mangle_dupe_cols,
        "infer_datetime_format": infer_datetime_format,
        "skip_blank_lines": skip_blank_lines,
        **extra_kwargs,
    }
    if chunksize is None:
@ -525,6 +526,18 @@ def csv_to_eland(  # type: ignore
        kwargs.pop("on_bad_lines")
    if "squeeze" in kwargs:
        kwargs.pop("squeeze")
        warnings.warn(
            "This argument no longer works, use .squeeze('columns') on your DataFrame instead"
        )
    if "mangle_dupe_cols" in kwargs:
        kwargs.pop("mangle_dupe_cols")
        warnings.warn(
            "The mangle_dupe_cols argument no longer works. Furthermore, "
            "duplicate columns will automatically get a number suffix."
        )
    # read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch)
    reader = pd.read_csv(filepath_or_buffer, **kwargs)
--- a/eland/field_mappings.py
+++ b/eland/field_mappings.py
@ -712,8 +712,11 @@ class FieldMappings:
            capabilities, orient="index", columns=FieldMappings.column_labels
        )
-        self._mappings_capabilities = self._mappings_capabilities.append(
+        self._mappings_capabilities = pd.concat(
-            capability_matrix_row
+            [
                self._mappings_capabilities,
                capability_matrix_row,
            ]
        )
    def numeric_source_fields(self) -> List[str]:
--- a/eland/ml/exporters/es_gb_models.py
+++ b/eland/ml/exporters/es_gb_models.py
@ -187,7 +187,7 @@ class ESGradientBoostingModel(ABC):
            if field_name in feature_names and field_name not in input_field_names:
                input_field_names.add(field_name)
-        return feature_names, input_field_names
+        return feature_names, list(input_field_names)
    @property
    def preprocessors(self) -> List[Any]:
--- a/eland/series.py
+++ b/eland/series.py
@ -40,11 +40,12 @@ from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union
 import numpy as np
 import pandas as pd  # type: ignore
 from pandas.core.indexes.frozen import FrozenList
 from pandas.io.common import _expand_user, stringify_path  # type: ignore
 import eland.plotting
 from eland.arithmetics import ArithmeticNumber, ArithmeticSeries, ArithmeticString
-from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
+from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
 from eland.filter import (
    BooleanFilter,
    Equal,
@ -292,18 +293,26 @@ class Series(NDFrame):
        Examples
        --------
        >>> df = ed.DataFrame('http://localhost:9200', 'flights')
-        >>> df['Carrier'].value_counts()
+        >>> df['Carrier'].value_counts()  # doctest: +SKIP
        Carrier
        Logstash Airways    3331
        JetBeats            3274
        Kibana Airlines     3234
        ES-Air              3220
-        Name: Carrier, dtype: int64
+        Name: count, dtype: int64
        """
        if not isinstance(es_size, int):
            raise TypeError("es_size must be a positive integer.")
        elif es_size <= 0:
            raise ValueError("es_size must be a positive integer.")
-        return self._query_compiler.value_counts(es_size)
+        value_counts = self._query_compiler.value_counts(es_size)
        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#value-counts-sets-the-resulting-name-to-count
        if PANDAS_VERSION[0] == 2:
            value_counts.name = "count"
            value_counts.index.names = FrozenList([self.es_field_name])
            value_counts.index.name = self.es_field_name
        return value_counts
    # dtype not implemented for Series as causes query to fail
    # in pandas.core.computation.ops.Term.type
--- a/noxfile.py
+++ b/noxfile.py
@ -96,7 +96,7 @@ def lint(session):
@nox.session(python=["3.9", "3.10", "3.11", "3.12"])
-@nox.parametrize("pandas_version", ["1.5.0"])
+@nox.parametrize("pandas_version", ["1.5.0", "2.2.3"])
 def test(session, pandas_version: str):
    session.install("-r", "requirements-dev.txt")
    session.install(".")
--- a/setup.py
+++ b/setup.py
@ -87,7 +87,7 @@ setup(
    packages=find_packages(include=["eland", "eland.*"]),
    install_requires=[
        "elasticsearch>=8.3,<9",
-        "pandas>=1.5,<2",
+        "pandas>=1.5,<3",
        "matplotlib>=3.6",
        "numpy>=1.2.0,<2",
        "packaging",
--- a/tests/common.py
+++ b/tests/common.py
@ -24,6 +24,7 @@ import pandas as pd
 from pandas.testing import assert_frame_equal, assert_series_equal
 import eland as ed
 from eland.common import PANDAS_VERSION
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
@ -45,6 +46,9 @@ with gzip.open(FLIGHTS_FILE_NAME) as f:
 _pd_flights = pd.DataFrame.from_records(flight_records).reindex(
    _ed_flights.columns, axis=1
 )
 if PANDAS_VERSION[0] >= 2:
    _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"], format="mixed")
 else:
    _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
 # Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list
 _pd_flights["Cities"] = _pd_flights.apply(
@ -62,7 +66,7 @@ _pd_ecommerce["products.created_on"] = _pd_ecommerce["products.created_on"].appl
 )
 _pd_ecommerce.insert(2, "customer_birth_date", None)
 _pd_ecommerce.index = _pd_ecommerce.index.map(str)  # make index 'object' not int
-_pd_ecommerce["customer_birth_date"].astype("datetime64")
+_pd_ecommerce["customer_birth_date"].astype("datetime64[ns]")
 _ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -77,7 +77,16 @@ class SymmetricAPIChecker:
                pd_exc = e
            self.check_exception(ed_exc, pd_exc)
            try:
                self.check_values(ed_obj, pd_obj)
            except AssertionError as e:
                # This is an attribute we allow to differ when comparing zero-length objects
                if (
                    'Attribute "inferred_type" are different' in repr(e)
                    and len(ed_obj) == 0
                    and len(pd_obj) == 0
                ):
                    self.check_values(ed_obj, pd_obj, check_index_type=False)
            if isinstance(ed_obj, (ed.DataFrame, ed.Series)):
                return SymmetricAPIChecker(ed_obj, pd_obj)
@ -85,16 +94,16 @@ class SymmetricAPIChecker:
        return f
-    def check_values(self, ed_obj, pd_obj):
+    def check_values(self, ed_obj, pd_obj, **kwargs):
        """Checks that any two values coming from eland and pandas are equal"""
        if isinstance(ed_obj, ed.DataFrame):
-            assert_pandas_eland_frame_equal(pd_obj, ed_obj)
+            assert_pandas_eland_frame_equal(pd_obj, ed_obj, **kwargs)
        elif isinstance(ed_obj, ed.Series):
-            assert_pandas_eland_series_equal(pd_obj, ed_obj)
+            assert_pandas_eland_series_equal(pd_obj, ed_obj, **kwargs)
        elif isinstance(ed_obj, pd.DataFrame):
-            assert_frame_equal(ed_obj, pd_obj)
+            assert_frame_equal(ed_obj, pd_obj, **kwargs)
        elif isinstance(ed_obj, pd.Series):
-            assert_series_equal(ed_obj, pd_obj)
+            assert_series_equal(ed_obj, pd_obj, **kwargs)
        elif isinstance(ed_obj, pd.Index):
            assert ed_obj.equals(pd_obj)
        else:
--- a/tests/dataframe/test_datetime_pytest.py
+++ b/tests/dataframe/test_datetime_pytest.py
@ -87,6 +87,8 @@ class TestDataFrameDateTime(TestData):
            },
            index=["0", "1", "2"],
        )
        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
        df["D"] = df["D"].astype("datetime64[ns]")
        expected_mappings = {
            "mappings": {
--- a/tests/dataframe/test_describe_pytest.py
+++ b/tests/dataframe/test_describe_pytest.py
@ -33,9 +33,17 @@ class TestDataFrameDescribe(TestData):
            ["Cancelled", "FlightDelay"], axis="columns"
        )
        # Pandas >= 2 calculates aggregations such as min and max for timestamps too
        # This could be implemented in eland, but as of yet this is not the case
        # We therefore remove it before the comparison
        if "timestamp" in pd_describe.columns:
            pd_describe = pd_describe.drop(["timestamp"], axis="columns")
        # Pandas >= 2 orders the aggregations differently than Pandas < 2
        # A sort_index is applied so tests will succeed in both environments
        assert_frame_equal(
-            pd_describe.drop(["25%", "50%", "75%"], axis="index"),
+            pd_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
-            ed_describe.drop(["25%", "50%", "75%"], axis="index"),
+            ed_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
            check_exact=False,
            rtol=True,
        )
--- a/tests/dataframe/test_head_tail_pytest.py
+++ b/tests/dataframe/test_head_tail_pytest.py
@ -99,7 +99,7 @@ class TestDataFrameHeadTail(TestData):
        ed_head_0 = ed_flights.head(0)
        pd_head_0 = pd_flights.head(0)
-        assert_pandas_eland_frame_equal(pd_head_0, ed_head_0)
+        assert_pandas_eland_frame_equal(pd_head_0, ed_head_0, check_index_type=False)
    def test_doc_test_tail(self):
        df = self.ed_flights()
--- a/tests/dataframe/test_metrics_pytest.py
+++ b/tests/dataframe/test_metrics_pytest.py
@ -22,6 +22,7 @@ import pandas as pd
 import pytest
 from pandas.testing import assert_frame_equal, assert_series_equal
 from eland.common import PANDAS_VERSION
 from tests.common import TestData, assert_almost_equal
@ -74,6 +75,8 @@ class TestDataFrameMetrics(TestData):
        logger.setLevel(logging.DEBUG)
        for func in self.extended_funcs:
            if PANDAS_VERSION[0] >= 2 and func == "mad":
                continue
            pd_metric = getattr(pd_flights, func)(
                **({"numeric_only": True} if func != "mad" else {})
            )
@ -92,6 +95,8 @@ class TestDataFrameMetrics(TestData):
        ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]
        for func in self.extended_funcs:
            if PANDAS_VERSION[0] >= 2 and func == "mad":
                continue
            pd_metric = getattr(pd_flights_1, func)()
            ed_metric = getattr(ed_flights_1, func)(numeric_only=False)
@ -102,6 +107,8 @@ class TestDataFrameMetrics(TestData):
        ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
        for func in self.extended_funcs:
            if PANDAS_VERSION[0] >= 2 and func == "mad":
                continue
            pd_metric = getattr(pd_flights_0, func)()
            ed_metric = getattr(ed_flights_0, func)(numeric_only=False)
@ -491,9 +498,14 @@ class TestDataFrameMetrics(TestData):
            ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
        )
        if PANDAS_VERSION[0] == 1:
            pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
            ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
        else:  # numeric_only is no longer available for pandas >= 2
            pd_quantile = pd_flights.agg(["quantile", "min"])
            ed_quantile = ed_flights.agg(["quantile", "min"])
        assert_frame_equal(
            pd_quantile, ed_quantile, check_exact=False, rtol=4, check_dtype=False
        )
--- a/tests/dataframe/test_utils_pytest.py
+++ b/tests/dataframe/test_utils_pytest.py
@ -69,6 +69,12 @@ class TestDataFrameUtils(TestData):
        )
        ed_df_head = ed_df.head()
        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
        df["D"] = df["D"].astype("datetime64[ns]")
        df["H"] = (
            df["H"].dt.tz_localize(None).astype("datetime64[ns]").dt.tz_localize("UTC")
        )
        assert_pandas_eland_frame_equal(df, ed_df_head)
        ES_TEST_CLIENT.indices.delete(index=index_name)
--- a/tests/notebook/test_demo_notebook.ipynb
+++ b/tests/notebook/test_demo_notebook.ipynb
@ -1647,6 +1647,14 @@
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
       "/code/eland/.nox/test-3-12-pandas_version-2-2-3/lib/python3.12/site-packages/eland/series.py:464: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
       "  return self._query_compiler.dtypes[0]\n"
     ]
    },
    {
     "data": {
      "text/html": [
@ -1792,6 +1800,9 @@
    }
   ],
   "source": [
    "# NBVAL_IGNORE_OUTPUT\n",
    "# The ignore statement above is because of output difference between Pandas 1 and 2\n",
    "# and can be removed once Pandas 1 support is dropped\n",
    "ed_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled == True')"
   ]
  },
@ -2377,8 +2388,8 @@
       "      <th>AvgTicketPrice</th>\n",
       "      <th>DistanceKilometers</th>\n",
       "      <th>...</th>\n",
       "      <th>FlightTimeMin</th>\n",
       "      <th>dayOfWeek</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
@ -2388,23 +2399,15 @@
       "      <td>13059.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>13059.000000</td>\n",
-       "      <td>13059.000000</td>\n",
+       "      <td>13059</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>628.253689</td>\n",
       "      <td>7092.142455</td>\n",
       "      <td>...</td>\n",
       "      <td>511.127842</td>\n",
       "      <td>2.835975</td>\n",
-       "    </tr>\n",
+       "      <td>2018-01-21 19:20:45.564438016</td>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>266.396861</td>\n",
       "      <td>4578.438497</td>\n",
       "      <td>...</td>\n",
       "      <td>334.753952</td>\n",
       "      <td>1.939439</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
@ -2412,57 +2415,65 @@
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
+       "      <td>2018-01-01 00:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>409.893816</td>\n",
       "      <td>2459.705673</td>\n",
       "      <td>...</td>\n",
       "      <td>252.333192</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>2018-01-11 05:16:25.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>640.556668</td>\n",
       "      <td>7610.330866</td>\n",
       "      <td>...</td>\n",
       "      <td>503.045170</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>2018-01-22 00:32:11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>842.185470</td>\n",
       "      <td>9736.637600</td>\n",
       "      <td>...</td>\n",
       "      <td>720.416036</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>2018-02-01 04:51:18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1199.729053</td>\n",
       "      <td>19881.482315</td>\n",
       "      <td>...</td>\n",
       "      <td>1902.902032</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>2018-02-11 23:50:12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>266.396861</td>\n",
       "      <td>4578.438497</td>\n",
       "      <td>...</td>\n",
       "      <td>1.939439</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
-       "<p>8 rows × 7 columns</p>\n",
+       "<p>8 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
-       "       AvgTicketPrice  DistanceKilometers  ...  FlightTimeMin     dayOfWeek\n",
+       "       AvgTicketPrice  DistanceKilometers  ...     dayOfWeek                      timestamp\n",
-       "count    13059.000000        13059.000000  ...   13059.000000  13059.000000\n",
+       "count    13059.000000        13059.000000  ...  13059.000000                          13059\n",
-       "mean       628.253689         7092.142455  ...     511.127842      2.835975\n",
+       "mean       628.253689         7092.142455  ...      2.835975  2018-01-21 19:20:45.564438016\n",
-       "std        266.396861         4578.438497  ...     334.753952      1.939439\n",
+       "min        100.020528            0.000000  ...      0.000000            2018-01-01 00:00:00\n",
-       "min        100.020528            0.000000  ...       0.000000      0.000000\n",
+       "25%        409.893816         2459.705673  ...      1.000000     2018-01-11 05:16:25.500000\n",
-       "25%        409.893816         2459.705673  ...     252.333192      1.000000\n",
+       "50%        640.556668         7610.330866  ...      3.000000            2018-01-22 00:32:11\n",
-       "50%        640.556668         7610.330866  ...     503.045170      3.000000\n",
+       "75%        842.185470         9736.637600  ...      4.000000            2018-02-01 04:51:18\n",
-       "75%        842.185470         9736.637600  ...     720.416036      4.000000\n",
+       "max       1199.729053        19881.482315  ...      6.000000            2018-02-11 23:50:12\n",
-       "max       1199.729053        19881.482315  ...    1902.902032      6.000000\n",
+       "std        266.396861         4578.438497  ...      1.939439                            NaN\n",
       "\n",
-       "[8 rows x 7 columns]"
+       "[8 rows x 8 columns]"
      ]
     },
     "execution_count": 39,
@ -2471,6 +2482,8 @@
    }
   ],
   "source": [
    "# NBVAL_IGNORE_OUTPUT\n",
    "# Once support for pandas <2 is dropped, this and the line above can be removed\n",
    "pd_flights.describe()"
   ]
  },
--- a/tests/series/test_filter_pytest.py
+++ b/tests/series/test_filter_pytest.py
@ -58,7 +58,9 @@ class TestSeriesFilter(TestData):
        ed_ser = ed_flights_small.filter(items=items, axis=0)
        pd_ser = pd_flights_small.filter(items=items, axis=0)
-        assert_pandas_eland_series_equal(pd_ser, ed_ser)
+        # For an empty Series, eland reports the datatype it knows from the Elasticsearch index;
        # pandas, however, reports the datatype as empty
        assert_pandas_eland_series_equal(pd_ser, ed_ser, check_index_type=False)
    def test_flights_filter_index_like_and_regex(self):
        ed_flights_small = self.ed_flights_small()["FlightDelayType"]
--- a/tests/series/test_metrics_pytest.py
+++ b/tests/series/test_metrics_pytest.py
@ -24,6 +24,7 @@ import pandas as pd
 import pytest
 from pandas.testing import assert_series_equal
 from eland.common import PANDAS_VERSION
 from tests.common import TestData, assert_almost_equal
@ -42,6 +43,8 @@ class TestSeriesMetrics(TestData):
        ed_flights = self.ed_flights()["AvgTicketPrice"]
        for func in self.all_funcs:
            if PANDAS_VERSION[0] >= 2 and func == "mad":
                continue
            pd_metric = getattr(pd_flights, func)()
            ed_metric = getattr(ed_flights, func)()
@ -87,6 +90,8 @@ class TestSeriesMetrics(TestData):
            ed_ecommerce = self.ed_ecommerce()[column]
            for func in self.all_funcs:
                if PANDAS_VERSION[0] >= 2 and func == "mad":
                    continue
                pd_metric = getattr(pd_ecommerce, func)()
                ed_metric = getattr(ed_ecommerce, func)(
                    **({"numeric_only": True} if (func != "nunique") else {})