Update to latest ES versions and fix unit tests (#512)

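Update the test matrix to the latest Elasticsearch versions and fix the unit tests that were failing in CI. This also raises the minimum supported versions to Python 3.8, pandas 1.5 and matplotlib 3.6.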
2025-07-11 00:02:14 +08:00 · 2023-01-31 20:55:29 +01:00 · 2023-01-31 20:55:29 +01:00 · 2ea96322b3
commit 2ea96322b3
parent c55516f376
13 changed files with 101 additions and 93 deletions
--- a/.ci/run-tests
+++ b/.ci/run-tests
@@ -12,7 +12,7 @@ set -euxo pipefail

 TEST_SUITE=${TEST_SUITE-xpack}
 NODE_NAME=localhost
-PANDAS_VERSION=${PANDAS_VERSION-1.3.0}
+PANDAS_VERSION=${PANDAS_VERSION-1.5.0}


 elasticsearch_image=elasticsearch
--- a/.ci/test-matrix.yml
+++ b/.ci/test-matrix.yml
@@ -1,18 +1,16 @@
 ---

 ELASTICSEARCH_VERSION:
-  - '8.1.0-SNAPSHOT'
-  - '8.0.0-SNAPSHOT'
+  - '8.6.0-SNAPSHOT'
+  - '8.7.0-SNAPSHOT'

 PANDAS_VERSION:
-  - '1.2.0'
-  - '1.3.0'
+  - '1.5.0'

 PYTHON_VERSION:
  - '3.10'
  - '3.9'
  - '3.8'
-  - '3.7'

 TEST_SUITE:
  - xpack
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ $ conda install -c conda-forge eland

 ### Compatibility

-- Supports Python 3.7+ and Pandas 1.3
+- Supports Python 3.8+ and Pandas 1.5
 - Supports Elasticsearch clusters that are 7.11+, recommended 8.3 or later for all features to work.
  If you are using the NLP with PyTorch feature make sure your Eland minor version matches the minor 
  version of your Elasticsearch cluster. For all other features it is sufficient for the major versions
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -1,10 +1,11 @@
 elasticsearch>=7.7
-pandas>=1.2.0
-matplotlib
+pandas>=1.5
+matplotlib>=3.6
 nbval
 scikit-learn>=0.22.1
 xgboost>=1
 lightgbm
+sphinx==5.3.0
 nbsphinx
 git+https://github.com/pandas-dev/pydata-sphinx-theme.git

--- a/eland/ndframe.py
+++ b/eland/ndframe.py
@@ -172,7 +172,7 @@ class NDFrame(ABC):
        head = self.head(head_rows).to_pandas()
        tail = self.tail(tail_rows).to_pandas()

-        return head.append(tail)
+        return pd.concat([head, tail])

    def __sizeof__(self) -> int:
        # Don't default to pandas, just return approximation TODO - make this more accurate
--- a/eland/series.py
+++ b/eland/series.py
@@ -714,7 +714,7 @@ class Series(NDFrame):
        >>> ed_ecommerce = ed.DataFrame('http://localhost:9200', 'ecommerce')
        >>> ed_ecommerce["day_of_week"].mode()
        0    Thursday
-        dtype: object
+        Name: day_of_week, dtype: object

        >>> ed_ecommerce["order_date"].mode()
        0   2016-12-02 20:36:58
@@ -727,16 +727,18 @@ class Series(NDFrame):
        7   2016-12-15 11:38:24
        8   2016-12-22 19:39:22
        9   2016-12-24 06:21:36
-        dtype: datetime64[ns]
+        Name: order_date, dtype: datetime64[ns]

        >>> ed_ecommerce["order_date"].mode(es_size=3)
        0   2016-12-02 20:36:58
        1   2016-12-04 23:44:10
        2   2016-12-08 06:21:36
-        dtype: datetime64[ns]
+        Name: order_date, dtype: datetime64[ns]

        """
-        return self._query_compiler.mode(is_dataframe=False, es_size=es_size)
+        result = self._query_compiler.mode(is_dataframe=False, es_size=es_size)
+        result.name = self.name
+        return result

    def es_match(
        self,
--- a/noxfile.py
+++ b/noxfile.py
@@ -100,8 +100,8 @@ def lint(session):
            session.error("\n" + "\n".join(sorted(set(errors))))


-@nox.session(python=["3.7", "3.8", "3.9", "3.10"])
-@nox.parametrize("pandas_version", ["1.2.0", "1.3.0"])
+@nox.session(python=["3.8", "3.9", "3.10"])
+@nox.parametrize("pandas_version", ["1.5.0"])
 def test(session, pandas_version: str):
    session.install("-r", "requirements-dev.txt")
    session.install(".")
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -2,8 +2,8 @@
 # Basic requirements
 #
 elasticsearch>=8.3,<9
-pandas>=1.2,<2
-matplotlib<4
+pandas>=1.5
+matplotlib>=3.6
 numpy<2
 tqdm<5

@@ -12,7 +12,6 @@ tqdm<5
 #
 scikit-learn>=0.22.1,<2
 xgboost>=0.90,<2
-scikit-learn>=0.22.1,<2
 lightgbm>=2,<4

 # PyTorch doesn't support Python 3.10 yet (pytorch/pytorch#66424)
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,6 @@
 # Basic requirements
 #
 elasticsearch>=8.3,<9
-pandas>=1.2,<2
-matplotlib<4
+pandas>=1.5
+matplotlib>=3.6
 numpy<2
--- a/setup.py
+++ b/setup.py
@@ -83,12 +83,12 @@ setup(
    packages=find_packages(include=["eland", "eland.*"]),
    install_requires=[
        "elasticsearch>=8.3,<9",
-        "pandas>=1.2,<2",
-        "matplotlib<4",
+        "pandas>=1.5",
+        "matplotlib>=3.6",
        "numpy<2",
    ],
    scripts=["bin/eland_import_hub_model"],
-    python_requires=">=3.7",
+    python_requires=">=3.8",
    package_data={"eland": ["py.typed"]},
    include_package_data=True,
    zip_safe=False,
--- a/tests/common.py
+++ b/tests/common.py
@@ -15,6 +15,8 @@
 #  specific language governing permissions and limitations
 #  under the License.

+import gzip
+import json
 import os
 from datetime import timedelta

@@ -30,15 +32,21 @@ from tests import (
    ECOMMERCE_DF_FILE_NAME,
    ECOMMERCE_INDEX_NAME,
    ES_TEST_CLIENT,
-    FLIGHTS_DF_FILE_NAME,
+    FLIGHTS_FILE_NAME,
    FLIGHTS_INDEX_NAME,
    FLIGHTS_SMALL_INDEX_NAME,
 )

-_pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index()
+_ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_INDEX_NAME)
+flight_records = []
+with gzip.open(FLIGHTS_FILE_NAME) as f:
+    for json_obj in f:
+        flight_records.append(json.loads(json_obj))
+_pd_flights = pd.DataFrame.from_records(flight_records).reindex(
+    _ed_flights.columns, axis=1
+)
 _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
 _pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int
-_ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_INDEX_NAME)

 _pd_flights_small = _pd_flights.head(48)
 _ed_flights_small = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME)
--- a/tests/notebook/test_demo_notebook.ipynb
+++ b/tests/notebook/test_demo_notebook.ipynb
--- a/tests/notebook/test_plotting.ipynb
+++ b/tests/notebook/test_plotting.ipynb