Upgrade Sentence Transformers to v5 (#801)

Sentence Transformers v5 adds support for sparse embedding models and is now necessary for importing sparse models such as https://huggingface.co/naver/splade-v3-distilbert.
add products to docset.yml (#797)
2025-07-24 00:00:39 +08:00 · 2025-07-23 08:07:29 +01:00 · 2025-07-23 10:32:54 +04:00 · 2025-06-23 15:39:36 +04:00 · 2025-06-05 15:52:19 +04:00 · 2025-05-16 15:56:20 +01:00
85 changed files with 2049 additions and 2184 deletions
--- a/.buildkite/Dockerfile
+++ b/.buildkite/Dockerfile
@ -1,6 +1,8 @@
 ARG PYTHON_VERSION=3.9
 FROM python:${PYTHON_VERSION}
 ENV FORCE_COLOR=1
 WORKDIR /code/eland
 RUN python -m pip install nox
--- a/.buildkite/build-docker-images.sh
+++ b/.buildkite/build-docker-images.sh
@ -0,0 +1,11 @@
 #!/usr/bin/env bash
 # Builds both Docker images from the current directory: the Wolfi-based image
 # (Dockerfile.wolfi) and the image described by the default Dockerfile.
 # The images are only built here; nothing is tagged or pushed.
 # -e aborts on the first failing command; -o pipefail makes a pipeline fail
 # when any of its stages fails.
 set -eo pipefail
 export LC_ALL=en_US.UTF-8
 echo "--- Building the Wolfi image"
 # Building the linux/arm64 image takes about one hour on Buildkite, which is too slow,
 # so no --platform flag is passed to the builds below.
 docker build --file Dockerfile.wolfi .
 echo "--- Building the public image"
 docker build .
--- a/.buildkite/build-documentation.sh
+++ b/.buildkite/build-documentation.sh
@ -1,15 +1,8 @@
 #!/usr/bin/env bash
 sudo apt-get update
 sudo apt-get install -y pandoc python3 python3-pip
 python3 -m pip install nox
 /opt/buildkite-agent/.local/bin/nox -s docs
-# I couldn't make this work, for some reason pandoc is not found in the docker container repository:
+docker build --file .buildkite/Dockerfile --tag elastic/eland --build-arg PYTHON_VERSION=${PYTHON_VERSION} .
-# docker build --file .buildkite/Dockerfile --tag elastic/eland --build-arg PYTHON_VERSION=${PYTHON_VERSION} .
+docker run \
-# docker run \
+  --name doc_build \
-#        --name doc_build \
+  --rm \
-#        --rm \
+  elastic/eland \
-#        elastic/eland \
+  bash -c "apt-get update && apt-get install --yes pandoc && nox -s docs"
 #        apt-get update && \
 #        sudo apt-get install --yes pandoc && \
 #        nox -s docs
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@ -15,22 +15,36 @@ steps:
      machineType: "n2-standard-2"
    commands:
      - ./.buildkite/build-documentation.sh
-  - label: "Eland :python: {{ matrix.python }} :elasticsearch: {{ matrix.stack }}"
+  - label: ":docker: Build Wolfi image"
    env:
      PYTHON_VERSION: 3.11-bookworm
    agents:
      provider: "gcp"
      machineType: "n2-standard-2"
    commands:
      - ./.buildkite/build-docker-images.sh
  - label: ":python: {{ matrix.python }} :elasticsearch: {{ matrix.stack }} :pandas: {{ matrix.pandas }}"
    agents:
      provider: "gcp"
      machineType: "n2-standard-4"
    env:
      PYTHON_VERSION: "{{ matrix.python }}"
-      PANDAS_VERSION: '1.5.0'
+      PANDAS_VERSION: "{{ matrix.pandas }}"
      TEST_SUITE: "xpack"
      ELASTICSEARCH_VERSION: "{{ matrix.stack }}"
    matrix:
      setup:
        # Python and pandas versions need to be added to the nox configuration too
        # (in the decorators of the test method in noxfile.py)
        pandas:
          - '1.5.0'
          - '2.2.3'
        python:
          - '3.12'
          - '3.11'
          - '3.10'
          - '3.9'
          - '3.8'
        stack:          
-          - '8.11-SNAPSHOT'
+          - '9.0.0'
-          - '8.12-SNAPSHOT'
+          - '9.1.0-SNAPSHOT'
    command: ./.buildkite/run-tests
--- a/.buildkite/pull-requests.json
+++ b/.buildkite/pull-requests.json
@ -11,6 +11,18 @@
      "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
      "skip_ci_labels": ["skip-ci"],
      "skip_ci_on_only_changed": ["\\.md$"]
    },
    {
      "enabled": true,
      "pipeline_slug": "docs-build-pr",
      "allow_org_users": true,
      "allowed_repo_permissions": ["admin", "write"],
      "build_on_commit": true,
      "build_on_comment": true,
      "trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
      "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
      "skip_ci_labels": ["skip-ci"],
      "skip_ci_on_only_changed": ["\\.md$"]
    }
  ]
 }
--- a/.buildkite/release-docker/run.sh
+++ b/.buildkite/release-docker/run.sh
@ -26,6 +26,7 @@ git --no-pager show
 docker buildx rm --force eland-multiarch-builder || true
 docker buildx create --name eland-multiarch-builder --bootstrap --use
 docker buildx build --push \
  --file Dockerfile.wolfi \
  --tag "$docker_registry/eland/eland:$RELEASE_VERSION" \
  --tag "$docker_registry/eland/eland:latest" \
  --platform linux/amd64,linux/arm64 \
--- a/.dockerignore
+++ b/.dockerignore
@ -1,5 +1,4 @@
 # docs and example
 docs/*
 example/*
 # Git
@ -18,9 +17,6 @@ dist/
 # Build folder
 build/
 # docs
 docs/*
 # pytest results
 tests/dataframe/results/*csv
 result_images/
--- a/.github/workflows/backport.yml
+++ b/.github/workflows/backport.yml
@ -0,0 +1,26 @@
 # Runs the tibdex/backport action for merged pull requests, either when the
 # pull request is closed or when a label whose name contains 'backport' is
 # added to it (see the `if` condition on the job).
 name: Backport
 on:
  pull_request_target:
    types:
      - closed
      - labeled
 jobs:
  backport:
    name: Backport
    runs-on: ubuntu-latest
    # Only react to merged PRs for security reasons.
    # See https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request_target.
    # The condition requires the PR to be merged AND the triggering action to be
    # either 'closed' or a 'labeled' event whose label name contains 'backport'.
    if: >
      github.event.pull_request.merged
      && (
        github.event.action == 'closed'
        || (
          github.event.action == 'labeled'
          && contains(github.event.label.name, 'backport')
        )
      )
    steps:
      # The action is pinned to a full commit SHA; the trailing comment records
      # the release tag that SHA corresponds to.
      - uses: tibdex/backport@9565281eda0731b1d20c4025c43339fb0a23812e # v2.0.4
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/docs-build.yml
+++ b/.github/workflows/docs-build.yml
@ -0,0 +1,19 @@
 # Delegates the documentation preview build to the reusable
 # preview-build workflow in elastic/docs-builder.
 name: docs-build
 on:
  push:
    branches:
      - main
  # `~` is YAML null: these two events are listed without any filters.
  pull_request_target: ~
  merge_group: ~
 jobs:
  docs-preview:
    # NOTE(review): the reusable workflow is referenced by branch (@main),
    # not by a pinned commit SHA.
    uses: elastic/docs-builder/.github/workflows/preview-build.yml@main
    with:
      # Presumably limits the build to changes under docs/ - confirm against
      # the inputs of the called workflow.
      path-pattern: docs/**
    # Permissions granted to the called workflow.
    permissions:
      deployments: write
      id-token: write
      contents: read
      pull-requests: write
--- a/.github/workflows/docs-cleanup.yml
+++ b/.github/workflows/docs-cleanup.yml
@ -0,0 +1,14 @@
 # Delegates documentation preview cleanup to the reusable preview-cleanup
 # workflow in elastic/docs-builder whenever a pull request is closed.
 name: docs-cleanup
 on:
  pull_request_target:
    types:
      - closed
 jobs:
  docs-preview:
    # NOTE(review): the reusable workflow is referenced by branch (@main),
    # not by a pinned commit SHA.
    uses: elastic/docs-builder/.github/workflows/preview-cleanup.yml@main
    # Permissions granted to the called workflow; repository contents are not
    # accessible to it (contents: none).
    permissions:
      contents: none
      id-token: write
      deployments: write
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@ -3,9 +3,12 @@ version: 2
 build:
  os: ubuntu-22.04
  tools:
-    python: "3.10"
+    python: "3.11"
 python:
  install:
    - requirements: docs/requirements-docs.txt
    - path: .
    - requirements: docs/requirements-docs.txt
 sphinx:
  configuration: docs/sphinx/conf.py
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@ -2,6 +2,129 @@
 Changelog
 =========
 9.0.1 (2025-04-30)
 ------------------
 * Forbid Elasticsearch 8 client or server (`#780 <https://github.com/elastic/eland/pull/780>`_)
 * Fix DeBERTa tokenization (`#769 <https://github.com/elastic/eland/pull/769>`_)
 * Upgrade PyTorch to 2.5.1 (`#785 <https://github.com/elastic/eland/pull/785>`_)
 * Upgrade LightGBM to 4.6.0 (`#782 <https://github.com/elastic/eland/pull/782>`_)
 9.0.0 (2025-04-15)
 ------------------
 * Drop Python 3.8, Support Python 3.12 (`#743 <https://github.com/elastic/eland/pull/743>`_)
 * Support Pandas 2 (`#742 <https://github.com/elastic/eland/pull/742>`_)
 * Upgrade transformers to 4.47 (`#752 <https://github.com/elastic/eland/pull/752>`_)
 * Remove ML model export as sklearn Pipeline (`#744 <https://github.com/elastic/eland/pull/744>`_)
 * Allow scikit-learn 1.5 (`#729 <https://github.com/elastic/eland/pull/729>`_)
 * Migrate docs from AsciiDoc to Markdown (`#762 <https://github.com/elastic/eland/pull/762>`_)
 8.17.0 (2025-01-07)
 -------------------
 * Support sparse embedding models such as SPLADE-v3-DistilBERT (`#740 <https://github.com/elastic/eland/pull/740>`_)
 8.16.0 (2024-11-13)
 -------------------
 * Add deprecation warning for ESGradientBoostingModel subclasses (`#738 <https://github.com/elastic/eland/pull/738>`_)
 8.15.4 (2024-10-17)
 -------------------
 * Revert "Allow reading Elasticsearch certs in Wolfi image" (`#734 <https://github.com/elastic/eland/pull/734>`_)
 8.15.3 (2024-10-09)
 -------------------
 * Added support for DeBERTa-V2 tokenizer (`#717 <https://github.com/elastic/eland/pull/717>`_)
 * Fixed ``--ca-cert`` with a shared Elasticsearch Docker volume (`#732 <https://github.com/elastic/eland/pull/732>`_)
 8.15.2 (2024-10-02)
 -------------------
 * Fixed Docker image build (`#728 <https://github.com/elastic/eland/pull/728>`_)
 8.15.1 (2024-10-01)
 -------------------
 * Upgraded PyTorch to version 2.3.1, which is compatible with Elasticsearch 8.15.2 or above (`#718 <https://github.com/elastic/eland/pull/718>`_)
 * Migrated to distroless Wolfi base Docker image (`#720 <https://github.com/elastic/eland/pull/720>`_)
 8.15.0 (2024-08-12)
 -------------------
 * Added a default truncation of ``second`` for text similarity (`#713 <https://github.com/elastic/eland/pull/713>`_)
 * Added note about using text_similarity for rerank in the CLI (`#716 <https://github.com/elastic/eland/pull/716>`_)
 * Added support for lists in result hits (`#707 <https://github.com/elastic/eland/pull/707>`_)
 * Removed input fields from exported LTR models (`#708 <https://github.com/elastic/eland/pull/708>`_)
 8.14.0 (2024-06-10)
 -------------------
 Added
 ^^^^^
 * Added Elasticsearch Serverless support in DataFrames (`#690`_, contributed by `@AshokChoudhary11`_) and eland_import_hub_model (`#698`_)
 Fixed
 ^^^^^
 * Fixed Python 3.8 support (`#695`_, contributed by `@bartbroere`_)
 * Fixed non _source fields missing from the results hits (`#693`_, contributed by `@bartbroere`_)
 .. _@AshokChoudhary11: https://github.com/AshokChoudhary11
 .. _#690: https://github.com/elastic/eland/pull/690
 .. _#693: https://github.com/elastic/eland/pull/693
 .. _#695: https://github.com/elastic/eland/pull/695
 .. _#698: https://github.com/elastic/eland/pull/698
 8.13.1 (2024-05-03)
 -------------------
 Added
 ^^^^^
 * Added support for HTTP proxies in eland_import_hub_model (`#688`_)
 .. _#688: https://github.com/elastic/eland/pull/688
 8.13.0 (2024-03-27)
 -------------------
 Added
 ^^^^^
 * Added support for Python 3.11 (`#681`_) 
 * Added ``eland.DataFrame.to_json`` function (`#661`_, contributed by `@bartbroere`_)
 * Added override option to specify the model's max input size (`#674`_)
 Changed
 ^^^^^^^
 * Upgraded torch to 2.1.2 (`#671`_)
 * Mirrored pandas' ``lineterminator`` instead of ``line_terminator`` in ``to_csv`` (`#595`_, contributed by `@bartbroere`_)
 .. _#595: https://github.com/elastic/eland/pull/595
 .. _#661: https://github.com/elastic/eland/pull/661
 .. _#671: https://github.com/elastic/eland/pull/671
 .. _#674: https://github.com/elastic/eland/pull/674
 .. _#681: https://github.com/elastic/eland/pull/681
 8.12.1 (2024-01-30)
 -------------------
 Fixed
 ^^^^^
 * Fix missing value support for XGBRanker (`#654`_)
 .. _#654: https://github.com/elastic/eland/pull/654
 8.12.0 (2024-01-18)
 -------------------
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -78,9 +78,15 @@ Once your changes and tests are ready to submit for review:
    # Run Auto-format, lint, mypy type checker for your changes
    $ nox -s format
-    # Run the test suite
+    # Launch Elasticsearch with a trial licence and ML enabled
-    $ pytest --doctest-modules eland/ tests/
+    $ docker run --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" -e "xpack.security.enabled=false" -e "xpack.license.self_generated.type=trial" docker.elastic.co/elasticsearch/elasticsearch:9.0.0
-    $ pytest --nbval tests/notebook/
+
    # See all test suites
    $ nox -l
    # Run a specific test suite
    $ nox -rs "test-3.12(pandas_version='2.2.3')"
    # Run a specific test
    $ nox -rs "test-3.12(pandas_version='2.2.3')" -- -k test_learning_to_rank
    ```
@ -169,7 +175,7 @@ currently using a minimum version of PyCharm 2019.2.4.
 * Setup Elasticsearch instance with docker
    ``` bash
-    > ELASTICSEARCH_VERSION=elasticsearch:8.x-SNAPSHOT BUILDKITE=false .buildkite/run-elasticsearch.sh
+    > ELASTICSEARCH_VERSION=elasticsearch:8.17.0 BUILDKITE=false .buildkite/run-elasticsearch.sh
    ```
 * Now check `http://localhost:9200`
@ -203,7 +209,7 @@ currently using a minimum version of PyCharm 2019.2.4.
 * To test specific versions of Python run
    ``` bash
-    > nox -s test-3.8
+    > nox -s test-3.12
    ```
 ### Documentation
--- a/Dockerfile
+++ b/Dockerfile
@ -18,7 +18,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
      python3 -m pip install \
        --no-cache-dir --disable-pip-version-check --extra-index-url https://download.pytorch.org/whl/cpu  \
-        torch==1.13.1+cpu .[all]; \
+        torch==2.5.1+cpu .[all]; \
    else \
      python3 -m pip install \
        --no-cache-dir --disable-pip-version-check \
--- a/Dockerfile.wolfi
+++ b/Dockerfile.wolfi
@ -0,0 +1,42 @@
 # syntax=docker/dockerfile:1
 # Two-stage build: the "builder" stage installs eland into a virtual
 # environment under /eland/venv, and the final stage copies /eland (sources
 # and virtual environment) onto the non-dev Wolfi Python base image.
 FROM docker.elastic.co/wolfi/python:3.10-dev AS builder
 WORKDIR /eland
 ENV VIRTUAL_ENV=/eland/venv
 RUN python3 -m venv $VIRTUAL_ENV
 # Put the virtual environment first on PATH so python3/pip resolve to it.
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 ADD . /eland
 ARG TARGETPLATFORM
 # On linux/amd64, install the CPU-only torch 2.5.1 wheel via the PyTorch
 # extra index together with eland and all of its extras; on every other
 # platform install .[all] without the extra index or the torch pin.
 RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
      python3 -m pip install \
        --no-cache-dir --disable-pip-version-check --extra-index-url https://download.pytorch.org/whl/cpu  \
        torch==2.5.1+cpu .[all]; \
    else \
      python3 -m pip install \
        --no-cache-dir --disable-pip-version-check \
        .[all]; \
    fi
 # Final stage: same Python version, without the -dev tooling.
 FROM docker.elastic.co/wolfi/python:3.10
 WORKDIR /eland
 ENV VIRTUAL_ENV=/eland/venv
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 COPY --from=builder /eland /eland
 # The eland_import_hub_model script is intended to be executed by a shell,
 # which will see its shebang line and then execute it with the Python
 # interpreter of the virtual environment. We want to keep this behavior even
 # with Wolfi so that users can use the image as before. To do that, we use two
 # tricks:
 #
 #  * copy /bin/sh (that is, busybox's ash) from the builder image
 #  * revert to Docker's default entrypoint, which is the only way to pass
 #    parameters to `eland_import_hub_model` without needing quotes.
 #
 COPY --from=builder /bin/sh /bin/sh
 ENTRYPOINT []
--- a/README.md
+++ b/README.md
@ -12,8 +12,7 @@
  <a href="https://pepy.tech/project/eland"><img src="https://static.pepy.tech/badge/eland" alt="Downloads"></a>
  <a href="https://pypi.org/project/eland"><img src="https://img.shields.io/pypi/status/eland.svg"
      alt="Package Status"></a>
-  <a href="https://clients-ci.elastic.co/job/elastic+eland+main"><img
+  <a href="https://buildkite.com/elastic/eland"><img src="https://badge.buildkite.com/d92340e800bc06a7c7c02a71b8d42fcb958bd18c25f99fe2d9.svg" alt="Build Status"></a>
      src="https://clients-ci.elastic.co/buildStatus/icon?job=elastic%2Beland%2Bmain" alt="Build Status"></a>
  <a href="https://github.com/elastic/eland/blob/main/LICENSE.txt"><img src="https://img.shields.io/pypi/l/eland.svg"
      alt="License"></a>
  <a href="https://eland.readthedocs.io"><img
@ -43,7 +42,7 @@ $ python -m pip install eland
 If using Eland to upload NLP models to Elasticsearch install the PyTorch extras:
 ```bash
-$ python -m pip install eland[pytorch]
+$ python -m pip install 'eland[pytorch]'
 ```
 Eland can also be installed from [Conda Forge](https://anaconda.org/conda-forge/eland) with Conda:
@ -54,13 +53,14 @@ $ conda install -c conda-forge eland
 ### Compatibility
- Supports Python 3.8, 3.9, 3.10 and Pandas 1.5
+- Supports Python 3.9, 3.10, 3.11 and 3.12.
- Supports Elasticsearch clusters that are 7.11+, recommended 8.3 or later for all features to work.
+- Supports Pandas 1.5 and 2.
 - Supports Elasticsearch 8+ clusters, recommended 8.16 or later for all features to work.
  If you are using the NLP with PyTorch feature make sure your Eland minor version matches the minor 
  version of your Elasticsearch cluster. For all other features it is sufficient for the major versions
  to match.
- You need to use PyTorch `1.13.1` or earlier to import an NLP model. 
+- You need to install the appropriate version of PyTorch to import an NLP model. Run `python -m pip
-  Run `pip install torch==1.13.1` to install the aproppriate version of PyTorch.
+  install 'eland[pytorch]'` to install that version.
 ### Prerequisites
--- a/catalog-info.yaml
+++ b/catalog-info.yaml
@ -55,7 +55,7 @@ spec:
      repository: elastic/eland
      teams:
        ml-core: {}
-        clients-team: {}
+        devtools-team: {}
        es-docs: {}
        everyone:
          access_level: READ_ONLY
@ -89,6 +89,6 @@ spec:
      repository: elastic/eland
      teams:
        ml-core: {}
-        clients-team: {}
+        devtools-team: {}
        everyone:
          access_level: READ_ONLY
--- a/docs/docset.yml
+++ b/docs/docset.yml
@ -0,0 +1,10 @@
 # Documentation set configuration for the Eland docs.
 project: 'Eland Python client'
 products:
  - id: elasticsearch-client
 cross_links:
  - docs-content
 toc:
  - toc: reference
 # Text substitutions; presumably what {{es}} and {{ml}} expand to in the
 # Markdown pages - confirm against the docs-builder documentation.
 subs:
  es:   "Elasticsearch"
  ml:   "machine learning"
--- a/docs/guide/index.asciidoc
+++ b/docs/guide/index.asciidoc
@ -1,14 +0,0 @@
 = Eland Python Client
 :doctype:           book
 include::{asciidoc-dir}/../../shared/versions/stack/{source_branch}.asciidoc[]
 include::{asciidoc-dir}/../../shared/attributes.asciidoc[]
 include::overview.asciidoc[]
 include::installation.asciidoc[]
 include::dataframes.asciidoc[]
 include::machine-learning.asciidoc[]
--- a/docs/guide/installation.asciidoc
+++ b/docs/guide/installation.asciidoc
@ -1,16 +0,0 @@
 [[installation]]
 == Installation
 Eland can be installed with https://pip.pypa.io[pip] from https://pypi.org/project/eland[PyPI]. We recommend https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/[using a virtual environment] when installing with pip:
 [source,sh]
 -----------------------------
 $ python -m pip install eland
 -----------------------------
 Alternatively, Eland can be installed with https://docs.conda.io[Conda] from https://anaconda.org/conda-forge/eland[Conda Forge]:
 [source,sh]
 ------------------------------------
 $ conda install -c conda-forge eland
 ------------------------------------
--- a/docs/guide/machine-learning.asciidoc
+++ b/docs/guide/machine-learning.asciidoc
@ -1,194 +0,0 @@
 [[machine-learning]]
 == Machine Learning
 [discrete]
 [[ml-trained-models]]
 === Trained models
 Eland allows transforming trained models from scikit-learn, XGBoost,
 and LightGBM libraries to be serialized and used as an inference
 model in {es}.
 [source,python]
 ------------------------
 >>> from xgboost import XGBClassifier
 >>> from eland.ml import MLModel
 # Train and exercise an XGBoost ML model locally
 >>> xgb_model = XGBClassifier(booster="gbtree")
 >>> xgb_model.fit(training_data[0], training_data[1])
 >>> xgb_model.predict(training_data[0])
 [0 1 1 0 1 0 0 0 1 0]
 # Import the model into Elasticsearch
 >>> es_model = MLModel.import_model(
    es_client="http://localhost:9200",
    model_id="xgb-classifier",
    model=xgb_model,
    feature_names=["f0", "f1", "f2", "f3", "f4"],
 )
 # Exercise the ML model in Elasticsearch with the training data
 >>> es_model.predict(training_data[0])
 [0 1 1 0 1 0 0 0 1 0]
 ------------------------
 [discrete]
 [[ml-nlp-pytorch]]
 === Natural language processing (NLP) with PyTorch
 IMPORTANT: You need to use PyTorch `1.13` or earlier to import an NLP model. 
 Run `pip install torch==1.13` to install the appropriate version of PyTorch.
 For NLP tasks, Eland enables you to import PyTorch models into {es}. Use the 
 `eland_import_hub_model` script to download and install supported 
 https://huggingface.co/transformers[transformer models] from the
 https://huggingface.co/models[Hugging Face model hub]. For example:
 [source,bash]
 ------------------------
 $ eland_import_hub_model <authentication> \ <1>
  --url http://localhost:9200/ \ <2>
  --hub-model-id elastic/distilbert-base-cased-finetuned-conll03-english \ <3>
  --task-type ner \ <4>
  --start
 ------------------------
 <1> Use an authentication method to access your cluster. Refer to <<ml-nlp-pytorch-auth>>.
 <2> The cluster URL. Alternatively, use `--cloud-id`.
 <3> Specify the identifier for the model in the Hugging Face model hub.
 <4> Specify the type of NLP task. Supported values are `fill_mask`, `ner`,
 `question_answering`, `text_classification`, `text_embedding`, and `zero_shot_classification`.
 [discrete]
 [[ml-nlp-pytorch-docker]]
 ==== Import model with Docker
 IMPORTANT: To use the Docker container, you need to clone the Eland repository: https://github.com/elastic/eland
 If you want to use Eland without installing it, you can use the Docker image:
 You can use the container interactively:
 ```bash
 $ docker run -it --rm --network host docker.elastic.co/eland/eland
 ```
 Running installed scripts is also possible without an interactive shell, for example:
 ```bash
 docker run -it --rm docker.elastic.co/eland/eland \
    eland_import_hub_model \
      --url $ELASTICSEARCH_URL \
      --hub-model-id elastic/distilbert-base-uncased-finetuned-conll03-english \
      --start
 ```
 Replace the `$ELASTICSEARCH_URL` with the URL for your Elasticsearch cluster. For authentication purposes, include an administrator username and password in the URL in the following format: `https://username:password@host:port`.
 [discrete]
 [[ml-nlp-pytorch-air-gapped]]
 ==== Install models in an air-gapped environment 
 You can install models in a restricted or closed network by pointing the 
 `eland_import_hub_model` script to local files. 
 For an offline install of a Hugging Face model, the model first needs to be 
 cloned locally, Git and https://git-lfs.com/[Git Large File Storage] are 
 required to be installed in your system.
 1. Select a model you want to use from Hugging Face. Refer to the 
 {ml-docs}/ml-nlp-model-ref.html[compatible third party model] list for more 
 information on the supported architectures. 
 2. Clone the selected model from Hugging Face by using the model URL. For 
 example:
 +
 --
 [source,bash]
 ----
 git clone https://huggingface.co/dslim/bert-base-NER
 ----
 This command results in a local copy
 of the model in the directory `bert-base-NER`.
 --
 3. Use the `eland_import_hub_model` script with the `--hub-model-id` set to the 
 directory of the cloned model to install it:
 +
 --
 [source,bash]
 ----
 eland_import_hub_model \
      --url 'XXXX' \
      --hub-model-id /PATH/TO/MODEL \
      --task-type ner \
      --es-username elastic --es-password XXX \
      --es-model-id bert-base-ner
 ----
 If you use the Docker image to run `eland_import_hub_model` you must bind mount 
 the model directory, so the container can read the files:
 [source,bash]
 ----
 docker run --mount type=bind,source=/PATH/TO/MODELS,destination=/models,readonly -it --rm docker.elastic.co/eland/eland \
    eland_import_hub_model \
      --url 'XXXX' \
      --hub-model-id /models/bert-base-NER \
      --task-type ner \
      --es-username elastic --es-password XXX \
      --es-model-id bert-base-ner
 ----
 Once it's uploaded to {es}, the model will have the ID specified by 
 `--es-model-id`. If it is not set, the model ID is derived from 
 `--hub-model-id`; spaces and path delimiters are converted to double 
 underscores `__`.
 --
 [discrete]
 [[ml-nlp-pytorch-auth]]
 ==== Authentication methods
 The following authentication options are available when using the import script:
 * Elasticsearch username and password authentication (specified with the `-u` and `-p` options):
 +
 --
 [source,bash]
 --------------------------------------------------
 eland_import_hub_model -u <username> -p <password> --cloud-id <cloud-id> ...
 --------------------------------------------------
 These `-u` and `-p` options also work when you use `--url`.
 --
 * Elasticsearch username and password authentication (embedded in the URL):
 +
 --
 [source,bash]
 --------------------------------------------------
 eland_import_hub_model --url https://<user>:<password>@<hostname>:<port> ...
 --------------------------------------------------
 --
 * Elasticsearch API key authentication:
 +
 --
 [source,bash]
 --------------------------------------------------
 eland_import_hub_model --es-api-key <api-key> --url https://<hostname>:<port> ...
 --------------------------------------------------
 --
 * HuggingFace Hub access token (for private models):
 +
 --
 [source,bash]
 --------------------------------------------------
 eland_import_hub_model --hub-access-token <access-token> ...
 --------------------------------------------------
 --
--- a/docs/guide/dataframes.asciidoc
+++ b/docs/guide/dataframes.asciidoc
@ -1,16 +1,16 @@
-[[dataframes]]
+---
-== Data Frames
+mapped_pages:
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/dataframes.html
 ---
-`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API
+# Data Frames [dataframes]
 and defers all processing and filtering of data to Elasticsearch
 instead of your local machine. This means you can process large
 amounts of data within Elasticsearch from a Jupyter Notebook
 without overloading your machine.
-[source,python]
+`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API and defers all processing and filtering of data to Elasticsearch instead of your local machine. This means you can process large amounts of data within Elasticsearch from a Jupyter Notebook without overloading your machine.
-------------------------------------
+
 ```python
 >>> import eland as ed
->>> # Connect to 'flights' index via localhost Elasticsearch node
+>>>
 # Connect to 'flights' index via localhost Elasticsearch node
 >>> df = ed.DataFrame('http://localhost:9200', 'flights')
 # eland.DataFrame instance has the same API as pandas.DataFrame
@ -59,4 +59,5 @@ Elasticsearch storage usage: 5.043 MB
 sum        9.261629e+07    8.204365e+06
 min        0.000000e+00    1.000205e+02
 std        4.578263e+03    2.663867e+02
-------------------------------------
+```
--- a/docs/guide/overview.asciidoc
+++ b/docs/guide/overview.asciidoc
@ -1,33 +1,36 @@
-[[overview]]
+---
-== Overview
+mapped_pages:
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/index.html
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/overview.html
 navigation_title: Eland
 ---
-Eland is a Python client and toolkit for DataFrames and {ml} in {es}.
+# Eland Python client [overview]
 Full documentation is available on https://eland.readthedocs.io[Read the Docs].
 Source code is available on https://github.com/elastic/eland[GitHub].
-[discrete]
+Eland is a Python client and toolkit for DataFrames and {{ml}} in {{es}}. Full documentation is available on [Read the Docs](https://eland.readthedocs.io). Source code is available on [GitHub](https://github.com/elastic/eland).
 === Compatibility
 - Supports Python 3.8+ and Pandas 1.5
 - Supports {es} clusters that are 7.11+, recommended 7.14 or later for all features to work.
  Make sure your Eland major version matches the major version of your Elasticsearch cluster.
-The recommended way to set your requirements in your `setup.py` or
+## Compatibility [_compatibility]
 `requirements.txt` is::
-    # Elasticsearch 8.x
+* Supports Python 3.9+ and Pandas 1.5
-    eland>=8,<9
+* Supports {{es}} 8+ clusters, recommended 8.16 or later for all features to work. Make sure your Eland major version matches the major version of your Elasticsearch cluster.
-    # Elasticsearch 7.x
+The recommended way to set your requirements in your `setup.py` or `requirements.txt` is:
    eland>=7,<8
-[discrete]
+```
-=== Getting Started
+# Elasticsearch 8.x
 eland>=8,<9
 ```
 ```
 # Elasticsearch 7.x
 eland>=7,<8
 ```
-Create a `DataFrame` object connected to an {es} cluster running on `http://localhost:9200`:
+## Getting Started [_getting_started]
-[source,python]
+Create a `DataFrame` object connected to an {{es}} cluster running on `http://localhost:9200`:
------------------------------------
+
 ```python
 >>> import eland as ed
 >>> df = ed.DataFrame(
 ...    es_client="http://localhost:9200",
@ -48,20 +51,19 @@ Create a `DataFrame` object connected to an {es} cluster running on `http://loca
 13058      858.144337      False  ...         6 2018-02-11 14:54:34
 [13059 rows x 27 columns]
------------------------------------
+```
-[discrete]
+
-==== Elastic Cloud
+### Elastic Cloud [_elastic_cloud]
 You can also connect Eland to an Elasticsearch instance in Elastic Cloud:
-[source,python]
+```python
 ------------------------------------
 >>> import eland as ed
 >>> from elasticsearch import Elasticsearch
 # First instantiate an 'Elasticsearch' instance connected to Elastic Cloud
->>> es = Elasticsearch(cloud_id="...", api_key=("...", "..."))
+>>> es = Elasticsearch(cloud_id="...", api_key="...")
 # then wrap the client in an Eland DataFrame:
 >>> df = ed.DataFrame(es, es_index_pattern="flights")
@ -73,16 +75,16 @@ You can also connect Eland to an Elasticsearch instance in Elastic Cloud:
 3          181.694216       True  ...         0 2018-01-01 10:33:28
 4          730.041778      False  ...         0 2018-01-01 05:13:00
 [5 rows x 27 columns]
------------------------------------
+```
 Eland can be used for complex queries and aggregations:
-[source,python]
+```python
 ------------------------------------
 >>> df[df.Carrier != "Kibana Airlines"].groupby("Carrier").mean(numeric_only=False)
                  AvgTicketPrice  Cancelled                     timestamp
 Carrier
 ES-Air                630.235816   0.129814 2018-01-21 20:45:00.200000000
 JetBeats              627.457373   0.134698 2018-01-21 14:43:18.112400635
 Logstash Airways      624.581974   0.125188 2018-01-21 16:14:50.711798340
------------------------------------
+```
--- a/docs/reference/installation.md
+++ b/docs/reference/installation.md
@ -0,0 +1,19 @@
 ---
 mapped_pages:
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/installation.html
 ---
 # Installation [installation]
 Eland can be installed with [pip](https://pip.pypa.io) from [PyPI](https://pypi.org/project/eland). We recommend [using a virtual environment](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) when installing with pip:
 ```sh
 $ python -m pip install eland
 ```
 Alternatively, Eland can be installed with [Conda](https://docs.conda.io) from [Conda Forge](https://anaconda.org/conda-forge/eland):
 ```sh
 $ conda install -c conda-forge eland
 ```
--- a/docs/reference/machine-learning.md
+++ b/docs/reference/machine-learning.md
@ -0,0 +1,199 @@
 ---
 mapped_pages:
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/machine-learning.html
 ---
 # Machine Learning [machine-learning]
 ## Trained models [ml-trained-models]
 Eland can serialize *some*
 [trained models](https://eland.readthedocs.io/en/latest/reference/api/eland.ml.MLModel.import_model.html#parameters) from the scikit-learn, XGBoost,
 and LightGBM libraries so that they can be used as inference models in {{es}}.
 ```python
 >>> from xgboost import XGBClassifier
 >>> from eland.ml import MLModel
 # Train and exercise an XGBoost ML model locally
 >>> xgb_model = XGBClassifier(booster="gbtree")
 >>> xgb_model.fit(training_data[0], training_data[1])
 >>> xgb_model.predict(training_data[0])
 [0 1 1 0 1 0 0 0 1 0]
 # Import the model into Elasticsearch
 >>> es_model = MLModel.import_model(
    es_client="http://localhost:9200",
    model_id="xgb-classifier",
    model=xgb_model,
    feature_names=["f0", "f1", "f2", "f3", "f4"],
 )
 # Exercise the ML model in Elasticsearch with the training data
 >>> es_model.predict(training_data[0])
 [0 1 1 0 1 0 0 0 1 0]
 ```
 ## Natural language processing (NLP) with PyTorch [ml-nlp-pytorch]
 ::::{important}
 You need to install the appropriate version of PyTorch to import an NLP model. Run `python -m pip install 'eland[pytorch]'` to install that version.
 ::::
 For NLP tasks, Eland enables you to import PyTorch models into {{es}}. Use the `eland_import_hub_model` script to download and install supported [transformer models](https://huggingface.co/transformers) from the [Hugging Face model hub](https://huggingface.co/models). For example:
 ```bash
 eland_import_hub_model <authentication> \ <1>
  --url http://localhost:9200/ \ <2>
  --hub-model-id elastic/distilbert-base-cased-finetuned-conll03-english \ <3>
  --task-type ner \ <4>
  --start
 ```
 1. Use an authentication method to access your cluster. Refer to [Authentication methods](machine-learning.md#ml-nlp-pytorch-auth).
 2. The cluster URL. Alternatively, use `--cloud-id`.
 3. Specify the identifier for the model in the Hugging Face model hub.
 4. Specify the type of NLP task. Supported values are `fill_mask`, `ner`, `question_answering`, `text_classification`, `text_embedding`, `text_expansion`, `text_similarity` and `zero_shot_classification`.
 For more information about the available options, run `eland_import_hub_model` with the `--help` option.
 ```bash
 eland_import_hub_model --help
 ```
 ### Import model with Docker [ml-nlp-pytorch-docker]
 ::::{important}
 To use the Docker container, you need to clone the Eland repository: [https://github.com/elastic/eland](https://github.com/elastic/eland)
 ::::
 If you want to use Eland without installing it, you can use the Docker image.
 You can use the container interactively:
 ```bash
 docker run -it --rm --network host docker.elastic.co/eland/eland
 ```
 Running installed scripts is also possible without an interactive shell, for example:
 ```bash
 docker run -it --rm docker.elastic.co/eland/eland \
    eland_import_hub_model \
      --url $ELASTICSEARCH_URL \
      --hub-model-id elastic/distilbert-base-uncased-finetuned-conll03-english \
      --start
 ```
 Replace `$ELASTICSEARCH_URL` with the URL for your Elasticsearch cluster. For authentication purposes, include an administrator username and password in the URL in the following format: `https://username:password@host:port`.
 ### Install models in an air-gapped environment [ml-nlp-pytorch-air-gapped]
 You can install models in a restricted or closed network by pointing the `eland_import_hub_model` script to local files.
 For an offline install of a Hugging Face model, the model first needs to be cloned locally. Git and [Git Large File Storage](https://git-lfs.com/) must be installed on your system.
 1. Select a model you want to use from Hugging Face. Refer to the [compatible third party model](docs-content://explore-analyze/machine-learning/nlp/ml-nlp-model-ref.md) list for more information on the supported architectures.
 2. Clone the selected model from Hugging Face by using the model URL. For example:
    ```bash
    git clone https://huggingface.co/dslim/bert-base-NER
    ```
    This command results in a local copy of the model in the directory `bert-base-NER`.
 3. Use the `eland_import_hub_model` script with the `--hub-model-id` set to the directory of the cloned model to install it:
    ```bash
    eland_import_hub_model \
          --url 'XXXX' \
          --hub-model-id /PATH/TO/MODEL \
          --task-type ner \
          --es-username elastic --es-password XXX \
          --es-model-id bert-base-ner
    ```
    If you use the Docker image to run `eland_import_hub_model` you must bind mount the model directory, so the container can read the files:
    ```bash
    docker run --mount type=bind,source=/PATH/TO/MODEL,destination=/model,readonly -it --rm docker.elastic.co/eland/eland \
        eland_import_hub_model \
          --url 'XXXX' \
          --hub-model-id /model \
          --task-type ner \
          --es-username elastic --es-password XXX \
          --es-model-id bert-base-ner
    ```
    Once it’s uploaded to {{es}}, the model will have the ID specified by `--es-model-id`. If it is not set, the model ID is derived from `--hub-model-id`; spaces and path delimiters are converted to double underscores `__`.
 ### Connect to Elasticsearch through a proxy [ml-nlp-pytorch-proxy]
 Behind the scenes, Eland uses the `requests` Python library, which [allows configuring proxies through an environment variable](https://requests.readthedocs.io/en/latest/user/advanced/#proxies). For example, to use an HTTP proxy to connect to an HTTPS Elasticsearch cluster, you need to set the `HTTPS_PROXY` environment variable when invoking Eland:
 ```bash
 HTTPS_PROXY=http://proxy-host:proxy-port eland_import_hub_model ...
 ```
 If you disabled security on your Elasticsearch cluster, you should use `HTTP_PROXY` instead.
 ### Authentication methods [ml-nlp-pytorch-auth]
 The following authentication options are available when using the import script:
 * Elasticsearch username and password authentication (specified with the `-u` and `-p` options):
    ```bash
    eland_import_hub_model -u <username> -p <password> --cloud-id <cloud-id> ...
    ```
    These `-u` and `-p` options also work when you use `--url`.
 * Elasticsearch username and password authentication (embedded in the URL):
    ```bash
    eland_import_hub_model --url https://<user>:<password>@<hostname>:<port> ...
    ```
 * Elasticsearch API key authentication:
    ```bash
    eland_import_hub_model --es-api-key <api-key> --url https://<hostname>:<port> ...
    ```
 * HuggingFace Hub access token (for private models):
    ```bash
    eland_import_hub_model --hub-access-token <access-token> ...
    ```
 ### TLS/SSL [ml-nlp-pytorch-tls]
 The following TLS/SSL options for Elasticsearch are available when using the import script:
 * Specify alternate CA bundle to verify the cluster certificate:
    ```bash
    eland_import_hub_model --ca-certs CA_CERTS ...
    ```
 * Disable TLS/SSL verification altogether (strongly discouraged):
    ```bash
    eland_import_hub_model --insecure ...
    ```
--- a/docs/reference/toc.yml
+++ b/docs/reference/toc.yml
@ -0,0 +1,6 @@
 project: 'Eland reference'
 toc:
  - file: index.md
  - file: installation.md
  - file: dataframes.md
  - file: machine-learning.md
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@ -1,13 +1,5 @@
-elasticsearch>=7.7
+matplotlib
 pandas>=1.5
 matplotlib>=3.6
 nbval
 scikit-learn>=0.22.1
 xgboost>=1
 lightgbm
 sphinx==5.3.0
 nbsphinx
 furo
 # traitlets has been having all sorts of release problems lately.
 traitlets<5.1
--- a/docs/sphinx/development/contributing.rst
+++ b/docs/sphinx/development/contributing.rst
@ -200,7 +200,7 @@ Configuring PyCharm And Running Tests
 - To test specific versions of Python run
   .. code-block:: bash
-      nox -s test-3.8
+      nox -s test-3.12
 Documentation
--- a/docs/sphinx/examples/demo_notebook.ipynb
+++ b/docs/sphinx/examples/demo_notebook.ipynb
@ -24,7 +24,7 @@
        "\n",
        "For this example, you will need:\n",
        "\n",
-        "- Python 3.8 or later\n",
+        "- Python 3.9 or later\n",
        "- An Elastic deployment\n",
        "  - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration))\n",
        "\n",
--- a/docs/sphinx/reference/api/eland.DataFrame.rst
+++ b/docs/sphinx/reference/api/eland.DataFrame.rst
@ -49,6 +49,7 @@
      ~DataFrame.tail
      ~DataFrame.to_csv
      ~DataFrame.to_html
      ~DataFrame.to_json
      ~DataFrame.to_numpy
      ~DataFrame.to_pandas
      ~DataFrame.to_string
--- a/docs/sphinx/reference/api/eland.DataFrame.to_json.rst
+++ b/docs/sphinx/reference/api/eland.DataFrame.to_json.rst
@ -0,0 +1,6 @@
 eland.DataFrame.to\_json
 ========================
 .. currentmodule:: eland
 .. automethod:: DataFrame.to_json
--- a/docs/sphinx/reference/api/eland.ml.MLModel.rst
+++ b/docs/sphinx/reference/api/eland.ml.MLModel.rst
@ -17,6 +17,7 @@
      ~MLModel.delete_model
      ~MLModel.exists_model
      ~MLModel.export_model
      ~MLModel.import_ltr_model
      ~MLModel.import_model
      ~MLModel.predict
--- a/docs/sphinx/reference/dataframe.rst
+++ b/docs/sphinx/reference/dataframe.rst
@ -140,5 +140,6 @@ Serialization / IO / Conversion
   DataFrame.to_numpy
   DataFrame.to_csv
   DataFrame.to_html
   DataFrame.to_json
   DataFrame.to_string
   DataFrame.to_pandas
--- a/docs/sphinx/reference/supported_apis.rst
+++ b/docs/sphinx/reference/supported_apis.rst
@ -395,7 +395,7 @@ script instead of being modified manually.
 +---------------------------------------+------------+
 | ``ed.DataFrame.to_html()``            | **Yes**    |
 +---------------------------------------+------------+
-| ``ed.DataFrame.to_json()``            | No         |
+| ``ed.DataFrame.to_json()``            | **Yes**    |
 +---------------------------------------+------------+
 | ``ed.DataFrame.to_latex()``           | No         |
 +---------------------------------------+------------+
--- a/eland/init.py
+++ b/eland/init.py
@ -15,6 +15,8 @@
 #  specific language governing permissions and limitations
 #  under the License.
 import warnings
 from ._version import (  # noqa: F401
    __author__,
    __author_email__,
@ -25,13 +27,16 @@ from ._version import (  # noqa: F401
    __url__,
    __version__,
 )
-from .common import SortOrder
+from .common import ElandDeprecationWarning, SortOrder
 from .dataframe import DataFrame
 from .etl import csv_to_eland, eland_to_pandas, pandas_to_eland
 from .index import Index
 from .ndframe import NDFrame
 from .series import Series
 # Display Eland deprecation warnings by default
 warnings.simplefilter("default", category=ElandDeprecationWarning)
 __all__ = [
    "DataFrame",
    "Series",
--- a/eland/_version.py
+++ b/eland/_version.py
@ -18,7 +18,7 @@
 __title__ = "eland"
 __description__ = "Python Client and Toolkit for DataFrames, Big Data, Machine Learning and ETL in Elasticsearch"
 __url__ = "https://github.com/elastic/eland"
-__version__ = "8.12.0"
+__version__ = "9.0.1"
 __author__ = "Steve Dodson"
 __author_email__ = "steve.dodson@elastic.co"
 __maintainer__ = "Elastic Client Library Maintainers"
--- a/eland/cli/eland_import_hub_model.py
+++ b/eland/cli/eland_import_hub_model.py
@ -32,7 +32,8 @@ import textwrap
 from elastic_transport.client_utils import DEFAULT
 from elasticsearch import AuthenticationException, Elasticsearch
-from eland.common import parse_es_version
+from eland._version import __version__
 from eland.common import is_serverless_es, parse_es_version
 MODEL_HUB_URL = "https://huggingface.co"
@ -40,7 +41,9 @@ MODEL_HUB_URL = "https://huggingface.co"
 def get_arg_parser():
    from eland.ml.pytorch.transformers import SUPPORTED_TASK_TYPES
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(
        exit_on_error=False
    )  # throw exception rather than exit
    location_args = parser.add_mutually_exclusive_group(required=True)
    location_args.add_argument(
        "--url",
@ -96,7 +99,7 @@ def get_arg_parser():
        "--task-type",
        required=False,
        choices=SUPPORTED_TASK_TYPES,
-        help="The task type for the model usage. Will attempt to auto-detect task type for the model if not provided. "
+        help="The task type for the model usage. Use text_similarity for rerank tasks. Will attempt to auto-detect task type for the model if not provided. "
        "Default: auto",
        default="auto",
    )
@ -141,15 +144,47 @@ def get_arg_parser():
        help="String to prepend to model input at search",
    )
    parser.add_argument(
        "--max-model-input-length",
        required=False,
        default=None,
        help="""Set the model's max input length.
                Usually the max input length is derived from the Hugging Face
                model confifguation. Use this option to explicity set the model's
                max input length if the value can not be found in the Hugging
                Face configuration. Max input length should never exceed the
                model's true max length, setting a smaller max length is valid.
                """,
        type=int,
    )
    return parser
 def parse_args():
    """Parse the command-line arguments of the import script.
    The parser comes from ``get_arg_parser()``. Argument errors raised by
    argparse are caught here and reported through ``parser.error`` so that
    an invalid ``--task-type`` can carry an extra hint about rerank tasks.
    Returns the namespace produced by ``parser.parse_args()``.
    """
    cli_parser = get_arg_parser()
    try:
        parsed = cli_parser.parse_args()
    except argparse.ArgumentError as err:
        text = err.message
        if err.argument_name == "--task-type":
            # Point users looking for a rerank task type at the right choice.
            text = text + "\n\nUse 'text_similarity' for rerank tasks in Elasticsearch"
        cli_parser.error(message=text)
    except argparse.ArgumentTypeError as err:
        cli_parser.error(str(err))
    else:
        return parsed
 def get_es_client(cli_args, logger):
    try:
        es_args = {
            "request_timeout": 300,
            "verify_certs": cli_args.insecure,
            "ca_certs": cli_args.ca_certs,
            "node_class": "requests",
        }
        # Deployment location
@ -180,13 +215,20 @@ def get_es_client(cli_args, logger):
 def check_cluster_version(es_client, logger):
    es_info = es_client.info()
    if is_serverless_es(es_client):
        logger.info(f"Connected to serverless cluster '{es_info['cluster_name']}'")
        # Serverless is compatible
        # Return the latest known semantic version, i.e. this version
        return parse_es_version(__version__)
    # check the semantic version for none serverless clusters
    logger.info(
        f"Connected to cluster named '{es_info['cluster_name']}' (version: {es_info['version']['number']})"
    )
    sem_ver = parse_es_version(es_info["version"]["number"])
    major_version = sem_ver[0]
    minor_version = sem_ver[1]
    # NLP models added in 8
    if major_version < 8:
@ -194,14 +236,9 @@ def check_cluster_version(es_client, logger):
            f"Elasticsearch version {major_version} does not support NLP models. Please upgrade Elasticsearch to the latest version"
        )
        exit(1)
-
+    elif major_version < 9:
    # PyTorch was upgraded to version 1.13.1 in 8.7.
    # and is incompatible with earlier versions
    if major_version == 8 and minor_version < 7:
        import torch
        logger.error(
-            f"Eland uses PyTorch version {torch.__version__} which is incompatible with Elasticsearch versions prior to 8.7. Please upgrade Elasticsearch to at least version 8.7"
+            "Eland 9.x does not support Elasticsearch 8.x. Please upgrade Elasticsearch first."
        )
        exit(1)
@ -220,6 +257,7 @@ def main():
            SUPPORTED_TASK_TYPES,
            TaskTypeError,
            TransformerModel,
            UnknownModelInputSizeError,
        )
    except ModuleNotFoundError as e:
        logger.error(
@ -237,7 +275,7 @@ def main():
    assert SUPPORTED_TASK_TYPES
    # Parse arguments
-    args = get_arg_parser().parse_args()
+    args = parse_args()
    # Connect to ES
    logger.info("Establishing connection to Elasticsearch")
@ -259,6 +297,7 @@ def main():
                quantize=args.quantize,
                ingest_prefix=args.ingest_prefix,
                search_prefix=args.search_prefix,
                max_model_input_size=args.max_model_input_length,
            )
            model_path, config, vocab_path = tm.save(tmp_dir)
        except TaskTypeError as err:
@ -266,6 +305,12 @@ def main():
                f"Failed to get model for task type, please provide valid task type via '--task-type' parameter. Caused by {err}"
            )
            exit(1)
        except UnknownModelInputSizeError as err:
            logger.error(
                f"""Could not automatically determine the model's max input size from the model configuration.
                Please provde the max input size via the --max-model-input-length parameter. Caused by {err}"""
            )
            exit(1)
        ptm = PyTorchModel(
            es, args.es_model_id if args.es_model_id else tm.elasticsearch_model_id()
--- a/eland/common.py
+++ b/eland/common.py
@ -52,6 +52,10 @@ PANDAS_VERSION: Tuple[int, ...] = tuple(
 _ELAND_MAJOR_VERSION = int(_eland_version.split(".")[0])
 class ElandDeprecationWarning(DeprecationWarning):
    """Warning category for deprecated functionality in Eland."""
 with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    EMPTY_SERIES_DTYPE = pd.Series().dtype
@ -305,11 +309,15 @@ def elasticsearch_date_to_pandas_date(
 def ensure_es_client(
-    es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch]
+    es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch],
 ) -> Elasticsearch:
    if isinstance(es_client, tuple):
        es_client = list(es_client)
-    if not isinstance(es_client, Elasticsearch):
+    if (
        isinstance(es_client, str)
        or isinstance(es_client, list)
        or isinstance(es_client, tuple)
    ):
        es_client = Elasticsearch(es_client)
    return es_client
@ -340,6 +348,17 @@ def es_version(es_client: Elasticsearch) -> Tuple[int, int, int]:
    return eland_es_version
 def is_serverless_es(es_client: Elasticsearch) -> bool:
    """
    Tell whether ``es_client`` is connected to a serverless Elasticsearch.
    The check calls ``es_client.info()`` and returns ``True`` only when the
    ``version`` section of the response has a ``build_flavor`` entry equal
    to ``"serverless"``; a missing entry yields ``False``.
    """
    version_info = es_client.info()["version"]
    if "build_flavor" not in version_info:
        return False
    return version_info["build_flavor"] == "serverless"
 def parse_es_version(version: str) -> Tuple[int, int, int]:
    """
    Parse the semantic version from a string e.g. '8.8.0'
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@ -34,7 +34,7 @@ from pandas.io.formats.printing import pprint_thing  # type: ignore
 from pandas.util._validators import validate_bool_kwarg  # type: ignore
 import eland.plotting as gfx
-from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
+from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
 from eland.filter import BooleanFilter
 from eland.groupby import DataFrameGroupBy
 from eland.ndframe import NDFrame
@ -83,7 +83,7 @@ class DataFrame(NDFrame):
    3      181.694216       True  ...         0 2018-01-01 10:33:28
    4      730.041778      False  ...         0 2018-01-01 05:13:00
    <BLANKLINE>
-    [5 rows x 27 columns]
+    [5 rows x 28 columns]
    Constructing DataFrame from an Elasticsearch client and an Elasticsearch index
@ -173,13 +173,13 @@ class DataFrame(NDFrame):
        >>> df = ed.DataFrame('http://localhost:9200', 'flights')
        >>> assert isinstance(df.columns, pd.Index)
        >>> df.columns
-        Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
+        Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Cities', 'Dest', 'DestAirportID', 'DestCityName',
-        ...   'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
+               'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
-        ...   'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
+               'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
-        ...   'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
+               'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
-        ...   'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
+               'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
-        ...   'timestamp'],
+               'timestamp'],
-        ...   dtype='object')
+              dtype='object')
        """
        return self._query_compiler.columns
@ -411,9 +411,7 @@ class DataFrame(NDFrame):
            axis = pd.DataFrame._get_axis_name(axis)
            axes = {axis: labels}
        elif index is not None or columns is not None:
-            axes, _ = pd.DataFrame()._construct_axes_from_arguments(
+            axes = {"columns": columns, "index": index}
                (index, columns), {}
            )
        else:
            raise ValueError(
                "Need to specify at least one of 'labels', 'index' or 'columns'"
@ -956,8 +954,10 @@ class DataFrame(NDFrame):
        elif verbose is False:  # specifically set to False, not nesc None
            _non_verbose_repr()
        else:
-            _non_verbose_repr() if exceeds_info_cols else _verbose_repr(
+            (
-                number_of_columns
+                _non_verbose_repr()
                if exceeds_info_cols
                else _verbose_repr(number_of_columns)
            )
        # pandas 0.25.1 uses get_dtype_counts() here. This
@ -1303,6 +1303,7 @@ class DataFrame(NDFrame):
        quoting=None,
        quotechar='"',
        line_terminator=None,
        lineterminator=None,
        chunksize=None,
        tupleize_cols=None,
        date_format=None,
@ -1317,6 +1318,13 @@ class DataFrame(NDFrame):
        --------
        :pandas_api_docs:`pandas.DataFrame.to_csv`
        """
        if line_terminator:
            warnings.warn(
                "The line_terminator argument will be replaced by lineterminator",
                PendingDeprecationWarning,
                stacklevel=2,
            )
        kwargs = {
            "path_or_buf": path_or_buf,
            "sep": sep,
@ -1331,7 +1339,7 @@ class DataFrame(NDFrame):
            "compression": compression,
            "quoting": quoting,
            "quotechar": quotechar,
-            "line_terminator": line_terminator,
+            "lineterminator": lineterminator or line_terminator,
            "chunksize": chunksize,
            "date_format": date_format,
            "doublequote": doublequote,
@ -1340,6 +1348,50 @@ class DataFrame(NDFrame):
        }
        return self._query_compiler.to_csv(**kwargs)
    def to_json(
        self,
        path_or_buf=None,
        orient=None,
        date_format=None,
        double_precision=10,
        force_ascii=True,
        date_unit="ms",
        default_handler=None,
        lines=False,
        compression="infer",
        index=None,
        indent=None,
        storage_options=None,
    ):
        """Write Elasticsearch data to a json file.
        With ``lines=True`` and ``orient='records'`` the DataFrame can be
        written in a streaming manner, so it never has to be held in memory
        as a whole. That layout is known as JSON lines and commonly uses the
        ``.jsonl`` file extension.
        When ``index`` is left as ``None`` and pandas 1 is installed, ``index``
        is set to ``True`` before delegating. All arguments are forwarded as
        keywords to the query compiler's ``to_json`` and its result is returned.
        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.to_json`
        """
        if index is None and PANDAS_VERSION[0] == 1:
            # switch to the pandas 1 default
            index = True
        return self._query_compiler.to_json(
            path_or_buf=path_or_buf,
            orient=orient,
            date_format=date_format,
            double_precision=double_precision,
            force_ascii=force_ascii,
            date_unit=date_unit,
            default_handler=default_handler,
            lines=lines,
            compression=compression,
            index=index,
            indent=indent,
            storage_options=storage_options,
        )
    def to_pandas(self, show_progress: bool = False) -> pd.DataFrame:
        """
        Utility method to convert eland.Dataframe to pandas.Dataframe
@ -1962,9 +2014,9 @@ class DataFrame(NDFrame):
        --------
        >>> df = ed.DataFrame('http://localhost:9200', 'flights')
        >>> df.shape
-        (13059, 27)
+        (13059, 28)
        >>> df.query('FlightDelayMin > 60').shape
-        (2730, 27)
+        (2730, 28)
        """
        if isinstance(expr, BooleanFilter):
            return DataFrame(
--- a/eland/etl.py
+++ b/eland/etl.py
@ -16,6 +16,7 @@
 #  under the License.
 import csv
 import warnings
 from collections import deque
 from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union
@ -110,15 +111,15 @@ def pandas_to_eland(
    2  3.141  1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
    [3 rows x 8 columns]
-    >>> pd_df.dtypes
+    >>> pd_df.dtypes  # doctest skip required for pandas < 2  # doctest: +SKIP
-    A           float64
+    A          float64
-    B             int64
+    B            int64
-    C            object
+    C           object
-    D    datetime64[ns]
+    D    datetime64[s]
-    E           float64
+    E          float64
-    F              bool
+    F             bool
-    G             int64
+    G            int64
-    H            object
+    H           object
    dtype: object
    Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`.
@ -262,7 +263,7 @@ def eland_to_pandas(ed_df: DataFrame, show_progress: bool = False) -> pd.DataFra
    3      181.694216       True  ...         0 2018-01-01 10:33:28
    4      730.041778      False  ...         0 2018-01-01 05:13:00
    <BLANKLINE>
-    [5 rows x 27 columns]
+    [5 rows x 28 columns]
    Convert `eland.DataFrame` to `pandas.DataFrame` (Note: this loads entire Elasticsearch index into core memory)
@ -277,7 +278,7 @@ def eland_to_pandas(ed_df: DataFrame, show_progress: bool = False) -> pd.DataFra
    3      181.694216       True  ...         0 2018-01-01 10:33:28
    4      730.041778      False  ...         0 2018-01-01 05:13:00
    <BLANKLINE>
-    [5 rows x 27 columns]
+    [5 rows x 28 columns]
    Convert `eland.DataFrame` to `pandas.DataFrame` and show progress every 10000 rows
@ -307,9 +308,9 @@ def csv_to_eland(  # type: ignore
    names=None,
    index_col=None,
    usecols=None,
-    squeeze=False,
+    squeeze=None,
    prefix=None,
-    mangle_dupe_cols=True,
+    mangle_dupe_cols=None,
    # General Parsing Configuration
    dtype=None,
    engine=None,
@ -357,6 +358,7 @@ def csv_to_eland(  # type: ignore
    low_memory: bool = _DEFAULT_LOW_MEMORY,
    memory_map=False,
    float_precision=None,
    **extra_kwargs,
 ) -> "DataFrame":
    """
    Read a comma-separated values (csv) file into eland.DataFrame (i.e. an Elasticsearch index).
@ -485,7 +487,6 @@ def csv_to_eland(  # type: ignore
        "usecols": usecols,
        "verbose": verbose,
        "encoding": encoding,
        "squeeze": squeeze,
        "memory_map": memory_map,
        "float_precision": float_precision,
        "na_filter": na_filter,
@ -494,9 +495,9 @@ def csv_to_eland(  # type: ignore
        "error_bad_lines": error_bad_lines,
        "on_bad_lines": on_bad_lines,
        "low_memory": low_memory,
        "mangle_dupe_cols": mangle_dupe_cols,
        "infer_datetime_format": infer_datetime_format,
        "skip_blank_lines": skip_blank_lines,
        **extra_kwargs,
    }
    if chunksize is None:
@ -525,6 +526,18 @@ def csv_to_eland(  # type: ignore
        kwargs.pop("on_bad_lines")
    if "squeeze" in kwargs:
        kwargs.pop("squeeze")
        warnings.warn(
            "This argument no longer works, use .squeeze('columns') on your DataFrame instead"
        )
    if "mangle_dupe_cols" in kwargs:
        kwargs.pop("mangle_dupe_cols")
        warnings.warn(
            "The mangle_dupe_cols argument no longer works. Furthermore, "
            "duplicate columns will automatically get a number suffix."
        )
    # read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch)
    reader = pd.read_csv(filepath_or_buffer, **kwargs)
--- a/eland/field_mappings.py
+++ b/eland/field_mappings.py
@ -443,9 +443,9 @@ class FieldMappings:
                try:
                    series = df.loc[df.es_field_name == es_field_name_keyword]
                    if not series.empty and series.is_aggregatable.squeeze():
-                        row_as_dict[
+                        row_as_dict["aggregatable_es_field_name"] = (
-                            "aggregatable_es_field_name"
+                            es_field_name_keyword
-                        ] = es_field_name_keyword
+                        )
                    else:
                        row_as_dict["aggregatable_es_field_name"] = None
                except KeyError:
@ -712,8 +712,11 @@ class FieldMappings:
            capabilities, orient="index", columns=FieldMappings.column_labels
        )
-        self._mappings_capabilities = self._mappings_capabilities.append(
+        self._mappings_capabilities = pd.concat(
-            capability_matrix_row
+            [
                self._mappings_capabilities,
                capability_matrix_row,
            ]
        )
    def numeric_source_fields(self) -> List[str]:
--- a/eland/index.py
+++ b/eland/index.py
@ -50,10 +50,7 @@ class Index:
        # index_field.setter
        self._is_source_field = False
-        # The type:ignore is due to mypy not being smart enough
+        self.es_index_field = es_index_field
        # to recognize the property.setter has a different type
        # than the property.getter.
        self.es_index_field = es_index_field  # type: ignore
    @property
    def sort_field(self) -> str:
--- a/eland/ml/_model_serializer.py
+++ b/eland/ml/_model_serializer.py
@ -19,7 +19,7 @@ import base64
 import gzip
 import json
 from abc import ABC
-from typing import Any, Dict, List, Optional, Sequence
+from typing import Any, Dict, List, Optional, Sequence, Tuple
 def add_if_exists(d: Dict[str, Any], k: str, v: Any) -> None:
@ -58,6 +58,9 @@ class ModelSerializer(ABC):
            "ascii"
        )
    def bounds(self) -> Tuple[float, float]:
        """Return a ``(lower, upper)`` pair of floats for this serialized model.
        The base class has no implementation and always raises
        ``NotImplementedError``.
        NOTE(review): presumably the pair is the range of values the model can
        output -- confirm against the callers.
        """
        raise NotImplementedError
 class TreeNode:
    def __init__(
@ -96,6 +99,7 @@ class TreeNode:
            add_if_exists(d, "split_feature", self._split_feature)
            add_if_exists(d, "threshold", self._threshold)
            add_if_exists(d, "number_samples", self._number_samples)
            add_if_exists(d, "default_left", self._default_left)
        else:
            if len(self._leaf_value) == 1:
                # Support Elasticsearch 7.6 which only
@ -128,6 +132,14 @@ class Tree(ModelSerializer):
        add_if_exists(d, "tree_structure", [t.to_dict() for t in self._tree_structure])
        return {"tree": d}
    def bounds(self) -> Tuple[float, float]:
        """Return the smallest and largest leaf value found in this tree.
        Only nodes whose ``_leaf_value`` is not ``None`` are considered, and
        only the first entry of each ``_leaf_value`` is used.
        """
        first_leaf_values = []
        for node in self._tree_structure:
            if node._leaf_value is not None:
                first_leaf_values.append(node._leaf_value[0])
        return min(first_leaf_values), max(first_leaf_values)
 class Ensemble(ModelSerializer):
    def __init__(
@ -157,3 +169,9 @@ class Ensemble(ModelSerializer):
        add_if_exists(d, "classification_weights", self._classification_weights)
        add_if_exists(d, "aggregate_output", self._output_aggregator)
        return {"ensemble": d}
    def bounds(self) -> Tuple[float, float]:
        """Return the summed bounds of all trained models in the ensemble.

        Each model's ``bounds()`` is queried; the lower bounds are added up to
        form the first element of the result and the upper bounds form the
        second.

        Returns
        -------
        Tuple[float, float]
            ``(sum of lower bounds, sum of upper bounds)``.
        """
        per_model_bounds = [model.bounds() for model in self._trained_models]
        # zip(*...) transposes [(lo, hi), ...] into (all lows, all highs).
        lower_total, upper_total = (sum(column) for column in zip(*per_model_bounds))
        return lower_total, upper_total
--- a/eland/ml/exporters/init.py
+++ b/eland/ml/exporters/init.py
@ -1,16 +0,0 @@
 #  Licensed to Elasticsearch B.V. under one or more contributor
 #  license agreements. See the NOTICE file distributed with
 #  this work for additional information regarding copyright
 #  ownership. Elasticsearch B.V. licenses this file to you under
 #  the Apache License, Version 2.0 (the "License"); you may
 #  not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 # 	http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
--- a/eland/ml/exporters/_sklearn_deserializers.py
+++ b/eland/ml/exporters/_sklearn_deserializers.py
@ -1,222 +0,0 @@
 #  Licensed to Elasticsearch B.V. under one or more contributor
 #  license agreements. See the NOTICE file distributed with
 #  this work for additional information regarding copyright
 #  ownership. Elasticsearch B.V. licenses this file to you under
 #  the Apache License, Version 2.0 (the "License"); you may
 #  not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 # 	http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
 from typing import Any, Dict
 import numpy as np
 from .._optional import import_optional_dependency
 import_optional_dependency("sklearn", on_version="warn")
 import sklearn
 from sklearn.preprocessing import FunctionTransformer
 class Tree:
    """Wrapper to create sklearn Tree objects from Elastic ML tree
    description in JSON format.
    """
    def __init__(
        self,
        json_tree: Dict[str, Any],
        feature_names_map: Dict[str, int],
    ):
        """Build an sklearn tree object from one Elastic ML tree definition.

        Parameters
        ----------
        json_tree : Dict[str, Any]
            Tree description; the keys ``tree_structure`` (list of node
            mappings) and ``feature_names`` are read here.
        feature_names_map : Dict[str, int]
            Maps a feature name to the column index stored in the sklearn tree.

        Notes
        -----
        Sets ``self.tree`` (the sklearn tree), ``self.max_depth`` and
        ``self.n_outputs``.
        """
        # Child index used for "no child"; _compute_expectations stops on -1.
        tree_leaf = -1
        node_count = len(json_tree["tree_structure"])
        children_left = np.ones((node_count,), dtype=int) * tree_leaf
        children_right = np.ones((node_count,), dtype=int) * tree_leaf
        # Nodes that never get a split keep -2 for feature and threshold;
        # presumably this mirrors sklearn's "undefined" marker -- TODO confirm.
        feature = np.ones((node_count,), dtype=int) * -2
        threshold = np.ones((node_count,), dtype=float) * -2
        impurity = np.zeros((node_count,), dtype=float)
        # value works only for regression and binary classification
        value = np.zeros((node_count, 1, 1), dtype="<f8")
        n_node_samples = np.zeros((node_count,), dtype=int)
        # parse values from the JSON tree
        feature_names = json_tree["feature_names"]
        for json_node in json_tree["tree_structure"]:
            node_id = json_node["node_index"]
            # Sample counts are optional in the JSON; default to 0 when absent.
            if "number_samples" in json_node:
                n_node_samples[node_id] = json_node["number_samples"]
            else:
                n_node_samples[node_id] = 0
            if "leaf_value" not in json_node:
                # Split node: record children, split column and threshold.
                children_left[node_id] = json_node["left_child"]
                children_right[node_id] = json_node["right_child"]
                # "split_feature" indexes into the tree's own feature_names;
                # translate that name to the caller-supplied column index.
                feature[node_id] = feature_names_map[
                    feature_names[json_node["split_feature"]]
                ]
                threshold[node_id] = json_node["threshold"]
                # The split gain is stored in the impurity slot; -1 when missing.
                if "split_gain" in json_node:
                    impurity[node_id] = json_node["split_gain"]
                else:
                    impurity[node_id] = -1
            else:
                # Leaf node: only the predicted value is stored.
                value[node_id, 0, 0] = json_node["leaf_value"]
        # iterate through tree to get max depth and expected values
        # (the call fills `value` for split nodes in place, starting at node 0)
        weighted_n_node_samples = n_node_samples.copy()
        self.max_depth = Tree._compute_expectations(
            children_left=children_left,
            children_right=children_right,
            node_sample_weight=weighted_n_node_samples,
            values=value,
            node_index=0,
        )
        self.n_outputs = value.shape[-1]
        # initialize the sklearn tree
        # NOTE(review): the arguments look like (n_features, n_classes,
        # n_outputs) of the private sklearn Tree class -- confirm against the
        # installed sklearn version.
        self.tree = sklearn.tree._tree.Tree(
            len(feature_names), np.array([1], dtype=int), 1
        )
        # One record per node, in the field order declared by the dtype below.
        node_state = np.array(
            [
                (
                    children_left[i],
                    children_right[i],
                    feature[i],
                    threshold[i],
                    impurity[i],
                    n_node_samples[i],
                    weighted_n_node_samples[i],
                    # missing_go_to_left is set for every node.
                    True,
                )
                for i in range(node_count)
            ],
            dtype={
                "names": [
                    "left_child",
                    "right_child",
                    "feature",
                    "threshold",
                    "impurity",
                    "n_node_samples",
                    "weighted_n_node_samples",
                    "missing_go_to_left",
                ],
                "formats": ["<i8", "<i8", "<i8", "<f8", "<f8", "<i8", "<f8", "u1"],
            },
        )
        state = {
            "max_depth": self.max_depth,
            "node_count": node_count,
            "nodes": node_state,
            "values": value,
        }
        # Load the assembled arrays into the sklearn tree through its state hook.
        self.tree.__setstate__(state)
    @staticmethod
    def _compute_expectations(
        children_left, children_right, node_sample_weight, values, node_index
    ) -> int:
        if children_right[node_index] == -1:
            return 0
        left_index = children_left[node_index]
        right_index = children_right[node_index]
        depth_left = Tree._compute_expectations(
            children_left, children_right, node_sample_weight, values, left_index
        )
        depth_right = Tree._compute_expectations(
            children_left, children_right, node_sample_weight, values, right_index
        )
        left_weight = node_sample_weight[left_index]
        right_weight = node_sample_weight[right_index]
        v = (
            (
                left_weight * values[left_index, :]
                + right_weight * values[right_index, :]
            )
            / (left_weight + right_weight)
            if left_weight + right_weight > 0
            else 0
        )
        values[node_index, :] = v
        return max(depth_left, depth_right) + 1
 class TargetMeanEncoder(FunctionTransformer):
    """FunctionTransformer implementation of the target mean encoder, which is
    deserialized from the Elastic ML preprocessor description in JSON format.

    Every category of the input column is replaced by the value stored for it
    in the preprocessor's ``target_map``; categories that are not in the map
    are replaced by the preprocessor's ``default_value``.
    """
    def __init__(self, preprocessor: Dict[str, Any]):
        """
        Parameters
        ----------
        preprocessor : Dict[str, Any]
            Preprocessor definition with a ``target_mean_encoding`` mapping
            that provides ``target_map``, ``feature_name``, ``field`` and
            ``default_value``.

        Raises
        ------
        KeyError
            If one of the keys listed above is missing.
        """
        self.preprocessor = preprocessor
        encoding = self.preprocessor["target_mean_encoding"]
        target_map = encoding["target_map"]
        feature_name_out = encoding["feature_name"]
        self.field_name_in = encoding["field"]
        fallback_value = encoding["default_value"]
        def func(column):
            # The map is looked up by the string form of the category, so the
            # membership test must use the same string form; otherwise
            # non-string categories (e.g. integers) always get the fallback.
            return np.array(
                [
                    target_map.get(str(category), fallback_value)
                    for category in column
                ]
            ).reshape(-1, 1)
        def feature_names_out(ft, carr):
            # Rename only the encoded input field; other names pass through.
            return [feature_name_out if c == self.field_name_in else c for c in carr]
        super().__init__(func=func, feature_names_out=feature_names_out)
 class FrequencyEncoder(FunctionTransformer):
    """FunctionTransformer implementation of the frequency encoder, which is
    deserialized from the Elastic ML preprocessor description in JSON format.

    Every category of the input column is replaced by the value stored for it
    in the preprocessor's ``frequency_map``; categories that are not in the
    map are replaced by 0.0.
    """
    def __init__(self, preprocessor: Dict[str, Any]):
        """
        Parameters
        ----------
        preprocessor : Dict[str, Any]
            Preprocessor definition with a ``frequency_encoding`` mapping that
            provides ``frequency_map``, ``feature_name`` and ``field``.

        Raises
        ------
        KeyError
            If one of the keys listed above is missing.
        """
        self.preprocessor = preprocessor
        encoding = self.preprocessor["frequency_encoding"]
        frequency_map = encoding["frequency_map"]
        feature_name_out = encoding["feature_name"]
        self.field_name_in = encoding["field"]
        fallback_value = 0.0
        def func(column):
            # The map is looked up by the string form of the category, so the
            # membership test must use the same string form; otherwise
            # non-string categories (e.g. integers) always get the fallback.
            return np.array(
                [
                    frequency_map.get(str(category), fallback_value)
                    for category in column
                ]
            ).reshape(-1, 1)
        def feature_names_out(ft, carr):
            # Rename only the encoded input field; other names pass through.
            return [feature_name_out if c == self.field_name_in else c for c in carr]
        super().__init__(func=func, feature_names_out=feature_names_out)
 class OneHotEncoder(sklearn.preprocessing.OneHotEncoder):
    """sklearn one-hot encoder configured from an Elastic ML preprocessor
    description in JSON format.

    The encoder's categories are the keys of the preprocessor's ``hot_map``
    and unknown categories are ignored.
    """
    def __init__(self, preprocessor: Dict[str, Any]):
        """
        Parameters
        ----------
        preprocessor : Dict[str, Any]
            Preprocessor definition with a ``one_hot_encoding`` mapping that
            provides ``field`` and ``hot_map``.
        """
        self.preprocessor = preprocessor
        one_hot_config = self.preprocessor["one_hot_encoding"]
        self.field_name_in = one_hot_config["field"]
        # A single input column, hence one list of categories.
        self.cats = [list(one_hot_config["hot_map"])]
        super().__init__(categories=self.cats, handle_unknown="ignore")
--- a/eland/ml/exporters/common.py
+++ b/eland/ml/exporters/common.py
@ -1,46 +0,0 @@
 #  Licensed to Elasticsearch B.V. under one or more contributor
 #  license agreements. See the NOTICE file distributed with
 #  this work for additional information regarding copyright
 #  ownership. Elasticsearch B.V. licenses this file to you under
 #  the Apache License, Version 2.0 (the "License"); you may
 #  not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 # 	http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
 import eland
 class ModelDefinitionKeyError(Exception):
    """
    Raised when an expected key is missing from the model definition.
    Attributes:
        missed_key (str): The key that was not found in the model definition.
    Examples:
        model_definition = {"key1": "value1", "key2": "value2"}
        try:
            model_definition["key3"]
        except KeyError as ex:
            raise ModelDefinitionKeyError(ex) from ex
    """
    def __init__(self, ex: KeyError):
        # The first argument of a KeyError is the key that was looked up.
        missing_key = ex.args[0]
        self.missed_key = missing_key
    def __str__(self):
        # Adjacent literals are concatenated into one message.
        message = (
            f'Key "{self.missed_key}" is not available. '
            "The model definition may have changed. "
            "Make sure you are using an Elasticsearch version compatible "
            f"with Eland {eland.__version__}."
        )
        return message
--- a/eland/ml/exporters/es_gb_models.py
+++ b/eland/ml/exporters/es_gb_models.py
@ -1,472 +0,0 @@
 #  Licensed to Elasticsearch B.V. under one or more contributor
 #  license agreements. See the NOTICE file distributed with
 #  this work for additional information regarding copyright
 #  ownership. Elasticsearch B.V. licenses this file to you under
 #  the Apache License, Version 2.0 (the "License"); you may
 #  not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 # 	http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
 from abc import ABC
 from typing import Any, List, Literal, Mapping, Optional, Set, Tuple, Union
 import numpy as np
 from elasticsearch import Elasticsearch
 from numpy.typing import ArrayLike
 from .._optional import import_optional_dependency
 import_optional_dependency("sklearn", on_version="warn")
 from sklearn.dummy import DummyClassifier, DummyRegressor
 from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
 from sklearn.ensemble._gb_losses import (
    BinomialDeviance,
    HuberLossFunction,
    LeastSquaresError,
 )
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.utils.validation import check_array
 from eland.common import ensure_es_client
 from eland.ml.common import TYPE_CLASSIFICATION, TYPE_REGRESSION
 from ._sklearn_deserializers import Tree
 from .common import ModelDefinitionKeyError
 class ESGradientBoostingModel(ABC):
    """
    Abstract class for converting Elastic ML model into sklearn Pipeline.
    """
    def __init__(
        self,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
    ) -> None:
        """
        Fetch the trained model from Elasticsearch and deserialize its trees.

        Parameters
        ----------
        es_client : Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance
        model_id : str
            The unique identifier of the trained inference model in Elasticsearch.
        Raises
        ------
        RuntimeError
            On failure to retrieve trained model information to the specified model ID.
        ValueError
            The model is expected to be trained in Elastic Stack. Models initially imported
            from xgboost, lgbm, or sklearn are not supported.
        """
        self.es_client: Elasticsearch = ensure_es_client(es_client)
        self.model_id = model_id
        self._trained_model_result = self.es_client.ml.get_trained_models(
            model_id=self.model_id,
            decompress_definition=True,
            include=["hyperparameters", "definition"],
        )
        if (
            "trained_model_configs" not in self._trained_model_result
            or len(self._trained_model_result["trained_model_configs"]) == 0
        ):
            raise RuntimeError(
                f"Failed to retrieve the trained model for model ID {self.model_id!r}"
            )
        model_config = self._trained_model_result["trained_model_configs"][0]
        # The check is about missing metadata, so the message says so (it used
        # to report an unrelated "prior class probability" problem).
        if "metadata" not in model_config:
            raise ValueError(
                f"Trained model {self.model_id!r} has no metadata. "
                + "Note: only export of models trained in the Elastic Stack is supported."
            )
        # The `preprocessors` property already falls back to [] when the
        # definition has no preprocessors.
        preprocessors = self.preprocessors
        (
            self.feature_names_in_,
            self.input_field_names,
        ) = ESGradientBoostingModel._get_feature_names_in_(
            preprocessors,
            self._definition["trained_model"]["ensemble"]["feature_names"],
            model_config["input"]["field_names"],
        )
        feature_names_map = {name: i for i, name in enumerate(self.feature_names_in_)}
        trained_models = self._definition["trained_model"]["ensemble"]["trained_models"]
        self._trees = [
            Tree(trained_model["tree"], feature_names_map)
            for trained_model in trained_models
        ]
        # 0's tree is the constant estimator
        self.n_estimators = len(trained_models) - 1
    def _initialize_estimators(self, decision_tree_type) -> None:
        self.estimators_ = np.ndarray(
            (len(self._trees) - 1, 1), dtype=decision_tree_type
        )
        self.n_estimators_ = self.estimators_.shape[0]
        for i in range(self.n_estimators_):
            estimator = decision_tree_type()
            estimator.tree_ = self._trees[i + 1].tree
            estimator.n_features_in_ = self.n_features_in_
            estimator.max_depth = self._max_depth
            estimator.max_features_ = self.max_features_
            self.estimators_[i, 0] = estimator
    def _extract_common_parameters(self) -> None:
        self.n_features_in_ = len(self.feature_names_in_)
        self.max_features_ = self.n_features_in_
    @property
    def _max_depth(self) -> int:
        return max(map(lambda x: x.max_depth, self._trees))
    @property
    def _n_outputs(self) -> int:
        return self._trees[0].n_outputs
    @property
    def _definition(self) -> Mapping[Union[str, int], Any]:
        return self._trained_model_result["trained_model_configs"][0]["definition"]
    @staticmethod
    def _get_feature_names_in_(
        preprocessors, feature_names, field_names
    ) -> Tuple[List[str], Set[str]]:
        input_field_names = set()
        def add_input_field_name(preprocessor_type: str, feature_name: str) -> None:
            if feature_name in feature_names:
                input_field_names.add(preprocessor[preprocessor_type]["field"])
        for preprocessor in preprocessors:
            if "target_mean_encoding" in preprocessor:
                add_input_field_name(
                    "target_mean_encoding",
                    preprocessor["target_mean_encoding"]["feature_name"],
                )
            elif "frequency_encoding" in preprocessor:
                add_input_field_name(
                    "frequency_encoding",
                    preprocessor["frequency_encoding"]["feature_name"],
                )
            elif "one_hot_encoding" in preprocessor:
                for feature_name in preprocessor["one_hot_encoding"][
                    "hot_map"
                ].values():
                    add_input_field_name("one_hot_encoding", feature_name)
        for field_name in field_names:
            if field_name in feature_names and field_name not in input_field_names:
                input_field_names.add(field_name)
        return feature_names, input_field_names
    @property
    def preprocessors(self) -> List[Any]:
        """
        Returns the list of preprocessor JSON definitions.
        Returns
        -------
        List[Any]
            List of preprocessors definitions or [].
        """
        if "preprocessors" in self._definition:
            return self._definition["preprocessors"]
        return []
    def fit(self, X, y, sample_weight=None, monitor=None) -> None:
        """
        Override of the sklearn fit() method. It does nothing since Elastic ML models are
        trained in the Elastic Stack or imported.
        """
        # Do nothing, model if fitted using Elasticsearch API
        pass
 class ESGradientBoostingClassifier(ESGradientBoostingModel, GradientBoostingClassifier):
    """
    Elastic ML model wrapper compatible with sklearn GradientBoostingClassifier.
    """
    def __init__(
        self,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
    ) -> None:
        """
        Load the trained model and set up the sklearn classifier state.

        Parameters
        ----------
        es_client : Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance
        model_id : str
            The unique identifier of the trained inference model in Elasticsearch.
        Raises
        ------
        NotImplementedError
            Multi-class classification is not supported at the moment.
        ValueError
            The classifier should be defined for at least 2 classes.
        ModelDefinitionKeyError
            If required data cannot be extracted from the model definition due to a schema change.
        """
        try:
            # Fetch the model and deserialize its trees first; the sklearn
            # constructor below needs n_estimators and the maximum depth.
            ESGradientBoostingModel.__init__(self, es_client, model_id)
            self._extract_common_parameters()
            GradientBoostingClassifier.__init__(
                self,
                learning_rate=1.0,
                n_estimators=self.n_estimators,
                max_depth=self._max_depth,
            )
            # Class labels are optional in the ensemble definition.
            if "classification_labels" in self._definition["trained_model"]["ensemble"]:
                self.classes_ = np.array(
                    self._definition["trained_model"]["ensemble"][
                        "classification_labels"
                    ]
                )
            else:
                self.classes_ = None
            self.n_outputs = self._n_outputs
            # Number of classes: from the labels when present, otherwise
            # derived from the number of tree outputs (at least 2).
            if self.classes_ is not None:
                self.n_classes_ = len(self.classes_)
            elif self.n_outputs <= 2:
                self.n_classes_ = 2
            else:
                self.n_classes_ = self.n_outputs
            if self.n_classes_ == 2:
                self._loss = BinomialDeviance(self.n_classes_)
                # self.n_outputs = 1
            elif self.n_classes_ > 2:
                raise NotImplementedError("Only binary classification is implemented.")
            else:
                raise ValueError(f"At least 2 classes required. got {self.n_classes_}.")
            # Initial (prior) estimator built from the first tree, then one
            # decision tree estimator for each remaining tree.
            self.init_ = self._initialize_init_()
            self._initialize_estimators(DecisionTreeClassifier)
        except KeyError as ex:
            # A missing key anywhere above is reported as a schema problem.
            raise ModelDefinitionKeyError(ex) from ex
    @property
    def analysis_type(self) -> Literal["classification"]:
        """Analysis type handled by this wrapper (``TYPE_CLASSIFICATION``)."""
        return TYPE_CLASSIFICATION
    def _initialize_init_(self) -> DummyClassifier:
        """Build the prior estimator from the value of the first tree.

        The first value of the first tree is read as log-odds and converted to
        a class prior with the logistic function.

        Raises
        ------
        ValueError
            If that value is NaN.
        NotImplementedError
            If the model does not have exactly two classes.
        """
        prior_estimator = DummyClassifier(strategy="prior")
        prior_estimator.n_classes_ = self.n_classes_
        prior_estimator.n_outputs_ = self.n_outputs
        prior_estimator.classes_ = np.arange(self.n_classes_)
        prior_estimator._strategy = prior_estimator.strategy
        if self.n_classes_ != 2:
            raise NotImplementedError("Only binary classification is implemented.")
        log_odds = self._trees[0].tree.value.flatten()[0]
        if np.isnan(log_odds):
            raise ValueError(
                "Error initializing sklearn classifier. Incorrect prior class probability. "
                + "Note: only export of models trained in the Elastic Stack is supported."
            )
        # Logistic function: log-odds -> probability of the second class.
        class_prior = 1 / (1 + np.exp(-log_odds))
        prior_estimator.class_prior_ = np.array([1 - class_prior, class_prior])
        return prior_estimator
    def predict_proba(
        self, X, feature_names_in: Optional[Union["ArrayLike", List[str]]] = None
    ) -> "ArrayLike":
        """Predict class probabilities for X.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        feature_names_in : {array of string, list of string} of length n_features.
            Names of the columns of X, in column order. When given, X is
            reduced and reordered to the columns the model was trained on;
            this matters because a ColumnTransformer earlier in the pipeline
            can extend the column list. By default None.
        Returns
        -------
        ArrayLike of shape (n_samples, n_classes)
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
        Raises
        ------
        ValueError
            If the number of columns of X differs from the number of names.
        """
        if feature_names_in is not None:
            if X.shape[1] != len(feature_names_in):
                raise ValueError(
                    f"Dimension mismatch: X with {X.shape[1]} columns has to be the same size as feature_names_in with {len(feature_names_in)}."
                )
            # list.index is used below, so numpy arrays are turned into lists.
            if isinstance(feature_names_in, np.ndarray):
                feature_names_in = feature_names_in.tolist()
            # select columns used by the model in the correct order
            column_order = [feature_names_in.index(fn) for fn in self.feature_names_in_]
            X = X[:, column_order]
        checked = check_array(X)
        return GradientBoostingClassifier.predict_proba(self, checked)
    def predict(
        self,
        X: "ArrayLike",
        feature_names_in: Optional[Union["ArrayLike", List[str]]] = None,
    ) -> "ArrayLike":
        """Predict class for X.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        feature_names_in : {array of string, list of string} of length n_features.
            Names of the columns of X, in column order. When given, X is
            reduced and reordered to the columns the model was trained on;
            this matters because a ColumnTransformer earlier in the pipeline
            can extend the column list. By default None.
        Returns
        -------
        ArrayLike of shape (n_samples,)
            The predicted values.
        Raises
        ------
        ValueError
            If the number of columns of X differs from the number of names.
        """
        if feature_names_in is not None:
            if X.shape[1] != len(feature_names_in):
                raise ValueError(
                    f"Dimension mismatch: X with {X.shape[1]} columns has to be the same size as feature_names_in with {len(feature_names_in)}."
                )
            # list.index is used below, so numpy arrays are turned into lists.
            if isinstance(feature_names_in, np.ndarray):
                feature_names_in = feature_names_in.tolist()
            # select columns used by the model in the correct order
            column_order = [feature_names_in.index(fn) for fn in self.feature_names_in_]
            X = X[:, column_order]
        checked = check_array(X)
        return GradientBoostingClassifier.predict(self, checked)
 class ESGradientBoostingRegressor(ESGradientBoostingModel, GradientBoostingRegressor):
    """
    Elastic ML model wrapper compatible with sklearn GradientBoostingRegressor.
    """
    def __init__(
        self,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
    ) -> None:
        """
        Load the trained model and set up the sklearn regressor state.

        Parameters
        ----------
        es_client : Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance
        model_id : str
            The unique identifier of the trained inference model in Elasticsearch.
        Raises
        ------
        NotImplementedError
            Only MSE, MSLE, and Huber loss functions are supported.
        ModelDefinitionKeyError
            If required data cannot be extracted from the model definition due to a schema change.
        """
        try:
            ESGradientBoostingModel.__init__(self, es_client, model_id)
            self._extract_common_parameters()
            GradientBoostingRegressor.__init__(
                self,
                learning_rate=1.0,
                n_estimators=self.n_estimators,
                max_depth=self._max_depth,
            )
            self.n_outputs = 1
            # Analysis settings recorded in the model metadata for this
            # analysis type; the loss configuration is read from them.
            analysis_settings = self._trained_model_result["trained_model_configs"][0][
                "metadata"
            ]["analytics_config"]["analysis"][self.analysis_type]
            loss_name = analysis_settings["loss_function"]
            if loss_name in ("mse", "msle"):
                self.criterion = "squared_error"
                self._loss = LeastSquaresError()
            elif loss_name == "huber":
                loss_parameter = analysis_settings["loss_function_parameter"]
                self.criterion = "huber"
                self._loss = HuberLossFunction(loss_parameter)
            else:
                raise NotImplementedError(
                    "Only MSE, MSLE and Huber loss functions are supported."
                )
            self.init_ = self._initialize_init_()
            self._initialize_estimators(DecisionTreeRegressor)
        except KeyError as ex:
            # A missing key anywhere above is reported as a schema problem.
            raise ModelDefinitionKeyError(ex) from ex
    @property
    def analysis_type(self) -> Literal["regression"]:
        """Analysis type handled by this wrapper (``TYPE_REGRESSION``)."""
        return TYPE_REGRESSION
    def _initialize_init_(self) -> DummyRegressor:
        """Build the constant initial estimator from the first tree.

        The constant is the first entry of the first tree's ``value`` array.
        """
        baseline = self._trees[0].tree.value[0]
        constant_estimator = DummyRegressor(strategy="constant", constant=baseline)
        # Set the fitted attributes directly; fit() is never called here.
        constant_estimator.constant_ = np.array([baseline])
        constant_estimator.n_outputs_ = 1
        return constant_estimator
    def predict(
        self,
        X: "ArrayLike",
        feature_names_in: Optional[Union["ArrayLike", List[str]]] = None,
    ) -> "ArrayLike":
        """Predict targets for X.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        feature_names_in : {array of string, list of string} of length n_features.
            Feature names of the corresponding columns in X. Important, since the column list
            can be extended by ColumnTransformer through the pipeline. By default None.
        Returns
        -------
        ArrayLike of shape (n_samples,)
            The predicted values.
        Raises
        ------
        ValueError
            If the number of columns of X differs from the number of names.
        """
        if feature_names_in is not None:
            if X.shape[1] != len(feature_names_in):
                raise ValueError(
                    f"Dimension mismatch: X with {X.shape[1]} columns has to be the same size as feature_names_in with {len(feature_names_in)}."
                )
            # The type check must look at the names, not at X: a list of names
            # has no .tolist(), and an ndarray of names has no .index().
            if isinstance(feature_names_in, np.ndarray):
                feature_names_in = feature_names_in.tolist()
            # select columns used by the model in the correct order
            X = X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]
        X = check_array(X)
        return GradientBoostingRegressor.predict(self, X)
--- a/eland/ml/ml_model.py
+++ b/eland/ml/ml_model.py
@ -20,7 +20,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Uni
 import elasticsearch
 import numpy as np
-from eland.common import ensure_es_client, es_version
+from eland.common import ensure_es_client, es_version, is_serverless_es
 from eland.utils import deprecated_api
 from .common import TYPE_CLASSIFICATION, TYPE_LEARNING_TO_RANK, TYPE_REGRESSION
@ -38,7 +38,6 @@ if TYPE_CHECKING:
            RandomForestClassifier,
            RandomForestRegressor,
        )
        from sklearn.pipeline import Pipeline  # type: ignore # noqa: F401
        from sklearn.tree import (  # type: ignore # noqa: F401
            DecisionTreeClassifier,
            DecisionTreeRegressor,
@ -504,7 +503,9 @@ class MLModel:
        )
        serializer = transformer.transform()
        model_type = transformer.model_type
-        default_inference_config: Mapping[str, Mapping[str, Any]] = {model_type: {}}
+
        if inference_config is None:
            inference_config = {model_type: {}}
        if es_if_exists is None:
            es_if_exists = "fail"
@ -523,18 +524,25 @@ class MLModel:
        elif es_if_exists == "replace":
            ml_model.delete_model()
        trained_model_input = None
        is_ltr = next(iter(inference_config)) is TYPE_LEARNING_TO_RANK
        if not is_ltr or (
            es_version(es_client) < (8, 15) and not is_serverless_es(es_client)
        ):
            trained_model_input = {"field_names": feature_names}
        if es_compress_model_definition:
            ml_model._client.ml.put_trained_model(
                model_id=model_id,
-                input={"field_names": feature_names},
+                inference_config=inference_config,
-                inference_config=inference_config or default_inference_config,
+                input=trained_model_input,
                compressed_definition=serializer.serialize_and_compress_model(),
            )
        else:
            ml_model._client.ml.put_trained_model(
                model_id=model_id,
-                input={"field_names": feature_names},
+                inference_config=inference_config,
-                inference_config=inference_config or default_inference_config,
+                input=trained_model_input,
                definition=serializer.serialize_model(),
            )
@ -563,83 +571,6 @@ class MLModel:
            return False
        return True
    def export_model(self) -> "Pipeline":
        """Rebuild this Elastic ML model as a scikit-learn ``Pipeline``.
        Returns
        -------
        sklearn.pipeline.Pipeline
            Two-step pipeline: a ``"preprocessor"`` ``ColumnTransformer`` holding
            one encoder per preprocessor of the model (remaining columns are
            passed through), followed by the ``"es_model"`` gradient boosting
            estimator.
        Raises
        ------
        AssertionError
            If preprocessors JSON definition has unexpected schema.
        ValueError
            The model is expected to be trained in Elastic Stack. Models initially imported
            from xgboost, lgbm, or sklearn are not supported.
        ValueError
            If unexpected categorical encoding is found in the list of preprocessors.
        NotImplementedError
            Only regression and binary classification models are supported currently.
        """
        from sklearn.compose import ColumnTransformer  # type: ignore # noqa: F401
        from sklearn.pipeline import Pipeline
        from .exporters._sklearn_deserializers import (
            FrequencyEncoder,
            OneHotEncoder,
            TargetMeanEncoder,
        )
        from .exporters.es_gb_models import (
            ESGradientBoostingClassifier,
            ESGradientBoostingRegressor,
        )
        # Pick the estimator class first, then build it once.
        if self.model_type == TYPE_CLASSIFICATION:
            estimator_cls = ESGradientBoostingClassifier
        elif self.model_type == TYPE_REGRESSION:
            estimator_cls = ESGradientBoostingRegressor
        else:
            raise NotImplementedError(
                "Only regression and binary classification models are supported currently."
            )
        model = estimator_cls(es_client=self._client, model_id=self._model_id)
        # encoding name -> (encoder class, whether the column is given as a list)
        encoder_table = {
            "frequency_encoding": (FrequencyEncoder, False),
            "target_mean_encoding": (TargetMeanEncoder, False),
            "one_hot_encoding": (OneHotEncoder, True),
        }
        column_steps = []
        for definition in model.preprocessors:
            assert (
                len(definition) == 1
            ), f"Unexpected preprocessor data structure: {definition}. One-key mapping expected."
            encoding_type = list(definition)[0]
            field = definition[encoding_type]["field"]
            if encoding_type not in encoder_table:
                raise ValueError(
                    f"Unexpected categorical encoding type {encoding_type} found. "
                    + "Expected encodings: frequency_encoding, target_mean_encoding, one_hot_encoding."
                )
            encoder_cls, as_list = encoder_table[encoding_type]
            encoder = encoder_cls(definition)
            columns = [field] if as_list else field
            column_steps.append((f"{field}_{encoding_type}", encoder, columns))
        preprocessor = ColumnTransformer(
            transformers=column_steps,
            remainder="passthrough",
            verbose_feature_names_out=False,
        )
        return Pipeline(steps=[("preprocessor", preprocessor), ("es_model", model)])
    @property
    def _trained_model_config(self) -> Dict[str, Any]:
        """Lazily loads an ML models 'trained_model_config' information"""
--- a/eland/ml/pytorch/init.py
+++ b/eland/ml/pytorch/init.py
@ -31,7 +31,10 @@ from eland.ml.pytorch.nlp_ml_model import (
    ZeroShotClassificationInferenceOptions,
 )
 from eland.ml.pytorch.traceable_model import TraceableModel  # noqa: F401
-from eland.ml.pytorch.transformers import task_type_from_model_config
+from eland.ml.pytorch.transformers import (
    UnknownModelInputSizeError,
    task_type_from_model_config,
 )
 __all__ = [
    "PyTorchModel",
@ -49,4 +52,5 @@ __all__ = [
    "TextSimilarityInferenceOptions",
    "ZeroShotClassificationInferenceOptions",
    "task_type_from_model_config",
    "UnknownModelInputSizeError",
 ]
--- a/eland/ml/pytorch/_pytorch_model.py
+++ b/eland/ml/pytorch/_pytorch_model.py
@ -126,6 +126,7 @@ class PyTorchModel:
    def infer(
        self,
        docs: List[Mapping[str, str]],
        inference_config: Optional[Mapping[str, Any]] = None,
        timeout: str = DEFAULT_TIMEOUT,
    ) -> Any:
        if docs is None:
@ -133,6 +134,8 @@ class PyTorchModel:
        __body: Dict[str, Any] = {}
        __body["docs"] = docs
        if inference_config is not None:
            __body["inference_config"] = inference_config
        __path = f"/_ml/trained_models/{_quote(self.model_id)}/_infer"
        __query: Dict[str, Any] = {}
--- a/eland/ml/pytorch/nlp_ml_model.py
+++ b/eland/ml/pytorch/nlp_ml_model.py
@ -86,6 +86,27 @@ class NlpXLMRobertaTokenizationConfig(NlpTokenizationConfig):
        )
 class NlpDebertaV2TokenizationConfig(NlpTokenizationConfig):
    """Tokenization settings registered under the ``deberta_v2`` configuration type.
    ``do_lower_case`` is accepted as a keyword argument, but this constructor
    neither stores it nor forwards it to the base class.
    """
    def __init__(
        self,
        *,
        do_lower_case: t.Optional[bool] = None,
        with_special_tokens: t.Optional[bool] = None,
        max_sequence_length: t.Optional[int] = None,
        truncate: t.Optional[
            t.Union["t.Literal['first', 'none', 'second']", str]
        ] = None,
        span: t.Optional[int] = None,
    ):
        # Everything except the fixed configuration type is handed through as-is.
        forwarded_settings = {
            "with_special_tokens": with_special_tokens,
            "max_sequence_length": max_sequence_length,
            "truncate": truncate,
            "span": span,
        }
        super().__init__(configuration_type="deberta_v2", **forwarded_settings)
 class NlpBertTokenizationConfig(NlpTokenizationConfig):
    def __init__(
        self,
--- a/eland/ml/pytorch/traceable_model.py
+++ b/eland/ml/pytorch/traceable_model.py
@ -50,12 +50,10 @@ class TraceableModel(ABC):
        return self._trace()
    @abstractmethod
-    def sample_output(self) -> torch.Tensor:
+    def sample_output(self) -> torch.Tensor: ...
        ...
    @abstractmethod
-    def _trace(self) -> TracedModelTypes:
+    def _trace(self) -> TracedModelTypes: ...
        ...
    def classification_labels(self) -> Optional[List[str]]:
        return None
--- a/eland/ml/pytorch/transformers.py
+++ b/eland/ml/pytorch/transformers.py
@ -25,17 +25,14 @@ import os.path
 import random
 import re
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Dict, List, Optional, Set, Tuple, Union
 import torch  # type: ignore
 import transformers  # type: ignore
-from sentence_transformers import SentenceTransformer  # type: ignore
+from torch import Tensor
 from torch import Tensor, nn
 from torch.profiler import profile  # type: ignore
 from transformers import (
-    AutoConfig,
+    BertTokenizer,
    AutoModel,
    AutoModelForQuestionAnswering,
    PretrainedConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
@ -47,6 +44,7 @@ from eland.ml.pytorch.nlp_ml_model import (
    NerInferenceOptions,
    NlpBertJapaneseTokenizationConfig,
    NlpBertTokenizationConfig,
    NlpDebertaV2TokenizationConfig,
    NlpMPNetTokenizationConfig,
    NlpRobertaTokenizationConfig,
    NlpTokenizationConfig,
@ -63,8 +61,13 @@ from eland.ml.pytorch.nlp_ml_model import (
    ZeroShotClassificationInferenceOptions,
 )
 from eland.ml.pytorch.traceable_model import TraceableModel
 from eland.ml.pytorch.wrappers import (
    _DistilBertWrapper,
    _DPREncoderWrapper,
    _QuestionAnsweringWrapperModule,
    _SentenceTransformerWrapperModule,
 )
 DEFAULT_OUTPUT_KEY = "sentence_embedding"
 SUPPORTED_TASK_TYPES = {
    "fill_mask",
    "ner",
@ -115,6 +118,7 @@ SUPPORTED_TOKENIZERS = (
    transformers.BartTokenizer,
    transformers.SqueezeBertTokenizer,
    transformers.XLMRobertaTokenizer,
    transformers.DebertaV2Tokenizer,
 )
 SUPPORTED_TOKENIZERS_NAMES = ", ".join(sorted([str(x) for x in SUPPORTED_TOKENIZERS]))
@ -130,6 +134,10 @@ class TaskTypeError(Exception):
    pass
 class UnknownModelInputSizeError(Exception):
    """Raised when the maximum model input length cannot be determined."""
 def task_type_from_model_config(model_config: PretrainedConfig) -> Optional[str]:
    if model_config.architectures is None:
        if model_config.name_or_path.startswith("sentence-transformers/"):
@ -165,283 +173,6 @@ def task_type_from_model_config(model_config: PretrainedConfig) -> Optional[str]
    return potential_task_types.pop()
 class _QuestionAnsweringWrapperModule(nn.Module):  # type: ignore
    """
    Base wrapper for Hugging Face question answering models.
    The inference engine only takes the first tuple when the inference
    response is a tuple, so the concrete subclasses stack a tuple response
    into one tensor and pass any other response through untouched.
    """
    def __init__(self, model: PreTrainedModel):
        super().__init__()
        self._hf_model = model
        self.config = model.config
    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        """Load ``model_id`` for question answering and wrap it.
        MPNet, XLM-RoBERTa, RoBERTa and BART configs get the two-input
        wrapper; every other config gets the four-input wrapper.
        """
        hf_model = AutoModelForQuestionAnswering.from_pretrained(
            model_id, token=token, torchscript=True
        )
        two_input_configs = (
            transformers.MPNetConfig,
            transformers.XLMRobertaConfig,
            transformers.RobertaConfig,
            transformers.BartConfig,
        )
        if isinstance(hf_model.config, two_input_configs):
            wrapper_cls = _TwoParameterQuestionAnsweringWrapper
        else:
            wrapper_cls = _QuestionAnsweringWrapper
        return wrapper_cls(hf_model)
 class _QuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
    """Question answering wrapper whose ``forward`` takes four input tensors."""
    def __init__(self, model: PreTrainedModel):
        super().__init__(model=model)
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        if isinstance(self._hf_model.config, transformers.DistilBertConfig):
            # DistilBERT models are called without token_type_ids and position_ids
            model_kwargs = {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
            }
        else:
            model_kwargs = {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
                "position_ids": position_ids,
            }
        output = self._hf_model(**model_kwargs)
        if not isinstance(output, tuple):
            return output
        return torch.stack(list(output), dim=0)
 class _TwoParameterQuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
    """Question answering wrapper whose ``forward`` takes only ids and mask."""
    def __init__(self, model: PreTrainedModel):
        super().__init__(model=model)
    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        output = self._hf_model(input_ids=input_ids, attention_mask=attention_mask)
        if not isinstance(output, tuple):
            return output
        # A tuple response is collapsed into one tensor along a new first axis.
        return torch.stack(list(output), dim=0)
 class _DistilBertWrapper(nn.Module):  # type: ignore
    """
    Adapter that lets a DistilBERT model be called with BERT-style inputs.
    In Elasticsearch the BERT tokenizer is used for DistilBERT models, and it
    produces 4 inputs where DistilBERT models expect 2. ``forward`` therefore
    accepts all 4 arguments and drops the token_type_ids and position_ids
    before calling the wrapped model.
    """
    def __init__(self, model: transformers.PreTrainedModel):
        super().__init__()
        self._model = model
        self.config = model.config
    @staticmethod
    def try_wrapping(model: PreTrainedModel) -> Optional[Any]:
        """Wrap ``model`` when its config is a DistilBertConfig, else return it as-is."""
        is_distilbert = isinstance(model.config, transformers.DistilBertConfig)
        return _DistilBertWrapper(model) if is_distilbert else model
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        _token_type_ids: Tensor = None,
        _position_ids: Tensor = None,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        kept_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        return self._model(**kept_inputs)
 class _SentenceTransformerWrapperModule(nn.Module):  # type: ignore
    """
    Base wrapper around sentence-transformer models.
    It supplies the pooling, normalization and other graph layers that the
    base HuggingFace transformer model does not define.
    """
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__()
        self._hf_model = model
        self._st_model = SentenceTransformer(model.config.name_or_path)
        self._output_key = output_key
        self.config = model.config
        self._remove_pooling_layer()
        self._replace_transformer_layer()
    @staticmethod
    def from_pretrained(
        model_id: str,
        tokenizer: PreTrainedTokenizer,
        *,
        token: Optional[str] = None,
        output_key: str = DEFAULT_OUTPUT_KEY,
    ) -> Optional[Any]:
        """Load ``model_id`` and wrap it in the wrapper that fits ``tokenizer``.
        BART, MPNet, RoBERTa and XLM-RoBERTa tokenizers get the two-input
        wrapper; every other tokenizer gets the four-input wrapper.
        """
        hf_model = AutoModel.from_pretrained(model_id, token=token, torchscript=True)
        two_input_tokenizers = (
            transformers.BartTokenizer,
            transformers.MPNetTokenizer,
            transformers.RobertaTokenizer,
            transformers.XLMRobertaTokenizer,
        )
        if isinstance(tokenizer, two_input_tokenizers):
            wrapper_cls = _TwoParameterSentenceTransformerWrapper
        else:
            wrapper_cls = _SentenceTransformerWrapper
        return wrapper_cls(hf_model, output_key)
    def _remove_pooling_layer(self) -> None:
        """
        Drops a trailing pooling layer that is not used to create embeddings.
        A pooling layer left in place returns a NoneType, which then fails to
        load in libtorch. Using the pooling output as a dummy would also work,
        but it costs a little inference performance, so the layer is removed
        whenever the model has one.
        """
        if not hasattr(self._hf_model, "pooler"):
            return
        self._hf_model.pooler = None
    def _replace_transformer_layer(self) -> None:
        """
        Swaps the HuggingFace Transformer layer inside the SentenceTransformer
        modules for the one held here, which has its pooling layer removed and
        was loaded ready for TorchScript export.
        """
        transformer_module = self._st_model._modules["0"]
        transformer_module.auto_model = self._hf_model
 class _SentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    """Sentence-transformer wrapper whose ``forward`` takes four input tensors."""
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        features = {"input_ids": input_ids, "attention_mask": attention_mask}
        # token_type_ids are left out for DistilBERT configs only
        if not isinstance(self._hf_model.config, transformers.DistilBertConfig):
            features["token_type_ids"] = token_type_ids
        features["position_ids"] = position_ids
        outputs = self._st_model(features)
        return outputs[self._output_key]
 class _TwoParameterSentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    """Sentence-transformer wrapper whose ``forward`` takes only ids and mask."""
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)
    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        features = {"input_ids": input_ids, "attention_mask": attention_mask}
        outputs = self._st_model(features)
        return outputs[self._output_key]
 class _DPREncoderWrapper(nn.Module):  # type: ignore
    """
    AutoModel loading does not work for DPRContextEncoders, this only exists as
    a workaround. This may never be fixed so this is likely permanent.
    See: https://github.com/huggingface/transformers/issues/13670
    """
    _SUPPORTED_MODELS = {
        transformers.DPRContextEncoder,
        transformers.DPRQuestionEncoder,
    }
    _SUPPORTED_MODELS_NAMES = {x.__name__ for x in _SUPPORTED_MODELS}
    def __init__(
        self,
        model: Union[transformers.DPRContextEncoder, transformers.DPRQuestionEncoder],
    ):
        super().__init__()
        self._model = model
        self.config = model.config
    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        """Load and wrap a DPR encoder, or return None for any other model.
        Parameters
        ----------
        model_id: str
            Model identifier passed to the Hugging Face ``from_pretrained`` calls.
        token: Optional[str]
            Access token used for both the config and the model download.
        Returns
        -------
        Optional[_DPREncoderWrapper]
            The wrapped encoder when the config has model type ``dpr`` and
            exactly one architecture that is a supported DPR encoder,
            otherwise ``None``.
        """
        config = AutoConfig.from_pretrained(model_id, token=token)
        def is_compatible() -> bool:
            is_dpr_model = config.model_type == "dpr"
            has_architectures = (
                config.architectures is not None and len(config.architectures) == 1
            )
            is_supported_architecture = has_architectures and (
                config.architectures[0] in _DPREncoderWrapper._SUPPORTED_MODELS_NAMES
            )
            return is_dpr_model and is_supported_architecture
        if not is_compatible():
            return None
        encoder_cls = getattr(transformers, config.architectures[0])
        # The token must be passed here as well: without it the config of a
        # gated or private model loads but the weights download is rejected.
        model = encoder_cls.from_pretrained(model_id, token=token, torchscript=True)
        return _DPREncoderWrapper(model)
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        _position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        # _position_ids is accepted for the four-argument call shape but not
        # passed to the wrapped encoder.
        return self._model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
 class _TransformerTraceableModel(TraceableModel):
    """A base class representing a HuggingFace transformer model that can be traced."""
@ -460,7 +191,7 @@ class _TransformerTraceableModel(TraceableModel):
    def _trace(self) -> TracedModelTypes:
        inputs = self._compatible_inputs()
-        return torch.jit.trace(self._model, inputs)
+        return torch.jit.trace(self._model, example_inputs=inputs)
    def sample_output(self) -> Tensor:
        inputs = self._compatible_inputs()
@ -483,9 +214,15 @@ class _TransformerTraceableModel(TraceableModel):
                transformers.XLMRobertaTokenizer,
            ),
        ):
            del inputs["token_type_ids"]
            return (inputs["input_ids"], inputs["attention_mask"])
        if isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
            return (
                inputs["input_ids"],
                inputs["attention_mask"],
                inputs["token_type_ids"],
            )
        position_ids = torch.arange(inputs["input_ids"].size(1), dtype=torch.long)
        inputs["position_ids"] = position_ids
        return (
@ -496,8 +233,7 @@ class _TransformerTraceableModel(TraceableModel):
        )
    @abstractmethod
-    def _prepare_inputs(self) -> transformers.BatchEncoding:
+    def _prepare_inputs(self) -> transformers.BatchEncoding: ...
        ...
 class _TraceableClassificationModel(_TransformerTraceableModel, ABC):
@ -519,6 +255,15 @@ class _TraceableFillMaskModel(_TransformerTraceableModel):
        )
 class _TraceableTextExpansionModel(_TransformerTraceableModel):
    """Traceable model for the text expansion task."""
    def _prepare_inputs(self) -> transformers.BatchEncoding:
        """Tokenize one fixed sentence, padded to max length, as PyTorch tensors."""
        sample_text = "This is an example sentence."
        encoded = self._tokenizer(
            sample_text,
            padding="max_length",
            return_tensors="pt",
        )
        return encoded
 class _TraceableNerModel(_TraceableClassificationModel):
    def _prepare_inputs(self) -> transformers.BatchEncoding:
        return self._tokenizer(
@ -553,7 +298,7 @@ class _TraceableTextEmbeddingModel(_TransformerTraceableModel):
    def _prepare_inputs(self) -> transformers.BatchEncoding:
        return self._tokenizer(
            "This is an example sentence.",
-            padding="max_length",
+            padding="longest",
            return_tensors="pt",
        )
@ -599,6 +344,7 @@ class TransformerModel:
        access_token: Optional[str] = None,
        ingest_prefix: Optional[str] = None,
        search_prefix: Optional[str] = None,
        max_model_input_size: Optional[int] = None,
    ):
        """
        Loads a model from the Hugging Face repository or local file and creates
@ -630,6 +376,12 @@ class TransformerModel:
        search_prefix: Optional[str]
            Prefix string to prepend to input at search
        max_model_input_size: Optional[int]
            The max model input size counted in tokens.
            Usually this value should be extracted from the model configuration
            but if that is not possible or the data is missing it can be
            explicitly set with this parameter.
        """
        self._model_id = model_id
@ -637,6 +389,7 @@ class TransformerModel:
        self._task_type = task_type.replace("-", "_")
        self._ingest_prefix = ingest_prefix
        self._search_prefix = search_prefix
        self._max_model_input_size = max_model_input_size
        # load Hugging Face model and tokenizer
        # use padding in the tokenizer to ensure max length sequences are used for tracing (at call time)
@ -669,7 +422,12 @@ class TransformerModel:
                " ".join(m) for m, _ in sorted(ranks.items(), key=lambda kv: kv[1])
            ]
            vocab_obj["merges"] = merges
-        sp_model = getattr(self._tokenizer, "sp_model", None)
+
        if isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
            sp_model = self._tokenizer._tokenizer.spm
        else:
            sp_model = getattr(self._tokenizer, "sp_model", None)
        if sp_model:
            id_correction = getattr(self._tokenizer, "fairseq_offset", 0)
            scores = []
@ -686,7 +444,10 @@ class TransformerModel:
        return vocab_obj
    def _create_tokenization_config(self) -> NlpTokenizationConfig:
-        _max_sequence_length = self._find_max_sequence_length()
+        if self._max_model_input_size:
            _max_sequence_length = self._max_model_input_size
        else:
            _max_sequence_length = self._find_max_sequence_length()
        if isinstance(self._tokenizer, transformers.MPNetTokenizer):
            return NlpMPNetTokenizationConfig(
@ -704,6 +465,11 @@ class TransformerModel:
            return NlpXLMRobertaTokenizationConfig(
                max_sequence_length=_max_sequence_length
            )
        elif isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
            return NlpDebertaV2TokenizationConfig(
                max_sequence_length=_max_sequence_length,
                do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
            )
        else:
            japanese_morphological_tokenizers = ["mecab"]
            if (
@ -725,25 +491,28 @@ class TransformerModel:
        # Sometimes the max_... values are present but contain
        # a random or very large value.
        REASONABLE_MAX_LENGTH = 8192
        max_len = getattr(self._tokenizer, "max_model_input_sizes", dict()).get(
            self._model_id
        )
        if max_len is not None and max_len < REASONABLE_MAX_LENGTH:
            return int(max_len)
        max_len = getattr(self._tokenizer, "model_max_length", None)
        if max_len is not None and max_len <= REASONABLE_MAX_LENGTH:
            return int(max_len)
        max_sizes = getattr(self._tokenizer, "max_model_input_sizes", dict())
        max_len = max_sizes.get(self._model_id)
        if max_len is not None and max_len < REASONABLE_MAX_LENGTH:
            return int(max_len)
-        model_config = getattr(self._traceable_model._model, "config", None)
+        if max_sizes:
-        if model_config is None:
+            # The model id wasn't found in the max sizes dict but
-            raise ValueError("Cannot determine model max input length")
+            # if all the values correspond then take that value
            sizes = {size for size in max_sizes.values()}
            if len(sizes) == 1:
                max_len = sizes.pop()
                if max_len is not None and max_len < REASONABLE_MAX_LENGTH:
                    return int(max_len)
-        max_len = getattr(model_config, "max_position_embeddings", None)
+        if isinstance(self._tokenizer, BertTokenizer):
-        if max_len is not None and max_len < REASONABLE_MAX_LENGTH:
+            return 512
            return int(max_len)
-        raise ValueError("Cannot determine model max input length")
+        raise UnknownModelInputSizeError("Cannot determine model max input length")
    def _create_config(
        self, es_version: Optional[Tuple[int, int, int]]
@ -756,6 +525,9 @@ class TransformerModel:
            tokenization_config.span = 128
            tokenization_config.truncate = "none"
        if self._task_type == "text_similarity":
            tokenization_config.truncate = "second"
        if self._traceable_model.classification_labels():
            inference_config = TASK_TYPE_TO_INFERENCE_CONFIG[self._task_type](
                tokenization=tokenization_config,
@ -954,6 +726,13 @@ class TransformerModel:
            else:
                self._task_type = maybe_task_type
        if self._task_type == "text_expansion":
            model = transformers.AutoModelForMaskedLM.from_pretrained(
                self._model_id, token=self._access_token, torchscript=True
            )
            model = _DistilBertWrapper.try_wrapping(model)
            return _TraceableTextExpansionModel(self._tokenizer, model)
        if self._task_type == "fill_mask":
            model = transformers.AutoModelForMaskedLM.from_pretrained(
                self._model_id, token=self._access_token, torchscript=True
@ -1013,7 +792,7 @@ class TransformerModel:
        else:
            raise TypeError(
-                f"Unknown task type {self._task_type}, must be one of: {SUPPORTED_TASK_TYPES_NAMES}"
+                f"Task {self._task_type} is not supported, must be one of: {SUPPORTED_TASK_TYPES_NAMES}"
            )
    def elasticsearch_model_id(self) -> str:
@ -1044,6 +823,5 @@ def elasticsearch_model_id(model_id: str) -> str:
    """
    id = re.sub(r"[\s\\/]", "__", model_id).lower()[-64:]
-    if id.startswith("__"):
+    id = id.removeprefix("__")
        id = id.removeprefix("__")
    return id
--- a/eland/ml/pytorch/wrappers.py
+++ b/eland/ml/pytorch/wrappers.py
@ -0,0 +1,317 @@
 #  Licensed to Elasticsearch B.V. under one or more contributor
 #  license agreements. See the NOTICE file distributed with
 #  this work for additional information regarding copyright
 #  ownership. Elasticsearch B.V. licenses this file to you under
 #  the Apache License, Version 2.0 (the "License"); you may
 #  not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 # 	http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
 """
 This module contains the wrapper classes for the Hugging Face models.
 Wrapping is necessary to ensure that the forward method of the model
 is called with the same arguments the ml-cpp pytorch_inference process
 uses.
 """
 from typing import Any, Optional, Union
 import torch  # type: ignore
 import transformers  # type: ignore
 from sentence_transformers import SentenceTransformer  # type: ignore
 from torch import Tensor, nn
 from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForQuestionAnswering,
    PreTrainedModel,
    PreTrainedTokenizer,
 )
 DEFAULT_OUTPUT_KEY = "sentence_embedding"
 class _QuestionAnsweringWrapperModule(nn.Module):  # type: ignore
    """
    Base wrapper for Hugging Face question answering models.
    The inference engine only takes the first tuple when the inference
    response is a tuple, so the concrete subclasses stack a tuple response
    into one tensor and pass any other response through untouched.
    """
    def __init__(self, model: PreTrainedModel):
        super().__init__()
        self._hf_model = model
        self.config = model.config
    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        """Load ``model_id`` for question answering and wrap it.
        MPNet, XLM-RoBERTa, RoBERTa and BART configs get the two-input
        wrapper; every other config gets the four-input wrapper.
        """
        hf_model = AutoModelForQuestionAnswering.from_pretrained(
            model_id, token=token, torchscript=True
        )
        two_input_configs = (
            transformers.MPNetConfig,
            transformers.XLMRobertaConfig,
            transformers.RobertaConfig,
            transformers.BartConfig,
        )
        if isinstance(hf_model.config, two_input_configs):
            wrapper_cls = _TwoParameterQuestionAnsweringWrapper
        else:
            wrapper_cls = _QuestionAnsweringWrapper
        return wrapper_cls(hf_model)
 class _QuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
    """Question answering wrapper whose ``forward`` takes four input tensors."""
    def __init__(self, model: PreTrainedModel):
        super().__init__(model=model)
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        if isinstance(self._hf_model.config, transformers.DistilBertConfig):
            # DistilBERT models are called without token_type_ids and position_ids
            model_kwargs = {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
            }
        else:
            model_kwargs = {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
                "position_ids": position_ids,
            }
        output = self._hf_model(**model_kwargs)
        if not isinstance(output, tuple):
            return output
        return torch.stack(list(output), dim=0)
 class _TwoParameterQuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
    """Question answering wrapper whose ``forward`` takes only ids and mask."""
    def __init__(self, model: PreTrainedModel):
        super().__init__(model=model)
    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        output = self._hf_model(input_ids=input_ids, attention_mask=attention_mask)
        if not isinstance(output, tuple):
            return output
        # A tuple response is collapsed into one tensor along a new first axis.
        return torch.stack(list(output), dim=0)
 class _DistilBertWrapper(nn.Module):  # type: ignore
    """
    Adapter that lets a DistilBERT model be called with BERT-style inputs.
    In Elasticsearch the BERT tokenizer is used for DistilBERT models, and it
    produces 4 inputs where DistilBERT models expect 2. ``forward`` therefore
    accepts all 4 arguments and drops the token_type_ids and position_ids
    before calling the wrapped model.
    """
    def __init__(self, model: transformers.PreTrainedModel):
        super().__init__()
        self._model = model
        self.config = model.config
    @staticmethod
    def try_wrapping(model: PreTrainedModel) -> Optional[Any]:
        """Wrap ``model`` when its config is a DistilBertConfig, else return it as-is."""
        is_distilbert = isinstance(model.config, transformers.DistilBertConfig)
        return _DistilBertWrapper(model) if is_distilbert else model
    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        _token_type_ids: Tensor = None,
        _position_ids: Tensor = None,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        kept_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        return self._model(**kept_inputs)
 class _SentenceTransformerWrapperModule(nn.Module):  # type: ignore
    """
    Base wrapper around sentence-transformer models.
    It supplies the pooling, normalization and other graph layers that the
    base HuggingFace transformer model does not define.
    """
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__()
        self._hf_model = model
        self._st_model = SentenceTransformer(model.config.name_or_path)
        self._output_key = output_key
        self.config = model.config
        self._remove_pooling_layer()
        self._replace_transformer_layer()
    @staticmethod
    def from_pretrained(
        model_id: str,
        tokenizer: PreTrainedTokenizer,
        *,
        token: Optional[str] = None,
        output_key: str = DEFAULT_OUTPUT_KEY,
    ) -> Optional[Any]:
        """Load ``model_id`` and wrap it in the wrapper that fits ``tokenizer``.
        BART, MPNet, RoBERTa, XLM-RoBERTa and DeBERTa-v2 tokenizers get the
        two-input wrapper; every other tokenizer gets the four-input wrapper.
        """
        hf_model = AutoModel.from_pretrained(model_id, token=token, torchscript=True)
        two_input_tokenizers = (
            transformers.BartTokenizer,
            transformers.MPNetTokenizer,
            transformers.RobertaTokenizer,
            transformers.XLMRobertaTokenizer,
            transformers.DebertaV2Tokenizer,
        )
        if isinstance(tokenizer, two_input_tokenizers):
            wrapper_cls = _TwoParameterSentenceTransformerWrapper
        else:
            wrapper_cls = _SentenceTransformerWrapper
        return wrapper_cls(hf_model, output_key)
    def _remove_pooling_layer(self) -> None:
        """
        Drops a trailing pooling layer that is not used to create embeddings.
        A pooling layer left in place returns a NoneType, which then fails to
        load in libtorch. Using the pooling output as a dummy would also work,
        but it costs a little inference performance, so the layer is removed
        whenever the model has one.
        """
        if not hasattr(self._hf_model, "pooler"):
            return
        self._hf_model.pooler = None
    def _replace_transformer_layer(self) -> None:
        """
        Swaps the HuggingFace Transformer layer inside the SentenceTransformer
        modules for the one held here, which has its pooling layer removed and
        was loaded ready for TorchScript export.
        """
        transformer_module = self._st_model._modules["0"]
        transformer_module.auto_model = self._hf_model
 class _SentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    """Sentence-transformer wrapper whose forward takes four tokenizer inputs."""

    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """Run the sentence-transformer model and return the configured output entry."""
        inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        # token_type_ids is left out for DistilBERT configurations; the key
        # order otherwise stays input_ids, attention_mask, token_type_ids,
        # position_ids.
        if not isinstance(self._hf_model.config, transformers.DistilBertConfig):
            inputs["token_type_ids"] = token_type_ids
        inputs["position_ids"] = position_ids
        features = self._st_model(inputs)
        return features[self._output_key]
 class _TwoParameterSentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    """Sentence-transformer wrapper whose forward takes only two tokenizer inputs."""

    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)

    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """Run the sentence-transformer model and return the configured output entry."""
        features = self._st_model(
            {"input_ids": input_ids, "attention_mask": attention_mask}
        )
        return features[self._output_key]
 class _DPREncoderWrapper(nn.Module):  # type: ignore
    """
    AutoModel loading does not work for DPRContextEncoders, this only exists as
    a workaround. This may never be fixed so this is likely permanent.
    See: https://github.com/huggingface/transformers/issues/13670
    """

    # Encoder classes this wrapper can load, and their class names as they
    # appear in a model config's ``architectures`` list.
    _SUPPORTED_MODELS = {
        transformers.DPRContextEncoder,
        transformers.DPRQuestionEncoder,
    }
    _SUPPORTED_MODELS_NAMES = {x.__name__ for x in _SUPPORTED_MODELS}

    def __init__(
        self,
        model: Union[transformers.DPRContextEncoder, transformers.DPRQuestionEncoder],
    ):
        super().__init__()
        self._model = model
        self.config = model.config

    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        """
        Load a DPR encoder for TorchScript export and wrap it.

        Args:
            model_id: Model identifier passed to the ``from_pretrained`` loaders.
            token: Optional access token, used for both the config lookup
                and the model load.

        Returns:
            A ``_DPREncoderWrapper`` when the config has model type ``dpr``
            and exactly one architecture that is a supported encoder class,
            otherwise ``None``.
        """
        config = AutoConfig.from_pretrained(model_id, token=token)

        def is_compatible() -> bool:
            is_dpr_model = config.model_type == "dpr"
            has_architectures = (
                config.architectures is not None and len(config.architectures) == 1
            )
            is_supported_architecture = has_architectures and (
                config.architectures[0] in _DPREncoderWrapper._SUPPORTED_MODELS_NAMES
            )
            return is_dpr_model and is_supported_architecture

        if not is_compatible():
            return None

        model_cls = getattr(transformers, config.architectures[0])
        # Pass the token here as well: the config lookup above already uses
        # it, and without it the model load would not be authenticated.
        model = model_cls.from_pretrained(model_id, token=token, torchscript=True)
        return _DPREncoderWrapper(model)

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        _position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        # _position_ids is accepted but not passed on to the wrapped encoder.
        return self._model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
--- a/eland/ml/transformers/lightgbm.py
+++ b/eland/ml/transformers/lightgbm.py
@ -97,9 +97,11 @@ class LGBMForestTransformer(ModelTransformer):
        return TreeNode(
            node_idx=node_id,
            leaf_value=[float(tree_node_json_obj["leaf_value"])],
-            number_samples=int(tree_node_json_obj["leaf_count"])
+            number_samples=(
-            if "leaf_count" in tree_node_json_obj
+                int(tree_node_json_obj["leaf_count"])
-            else None,
+                if "leaf_count" in tree_node_json_obj
                else None
            ),
        )
    def build_tree(self, tree_id: int, tree_json_obj: Dict[str, Any]) -> Tree:
@ -235,9 +237,11 @@ class LGBMClassifierTransformer(LGBMForestTransformer):
        return TreeNode(
            node_idx=node_id,
            leaf_value=leaf_val,
-            number_samples=int(tree_node_json_obj["leaf_count"])
+            number_samples=(
-            if "leaf_count" in tree_node_json_obj
+                int(tree_node_json_obj["leaf_count"])
-            else None,
+                if "leaf_count" in tree_node_json_obj
                else None
            ),
        )
    def check_model_booster(self) -> None:
--- a/eland/ml/transformers/xgboost.py
+++ b/eland/ml/transformers/xgboost.py
@ -107,6 +107,7 @@ class XGBoostForestTransformer(ModelTransformer):
                decision_type=self._node_decision_type,
                left_child=self.extract_node_id(row["Yes"], curr_tree),
                right_child=self.extract_node_id(row["No"], curr_tree),
                default_left=row["Yes"] == row["Missing"],
                threshold=float(row["Split"]),
                split_feature=self.get_feature_id(row["Feature"]),
            )
--- a/eland/operations.py
+++ b/eland/operations.py
@ -16,6 +16,7 @@
 #  under the License.
 import copy
 import os
 import warnings
 from collections import defaultdict
 from datetime import datetime
@ -1156,9 +1157,11 @@ class Operations:
        # piggy-back on that single aggregation.
        if extended_stats_calls >= 2:
            es_aggs = [
-                ("extended_stats", es_agg)
+                (
-                if es_agg in extended_stats_es_aggs
+                    ("extended_stats", es_agg)
-                else es_agg
+                    if es_agg in extended_stats_es_aggs
                    else es_agg
                )
                for es_agg in es_aggs
            ]
@ -1248,6 +1251,46 @@ class Operations:
        if path_or_buf is None:
            return "".join(result)
    def to_json(  # type: ignore
        self,
        query_compiler: "QueryCompiler",
        path_or_buf=None,
        orient=None,
        lines=False,
        **kwargs,
    ):
        """
        Serialise the query results to JSON.

        When ``orient == "records"`` and ``lines is True`` the results are
        streamed: each DataFrame yielded by ``search_yield_pandas_dataframes``
        is serialised on its own and written out in turn. Any other
        combination converts everything with ``to_pandas`` first and
        delegates to ``DataFrame.to_json``.

        Args:
            query_compiler: Passed through to the methods that fetch the data.
            path_or_buf: ``None`` to get the JSON back as a string, a ``str``
                or ``os.PathLike`` path to a file that is opened for writing,
                or an object with a ``write`` method.
            orient: Forwarded to ``DataFrame.to_json``.
            lines: Forwarded to ``DataFrame.to_json``.
            **kwargs: Forwarded to ``DataFrame.to_json``.

        Returns:
            In streaming mode, the concatenated JSON when ``path_or_buf`` is
            ``None`` and at least some output was produced, otherwise
            ``None``. In the other mode, whatever ``DataFrame.to_json`` returns.
        """
        if not (orient == "records" and lines is True):
            return self.to_pandas(query_compiler=query_compiler).to_json(
                path_or_buf, orient=orient, lines=lines, **kwargs
            )

        result: List[str] = []
        # Only a handle opened here is ours to close; a caller-supplied
        # buffer is left open.
        our_filehandle = isinstance(path_or_buf, (str, os.PathLike))
        buf = open(path_or_buf, "w") if our_filehandle else path_or_buf
        try:
            for df in self.search_yield_pandas_dataframes(
                query_compiler=query_compiler
            ):
                output = df.to_json(orient=orient, lines=lines, **kwargs)
                if buf is None:
                    result.append(output)
                else:
                    buf.write(output)
        finally:
            # Close our handle even when fetching or serialising raises, so
            # the file descriptor is not leaked.
            if our_filehandle:
                buf.close()
        return "".join(result) or None
    def to_pandas(
        self, query_compiler: "QueryCompiler", show_progress: bool = False
    ) -> pd.DataFrame:
@ -1500,6 +1543,24 @@ def quantile_to_percentile(quantile: Union[int, float]) -> float:
    return float(min(100, max(0, quantile * 100)))
 def is_field_already_present(
    key: str, data: Union[Dict[str, Any], List[Dict[str, Any]]]
 ) -> bool:
    """
    Report whether the dotted field path ``key`` can be found in ``data``.

    The first path segment is looked up in ``data`` and the rest of the
    path is resolved recursively in the value found there. A list is
    searched element by element, and the path counts as present if any
    dict element contains it.

    Args:
        key: Field name, with ``.`` separating nested levels.
        data: A mapping, or a list of mappings, to search.

    Returns:
        ``True`` if the path is found, ``False`` otherwise. Values that are
        neither dicts nor lists cannot contain a field, so they always give
        ``False``.
    """
    head, dot, rest = key.partition(".")
    if not dot:
        # Leaf segment. Only containers get a membership test: a string
        # would be matched by substring and a number would raise TypeError.
        # NOTE(review): a list here is tested by plain membership, so a list
        # of dicts never matches; kept as is in case callers rely on it.
        if isinstance(data, (dict, list)):
            return key in data
        return False
    if isinstance(data, dict):
        return is_field_already_present(rest, data.get(head, {}))
    if isinstance(data, list):
        return any(
            is_field_already_present(rest, item.get(head, {}))
            for item in data
            # Skip elements that are not dicts, as they have no ``get``.
            if isinstance(item, dict)
        )
    # A dotted path cannot be resolved inside a scalar value.
    return False
 def _search_yield_hits(
    query_compiler: "QueryCompiler",
    body: Dict[str, Any],
@ -1557,10 +1618,24 @@ def _search_yield_hits(
        # Modify the search with the new point in time ID and keep-alive time.
        body["pit"] = {"id": pit_id, "keep_alive": DEFAULT_PIT_KEEP_ALIVE}
        if isinstance(body["_source"], list):
            body["fields"] = body["_source"]
        while max_number_of_hits is None or hits_yielded < max_number_of_hits:
            resp = client.search(**body)
-            hits: List[Dict[str, Any]] = resp["hits"]["hits"]
+            hits: List[Dict[str, Any]] = []
            for hit in resp["hits"]["hits"]:
                # Copy some of the fields to _source if they are missing there.
                if "fields" in hit and "_source" in hit:
                    fields = hit["fields"]
                    del hit["fields"]
                    for k, v in fields.items():
                        if not is_field_already_present(k, hit["_source"]):
                            if isinstance(v, list):
                                hit["_source"][k] = list(sorted(v))
                            else:
                                hit["_source"][k] = v
                hits.append(hit)
            # The point in time ID can change between searches so we
            # need to keep the next search up-to-date
--- a/eland/query_compiler.py
+++ b/eland/query_compiler.py
@ -514,6 +514,14 @@ class QueryCompiler:
        """
        return self._operations.to_csv(query_compiler=self, **kwargs)
    def to_json(self, **kwargs) -> Optional[str]:
        """Serialises Eland Dataframe to JSON
        All keyword arguments are forwarded to the operations object's to_json.
        Returns:
            If path_or_buf is None, returns the resulting json as a string.
        """
        return self._operations.to_json(query_compiler=self, **kwargs)
    def search_yield_pandas_dataframes(self) -> Generator["pd.DataFrame", None, None]:
        """Return the operations object's generator of pandas DataFrames for this query compiler."""
        return self._operations.search_yield_pandas_dataframes(self)
--- a/eland/series.py
+++ b/eland/series.py
@ -40,11 +40,12 @@ from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union
 import numpy as np
 import pandas as pd  # type: ignore
 from pandas.core.indexes.frozen import FrozenList
 from pandas.io.common import _expand_user, stringify_path  # type: ignore
 import eland.plotting
 from eland.arithmetics import ArithmeticNumber, ArithmeticSeries, ArithmeticString
-from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
+from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
 from eland.filter import (
    BooleanFilter,
    Equal,
@ -292,18 +293,26 @@ class Series(NDFrame):
        Examples
        --------
        >>> df = ed.DataFrame('http://localhost:9200', 'flights')
-        >>> df['Carrier'].value_counts()
+        >>> df['Carrier'].value_counts()  # doctest: +SKIP
        Carrier
        Logstash Airways    3331
        JetBeats            3274
        Kibana Airlines     3234
        ES-Air              3220
-        Name: Carrier, dtype: int64
+        Name: count, dtype: int64
        """
        if not isinstance(es_size, int):
            raise TypeError("es_size must be a positive integer.")
        elif es_size <= 0:
            raise ValueError("es_size must be a positive integer.")
-        return self._query_compiler.value_counts(es_size)
+        value_counts = self._query_compiler.value_counts(es_size)
        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#value-counts-sets-the-resulting-name-to-count
        if PANDAS_VERSION[0] == 2:
            value_counts.name = "count"
            value_counts.index.names = FrozenList([self.es_field_name])
            value_counts.index.name = self.es_field_name
        return value_counts
    # dtype not implemented for Series as causes query to fail
    # in pandas.core.computation.ops.Term.type
--- a/noxfile.py
+++ b/noxfile.py
@ -16,7 +16,6 @@
 #  under the License.
 import os
 import subprocess
 from pathlib import Path
 import nox
@ -56,52 +55,48 @@ TYPED_FILES = (
 )
-@nox.session(reuse_venv=True)
+@nox.session(reuse_venv=True, python="3.11")
 def format(session):
-    session.install("black", "isort", "flynt")
+    session.install("black ~= 25.0", "isort", "flynt")
    session.run("python", "utils/license-headers.py", "fix", *SOURCE_FILES)
    session.run("flynt", *SOURCE_FILES)
-    session.run("black", "--target-version=py38", *SOURCE_FILES)
+    session.run("black", "--target-version=py39", *SOURCE_FILES)
    session.run("isort", "--profile=black", *SOURCE_FILES)
    lint(session)
-@nox.session(reuse_venv=True)
+@nox.session(reuse_venv=True, python="3.11")
 def lint(session):
    # Install numpy to use its mypy plugin
    # https://numpy.org/devdocs/reference/typing.html#mypy-plugin
-    session.install("black", "flake8", "mypy", "isort", "numpy")
+    session.install("black ~= 25.0", "flake8", "mypy", "isort", "numpy")
-    session.install("--pre", "elasticsearch>=8.3,<9")
+    session.install(".")
    session.run("python", "utils/license-headers.py", "check", *SOURCE_FILES)
-    session.run("black", "--check", "--target-version=py38", *SOURCE_FILES)
+    session.run("black", "--check", "--target-version=py39", *SOURCE_FILES)
    session.run("isort", "--check", "--profile=black", *SOURCE_FILES)
-    session.run("flake8", "--ignore=E501,W503,E402,E712,E203", *SOURCE_FILES)
+    session.run("flake8", "--extend-ignore=E203,E402,E501,E704,E712", *SOURCE_FILES)
    # TODO: When all files are typed we can change this to .run("mypy", "--strict", "eland/")
-    session.log("mypy --show-error-codes --strict eland/")
+    stdout = session.run(
-    for typed_file in TYPED_FILES:
+        "mypy",
-        if not os.path.isfile(typed_file):
+        "--show-error-codes",
-            session.error(f"The file {typed_file!r} couldn't be found")
+        "--strict",
-        process = subprocess.run(
+        *TYPED_FILES,
-            ["mypy", "--show-error-codes", "--strict", typed_file],
+        success_codes=(0, 1),
-            env=session.env,
+        silent=True,
-            stdout=subprocess.PIPE,
+    )
            stderr=subprocess.STDOUT,
        )
        # Ensure that mypy itself ran successfully
        assert process.returncode in (0, 1)
-        errors = []
+    errors = []
-        for line in process.stdout.decode().split("\n"):
+    for line in stdout.splitlines():
-            filepath = line.partition(":")[0]
+        filepath = line.partition(":")[0]
-            if filepath in TYPED_FILES:
+        if filepath in TYPED_FILES:
-                errors.append(line)
+            errors.append(line)
-        if errors:
+    if errors:
-            session.error("\n" + "\n".join(sorted(set(errors))))
+        session.error("\n" + "\n".join(sorted(set(errors))))
-@nox.session(python=["3.8", "3.9", "3.10"])
+@nox.session(python=["3.9", "3.10", "3.11", "3.12"])
-@nox.parametrize("pandas_version", ["1.5.0"])
+@nox.parametrize("pandas_version", ["1.5.0", "2.2.3"])
 def test(session, pandas_version: str):
    session.install("-r", "requirements-dev.txt")
    session.install(".")
@ -121,9 +116,6 @@ def test(session, pandas_version: str):
        "--nbval",
    )
    # PyTorch doesn't support Python 3.11 yet
    if session.python == "3.11":
        pytest_args += ("--ignore=eland/ml/pytorch",)
    session.run(
        *pytest_args,
        *(session.posargs or ("eland/", "tests/")),
@ -140,7 +132,6 @@ def test(session, pandas_version: str):
            "scikit-learn",
            "xgboost",
            "lightgbm",
            "shap",
        )
        session.run("pytest", "tests/ml/")
@ -150,8 +141,8 @@ def docs(session):
    # Run this so users get an error if they don't have Pandoc installed.
    session.run("pandoc", "--version", external=True)
    session.install("-r", "docs/requirements-docs.txt")
    session.install(".")
    session.install("-r", "docs/requirements-docs.txt")
    # See if we have an Elasticsearch cluster active
    # to rebuild the Jupyter notebooks with.
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -1,26 +1,7 @@
 #
-# Basic requirements
+# Basic requirements with extras
 #
-elasticsearch>=8.3,<9
+.[all]
 pandas>=1.5,<2
 matplotlib>=3.6
 numpy>=1.2.0,<2
 tqdm<5
 #
 # Extras
 #
 scikit-learn>=1.3,<1.4
 xgboost>=0.90,<2
 lightgbm>=2,<4
 # PyTorch doesn't support Python 3.11 yet (pytorch/pytorch#86566)
 # Elasticsearch uses v1.13.1 of PyTorch
 torch>=1.13.1,<2.0; python_version<'3.11'
 # Versions known to be compatible with PyTorch 1.13.1
 sentence-transformers>=2.1.0,<=2.2.2; python_version<'3.11'
 transformers[torch]>=4.31.0,<=4.33.2; python_version<'3.11'
 #
 # Testing
@ -29,7 +10,6 @@ pytest>=5.2.1
 pytest-mock
 pytest-cov
 nbval
 shap==0.43.0
 #
 # Docs
--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +0,0 @@
 #
 # Basic requirements
 #
 elasticsearch>=8.3,<9
 pandas>=1.5,<2
 matplotlib>=3.6
 numpy>=1.2.0,<2
--- a/setup.py
+++ b/setup.py
@ -38,9 +38,10 @@ CLASSIFIERS = [
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3 :: Only",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering",
 ]
@ -55,12 +56,16 @@ with open(path.join(here, "README.md"), "r", "utf-8") as f:
 extras = {
    "xgboost": ["xgboost>=0.90,<2"],
-    "scikit-learn": ["scikit-learn>=1.3,<1.4"],
+    "scikit-learn": ["scikit-learn>=1.3,<1.6"],
-    "lightgbm": ["lightgbm>=2,<4"],
+    "lightgbm": ["lightgbm>=3,<5"],
    "pytorch": [
-        "torch>=1.13.1,<2.0",
+        "requests<3",
-        "sentence-transformers>=2.1.0,<=2.2.2",
+        "torch==2.5.1",
-        "transformers[torch]>=4.31.0,<=4.33.2",
+        "tqdm",
        "sentence-transformers>=5.0.0,<6.0.0",
        # sentencepiece is a required dependency for the slow tokenizers
        # https://huggingface.co/transformers/v4.4.2/migration.html#sentencepiece-is-removed-from-the-required-dependencies
        "transformers[sentencepiece]>=4.47.0,<4.50.3",
    ],
 }
 extras["all"] = list({dep for deps in extras.values() for dep in deps})
@ -81,8 +86,8 @@ setup(
    keywords="elastic eland pandas python",
    packages=find_packages(include=["eland", "eland.*"]),
    install_requires=[
-        "elasticsearch>=8.3,<9",
+        "elasticsearch>=9,<10",
-        "pandas>=1.5,<2",
+        "pandas>=1.5,<3",
        "matplotlib>=3.6",
        "numpy>=1.2.0,<2",
        "packaging",
@ -90,7 +95,7 @@ setup(
    entry_points={
        "console_scripts": "eland_import_hub_model=eland.cli.eland_import_hub_model:main"
    },
-    python_requires=">=3.8",
+    python_requires=">=3.9,<3.13",
    package_data={"eland": ["py.typed"]},
    include_package_data=True,
    zip_safe=False,
--- a/tests/init.py
+++ b/tests/init.py
@ -20,7 +20,7 @@ import os
 import pandas as pd
 from elasticsearch import Elasticsearch
-from eland.common import es_version
+from eland.common import es_version, is_serverless_es
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
@ -33,6 +33,7 @@ ELASTICSEARCH_HOST = os.environ.get(
 ES_TEST_CLIENT = Elasticsearch(ELASTICSEARCH_HOST)
 ES_VERSION = es_version(ES_TEST_CLIENT)
 ES_IS_SERVERLESS = is_serverless_es(ES_TEST_CLIENT)
 FLIGHTS_INDEX_NAME = "flights"
 FLIGHTS_MAPPING = {
@ -43,7 +44,7 @@ FLIGHTS_MAPPING = {
            "Carrier": {"type": "keyword"},
            "Dest": {"type": "keyword"},
            "DestAirportID": {"type": "keyword"},
-            "DestCityName": {"type": "keyword"},
+            "DestCityName": {"type": "keyword", "copy_to": "Cities"},
            "DestCountry": {"type": "keyword"},
            "DestLocation": {"type": "geo_point"},
            "DestRegion": {"type": "keyword"},
@ -58,11 +59,12 @@ FLIGHTS_MAPPING = {
            "FlightTimeMin": {"type": "float"},
            "Origin": {"type": "keyword"},
            "OriginAirportID": {"type": "keyword"},
-            "OriginCityName": {"type": "keyword"},
+            "OriginCityName": {"type": "keyword", "copy_to": "Cities"},
            "OriginCountry": {"type": "keyword"},
            "OriginLocation": {"type": "geo_point"},
            "OriginRegion": {"type": "keyword"},
            "OriginWeather": {"type": "keyword"},
            "Cities": {"type": "text"},
            "dayOfWeek": {"type": "byte"},
            "timestamp": {"type": "date", "format": "strict_date_hour_minute_second"},
        }
--- a/tests/common.py
+++ b/tests/common.py
@ -24,6 +24,7 @@ import pandas as pd
 from pandas.testing import assert_frame_equal, assert_series_equal
 import eland as ed
 from eland.common import PANDAS_VERSION
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
@ -45,7 +46,14 @@ with gzip.open(FLIGHTS_FILE_NAME) as f:
 _pd_flights = pd.DataFrame.from_records(flight_records).reindex(
    _ed_flights.columns, axis=1
 )
-_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
+if PANDAS_VERSION[0] >= 2:
    _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"], format="mixed")
 else:
    _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
 # Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list
 _pd_flights["Cities"] = _pd_flights.apply(
    lambda x: list(sorted([x["OriginCityName"], x["DestCityName"]])), axis=1
 )
 _pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int
 _pd_flights_small = _pd_flights.head(48)
@ -58,7 +66,7 @@ _pd_ecommerce["products.created_on"] = _pd_ecommerce["products.created_on"].appl
 )
 _pd_ecommerce.insert(2, "customer_birth_date", None)
 _pd_ecommerce.index = _pd_ecommerce.index.map(str)  # make index 'object' not int
-_pd_ecommerce["customer_birth_date"].astype("datetime64")
+_pd_ecommerce["customer_birth_date"].astype("datetime64[ns]")
 _ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -77,7 +77,16 @@ class SymmetricAPIChecker:
                pd_exc = e
            self.check_exception(ed_exc, pd_exc)
-            self.check_values(ed_obj, pd_obj)
+            try:
                self.check_values(ed_obj, pd_obj)
            except AssertionError as e:
                # This is an attribute we allow to differ when comparing zero-length objects
                if (
                    'Attribute "inferred_type" are different' in repr(e)
                    and len(ed_obj) == 0
                    and len(pd_obj) == 0
                ):
                    self.check_values(ed_obj, pd_obj, check_index_type=False)
            if isinstance(ed_obj, (ed.DataFrame, ed.Series)):
                return SymmetricAPIChecker(ed_obj, pd_obj)
@ -85,16 +94,16 @@ class SymmetricAPIChecker:
        return f
-    def check_values(self, ed_obj, pd_obj):
+    def check_values(self, ed_obj, pd_obj, **kwargs):
        """Checks that any two values coming from eland and pandas are equal"""
        if isinstance(ed_obj, ed.DataFrame):
-            assert_pandas_eland_frame_equal(pd_obj, ed_obj)
+            assert_pandas_eland_frame_equal(pd_obj, ed_obj, **kwargs)
        elif isinstance(ed_obj, ed.Series):
-            assert_pandas_eland_series_equal(pd_obj, ed_obj)
+            assert_pandas_eland_series_equal(pd_obj, ed_obj, **kwargs)
        elif isinstance(ed_obj, pd.DataFrame):
-            assert_frame_equal(ed_obj, pd_obj)
+            assert_frame_equal(ed_obj, pd_obj, **kwargs)
        elif isinstance(ed_obj, pd.Series):
-            assert_series_equal(ed_obj, pd_obj)
+            assert_series_equal(ed_obj, pd_obj, **kwargs)
        elif isinstance(ed_obj, pd.Index):
            assert ed_obj.equals(pd_obj)
        else:
--- a/tests/dataframe/test_datetime_pytest.py
+++ b/tests/dataframe/test_datetime_pytest.py
@ -87,6 +87,8 @@ class TestDataFrameDateTime(TestData):
            },
            index=["0", "1", "2"],
        )
        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
        df["D"] = df["D"].astype("datetime64[ns]")
        expected_mappings = {
            "mappings": {
--- a/tests/dataframe/test_describe_pytest.py
+++ b/tests/dataframe/test_describe_pytest.py
@ -33,9 +33,17 @@ class TestDataFrameDescribe(TestData):
            ["Cancelled", "FlightDelay"], axis="columns"
        )
        # Pandas >= 2 calculates aggregations such as min and max for timestamps too
        # This could be implemented in eland, but as of yet this is not the case
        # We therefore remove it before the comparison
        if "timestamp" in pd_describe.columns:
            pd_describe = pd_describe.drop(["timestamp"], axis="columns")
        # Pandas >= 2 orders the aggregations differently than Pandas < 2
        # A sort_index is applied so tests will succeed in both environments
        assert_frame_equal(
-            pd_describe.drop(["25%", "50%", "75%"], axis="index"),
+            pd_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
-            ed_describe.drop(["25%", "50%", "75%"], axis="index"),
+            ed_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
            check_exact=False,
            rtol=True,
        )
--- a/tests/dataframe/test_dtypes_pytest.py
+++ b/tests/dataframe/test_dtypes_pytest.py
@ -43,6 +43,7 @@ class TestDataFrameDtypes:
                    "AvgTicketPrice": "float",
                    "Cancelled": "boolean",
                    "Carrier": "keyword",
                    "Cities": "text",
                    "Dest": "keyword",
                    "DestAirportID": "keyword",
                    "DestCityName": "keyword",
--- a/tests/dataframe/test_head_tail_pytest.py
+++ b/tests/dataframe/test_head_tail_pytest.py
@ -99,7 +99,7 @@ class TestDataFrameHeadTail(TestData):
        ed_head_0 = ed_flights.head(0)
        pd_head_0 = pd_flights.head(0)
-        assert_pandas_eland_frame_equal(pd_head_0, ed_head_0)
+        assert_pandas_eland_frame_equal(pd_head_0, ed_head_0, check_index_type=False)
    def test_doc_test_tail(self):
        df = self.ed_flights()
--- a/tests/dataframe/test_iterrows_itertuples_pytest.py
+++ b/tests/dataframe/test_iterrows_itertuples_pytest.py
@ -54,9 +54,13 @@ class TestDataFrameIterrowsItertuples(TestData):
            # Shim which uses pytest.approx() for floating point values inside tuples.
            assert len(left) == len(right)
            assert all(
-                (lt == rt)  # Not floats? Use ==
+                (
-                if not isinstance(lt, float) and not isinstance(rt, float)
+                    # Not floats? Use ==
-                else (lt == pytest.approx(rt))  # If both are floats use pytest.approx()
+                    (lt == rt)
                    if not isinstance(lt, float) and not isinstance(rt, float)
                    # If both are floats use pytest.approx()
                    else (lt == pytest.approx(rt))
                )
                for lt, rt in zip(left, right)
            )
--- a/tests/dataframe/test_metrics_pytest.py
+++ b/tests/dataframe/test_metrics_pytest.py
@ -22,6 +22,7 @@ import pandas as pd
 import pytest
 from pandas.testing import assert_frame_equal, assert_series_equal
 from eland.common import PANDAS_VERSION
 from tests.common import TestData, assert_almost_equal
@ -74,6 +75,8 @@ class TestDataFrameMetrics(TestData):
        logger.setLevel(logging.DEBUG)
        for func in self.extended_funcs:
            if PANDAS_VERSION[0] >= 2 and func == "mad":
                continue
            pd_metric = getattr(pd_flights, func)(
                **({"numeric_only": True} if func != "mad" else {})
            )
@ -92,6 +95,8 @@ class TestDataFrameMetrics(TestData):
        ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]
        for func in self.extended_funcs:
            if PANDAS_VERSION[0] >= 2 and func == "mad":
                continue
            pd_metric = getattr(pd_flights_1, func)()
            ed_metric = getattr(ed_flights_1, func)(numeric_only=False)
@ -102,6 +107,8 @@ class TestDataFrameMetrics(TestData):
        ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
        for func in self.extended_funcs:
            if PANDAS_VERSION[0] >= 2 and func == "mad":
                continue
            pd_metric = getattr(pd_flights_0, func)()
            ed_metric = getattr(ed_flights_0, func)(numeric_only=False)
@ -491,8 +498,13 @@ class TestDataFrameMetrics(TestData):
            ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
        )
-        pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
+        if PANDAS_VERSION[0] == 1:
-        ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
+            pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
            ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
        else:  # numeric_only is no longer available for pandas > 2
            pd_quantile = pd_flights.agg(["quantile", "min"])
            ed_quantile = ed_flights.agg(["quantile", "min"])
        assert_frame_equal(
            pd_quantile, ed_quantile, check_exact=False, rtol=4, check_dtype=False
--- a/tests/dataframe/test_to_csv_pytest.py
+++ b/tests/dataframe/test_to_csv_pytest.py
@ -15,7 +15,7 @@
 #  specific language governing permissions and limitations
 #  under the License.
-# File called _pytest for PyCharm compatability
+# File called _pytest for PyCharm compatibility
 import ast
 import time
@ -41,8 +41,9 @@ class TestDataFrameToCSV(TestData):
            results_file,
            index_col=0,
            converters={
-                "DestLocation": lambda x: ast.literal_eval(x),
+                "DestLocation": ast.literal_eval,
-                "OriginLocation": lambda x: ast.literal_eval(x),
+                "OriginLocation": ast.literal_eval,
                "Cities": ast.literal_eval,
            },
        )
        pd_from_csv.index = pd_from_csv.index.map(str)
@ -63,8 +64,9 @@ class TestDataFrameToCSV(TestData):
            results_file,
            index_col=0,
            converters={
-                "DestLocation": lambda x: ast.literal_eval(x),
+                "DestLocation": ast.literal_eval,
-                "OriginLocation": lambda x: ast.literal_eval(x),
+                "OriginLocation": ast.literal_eval,
                "Cities": ast.literal_eval,
            },
        )
        pd_from_csv.index = pd_from_csv.index.map(str)
@ -112,8 +114,9 @@ class TestDataFrameToCSV(TestData):
            results,
            index_col=0,
            converters={
-                "DestLocation": lambda x: ast.literal_eval(x),
+                "DestLocation": ast.literal_eval,
-                "OriginLocation": lambda x: ast.literal_eval(x),
+                "OriginLocation": ast.literal_eval,
                "Cities": ast.literal_eval,
            },
        )
        pd_from_csv.index = pd_from_csv.index.map(str)
--- a/tests/dataframe/test_to_json_pytest.py
+++ b/tests/dataframe/test_to_json_pytest.py
@ -0,0 +1,139 @@
 #  Licensed to Elasticsearch B.V. under one or more contributor
 #  license agreements. See the NOTICE file distributed with
 #  this work for additional information regarding copyright
 #  ownership. Elasticsearch B.V. licenses this file to you under
 #  the Apache License, Version 2.0 (the "License"); you may
 #  not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 # 	http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
 # File called _pytest for PyCharm compatibility
 from io import StringIO
 from pathlib import Path
 import pandas
 from pandas.testing import assert_frame_equal
 from tests.common import ROOT_DIR, TestData
 class TestDataFrameToJSON(TestData):
    """Round-trip tests for ``to_json`` on the eland flights frame.
    Each test writes the eland and the pandas flights frames with the same
    ``to_json`` arguments, reads both outputs back with ``pandas.read_json``
    and asserts that the two resulting frames are equal.
    """
    def test_to_json_default_arguments(self):
        """Write to string paths using the default ``to_json`` arguments."""
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        ed_flights.to_json(ROOT_DIR + "/dataframe/results/eland_to_json.jsonl")
        pd_flights.to_json(ROOT_DIR + "/dataframe/results/pandas_to_json.jsonl")
        assert_frame_equal(
            pandas.read_json(ROOT_DIR + "/dataframe/results/eland_to_json.jsonl"),
            pandas.read_json(ROOT_DIR + "/dataframe/results/pandas_to_json.jsonl"),
        )
    def test_to_json_streaming_mode(self):
        """Write line-delimited records (``lines=True``) to string paths."""
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        ed_flights.to_json(
            ROOT_DIR + "/dataframe/results/streaming_eland_to_json.jsonl",
            lines=True,
            orient="records",
        )
        pd_flights.to_json(
            ROOT_DIR + "/dataframe/results/streaming_pandas_to_json.jsonl",
            lines=True,
            orient="records",
        )
        assert_frame_equal(
            pandas.read_json(
                ROOT_DIR + "/dataframe/results/streaming_eland_to_json.jsonl",
                lines=True,
                orient="records",
            ),
            pandas.read_json(
                ROOT_DIR + "/dataframe/results/streaming_pandas_to_json.jsonl",
                lines=True,
                orient="records",
            ),
        )
    def test_to_json_streaming_mode_pathlib(self):
        """Write line-delimited records to ``pathlib.Path`` destinations."""
        root_dir = Path(ROOT_DIR)
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        ed_flights.to_json(
            root_dir / "dataframe" / "results" / "pathlib_eland_to_json.jsonl",
            lines=True,
            orient="records",
        )
        pd_flights.to_json(
            root_dir / "dataframe" / "results" / "pathlib_pandas_to_json.jsonl",
            lines=True,
            orient="records",
        )
        assert_frame_equal(
            pandas.read_json(
                root_dir / "dataframe" / "results" / "pathlib_eland_to_json.jsonl",
                lines=True,
                orient="records",
            ),
            pandas.read_json(
                root_dir / "dataframe" / "results" / "pathlib_pandas_to_json.jsonl",
                lines=True,
                orient="records",
            ),
        )
    def test_to_json_with_other_buffer(self):
        """Write line-delimited records into an in-memory ``StringIO`` buffer."""
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        output_buffer = StringIO()
        ed_flights.to_json(output_buffer, lines=True, orient="records")
        # Without a destination, pandas returns the serialised text instead
        output_string = pd_flights.to_json(lines=True, orient="records")
        output_buffer.seek(0)  # rewind our StringIO object
        assert_frame_equal(
            pandas.read_json(output_buffer, lines=True, orient="records"),
            pandas.read_json(
                StringIO(output_string),
                lines=True,
                orient="records",
            ),
        )
    def test_to_json_with_file_handle(self):
        """Write through an already-open text file handle using default arguments."""
        results_dir = Path(ROOT_DIR) / "dataframe" / "results"
        eland_path = results_dir / "fh_eland_to_json.jsonl"
        pandas_path = results_dir / "check_pandas_to_json.jsonl"
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        with open(eland_path, "w") as w:
            ed_flights.to_json(w)
        pd_flights.to_json(pandas_path)
        # Both files were written with the default (not line-delimited)
        # layout, so they are read back with the default arguments as well.
        # Reading them with lines=True / orient="records" would treat each
        # file as line-delimited records rather than the layout it was
        # written in.
        assert_frame_equal(
            pandas.read_json(eland_path),
            pandas.read_json(pandas_path),
        )
--- a/tests/dataframe/test_utils_pytest.py
+++ b/tests/dataframe/test_utils_pytest.py
@ -69,6 +69,12 @@ class TestDataFrameUtils(TestData):
        )
        ed_df_head = ed_df.head()
        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
        df["D"] = df["D"].astype("datetime64[ns]")
        df["H"] = (
            df["H"].dt.tz_localize(None).astype("datetime64[ns]").dt.tz_localize("UTC")
        )
        assert_pandas_eland_frame_equal(df, ed_df_head)
        ES_TEST_CLIENT.indices.delete(index=index_name)
--- a/tests/ml/pytorch/test_pytorch_model_config_pytest.py
+++ b/tests/ml/pytorch/test_pytorch_model_config_pytest.py
@ -39,6 +39,7 @@ try:
    from eland.ml.pytorch import (
        FillMaskInferenceOptions,
        NlpBertTokenizationConfig,
        NlpDebertaV2TokenizationConfig,
        NlpMPNetTokenizationConfig,
        NlpRobertaTokenizationConfig,
        NlpXLMRobertaTokenizationConfig,
@ -57,10 +58,6 @@ except ImportError:
 from tests import ES_VERSION
 pytestmark = [
    pytest.mark.skipif(
        ES_VERSION < (8, 7, 0),
        reason="Eland uses Pytorch 1.13.1, versions of Elasticsearch prior to 8.7.0 are incompatible with PyTorch 1.13.1",
    ),
    pytest.mark.skipif(
        not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
    ),
@ -149,13 +146,20 @@ if HAS_PYTORCH and HAS_SKLEARN and HAS_TRANSFORMERS:
            1024,
            None,
        ),
        (
            "microsoft/deberta-v3-xsmall",
            "fill_mask",
            FillMaskInferenceOptions,
            NlpDebertaV2TokenizationConfig,
            512,
            None,
        ),
    ]
 else:
    MODEL_CONFIGURATIONS = []
 class TestModelConfguration:
    @pytest.mark.skip(reason="https://github.com/elastic/eland/issues/633")
    @pytest.mark.parametrize(
        "model_id,task_type,config_type,tokenizer_type,max_sequence_len,embedding_size",
        MODEL_CONFIGURATIONS,
@ -209,6 +213,9 @@ class TestModelConfguration:
                assert isinstance(config.inference_config.classification_labels, list)
                assert len(config.inference_config.classification_labels) > 0
            if task_type == "text_similarity":
                assert tokenization.truncate == "second"
            del tm
    def test_model_config_with_prefix_string(self):
@ -235,3 +242,16 @@ class TestModelConfguration:
                ingest_prefix="INGEST:",
                search_prefix="SEARCH:",
            )
    def test_model_config_with_user_specified_input_length(self):
        """A user-supplied ``max_model_input_size`` must show up as the saved max sequence length."""
        requested_input_size = 213
        with tempfile.TemporaryDirectory() as export_dir:
            transformer_model = TransformerModel(
                model_id="sentence-transformers/all-distilroberta-v1",
                task_type="text_embedding",
                es_version=(8, 13, 0),
                quantize=False,
                max_model_input_size=requested_input_size,
            )
            # save() returns a 3-tuple; only the model config (middle item) is checked
            _, model_config, _ = transformer_model.save(export_dir)
            saved_tokenization = model_config.inference_config.tokenization
            assert saved_tokenization.max_sequence_length == requested_input_size
--- a/tests/ml/pytorch/test_pytorch_model_upload_pytest.py
+++ b/tests/ml/pytorch/test_pytorch_model_upload_pytest.py
@ -38,10 +38,6 @@ except ImportError:
 from tests import ES_TEST_CLIENT, ES_VERSION
 pytestmark = [
    pytest.mark.skipif(
        ES_VERSION < (8, 7, 0),
        reason="Eland uses Pytorch 1.13.1, versions of Elasticsearch prior to 8.7.0 are incompatible with PyTorch 1.13.1",
    ),
    pytest.mark.skipif(
        not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
    ),
@ -67,6 +63,10 @@ TEXT_EMBEDDING_MODELS = [
    )
 ]
 TEXT_SIMILARITY_MODELS = ["mixedbread-ai/mxbai-rerank-xsmall-v1"]
 TEXT_EXPANSION_MODELS = ["naver/splade-v3-distilbert"]
@pytest.fixture(scope="function", autouse=True)
 def setup_and_tear_down():
@ -135,3 +135,44 @@ class TestPytorchModel:
                    )
                    > 0
                )
    @pytest.mark.skipif(
        ES_VERSION < (8, 16, 0), reason="requires 8.16.0 for DeBERTa models"
    )
    @pytest.mark.parametrize("model_id", TEXT_SIMILARITY_MODELS)
    def test_text_similarity(self, model_id):
        """Deploy a text similarity model and check the sign of the score of each document.
        The query text is "France": the first document (about the Amazon) is
        expected to get a negative score, the second (about Paris) a positive one.
        """
        documents = [
            {
                "text_field": "The Amazon rainforest covers most of the Amazon basin in South America"
            },
            {"text_field": "Paris is the capital of France"},
        ]
        with tempfile.TemporaryDirectory() as model_dir:
            deployment = download_model_and_start_deployment(
                model_dir, False, model_id, "text_similarity"
            )
            response = deployment.infer(
                docs=documents,
                inference_config={"text_similarity": {"text": "France"}},
            )
            inference_results = response.body["inference_results"]
            assert inference_results[0]["predicted_value"] < 0
            assert inference_results[1]["predicted_value"] > 0
    @pytest.mark.skipif(ES_VERSION < (9, 0, 0), reason="requires current major version")
    @pytest.mark.parametrize("model_id", TEXT_EXPANSION_MODELS)
    def test_text_expansion(self, model_id):
        """Deploy a text expansion model and check that both documents get a non-empty prediction."""
        documents = [
            {
                "text_field": "The Amazon rainforest covers most of the Amazon basin in South America"
            },
            {"text_field": "Paris is the capital of France"},
        ]
        with tempfile.TemporaryDirectory() as model_dir:
            deployment = download_model_and_start_deployment(
                model_dir, False, model_id, "text_expansion"
            )
            response = deployment.infer(docs=documents)
            inference_results = response.body["inference_results"]
            # Checked in document order, first document first
            for position in (0, 1):
                assert len(inference_results[position]["predicted_value"]) > 0
--- a/tests/ml/test_ml_model_pytest.py
+++ b/tests/ml/test_ml_model_pytest.py
@ -15,19 +15,18 @@
 #  specific language governing permissions and limitations
 #  under the License.
 from operator import itemgetter
 from typing import Tuple
 import numpy as np
 import pytest
 import eland as ed
 from eland.ml import MLModel
-from eland.ml.ltr import LTRModelConfig, QueryFeatureExtractor
+from eland.ml.ltr import FeatureLogger, LTRModelConfig, QueryFeatureExtractor
 from eland.ml.transformers import get_model_transformer
 from tests import (
    ES_IS_SERVERLESS,
    ES_TEST_CLIENT,
    ES_VERSION,
    FLIGHTS_SMALL_INDEX_NAME,
    NATIONAL_PARKS_INDEX_NAME,
 )
@ -54,26 +53,16 @@ try:
 except ImportError:
    HAS_LIGHTGBM = False
 try:
    import shap
    HAS_SHAP = True
 except ImportError:
    HAS_SHAP = False
 requires_sklearn = pytest.mark.skipif(
-    not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run."
+    not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
 )
 requires_xgboost = pytest.mark.skipif(
-    not HAS_XGBOOST, reason="This test requires 'xgboost' package to run."
+    not HAS_XGBOOST, reason="This test requires 'xgboost' package to run"
 )
 requires_shap = pytest.mark.skipif(
    not HAS_SHAP, reason="This tests requries 'shap' package to run."
 )
 requires_no_ml_extras = pytest.mark.skipif(
    HAS_SKLEARN or HAS_XGBOOST,
-    reason="This test requires 'scikit-learn' and 'xgboost' to not be installed.",
+    reason="This test requires 'scikit-learn' and 'xgboost' to not be installed",
 )
 requires_lightgbm = pytest.mark.skipif(
@ -107,100 +96,11 @@ def check_prediction_equality(es_model: MLModel, py_model, test_data):
    np.testing.assert_almost_equal(test_results, es_results, decimal=2)
-def yield_model_id(analysis, analyzed_fields):
+def randomize_model_id(prefix, suffix_size=10):
    import random
    import string
    import time
-    suffix = "".join(random.choices(string.ascii_lowercase, k=4))
+    return f"{prefix}-{''.join(random.choices(string.ascii_lowercase, k=suffix_size))}"
    job_id = "test-flights-regression-" + suffix
    dest = job_id + "-dest"
    response = ES_TEST_CLIENT.ml.put_data_frame_analytics(
        id=job_id,
        analysis=analysis,
        dest={"index": dest},
        source={"index": [FLIGHTS_SMALL_INDEX_NAME]},
        analyzed_fields=analyzed_fields,
    )
    assert response.meta.status == 200
    response = ES_TEST_CLIENT.ml.start_data_frame_analytics(id=job_id)
    assert response.meta.status == 200
    time.sleep(2)
    response = ES_TEST_CLIENT.ml.get_trained_models(model_id=job_id + "*")
    assert response.meta.status == 200
    assert response.body["count"] == 1
    model_id = response.body["trained_model_configs"][0]["model_id"]
    yield model_id
    ES_TEST_CLIENT.ml.delete_data_frame_analytics(id=job_id)
    ES_TEST_CLIENT.indices.delete(index=dest)
    ES_TEST_CLIENT.ml.delete_trained_model(model_id=model_id)
@pytest.fixture(params=[[0, 4], [0, 1], range(5)])
 def regression_model_id(request):
    analysis = {
        "regression": {
            "dependent_variable": "FlightDelayMin",
            "max_trees": 3,
            "num_top_feature_importance_values": 0,
            "max_optimization_rounds_per_hyperparameter": 1,
            "prediction_field_name": "FlightDelayMin_prediction",
            "training_percent": 30,
            "randomize_seed": 1000,
            "loss_function": "mse",
            "early_stopping_enabled": True,
        }
    }
    all_includes = [
        "FlightDelayMin",
        "FlightDelayType",
        "FlightTimeMin",
        "DistanceMiles",
        "OriginAirportID",
    ]
    includes = [all_includes[i] for i in request.param]
    analyzed_fields = {
        "includes": includes,
        "excludes": [],
    }
    yield from yield_model_id(analysis=analysis, analyzed_fields=analyzed_fields)
@pytest.fixture(params=[[0, 6], [5, 6], range(7)])
 def classification_model_id(request):
    analysis = {
        "classification": {
            "dependent_variable": "Cancelled",
            "max_trees": 5,
            "num_top_feature_importance_values": 0,
            "max_optimization_rounds_per_hyperparameter": 1,
            "prediction_field_name": "Cancelled_prediction",
            "training_percent": 50,
            "randomize_seed": 1000,
            "num_top_classes": -1,
            "class_assignment_objective": "maximize_accuracy",
            "early_stopping_enabled": True,
        }
    }
    all_includes = [
        "OriginWeather",
        "OriginAirportID",
        "DestCityName",
        "DestWeather",
        "DestRegion",
        "AvgTicketPrice",
        "Cancelled",
    ]
    includes = [all_includes[i] for i in request.param]
    analyzed_fields = {
        "includes": includes,
        "excludes": [],
    }
    yield from yield_model_id(analysis=analysis, analyzed_fields=analyzed_fields)
 class TestMLModel:
@ -320,17 +220,71 @@ class TestMLModel:
        # Clean up
        es_model.delete_model()
    def _normalize_ltr_score_from_XGBRanker(self, ranker, ltr_model_config, scores):
        """Shift expected scores the way the ES implementation of LTR would do.
        On the 8.x line from 8.19, on the 9.x line from 9.0.1, and on
        serverless, the minimum model score is subtracted from every score
        when that minimum is negative. Otherwise the scores are returned
        unchanged.
        Parameters
        ----------
        ranker : XGBRanker
            The XGBRanker model to retrieve the minimum score from.
        ltr_model_config : LTRModelConfig
            LTR model config; its feature names are passed to the transformer.
        scores : List[float]
            Scores to normalize.
        Returns
        -------
        scores : List[float]
            Normalized scores for the model.
        """
        if ES_VERSION[0] == 8 and ES_VERSION >= (8, 19):
            normalized_by_es = True
        elif ES_VERSION[0] == 9 and (
            ES_VERSION[1] >= 1 or (ES_VERSION[1] == 0 and ES_VERSION[2] >= 1)
        ):
            normalized_by_es = True
        else:
            normalized_by_es = ES_IS_SERVERLESS
        if not normalized_by_es:
            return scores
        # In 8.19+, 9.0.1 and 9.1, the scores are normalized if there are negative scores
        transformer = get_model_transformer(
            ranker, feature_names=ltr_model_config.feature_names
        )
        lowest_model_score, _ = transformer.transform().bounds()
        if lowest_model_score < 0:
            return [score - lowest_model_score for score in scores]
        return scores
    @requires_elasticsearch_version((8, 12))
-    @requires_sklearn
+    @requires_xgboost
    @pytest.mark.parametrize("compress_model_definition", [True, False])
-    def test_learning_to_rank(self, compress_model_definition):
+    @pytest.mark.parametrize(
-        # Train model
+        "objective",
-        training_data = datasets.make_regression(n_features=2)
+        ["rank:ndcg", "rank:map", "rank:pairwise"],
-        regressor = DecisionTreeRegressor()
+    )
-        regressor.fit(training_data[0], training_data[1])
+    def test_learning_to_rank(self, objective, compress_model_definition):
        X, y = datasets.make_classification(
            n_features=3, n_informative=2, n_redundant=1
        )
        rng = np.random.default_rng()
        qid = rng.integers(0, 3, size=X.shape[0])
        # Sort the inputs based on query index
        sorted_idx = np.argsort(qid)
        X = X[sorted_idx, :]
        y = y[sorted_idx]
        qid = qid[sorted_idx]
        ranker = XGBRanker(objective=objective)
        ranker.fit(X, y, qid=qid)
        # Serialise the models to Elasticsearch
-        model_id = "test_learning_to_rank"
+        model_id = randomize_model_id("test_learning_to_rank")
        ltr_model_config = LTRModelConfig(
            feature_extractors=[
                QueryFeatureExtractor(
@ -356,9 +310,8 @@ class TestMLModel:
        es_model = MLModel.import_ltr_model(
            ES_TEST_CLIENT,
            model_id,
-            regressor,
+            ranker,
            ltr_model_config,
            es_if_exists="replace",
            es_compress_model_definition=compress_model_definition,
        )
@ -366,9 +319,19 @@ class TestMLModel:
        response = ES_TEST_CLIENT.ml.get_trained_models(model_id=model_id)
        assert response.meta.status == 200
        assert response.body["count"] == 1
-        saved_inference_config = response.body["trained_model_configs"][0][
+
-            "inference_config"
+        saved_trained_model_config = response.body["trained_model_configs"][0]
-        ]
+
        assert "input" in saved_trained_model_config
        assert "field_names" in saved_trained_model_config["input"]
        if not ES_IS_SERVERLESS and ES_VERSION < (8, 15):
            assert len(saved_trained_model_config["input"]["field_names"]) == 3
        else:
            assert not len(saved_trained_model_config["input"]["field_names"])
        saved_inference_config = saved_trained_model_config["inference_config"]
        assert "learning_to_rank" in saved_inference_config
        assert "feature_extractors" in saved_inference_config["learning_to_rank"]
        saved_feature_extractors = saved_inference_config["learning_to_rank"][
@ -388,16 +351,32 @@ class TestMLModel:
                "learning_to_rank": {
                    "model_id": model_id,
                    "params": {"query_string": "yosemite"},
-                }
+                },
                "window_size": 2,
            },
        )
-        # Assert that:
+        # Assert that rescored search results match prediction.
        # - all documents from the query are present
        # - all documents have been rescored (score != 1.0)
        doc_scores = [hit["_score"] for hit in search_result["hits"]["hits"]]
-        assert len(search_result["hits"]["hits"]) == 2
+
-        assert all(score != float(1) for score in doc_scores)
+        feature_logger = FeatureLogger(
            ES_TEST_CLIENT, NATIONAL_PARKS_INDEX_NAME, ltr_model_config
        )
        expected_scores = sorted(
            [
                ranker.predict(np.asarray([doc_features]))[0]
                for _, doc_features in feature_logger.extract_features(
                    {"query_string": "yosemite"}, ["park_yosemite", "park_everglades"]
                ).items()
            ],
            reverse=True,
        )
        expected_scores = self._normalize_ltr_score_from_XGBRanker(
            ranker, ltr_model_config, expected_scores
        )
        np.testing.assert_almost_equal(expected_scores, doc_scores, decimal=2)
        # Verify prediction is not supported for LTR
        try:
@ -406,6 +385,9 @@ class TestMLModel:
            pass
        # Clean up
        ES_TEST_CLIENT.cluster.health(
            index=".ml-*", wait_for_active_shards="all"
        )  # Added to prevent flakiness in the test
        es_model.delete_model()
    @requires_sklearn
@ -434,6 +416,7 @@ class TestMLModel:
        )
        # Clean up
        es_model.delete_model()
    @requires_sklearn
@ -744,172 +727,3 @@ class TestMLModel:
        # Clean up
        es_model.delete_model()
    @requires_sklearn
    @requires_shap
    def test_export_regressor(self, regression_model_id):
        """Export a regression model and compare its predictions with Elasticsearch.
        The exported pipeline's predictions must match the predictions returned
        by ``infer_trained_model``, and the SHAP values of the exported model
        plus the explainer's expected value must add up to those predictions.
        """
        ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME).head(10)
        # Cast the pandas copy to the dtypes reported by the eland frame
        types = dict(ed_flights.dtypes)
        X = ed_flights.to_pandas().astype(types)
        model = MLModel(es_client=ES_TEST_CLIENT, model_id=regression_model_id)
        pipeline = model.export_model()
        pipeline.fit(X)
        predictions_sklearn = pipeline.predict(
            X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
        )
        # Run the same rows through the model stored in Elasticsearch
        response = ES_TEST_CLIENT.ml.infer_trained_model(
            model_id=regression_model_id,
            docs=X[pipeline["es_model"].input_field_names].to_dict("records"),
        )
        predictions_es = np.array(
            list(
                map(
                    itemgetter("FlightDelayMin_prediction"),
                    response.body["inference_results"],
                )
            )
        )
        np.testing.assert_array_almost_equal(predictions_sklearn, predictions_es)
        import pandas as pd
        # Rebuild a labelled frame from the preprocessor output for the explainer
        X_transformed = pipeline["preprocessor"].transform(X=X)
        X_transformed = pd.DataFrame(
            X_transformed, columns=pipeline["preprocessor"].get_feature_names_out()
        )
        explainer = shap.TreeExplainer(pipeline["es_model"])
        shap_values = explainer.shap_values(
            X_transformed[pipeline["es_model"].feature_names_in_]
        )
        # Per-row SHAP contributions plus the expected value should give the prediction
        np.testing.assert_array_almost_equal(
            predictions_sklearn, shap_values.sum(axis=1) + explainer.expected_value
        )
    @requires_sklearn
    @requires_shap
    def test_export_classification(self, classification_model_id):
        """Export a classification model and compare its output with Elasticsearch.
        The exported pipeline's predicted classes and top-class probabilities
        must match those returned by ``infer_trained_model``, and the
        probabilities derived from the SHAP values must match as well.
        The test uses ``shap.TreeExplainer``, so it is skipped through
        ``requires_shap`` when the 'shap' package is not installed, like
        ``test_export_regressor``.
        """
        ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME).head(10)
        X = ed.eland_to_pandas(ed_flights)
        model = MLModel(es_client=ES_TEST_CLIENT, model_id=classification_model_id)
        pipeline = model.export_model()
        pipeline.fit(X)
        predictions_sklearn = pipeline.predict(
            X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
        )
        # Probability of the predicted (most likely) class for every row
        prediction_proba_sklearn = pipeline.predict_proba(
            X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
        ).max(axis=1)
        # Run the same rows through the model stored in Elasticsearch
        response = ES_TEST_CLIENT.ml.infer_trained_model(
            model_id=classification_model_id,
            docs=X[pipeline["es_model"].input_field_names].to_dict("records"),
        )
        # The ES prediction is converted to an integer string for the comparison
        predictions_es = np.array(
            list(
                map(
                    lambda x: str(int(x["Cancelled_prediction"])),
                    response.body["inference_results"],
                )
            )
        )
        prediction_proba_es = np.array(
            list(
                map(
                    itemgetter("prediction_probability"),
                    response.body["inference_results"],
                )
            )
        )
        np.testing.assert_array_almost_equal(
            prediction_proba_sklearn, prediction_proba_es
        )
        np.testing.assert_array_equal(predictions_sklearn, predictions_es)
        import pandas as pd
        # Rebuild a labelled frame from the preprocessor output for the explainer
        X_transformed = pipeline["preprocessor"].transform(X=X)
        X_transformed = pd.DataFrame(
            X_transformed, columns=pipeline["preprocessor"].get_feature_names_out()
        )
        explainer = shap.TreeExplainer(pipeline["es_model"])
        shap_values = explainer.shap_values(
            X_transformed[pipeline["es_model"].feature_names_in_]
        )
        # Sum of SHAP values plus expected value is treated as log-odds and
        # mapped through the logistic function
        log_odds = shap_values.sum(axis=1) + explainer.expected_value
        prediction_proba_shap = 1 / (1 + np.exp(-log_odds))
        # use probability of the predicted class
        prediction_proba_shap[prediction_proba_shap < 0.5] = (
            1 - prediction_proba_shap[prediction_proba_shap < 0.5]
        )
        np.testing.assert_array_almost_equal(
            prediction_proba_sklearn, prediction_proba_shap
        )
    @requires_xgboost
    @requires_sklearn
    @pytest.mark.parametrize("objective", ["binary:logistic", "reg:squarederror"])
    def test_xgb_import_export(self, objective):
        """Import an XGBoost model and check that exporting it raises ``ValueError``."""
        # Choose the data generator and estimator class matching the objective
        is_binary = objective.startswith("binary:")
        make_dataset = (
            datasets.make_classification if is_binary else datasets.make_regression
        )
        estimator_cls = XGBClassifier if is_binary else XGBRegressor
        features, target = make_dataset(n_features=5)
        xgb_model = estimator_cls(
            booster="gbtree", objective=objective, use_label_encoder=False
        )
        # Train model
        xgb_model.fit(features, target)
        # Serialise the model to Elasticsearch
        es_model = MLModel.import_model(
            ES_TEST_CLIENT,
            "test_xgb_model",
            xgb_model,
            ["feature0", "feature1", "feature2", "feature3", "feature4"],
            es_if_exists="replace",
        )
        # Export is expected to fail
        with pytest.raises(ValueError) as export_error:
            es_model.export_model()
        assert export_error.match("Error initializing sklearn classifier.")
        # Clean up
        es_model.delete_model()
    @requires_lightgbm
    @pytest.mark.parametrize("objective", ["regression", "binary"])
    def test_lgbm_import_export(self, objective):
        """Import a LightGBM model and check that exporting it raises ``ValueError``."""
        # Choose the data generator and estimator class matching the objective
        is_binary = objective == "binary"
        make_dataset = (
            datasets.make_classification if is_binary else datasets.make_regression
        )
        estimator_cls = LGBMClassifier if is_binary else LGBMRegressor
        features, target = make_dataset(n_features=5)
        lgbm_model = estimator_cls(boosting_type="gbdt", objective=objective)
        # Train model
        lgbm_model.fit(features, target)
        # Serialise the model to Elasticsearch
        es_model = MLModel.import_model(
            ES_TEST_CLIENT,
            "test_lgbm_model",
            lgbm_model,
            ["feature0", "feature1", "feature2", "feature3", "feature4"],
            es_if_exists="replace",
        )
        # Export is expected to fail
        with pytest.raises(ValueError) as export_error:
            es_model.export_model()
        assert export_error.match("Error initializing sklearn classifier.")
        # Clean up
        es_model.delete_model()
--- a/tests/notebook/test_demo_notebook.ipynb
+++ b/tests/notebook/test_demo_notebook.ipynb
--- a/tests/notebook/test_etl.ipynb
+++ b/tests/notebook/test_etl.ipynb
@ -19,7 +19,7 @@
    {
     "data": {
      "text/plain": [
-       "False"
+       "HeadApiResponse(False)"
      ]
     },
     "execution_count": 2,
@ -43,8 +43,8 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "2021-03-30 11:57:39.116425: read 10000 rows\n",
+      "2024-05-21 09:07:17.882569: read 10000 rows\n",
-      "2021-03-30 11:57:39.522722: read 13059 rows\n"
+      "2024-05-21 09:07:18.375305: read 13059 rows\n"
     ]
    }
   ],
@ -78,6 +78,18 @@
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/codespace/.python/current/lib/python3.10/site-packages/eland/etl.py:529: FutureWarning: the 'mangle_dupe_cols' keyword is deprecated and will be removed in a future version. Please take steps to stop the use of 'mangle_dupe_cols'\n",
      "  reader = pd.read_csv(filepath_or_buffer, **kwargs)\n",
      "/home/codespace/.python/current/lib/python3.10/site-packages/eland/etl.py:529: FutureWarning: The squeeze argument has been deprecated and will be removed in a future version. Append .squeeze(\"columns\") to the call to squeeze.\n",
      "\n",
      "\n",
      "  reader = pd.read_csv(filepath_or_buffer, **kwargs)\n"
     ]
    },
    {
     "data": {
      "text/html": [
@ -218,35 +230,7 @@
    {
     "data": {
      "text/plain": [
-       "{'took': 0,\n",
+       "ObjectApiResponse({'took': 0, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'churn', '_id': '0', '_score': 1.0, '_source': {'state': 'KS', 'account length': 128, 'area code': 415, 'phone number': '382-4657', 'international plan': 'no', 'voice mail plan': 'yes', 'number vmail messages': 25, 'total day minutes': 265.1, 'total day calls': 110, 'total day charge': 45.07, 'total eve minutes': 197.4, 'total eve calls': 99, 'total eve charge': 16.78, 'total night minutes': 244.7, 'total night calls': 91, 'total night charge': 11.01, 'total intl minutes': 10.0, 'total intl calls': 3, 'total intl charge': 2.7, 'customer service calls': 1, 'churn': 0}}]}})"
       " 'timed_out': False,\n",
       " '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},\n",
       " 'hits': {'total': {'value': 2, 'relation': 'eq'},\n",
       "  'max_score': 1.0,\n",
       "  'hits': [{'_index': 'churn',\n",
       "    '_id': '0',\n",
       "    '_score': 1.0,\n",
       "    '_source': {'state': 'KS',\n",
       "     'account length': 128,\n",
       "     'area code': 415,\n",
       "     'phone number': '382-4657',\n",
       "     'international plan': 'no',\n",
       "     'voice mail plan': 'yes',\n",
       "     'number vmail messages': 25,\n",
       "     'total day minutes': 265.1,\n",
       "     'total day calls': 110,\n",
       "     'total day charge': 45.07,\n",
       "     'total eve minutes': 197.4,\n",
       "     'total eve calls': 99,\n",
       "     'total eve charge': 16.78,\n",
       "     'total night minutes': 244.7,\n",
       "     'total night calls': 91,\n",
       "     'total night charge': 11.01,\n",
       "     'total intl minutes': 10.0,\n",
       "     'total intl calls': 3,\n",
       "     'total intl charge': 2.7,\n",
       "     'customer service calls': 1,\n",
       "     'churn': 0}}]}}"
      ]
     },
     "execution_count": 6,
@ -267,7 +251,7 @@
    {
     "data": {
      "text/plain": [
-       "{'acknowledged': True}"
+       "ObjectApiResponse({'acknowledged': True})"
      ]
     },
     "execution_count": 7,
@ -297,7 +281,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.10.13"
  }
 },
 "nbformat": 4,
--- a/tests/notebook/test_metrics.ipynb
+++ b/tests/notebook/test_metrics.ipynb
@ -33,10 +33,10 @@
    {
     "data": {
      "text/plain": [
-       "AvgTicketPrice                       640.387285\n",
+       "AvgTicketPrice                       639.433214\n",
       "Cancelled                                 False\n",
-       "dayOfWeek                                     3\n",
+       "dayOfWeek                                     2\n",
-       "timestamp         2018-01-21 23:43:19.256498944\n",
+       "timestamp         2018-01-21 20:23:15.159835648\n",
       "dtype: object"
      ]
     },
@ -58,9 +58,9 @@
    {
     "data": {
      "text/plain": [
-       "AvgTicketPrice    640.387285\n",
+       "AvgTicketPrice    639.433214\n",
       "Cancelled           0.000000\n",
-       "dayOfWeek           3.000000\n",
+       "dayOfWeek           2.935777\n",
       "dtype: float64"
      ]
     },
@ -82,10 +82,10 @@
    {
     "data": {
      "text/plain": [
-       "AvgTicketPrice                       640.387285\n",
+       "AvgTicketPrice                       639.433214\n",
       "Cancelled                                 False\n",
-       "dayOfWeek                                     3\n",
+       "dayOfWeek                                     2\n",
-       "timestamp         2018-01-21 23:43:19.256498944\n",
+       "timestamp         2018-01-21 20:23:15.159835648\n",
       "DestCountry                                 NaN\n",
       "dtype: object"
      ]
@ -108,7 +108,7 @@
    {
     "data": {
      "text/plain": [
-       "AvgTicketPrice    213.430365\n",
+       "AvgTicketPrice    213.453156\n",
       "dayOfWeek           2.000000\n",
       "dtype: float64"
      ]
@ -131,7 +131,7 @@
    {
     "data": {
      "text/plain": [
-       "AvgTicketPrice    213.430365\n",
+       "AvgTicketPrice    213.453156\n",
       "dayOfWeek           2.000000\n",
       "dtype: float64"
      ]
@ -154,7 +154,7 @@
    {
     "data": {
      "text/plain": [
-       "AvgTicketPrice    213.430365\n",
+       "AvgTicketPrice    213.453156\n",
       "Cancelled                NaN\n",
       "dayOfWeek                2.0\n",
       "timestamp                NaT\n",
@ -189,7 +189,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.10.13"
  }
 },
 "nbformat": 4,
--- a/tests/notebook/test_plotting.ipynb
+++ b/tests/notebook/test_plotting.ipynb
--- a/tests/series/test_filter_pytest.py
+++ b/tests/series/test_filter_pytest.py
@ -58,7 +58,9 @@ class TestSeriesFilter(TestData):
        ed_ser = ed_flights_small.filter(items=items, axis=0)
        pd_ser = pd_flights_small.filter(items=items, axis=0)
-        assert_pandas_eland_series_equal(pd_ser, ed_ser)
+        # For an empty Series, eland will say the datatype it knows from the Elastic index
        # Pandas however will state empty as the datatype
        assert_pandas_eland_series_equal(pd_ser, ed_ser, check_index_type=False)
    def test_flights_filter_index_like_and_regex(self):
        ed_flights_small = self.ed_flights_small()["FlightDelayType"]
--- a/tests/series/test_metrics_pytest.py
+++ b/tests/series/test_metrics_pytest.py
@ -24,6 +24,7 @@ import pandas as pd
 import pytest
 from pandas.testing import assert_series_equal
 from eland.common import PANDAS_VERSION
 from tests.common import TestData, assert_almost_equal
@ -42,6 +43,8 @@ class TestSeriesMetrics(TestData):
        ed_flights = self.ed_flights()["AvgTicketPrice"]
        for func in self.all_funcs:
            if PANDAS_VERSION[0] >= 2 and func == "mad":
                continue
            pd_metric = getattr(pd_flights, func)()
            ed_metric = getattr(ed_flights, func)()
@ -87,6 +90,8 @@ class TestSeriesMetrics(TestData):
            ed_ecommerce = self.ed_ecommerce()[column]
            for func in self.all_funcs:
                if PANDAS_VERSION[0] >= 2 and func == "mad":
                    continue
                pd_metric = getattr(pd_ecommerce, func)()
                ed_metric = getattr(ed_ecommerce, func)(
                    **({"numeric_only": True} if (func != "nunique") else {})
--- a/utils/license-headers.py
+++ b/utils/license-headers.py
@ -65,7 +65,7 @@ def find_files_to_fix(sources: List[str]) -> Iterator[str]:
 def does_file_need_fix(filepath: str) -> bool:
    if not filepath.endswith(".py"):
        return False
-    with open(filepath, mode="r") as f:
+    with open(filepath) as f:
        first_license_line = None
        for line in f:
            if line == license_header_lines[0]:
@ -82,7 +82,7 @@ def does_file_need_fix(filepath: str) -> bool:
 def add_header_to_file(filepath: str) -> None:
-    with open(filepath, mode="r") as f:
+    with open(filepath) as f:
        lines = list(f)
    i = 0
    for i, line in enumerate(lines):
Author	SHA1	Message	Date
David Kyle	bebb9d52e5	Upgrade Sentence Transformers to v5 (#801 ) Sentence Transformers v5 adds support for sparse embedding models and is now necessary for importing sparse models such as https://huggingface.co/naver/splade-v3-distilbert.	2025-07-23 08:07:29 +01:00
Colleen McGinnis	117f61b010	add products to docset.yml (#797 )	2025-07-23 10:32:54 +04:00
Jan Calanog	cef4710695	docs-builder: add `pull-requests: write` permission to docs-build workflow (#800 )	2025-06-23 15:39:36 +04:00
Quentin Pradet	44ead02b05	Fix lint (#798 )	2025-06-05 15:52:19 +04:00
Miguel Grinberg	cb7c4fb122	Update README.md (#796 ) Update Pandas support to include v2	2025-05-16 15:56:20 +01:00
Quentin Pradet	9e8f164677	Release 9.0.1	2025-04-30 17:25:32 +04:00
Quentin Pradet	3c3ffd7403	Forbid Elasticsearch 8 client or server (#780 )	2025-04-30 16:25:33 +04:00
David Kyle	f5c2dcfc9d	Remove version checks in test (#792 )	2025-04-30 16:24:05 +04:00
David Kyle	878cde6126	Upgrade PyTorch to 2.5.1 (#785 ) PyTorch was upgraded to 2.5.1 in ml-cpp on the 8.18 and 9.0 branches in elastic/ml-cpp#2800	2025-04-30 10:57:45 +01:00
Mark J. Hoy	ec45c395fd	add 9.0.1 for LTR rescoring (#790 )	2025-04-25 08:19:23 -04:00
Quentin Pradet	00dc55b3bd	Update instructions to run ML tests with Elasticsearch (#781 ) * Update instructions to run ML tests with Elasticsearch * Update CONTRIBUTING.md Co-authored-by: David Kyle <david.kyle@elastic.co> --------- Co-authored-by: David Kyle <david.kyle@elastic.co>	2025-04-24 15:42:00 +04:00
Quentin Pradet	8147eb517a	Allow lightgbm 4.6.0 (#782 )	2025-04-24 15:40:39 +04:00
Quentin Pradet	4728d9b648	Run PyTorch tests on 3.12 too (#779 ) PyTorch 2.3.1 does support Python 3.12.	2025-04-24 14:26:50 +04:00
Mark J. Hoy	51a2b9cc19	Add 9.1.0 Snapshot to Build and Fix test_ml_model Tests to Normalized Expected Scores if Min Score is Less Than Zero (#777 ) * normalized expected scores if min is < 0 * only normalize scores for ES after 8.19+ / 9.1+ * add 9.1.0 snapshot to build matrix * get min score from booster trees * removing typing on function definition * properly flatten our tree leaf scores * simplify getting min score * debugging messages * get all the matches in better way * Fix model score normalization. * lint * lint again * lint; correct return for bounds map/list * revert to Aurelian's fix * re-lint :/ --------- Co-authored-by: Aurelien FOUCRET <aurelien.foucret@elastic.co>	2025-04-23 15:53:32 +00:00
David Kyle	a9c36927f6	Fix tokeniser for DeBERTa models (#769 )	2025-04-23 09:10:02 +01:00
Quentin Pradet	87380ef716	Release 9.0.0 Co-authored-by: Miguel Grinberg <miguel.grinberg@gmail.com>	2025-04-16 15:21:04 +04:00
Quentin Pradet	9ca76d7888	Revert "Release 8.18.0" (#774 ) This reverts commit ced3cdfe32bd04e3d127b18f66f9b143b2956564.	2025-04-16 14:53:51 +04:00
Quentin Pradet	ced3cdfe32	Release 8.18.0	2025-04-15 20:52:30 +04:00
kosabogi	87379c53de	[DOCS] Clean up CLI examples in ML docs (#766 ) * [DOCS] Clean up CLI examples in ML docs * Fixes spaces * Rebuild for testing copy-paste	2025-04-07 10:06:37 +02:00
Paulo	1ddae81769	Update the documentation to reflect the partial support of eland/sckitlearn (#768 )	2025-04-03 15:56:23 +02:00
Colleen McGinnis	9302bef7db	remove unused substitutions (#763 )	2025-03-21 09:24:09 -05:00
Colleen McGinnis	ca64672fd7	[docs] Migrate docs from AsciiDoc to Markdown (#762 ) Co-authored-by: István Zoltán Szabó <szabosteve@gmail.com>	2025-02-26 17:48:16 +01:00
Colleen McGinnis	6692251d9e	add the new ci checks (#761 )	2025-02-26 16:40:43 +01:00
David Kyle	ee4d701aa4	Upgrade transformers to 4.47 (#752 ) The upgrade fixes a crash tracing the baai/bge-m3 model	2025-02-12 17:30:45 +00:00
Quentin Pradet	acdeeeded2	Allow nox 2025.02.09 (#754 )	2025-02-12 16:33:59 +04:00
Quentin Pradet	8350f06ea8	Fix pipeline labels (#751 )	2025-02-12 15:07:51 +04:00
Quentin Pradet	e846fb7697	Add backport action (#750 )	2025-02-12 15:07:43 +04:00
Quentin Pradet	c4ac64e3a0	Allow scikit-learn 1.5 to address CVE-2024-5206 (#729 )	2025-02-12 14:34:13 +04:00
Jan Calanog	214c4645e9	github-action: Add AsciiDoc freeze warning (#748 ) * github-action: Add AsciiDoc freeze warning * Update .github/workflows/comment-on-asciidoc-changes.yml	2025-02-12 07:45:07 +04:00
Quentin Pradet	871e52b37a	Pin nox to avoid session.env issue (#753 )	2025-02-11 18:36:57 +04:00
Quentin Pradet	aa5196edee	Switch to black's 2025 code style (#749 )	2025-02-11 14:57:16 +04:00
Bart Broere	75c57b0775	Support Pandas 2 (#742 ) * Fix test setup to match pandas 2.0 demands * Use the now deprecated _append method (Better solution might exist) * Deal with numeric_only being removed in metrics test * Skip mad metric for other pandas versions * Account for differences between pandas versions in describe methods * Run black * Check Pandas version first * Mirror behaviour of installed Pandas version when running value_counts * Allow passing arguments to the individual asserters * Fix for method _construct_axes_from_arguments no longer existing * Skip mad metric if it does not exist * Account for pandas 2.0 timestamp default behaviour * Deal with empty vs other inferred data types * Account for default datetime precision change * Run Black * Solution for differences in inferred_type only * Fix csv and json issues * Skip two doctests * Passing a set as indexer is no longer allowed * Don't validate output where it differs between Pandas versions in the environment * Update test matrix and packaging metadata * Update version of Python in the docs * Update Python version in demo notebook * Match noxfile * Symmetry * Fix trailing comma in JSON * Revert some changes in setup.py to fix building the documentation * Revert "Revert some changes in setup.py to fix building the documentation" This reverts commit ea9879753129d8d8390b3cbbce57155a8b4fb346. 
* Use PANDAS_VERSION from eland.common * Still skip the doctest, but make the output pandas 2 instead of 1 * Still skip doctest, but switch to pandas 2 output * Prepare for pandas 3 * Reference the right column * Ignore output in tests but switch to pandas 2 output * Add line comment about NBVAL_IGNORE_OUTPUT * Restore missing line and add stderr cell * Use non-private method instead * Fix indentation and parameter issues * If index is not specified, and pandas 1 is present, set it to True From pandas 2 and upwards, index is set to None by default * Run black * Newer version of black might have different opinions? * Add line comment * Remove unused import * Add reason for ignore statement * Add reason for skip --------- Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>	2025-02-04 17:43:43 +04:00
Valeriy Khakhutskyy	77589b26b8	Remove ML model export as sklearn Pipeline and clean up code (#744 ) * Revert "[ML] Export ML model as sklearn Pipeline (#509)" This reverts commit 0576114a1d886eafabca3191743a9bea9dc20b1a. * Keep useful changes * formatting * Remove obsolete test matrix configuration and update version references in documentation and Noxfile * formatting --------- Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>	2025-02-04 11:36:50 +04:00
Bart Broere	9b5badb941	Drop Python 3.8 support and introduce Python 3.12 CI/CD (#743 )	2025-01-22 21:55:57 +04:00
Quentin Pradet	f99adce23f	Build documentation using Docker again (#746 )	2025-01-14 18:16:39 +04:00
Quentin Pradet	7774a506ae	Release 8.17.0	2025-01-07 10:58:59 +04:00
Dai Sugimori	82492fe771	Expansion support (#740 )	2024-11-23 00:20:58 +09:00
Quentin Pradet	04102f2a4e	Release 8.16.0	2024-11-14 09:07:39 +04:00
Valeriy Khakhutskyy	9aec8fc751	Add deprecation warning for ESGradientBoostingModel subclasses (#738 ) Introduce a warning indicating that exporting data frame analytics models as ESGradientBoostingModel subclasses is deprecated and will be removed in version 9.0.0. The implementation of ESGradientBoostingModel relies on importing undocumented private classes that were changed in 1.4 to https://github.com/scikit-learn/scikit-learn/pull/26278. This dependency makes the code difficult to maintain, while the functionality is not widely used by users. Therefore, we will deprecate this functionality in 8.16 and remove it completely in 9.0.0. --------- Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>	2024-11-11 14:26:11 +01:00
Quentin Pradet	79d9a6ae29	Release 8.15.4	2024-10-18 10:52:52 +04:00
Quentin Pradet	939f4d672c	Revert "Add feedback request to README" (#735 )	2024-10-18 08:06:42 +04:00
Quentin Pradet	1312e96220	Revert "Allow reading Elasticsearch certs in Wolfi image" (#734 ) This reverts commit 5dabe9c0996e62d8bf4b493dcea7d4bc161dead4.	2024-10-11 16:52:41 +04:00
Quentin Pradet	2916b51fa7	Release 8.15.3	2024-10-09 16:16:52 +04:00
Quentin Pradet	5dabe9c099	Allow reading Elasticsearch certs in Wolfi image (#732 ) The config/certs directory of Elasticsearch is not readable by other users and groups. This work in the public image, which uses the root user, but the Wolfi image does not. Using the same user id fixes the problem.	2024-10-09 15:37:05 +04:00
Max Hniebergall	06b65e211e	Add support for DeBERTa-V2 tokenizer (#717 )	2024-10-03 14:04:19 -04:00
Quentin Pradet	a45c7bc357	Release 8.15.2	2024-10-02 13:54:03 +04:00
Quentin Pradet	d1e533ffb9	Fix Docker image build on Linux (#728 ) * Fix Docker image build on Linux * Build Docker images in CI * Fix bash syntax * Only load, not push * Parallelize docker build It's currently the slowest step. * Only build Linux images	2024-10-02 10:33:35 +04:00
Quentin Pradet	a83ce20fcc	Release 8.15.1	2024-10-01 15:31:24 +04:00
David Kyle	03af8a6319	Fix path in docker model upload example (#726 )	2024-10-01 08:53:28 +01:00
David Kyle	5253501704	Upgrade PyTorch to version 2.3.1 (#718 ) Upgrades the PyTorch, transformers and sentence transformer requirements. Elasticsearch has upgraded to PyTorch to 2.3.1 in 8.16 and 8.15.2. For compatibility reasons Eland will refuse to upload to an Elasticsearch cluster that has is using an earlier version of PyTorch.	2024-09-30 10:22:02 +01:00
David Kyle	ec66b5f320	Add ES 8.16 and 8.15.2 to test matrix (#725 )	2024-09-27 13:37:31 +01:00
Quentin Pradet	64d05e4c68	Restore public Dockerfile (#722 )	2024-09-25 12:49:46 +04:00
Quentin Pradet	f79180be42	Migrate to Wolfi base Docker image (#720 )	2024-09-03 18:02:08 +04:00
Miguel Grinberg	0ce3db26e8	Release 8.15.0 (#715 ) * Release 8.15.0 * update release notes	2024-08-13 09:47:48 +01:00
David Kyle	5a76f826df	Add note about using text_similarity for rerank to the CLI (#716 )	2024-08-12 14:40:12 +01:00
David Kyle	fd8886da6a	Default truncation to `second` for text similarity the task type(#713 ) In reranking the first input (the query) is generally shorter. In this case it makes more sense to truncate the second input (the document text)	2024-08-05 11:47:15 +01:00
Aurélien FOUCRET	bee6d0e1f7	Remove input fields from exported LTR models (#708 )	2024-07-05 14:31:22 +02:00
Bart Broere	f18aa35e8e	Deal with the possibility of lists (#707 )	2024-06-28 22:25:47 +04:00
Quentin Pradet	56a46d0f85	Rename Buildkite team from clients-team to devtools-team (#702 )	2024-06-12 11:39:25 +04:00
Quentin Pradet	c497683064	Quote remaining eland[pytorch] for ZSH users (#701 )	2024-06-10 16:50:03 +00:00
Quentin Pradet	0ddc21b895	Release 8.14.0	2024-06-10 15:56:43 +04:00
István Zoltán Szabó	5a3e7d78b3	[DOCS] Completes the list of available NLP task types. (#699 )	2024-06-10 12:30:07 +02:00
Bart Broere	1014ecdb39	Fix non _source fields missing from the result hits (#693 )	2024-06-10 11:09:52 +04:00
David Kyle	632074c0f0	Make eland_import_hub_model script compatible with serverless (#698 ) Checks for build_flavor == serverless rather than a version	2024-06-07 14:46:12 +01:00
Bart Broere	35a96ab3f0	Fix missing method str.removeprefix in Python 3.8 (#695 )	2024-05-24 10:25:04 +04:00
Quentin Pradet	116416b3e8	Stop duplicating requirements (#691 )	2024-05-14 15:59:39 +04:00
Ashok Kumar	5b728c29c1	Replace check for Elasticsearch to str/list in ensure_es_client (#690 )	2024-05-04 09:01:31 +04:00
Quentin Pradet	e76b32eee2	Release 8.13.1	2024-05-03 09:20:45 +04:00
Quentin Pradet	fd38e26df1	Support HTTP proxies in eland_import_hub_model (#688 ) * Document TLS/SSL options for import script * Mention --help option * Add HTTP proxy support * Mention HTTP_PROXY too --------- Co-authored-by: David Kyle <david.kyle@elastic.co>	2024-05-02 21:03:44 +04:00
Quentin Pradet	f7f6e0aba9	Document TLS/SSL options for import script (#667 )	2024-05-02 18:06:40 +04:00
Aurélien FOUCRET	9cea2385e6	Work around LTR model cache in tests (#685 )	2024-04-08 14:00:36 +04:00
Quentin Pradet	1921792df8	Release 8.13.0	2024-03-27 18:18:21 +04:00
David Kyle	c16e36c051	Add Python 3.11 to support matrix (#681 )	2024-03-27 10:34:35 +00:00
David Kyle	ae0bba34c6	Upgrade torch to 2.1.2 (#671 ) Compatible with Elasticsearch 8.13 where the same upgrade has been made	2024-03-26 10:06:50 +00:00
Iulia Feroli	aaec995b1b	Update overview.asciidoc to replace tuple reference to API Key (#678 )	2024-03-21 15:31:19 +04:00
Iulia Feroli	de83f3f905	Improve PyTorch installation instructions (#677 )	2024-03-21 14:21:32 +04:00
David Kyle	8e8c49ddbf	Mute the Learning to Rank tests (#676 )	2024-03-21 10:13:31 +00:00
David Kyle	5d34dc3cc4	Add override option to specify the model's max input size(#674 ) If the max input size cannot be found in the configuration the user can specify it as a parameter to the eland_import_hub_model script	2024-03-20 10:02:43 +00:00
Bart Broere	9b335315bb	Mirror pandas' to_csv lineterminator instead of line_terminator (#595 ) * Mirror pandas' to_csv lineterminator instead of line_terminator (even though it looks a little weird perhaps) * Remove squeeze argument * Revert "Merge branch 'remove-squeeze-argument' into patch-2" This reverts commit 8b9ab5647e244d78ec3471b80ee7c42e019cf347. * Don't remove the parameter yet since people might use it * Add pending deprecation warning --------- Co-authored-by: David Kyle <david.kyle@elastic.co>	2024-02-23 14:23:58 +04:00
Quentin Pradet	28eda95ba9	Add feedback request to README (#665 )	2024-02-15 15:23:45 +04:00
Quentin Pradet	f4b30753ad	Fix CI badge in README (#664 )	2024-02-15 15:14:16 +04:00
Bart Broere	33cf029efe	Implement eland.DataFrame.to_json (#661 ) Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>	2024-02-15 11:32:54 +04:00
Aurélien FOUCRET	9d492b03aa	Release 8.12.1 Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>	2024-02-01 10:50:18 +04:00
Quentin Pradet	fd2ceab846	Run Buildkite docs jobs in pull requests from forks (#652 )	2024-01-31 20:55:19 +04:00
Quentin Pradet	02190e74e7	Switch to 2024 black style (#657 )	2024-01-31 14:47:19 +04:00
Aurélien FOUCRET	2a6a4b1f06	Fix missing value support for XGBRanker. (#654 ) * Fix missing value support for XGBRanker. * lint * Sort expected scores * lint	2024-01-23 18:42:24 +01:00