mirror of https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Compare commits
45 Commits
cef4710695
44ead02b05
cb7c4fb122
9e8f164677
3c3ffd7403
f5c2dcfc9d
878cde6126
ec45c395fd
00dc55b3bd
8147eb517a
4728d9b648
51a2b9cc19
a9c36927f6
87380ef716
9ca76d7888
ced3cdfe32
87379c53de
1ddae81769
9302bef7db
ca64672fd7
6692251d9e
ee4d701aa4
acdeeeded2
8350f06ea8
e846fb7697
c4ac64e3a0
214c4645e9
871e52b37a
aa5196edee
75c57b0775
77589b26b8
9b5badb941
f99adce23f
7774a506ae
82492fe771
04102f2a4e
9aec8fc751
79d9a6ae29
939f4d672c
1312e96220
2916b51fa7
5dabe9c099
06b65e211e
a45c7bc357
d1e533ffb9
@@ -1,6 +1,8 @@
ARG PYTHON_VERSION=3.9
FROM python:${PYTHON_VERSION}

ENV FORCE_COLOR=1

WORKDIR /code/eland
RUN python -m pip install nox
.buildkite/build-docker-images.sh (new file, 11 lines)
@@ -0,0 +1,11 @@
#!/usr/bin/env bash

set -eo pipefail
export LC_ALL=en_US.UTF-8

echo "--- Building the Wolfi image"
# Building the linux/arm64 image takes about one hour on Buildkite, which is too slow
docker build --file Dockerfile.wolfi .

echo "--- Building the public image"
docker build .
@@ -1,15 +1,8 @@
#!/usr/bin/env bash
sudo apt-get update
sudo apt-get install -y pandoc python3 python3-pip
python3 -m pip install nox
/opt/buildkite-agent/.local/bin/nox -s docs

# I couldn't make this work, for some reason pandoc is not found in the docker container repository:
# docker build --file .buildkite/Dockerfile --tag elastic/eland --build-arg PYTHON_VERSION=${PYTHON_VERSION} .
# docker run \
#   --name doc_build \
#   --rm \
#   elastic/eland \
#   apt-get update && \
#   sudo apt-get install --yes pandoc && \
#   nox -s docs
docker build --file .buildkite/Dockerfile --tag elastic/eland --build-arg PYTHON_VERSION=${PYTHON_VERSION} .
docker run \
  --name doc_build \
  --rm \
  elastic/eland \
  bash -c "apt-get update && apt-get install --yes pandoc && nox -s docs"
@@ -15,24 +15,36 @@ steps:
      machineType: "n2-standard-2"
    commands:
      - ./.buildkite/build-documentation.sh
  - label: "Eland :python: {{ matrix.python }} :elasticsearch: {{ matrix.stack }}"
  - label: ":docker: Build Wolfi image"
    env:
      PYTHON_VERSION: 3.11-bookworm
    agents:
      provider: "gcp"
      machineType: "n2-standard-2"
    commands:
      - ./.buildkite/build-docker-images.sh
  - label: ":python: {{ matrix.python }} :elasticsearch: {{ matrix.stack }} :pandas: {{ matrix.pandas }}"
    agents:
      provider: "gcp"
      machineType: "n2-standard-4"
    env:
      PYTHON_VERSION: "{{ matrix.python }}"
      PANDAS_VERSION: '1.5.0'
      PANDAS_VERSION: "{{ matrix.pandas }}"
      TEST_SUITE: "xpack"
      ELASTICSEARCH_VERSION: "{{ matrix.stack }}"
    matrix:
      setup:
        # Python and pandas versions need to be added to the nox configuration too
        # (in the decorators of the test method in noxfile.py)
        pandas:
          - '1.5.0'
          - '2.2.3'
        python:
          - '3.12'
          - '3.11'
          - '3.10'
          - '3.9'
          - '3.8'
        stack:
          - '8.16.0-SNAPSHOT'
          - '8.15.2'
          - '8.14.1'
          - '9.0.0'
          - '9.1.0-SNAPSHOT'
    command: ./.buildkite/run-tests
@@ -1,5 +1,4 @@
# docs and example
docs/*
example/*

# Git
@@ -18,9 +17,6 @@ dist/
# Build folder
build/

# docs
docs/*

# pytest results
tests/dataframe/results/*csv
result_images/
.github/workflows/backport.yml (vendored, new file, 26 lines)
@@ -0,0 +1,26 @@
name: Backport
on:
  pull_request_target:
    types:
      - closed
      - labeled

jobs:
  backport:
    name: Backport
    runs-on: ubuntu-latest
    # Only react to merged PRs for security reasons.
    # See https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request_target.
    if: >
      github.event.pull_request.merged
      && (
        github.event.action == 'closed'
        || (
          github.event.action == 'labeled'
          && contains(github.event.label.name, 'backport')
        )
      )
    steps:
      - uses: tibdex/backport@9565281eda0731b1d20c4025c43339fb0a23812e # v2.0.4
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/docs-build.yml (vendored, new file, 19 lines)
@@ -0,0 +1,19 @@
name: docs-build

on:
  push:
    branches:
      - main
  pull_request_target: ~
  merge_group: ~

jobs:
  docs-preview:
    uses: elastic/docs-builder/.github/workflows/preview-build.yml@main
    with:
      path-pattern: docs/**
    permissions:
      deployments: write
      id-token: write
      contents: read
      pull-requests: write
.github/workflows/docs-cleanup.yml (vendored, new file, 14 lines)
@@ -0,0 +1,14 @@
name: docs-cleanup

on:
  pull_request_target:
    types:
      - closed

jobs:
  docs-preview:
    uses: elastic/docs-builder/.github/workflows/preview-cleanup.yml@main
    permissions:
      contents: none
      id-token: write
      deployments: write
@@ -9,3 +9,6 @@ python:
  install:
    - path: .
    - requirements: docs/requirements-docs.txt

sphinx:
  configuration: docs/sphinx/conf.py
@@ -2,11 +2,56 @@
Changelog
=========

9.0.1 (2025-04-30)
------------------

* Forbid Elasticsearch 8 client or server (`#780 <https://github.com/elastic/eland/pull/780>`_)
* Fix DeBERTa tokenization (`#769 <https://github.com/elastic/eland/pull/769>`_)
* Upgrade PyTorch to 2.5.1 (`#785 <https://github.com/elastic/eland/pull/785>`_)
* Upgrade LightGBM to 4.6.0 (`#782 <https://github.com/elastic/eland/pull/782>`_)

9.0.0 (2025-04-15)
------------------

* Drop Python 3.8, Support Python 3.12 (`#743 <https://github.com/elastic/eland/pull/743>`_)
* Support Pandas 2 (`#742 <https://github.com/elastic/eland/pull/742>`_)
* Upgrade transformers to 4.47 (`#752 <https://github.com/elastic/eland/pull/752>`_)
* Remove ML model export as sklearn Pipeline (`#744 <https://github.com/elastic/eland/pull/744>`_)
* Allow scikit-learn 1.5 (`#729 <https://github.com/elastic/eland/pull/729>`_)
* Migrate docs from AsciiDoc to Markdown (`#762 <https://github.com/elastic/eland/pull/762>`_)

8.17.0 (2025-01-07)
-------------------

* Support sparse embedding models such as SPLADE-v3-DistilBERT (`#740 <https://github.com/elastic/eland/pull/740>`_)

8.16.0 (2024-11-13)
-------------------

* Add deprecation warning for ESGradientBoostingModel subclasses (`#738 <https://github.com/elastic/eland/pull/738>`_)

8.15.4 (2024-10-17)
-------------------

* Revert "Allow reading Elasticsearch certs in Wolfi image" (`#734 <https://github.com/elastic/eland/pull/734>`_)

8.15.3 (2024-10-09)
-------------------

* Added support for DeBERTa-V2 tokenizer (`#717 <https://github.com/elastic/eland/pull/717>`_)
* Fixed ``--ca-cert`` with a shared Elasticsearch Docker volume (`#732 <https://github.com/elastic/eland/pull/732>`_)

8.15.2 (2024-10-02)
-------------------

* Fixed Docker image build (`#728 <https://github.com/elastic/eland/pull/728>`_)

8.15.1 (2024-10-01)
-------------------

* Upgrade PyTorch to version 2.3.1, which is compatible with Elasticsearch 8.15.2 or above
* Migrate to distroless Wolfi base Docker image
* Upgraded PyTorch to version 2.3.1, which is compatible with Elasticsearch 8.15.2 or above (`#718 <https://github.com/elastic/eland/pull/718>`_)
* Migrated to distroless Wolfi base Docker image (`#720 <https://github.com/elastic/eland/pull/720>`_)


8.15.0 (2024-08-12)
-------------------
@@ -78,9 +78,15 @@ Once your changes and tests are ready to submit for review:
# Run Auto-format, lint, mypy type checker for your changes
$ nox -s format

# Run the test suite
$ pytest --doctest-modules eland/ tests/
$ pytest --nbval tests/notebook/
# Launch Elasticsearch with a trial licence and ML enabled
$ docker run --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" -e "xpack.security.enabled=false" -e "xpack.license.self_generated.type=trial" docker.elastic.co/elasticsearch/elasticsearch:9.0.0

# See all test suites
$ nox -l
# Run a specific test suite
$ nox -rs "test-3.12(pandas_version='2.2.3')"
# Run a specific test
$ nox -rs "test-3.12(pandas_version='2.2.3')" -- -k test_learning_to_rank

```
@@ -169,7 +175,7 @@ currently using a minimum version of PyCharm 2019.2.4.
* Setup Elasticsearch instance with docker

``` bash
> ELASTICSEARCH_VERSION=elasticsearch:8.x-SNAPSHOT BUILDKITE=false .buildkite/run-elasticsearch.sh
> ELASTICSEARCH_VERSION=elasticsearch:8.17.0 BUILDKITE=false .buildkite/run-elasticsearch.sh
```

* Now check `http://localhost:9200`
@@ -203,7 +209,7 @@ currently using a minimum version of PyCharm 2019.2.4.
* To test specific versions of Python run

``` bash
> nox -s test-3.8
> nox -s test-3.12
```

### Documentation
@@ -18,7 +18,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
        python3 -m pip install \
        --no-cache-dir --disable-pip-version-check --extra-index-url https://download.pytorch.org/whl/cpu \
        torch==2.1.2+cpu .[all]; \
        torch==2.5.1+cpu .[all]; \
    else \
        python3 -m pip install \
        --no-cache-dir --disable-pip-version-check \
@@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
        python3 -m pip install \
        --no-cache-dir --disable-pip-version-check --extra-index-url https://download.pytorch.org/whl/cpu \
        torch==2.1.2+cpu .[all]; \
        torch==2.5.1+cpu .[all]; \
    else \
        python3 -m pip install \
        --no-cache-dir --disable-pip-version-check \
README.md (17 changed lines)
@@ -53,8 +53,9 @@ $ conda install -c conda-forge eland

### Compatibility

- Supports Python 3.8, 3.9, 3.10, 3.11 and Pandas 1.5
- Supports Elasticsearch clusters that are 7.11+, recommended 8.13 or later for all features to work.
- Supports Python 3.9, 3.10, 3.11 and 3.12.
- Supports Pandas 1.5 and 2.
- Supports Elasticsearch 8+ clusters, recommended 8.16 or later for all features to work.

If you are using the NLP with PyTorch feature make sure your Eland minor version matches the minor
version of your Elasticsearch cluster. For all other features it is sufficient for the major versions
to match.
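To make the minor-version rule above concrete, here is a small, hypothetical runtime check; it assumes a reachable cluster at `http://localhost:9200` and uses `es.info()`, which reports the server version:

```python
import eland
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
server_version = es.info()["version"]["number"]

# For the NLP with PyTorch feature, minor versions should match (e.g. 9.0.x with 9.0.x)
assert eland.__version__.split(".")[:2] == server_version.split(".")[:2]
```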
@@ -277,15 +278,3 @@ Downloading: 100%|██████████| 249M/249M [00:23<00:00, 11.2MB
>>> ptm.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config)
100%|██████████| 63/63 [00:12<00:00, 5.02it/s]
```

## Feedback 🗣️

The engineering team here at Elastic is looking for developers to participate in
research and feedback sessions to learn more about how you use Eland and what
improvements we can make to their design and your workflow. If you're interested
in sharing your insights into developer experience and language client design,
please fill out this [short form](https://forms.gle/bYZwDQXijfhfwshn9).
Depending on the number of responses we get, we may either contact you for a 1:1
conversation or a focus group with other developers who use the same client.
Thank you in advance - your feedback is crucial to improving the user experience
for all Elasticsearch developers!
docs/docset.yml (new file, 8 lines)
@@ -0,0 +1,8 @@
project: 'Eland Python client'
cross_links:
  - docs-content
toc:
  - toc: reference
subs:
  es: "Elasticsearch"
  ml: "machine learning"
@@ -1,14 +0,0 @@
= Eland Python Client

:doctype: book

include::{asciidoc-dir}/../../shared/versions/stack/{source_branch}.asciidoc[]
include::{asciidoc-dir}/../../shared/attributes.asciidoc[]

include::overview.asciidoc[]

include::installation.asciidoc[]

include::dataframes.asciidoc[]

include::machine-learning.asciidoc[]
@@ -1,16 +0,0 @@
[[installation]]
== Installation

Eland can be installed with https://pip.pypa.io[pip] from https://pypi.org/project/eland[PyPI]. We recommend https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/[using a virtual environment] when installing with pip:

[source,sh]
-----------------------------
$ python -m pip install eland
-----------------------------

Alternatively, Eland can be installed with https://docs.conda.io[Conda] from https://anaconda.org/conda-forge/eland[Conda Forge]:

[source,sh]
------------------------------------
$ conda install -c conda-forge eland
------------------------------------
@@ -1,242 +0,0 @@
[[machine-learning]]
== Machine Learning

[discrete]
[[ml-trained-models]]
=== Trained models

Eland allows transforming trained models from scikit-learn, XGBoost,
and LightGBM libraries to be serialized and used as an inference
model in {es}.

[source,python]
------------------------
>>> from xgboost import XGBClassifier
>>> from eland.ml import MLModel

# Train and exercise an XGBoost ML model locally
>>> xgb_model = XGBClassifier(booster="gbtree")
>>> xgb_model.fit(training_data[0], training_data[1])

>>> xgb_model.predict(training_data[0])
[0 1 1 0 1 0 0 0 1 0]

# Import the model into Elasticsearch
>>> es_model = MLModel.import_model(
    es_client="http://localhost:9200",
    model_id="xgb-classifier",
    model=xgb_model,
    feature_names=["f0", "f1", "f2", "f3", "f4"],
)

# Exercise the ML model in Elasticsearch with the training data
>>> es_model.predict(training_data[0])
[0 1 1 0 1 0 0 0 1 0]
------------------------

[discrete]
[[ml-nlp-pytorch]]
=== Natural language processing (NLP) with PyTorch

IMPORTANT: You need to install the appropriate version of PyTorch to import an
NLP model. Run `python -m pip install 'eland[pytorch]'` to install that version.

For NLP tasks, Eland enables you to import PyTorch models into {es}. Use the
`eland_import_hub_model` script to download and install supported
https://huggingface.co/transformers[transformer models] from the
https://huggingface.co/models[Hugging Face model hub]. For example:

[source,bash]
------------------------
$ eland_import_hub_model <authentication> \ <1>
      --url http://localhost:9200/ \ <2>
      --hub-model-id elastic/distilbert-base-cased-finetuned-conll03-english \ <3>
      --task-type ner \ <4>
      --start
------------------------
<1> Use an authentication method to access your cluster. Refer to <<ml-nlp-pytorch-auth>>.
<2> The cluster URL. Alternatively, use `--cloud-id`.
<3> Specify the identifier for the model in the Hugging Face model hub.
<4> Specify the type of NLP task. Supported values are `fill_mask`, `ner`,
`question_answering`, `text_classification`, `text_embedding`, `text_expansion`,
`text_similarity` and `zero_shot_classification`.

For more information about the available options, run `eland_import_hub_model` with the `--help` option.

[source,bash]
------------------------
$ eland_import_hub_model --help
------------------------

[discrete]
[[ml-nlp-pytorch-docker]]
==== Import model with Docker

IMPORTANT: To use the Docker container, you need to clone the Eland repository: https://github.com/elastic/eland

If you want to use Eland without installing it, you can use the Docker image. You can use the container interactively:

```bash
$ docker run -it --rm --network host docker.elastic.co/eland/eland
```

Running installed scripts is also possible without an interactive shell, for example:

```bash
docker run -it --rm docker.elastic.co/eland/eland \
    eland_import_hub_model \
      --url $ELASTICSEARCH_URL \
      --hub-model-id elastic/distilbert-base-uncased-finetuned-conll03-english \
      --start
```

Replace the `$ELASTICSEARCH_URL` with the URL for your Elasticsearch cluster. For authentication purposes, include an administrator username and password in the URL in the following format: `https://username:password@host:port`.

[discrete]
[[ml-nlp-pytorch-air-gapped]]
==== Install models in an air-gapped environment

You can install models in a restricted or closed network by pointing the
`eland_import_hub_model` script to local files.

For an offline install of a Hugging Face model, the model first needs to be
cloned locally. Git and https://git-lfs.com/[Git Large File Storage] are
required to be installed in your system.

1. Select a model you want to use from Hugging Face. Refer to the
{ml-docs}/ml-nlp-model-ref.html[compatible third party model] list for more
information on the supported architectures.

2. Clone the selected model from Hugging Face by using the model URL. For
example:
+
--
[source,bash]
----
git clone https://huggingface.co/dslim/bert-base-NER
----
This command results in a local copy
of the model in the directory `bert-base-NER`.
--

3. Use the `eland_import_hub_model` script with the `--hub-model-id` set to the
directory of the cloned model to install it:
+
--
[source,bash]
----
eland_import_hub_model \
      --url 'XXXX' \
      --hub-model-id /PATH/TO/MODEL \
      --task-type ner \
      --es-username elastic --es-password XXX \
      --es-model-id bert-base-ner
----

If you use the Docker image to run `eland_import_hub_model` you must bind mount
the model directory, so the container can read the files:

[source,bash]
----
docker run --mount type=bind,source=/PATH/TO/MODEL,destination=/model,readonly -it --rm docker.elastic.co/eland/eland \
    eland_import_hub_model \
      --url 'XXXX' \
      --hub-model-id /model \
      --task-type ner \
      --es-username elastic --es-password XXX \
      --es-model-id bert-base-ner
----
Once it's uploaded to {es}, the model will have the ID specified by
`--es-model-id`. If it is not set, the model ID is derived from
`--hub-model-id`; spaces and path delimiters are converted to double
underscores `__`.

--

[discrete]
[[ml-nlp-pytorch-proxy]]
==== Connect to Elasticsearch through a proxy

Behind the scenes, Eland uses the `requests` Python library, which
https://requests.readthedocs.io/en/latest/user/advanced/#proxies[allows configuring
proxies through an environment variable]. For example, to use an HTTP proxy to connect to
an HTTPS Elasticsearch cluster, you need to set the `HTTPS_PROXY` environment variable
when invoking Eland:

[source,bash]
--------------------------------------------------
HTTPS_PROXY=http://proxy-host:proxy-port eland_import_hub_model ...
--------------------------------------------------

If you disabled security on your Elasticsearch cluster, you should use `HTTP_PROXY`
instead.

[discrete]
[[ml-nlp-pytorch-auth]]
==== Authentication methods

The following authentication options are available when using the import script:

* Elasticsearch username and password authentication (specified with the `-u` and `-p` options):
+
--
[source,bash]
--------------------------------------------------
eland_import_hub_model -u <username> -p <password> --cloud-id <cloud-id> ...
--------------------------------------------------
These `-u` and `-p` options also work when you use `--url`.
--

* Elasticsearch username and password authentication (embedded in the URL):
+
--
[source,bash]
--------------------------------------------------
eland_import_hub_model --url https://<user>:<password>@<hostname>:<port> ...
--------------------------------------------------
--

* Elasticsearch API key authentication:
+
--
[source,bash]
--------------------------------------------------
eland_import_hub_model --es-api-key <api-key> --url https://<hostname>:<port> ...
--------------------------------------------------
--

* HuggingFace Hub access token (for private models):
+
--
[source,bash]
--------------------------------------------------
eland_import_hub_model --hub-access-token <access-token> ...
--------------------------------------------------
--

[discrete]
[[ml-nlp-pytorch-tls]]
==== TLS/SSL

The following TLS/SSL options for Elasticsearch are available when using the import script:

* Specify alternate CA bundle to verify the cluster certificate:
+
--
[source,bash]
--------------------------------------------------
eland_import_hub_model --ca-certs CA_CERTS ...
--------------------------------------------------
--

* Disable TLS/SSL verification altogether (strongly discouraged):
+
--
[source,bash]
--------------------------------------------------
eland_import_hub_model --insecure ...
--------------------------------------------------
--
@@ -1,16 +1,16 @@
[[dataframes]]
== Data Frames
---
mapped_pages:
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/dataframes.html
---

`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API
and defers all processing and filtering of data to Elasticsearch
instead of your local machine. This means you can process large
amounts of data within Elasticsearch from a Jupyter Notebook
without overloading your machine.
# Data Frames [dataframes]

[source,python]
-------------------------------------
`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API and defers all processing and filtering of data to Elasticsearch instead of your local machine. This means you can process large amounts of data within Elasticsearch from a Jupyter Notebook without overloading your machine.

```python
>>> import eland as ed
>>> # Connect to 'flights' index via localhost Elasticsearch node
>>>
# Connect to 'flights' index via localhost Elasticsearch node
>>> df = ed.DataFrame('http://localhost:9200', 'flights')

# eland.DataFrame instance has the same API as pandas.DataFrame
@@ -59,4 +59,5 @@ Elasticsearch storage usage: 5.043 MB
sum  9.261629e+07  8.204365e+06
min  0.000000e+00  1.000205e+02
std  4.578263e+03  2.663867e+02
-------------------------------------
```
@@ -1,33 +1,36 @@
[[overview]]
== Overview
---
mapped_pages:
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/index.html
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/overview.html
navigation_title: Eland
---

Eland is a Python client and toolkit for DataFrames and {ml} in {es}.
Full documentation is available on https://eland.readthedocs.io[Read the Docs].
Source code is available on https://github.com/elastic/eland[GitHub].
# Eland Python client [overview]

[discrete]
=== Compatibility
Eland is a Python client and toolkit for DataFrames and {{ml}} in {{es}}. Full documentation is available on [Read the Docs](https://eland.readthedocs.io). Source code is available on [GitHub](https://github.com/elastic/eland).

- Supports Python 3.8+ and Pandas 1.5
- Supports {es} clusters that are 7.11+, recommended 7.14 or later for all features to work.
Make sure your Eland major version matches the major version of your Elasticsearch cluster.

The recommended way to set your requirements in your `setup.py` or
`requirements.txt` is::
## Compatibility [_compatibility]

    # Elasticsearch 8.x
    eland>=8,<9
* Supports Python 3.9+ and Pandas 1.5
* Supports {{es}} 8+ clusters, recommended 8.16 or later for all features to work. Make sure your Eland major version matches the major version of your Elasticsearch cluster.

    # Elasticsearch 7.x
    eland>=7,<8
The recommended way to set your requirements in your `setup.py` or `requirements.txt` is::

[discrete]
=== Getting Started
```
# Elasticsearch 8.x
eland>=8,<9
```
```
# Elasticsearch 7.x
eland>=7,<8
```

Create a `DataFrame` object connected to an {es} cluster running on `http://localhost:9200`:
## Getting Started [_getting_started]

[source,python]
------------------------------------
Create a `DataFrame` object connected to an {{es}} cluster running on `http://localhost:9200`:

```python
>>> import eland as ed
>>> df = ed.DataFrame(
...    es_client="http://localhost:9200",
@@ -48,15 +51,14 @@ Create a `DataFrame` object connected to an {es} cluster running on `http://loca
13058   858.144337  False ... 6  2018-02-11 14:54:34

[13059 rows x 27 columns]
------------------------------------
```

[discrete]
==== Elastic Cloud

### Elastic Cloud [_elastic_cloud]

You can also connect Eland to an Elasticsearch instance in Elastic Cloud:

[source,python]
------------------------------------
```python
>>> import eland as ed
>>> from elasticsearch import Elasticsearch
@@ -73,16 +75,16 @@ You can also connect Eland to an Elasticsearch instance in Elastic Cloud:
3   181.694216  True  ... 0  2018-01-01 10:33:28
4   730.041778  False ... 0  2018-01-01 05:13:00
[5 rows x 27 columns]
------------------------------------
```

Eland can be used for complex queries and aggregations:

[source,python]
------------------------------------
```python
>>> df[df.Carrier != "Kibana Airlines"].groupby("Carrier").mean(numeric_only=False)
                  AvgTicketPrice  Cancelled                     timestamp
Carrier
ES-Air                630.235816   0.129814 2018-01-21 20:45:00.200000000
JetBeats              627.457373   0.134698 2018-01-21 14:43:18.112400635
Logstash Airways      624.581974   0.125188 2018-01-21 16:14:50.711798340
------------------------------------
```
docs/reference/installation.md (new file, 19 lines)
@@ -0,0 +1,19 @@
---
mapped_pages:
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/installation.html
---

# Installation [installation]

Eland can be installed with [pip](https://pip.pypa.io) from [PyPI](https://pypi.org/project/eland). We recommend [using a virtual environment](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) when installing with pip:

```sh
$ python -m pip install eland
```

Alternatively, Eland can be installed with [Conda](https://docs.conda.io) from [Conda Forge](https://anaconda.org/conda-forge/eland):

```sh
$ conda install -c conda-forge eland
```
docs/reference/machine-learning.md (new file, 199 lines)
@@ -0,0 +1,199 @@
---
mapped_pages:
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/machine-learning.html
---

# Machine Learning [machine-learning]

## Trained models [ml-trained-models]

Eland allows transforming *some*
[trained models](https://eland.readthedocs.io/en/latest/reference/api/eland.ml.MLModel.import_model.html#parameters) from scikit-learn, XGBoost,
and LightGBM libraries to be serialized and used as an inference model in {{es}}.

```python
>>> from xgboost import XGBClassifier
>>> from eland.ml import MLModel

# Train and exercise an XGBoost ML model locally
>>> xgb_model = XGBClassifier(booster="gbtree")
>>> xgb_model.fit(training_data[0], training_data[1])

>>> xgb_model.predict(training_data[0])
[0 1 1 0 1 0 0 0 1 0]

# Import the model into Elasticsearch
>>> es_model = MLModel.import_model(
    es_client="http://localhost:9200",
    model_id="xgb-classifier",
    model=xgb_model,
    feature_names=["f0", "f1", "f2", "f3", "f4"],
)

# Exercise the ML model in Elasticsearch with the training data
>>> es_model.predict(training_data[0])
[0 1 1 0 1 0 0 0 1 0]
```

## Natural language processing (NLP) with PyTorch [ml-nlp-pytorch]

::::{important}
You need to install the appropriate version of PyTorch to import an NLP model. Run `python -m pip install 'eland[pytorch]'` to install that version.
::::

For NLP tasks, Eland enables you to import PyTorch models into {{es}}. Use the `eland_import_hub_model` script to download and install supported [transformer models](https://huggingface.co/transformers) from the [Hugging Face model hub](https://huggingface.co/models). For example:

```bash
eland_import_hub_model <authentication> \ <1>
      --url http://localhost:9200/ \ <2>
      --hub-model-id elastic/distilbert-base-cased-finetuned-conll03-english \ <3>
      --task-type ner \ <4>
      --start
```

1. Use an authentication method to access your cluster. Refer to [Authentication methods](machine-learning.md#ml-nlp-pytorch-auth).
2. The cluster URL. Alternatively, use `--cloud-id`.
3. Specify the identifier for the model in the Hugging Face model hub.
4. Specify the type of NLP task. Supported values are `fill_mask`, `ner`, `question_answering`, `text_classification`, `text_embedding`, `text_expansion`, `text_similarity` and `zero_shot_classification`.

For more information about the available options, run `eland_import_hub_model` with the `--help` option.

```bash
eland_import_hub_model --help
```

### Import model with Docker [ml-nlp-pytorch-docker]

::::{important}
To use the Docker container, you need to clone the Eland repository: [https://github.com/elastic/eland](https://github.com/elastic/eland)
::::

If you want to use Eland without installing it, you can use the Docker image. You can use the container interactively:

```bash
docker run -it --rm --network host docker.elastic.co/eland/eland
```

Running installed scripts is also possible without an interactive shell, for example:

```bash
docker run -it --rm docker.elastic.co/eland/eland \
    eland_import_hub_model \
      --url $ELASTICSEARCH_URL \
      --hub-model-id elastic/distilbert-base-uncased-finetuned-conll03-english \
      --start
```

Replace the `$ELASTICSEARCH_URL` with the URL for your Elasticsearch cluster. For authentication purposes, include an administrator username and password in the URL in the following format: `https://username:password@host:port`.

### Install models in an air-gapped environment [ml-nlp-pytorch-air-gapped]

You can install models in a restricted or closed network by pointing the `eland_import_hub_model` script to local files.

For an offline install of a Hugging Face model, the model first needs to be cloned locally. Git and [Git Large File Storage](https://git-lfs.com/) are required to be installed in your system.

1. Select a model you want to use from Hugging Face. Refer to the [compatible third party model](docs-content://explore-analyze/machine-learning/nlp/ml-nlp-model-ref.md) list for more information on the supported architectures.
2. Clone the selected model from Hugging Face by using the model URL. For example:

   ```bash
   git clone https://huggingface.co/dslim/bert-base-NER
   ```

   This command results in a local copy of the model in the directory `bert-base-NER`.

3. Use the `eland_import_hub_model` script with the `--hub-model-id` set to the directory of the cloned model to install it:

   ```bash
   eland_import_hub_model \
         --url 'XXXX' \
         --hub-model-id /PATH/TO/MODEL \
         --task-type ner \
         --es-username elastic --es-password XXX \
         --es-model-id bert-base-ner
   ```

If you use the Docker image to run `eland_import_hub_model` you must bind mount the model directory, so the container can read the files:

```bash
docker run --mount type=bind,source=/PATH/TO/MODEL,destination=/model,readonly -it --rm docker.elastic.co/eland/eland \
    eland_import_hub_model \
      --url 'XXXX' \
      --hub-model-id /model \
      --task-type ner \
      --es-username elastic --es-password XXX \
      --es-model-id bert-base-ner
```

Once it's uploaded to {{es}}, the model will have the ID specified by `--es-model-id`. If it is not set, the model ID is derived from `--hub-model-id`; spaces and path delimiters are converted to double underscores `__`.
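As a rough illustration of that derivation (the authoritative rules live in the import script; this sketch only mirrors the sentence above):

```python
# Illustrative only: turn a Hugging Face hub ID into an Elasticsearch model ID
hub_model_id = "elastic/distilbert-base-cased-finetuned-conll03-english"
es_model_id = hub_model_id.replace("/", "__").replace(" ", "__")
print(es_model_id)  # elastic__distilbert-base-cased-finetuned-conll03-english
```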
### Connect to Elasticsearch through a proxy [ml-nlp-pytorch-proxy]

Behind the scenes, Eland uses the `requests` Python library, which [allows configuring proxies through an environment variable](https://requests.readthedocs.io/en/latest/user/advanced/#proxies). For example, to use an HTTP proxy to connect to an HTTPS Elasticsearch cluster, you need to set the `HTTPS_PROXY` environment variable when invoking Eland:

```bash
HTTPS_PROXY=http://proxy-host:proxy-port eland_import_hub_model ...
```

If you disabled security on your Elasticsearch cluster, you should use `HTTP_PROXY` instead.

### Authentication methods [ml-nlp-pytorch-auth]

The following authentication options are available when using the import script:

* Elasticsearch username and password authentication (specified with the `-u` and `-p` options):

  ```bash
  eland_import_hub_model -u <username> -p <password> --cloud-id <cloud-id> ...
  ```

  These `-u` and `-p` options also work when you use `--url`.

* Elasticsearch username and password authentication (embedded in the URL):

  ```bash
  eland_import_hub_model --url https://<user>:<password>@<hostname>:<port> ...
  ```

* Elasticsearch API key authentication:

  ```bash
  eland_import_hub_model --es-api-key <api-key> --url https://<hostname>:<port> ...
  ```

* HuggingFace Hub access token (for private models):

  ```bash
  eland_import_hub_model --hub-access-token <access-token> ...
  ```

### TLS/SSL [ml-nlp-pytorch-tls]

The following TLS/SSL options for Elasticsearch are available when using the import script:

* Specify alternate CA bundle to verify the cluster certificate:

  ```bash
  eland_import_hub_model --ca-certs CA_CERTS ...
  ```

* Disable TLS/SSL verification altogether (strongly discouraged):

  ```bash
  eland_import_hub_model --insecure ...
  ```
docs/reference/toc.yml (new file, 6 lines)
@@ -0,0 +1,6 @@
project: 'Eland reference'
toc:
  - file: index.md
  - file: installation.md
  - file: dataframes.md
  - file: machine-learning.md
@@ -200,7 +200,7 @@ Configuring PyCharm And Running Tests
- To test specific versions of Python run

  .. code-block:: bash

      nox -s test-3.8
      nox -s test-3.12


Documentation
@@ -24,7 +24,7 @@
    "\n",
    "For this example, you will need:\n",
    "\n",
    "- Python 3.8 or later\n",
    "- Python 3.9 or later\n",
    "- An Elastic deployment\n",
    "   - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration))\n",
    "\n",
@@ -1,5 +1,5 @@
eland.DataFrame.to\_json
=======================
========================

.. currentmodule:: eland
@@ -17,6 +17,7 @@
   ~MLModel.delete_model
   ~MLModel.exists_model
   ~MLModel.export_model
   ~MLModel.import_ltr_model
   ~MLModel.import_model
   ~MLModel.predict
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.

import warnings

from ._version import (  # noqa: F401
    __author__,
    __author_email__,
@@ -25,13 +27,16 @@ from ._version import (  # noqa: F401
    __url__,
    __version__,
)
from .common import SortOrder
from .common import ElandDeprecationWarning, SortOrder
from .dataframe import DataFrame
from .etl import csv_to_eland, eland_to_pandas, pandas_to_eland
from .index import Index
from .ndframe import NDFrame
from .series import Series

# Display Eland deprecation warnings by default
warnings.simplefilter("default", category=ElandDeprecationWarning)

__all__ = [
    "DataFrame",
    "Series",
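Since this hunk makes Eland's deprecation warnings visible by default, a consumer who prefers the old behaviour could plausibly opt back out; a minimal sketch, assuming `ElandDeprecationWarning` remains importable from the top-level package:

```python
import warnings

from eland import ElandDeprecationWarning

# Suppress Eland's deprecation warnings again (shown by default after this change)
warnings.simplefilter("ignore", category=ElandDeprecationWarning)
```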
@@ -18,7 +18,7 @@
__title__ = "eland"
__description__ = "Python Client and Toolkit for DataFrames, Big Data, Machine Learning and ETL in Elasticsearch"
__url__ = "https://github.com/elastic/eland"
__version__ = "8.15.1"
__version__ = "9.0.1"
__author__ = "Steve Dodson"
__author_email__ = "steve.dodson@elastic.co"
__maintainer__ = "Elastic Client Library Maintainers"
@@ -236,14 +236,9 @@ def check_cluster_version(es_client, logger):
            f"Elasticsearch version {major_version} does not support NLP models. Please upgrade Elasticsearch to the latest version"
        )
        exit(1)

    # PyTorch was upgraded to version 2.3.1 in 8.15.2
    # and is incompatible with earlier versions
    if sem_ver < (8, 15, 2):
        import torch

    elif major_version < 9:
        logger.error(
            f"Eland uses PyTorch version {torch.__version__} which is incompatible with Elasticsearch versions prior to 8.15.2. Please upgrade Elasticsearch to at least version 8.15.2"
            "Eland 9.x does not support Elasticsearch 8.x. Please upgrade Elasticsearch first."
        )
        exit(1)
@@ -52,6 +52,10 @@ PANDAS_VERSION: Tuple[int, ...] = tuple(
_ELAND_MAJOR_VERSION = int(_eland_version.split(".")[0])


class ElandDeprecationWarning(DeprecationWarning):
    """Warning for deprecated functionality in Eland"""


with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    EMPTY_SERIES_DTYPE = pd.Series().dtype
@@ -305,7 +309,7 @@ def elasticsearch_date_to_pandas_date(


def ensure_es_client(
    es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch]
    es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch],
) -> Elasticsearch:
    if isinstance(es_client, tuple):
        es_client = list(es_client)
@@ -34,7 +34,7 @@ from pandas.io.formats.printing import pprint_thing  # type: ignore
from pandas.util._validators import validate_bool_kwarg  # type: ignore

import eland.plotting as gfx
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
from eland.filter import BooleanFilter
from eland.groupby import DataFrameGroupBy
from eland.ndframe import NDFrame
@@ -411,9 +411,7 @@ class DataFrame(NDFrame):
            axis = pd.DataFrame._get_axis_name(axis)
            axes = {axis: labels}
        elif index is not None or columns is not None:
            axes, _ = pd.DataFrame()._construct_axes_from_arguments(
                (index, columns), {}
            )
            axes = {"columns": columns, "index": index}
        else:
            raise ValueError(
                "Need to specify at least one of 'labels', 'index' or 'columns'"
@@ -1361,7 +1359,7 @@ class DataFrame(NDFrame):
        default_handler=None,
        lines=False,
        compression="infer",
        index=True,
        index=None,
        indent=None,
        storage_options=None,
    ):
@@ -1376,6 +1374,8 @@ class DataFrame(NDFrame):
        --------
        :pandas_api_docs:`pandas.DataFrame.to_json`
        """
        if index is None and PANDAS_VERSION[0] == 1:
            index = True  # switch to the pandas 1 default
        kwargs = {
            "path_or_buf": path_or_buf,
            "orient": orient,
eland/etl.py (39 changed lines)
@@ -16,6 +16,7 @@
# under the License.

import csv
import warnings
from collections import deque
from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union
@@ -110,15 +111,15 @@ def pandas_to_eland(
    2  3.141    1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
    [3 rows x 8 columns]
    >>> pd_df.dtypes
    A           float64
    B             int64
    C            object
    D    datetime64[ns]
    E           float64
    F              bool
    G             int64
    H            object
    >>> pd_df.dtypes  # doctest skip required for pandas < 2 # doctest: +SKIP
    A           float64
    B             int64
    C            object
    D     datetime64[s]
    E           float64
    F              bool
    G             int64
    H            object
    dtype: object

    Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`.
@@ -307,9 +308,9 @@ def csv_to_eland(  # type: ignore
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    squeeze=None,
    prefix=None,
    mangle_dupe_cols=True,
    mangle_dupe_cols=None,
    # General Parsing Configuration
    dtype=None,
    engine=None,
@@ -357,6 +358,7 @@ def csv_to_eland(  # type: ignore
    low_memory: bool = _DEFAULT_LOW_MEMORY,
    memory_map=False,
    float_precision=None,
    **extra_kwargs,
) -> "DataFrame":
    """
    Read a comma-separated values (csv) file into eland.DataFrame (i.e. an Elasticsearch index).
@@ -485,7 +487,6 @@ def csv_to_eland(  # type: ignore
        "usecols": usecols,
        "verbose": verbose,
        "encoding": encoding,
        "squeeze": squeeze,
        "memory_map": memory_map,
        "float_precision": float_precision,
        "na_filter": na_filter,
@@ -494,9 +495,9 @@ def csv_to_eland(  # type: ignore
        "error_bad_lines": error_bad_lines,
        "on_bad_lines": on_bad_lines,
        "low_memory": low_memory,
        "mangle_dupe_cols": mangle_dupe_cols,
        "infer_datetime_format": infer_datetime_format,
        "skip_blank_lines": skip_blank_lines,
        **extra_kwargs,
    }

    if chunksize is None:
@@ -525,6 +526,18 @@ def csv_to_eland(  # type: ignore

    kwargs.pop("on_bad_lines")

    if "squeeze" in kwargs:
        kwargs.pop("squeeze")
        warnings.warn(
            "This argument no longer works, use .squeeze('columns') on your DataFrame instead"
        )

    if "mangle_dupe_cols" in kwargs:
        kwargs.pop("mangle_dupe_cols")
        warnings.warn(
            "The mangle_dupe_cols argument no longer works. Furthermore, "
            "duplicate columns will automatically get a number suffix."
        )
    # read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch)
    reader = pd.read_csv(filepath_or_buffer, **kwargs)
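For callers migrating off `squeeze=True`, the replacement the warning points to looks roughly like this (`single_column.csv` is a hypothetical file):

```python
import pandas as pd

# pandas 2 removed read_csv's squeeze argument; squeeze the resulting
# single-column DataFrame into a Series instead
series = pd.read_csv("single_column.csv").squeeze("columns")
```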
@@ -712,8 +712,11 @@ class FieldMappings:
            capabilities, orient="index", columns=FieldMappings.column_labels
        )

        self._mappings_capabilities = self._mappings_capabilities.append(
            capability_matrix_row
        self._mappings_capabilities = pd.concat(
            [
                self._mappings_capabilities,
                capability_matrix_row,
            ]
        )

    def numeric_source_fields(self) -> List[str]:
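This hunk swaps `DataFrame.append`, which pandas 2 removed, for `pd.concat`. A minimal sketch of the same migration pattern on toy data:

```python
import pandas as pd

df = pd.DataFrame({"a": [1]})
row = pd.DataFrame({"a": [2]})

# Before (pandas 1 only): df = df.append(row)
# After (works on pandas 1 and 2):
df = pd.concat([df, row])
```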
@@ -50,10 +50,7 @@ class Index:
        # index_field.setter
        self._is_source_field = False

        # The type:ignore is due to mypy not being smart enough
        # to recognize the property.setter has a different type
        # than the property.getter.
        self.es_index_field = es_index_field  # type: ignore
        self.es_index_field = es_index_field

    @property
    def sort_field(self) -> str:
@@ -19,7 +19,7 @@ import base64
import gzip
import json
from abc import ABC
from typing import Any, Dict, List, Optional, Sequence
from typing import Any, Dict, List, Optional, Sequence, Tuple


def add_if_exists(d: Dict[str, Any], k: str, v: Any) -> None:
@@ -58,6 +58,9 @@ class ModelSerializer(ABC):
            "ascii"
        )

    def bounds(self) -> Tuple[float, float]:
        raise NotImplementedError


class TreeNode:
    def __init__(
@@ -129,6 +132,14 @@ class Tree(ModelSerializer):
        add_if_exists(d, "tree_structure", [t.to_dict() for t in self._tree_structure])
        return {"tree": d}

    def bounds(self) -> Tuple[float, float]:
        leaf_values = [
            tree_node._leaf_value[0]
            for tree_node in self._tree_structure
            if tree_node._leaf_value is not None
        ]
        return min(leaf_values), max(leaf_values)


class Ensemble(ModelSerializer):
    def __init__(
@@ -158,3 +169,9 @@ class Ensemble(ModelSerializer):
        add_if_exists(d, "classification_weights", self._classification_weights)
        add_if_exists(d, "aggregate_output", self._output_aggregator)
        return {"ensemble": d}

    def bounds(self) -> Tuple[float, float]:
        min_bound, max_bound = tuple(
            map(sum, zip(*[model.bounds() for model in self._trained_models]))
        )
        return min_bound, max_bound
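The new `Ensemble.bounds` sums the per-tree leaf-value extremes computed by `Tree.bounds`. A hypothetical worked example of that aggregation:

```python
# Two trees with leaf-value ranges (-1.0, 2.0) and (0.5, 1.0); summing the
# minima and the maxima gives the ensemble's overall output bounds
tree_bounds = [(-1.0, 2.0), (0.5, 1.0)]
min_bound, max_bound = map(sum, zip(*tree_bounds))
assert (min_bound, max_bound) == (-0.5, 3.0)
```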
@@ -1,16 +0,0 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
@ -1,226 +0,0 @@
|
||||
# Licensed to Elasticsearch B.V. under one or more contributor
|
||||
# license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright
|
||||
# ownership. Elasticsearch B.V. licenses this file to you under
|
||||
# the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from typing import Any, Dict
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .._optional import import_optional_dependency
|
||||
|
||||
import_optional_dependency("sklearn", on_version="warn")
|
||||
|
||||
import sklearn
|
||||
from sklearn.preprocessing import FunctionTransformer
|
||||
|
||||
|
||||
class Tree:
|
||||
"""Wrapper to create sklearn Tree objects from Elastic ML tree
|
||||
description in JSON format.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
json_tree: Dict[str, Any],
|
||||
feature_names_map: Dict[str, int],
|
||||
):
|
||||
tree_leaf = -1
|
||||
|
||||
node_count = len(json_tree["tree_structure"])
|
||||
children_left = np.ones((node_count,), dtype=int) * tree_leaf
|
||||
children_right = np.ones((node_count,), dtype=int) * tree_leaf
|
||||
feature = np.ones((node_count,), dtype=int) * -2
|
||||
threshold = np.ones((node_count,), dtype=float) * -2
|
||||
impurity = np.zeros((node_count,), dtype=float)
|
||||
# value works only for regression and binary classification
|
||||
value = np.zeros((node_count, 1, 1), dtype="<f8")
|
||||
n_node_samples = np.zeros((node_count,), dtype=int)
|
||||
|
||||
# parse values from the JSON tree
|
||||
feature_names = json_tree["feature_names"]
|
||||
for json_node in json_tree["tree_structure"]:
|
||||
node_id = json_node["node_index"]
|
||||
if "number_samples" in json_node:
|
||||
n_node_samples[node_id] = json_node["number_samples"]
|
||||
else:
|
||||
n_node_samples[node_id] = 0
|
||||
|
||||
if "leaf_value" not in json_node:
|
||||
children_left[node_id] = json_node["left_child"]
|
||||
children_right[node_id] = json_node["right_child"]
|
||||
feature[node_id] = feature_names_map[
|
||||
feature_names[json_node["split_feature"]]
|
||||
]
|
||||
threshold[node_id] = json_node["threshold"]
|
||||
if "split_gain" in json_node:
|
||||
impurity[node_id] = json_node["split_gain"]
|
||||
else:
|
||||
impurity[node_id] = -1
|
||||
else:
|
||||
value[node_id, 0, 0] = json_node["leaf_value"]
|
||||
|
||||
# iterate through tree to get max depth and expected values
|
||||
weighted_n_node_samples = n_node_samples.copy()
|
||||
self.max_depth = Tree._compute_expectations(
|
||||
children_left=children_left,
|
||||
children_right=children_right,
|
||||
node_sample_weight=weighted_n_node_samples,
|
||||
values=value,
|
||||
node_index=0,
|
||||
)
|
||||
self.n_outputs = value.shape[-1]
|
||||
|
||||
# initialize the sklearn tree
|
||||
self.tree = sklearn.tree._tree.Tree(
|
||||
len(feature_names), np.array([1], dtype=int), 1
|
||||
)
|
||||
node_state = np.array(
|
||||
[
|
||||
(
|
||||
children_left[i],
|
||||
children_right[i],
|
||||
feature[i],
|
||||
threshold[i],
|
||||
impurity[i],
|
||||
n_node_samples[i],
|
||||
weighted_n_node_samples[i],
|
||||
True,
|
||||
)
|
||||
for i in range(node_count)
|
||||
],
|
||||
dtype={
|
||||
"names": [
|
||||
"left_child",
|
||||
"right_child",
|
||||
"feature",
|
||||
"threshold",
|
||||
"impurity",
|
||||
"n_node_samples",
|
||||
"weighted_n_node_samples",
|
||||
"missing_go_to_left",
|
||||
],
|
||||
"formats": ["<i8", "<i8", "<i8", "<f8", "<f8", "<i8", "<f8", "u1"],
|
||||
},
|
||||
)
|
||||
state = {
|
||||
"max_depth": self.max_depth,
|
||||
"node_count": node_count,
|
||||
"nodes": node_state,
|
||||
"values": value,
|
||||
}
|
||||
self.tree.__setstate__(state)
|
||||
|
||||
@staticmethod
|
||||
def _compute_expectations(
|
||||
children_left, children_right, node_sample_weight, values, node_index
|
||||
) -> int:
|
||||
if children_right[node_index] == -1:
|
||||
return 0
|
||||
|
||||
left_index = children_left[node_index]
|
||||
right_index = children_right[node_index]
|
||||
depth_left = Tree._compute_expectations(
|
||||
children_left, children_right, node_sample_weight, values, left_index
|
||||
)
|
||||
depth_right = Tree._compute_expectations(
|
||||
children_left, children_right, node_sample_weight, values, right_index
|
||||
)
|
||||
left_weight = node_sample_weight[left_index]
|
||||
right_weight = node_sample_weight[right_index]
|
||||
|
||||
v = (
|
||||
(
|
||||
left_weight * values[left_index, :]
|
||||
+ right_weight * values[right_index, :]
|
||||
)
|
||||
/ (left_weight + right_weight)
|
||||
if left_weight + right_weight > 0
|
||||
else 0
|
||||
)
|
||||
values[node_index, :] = v
|
||||
return max(depth_left, depth_right) + 1
|
||||


class TargetMeanEncoder(FunctionTransformer):
    """FunctionTransformer implementation of the target mean encoder, which is
    deserialized from the Elastic ML preprocessor description in JSON format.
    """

    def __init__(self, preprocessor: Dict[str, Any]):
        self.preprocessor = preprocessor
        target_map = self.preprocessor["target_mean_encoding"]["target_map"]
        feature_name_out = self.preprocessor["target_mean_encoding"]["feature_name"]
        self.field_name_in = self.preprocessor["target_mean_encoding"]["field"]
        fallback_value = self.preprocessor["target_mean_encoding"]["default_value"]

        def func(column):
            return np.array(
                [
                    (
                        target_map[str(category)]
                        if category in target_map
                        else fallback_value
                    )
                    for category in column
                ]
            ).reshape(-1, 1)

        def feature_names_out(ft, carr):
            return [feature_name_out if c == self.field_name_in else c for c in carr]

        super().__init__(func=func, feature_names_out=feature_names_out)


class FrequencyEncoder(FunctionTransformer):
    """FunctionTransformer implementation of the frequency encoder, which is
    deserialized from the Elastic ML preprocessor description in JSON format.
    """

    def __init__(self, preprocessor: Dict[str, Any]):
        self.preprocessor = preprocessor
        frequency_map = self.preprocessor["frequency_encoding"]["frequency_map"]
        feature_name_out = self.preprocessor["frequency_encoding"]["feature_name"]
        self.field_name_in = self.preprocessor["frequency_encoding"]["field"]
        fallback_value = 0.0

        def func(column):
            return np.array(
                [
                    (
                        frequency_map[str(category)]
                        if category in frequency_map
                        else fallback_value
                    )
                    for category in column
                ]
            ).reshape(-1, 1)

        def feature_names_out(ft, carr):
            return [feature_name_out if c == self.field_name_in else c for c in carr]

        super().__init__(func=func, feature_names_out=feature_names_out)


class OneHotEncoder(sklearn.preprocessing.OneHotEncoder):
    """Wrapper for the sklearn one-hot encoder, which is deserialized from the
    Elastic ML preprocessor description in JSON format.
    """

    def __init__(self, preprocessor: Dict[str, Any]):
        self.preprocessor = preprocessor
        self.field_name_in = self.preprocessor["one_hot_encoding"]["field"]
        self.cats = [list(self.preprocessor["one_hot_encoding"]["hot_map"].keys())]
        super().__init__(categories=self.cats, handle_unknown="ignore")
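Editor's note: for illustration, here is how one of these deserialized encoders behaves on a hand-written preprocessor description (the field name and frequencies below are invented, not taken from a real model):

preprocessor = {
    "frequency_encoding": {
        "field": "carrier",
        "feature_name": "carrier_frequency",
        "frequency_map": {"JetBeats": 0.33, "ES-Air": 0.32},
    }
}

encoder = FrequencyEncoder(preprocessor)
# Unknown categories fall back to 0.0; the output is a single numeric column.
print(encoder.transform(["JetBeats", "Unknown Air"]))  # [[0.33], [0.0]]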
eland/ml/exporters/common.py (deleted)
@@ -1,46 +0,0 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import eland


class ModelDefinitionKeyError(Exception):
    """
    This exception is raised when a key is not found in the model definition.

    Attributes:
        missed_key (str): The key that was not found in the model definition.
        available_keys (List[str]): The list of keys that are available in the model definition.

    Examples:
        model_definition = {"key1": "value1", "key2": "value2"}
        try:
            model_definition["key3"]
        except KeyError as ex:
            raise ModelDefinitionKeyError(ex) from ex
    """

    def __init__(self, ex: KeyError):
        self.missed_key = ex.args[0]

    def __str__(self):
        return (
            f'Key "{self.missed_key}" is not available. '
            + "The model definition may have changed. "
            + "Make sure you are using an Elasticsearch version compatible "
            + f"with Eland {eland.__version__}."
        )
eland/ml/exporters/es_gb_models.py (deleted)
@@ -1,472 +0,0 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from abc import ABC
from typing import Any, List, Literal, Mapping, Optional, Set, Tuple, Union

import numpy as np
from elasticsearch import Elasticsearch
from numpy.typing import ArrayLike

from .._optional import import_optional_dependency

import_optional_dependency("sklearn", on_version="warn")

from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble._gb_losses import (
    BinomialDeviance,
    HuberLossFunction,
    LeastSquaresError,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils.validation import check_array

from eland.common import ensure_es_client
from eland.ml.common import TYPE_CLASSIFICATION, TYPE_REGRESSION

from ._sklearn_deserializers import Tree
from .common import ModelDefinitionKeyError


class ESGradientBoostingModel(ABC):
    """
    Abstract class for converting an Elastic ML model into an sklearn Pipeline.
    """

    def __init__(
        self,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
    ) -> None:
        """
        Parameters
        ----------
        es_client : Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance
        model_id : str
            The unique identifier of the trained inference model in Elasticsearch.

        Raises
        ------
        RuntimeError
            On failure to retrieve trained model information for the specified model ID.
        ValueError
            The model is expected to be trained in the Elastic Stack. Models initially imported
            from xgboost, lgbm, or sklearn are not supported.
        """
        self.es_client: Elasticsearch = ensure_es_client(es_client)
        self.model_id = model_id

        self._trained_model_result = self.es_client.ml.get_trained_models(
            model_id=self.model_id,
            decompress_definition=True,
            include=["hyperparameters", "definition"],
        )

        if (
            "trained_model_configs" not in self._trained_model_result
            or len(self._trained_model_result["trained_model_configs"]) == 0
        ):
            raise RuntimeError(
                f"Failed to retrieve the trained model for model ID {self.model_id!r}"
            )

        if "metadata" not in self._trained_model_result["trained_model_configs"][0]:
            raise ValueError(
                "Error initializing sklearn classifier. Incorrect prior class probability. "
                + "Note: only export of models trained in the Elastic Stack is supported."
            )
        preprocessors = []
        if "preprocessors" in self._definition:
            preprocessors = self._definition["preprocessors"]
        (
            self.feature_names_in_,
            self.input_field_names,
        ) = ESGradientBoostingModel._get_feature_names_in_(
            preprocessors,
            self._definition["trained_model"]["ensemble"]["feature_names"],
            self._trained_model_result["trained_model_configs"][0]["input"][
                "field_names"
            ],
        )

        feature_names_map = {name: i for i, name in enumerate(self.feature_names_in_)}

        trained_models = self._definition["trained_model"]["ensemble"]["trained_models"]
        self._trees = []
        for trained_model in trained_models:
            self._trees.append(Tree(trained_model["tree"], feature_names_map))

        # tree 0 is the constant estimator
        self.n_estimators = len(trained_models) - 1
    def _initialize_estimators(self, decision_tree_type) -> None:
        self.estimators_ = np.ndarray(
            (len(self._trees) - 1, 1), dtype=decision_tree_type
        )
        self.n_estimators_ = self.estimators_.shape[0]

        for i in range(self.n_estimators_):
            estimator = decision_tree_type()
            estimator.tree_ = self._trees[i + 1].tree
            estimator.n_features_in_ = self.n_features_in_
            estimator.max_depth = self._max_depth
            estimator.max_features_ = self.max_features_
            self.estimators_[i, 0] = estimator

    def _extract_common_parameters(self) -> None:
        self.n_features_in_ = len(self.feature_names_in_)
        self.max_features_ = self.n_features_in_

    @property
    def _max_depth(self) -> int:
        return max(map(lambda x: x.max_depth, self._trees))

    @property
    def _n_outputs(self) -> int:
        return self._trees[0].n_outputs

    @property
    def _definition(self) -> Mapping[Union[str, int], Any]:
        return self._trained_model_result["trained_model_configs"][0]["definition"]

    @staticmethod
    def _get_feature_names_in_(
        preprocessors, feature_names, field_names
    ) -> Tuple[List[str], Set[str]]:
        input_field_names = set()

        def add_input_field_name(preprocessor_type: str, feature_name: str) -> None:
            if feature_name in feature_names:
                input_field_names.add(preprocessor[preprocessor_type]["field"])

        for preprocessor in preprocessors:
            if "target_mean_encoding" in preprocessor:
                add_input_field_name(
                    "target_mean_encoding",
                    preprocessor["target_mean_encoding"]["feature_name"],
                )
            elif "frequency_encoding" in preprocessor:
                add_input_field_name(
                    "frequency_encoding",
                    preprocessor["frequency_encoding"]["feature_name"],
                )
            elif "one_hot_encoding" in preprocessor:
                for feature_name in preprocessor["one_hot_encoding"][
                    "hot_map"
                ].values():
                    add_input_field_name("one_hot_encoding", feature_name)

        for field_name in field_names:
            if field_name in feature_names and field_name not in input_field_names:
                input_field_names.add(field_name)

        return feature_names, input_field_names

    @property
    def preprocessors(self) -> List[Any]:
        """
        Returns the list of preprocessor JSON definitions.

        Returns
        -------
        List[Any]
            List of preprocessor definitions or [].
        """
        if "preprocessors" in self._definition:
            return self._definition["preprocessors"]
        return []

    def fit(self, X, y, sample_weight=None, monitor=None) -> None:
        """
        Override of the sklearn fit() method. It does nothing since Elastic ML models are
        trained in the Elastic Stack or imported.
        """
        # Do nothing: the model is fitted using the Elasticsearch API
        pass
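Editor's note: the nested keys walked above come from the trained model configuration returned by the get-trained-models API. A schematic sketch of the relevant structure (the field names are those used in the code above; the concrete values are invented):

trained_model_config = {
    "input": {"field_names": ["carrier", "distance"]},
    "metadata": {"analytics_config": {"analysis": {...}}},
    "definition": {
        "preprocessors": [{"frequency_encoding": {...}}],
        "trained_model": {
            "ensemble": {
                "feature_names": ["carrier_frequency", "distance"],
                # trained_models[0] is the constant estimator,
                # the rest are the boosted trees
                "trained_models": [{"tree": {...}}, {"tree": {...}}],
            }
        },
    },
}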


class ESGradientBoostingClassifier(ESGradientBoostingModel, GradientBoostingClassifier):
    """
    Elastic ML model wrapper compatible with sklearn GradientBoostingClassifier.
    """

    def __init__(
        self,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
    ) -> None:
        """
        Parameters
        ----------
        es_client : Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance
        model_id : str
            The unique identifier of the trained inference model in Elasticsearch.

        Raises
        ------
        NotImplementedError
            Multi-class classification is not supported at the moment.
        ValueError
            The classifier should be defined for at least 2 classes.
        ModelDefinitionKeyError
            If required data cannot be extracted from the model definition due to a schema change.
        """

        try:
            ESGradientBoostingModel.__init__(self, es_client, model_id)
            self._extract_common_parameters()
            GradientBoostingClassifier.__init__(
                self,
                learning_rate=1.0,
                n_estimators=self.n_estimators,
                max_depth=self._max_depth,
            )

            if "classification_labels" in self._definition["trained_model"]["ensemble"]:
                self.classes_ = np.array(
                    self._definition["trained_model"]["ensemble"][
                        "classification_labels"
                    ]
                )
            else:
                self.classes_ = None

            self.n_outputs = self._n_outputs
            if self.classes_ is not None:
                self.n_classes_ = len(self.classes_)
            elif self.n_outputs <= 2:
                self.n_classes_ = 2
            else:
                self.n_classes_ = self.n_outputs

            if self.n_classes_ == 2:
                self._loss = BinomialDeviance(self.n_classes_)
                # self.n_outputs = 1
            elif self.n_classes_ > 2:
                raise NotImplementedError("Only binary classification is implemented.")
            else:
                raise ValueError(f"At least 2 classes required, got {self.n_classes_}.")

            self.init_ = self._initialize_init_()
            self._initialize_estimators(DecisionTreeClassifier)
        except KeyError as ex:
            raise ModelDefinitionKeyError(ex) from ex

    @property
    def analysis_type(self) -> Literal["classification"]:
        return TYPE_CLASSIFICATION

    def _initialize_init_(self) -> DummyClassifier:
        estimator = DummyClassifier(strategy="prior")

        estimator.n_classes_ = self.n_classes_
        estimator.n_outputs_ = self.n_outputs
        estimator.classes_ = np.arange(self.n_classes_)
        estimator._strategy = estimator.strategy

        if self.n_classes_ == 2:
            log_odds = self._trees[0].tree.value.flatten()[0]
            if np.isnan(log_odds):
                raise ValueError(
                    "Error initializing sklearn classifier. Incorrect prior class probability. "
                    + "Note: only export of models trained in the Elastic Stack is supported."
                )
            class_prior = 1 / (1 + np.exp(-log_odds))
            estimator.class_prior_ = np.array([1 - class_prior, class_prior])
        else:
            raise NotImplementedError("Only binary classification is implemented.")

        return estimator
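Editor's note: the prior above is just the logistic transform of the constant tree's log-odds. A quick numeric check (the value is invented):

import numpy as np

log_odds = 1.0986  # hypothetical constant-estimator output, ln(3)
class_prior = 1 / (1 + np.exp(-log_odds))
print(round(class_prior, 2))  # 0.75 -> class_prior_ = [0.25, 0.75]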

    def predict_proba(
        self, X, feature_names_in: Optional[Union["ArrayLike", List[str]]] = None
    ) -> "ArrayLike":
        """Predict class probabilities for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        feature_names_in : {array of string, list of string} of length n_features.
            Feature names of the corresponding columns in X. Important, since the column list
            can be extended by ColumnTransformer through the pipeline. By default None.

        Returns
        -------
        ArrayLike of shape (n_samples, n_classes)
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
        """
        if feature_names_in is not None:
            if X.shape[1] != len(feature_names_in):
                raise ValueError(
                    f"Dimension mismatch: X with {X.shape[1]} columns has to be the same size as feature_names_in with {len(feature_names_in)}."
                )
            if isinstance(feature_names_in, np.ndarray):
                feature_names_in = feature_names_in.tolist()
            # select columns used by the model in the correct order
            X = X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]

        X = check_array(X)
        return GradientBoostingClassifier.predict_proba(self, X)

    def predict(
        self,
        X: "ArrayLike",
        feature_names_in: Optional[Union["ArrayLike", List[str]]] = None,
    ) -> "ArrayLike":
        """Predict class for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        feature_names_in : {array of string, list of string} of length n_features.
            Feature names of the corresponding columns in X. Important, since the column list
            can be extended by ColumnTransformer through the pipeline. By default None.

        Returns
        -------
        ArrayLike of shape (n_samples,)
            The predicted values.
        """
        if feature_names_in is not None:
            if X.shape[1] != len(feature_names_in):
                raise ValueError(
                    f"Dimension mismatch: X with {X.shape[1]} columns has to be the same size as feature_names_in with {len(feature_names_in)}."
                )
            if isinstance(feature_names_in, np.ndarray):
                feature_names_in = feature_names_in.tolist()
            # select columns used by the model in the correct order
            X = X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]

        X = check_array(X)
        return GradientBoostingClassifier.predict(self, X)


class ESGradientBoostingRegressor(ESGradientBoostingModel, GradientBoostingRegressor):
    """
    Elastic ML model wrapper compatible with sklearn GradientBoostingRegressor.
    """

    def __init__(
        self,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
    ) -> None:
        """
        Parameters
        ----------
        es_client : Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance
        model_id : str
            The unique identifier of the trained inference model in Elasticsearch.

        Raises
        ------
        NotImplementedError
            Only MSE, MSLE, and Huber loss functions are supported.
        ModelDefinitionKeyError
            If required data cannot be extracted from the model definition due to a schema change.
        """
        try:
            ESGradientBoostingModel.__init__(self, es_client, model_id)
            self._extract_common_parameters()
            GradientBoostingRegressor.__init__(
                self,
                learning_rate=1.0,
                n_estimators=self.n_estimators,
                max_depth=self._max_depth,
            )

            self.n_outputs = 1
            loss_function = self._trained_model_result["trained_model_configs"][0][
                "metadata"
            ]["analytics_config"]["analysis"][self.analysis_type]["loss_function"]
            if loss_function == "mse" or loss_function == "msle":
                self.criterion = "squared_error"
                self._loss = LeastSquaresError()
            elif loss_function == "huber":
                loss_parameter = self._trained_model_result[
                    "trained_model_configs"
                ][0]["metadata"]["analytics_config"]["analysis"][self.analysis_type][
                    "loss_function_parameter"
                ]
                self.criterion = "huber"
                self._loss = HuberLossFunction(loss_parameter)
            else:
                raise NotImplementedError(
                    "Only MSE, MSLE and Huber loss functions are supported."
                )

            self.init_ = self._initialize_init_()
            self._initialize_estimators(DecisionTreeRegressor)
        except KeyError as ex:
            raise ModelDefinitionKeyError(ex) from ex

    @property
    def analysis_type(self) -> Literal["regression"]:
        return TYPE_REGRESSION

    def _initialize_init_(self) -> DummyRegressor:
        constant = self._trees[0].tree.value[0]
        estimator = DummyRegressor(
            strategy="constant",
            constant=constant,
        )
        estimator.constant_ = np.array([constant])
        estimator.n_outputs_ = 1
        return estimator

    def predict(
        self,
        X: "ArrayLike",
        feature_names_in: Optional[Union["ArrayLike", List[str]]] = None,
    ) -> "ArrayLike":
        """Predict targets for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        feature_names_in : {array of string, list of string} of length n_features.
            Feature names of the corresponding columns in X. Important, since the column list
            can be extended by ColumnTransformer through the pipeline. By default None.

        Returns
        -------
        ArrayLike of shape (n_samples,)
            The predicted values.
        """
        if feature_names_in is not None:
            if X.shape[1] != len(feature_names_in):
                raise ValueError(
                    f"Dimension mismatch: X with {X.shape[1]} columns has to be the same size as feature_names_in with {len(feature_names_in)}."
                )
            if isinstance(feature_names_in, np.ndarray):
                feature_names_in = feature_names_in.tolist()
            # select columns used by the model in the correct order
            X = X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]

        X = check_array(X)
        return GradientBoostingRegressor.predict(self, X)
eland/ml/ml_model.py
@@ -38,7 +38,6 @@ if TYPE_CHECKING:
        RandomForestClassifier,
        RandomForestRegressor,
    )
    from sklearn.pipeline import Pipeline  # type: ignore # noqa: F401
    from sklearn.tree import (  # type: ignore # noqa: F401
        DecisionTreeClassifier,
        DecisionTreeRegressor,

@@ -572,83 +571,6 @@ class MLModel:
                return False
        return True

    def export_model(self) -> "Pipeline":
        """Export an Elastic ML model as an sklearn Pipeline.

        Returns
        -------
        sklearn.pipeline.Pipeline
            The deserialized preprocessors and the model wrapped in a Pipeline.

        Raises
        ------
        AssertionError
            If the preprocessors JSON definition has an unexpected schema.
        ValueError
            The model is expected to be trained in the Elastic Stack. Models initially imported
            from xgboost, lgbm, or sklearn are not supported.
        ValueError
            If an unexpected categorical encoding is found in the list of preprocessors.
        NotImplementedError
            Only regression and binary classification models are supported currently.
        """
        from sklearn.compose import ColumnTransformer  # type: ignore # noqa: F401
        from sklearn.pipeline import Pipeline

        from .exporters._sklearn_deserializers import (
            FrequencyEncoder,
            OneHotEncoder,
            TargetMeanEncoder,
        )
        from .exporters.es_gb_models import (
            ESGradientBoostingClassifier,
            ESGradientBoostingRegressor,
        )

        if self.model_type == TYPE_CLASSIFICATION:
            model = ESGradientBoostingClassifier(
                es_client=self._client, model_id=self._model_id
            )
        elif self.model_type == TYPE_REGRESSION:
            model = ESGradientBoostingRegressor(
                es_client=self._client, model_id=self._model_id
            )
        else:
            raise NotImplementedError(
                "Only regression and binary classification models are supported currently."
            )

        transformers = []
        for p in model.preprocessors:
            assert (
                len(p) == 1
            ), f"Unexpected preprocessor data structure: {p}. One-key mapping expected."
            encoding_type = list(p.keys())[0]
            field = p[encoding_type]["field"]
            if encoding_type == "frequency_encoding":
                transform = FrequencyEncoder(p)
                transformers.append((f"{field}_{encoding_type}", transform, field))
            elif encoding_type == "target_mean_encoding":
                transform = TargetMeanEncoder(p)
                transformers.append((f"{field}_{encoding_type}", transform, field))
            elif encoding_type == "one_hot_encoding":
                transform = OneHotEncoder(p)
                transformers.append((f"{field}_{encoding_type}", transform, [field]))
            else:
                raise ValueError(
                    f"Unexpected categorical encoding type {encoding_type} found. "
                    + "Expected encodings: frequency_encoding, target_mean_encoding, one_hot_encoding."
                )
        preprocessor = ColumnTransformer(
            transformers=transformers,
            remainder="passthrough",
            verbose_feature_names_out=False,
        )

        pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("es_model", model)])

        return pipeline

    @property
    def _trained_model_config(self) -> Dict[str, Any]:
        """Lazily loads an ML model's 'trained_model_config' information"""
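Editor's note: a usage sketch for the exported pipeline (the model ID and feature frame are placeholders; the assumed workflow requires a model trained in the Elastic Stack):

import pandas as pd
from eland.ml import MLModel

model = MLModel(es_client="http://localhost:9200", model_id="my-regression-model")
pipeline = model.export_model()

# The pipeline applies the deserialized encoders, then the boosted trees,
# entirely locally -- no further Elasticsearch round-trips are needed.
X = pd.DataFrame({"carrier": ["JetBeats"], "distance": [1024.0]})
print(pipeline.predict(X))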
eland/ml/pytorch/_pytorch_model.py
@@ -126,6 +126,7 @@ class PyTorchModel:
    def infer(
        self,
        docs: List[Mapping[str, str]],
        inference_config: Optional[Mapping[str, Any]] = None,
        timeout: str = DEFAULT_TIMEOUT,
    ) -> Any:
        if docs is None:

@@ -133,6 +134,8 @@ class PyTorchModel:

        __body: Dict[str, Any] = {}
        __body["docs"] = docs
        if inference_config is not None:
            __body["inference_config"] = inference_config

        __path = f"/_ml/trained_models/{_quote(self.model_id)}/_infer"
        __query: Dict[str, Any] = {}
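Editor's note: for illustration, a call to the method above might look like this (the client setup and model ID are hypothetical):

from elasticsearch import Elasticsearch
from eland.ml.pytorch import PyTorchModel

es = Elasticsearch("http://localhost:9200")
model = PyTorchModel(es, "my-ner-model")

# One mapping per inference input; the text field name must match the
# deployed model's configuration.
result = model.infer(docs=[{"text_field": "Elastic was founded in Amsterdam."}])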
eland/ml/pytorch/nlp_ml_model.py
@@ -86,6 +86,27 @@ class NlpXLMRobertaTokenizationConfig(NlpTokenizationConfig):
        )


class NlpDebertaV2TokenizationConfig(NlpTokenizationConfig):
    def __init__(
        self,
        *,
        do_lower_case: t.Optional[bool] = None,
        with_special_tokens: t.Optional[bool] = None,
        max_sequence_length: t.Optional[int] = None,
        truncate: t.Optional[
            t.Union["t.Literal['first', 'none', 'second']", str]
        ] = None,
        span: t.Optional[int] = None,
    ):
        super().__init__(
            configuration_type="deberta_v2",
            with_special_tokens=with_special_tokens,
            max_sequence_length=max_sequence_length,
            truncate=truncate,
            span=span,
        )


class NlpBertTokenizationConfig(NlpTokenizationConfig):
    def __init__(
        self,
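Editor's note: a brief, hypothetical instantiation of the new DeBERTa-v2 config (argument values invented for illustration):

config = NlpDebertaV2TokenizationConfig(
    with_special_tokens=True,
    max_sequence_length=512,
    truncate="first",
)
# configuration_type="deberta_v2" is set by the constructor; note that
# do_lower_case is accepted for API symmetry but not forwarded to super() here.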
eland/ml/pytorch/transformer_model.py
@@ -25,17 +25,13 @@ import os.path
import random
import re
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Set, Tuple, Union
from typing import Dict, List, Optional, Set, Tuple, Union

import torch  # type: ignore
import transformers  # type: ignore
from sentence_transformers import SentenceTransformer  # type: ignore
from torch import Tensor, nn
from torch import Tensor
from torch.profiler import profile  # type: ignore
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForQuestionAnswering,
    BertTokenizer,
    PretrainedConfig,
    PreTrainedModel,

@@ -48,6 +44,7 @@ from eland.ml.pytorch.nlp_ml_model import (
    NerInferenceOptions,
    NlpBertJapaneseTokenizationConfig,
    NlpBertTokenizationConfig,
    NlpDebertaV2TokenizationConfig,
    NlpMPNetTokenizationConfig,
    NlpRobertaTokenizationConfig,
    NlpTokenizationConfig,

@@ -64,8 +61,13 @@ from eland.ml.pytorch.nlp_ml_model import (
    ZeroShotClassificationInferenceOptions,
)
from eland.ml.pytorch.traceable_model import TraceableModel
from eland.ml.pytorch.wrappers import (
    _DistilBertWrapper,
    _DPREncoderWrapper,
    _QuestionAnsweringWrapperModule,
    _SentenceTransformerWrapperModule,
)

DEFAULT_OUTPUT_KEY = "sentence_embedding"
SUPPORTED_TASK_TYPES = {
    "fill_mask",
    "ner",

@@ -116,6 +118,7 @@ SUPPORTED_TOKENIZERS = (
    transformers.BartTokenizer,
    transformers.SqueezeBertTokenizer,
    transformers.XLMRobertaTokenizer,
    transformers.DebertaV2Tokenizer,
)
SUPPORTED_TOKENIZERS_NAMES = ", ".join(sorted([str(x) for x in SUPPORTED_TOKENIZERS]))

@@ -170,283 +173,6 @@ def task_type_from_model_config(model_config: PretrainedConfig) -> Optional[str]
    return potential_task_types.pop()

class _QuestionAnsweringWrapperModule(nn.Module):  # type: ignore
    """
    A wrapper around a question answering model.
    Our inference engine only takes the first tuple if the inference response
    is a tuple.

    This wrapper transforms the output to be a stacked tensor if it's a tuple.

    Otherwise, it passes it through.
    """

    def __init__(self, model: PreTrainedModel):
        super().__init__()
        self._hf_model = model
        self.config = model.config

    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        model = AutoModelForQuestionAnswering.from_pretrained(
            model_id, token=token, torchscript=True
        )
        if isinstance(
            model.config,
            (
                transformers.MPNetConfig,
                transformers.XLMRobertaConfig,
                transformers.RobertaConfig,
                transformers.BartConfig,
            ),
        ):
            return _TwoParameterQuestionAnsweringWrapper(model)
        else:
            return _QuestionAnsweringWrapper(model)


class _QuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
    def __init__(self, model: PreTrainedModel):
        super().__init__(model=model)

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "position_ids": position_ids,
        }

        # remove inputs for specific model types
        if isinstance(self._hf_model.config, transformers.DistilBertConfig):
            del inputs["token_type_ids"]
            del inputs["position_ids"]
        response = self._hf_model(**inputs)
        if isinstance(response, tuple):
            return torch.stack(list(response), dim=0)
        return response


class _TwoParameterQuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
    def __init__(self, model: PreTrainedModel):
        super().__init__(model=model)

    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        response = self._hf_model(**inputs)
        if isinstance(response, tuple):
            return torch.stack(list(response), dim=0)
        return response


class _DistilBertWrapper(nn.Module):  # type: ignore
    """
    In Elasticsearch the BERT tokenizer is used for DistilBERT models, but
    the BERT tokenizer produces 4 inputs where DistilBERT models expect 2.

    Wrap the model's forward function in a method that accepts the 4
    arguments passed to a BERT model, then discard the token_type_ids
    and the position_ids to match the wrapped DistilBERT model's forward
    function.
    """

    def __init__(self, model: transformers.PreTrainedModel):
        super().__init__()
        self._model = model
        self.config = model.config

    @staticmethod
    def try_wrapping(model: PreTrainedModel) -> Optional[Any]:
        if isinstance(model.config, transformers.DistilBertConfig):
            return _DistilBertWrapper(model)
        else:
            return model

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        _token_type_ids: Tensor = None,
        _position_ids: Tensor = None,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        return self._model(input_ids=input_ids, attention_mask=attention_mask)


class _SentenceTransformerWrapperModule(nn.Module):  # type: ignore
    """
    A wrapper around sentence-transformer models to provide pooling,
    normalization and other graph layers that are not defined in the base
    HuggingFace transformer model.
    """

    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__()
        self._hf_model = model
        self._st_model = SentenceTransformer(model.config.name_or_path)
        self._output_key = output_key
        self.config = model.config

        self._remove_pooling_layer()
        self._replace_transformer_layer()

    @staticmethod
    def from_pretrained(
        model_id: str,
        tokenizer: PreTrainedTokenizer,
        *,
        token: Optional[str] = None,
        output_key: str = DEFAULT_OUTPUT_KEY,
    ) -> Optional[Any]:
        model = AutoModel.from_pretrained(model_id, token=token, torchscript=True)
        if isinstance(
            tokenizer,
            (
                transformers.BartTokenizer,
                transformers.MPNetTokenizer,
                transformers.RobertaTokenizer,
                transformers.XLMRobertaTokenizer,
            ),
        ):
            return _TwoParameterSentenceTransformerWrapper(model, output_key)
        else:
            return _SentenceTransformerWrapper(model, output_key)

    def _remove_pooling_layer(self) -> None:
        """
        Removes any last pooling layer which is not used to create embeddings.
        Leaving this layer in will cause it to return a NoneType which in turn
        will fail to load in libtorch. Alternatively, we can just use the output
        of the pooling layer as a dummy, but this also affects (if only in a
        minor way) the performance of inference, so we're better off removing
        the layer if we can.
        """

        if hasattr(self._hf_model, "pooler"):
            self._hf_model.pooler = None

    def _replace_transformer_layer(self) -> None:
        """
        Replaces the HuggingFace Transformer layer in the SentenceTransformer
        modules so we can set it with one that has the pooling layer removed and
        was loaded ready for TorchScript export.
        """

        self._st_model._modules["0"].auto_model = self._hf_model


class _SentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "position_ids": position_ids,
        }

        # remove inputs for specific model types
        if isinstance(self._hf_model.config, transformers.DistilBertConfig):
            del inputs["token_type_ids"]

        return self._st_model(inputs)[self._output_key]


class _TwoParameterSentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)

    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        return self._st_model(inputs)[self._output_key]


class _DPREncoderWrapper(nn.Module):  # type: ignore
    """
    AutoModel loading does not work for DPRContextEncoders; this only exists as
    a workaround. This may never be fixed, so this is likely permanent.
    See: https://github.com/huggingface/transformers/issues/13670
    """

    _SUPPORTED_MODELS = {
        transformers.DPRContextEncoder,
        transformers.DPRQuestionEncoder,
    }
    _SUPPORTED_MODELS_NAMES = set([x.__name__ for x in _SUPPORTED_MODELS])

    def __init__(
        self,
        model: Union[transformers.DPRContextEncoder, transformers.DPRQuestionEncoder],
    ):
        super().__init__()
        self._model = model
        self.config = model.config

    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        config = AutoConfig.from_pretrained(model_id, token=token)

        def is_compatible() -> bool:
            is_dpr_model = config.model_type == "dpr"
            has_architectures = (
                config.architectures is not None and len(config.architectures) == 1
            )
            is_supported_architecture = has_architectures and (
                config.architectures[0] in _DPREncoderWrapper._SUPPORTED_MODELS_NAMES
            )
            return is_dpr_model and is_supported_architecture

        if is_compatible():
            model = getattr(transformers, config.architectures[0]).from_pretrained(
                model_id, torchscript=True
            )
            return _DPREncoderWrapper(model)
        else:
            return None

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        _position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        return self._model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )


class _TransformerTraceableModel(TraceableModel):
    """A base class representing a HuggingFace transformer model that can be traced."""

@@ -488,9 +214,15 @@ class _TransformerTraceableModel(TraceableModel):
                transformers.XLMRobertaTokenizer,
            ),
        ):
            del inputs["token_type_ids"]
            return (inputs["input_ids"], inputs["attention_mask"])

        if isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
            return (
                inputs["input_ids"],
                inputs["attention_mask"],
                inputs["token_type_ids"],
            )

        position_ids = torch.arange(inputs["input_ids"].size(1), dtype=torch.long)
        inputs["position_ids"] = position_ids
        return (

@@ -523,6 +255,15 @@ class _TraceableFillMaskModel(_TransformerTraceableModel):
        )


class _TraceableTextExpansionModel(_TransformerTraceableModel):
    def _prepare_inputs(self) -> transformers.BatchEncoding:
        return self._tokenizer(
            "This is an example sentence.",
            padding="max_length",
            return_tensors="pt",
        )


class _TraceableNerModel(_TraceableClassificationModel):
    def _prepare_inputs(self) -> transformers.BatchEncoding:
        return self._tokenizer(

@@ -557,7 +298,7 @@ class _TraceableTextEmbeddingModel(_TransformerTraceableModel):
    def _prepare_inputs(self) -> transformers.BatchEncoding:
        return self._tokenizer(
            "This is an example sentence.",
            padding="max_length",
            padding="longest",
            return_tensors="pt",
        )

@@ -681,7 +422,12 @@ class TransformerModel:
            " ".join(m) for m, _ in sorted(ranks.items(), key=lambda kv: kv[1])
        ]
        vocab_obj["merges"] = merges
        sp_model = getattr(self._tokenizer, "sp_model", None)

        if isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
            sp_model = self._tokenizer._tokenizer.spm
        else:
            sp_model = getattr(self._tokenizer, "sp_model", None)

        if sp_model:
            id_correction = getattr(self._tokenizer, "fairseq_offset", 0)
            scores = []

@@ -719,6 +465,11 @@ class TransformerModel:
            return NlpXLMRobertaTokenizationConfig(
                max_sequence_length=_max_sequence_length
            )
        elif isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
            return NlpDebertaV2TokenizationConfig(
                max_sequence_length=_max_sequence_length,
                do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
            )
        else:
            japanese_morphological_tokenizers = ["mecab"]
            if (

@@ -741,7 +492,7 @@ class TransformerModel:
        # a random or very large value.
        REASONABLE_MAX_LENGTH = 8192
        max_len = getattr(self._tokenizer, "model_max_length", None)
        if max_len is not None and max_len < REASONABLE_MAX_LENGTH:
        if max_len is not None and max_len <= REASONABLE_MAX_LENGTH:
            return int(max_len)

        max_sizes = getattr(self._tokenizer, "max_model_input_sizes", dict())

@@ -975,6 +726,13 @@ class TransformerModel:
        else:
            self._task_type = maybe_task_type

        if self._task_type == "text_expansion":
            model = transformers.AutoModelForMaskedLM.from_pretrained(
                self._model_id, token=self._access_token, torchscript=True
            )
            model = _DistilBertWrapper.try_wrapping(model)
            return _TraceableTextExpansionModel(self._tokenizer, model)

        if self._task_type == "fill_mask":
            model = transformers.AutoModelForMaskedLM.from_pretrained(
                self._model_id, token=self._access_token, torchscript=True

@@ -1034,7 +792,7 @@ class TransformerModel:

        else:
            raise TypeError(
                f"Unknown task type {self._task_type}, must be one of: {SUPPORTED_TASK_TYPES_NAMES}"
                f"Task {self._task_type} is not supported, must be one of: {SUPPORTED_TASK_TYPES_NAMES}"
            )

    def elasticsearch_model_id(self) -> str:

@@ -1065,9 +823,5 @@ def elasticsearch_model_id(model_id: str) -> str:
    """

    id = re.sub(r"[\s\\/]", "__", model_id).lower()[-64:]
    if id.startswith("__"):
        # This check is only needed as long as Eland supports Python 3.8
        # str.removeprefix was introduced in Python 3.9 and can be used
        # once 3.8 support is dropped
        id = id[2:]
    id = id.removeprefix("__")
    return id
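Editor's note: to illustrate the normalization above (example input invented), whitespace, slashes and backslashes become double underscores, the result is lowercased and truncated to its last 64 characters, and any leading `__` left over is stripped:

print(elasticsearch_model_id("sentence-transformers/all-MiniLM-L6-v2"))
# sentence-transformers__all-minilm-l6-v2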
eland/ml/pytorch/wrappers.py (new file, 317 lines)
@@ -0,0 +1,317 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""
This module contains the wrapper classes for the Hugging Face models.
Wrapping is necessary to ensure that the forward method of the model
is called with the same arguments the ml-cpp pytorch_inference process
uses.
"""

from typing import Any, Optional, Union

import torch  # type: ignore
import transformers  # type: ignore
from sentence_transformers import SentenceTransformer  # type: ignore
from torch import Tensor, nn
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForQuestionAnswering,
    PreTrainedModel,
    PreTrainedTokenizer,
)

DEFAULT_OUTPUT_KEY = "sentence_embedding"


class _QuestionAnsweringWrapperModule(nn.Module):  # type: ignore
    """
    A wrapper around a question answering model.
    Our inference engine only takes the first tuple if the inference response
    is a tuple.

    This wrapper transforms the output to be a stacked tensor if it's a tuple.

    Otherwise, it passes it through.
    """

    def __init__(self, model: PreTrainedModel):
        super().__init__()
        self._hf_model = model
        self.config = model.config

    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        model = AutoModelForQuestionAnswering.from_pretrained(
            model_id, token=token, torchscript=True
        )
        if isinstance(
            model.config,
            (
                transformers.MPNetConfig,
                transformers.XLMRobertaConfig,
                transformers.RobertaConfig,
                transformers.BartConfig,
            ),
        ):
            return _TwoParameterQuestionAnsweringWrapper(model)
        else:
            return _QuestionAnsweringWrapper(model)


class _QuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
    def __init__(self, model: PreTrainedModel):
        super().__init__(model=model)

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "position_ids": position_ids,
        }

        # remove inputs for specific model types
        if isinstance(self._hf_model.config, transformers.DistilBertConfig):
            del inputs["token_type_ids"]
            del inputs["position_ids"]
        response = self._hf_model(**inputs)
        if isinstance(response, tuple):
            return torch.stack(list(response), dim=0)
        return response


class _TwoParameterQuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
    def __init__(self, model: PreTrainedModel):
        super().__init__(model=model)

    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        response = self._hf_model(**inputs)
        if isinstance(response, tuple):
            return torch.stack(list(response), dim=0)
        return response


class _DistilBertWrapper(nn.Module):  # type: ignore
    """
    In Elasticsearch the BERT tokenizer is used for DistilBERT models, but
    the BERT tokenizer produces 4 inputs where DistilBERT models expect 2.

    Wrap the model's forward function in a method that accepts the 4
    arguments passed to a BERT model, then discard the token_type_ids
    and the position_ids to match the wrapped DistilBERT model's forward
    function.
    """

    def __init__(self, model: transformers.PreTrainedModel):
        super().__init__()
        self._model = model
        self.config = model.config

    @staticmethod
    def try_wrapping(model: PreTrainedModel) -> Optional[Any]:
        if isinstance(model.config, transformers.DistilBertConfig):
            return _DistilBertWrapper(model)
        else:
            return model

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        _token_type_ids: Tensor = None,
        _position_ids: Tensor = None,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        return self._model(input_ids=input_ids, attention_mask=attention_mask)


class _SentenceTransformerWrapperModule(nn.Module):  # type: ignore
    """
    A wrapper around sentence-transformer models to provide pooling,
    normalization and other graph layers that are not defined in the base
    HuggingFace transformer model.
    """

    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__()
        self._hf_model = model
        self._st_model = SentenceTransformer(model.config.name_or_path)
        self._output_key = output_key
        self.config = model.config

        self._remove_pooling_layer()
        self._replace_transformer_layer()

    @staticmethod
    def from_pretrained(
        model_id: str,
        tokenizer: PreTrainedTokenizer,
        *,
        token: Optional[str] = None,
        output_key: str = DEFAULT_OUTPUT_KEY,
    ) -> Optional[Any]:
        model = AutoModel.from_pretrained(model_id, token=token, torchscript=True)
        if isinstance(
            tokenizer,
            (
                transformers.BartTokenizer,
                transformers.MPNetTokenizer,
                transformers.RobertaTokenizer,
                transformers.XLMRobertaTokenizer,
                transformers.DebertaV2Tokenizer,
            ),
        ):
            return _TwoParameterSentenceTransformerWrapper(model, output_key)
        else:
            return _SentenceTransformerWrapper(model, output_key)

    def _remove_pooling_layer(self) -> None:
        """
        Removes any last pooling layer which is not used to create embeddings.
        Leaving this layer in will cause it to return a NoneType which in turn
        will fail to load in libtorch. Alternatively, we can just use the output
        of the pooling layer as a dummy, but this also affects (if only in a
        minor way) the performance of inference, so we're better off removing
        the layer if we can.
        """

        if hasattr(self._hf_model, "pooler"):
            self._hf_model.pooler = None

    def _replace_transformer_layer(self) -> None:
        """
        Replaces the HuggingFace Transformer layer in the SentenceTransformer
        modules so we can set it with one that has the pooling layer removed and
        was loaded ready for TorchScript export.
        """

        self._st_model._modules["0"].auto_model = self._hf_model


class _SentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "position_ids": position_ids,
        }

        # remove inputs for specific model types
        if isinstance(self._hf_model.config, transformers.DistilBertConfig):
            del inputs["token_type_ids"]

        return self._st_model(inputs)[self._output_key]


class _TwoParameterSentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)

    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        return self._st_model(inputs)[self._output_key]


class _DPREncoderWrapper(nn.Module):  # type: ignore
    """
    AutoModel loading does not work for DPRContextEncoders; this only exists as
    a workaround. This may never be fixed, so this is likely permanent.
    See: https://github.com/huggingface/transformers/issues/13670
    """

    _SUPPORTED_MODELS = {
        transformers.DPRContextEncoder,
        transformers.DPRQuestionEncoder,
    }
    _SUPPORTED_MODELS_NAMES = set([x.__name__ for x in _SUPPORTED_MODELS])

    def __init__(
        self,
        model: Union[transformers.DPRContextEncoder, transformers.DPRQuestionEncoder],
    ):
        super().__init__()
        self._model = model
        self.config = model.config

    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        config = AutoConfig.from_pretrained(model_id, token=token)

        def is_compatible() -> bool:
            is_dpr_model = config.model_type == "dpr"
            has_architectures = (
                config.architectures is not None and len(config.architectures) == 1
            )
            is_supported_architecture = has_architectures and (
                config.architectures[0] in _DPREncoderWrapper._SUPPORTED_MODELS_NAMES
            )
            return is_dpr_model and is_supported_architecture

        if is_compatible():
            model = getattr(transformers, config.architectures[0]).from_pretrained(
                model_id, torchscript=True
            )
            return _DPREncoderWrapper(model)
        else:
            return None

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        _position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        return self._model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
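Editor's note: these wrappers exist so TorchScript tracing sees a fixed positional signature. A minimal sketch of how such a wrapped model might be traced (the model ID and inputs are placeholders, not eland's actual export path):

import torch
import transformers
from eland.ml.pytorch.wrappers import _DistilBertWrapper

model = transformers.AutoModel.from_pretrained(
    "distilbert-base-uncased", torchscript=True
)
wrapped = _DistilBertWrapper.try_wrapping(model)

tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased")
inputs = tokenizer("example", return_tensors="pt")
# The wrapper tolerates BERT's 4-argument calling convention but only
# forwards the two inputs DistilBERT actually accepts.
traced = torch.jit.trace(wrapped, (inputs["input_ids"], inputs["attention_mask"]))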
eland/series.py
@@ -40,11 +40,12 @@ from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union

import numpy as np
import pandas as pd  # type: ignore
from pandas.core.indexes.frozen import FrozenList
from pandas.io.common import _expand_user, stringify_path  # type: ignore

import eland.plotting
from eland.arithmetics import ArithmeticNumber, ArithmeticSeries, ArithmeticString
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
from eland.filter import (
    BooleanFilter,
    Equal,

@@ -292,18 +293,26 @@ class Series(NDFrame):
        Examples
        --------
        >>> df = ed.DataFrame('http://localhost:9200', 'flights')
        >>> df['Carrier'].value_counts()
        >>> df['Carrier'].value_counts()  # doctest: +SKIP
        Carrier
        Logstash Airways    3331
        JetBeats            3274
        Kibana Airlines     3234
        ES-Air              3220
        Name: Carrier, dtype: int64
        Name: count, dtype: int64
        """
        if not isinstance(es_size, int):
            raise TypeError("es_size must be a positive integer.")
        elif es_size <= 0:
            raise ValueError("es_size must be a positive integer.")
        return self._query_compiler.value_counts(es_size)
        value_counts = self._query_compiler.value_counts(es_size)
        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#value-counts-sets-the-resulting-name-to-count
        if PANDAS_VERSION[0] == 2:
            value_counts.name = "count"
            value_counts.index.names = FrozenList([self.es_field_name])
            value_counts.index.name = self.es_field_name

        return value_counts

    # dtype not implemented for Series, as it causes the query to fail
    # in pandas.core.computation.ops.Term.type
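Editor's note: the branch above papers over a pandas 2.0 behavior change, in which value_counts names the resulting Series "count" instead of reusing the column name. A quick local illustration (plain pandas, no Elasticsearch needed):

import pandas as pd

s = pd.Series(["a", "a", "b"], name="letter")
counts = s.value_counts()
# Under pandas 2.x: counts.name == "count" and counts.index.name == "letter"
print(counts.name, counts.index.name)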
noxfile.py
@@ -16,7 +16,6 @@
# under the License.
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import nox
|
||||
@ -58,10 +57,10 @@ TYPED_FILES = (
|
||||
|
||||
@nox.session(reuse_venv=True, python="3.11")
|
||||
def format(session):
|
||||
session.install("black", "isort", "flynt")
|
||||
session.install("black ~= 25.0", "isort", "flynt")
|
||||
session.run("python", "utils/license-headers.py", "fix", *SOURCE_FILES)
|
||||
session.run("flynt", *SOURCE_FILES)
|
||||
session.run("black", "--target-version=py38", *SOURCE_FILES)
|
||||
session.run("black", "--target-version=py39", *SOURCE_FILES)
|
||||
session.run("isort", "--profile=black", *SOURCE_FILES)
|
||||
lint(session)
|
||||
|
||||
@ -70,38 +69,34 @@ def format(session):
|
||||
def lint(session):
|
||||
# Install numpy to use its mypy plugin
|
||||
# https://numpy.org/devdocs/reference/typing.html#mypy-plugin
|
||||
session.install("black", "flake8", "mypy", "isort", "numpy")
|
||||
session.install("black ~= 25.0", "flake8", "mypy", "isort", "numpy")
|
||||
session.install(".")
|
||||
session.run("python", "utils/license-headers.py", "check", *SOURCE_FILES)
|
||||
session.run("black", "--check", "--target-version=py38", *SOURCE_FILES)
|
||||
session.run("black", "--check", "--target-version=py39", *SOURCE_FILES)
|
||||
session.run("isort", "--check", "--profile=black", *SOURCE_FILES)
|
||||
session.run("flake8", "--extend-ignore=E203,E402,E501,E704,E712", *SOURCE_FILES)
|
||||
|
||||
# TODO: When all files are typed we can change this to .run("mypy", "--strict", "eland/")
|
||||
session.log("mypy --show-error-codes --strict eland/")
|
||||
for typed_file in TYPED_FILES:
|
||||
if not os.path.isfile(typed_file):
|
||||
session.error(f"The file {typed_file!r} couldn't be found")
|
||||
process = subprocess.run(
|
||||
["mypy", "--show-error-codes", "--strict", typed_file],
|
||||
env=session.env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
)
|
||||
# Ensure that mypy itself ran successfully
|
||||
assert process.returncode in (0, 1)
|
||||
stdout = session.run(
|
||||
"mypy",
|
||||
"--show-error-codes",
|
||||
"--strict",
|
||||
*TYPED_FILES,
|
||||
success_codes=(0, 1),
|
||||
silent=True,
|
||||
)
|
||||
|
||||
errors = []
|
||||
for line in process.stdout.decode().split("\n"):
|
||||
filepath = line.partition(":")[0]
|
||||
if filepath in TYPED_FILES:
|
||||
errors.append(line)
|
||||
if errors:
|
||||
session.error("\n" + "\n".join(sorted(set(errors))))
|
||||
errors = []
|
||||
for line in stdout.splitlines():
|
||||
filepath = line.partition(":")[0]
|
||||
if filepath in TYPED_FILES:
|
||||
errors.append(line)
|
||||
if errors:
|
||||
session.error("\n" + "\n".join(sorted(set(errors))))
|
||||
|
||||
|
||||
@nox.session(python=["3.8", "3.9", "3.10", "3.11"])
|
||||
@nox.parametrize("pandas_version", ["1.5.0"])
|
||||
@nox.session(python=["3.9", "3.10", "3.11", "3.12"])
|
||||
@nox.parametrize("pandas_version", ["1.5.0", "2.2.3"])
|
||||
def test(session, pandas_version: str):
|
||||
session.install("-r", "requirements-dev.txt")
|
||||
session.install(".")
|
||||
@ -121,9 +116,6 @@ def test(session, pandas_version: str):
|
||||
"--nbval",
|
||||
)
|
||||
|
||||
# PyTorch 2.3.1 doesn't support Python 3.12
|
||||
if session.python == "3.12":
|
||||
pytest_args += ("--ignore=eland/ml/pytorch",)
|
||||
session.run(
|
||||
*pytest_args,
|
||||
*(session.posargs or ("eland/", "tests/")),
|
||||
@ -140,7 +132,6 @@ def test(session, pandas_version: str):
|
||||
"scikit-learn",
|
||||
"xgboost",
|
||||
"lightgbm",
|
||||
"shap",
|
||||
)
|
||||
session.run("pytest", "tests/ml/")
|
||||
|
||||
|
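The lint rewrite above leans on a nox behavior worth spelling out: with silent=True, session.run returns the command's captured output as a string instead of echoing it, and success_codes widens the accepted exit codes (mypy exits 1 whenever it reports type errors). A minimal standalone session built on the same pattern; the target path here is only a placeholder:

import nox

@nox.session(reuse_venv=True)
def typecheck(session):
    session.install("mypy")
    # silent=True makes session.run return the captured stdout;
    # success_codes=(0, 1) stops nox from aborting when mypy finds errors
    output = session.run(
        "mypy", "--strict", "eland/", success_codes=(0, 1), silent=True
    )
    # fail the session only if mypy actually reported something
    if "error:" in output:
        session.error(output)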
requirements-dev.txt
@@ -10,7 +10,6 @@ pytest>=5.2.1
 pytest-mock
 pytest-cov
 nbval
-shap==0.43.0

 #
 # Docs
setup.py
@@ -38,10 +38,10 @@ CLASSIFIERS = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
     "Topic :: Scientific/Engineering",
 ]

@@ -56,16 +56,16 @@ with open(path.join(here, "README.md"), "r", "utf-8") as f:

 extras = {
     "xgboost": ["xgboost>=0.90,<2"],
-    "scikit-learn": ["scikit-learn>=1.3,<1.4"],
-    "lightgbm": ["lightgbm>=2,<4"],
+    "scikit-learn": ["scikit-learn>=1.3,<1.6"],
+    "lightgbm": ["lightgbm>=3,<5"],
     "pytorch": [
         "requests<3",
-        "torch==2.3.1",
+        "torch==2.5.1",
         "tqdm",
         "sentence-transformers>=2.1.0,<=2.7.0",
         # sentencepiece is a required dependency for the slow tokenizers
         # https://huggingface.co/transformers/v4.4.2/migration.html#sentencepiece-is-removed-from-the-required-dependencies
-        "transformers[sentencepiece]>=4.31.0,<4.44.0",
+        "transformers[sentencepiece]>=4.47.0",
     ],
 }
 extras["all"] = list({dep for deps in extras.values() for dep in deps})

@@ -86,8 +86,8 @@ setup(
     keywords="elastic eland pandas python",
     packages=find_packages(include=["eland", "eland.*"]),
     install_requires=[
-        "elasticsearch>=8.3,<9",
-        "pandas>=1.5,<2",
+        "elasticsearch>=9,<10",
+        "pandas>=1.5,<3",
         "matplotlib>=3.6",
         "numpy>=1.2.0,<2",
         "packaging",

@@ -95,7 +95,7 @@ setup(
     entry_points={
         "console_scripts": "eland_import_hub_model=eland.cli.eland_import_hub_model:main"
     },
-    python_requires=">=3.8,<3.12",
+    python_requires=">=3.9,<3.13",
     package_data={"eland": ["py.typed"]},
     include_package_data=True,
     zip_safe=False,
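With the widened pins above, one environment can now combine eland with either pandas 1.x or 2.x and an elasticsearch-py 9.x client. A quick, stdlib-only sanity check along these lines (nothing eland-specific is assumed) confirms what actually got installed:

import importlib.metadata as md

# Under the new pins, expect elasticsearch 9.x and pandas anywhere in [1.5, 3)
print("elasticsearch-py:", md.version("elasticsearch"))
print("pandas:", md.version("pandas"))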
@@ -24,6 +24,7 @@ import pandas as pd
 from pandas.testing import assert_frame_equal, assert_series_equal

 import eland as ed
+from eland.common import PANDAS_VERSION

 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

@@ -45,7 +46,10 @@ with gzip.open(FLIGHTS_FILE_NAME) as f:
 _pd_flights = pd.DataFrame.from_records(flight_records).reindex(
     _ed_flights.columns, axis=1
 )
-_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
+if PANDAS_VERSION[0] >= 2:
+    _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"], format="mixed")
+else:
+    _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
 # Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list
 _pd_flights["Cities"] = _pd_flights.apply(
     lambda x: list(sorted([x["OriginCityName"], x["DestCityName"]])), axis=1

@@ -62,7 +66,7 @@ _pd_ecommerce["products.created_on"] = _pd_ecommerce["products.created_on"].appl
 )
 _pd_ecommerce.insert(2, "customer_birth_date", None)
 _pd_ecommerce.index = _pd_ecommerce.index.map(str)  # make index 'object' not int
-_pd_ecommerce["customer_birth_date"].astype("datetime64")
+_pd_ecommerce["customer_birth_date"].astype("datetime64[ns]")
 _ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)
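The format="mixed" branch exists because pandas 2 dropped the silent per-element parsing that pandas 1 applied: pd.to_datetime now infers one format from the first value and raises when later strings deviate from it. A small illustration, with timestamps invented for the example:

import pandas as pd

mixed = ["2018-01-01T00:00:00", "2018-01-02 12:30:00.500"]
# On pandas >= 2.0 this parses each element independently; the same call
# without format="mixed" can raise because the two strings do not share
# a single inferred format.
print(pd.to_datetime(mixed, format="mixed"))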
@@ -77,7 +77,16 @@ class SymmetricAPIChecker:
             pd_exc = e

         self.check_exception(ed_exc, pd_exc)
-        self.check_values(ed_obj, pd_obj)
+        try:
+            self.check_values(ed_obj, pd_obj)
+        except AssertionError as e:
+            # This is an attribute we allow to differ when comparing zero-length objects
+            if (
+                'Attribute "inferred_type" are different' in repr(e)
+                and len(ed_obj) == 0
+                and len(pd_obj) == 0
+            ):
+                self.check_values(ed_obj, pd_obj, check_index_type=False)

         if isinstance(ed_obj, (ed.DataFrame, ed.Series)):
             return SymmetricAPIChecker(ed_obj, pd_obj)

@@ -85,16 +94,16 @@

         return f

-    def check_values(self, ed_obj, pd_obj):
+    def check_values(self, ed_obj, pd_obj, **kwargs):
         """Checks that any two values coming from eland and pandas are equal"""
         if isinstance(ed_obj, ed.DataFrame):
-            assert_pandas_eland_frame_equal(pd_obj, ed_obj)
+            assert_pandas_eland_frame_equal(pd_obj, ed_obj, **kwargs)
         elif isinstance(ed_obj, ed.Series):
-            assert_pandas_eland_series_equal(pd_obj, ed_obj)
+            assert_pandas_eland_series_equal(pd_obj, ed_obj, **kwargs)
         elif isinstance(ed_obj, pd.DataFrame):
-            assert_frame_equal(ed_obj, pd_obj)
+            assert_frame_equal(ed_obj, pd_obj, **kwargs)
         elif isinstance(ed_obj, pd.Series):
-            assert_series_equal(ed_obj, pd_obj)
+            assert_series_equal(ed_obj, pd_obj, **kwargs)
         elif isinstance(ed_obj, pd.Index):
             assert ed_obj.equals(pd_obj)
         else:
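The zero-length escape hatch above works because the only mismatch between an empty eland result and an empty pandas result is usually the index: one side carries a typed index derived from the Elasticsearch mapping, the other an empty default index. A pandas-only sketch of the kwarg being forwarded:

import pandas as pd
from pandas.testing import assert_series_equal

empty_default = pd.Series(dtype="float64")  # empty RangeIndex
empty_object = pd.Series(dtype="float64", index=pd.Index([], dtype=object))
# A strict comparison trips over the differing index types ("inferred_type");
# relaxing check_index_type lets two genuinely empty results compare equal.
assert_series_equal(empty_default, empty_object, check_index_type=False)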
@@ -87,6 +87,8 @@ class TestDataFrameDateTime(TestData):
             },
             index=["0", "1", "2"],
         )
+        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
+        df["D"] = df["D"].astype("datetime64[ns]")

         expected_mappings = {
             "mappings": {

@@ -33,9 +33,17 @@ class TestDataFrameDescribe(TestData):
             ["Cancelled", "FlightDelay"], axis="columns"
         )

+        # Pandas >= 2 calculates aggregations such as min and max for timestamps too
+        # This could be implemented in eland, but as of yet this is not the case
+        # We therefore remove it before the comparison
+        if "timestamp" in pd_describe.columns:
+            pd_describe = pd_describe.drop(["timestamp"], axis="columns")
+
+        # Pandas >= 2 orders the aggregations differently than Pandas < 2
+        # A sort_index is applied so tests will succeed in both environments
         assert_frame_equal(
-            pd_describe.drop(["25%", "50%", "75%"], axis="index"),
-            ed_describe.drop(["25%", "50%", "75%"], axis="index"),
+            pd_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
+            ed_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
             check_exact=False,
             rtol=True,
         )
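The timestamp drop above mirrors a pandas 2 behavior change: describe() now summarizes datetime columns alongside numeric ones (count, mean, min, quantiles, max, with std reported as NaN), while pandas 1 silently excluded them from the default summary. A toy frame makes the difference visible; gating on the major version, as the test does, keeps one code path working on both:

import pandas as pd
from eland.common import PANDAS_VERSION

df = pd.DataFrame(
    {
        "price": [1.0, 2.0, 3.0],
        "ts": pd.to_datetime(["2018-01-01", "2018-01-02", "2018-01-03"]),
    }
)
desc = df.describe()
if PANDAS_VERSION[0] >= 2 and "ts" in desc.columns:
    desc = desc.drop(columns=["ts"])  # compare only the numeric summary
print(desc)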
@@ -99,7 +99,7 @@ class TestDataFrameHeadTail(TestData):

         ed_head_0 = ed_flights.head(0)
         pd_head_0 = pd_flights.head(0)
-        assert_pandas_eland_frame_equal(pd_head_0, ed_head_0)
+        assert_pandas_eland_frame_equal(pd_head_0, ed_head_0, check_index_type=False)

     def test_doc_test_tail(self):
         df = self.ed_flights()
@@ -22,6 +22,7 @@ import pandas as pd
 import pytest
 from pandas.testing import assert_frame_equal, assert_series_equal

+from eland.common import PANDAS_VERSION
 from tests.common import TestData, assert_almost_equal

@@ -74,6 +75,8 @@ class TestDataFrameMetrics(TestData):
         logger.setLevel(logging.DEBUG)

         for func in self.extended_funcs:
+            if PANDAS_VERSION[0] >= 2 and func == "mad":
+                continue
             pd_metric = getattr(pd_flights, func)(
                 **({"numeric_only": True} if func != "mad" else {})
             )

@@ -92,6 +95,8 @@ class TestDataFrameMetrics(TestData):
         ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]

         for func in self.extended_funcs:
+            if PANDAS_VERSION[0] >= 2 and func == "mad":
+                continue
             pd_metric = getattr(pd_flights_1, func)()
             ed_metric = getattr(ed_flights_1, func)(numeric_only=False)

@@ -102,6 +107,8 @@ class TestDataFrameMetrics(TestData):
         ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]

         for func in self.extended_funcs:
+            if PANDAS_VERSION[0] >= 2 and func == "mad":
+                continue
             pd_metric = getattr(pd_flights_0, func)()
             ed_metric = getattr(ed_flights_0, func)(numeric_only=False)

@@ -491,8 +498,13 @@ class TestDataFrameMetrics(TestData):
             ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
         )

-        pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
-        ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
+        if PANDAS_VERSION[0] == 1:
+            pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
+            ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
+
+        else:  # numeric_only is no longer available for pandas > 2
+            pd_quantile = pd_flights.agg(["quantile", "min"])
+            ed_quantile = ed_flights.agg(["quantile", "min"])

         assert_frame_equal(
             pd_quantile, ed_quantile, check_exact=False, rtol=4, check_dtype=False
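PANDAS_VERSION, used throughout these tests, is eland's parsed pandas version exposed by eland.common. If you need the same gate outside eland, a minimal equivalent is just a parsed version tuple; the helper below is illustrative, not eland API:

from packaging.version import Version
import pandas as pd

# illustrative stand-in for eland.common.PANDAS_VERSION
_v = Version(pd.__version__)
PANDAS_VERSION = (_v.major, _v.minor)

if PANDAS_VERSION[0] >= 2:
    # Series.mad was removed in pandas 2.0, hence the skips above
    pass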
@@ -69,6 +69,12 @@ class TestDataFrameUtils(TestData):
         )
         ed_df_head = ed_df.head()

+        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
+        df["D"] = df["D"].astype("datetime64[ns]")
+        df["H"] = (
+            df["H"].dt.tz_localize(None).astype("datetime64[ns]").dt.tz_localize("UTC")
+        )
+
         assert_pandas_eland_frame_equal(df, ed_df_head)

         ES_TEST_CLIENT.indices.delete(index=index_name)
@@ -39,6 +39,7 @@ try:
     from eland.ml.pytorch import (
         FillMaskInferenceOptions,
         NlpBertTokenizationConfig,
+        NlpDebertaV2TokenizationConfig,
         NlpMPNetTokenizationConfig,
         NlpRobertaTokenizationConfig,
         NlpXLMRobertaTokenizationConfig,

@@ -57,10 +58,6 @@ except ImportError:
 from tests import ES_VERSION

 pytestmark = [
-    pytest.mark.skipif(
-        ES_VERSION < (8, 15, 1),
-        reason="Eland uses Pytorch 2.3.1, versions of Elasticsearch prior to 8.15.1 are incompatible with PyTorch 2.3.1",
-    ),
     pytest.mark.skipif(
         not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
     ),

@@ -149,6 +146,14 @@ if HAS_PYTORCH and HAS_SKLEARN and HAS_TRANSFORMERS:
             1024,
             None,
         ),
+        (
+            "microsoft/deberta-v3-xsmall",
+            "fill_mask",
+            FillMaskInferenceOptions,
+            NlpDebertaV2TokenizationConfig,
+            512,
+            None,
+        ),
     ]
 else:
     MODEL_CONFIGURATIONS = []
@@ -38,10 +38,6 @@ except ImportError:
 from tests import ES_TEST_CLIENT, ES_VERSION

 pytestmark = [
-    pytest.mark.skipif(
-        ES_VERSION < (8, 15, 2),
-        reason="Eland uses Pytorch 2.3.1, versions of Elasticsearch prior to 8.15.2 are incompatible with PyTorch 2.3.1",
-    ),
     pytest.mark.skipif(
         not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
     ),

@@ -67,6 +63,8 @@ TEXT_EMBEDDING_MODELS = [
     )
 ]

+TEXT_SIMILARITY_MODELS = ["mixedbread-ai/mxbai-rerank-xsmall-v1"]
+

 @pytest.fixture(scope="function", autouse=True)
 def setup_and_tear_down():

@@ -135,3 +133,25 @@ class TestPytorchModel:
             )
             > 0
         )
+
+    @pytest.mark.skipif(
+        ES_VERSION < (8, 16, 0), reason="requires 8.16.0 for DeBERTa models"
+    )
+    @pytest.mark.parametrize("model_id", TEXT_SIMILARITY_MODELS)
+    def test_text_similarity(self, model_id):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            ptm = download_model_and_start_deployment(
+                tmp_dir, False, model_id, "text_similarity"
+            )
+            result = ptm.infer(
+                docs=[
+                    {
+                        "text_field": "The Amazon rainforest covers most of the Amazon basin in South America"
+                    },
+                    {"text_field": "Paris is the capital of France"},
+                ],
+                inference_config={"text_similarity": {"text": "France"}},
+            )
+
+        assert result.body["inference_results"][0]["predicted_value"] < 0
+        assert result.body["inference_results"][1]["predicted_value"] > 0
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from operator import itemgetter
|
||||
from typing import Tuple
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import eland as ed
|
||||
from eland.ml import MLModel
|
||||
from eland.ml.ltr import FeatureLogger, LTRModelConfig, QueryFeatureExtractor
|
||||
from eland.ml.transformers import get_model_transformer
|
||||
from tests import (
|
||||
ES_IS_SERVERLESS,
|
||||
ES_TEST_CLIENT,
|
||||
ES_VERSION,
|
||||
FLIGHTS_SMALL_INDEX_NAME,
|
||||
NATIONAL_PARKS_INDEX_NAME,
|
||||
)
|
||||
|
||||
@ -55,26 +53,16 @@ try:
|
||||
except ImportError:
|
||||
HAS_LIGHTGBM = False
|
||||
|
||||
try:
|
||||
import shap
|
||||
|
||||
HAS_SHAP = True
|
||||
except ImportError:
|
||||
HAS_SHAP = False
|
||||
|
||||
|
||||
requires_sklearn = pytest.mark.skipif(
|
||||
not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run."
|
||||
not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
|
||||
)
|
||||
requires_xgboost = pytest.mark.skipif(
|
||||
not HAS_XGBOOST, reason="This test requires 'xgboost' package to run."
|
||||
)
|
||||
requires_shap = pytest.mark.skipif(
|
||||
not HAS_SHAP, reason="This tests requries 'shap' package to run."
|
||||
not HAS_XGBOOST, reason="This test requires 'xgboost' package to run"
|
||||
)
|
||||
requires_no_ml_extras = pytest.mark.skipif(
|
||||
HAS_SKLEARN or HAS_XGBOOST,
|
||||
reason="This test requires 'scikit-learn' and 'xgboost' to not be installed.",
|
||||
reason="This test requires 'scikit-learn' and 'xgboost' to not be installed",
|
||||
)
|
||||
|
||||
requires_lightgbm = pytest.mark.skipif(
|
||||
@ -108,102 +96,6 @@ def check_prediction_equality(es_model: MLModel, py_model, test_data):
|
||||
np.testing.assert_almost_equal(test_results, es_results, decimal=2)
|
||||
|
||||
|
||||
def yield_model_id(analysis, analyzed_fields):
|
||||
import random
|
||||
import string
|
||||
import time
|
||||
|
||||
suffix = "".join(random.choices(string.ascii_lowercase, k=4))
|
||||
job_id = "test-flights-regression-" + suffix
|
||||
dest = job_id + "-dest"
|
||||
|
||||
response = ES_TEST_CLIENT.ml.put_data_frame_analytics(
|
||||
id=job_id,
|
||||
analysis=analysis,
|
||||
dest={"index": dest},
|
||||
source={"index": [FLIGHTS_SMALL_INDEX_NAME]},
|
||||
analyzed_fields=analyzed_fields,
|
||||
)
|
||||
assert response.meta.status == 200
|
||||
response = ES_TEST_CLIENT.ml.start_data_frame_analytics(id=job_id)
|
||||
assert response.meta.status == 200
|
||||
|
||||
time.sleep(2)
|
||||
response = ES_TEST_CLIENT.ml.get_trained_models(model_id=job_id + "*")
|
||||
assert response.meta.status == 200
|
||||
assert response.body["count"] == 1
|
||||
model_id = response.body["trained_model_configs"][0]["model_id"]
|
||||
|
||||
yield model_id
|
||||
|
||||
ES_TEST_CLIENT.ml.delete_data_frame_analytics(id=job_id)
|
||||
ES_TEST_CLIENT.indices.delete(index=dest)
|
||||
ES_TEST_CLIENT.ml.delete_trained_model(model_id=model_id)
|
||||
|
||||
|
||||
@pytest.fixture(params=[[0, 4], [0, 1], range(5)])
|
||||
def regression_model_id(request):
|
||||
analysis = {
|
||||
"regression": {
|
||||
"dependent_variable": "FlightDelayMin",
|
||||
"max_trees": 3,
|
||||
"num_top_feature_importance_values": 0,
|
||||
"max_optimization_rounds_per_hyperparameter": 1,
|
||||
"prediction_field_name": "FlightDelayMin_prediction",
|
||||
"training_percent": 30,
|
||||
"randomize_seed": 1000,
|
||||
"loss_function": "mse",
|
||||
"early_stopping_enabled": True,
|
||||
}
|
||||
}
|
||||
all_includes = [
|
||||
"FlightDelayMin",
|
||||
"FlightDelayType",
|
||||
"FlightTimeMin",
|
||||
"DistanceMiles",
|
||||
"OriginAirportID",
|
||||
]
|
||||
includes = [all_includes[i] for i in request.param]
|
||||
analyzed_fields = {
|
||||
"includes": includes,
|
||||
"excludes": [],
|
||||
}
|
||||
yield from yield_model_id(analysis=analysis, analyzed_fields=analyzed_fields)
|
||||
|
||||
|
||||
@pytest.fixture(params=[[0, 6], [5, 6], range(7)])
|
||||
def classification_model_id(request):
|
||||
analysis = {
|
||||
"classification": {
|
||||
"dependent_variable": "Cancelled",
|
||||
"max_trees": 5,
|
||||
"num_top_feature_importance_values": 0,
|
||||
"max_optimization_rounds_per_hyperparameter": 1,
|
||||
"prediction_field_name": "Cancelled_prediction",
|
||||
"training_percent": 50,
|
||||
"randomize_seed": 1000,
|
||||
"num_top_classes": -1,
|
||||
"class_assignment_objective": "maximize_accuracy",
|
||||
"early_stopping_enabled": True,
|
||||
}
|
||||
}
|
||||
all_includes = [
|
||||
"OriginWeather",
|
||||
"OriginAirportID",
|
||||
"DestCityName",
|
||||
"DestWeather",
|
||||
"DestRegion",
|
||||
"AvgTicketPrice",
|
||||
"Cancelled",
|
||||
]
|
||||
includes = [all_includes[i] for i in request.param]
|
||||
analyzed_fields = {
|
||||
"includes": includes,
|
||||
"excludes": [],
|
||||
}
|
||||
yield from yield_model_id(analysis=analysis, analyzed_fields=analyzed_fields)
|
||||
|
||||
|
||||
def randomize_model_id(prefix, suffix_size=10):
|
||||
import random
|
||||
import string
|
||||
@ -328,6 +220,46 @@ class TestMLModel:
|
||||
# Clean up
|
||||
es_model.delete_model()
|
||||
|
||||
def _normalize_ltr_score_from_XGBRanker(self, ranker, ltr_model_config, scores):
|
||||
"""Normalize the scores of an XGBRanker model as ES implementation of LTR would do.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ranker : XGBRanker
|
||||
The XGBRanker model to retrieve the minimum score from.
|
||||
|
||||
ltr_model_config : LTRModelConfig
|
||||
LTR model config.
|
||||
|
||||
Returns
|
||||
-------
|
||||
scores : List[float]
|
||||
Normalized scores for the model.
|
||||
"""
|
||||
|
||||
should_rescore = (
|
||||
(ES_VERSION[0] == 8 and ES_VERSION >= (8, 19))
|
||||
or (
|
||||
ES_VERSION[0] == 9
|
||||
and (ES_VERSION[1] >= 1 or (ES_VERSION[1] == 0 and ES_VERSION[2] >= 1))
|
||||
)
|
||||
or ES_IS_SERVERLESS
|
||||
)
|
||||
|
||||
if should_rescore:
|
||||
# In 8.19+, 9.0.1 and 9.1, the scores are normalized if there are negative scores
|
||||
min_model_score, _ = (
|
||||
get_model_transformer(
|
||||
ranker, feature_names=ltr_model_config.feature_names
|
||||
)
|
||||
.transform()
|
||||
.bounds()
|
||||
)
|
||||
if min_model_score < 0:
|
||||
scores = [score - min_model_score for score in scores]
|
||||
|
||||
return scores
|
||||
|
||||
@requires_elasticsearch_version((8, 12))
|
||||
@requires_xgboost
|
||||
@pytest.mark.parametrize("compress_model_definition", [True, False])
|
||||
@ -439,6 +371,11 @@ class TestMLModel:
|
||||
],
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
expected_scores = self._normalize_ltr_score_from_XGBRanker(
|
||||
ranker, ltr_model_config, expected_scores
|
||||
)
|
||||
|
||||
np.testing.assert_almost_equal(expected_scores, doc_scores, decimal=2)
|
||||
|
||||
# Verify prediction is not supported for LTR
|
||||
@ -790,172 +727,3 @@ class TestMLModel:
|
||||
|
||||
# Clean up
|
||||
es_model.delete_model()
|
||||
|
||||
@requires_sklearn
|
||||
@requires_shap
|
||||
def test_export_regressor(self, regression_model_id):
|
||||
ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME).head(10)
|
||||
types = dict(ed_flights.dtypes)
|
||||
X = ed_flights.to_pandas().astype(types)
|
||||
|
||||
model = MLModel(es_client=ES_TEST_CLIENT, model_id=regression_model_id)
|
||||
pipeline = model.export_model()
|
||||
pipeline.fit(X)
|
||||
|
||||
predictions_sklearn = pipeline.predict(
|
||||
X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
|
||||
)
|
||||
response = ES_TEST_CLIENT.ml.infer_trained_model(
|
||||
model_id=regression_model_id,
|
||||
docs=X[pipeline["es_model"].input_field_names].to_dict("records"),
|
||||
)
|
||||
predictions_es = np.array(
|
||||
list(
|
||||
map(
|
||||
itemgetter("FlightDelayMin_prediction"),
|
||||
response.body["inference_results"],
|
||||
)
|
||||
)
|
||||
)
|
||||
np.testing.assert_array_almost_equal(predictions_sklearn, predictions_es)
|
||||
|
||||
import pandas as pd
|
||||
|
||||
X_transformed = pipeline["preprocessor"].transform(X=X)
|
||||
X_transformed = pd.DataFrame(
|
||||
X_transformed, columns=pipeline["preprocessor"].get_feature_names_out()
|
||||
)
|
||||
explainer = shap.TreeExplainer(pipeline["es_model"])
|
||||
shap_values = explainer.shap_values(
|
||||
X_transformed[pipeline["es_model"].feature_names_in_]
|
||||
)
|
||||
np.testing.assert_array_almost_equal(
|
||||
predictions_sklearn, shap_values.sum(axis=1) + explainer.expected_value
|
||||
)
|
||||
|
||||
@requires_sklearn
|
||||
def test_export_classification(self, classification_model_id):
|
||||
ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME).head(10)
|
||||
X = ed.eland_to_pandas(ed_flights)
|
||||
|
||||
model = MLModel(es_client=ES_TEST_CLIENT, model_id=classification_model_id)
|
||||
pipeline = model.export_model()
|
||||
pipeline.fit(X)
|
||||
|
||||
predictions_sklearn = pipeline.predict(
|
||||
X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
|
||||
)
|
||||
prediction_proba_sklearn = pipeline.predict_proba(
|
||||
X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
|
||||
).max(axis=1)
|
||||
|
||||
response = ES_TEST_CLIENT.ml.infer_trained_model(
|
||||
model_id=classification_model_id,
|
||||
docs=X[pipeline["es_model"].input_field_names].to_dict("records"),
|
||||
)
|
||||
predictions_es = np.array(
|
||||
list(
|
||||
map(
|
||||
lambda x: str(int(x["Cancelled_prediction"])),
|
||||
response.body["inference_results"],
|
||||
)
|
||||
)
|
||||
)
|
||||
prediction_proba_es = np.array(
|
||||
list(
|
||||
map(
|
||||
itemgetter("prediction_probability"),
|
||||
response.body["inference_results"],
|
||||
)
|
||||
)
|
||||
)
|
||||
np.testing.assert_array_almost_equal(
|
||||
prediction_proba_sklearn, prediction_proba_es
|
||||
)
|
||||
np.testing.assert_array_equal(predictions_sklearn, predictions_es)
|
||||
|
||||
import pandas as pd
|
||||
|
||||
X_transformed = pipeline["preprocessor"].transform(X=X)
|
||||
X_transformed = pd.DataFrame(
|
||||
X_transformed, columns=pipeline["preprocessor"].get_feature_names_out()
|
||||
)
|
||||
explainer = shap.TreeExplainer(pipeline["es_model"])
|
||||
shap_values = explainer.shap_values(
|
||||
X_transformed[pipeline["es_model"].feature_names_in_]
|
||||
)
|
||||
log_odds = shap_values.sum(axis=1) + explainer.expected_value
|
||||
prediction_proba_shap = 1 / (1 + np.exp(-log_odds))
|
||||
# use probability of the predicted class
|
||||
prediction_proba_shap[prediction_proba_shap < 0.5] = (
|
||||
1 - prediction_proba_shap[prediction_proba_shap < 0.5]
|
||||
)
|
||||
np.testing.assert_array_almost_equal(
|
||||
prediction_proba_sklearn, prediction_proba_shap
|
||||
)
|
||||
|
||||
@requires_xgboost
|
||||
@requires_sklearn
|
||||
@pytest.mark.parametrize("objective", ["binary:logistic", "reg:squarederror"])
|
||||
def test_xgb_import_export(self, objective):
|
||||
booster = "gbtree"
|
||||
|
||||
if objective.startswith("binary:"):
|
||||
training_data = datasets.make_classification(n_features=5)
|
||||
xgb_model = XGBClassifier(
|
||||
booster=booster, objective=objective, use_label_encoder=False
|
||||
)
|
||||
else:
|
||||
training_data = datasets.make_regression(n_features=5)
|
||||
xgb_model = XGBRegressor(
|
||||
booster=booster, objective=objective, use_label_encoder=False
|
||||
)
|
||||
|
||||
# Train model
|
||||
xgb_model.fit(training_data[0], training_data[1])
|
||||
|
||||
# Serialise the models to Elasticsearch
|
||||
feature_names = ["feature0", "feature1", "feature2", "feature3", "feature4"]
|
||||
model_id = "test_xgb_model"
|
||||
|
||||
es_model = MLModel.import_model(
|
||||
ES_TEST_CLIENT, model_id, xgb_model, feature_names, es_if_exists="replace"
|
||||
)
|
||||
|
||||
# Export suppose to fail
|
||||
with pytest.raises(ValueError) as ex:
|
||||
es_model.export_model()
|
||||
assert ex.match("Error initializing sklearn classifier.")
|
||||
|
||||
# Clean up
|
||||
es_model.delete_model()
|
||||
|
||||
@requires_lightgbm
|
||||
@pytest.mark.parametrize("objective", ["regression", "binary"])
|
||||
def test_lgbm_import_export(self, objective):
|
||||
booster = "gbdt"
|
||||
if objective == "binary":
|
||||
training_data = datasets.make_classification(n_features=5)
|
||||
lgbm_model = LGBMClassifier(boosting_type=booster, objective=objective)
|
||||
else:
|
||||
training_data = datasets.make_regression(n_features=5)
|
||||
lgbm_model = LGBMRegressor(boosting_type=booster, objective=objective)
|
||||
|
||||
# Train model
|
||||
lgbm_model.fit(training_data[0], training_data[1])
|
||||
|
||||
# Serialise the models to Elasticsearch
|
||||
feature_names = ["feature0", "feature1", "feature2", "feature3", "feature4"]
|
||||
model_id = "test_lgbm_model"
|
||||
|
||||
es_model = MLModel.import_model(
|
||||
ES_TEST_CLIENT, model_id, lgbm_model, feature_names, es_if_exists="replace"
|
||||
)
|
||||
|
||||
# Export suppose to fail
|
||||
with pytest.raises(ValueError) as ex:
|
||||
es_model.export_model()
|
||||
assert ex.match("Error initializing sklearn classifier.")
|
||||
|
||||
# Clean up
|
||||
es_model.delete_model()
|
||||
|
@ -1647,6 +1647,14 @@
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/code/eland/.nox/test-3-12-pandas_version-2-2-3/lib/python3.12/site-packages/eland/series.py:464: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
||||
" return self._query_compiler.dtypes[0]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
@ -1792,6 +1800,9 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# NBVAL_IGNORE_OUTPUT\n",
|
||||
"# The ignore statement above is because of output difference between Pandas 1 and 2\n",
|
||||
"# and can be removed once Pandas 1 support is dropped\n",
|
||||
"ed_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled == True')"
|
||||
]
|
||||
},
|
||||
@ -2377,8 +2388,8 @@
|
||||
" <th>AvgTicketPrice</th>\n",
|
||||
" <th>DistanceKilometers</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>FlightTimeMin</th>\n",
|
||||
" <th>dayOfWeek</th>\n",
|
||||
" <th>timestamp</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
@ -2388,23 +2399,15 @@
|
||||
" <td>13059.000000</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>13059.000000</td>\n",
|
||||
" <td>13059.000000</td>\n",
|
||||
" <td>13059</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>mean</th>\n",
|
||||
" <td>628.253689</td>\n",
|
||||
" <td>7092.142455</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>511.127842</td>\n",
|
||||
" <td>2.835975</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td>266.396861</td>\n",
|
||||
" <td>4578.438497</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>334.753952</td>\n",
|
||||
" <td>1.939439</td>\n",
|
||||
" <td>2018-01-21 19:20:45.564438016</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>min</th>\n",
|
||||
@ -2412,57 +2415,65 @@
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>2018-01-01 00:00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25%</th>\n",
|
||||
" <td>409.893816</td>\n",
|
||||
" <td>2459.705673</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>252.333192</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>2018-01-11 05:16:25.500000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>50%</th>\n",
|
||||
" <td>640.556668</td>\n",
|
||||
" <td>7610.330866</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>503.045170</td>\n",
|
||||
" <td>3.000000</td>\n",
|
||||
" <td>2018-01-22 00:32:11</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>75%</th>\n",
|
||||
" <td>842.185470</td>\n",
|
||||
" <td>9736.637600</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>720.416036</td>\n",
|
||||
" <td>4.000000</td>\n",
|
||||
" <td>2018-02-01 04:51:18</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>max</th>\n",
|
||||
" <td>1199.729053</td>\n",
|
||||
" <td>19881.482315</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>1902.902032</td>\n",
|
||||
" <td>6.000000</td>\n",
|
||||
" <td>2018-02-11 23:50:12</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td>266.396861</td>\n",
|
||||
" <td>4578.438497</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>1.939439</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>8 rows × 7 columns</p>\n",
|
||||
"<p>8 rows × 8 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n",
|
||||
"count 13059.000000 13059.000000 ... 13059.000000 13059.000000\n",
|
||||
"mean 628.253689 7092.142455 ... 511.127842 2.835975\n",
|
||||
"std 266.396861 4578.438497 ... 334.753952 1.939439\n",
|
||||
"min 100.020528 0.000000 ... 0.000000 0.000000\n",
|
||||
"25% 409.893816 2459.705673 ... 252.333192 1.000000\n",
|
||||
"50% 640.556668 7610.330866 ... 503.045170 3.000000\n",
|
||||
"75% 842.185470 9736.637600 ... 720.416036 4.000000\n",
|
||||
"max 1199.729053 19881.482315 ... 1902.902032 6.000000\n",
|
||||
" AvgTicketPrice DistanceKilometers ... dayOfWeek timestamp\n",
|
||||
"count 13059.000000 13059.000000 ... 13059.000000 13059\n",
|
||||
"mean 628.253689 7092.142455 ... 2.835975 2018-01-21 19:20:45.564438016\n",
|
||||
"min 100.020528 0.000000 ... 0.000000 2018-01-01 00:00:00\n",
|
||||
"25% 409.893816 2459.705673 ... 1.000000 2018-01-11 05:16:25.500000\n",
|
||||
"50% 640.556668 7610.330866 ... 3.000000 2018-01-22 00:32:11\n",
|
||||
"75% 842.185470 9736.637600 ... 4.000000 2018-02-01 04:51:18\n",
|
||||
"max 1199.729053 19881.482315 ... 6.000000 2018-02-11 23:50:12\n",
|
||||
"std 266.396861 4578.438497 ... 1.939439 NaN\n",
|
||||
"\n",
|
||||
"[8 rows x 7 columns]"
|
||||
"[8 rows x 8 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 39,
|
||||
@ -2471,6 +2482,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# NBVAL_IGNORE_OUTPUT\n",
|
||||
"# Once support for pandas <2 is dropped, this and the line above can be removed\n",
|
||||
"pd_flights.describe()"
|
||||
]
|
||||
},
|
||||
|
@ -58,7 +58,9 @@ class TestSeriesFilter(TestData):
|
||||
ed_ser = ed_flights_small.filter(items=items, axis=0)
|
||||
pd_ser = pd_flights_small.filter(items=items, axis=0)
|
||||
|
||||
assert_pandas_eland_series_equal(pd_ser, ed_ser)
|
||||
# For an empty Series, eland will say the datatype it knows from the Elastic index
|
||||
# Pandas however will state empty as the datatype
|
||||
assert_pandas_eland_series_equal(pd_ser, ed_ser, check_index_type=False)
|
||||
|
||||
def test_flights_filter_index_like_and_regex(self):
|
||||
ed_flights_small = self.ed_flights_small()["FlightDelayType"]
|
||||
|
@ -24,6 +24,7 @@ import pandas as pd
|
||||
import pytest
|
||||
from pandas.testing import assert_series_equal
|
||||
|
||||
from eland.common import PANDAS_VERSION
|
||||
from tests.common import TestData, assert_almost_equal
|
||||
|
||||
|
||||
@ -42,6 +43,8 @@ class TestSeriesMetrics(TestData):
|
||||
ed_flights = self.ed_flights()["AvgTicketPrice"]
|
||||
|
||||
for func in self.all_funcs:
|
||||
if PANDAS_VERSION[0] >= 2 and func == "mad":
|
||||
continue
|
||||
pd_metric = getattr(pd_flights, func)()
|
||||
ed_metric = getattr(ed_flights, func)()
|
||||
|
||||
@ -87,6 +90,8 @@ class TestSeriesMetrics(TestData):
|
||||
ed_ecommerce = self.ed_ecommerce()[column]
|
||||
|
||||
for func in self.all_funcs:
|
||||
if PANDAS_VERSION[0] >= 2 and func == "mad":
|
||||
continue
|
||||
pd_metric = getattr(pd_ecommerce, func)()
|
||||
ed_metric = getattr(ed_ecommerce, func)(
|
||||
**({"numeric_only": True} if (func != "nunique") else {})
|
||||
|