Mirror of https://github.com/elastic/eland.git
Synced 2025-07-11 00:02:14 +08:00

Compare commits (115 commits)
Commit SHAs:

cef4710695 44ead02b05 cb7c4fb122 9e8f164677 3c3ffd7403
f5c2dcfc9d 878cde6126 ec45c395fd 00dc55b3bd 8147eb517a
4728d9b648 51a2b9cc19 a9c36927f6 87380ef716 9ca76d7888
ced3cdfe32 87379c53de 1ddae81769 9302bef7db ca64672fd7
6692251d9e ee4d701aa4 acdeeeded2 8350f06ea8 e846fb7697
c4ac64e3a0 214c4645e9 871e52b37a aa5196edee 75c57b0775
77589b26b8 9b5badb941 f99adce23f 7774a506ae 82492fe771
04102f2a4e 9aec8fc751 79d9a6ae29 939f4d672c 1312e96220
2916b51fa7 5dabe9c099 06b65e211e a45c7bc357 d1e533ffb9
a83ce20fcc 03af8a6319 5253501704 ec66b5f320 64d05e4c68
f79180be42 0ce3db26e8 5a76f826df fd8886da6a bee6d0e1f7
f18aa35e8e 56a46d0f85 c497683064 0ddc21b895 5a3e7d78b3
1014ecdb39 632074c0f0 35a96ab3f0 116416b3e8 5b728c29c1
e76b32eee2 fd38e26df1 f7f6e0aba9 9cea2385e6 1921792df8
c16e36c051 ae0bba34c6 aaec995b1b de83f3f905 8e8c49ddbf
5d34dc3cc4 9b335315bb 28eda95ba9 f4b30753ad 33cf029efe
9d492b03aa fd2ceab846 02190e74e7 2a6a4b1f06 1190364abb
64216d44fb 0a6e3db157 5169cc926a d2291889f8 d3ed669a5e
926f0b9b5c 840871f9d9 05c5859b8a 0f91224daf 927acc86ad
6ef418f465 081250cdec af26897313 add61a69ec b689759278
87d18bd850 dfc522eb31 508de981ff 41db37246f 6cecb454e3
28e6d92430 adf0535608 5e5f36bdf8 5b3a83e7f2 ab6e44f430
0c0a8ab19f 36b941e336 6a4fd511cc c6ce4b2c46 48e290a927
@@ -1,6 +1,8 @@
ARG PYTHON_VERSION=3.9
FROM python:${PYTHON_VERSION}

ENV FORCE_COLOR=1

WORKDIR /code/eland
RUN python -m pip install nox

.buildkite/build-docker-images.sh (new file, 11 lines)

@@ -0,0 +1,11 @@
#!/usr/bin/env bash

set -eo pipefail
export LC_ALL=en_US.UTF-8

echo "--- Building the Wolfi image"
# Building the linux/arm64 image takes about one hour on Buildkite, which is too slow
docker build --file Dockerfile.wolfi .

echo "--- Building the public image"
docker build .
@@ -1,15 +1,8 @@
#!/usr/bin/env bash
sudo apt-get update
sudo apt-get install -y pandoc python3 python3-pip
python3 -m pip install nox
/opt/buildkite-agent/.local/bin/nox -s docs

# I couldn't make this work, for some reason pandoc is not found in the docker container repository:
# docker build --file .buildkite/Dockerfile --tag elastic/eland --build-arg PYTHON_VERSION=${PYTHON_VERSION} .
# docker run \
#   --name doc_build \
#   --rm \
#   elastic/eland \
#   apt-get update && \
#   sudo apt-get install --yes pandoc && \
#   nox -s docs
docker build --file .buildkite/Dockerfile --tag elastic/eland --build-arg PYTHON_VERSION=${PYTHON_VERSION} .
docker run \
  --name doc_build \
  --rm \
  elastic/eland \
  bash -c "apt-get update && apt-get install --yes pandoc && nox -s docs"

@@ -4,6 +4,7 @@ steps:
      PYTHON_VERSION: 3
    agents:
      provider: "gcp"
      machineType: "n2-standard-2"
    commands:
      - ./.buildkite/lint-code.sh
  - label: ":books: Build documentation"
@@ -11,23 +12,39 @@ steps:
      PYTHON_VERSION: 3.9-bookworm
    agents:
      provider: "gcp"
      machineType: "n2-standard-2"
    commands:
      - ./.buildkite/build-documentation.sh
  - label: "Eland :python: {{ matrix.python }} :elasticsearch: {{ matrix.stack }}"
  - label: ":docker: Build Wolfi image"
    env:
      PYTHON_VERSION: 3.11-bookworm
    agents:
      provider: "gcp"
      machineType: "n2-standard-2"
    commands:
      - ./.buildkite/build-docker-images.sh
  - label: ":python: {{ matrix.python }} :elasticsearch: {{ matrix.stack }} :pandas: {{ matrix.pandas }}"
    agents:
      provider: "gcp"
      machineType: "n2-standard-4"
    env:
      PYTHON_VERSION: "{{ matrix.python }}"
      PANDAS_VERSION: '1.5.0'
      PANDAS_VERSION: "{{ matrix.pandas }}"
      TEST_SUITE: "xpack"
      ELASTICSEARCH_VERSION: "{{ matrix.stack }}"
    matrix:
      setup:
        # Python and pandas versions need to be added to the nox configuration too
        # (in the decorators of the test method in noxfile.py)
        pandas:
          - '1.5.0'
          - '2.2.3'
        python:
          - '3.12'
          - '3.11'
          - '3.10'
          - '3.9'
          - '3.8'
        stack:
          - '8.8-SNAPSHOT'
          - '8.9-SNAPSHOT'
        stack:
          - '9.0.0'
          - '9.1.0-SNAPSHOT'
    command: ./.buildkite/run-tests
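
The matrix above fans out to every python x pandas x stack combination. A rough way to reproduce a single cell locally is sketched below; the session name must match a decorator in noxfile.py, and the version values here are illustrative, not the only valid ones:

```bash
# Assumes a reachable Elasticsearch and the nox sessions defined in noxfile.py
export ELASTICSEARCH_VERSION="9.0.0"
export PANDAS_VERSION="2.2.3"
export TEST_SUITE="xpack"
nox -rs "test-3.12(pandas_version='2.2.3')"
```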

@@ -11,6 +11,18 @@
      "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
      "skip_ci_labels": ["skip-ci"],
      "skip_ci_on_only_changed": ["\\.md$"]
    },
    {
      "enabled": true,
      "pipeline_slug": "docs-build-pr",
      "allow_org_users": true,
      "allowed_repo_permissions": ["admin", "write"],
      "build_on_commit": true,
      "build_on_comment": true,
      "trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
      "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
      "skip_ci_labels": ["skip-ci"],
      "skip_ci_on_only_changed": ["\\.md$"]
    }
  ]
}
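
Both pipelines share the same trigger regex. A quick local sanity check of which PR comments it accepts, using the unescaped pattern (GNU grep -P stands in here for Buildkite's actual matcher, which may behave slightly differently):

```bash
# "buildkite build this", "test it" and "build this please" match; "please build" does not
for c in "buildkite build this" "test it" "build this please" "please build"; do
  echo "$c" | grep -qP '^(?:(?:buildkite\W+)?(?:build|test)\W+(?:this|it))' && echo "triggers: $c"
done
```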

@@ -26,6 +26,7 @@ git --no-pager show
docker buildx rm --force eland-multiarch-builder || true
docker buildx create --name eland-multiarch-builder --bootstrap --use
docker buildx build --push \
  --file Dockerfile.wolfi \
  --tag "$docker_registry/eland/eland:$RELEASE_VERSION" \
  --tag "$docker_registry/eland/eland:latest" \
  --platform linux/amd64,linux/arm64 \
@@ -16,7 +16,12 @@ fi

set -euxo pipefail

SCRIPT_PATH=$(dirname $(realpath -s $0))
# realpath on MacOS uses different flags than on Linux
if [[ "$OSTYPE" == "darwin"* ]]; then
  SCRIPT_PATH=$(dirname $(realpath $0))
else
  SCRIPT_PATH=$(dirname $(realpath -s $0))
fi

moniker=$(echo "$ELASTICSEARCH_VERSION" | tr -C "[:alnum:]" '-')
suffix=rest-test
@@ -132,7 +137,7 @@ url="http://elastic:$ELASTIC_PASSWORD@$NODE_NAME"
docker_pull_attempts=0
until [ "$docker_pull_attempts" -ge 5 ]
do
  docker pull docker.elastic.co/elasticsearch/"$ELASTICSEARCH_VERSION" && break
  docker pull docker.elastic.co/elasticsearch/$ELASTICSEARCH_VERSION && break
  docker_pull_attempts=$((docker_pull_attempts+1))
  sleep 10
done
@@ -1,5 +1,4 @@
# docs and example
docs/*
example/*

# Git
@@ -18,9 +17,6 @@ dist/
# Build folder
build/

# docs
docs/*

# pytest results
tests/dataframe/results/*csv
result_images/
.github/workflows/backport.yml (new file, vendored, 26 lines)

@@ -0,0 +1,26 @@
name: Backport
on:
  pull_request_target:
    types:
      - closed
      - labeled

jobs:
  backport:
    name: Backport
    runs-on: ubuntu-latest
    # Only react to merged PRs for security reasons.
    # See https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request_target.
    if: >
      github.event.pull_request.merged
      && (
        github.event.action == 'closed'
        || (
          github.event.action == 'labeled'
          && contains(github.event.label.name, 'backport')
        )
      )
    steps:
      - uses: tibdex/backport@9565281eda0731b1d20c4025c43339fb0a23812e # v2.0.4
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
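
With this workflow in place, a merged PR gets backported when it carries a label whose name contains "backport". A hypothetical way to add such a label from the command line (the PR number and label name are examples, not values from this repository):

```bash
# Any label containing "backport" satisfies the contains() condition above
gh pr edit 123 --add-label "backport-8.x"
```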

.github/workflows/docs-build.yml (new file, vendored, 19 lines)

@@ -0,0 +1,19 @@
name: docs-build

on:
  push:
    branches:
      - main
  pull_request_target: ~
  merge_group: ~

jobs:
  docs-preview:
    uses: elastic/docs-builder/.github/workflows/preview-build.yml@main
    with:
      path-pattern: docs/**
    permissions:
      deployments: write
      id-token: write
      contents: read
      pull-requests: write

.github/workflows/docs-cleanup.yml (new file, vendored, 14 lines)

@@ -0,0 +1,14 @@
name: docs-cleanup

on:
  pull_request_target:
    types:
      - closed

jobs:
  docs-preview:
    uses: elastic/docs-builder/.github/workflows/preview-cleanup.yml@main
    permissions:
      contents: none
      id-token: write
      deployments: write
@@ -3,9 +3,12 @@ version: 2
build:
  os: ubuntu-22.04
  tools:
    python: "3"
    python: "3.11"

python:
  install:
    - requirements: docs/requirements-docs.txt
    - path: .
    - requirements: docs/requirements-docs.txt

sphinx:
  configuration: docs/sphinx/conf.py
CHANGELOG.rst (206 lines changed)

@@ -2,6 +2,209 @@
Changelog
=========

9.0.1 (2025-04-30)
------------------

* Forbid Elasticsearch 8 client or server (`#780 <https://github.com/elastic/eland/pull/780>`_)
* Fix DeBERTa tokenization (`#769 <https://github.com/elastic/eland/pull/769>`_)
* Upgrade PyTorch to 2.5.1 (`#785 <https://github.com/elastic/eland/pull/785>`_)
* Upgrade LightGBM to 4.6.0 (`#782 <https://github.com/elastic/eland/pull/782>`_)

9.0.0 (2025-04-15)
------------------

* Drop Python 3.8, Support Python 3.12 (`#743 <https://github.com/elastic/eland/pull/743>`_)
* Support Pandas 2 (`#742 <https://github.com/elastic/eland/pull/742>`_)
* Upgrade transformers to 4.47 (`#752 <https://github.com/elastic/eland/pull/752>`_)
* Remove ML model export as sklearn Pipeline (`#744 <https://github.com/elastic/eland/pull/744>`_)
* Allow scikit-learn 1.5 (`#729 <https://github.com/elastic/eland/pull/729>`_)
* Migrate docs from AsciiDoc to Markdown (`#762 <https://github.com/elastic/eland/pull/762>`_)

8.17.0 (2025-01-07)
-------------------

* Support sparse embedding models such as SPLADE-v3-DistilBERT (`#740 <https://github.com/elastic/eland/pull/740>`_)

8.16.0 (2024-11-13)
-------------------

* Add deprecation warning for ESGradientBoostingModel subclasses (`#738 <https://github.com/elastic/eland/pull/738>`_)

8.15.4 (2024-10-17)
-------------------

* Revert "Allow reading Elasticsearch certs in Wolfi image" (`#734 <https://github.com/elastic/eland/pull/734>`_)

8.15.3 (2024-10-09)
-------------------

* Added support for DeBERTa-V2 tokenizer (`#717 <https://github.com/elastic/eland/pull/717>`_)
* Fixed ``--ca-cert`` with a shared Elasticsearch Docker volume (`#732 <https://github.com/elastic/eland/pull/732>`_)

8.15.2 (2024-10-02)
-------------------

* Fixed Docker image build (`#728 <https://github.com/elastic/eland/pull/728>`_)

8.15.1 (2024-10-01)
-------------------

* Upgraded PyTorch to version 2.3.1, which is compatible with Elasticsearch 8.15.2 or above (`#718 <https://github.com/elastic/eland/pull/718>`_)
* Migrated to distroless Wolfi base Docker image (`#720 <https://github.com/elastic/eland/pull/720>`_)


8.15.0 (2024-08-12)
-------------------

* Added a default truncation of ``second`` for text similarity (`#713 <https://github.com/elastic/eland/pull/713>`_)
* Added note about using text_similarity for rerank in the CLI (`#716 <https://github.com/elastic/eland/pull/716>`_)
* Added support for lists in result hits (`#707 <https://github.com/elastic/eland/pull/707>`_)
* Removed input fields from exported LTR models (`#708 <https://github.com/elastic/eland/pull/708>`_)

8.14.0 (2024-06-10)
-------------------

Added
^^^^^

* Added Elasticsearch Serverless support in DataFrames (`#690`_, contributed by `@AshokChoudhary11`_) and eland_import_hub_model (`#698`_)

Fixed
^^^^^

* Fixed Python 3.8 support (`#695`_, contributed by `@bartbroere`_)
* Fixed non _source fields missing from the results hits (`#693`_, contributed by `@bartbroere`_)

.. _@AshokChoudhary11: https://github.com/AshokChoudhary11
.. _#690: https://github.com/elastic/eland/pull/690
.. _#693: https://github.com/elastic/eland/pull/693
.. _#695: https://github.com/elastic/eland/pull/695
.. _#698: https://github.com/elastic/eland/pull/698

8.13.1 (2024-05-03)
-------------------

Added
^^^^^

* Added support for HTTP proxies in eland_import_hub_model (`#688`_)

.. _#688: https://github.com/elastic/eland/pull/688

8.13.0 (2024-03-27)
-------------------

Added
^^^^^

* Added support for Python 3.11 (`#681`_)
* Added ``eland.DataFrame.to_json`` function (`#661`_, contributed by `@bartbroere`_)
* Added override option to specify the model's max input size (`#674`_)

Changed
^^^^^^^

* Upgraded torch to 2.1.2 (`#671`_)
* Mirrored pandas' ``lineterminator`` instead of ``line_terminator`` in ``to_csv`` (`#595`_, contributed by `@bartbroere`_)

.. _#595: https://github.com/elastic/eland/pull/595
.. _#661: https://github.com/elastic/eland/pull/661
.. _#671: https://github.com/elastic/eland/pull/671
.. _#674: https://github.com/elastic/eland/pull/674
.. _#681: https://github.com/elastic/eland/pull/681


8.12.1 (2024-01-30)
-------------------

Fixed
^^^^^

* Fix missing value support for XGBRanker (`#654`_)

.. _#654: https://github.com/elastic/eland/pull/654


8.12.0 (2024-01-18)
-------------------

Added
^^^^^

* Supported XGBRanker model (`#649`_)
* Accepted LTR (Learning to rank) model config when importing model (`#645`_, `#651`_)
* Added LTR feature logger (`#648`_)
* Added ``prefix_string`` config option to the import model hub script (`#642`_)
* Made online retail analysis notebook runnable in Colab (`#641`_)
* Added new movie dataset to the tests (`#646`_)


.. _#641: https://github.com/elastic/eland/pull/641
.. _#642: https://github.com/elastic/eland/pull/642
.. _#645: https://github.com/elastic/eland/pull/645
.. _#646: https://github.com/elastic/eland/pull/646
.. _#648: https://github.com/elastic/eland/pull/648
.. _#649: https://github.com/elastic/eland/pull/649
.. _#651: https://github.com/elastic/eland/pull/651

8.11.1 (2023-11-22)
-------------------
Added
^^^^^

* Make demo notebook runnable in Colab (`#630`_)

Changed
^^^^^^^

* Bump Shap version to 0.43 (`#636`_)

Fixed
^^^^^

* Fix failed import of Sentence Transformer RoBERTa models (`#637`_)


.. _#630: https://github.com/elastic/eland/pull/630
.. _#636: https://github.com/elastic/eland/pull/636
.. _#637: https://github.com/elastic/eland/pull/637

8.11.0 (2023-11-08)
-------------------

Added
^^^^^

* Support E5 small multilingual model (`#625`_)

Changed
^^^^^^^

* Stream writes in ``ed.DataFrame.to_csv()`` (`#579`_)
* Improve memory estimation for NLP models (`#568`_)

Fixed
^^^^^

* Fixed deprecations in preparation of Pandas 2.0 support (`#602`_, `#603`_, contributed by `@bartbroere`_)


.. _#568: https://github.com/elastic/eland/pull/568
.. _#579: https://github.com/elastic/eland/pull/579
.. _#602: https://github.com/elastic/eland/pull/602
.. _#603: https://github.com/elastic/eland/pull/603
.. _#625: https://github.com/elastic/eland/pull/625

8.10.1 (2023-10-11)
-------------------

Fixed
^^^^^

* Fixed direct usage of TransformerModel (`#619`_)

.. _#619: https://github.com/elastic/eland/pull/619

8.10.0 (2023-10-09)
-------------------

@@ -24,8 +227,9 @@ Changed
Fixed
^^^^^

* Fixed deprecations in preparation of Pandas 2.0 support (`#593`_, `#596`_)
* Fixed deprecations in preparation of Pandas 2.0 support (`#593`_, `#596`_, contributed by `@bartbroere`_)

.. _@bartbroere: https://github.com/bartbroere
.. _#613: https://github.com/elastic/eland/pull/613
.. _#608: https://github.com/elastic/eland/pull/608
.. _#615: https://github.com/elastic/eland/pull/615

@@ -78,9 +78,15 @@ Once your changes and tests are ready to submit for review:
# Run Auto-format, lint, mypy type checker for your changes
$ nox -s format

# Run the test suite
$ pytest --doctest-modules eland/ tests/
$ pytest --nbval tests/notebook/
# Launch Elasticsearch with a trial licence and ML enabled
$ docker run --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" -e "xpack.security.enabled=false" -e "xpack.license.self_generated.type=trial" docker.elastic.co/elasticsearch/elasticsearch:9.0.0

# See all test suites
$ nox -l
# Run a specific test suite
$ nox -rs "test-3.12(pandas_version='2.2.3')"
# Run a specific test
$ nox -rs "test-3.12(pandas_version='2.2.3')" -- -k test_learning_to_rank

```
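
Before running the suite it is worth confirming that the container launched above is actually serving requests; a minimal check, assuming the default port mapping:

```bash
# Should return the cluster's JSON banner once Elasticsearch has started
curl -s http://localhost:9200
```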

@@ -169,7 +175,7 @@ currently using a minimum version of PyCharm 2019.2.4.
* Setup Elasticsearch instance with docker

``` bash
> ELASTICSEARCH_VERSION=elasticsearch:7.x-SNAPSHOT .ci/run-elasticsearch.sh
> ELASTICSEARCH_VERSION=elasticsearch:8.17.0 BUILDKITE=false .buildkite/run-elasticsearch.sh
```

* Now check `http://localhost:9200`
@@ -203,7 +209,7 @@ currently using a minimum version of PyCharm 2019.2.4.
* To test specific versions of Python run

``` bash
> nox -s test-3.8
> nox -s test-3.12
```

### Documentation

@@ -18,7 +18,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
        python3 -m pip install \
          --no-cache-dir --disable-pip-version-check --extra-index-url https://download.pytorch.org/whl/cpu \
          torch==1.13.1+cpu .[all]; \
          torch==2.5.1+cpu .[all]; \
    else \
        python3 -m pip install \
          --no-cache-dir --disable-pip-version-check \
Dockerfile.wolfi (new file, 42 lines)

@@ -0,0 +1,42 @@
# syntax=docker/dockerfile:1
FROM docker.elastic.co/wolfi/python:3.10-dev AS builder

WORKDIR /eland
ENV VIRTUAL_ENV=/eland/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

ADD . /eland

ARG TARGETPLATFORM
RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
        python3 -m pip install \
          --no-cache-dir --disable-pip-version-check --extra-index-url https://download.pytorch.org/whl/cpu \
          torch==2.5.1+cpu .[all]; \
    else \
        python3 -m pip install \
          --no-cache-dir --disable-pip-version-check \
          .[all]; \
    fi

FROM docker.elastic.co/wolfi/python:3.10

WORKDIR /eland
ENV VIRTUAL_ENV=/eland/venv
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

COPY --from=builder /eland /eland

# The eland_import_hub_model script is intended to be executed by a shell,
# which will see its shebang line and then execute it with the Python
# interpreter of the virtual environment. We want to keep this behavior even
# with Wolfi so that users can use the image as before. To do that, we use two
# tricks:
#
# * copy /bin/sh (that is, busybox's ash) from the builder image
# * revert to Docker's default entrypoint, which is the only way to pass
#   parameters to `eland_import_hub_model` without needing quotes.
#
COPY --from=builder /bin/sh /bin/sh
ENTRYPOINT []
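
Because the entrypoint is left empty, arguments after the image name are executed directly, so no extra quoting is needed. Two illustrative invocations, assuming the published image name used elsewhere in these docs:

```bash
docker run -it --rm docker.elastic.co/eland/eland eland_import_hub_model --help
# /bin/sh copied from the builder lets shebang-based scripts in the venv run as before
docker run --rm docker.elastic.co/eland/eland python3 -c "import eland; print(eland.__version__)"
```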

@@ -50,3 +50,6 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

--
This product contains an adapted version of the "us-national-parks" dataset, https://data.world/kevinnayar/us-national-parks, by Kevin Nayar, https://data.world/kevinnayar, is licensed under CC BY, https://creativecommons.org/licenses/by/4.0/legalcode

README.md (14 lines changed)

@@ -12,8 +12,7 @@
<a href="https://pepy.tech/project/eland"><img src="https://static.pepy.tech/badge/eland" alt="Downloads"></a>
<a href="https://pypi.org/project/eland"><img src="https://img.shields.io/pypi/status/eland.svg"
alt="Package Status"></a>
<a href="https://clients-ci.elastic.co/job/elastic+eland+main"><img
src="https://clients-ci.elastic.co/buildStatus/icon?job=elastic%2Beland%2Bmain" alt="Build Status"></a>
<a href="https://buildkite.com/elastic/eland"><img src="https://badge.buildkite.com/d92340e800bc06a7c7c02a71b8d42fcb958bd18c25f99fe2d9.svg" alt="Build Status"></a>
<a href="https://github.com/elastic/eland/blob/main/LICENSE.txt"><img src="https://img.shields.io/pypi/l/eland.svg"
alt="License"></a>
<a href="https://eland.readthedocs.io"><img
@@ -43,7 +42,7 @@ $ python -m pip install eland

If using Eland to upload NLP models to Elasticsearch install the PyTorch extras:
```bash
$ python -m pip install eland[pytorch]
$ python -m pip install 'eland[pytorch]'
```

Eland can also be installed from [Conda Forge](https://anaconda.org/conda-forge/eland) with Conda:
@@ -54,13 +53,14 @@ $ conda install -c conda-forge eland

### Compatibility

- Supports Python 3.8, 3.9, 3.10 and Pandas 1.5
- Supports Elasticsearch clusters that are 7.11+, recommended 8.3 or later for all features to work.
- Supports Python 3.9, 3.10, 3.11 and 3.12.
- Supports Pandas 1.5 and 2.
- Supports Elasticsearch 8+ clusters, recommended 8.16 or later for all features to work.
  If you are using the NLP with PyTorch feature make sure your Eland minor version matches the minor
  version of your Elasticsearch cluster. For all other features it is sufficient for the major versions
  to match.
- You need to use PyTorch `1.13.1` or earlier to import an NLP model.
  Run `pip install torch==1.13.1` to install the aproppriate version of PyTorch.
- You need to install the appropriate version of PyTorch to import an NLP model. Run `python -m pip
  install 'eland[pytorch]'` to install that version.
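
A simple way to keep the minor versions aligned for the NLP feature is to pin the install to your cluster's minor release; the 9.0 value below is only an example:

```bash
# Pin the eland minor version to the minor version of the target cluster
python -m pip install 'eland[pytorch]==9.0.*'
```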

### Prerequisites
@@ -55,7 +55,8 @@ spec:
    repository: elastic/eland
    teams:
      ml-core: {}
      clients-team: {}
      devtools-team: {}
      es-docs: {}
      everyone:
        access_level: READ_ONLY

@@ -88,6 +89,6 @@ spec:
    repository: elastic/eland
    teams:
      ml-core: {}
      clients-team: {}
      devtools-team: {}
      everyone:
        access_level: READ_ONLY
docs/docset.yml (new file, 8 lines)

@@ -0,0 +1,8 @@
project: 'Eland Python client'
cross_links:
  - docs-content
toc:
  - toc: reference
subs:
  es: "Elasticsearch"
  ml: "machine learning"
@@ -1,14 +0,0 @@
= Eland Python Client

:doctype: book

include::{asciidoc-dir}/../../shared/versions/stack/{source_branch}.asciidoc[]
include::{asciidoc-dir}/../../shared/attributes.asciidoc[]

include::overview.asciidoc[]

include::installation.asciidoc[]

include::dataframes.asciidoc[]

include::machine-learning.asciidoc[]
@@ -1,16 +0,0 @@
[[installation]]
== Installation

Eland can be installed with https://pip.pypa.io[pip] from https://pypi.org/project/eland[PyPI]. We recommend https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/[using a virtual environment] when installing with pip:

[source,sh]
-----------------------------
$ python -m pip install eland
-----------------------------

Alternatively, Eland can be installed with https://docs.conda.io[Conda] from https://anaconda.org/conda-forge/eland[Conda Forge]:

[source,sh]
------------------------------------
$ conda install -c conda-forge eland
------------------------------------
@@ -1,194 +0,0 @@
[[machine-learning]]
== Machine Learning

[discrete]
[[ml-trained-models]]
=== Trained models

Eland allows transforming trained models from scikit-learn, XGBoost,
and LightGBM libraries to be serialized and used as an inference
model in {es}.

[source,python]
------------------------
>>> from xgboost import XGBClassifier
>>> from eland.ml import MLModel

# Train and exercise an XGBoost ML model locally
>>> xgb_model = XGBClassifier(booster="gbtree")
>>> xgb_model.fit(training_data[0], training_data[1])

>>> xgb_model.predict(training_data[0])
[0 1 1 0 1 0 0 0 1 0]

# Import the model into Elasticsearch
>>> es_model = MLModel.import_model(
    es_client="http://localhost:9200",
    model_id="xgb-classifier",
    model=xgb_model,
    feature_names=["f0", "f1", "f2", "f3", "f4"],
)

# Exercise the ML model in Elasticsearch with the training data
>>> es_model.predict(training_data[0])
[0 1 1 0 1 0 0 0 1 0]
------------------------

[discrete]
[[ml-nlp-pytorch]]
=== Natural language processing (NLP) with PyTorch


IMPORTANT: You need to use PyTorch `1.13` or earlier to import an NLP model.
Run `pip install torch==1.13` to install the aproppriate version of PyTorch.

For NLP tasks, Eland enables you to import PyTorch models into {es}. Use the
`eland_import_hub_model` script to download and install supported
https://huggingface.co/transformers[transformer models] from the
https://huggingface.co/models[Hugging Face model hub]. For example:

[source,bash]
------------------------
$ eland_import_hub_model <authentication> \ <1>
  --url http://localhost:9200/ \ <2>
  --hub-model-id elastic/distilbert-base-cased-finetuned-conll03-english \ <3>
  --task-type ner \ <4>
  --start
------------------------
<1> Use an authentication method to access your cluster. Refer to <<ml-nlp-pytorch-auth>>.
<2> The cluster URL. Alternatively, use `--cloud-id`.
<3> Specify the identifier for the model in the Hugging Face model hub.
<4> Specify the type of NLP task. Supported values are `fill_mask`, `ner`,
`question_answering`, `text_classification`, `text_embedding`, and `zero_shot_classification`.


[discrete]
[[ml-nlp-pytorch-docker]]
==== Import model with Docker

IMPORTANT: To use the Docker container, you need to clone the Eland repository: https://github.com/elastic/eland

If you want to use Eland without installing it, you can use the Docker image:

You can use the container interactively:

```bash
$ docker run -it --rm --network host docker.elastic.co/eland/eland
```

Running installed scripts is also possible without an interactive shell, for example:

```bash
docker run -it --rm docker.elastic.co/eland/eland \
    eland_import_hub_model \
      --url $ELASTICSEARCH_URL \
      --hub-model-id elastic/distilbert-base-uncased-finetuned-conll03-english \
      --start
```

Replace the `$ELASTICSEARCH_URL` with the URL for your Elasticsearch cluster. For authentication purposes, include an administrator username and password in the URL in the following format: `https://username:password@host:port`.

[discrete]
[[ml-nlp-pytorch-air-gapped]]
==== Install models in an air-gapped environment

You can install models in a restricted or closed network by pointing the
`eland_import_hub_model` script to local files.

For an offline install of a Hugging Face model, the model first needs to be
cloned locally, Git and https://git-lfs.com/[Git Large File Storage] are
required to be installed in your system.

1. Select a model you want to use from Hugging Face. Refer to the
{ml-docs}/ml-nlp-model-ref.html[compatible third party model] list for more
information on the supported architectures.

2. Clone the selected model from Hugging Face by using the model URL. For
example:
+
--
[source,bash]
----
git clone https://huggingface.co/dslim/bert-base-NER
----
This command results in a local copy of
of the model in the directory `bert-base-NER`.
--

3. Use the `eland_import_hub_model` script with the `--hub-model-id` set to the
directory of the cloned model to install it:
+
--
[source,bash]
----
eland_import_hub_model \
  --url 'XXXX' \
  --hub-model-id /PATH/TO/MODEL \
  --task-type ner \
  --es-username elastic --es-password XXX \
  --es-model-id bert-base-ner
----

If you use the Docker image to run `eland_import_hub_model` you must bind mount
the model directory, so the container can read the files:

[source,bash]
----
docker run --mount type=bind,source=/PATH/TO/MODELS,destination=/models,readonly -it --rm docker.elastic.co/eland/eland \
    eland_import_hub_model \
      --url 'XXXX' \
      --hub-model-id /models/bert-base-NER \
      --task-type ner \
      --es-username elastic --es-password XXX \
      --es-model-id bert-base-ner
----
Once it's uploaded to {es}, the model will have the ID specified by
`--es-model-id`. If it is not set, the model ID is derived from
`--hub-model-id`; spaces and path delimiters are converted to double
underscores `__`.

--


[discrete]
[[ml-nlp-pytorch-auth]]
==== Authentication methods

The following authentication options are available when using the import script:

* Elasticsearch username and password authentication (specified with the `-u` and `-p` options):
+
--
[source,bash]
--------------------------------------------------
eland_import_hub_model -u <username> -p <password> --cloud-id <cloud-id> ...
--------------------------------------------------
These `-u` and `-p` options also work when you use `--url`.
--

* Elasticsearch username and password authentication (embedded in the URL):
+
--
[source,bash]
--------------------------------------------------
eland_import_hub_model --url https://<user>:<password>@<hostname>:<port> ...
--------------------------------------------------
--

* Elasticsearch API key authentication:
+
--
[source,bash]
--------------------------------------------------
eland_import_hub_model --es-api-key <api-key> --url https://<hostname>:<port> ...
--------------------------------------------------
--

* HuggingFace Hub access token (for private models):
+
--
[source,bash]
--------------------------------------------------
eland_import_hub_model --hub-access-token <access-token> ...
--------------------------------------------------
--
@@ -1,16 +1,16 @@
[[dataframes]]
== Data Frames
---
mapped_pages:
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/dataframes.html
---

`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API
and defers all processing and filtering of data to Elasticsearch
instead of your local machine. This means you can process large
amounts of data within Elasticsearch from a Jupyter Notebook
without overloading your machine.
# Data Frames [dataframes]

[source,python]
-------------------------------------
`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API and defers all processing and filtering of data to Elasticsearch instead of your local machine. This means you can process large amounts of data within Elasticsearch from a Jupyter Notebook without overloading your machine.

```python
>>> import eland as ed
>>> # Connect to 'flights' index via localhost Elasticsearch node
>>>
# Connect to 'flights' index via localhost Elasticsearch node
>>> df = ed.DataFrame('http://localhost:9200', 'flights')

# eland.DataFrame instance has the same API as pandas.DataFrame
@@ -29,14 +29,14 @@ without overloading your machine.
<class 'eland.dataframe.DataFrame'>
Index: 13059 entries, 0 to 13058
Data columns (total 27 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   AvgTicketPrice  13059 non-null  float64
 1   Cancelled       13059 non-null  bool
 2   Carrier         13059 non-null  object
...
 24  OriginWeather   13059 non-null  object
 25  dayOfWeek       13059 non-null  int64
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   AvgTicketPrice  13059 non-null  float64
 1   Cancelled       13059 non-null  bool
 2   Carrier         13059 non-null  object
...
 24  OriginWeather   13059 non-null  object
 25  dayOfWeek       13059 non-null  int64
 26  timestamp       13059 non-null  datetime64[ns]
dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)
memory usage: 80.0 bytes
@@ -59,4 +59,5 @@ Elasticsearch storage usage: 5.043 MB
sum   9.261629e+07  8.204365e+06
min   0.000000e+00  1.000205e+02
std   4.578263e+03  2.663867e+02
-------------------------------------
```
@@ -1,33 +1,36 @@
[[overview]]
== Overview
---
mapped_pages:
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/index.html
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/overview.html
navigation_title: Eland
---

Eland is a Python client and toolkit for DataFrames and {ml} in {es}.
Full documentation is available on https://eland.readthedocs.io[Read the Docs].
Source code is available on https://github.com/elastic/eland[GitHub].
# Eland Python client [overview]

[discrete]
=== Compatibility
Eland is a Python client and toolkit for DataFrames and {{ml}} in {{es}}. Full documentation is available on [Read the Docs](https://eland.readthedocs.io). Source code is available on [GitHub](https://github.com/elastic/eland).

- Supports Python 3.8+ and Pandas 1.5
- Supports {es} clusters that are 7.11+, recommended 7.14 or later for all features to work.
Make sure your Eland major version matches the major version of your Elasticsearch cluster.

The recommended way to set your requirements in your `setup.py` or
`requirements.txt` is::
## Compatibility [_compatibility]

    # Elasticsearch 8.x
    eland>=8,<9
* Supports Python 3.9+ and Pandas 1.5
* Supports {{es}} 8+ clusters, recommended 8.16 or later for all features to work. Make sure your Eland major version matches the major version of your Elasticsearch cluster.

    # Elasticsearch 7.x
    eland>=7,<8
The recommended way to set your requirements in your `setup.py` or `requirements.txt` is::

[discrete]
=== Getting Started
```
# Elasticsearch 8.x
eland>=8,<9
```
```
# Elasticsearch 7.x
eland>=7,<8
```

Create a `DataFrame` object connected to an {es} cluster running on `http://localhost:9200`:
## Getting Started [_getting_started]

[source,python]
------------------------------------
Create a `DataFrame` object connected to an {{es}} cluster running on `http://localhost:9200`:

```python
>>> import eland as ed
>>> df = ed.DataFrame(
...    es_client="http://localhost:9200",
@@ -48,20 +51,19 @@ Create a `DataFrame` object connected to an {es} cluster running on `http://loca
13058   858.144337      False  ...          6 2018-02-11 14:54:34

[13059 rows x 27 columns]
------------------------------------
```

[discrete]
==== Elastic Cloud

### Elastic Cloud [_elastic_cloud]

You can also connect Eland to an Elasticsearch instance in Elastic Cloud:

[source,python]
------------------------------------
```python
>>> import eland as ed
>>> from elasticsearch import Elasticsearch

# First instantiate an 'Elasticsearch' instance connected to Elastic Cloud
>>> es = Elasticsearch(cloud_id="...", api_key=("...", "..."))
>>> es = Elasticsearch(cloud_id="...", api_key="...")

# then wrap the client in an Eland DataFrame:
>>> df = ed.DataFrame(es, es_index_pattern="flights")
@@ -73,16 +75,16 @@ You can also connect Eland to an Elasticsearch instance in Elastic Cloud:
3   181.694216    True  ...          0 2018-01-01 10:33:28
4   730.041778   False  ...          0 2018-01-01 05:13:00
[5 rows x 27 columns]
------------------------------------
```

Eland can be used for complex queries and aggregations:

[source,python]
------------------------------------
```python
>>> df[df.Carrier != "Kibana Airlines"].groupby("Carrier").mean(numeric_only=False)
                  AvgTicketPrice  Cancelled                     timestamp
Carrier
Carrier
ES-Air                630.235816   0.129814 2018-01-21 20:45:00.200000000
JetBeats              627.457373   0.134698 2018-01-21 14:43:18.112400635
Logstash Airways      624.581974   0.125188 2018-01-21 16:14:50.711798340
------------------------------------
```
docs/reference/installation.md (new file, 19 lines)

@@ -0,0 +1,19 @@
---
mapped_pages:
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/installation.html
---

# Installation [installation]

Eland can be installed with [pip](https://pip.pypa.io) from [PyPI](https://pypi.org/project/eland). We recommend [using a virtual environment](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) when installing with pip:

```sh
$ python -m pip install eland
```

Alternatively, Eland can be installed with [Conda](https://docs.conda.io) from [Conda Forge](https://anaconda.org/conda-forge/eland):

```sh
$ conda install -c conda-forge eland
```
docs/reference/machine-learning.md (new file, 199 lines)

@@ -0,0 +1,199 @@
---
mapped_pages:
  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/machine-learning.html
---

# Machine Learning [machine-learning]


## Trained models [ml-trained-models]

Eland allows transforming *some*
[trained models](https://eland.readthedocs.io/en/latest/reference/api/eland.ml.MLModel.import_model.html#parameters) from scikit-learn, XGBoost,
and LightGBM libraries to be serialized and used as an inference model in {{es}}.

```python
>>> from xgboost import XGBClassifier
>>> from eland.ml import MLModel

# Train and exercise an XGBoost ML model locally
>>> xgb_model = XGBClassifier(booster="gbtree")
>>> xgb_model.fit(training_data[0], training_data[1])

>>> xgb_model.predict(training_data[0])
[0 1 1 0 1 0 0 0 1 0]

# Import the model into Elasticsearch
>>> es_model = MLModel.import_model(
    es_client="http://localhost:9200",
    model_id="xgb-classifier",
    model=xgb_model,
    feature_names=["f0", "f1", "f2", "f3", "f4"],
)

# Exercise the ML model in Elasticsearch with the training data
>>> es_model.predict(training_data[0])
[0 1 1 0 1 0 0 0 1 0]
```


## Natural language processing (NLP) with PyTorch [ml-nlp-pytorch]

::::{important}
You need to install the appropriate version of PyTorch to import an NLP model. Run `python -m pip install 'eland[pytorch]'` to install that version.
::::


For NLP tasks, Eland enables you to import PyTorch models into {{es}}. Use the `eland_import_hub_model` script to download and install supported [transformer models](https://huggingface.co/transformers) from the [Hugging Face model hub](https://huggingface.co/models). For example:

```bash
eland_import_hub_model <authentication> \ <1>
  --url http://localhost:9200/ \ <2>
  --hub-model-id elastic/distilbert-base-cased-finetuned-conll03-english \ <3>
  --task-type ner \ <4>
  --start
```

1. Use an authentication method to access your cluster. Refer to [Authentication methods](machine-learning.md#ml-nlp-pytorch-auth).
2. The cluster URL. Alternatively, use `--cloud-id`.
3. Specify the identifier for the model in the Hugging Face model hub.
4. Specify the type of NLP task. Supported values are `fill_mask`, `ner`, `question_answering`, `text_classification`, `text_embedding`, `text_expansion`, `text_similarity` and `zero_shot_classification`.


For more information about the available options, run `eland_import_hub_model` with the `--help` option.

```bash
eland_import_hub_model --help
```


### Import model with Docker [ml-nlp-pytorch-docker]

::::{important}
To use the Docker container, you need to clone the Eland repository: [https://github.com/elastic/eland](https://github.com/elastic/eland)
::::


If you want to use Eland without installing it, you can use the Docker image.

You can use the container interactively:

```bash
docker run -it --rm --network host docker.elastic.co/eland/eland
```

Running installed scripts is also possible without an interactive shell, for example:

```bash
docker run -it --rm docker.elastic.co/eland/eland \
    eland_import_hub_model \
      --url $ELASTICSEARCH_URL \
      --hub-model-id elastic/distilbert-base-uncased-finetuned-conll03-english \
      --start
```

Replace the `$ELASTICSEARCH_URL` with the URL for your Elasticsearch cluster. For authentication purposes, include an administrator username and password in the URL in the following format: `https://username:password@host:port`.


### Install models in an air-gapped environment [ml-nlp-pytorch-air-gapped]

You can install models in a restricted or closed network by pointing the `eland_import_hub_model` script to local files.

For an offline install of a Hugging Face model, the model first needs to be cloned locally; Git and [Git Large File Storage](https://git-lfs.com/) are required to be installed in your system.

1. Select a model you want to use from Hugging Face. Refer to the [compatible third party model](docs-content://explore-analyze/machine-learning/nlp/ml-nlp-model-ref.md) list for more information on the supported architectures.
2. Clone the selected model from Hugging Face by using the model URL. For example:

    ```bash
    git clone https://huggingface.co/dslim/bert-base-NER
    ```

    This command results in a local copy of the model in the directory `bert-base-NER`.

3. Use the `eland_import_hub_model` script with the `--hub-model-id` set to the directory of the cloned model to install it:

    ```bash
    eland_import_hub_model \
      --url 'XXXX' \
      --hub-model-id /PATH/TO/MODEL \
      --task-type ner \
      --es-username elastic --es-password XXX \
      --es-model-id bert-base-ner
    ```

If you use the Docker image to run `eland_import_hub_model` you must bind mount the model directory, so the container can read the files:

```bash
docker run --mount type=bind,source=/PATH/TO/MODEL,destination=/model,readonly -it --rm docker.elastic.co/eland/eland \
    eland_import_hub_model \
      --url 'XXXX' \
      --hub-model-id /model \
      --task-type ner \
      --es-username elastic --es-password XXX \
      --es-model-id bert-base-ner
```

Once it's uploaded to {{es}}, the model will have the ID specified by `--es-model-id`. If it is not set, the model ID is derived from `--hub-model-id`; spaces and path delimiters are converted to double underscores `__`.
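
Following that rule, the derivation for the hub ID used in the examples above looks like this:

```bash
#   --hub-model-id elastic/distilbert-base-cased-finetuned-conll03-english
# becomes the Elasticsearch model ID
#   elastic__distilbert-base-cased-finetuned-conll03-english
```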

### Connect to Elasticsearch through a proxy [ml-nlp-pytorch-proxy]

Behind the scenes, Eland uses the `requests` Python library, which [allows configuring proxies through an environment variable](https://requests.readthedocs.io/en/latest/user/advanced/#proxies). For example, to use an HTTP proxy to connect to an HTTPS Elasticsearch cluster, you need to set the `HTTPS_PROXY` environment variable when invoking Eland:

```bash
HTTPS_PROXY=http://proxy-host:proxy-port eland_import_hub_model ...
```

If you disabled security on your Elasticsearch cluster, you should use `HTTP_PROXY` instead.


### Authentication methods [ml-nlp-pytorch-auth]

The following authentication options are available when using the import script:

* Elasticsearch username and password authentication (specified with the `-u` and `-p` options):

    ```bash
    eland_import_hub_model -u <username> -p <password> --cloud-id <cloud-id> ...
    ```

    These `-u` and `-p` options also work when you use `--url`.

* Elasticsearch username and password authentication (embedded in the URL):

    ```bash
    eland_import_hub_model --url https://<user>:<password>@<hostname>:<port> ...
    ```

* Elasticsearch API key authentication:

    ```bash
    eland_import_hub_model --es-api-key <api-key> --url https://<hostname>:<port> ...
    ```

* HuggingFace Hub access token (for private models):

    ```bash
    eland_import_hub_model --hub-access-token <access-token> ...
    ```


### TLS/SSL [ml-nlp-pytorch-tls]

The following TLS/SSL options for Elasticsearch are available when using the import script:

* Specify alternate CA bundle to verify the cluster certificate:

    ```bash
    eland_import_hub_model --ca-certs CA_CERTS ...
    ```

* Disable TLS/SSL verification altogether (strongly discouraged):

    ```bash
    eland_import_hub_model --insecure ...
    ```
docs/reference/toc.yml (new file, 6 lines)

@@ -0,0 +1,6 @@
project: 'Eland reference'
toc:
  - file: index.md
  - file: installation.md
  - file: dataframes.md
  - file: machine-learning.md
@@ -1,13 +1,5 @@
elasticsearch>=7.7
pandas>=1.5
matplotlib>=3.6
matplotlib
nbval
scikit-learn>=0.22.1
xgboost>=1
lightgbm
sphinx==5.3.0
nbsphinx
furo

# traitlets has been having all sorts of release problems lately.
traitlets<5.1
@@ -167,7 +167,7 @@ Configuring PyCharm And Running Tests
- Install development requirements. Open terminal in virtual environment and run

  .. code-block:: bash

     `pip install -r requirements-dev.txt`
     pip install -r requirements-dev.txt

- Setup Elasticsearch instance with docker

  .. code-block:: bash
@@ -200,7 +200,7 @@ Configuring PyCharm And Running Tests
- To test specific versions of Python run

  .. code-block:: bash

     nox -s test-3.8
     nox -s test-3.12


Documentation
File diff suppressed because one or more lines are too long
@@ -49,6 +49,7 @@
   ~DataFrame.tail
   ~DataFrame.to_csv
   ~DataFrame.to_html
   ~DataFrame.to_json
   ~DataFrame.to_numpy
   ~DataFrame.to_pandas
   ~DataFrame.to_string
docs/sphinx/reference/api/eland.DataFrame.to_json.rst (new file, 6 lines)

@@ -0,0 +1,6 @@
eland.DataFrame.to\_json
========================

.. currentmodule:: eland

.. automethod:: DataFrame.to_json
@@ -17,6 +17,7 @@
   ~MLModel.delete_model
   ~MLModel.exists_model
   ~MLModel.export_model
   ~MLModel.import_ltr_model
   ~MLModel.import_model
   ~MLModel.predict

@@ -140,5 +140,6 @@ Serialization / IO / Conversion
   DataFrame.to_numpy
   DataFrame.to_csv
   DataFrame.to_html
   DataFrame.to_json
   DataFrame.to_string
   DataFrame.to_pandas
@@ -395,7 +395,7 @@ script instead of being modified manually.
+---------------------------------------+------------+
| ``ed.DataFrame.to_html()``            | **Yes**    |
+---------------------------------------+------------+
| ``ed.DataFrame.to_json()``            | No         |
| ``ed.DataFrame.to_json()``            | **Yes**    |
+---------------------------------------+------------+
| ``ed.DataFrame.to_latex()``           | No         |
+---------------------------------------+------------+
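
With `to_json()` now marked supported, a minimal smoke test against the example `flights` index used throughout these docs might look like the sketch below; it assumes a local cluster and that the method mirrors the pandas signature, as eland's API generally does:

```bash
python -c "import eland as ed; print(ed.DataFrame('http://localhost:9200', 'flights').head(2).to_json())"
```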

@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.

import warnings

from ._version import (  # noqa: F401
    __author__,
    __author_email__,
@@ -25,13 +27,16 @@ from ._version import (  # noqa: F401
    __url__,
    __version__,
)
from .common import SortOrder
from .common import ElandDeprecationWarning, SortOrder
from .dataframe import DataFrame
from .etl import csv_to_eland, eland_to_pandas, pandas_to_eland
from .index import Index
from .ndframe import NDFrame
from .series import Series

# Display Eland deprecation warnings by default
warnings.simplefilter("default", category=ElandDeprecationWarning)

__all__ = [
    "DataFrame",
    "Series",
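
Since the `simplefilter` call runs at import time, users who prefer the old quiet behavior can override it after importing eland. A sketch, assuming `ElandDeprecationWarning` remains reachable as a package attribute as the diff above suggests:

```bash
python -c "import warnings, eland; warnings.simplefilter('ignore', category=eland.ElandDeprecationWarning)"
```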
@@ -18,7 +18,7 @@
__title__ = "eland"
__description__ = "Python Client and Toolkit for DataFrames, Big Data, Machine Learning and ETL in Elasticsearch"
__url__ = "https://github.com/elastic/eland"
__version__ = "8.10.0"
__version__ = "9.0.1"
__author__ = "Steve Dodson"
__author_email__ = "steve.dodson@elastic.co"
__maintainer__ = "Elastic Client Library Maintainers"
@@ -32,7 +32,8 @@ import textwrap
from elastic_transport.client_utils import DEFAULT
from elasticsearch import AuthenticationException, Elasticsearch

from eland.common import parse_es_version
from eland._version import __version__
from eland.common import is_serverless_es, parse_es_version

MODEL_HUB_URL = "https://huggingface.co"

@@ -40,7 +41,9 @@ MODEL_HUB_URL = "https://huggingface.co"
def get_arg_parser():
    from eland.ml.pytorch.transformers import SUPPORTED_TASK_TYPES

    parser = argparse.ArgumentParser()
    parser = argparse.ArgumentParser(
        exit_on_error=False
    )  # throw exception rather than exit
    location_args = parser.add_mutually_exclusive_group(required=True)
    location_args.add_argument(
        "--url",
@@ -96,7 +99,7 @@ def get_arg_parser():
        "--task-type",
        required=False,
        choices=SUPPORTED_TASK_TYPES,
        help="The task type for the model usage. Will attempt to auto-detect task type for the model if not provided. "
        help="The task type for the model usage. Use text_similarity for rerank tasks. Will attempt to auto-detect task type for the model if not provided. "
        "Default: auto",
        default="auto",
    )
@@ -128,15 +131,60 @@ def get_arg_parser():
        "--ca-certs", required=False, default=DEFAULT, help="Path to CA bundle"
    )

    parser.add_argument(
        "--ingest-prefix",
        required=False,
        default=None,
        help="String to prepend to model input at ingest",
    )
    parser.add_argument(
        "--search-prefix",
        required=False,
        default=None,
        help="String to prepend to model input at search",
    )

    parser.add_argument(
        "--max-model-input-length",
        required=False,
        default=None,
        help="""Set the model's max input length.
        Usually the max input length is derived from the Hugging Face
        model configuration. Use this option to explicitly set the model's
        max input length if the value can not be found in the Hugging
        Face configuration. Max input length should never exceed the
        model's true max length, setting a smaller max length is valid.
        """,
        type=int,
    )
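
A sketch of how the three new flags combine on the command line; the E5-style prefixes and the 512 length are illustrative values for this example, not defaults:

```bash
eland_import_hub_model \
  --url 'http://localhost:9200' \
  --hub-model-id intfloat/e5-small-v2 \
  --task-type text_embedding \
  --ingest-prefix 'passage: ' \
  --search-prefix 'query: ' \
  --max-model-input-length 512 \
  --start
```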
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = get_arg_parser()
|
||||
try:
|
||||
return parser.parse_args()
|
||||
except argparse.ArgumentError as argument_error:
|
||||
if argument_error.argument_name == "--task-type":
|
||||
message = (
|
||||
argument_error.message
|
||||
+ "\n\nUse 'text_similarity' for rerank tasks in Elasticsearch"
|
||||
)
|
||||
parser.error(message=message)
|
||||
else:
|
||||
parser.error(message=argument_error.message)
|
||||
except argparse.ArgumentTypeError as type_error:
|
||||
parser.error(str(type_error))
|
||||
|
||||
|
||||
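A minimal sketch (not part of the diff) of the behaviour parse_args() builds on:
with exit_on_error=False, argparse raises ArgumentError instead of exiting, so the
CLI can append the rerank hint before delegating to parser.error(). The two-choice
parser below is a stand-in, not the real one from get_arg_parser().

    import argparse

    parser = argparse.ArgumentParser(exit_on_error=False)
    parser.add_argument("--task-type", choices=["text_embedding", "text_similarity"])
    try:
        parser.parse_args(["--task-type", "rerank"])
    except argparse.ArgumentError as err:
        # Same pattern as parse_args() above: enrich the message, then report it.
        print(err.message + "\n\nUse 'text_similarity' for rerank tasks in Elasticsearch")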
def get_es_client(cli_args, logger):
    try:
        es_args = {
            "request_timeout": 300,
            "verify_certs": cli_args.insecure,
            "ca_certs": cli_args.ca_certs,
            "node_class": "requests",
        }

        # Deployment location
@@ -167,13 +215,20 @@ def get_es_client(cli_args, logger):

def check_cluster_version(es_client, logger):
    es_info = es_client.info()

    if is_serverless_es(es_client):
        logger.info(f"Connected to serverless cluster '{es_info['cluster_name']}'")
        # Serverless is compatible
        # Return the latest known semantic version, i.e. this version
        return parse_es_version(__version__)

    # check the semantic version for non-serverless clusters
    logger.info(
        f"Connected to cluster named '{es_info['cluster_name']}' (version: {es_info['version']['number']})"
    )

    sem_ver = parse_es_version(es_info["version"]["number"])
    major_version = sem_ver[0]
    minor_version = sem_ver[1]

    # NLP models added in 8
    if major_version < 8:
@@ -181,14 +236,9 @@ def check_cluster_version(es_client, logger):
            f"Elasticsearch version {major_version} does not support NLP models. Please upgrade Elasticsearch to the latest version"
        )
        exit(1)

    # PyTorch was upgraded to version 1.13.1 in 8.7.
    # and is incompatible with earlier versions
    if major_version == 8 and minor_version < 7:
        import torch

    elif major_version < 9:
        logger.error(
            f"Eland uses PyTorch version {torch.__version__} which is incompatible with Elasticsearch versions prior to 8.7. Please upgrade Elasticsearch to at least version 8.7"
            "Eland 9.x does not support Elasticsearch 8.x. Please upgrade Elasticsearch first."
        )
        exit(1)

@@ -207,6 +257,7 @@ def main():
            SUPPORTED_TASK_TYPES,
            TaskTypeError,
            TransformerModel,
            UnknownModelInputSizeError,
        )
    except ModuleNotFoundError as e:
        logger.error(
@@ -224,7 +275,7 @@
    assert SUPPORTED_TASK_TYPES

    # Parse arguments
    args = get_arg_parser().parse_args()
    args = parse_args()

    # Connect to ES
    logger.info("Establishing connection to Elasticsearch")
@@ -244,6 +295,9 @@
            task_type=args.task_type,
            es_version=cluster_version,
            quantize=args.quantize,
            ingest_prefix=args.ingest_prefix,
            search_prefix=args.search_prefix,
            max_model_input_size=args.max_model_input_length,
        )
        model_path, config, vocab_path = tm.save(tmp_dir)
    except TaskTypeError as err:
@@ -251,6 +305,12 @@
            f"Failed to get model for task type, please provide a valid task type via the '--task-type' parameter. Caused by {err}"
        )
        exit(1)
    except UnknownModelInputSizeError as err:
        logger.error(
            f"""Could not automatically determine the model's max input size from the model configuration.
            Please provide the max input size via the --max-model-input-length parameter. Caused by {err}"""
        )
        exit(1)

    ptm = PyTorchModel(
        es, args.es_model_id if args.es_model_id else tm.elasticsearch_model_id()
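A hypothetical invocation tying the new flags together (the model id, URL and
prefix strings are placeholders, and --hub-model-id is defined elsewhere in this
script, not in the hunks above):

    eland_import_hub_model \
        --url http://localhost:9200 \
        --hub-model-id intfloat/multilingual-e5-small \
        --task-type text_embedding \
        --ingest-prefix "passage: " \
        --search-prefix "query: " \
        --max-model-input-length 512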
@@ -41,7 +41,6 @@ if TYPE_CHECKING:
# Default number of rows displayed (different to pandas where ALL could be displayed)
DEFAULT_NUM_ROWS_DISPLAYED = 60
DEFAULT_CHUNK_SIZE = 10000
DEFAULT_CSV_BATCH_OUTPUT_SIZE = 10000
DEFAULT_PROGRESS_REPORTING_NUM_ROWS = 10000
DEFAULT_SEARCH_SIZE = 5000
DEFAULT_PIT_KEEP_ALIVE = "3m"
@@ -53,6 +52,10 @@ PANDAS_VERSION: Tuple[int, ...] = tuple(
_ELAND_MAJOR_VERSION = int(_eland_version.split(".")[0])


class ElandDeprecationWarning(DeprecationWarning):
    """Warning for deprecated functionality in Eland"""


with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    EMPTY_SERIES_DTYPE = pd.Series().dtype
@@ -306,12 +309,16 @@ def elasticsearch_date_to_pandas_date(


def ensure_es_client(
    es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch]
    es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch],
) -> Elasticsearch:
    if isinstance(es_client, tuple):
        es_client = list(es_client)
    if not isinstance(es_client, Elasticsearch):
        es_client = Elasticsearch(es_client)  # type: ignore[arg-type]
    if (
        isinstance(es_client, str)
        or isinstance(es_client, list)
        or isinstance(es_client, tuple)
    ):
        es_client = Elasticsearch(es_client)
    return es_client

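A short usage sketch of the normalisation above (the connection string is a
placeholder; no request is made until the client is used):

    from elasticsearch import Elasticsearch

    from eland.common import ensure_es_client

    # Each call below yields an Elasticsearch instance:
    client = ensure_es_client("http://localhost:9200")
    client = ensure_es_client(["http://localhost:9200"])
    client = ensure_es_client(("http://localhost:9200",))
    client = ensure_es_client(Elasticsearch("http://localhost:9200"))  # passed through unchanged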
@@ -341,6 +348,17 @@ def es_version(es_client: Elasticsearch) -> Tuple[int, int, int]:
    return eland_es_version


def is_serverless_es(es_client: Elasticsearch) -> bool:
    """
    Returns true if the client is connected to a serverless instance of Elasticsearch.
    """
    es_info = es_client.info()
    return (
        "build_flavor" in es_info["version"]
        and es_info["version"]["build_flavor"] == "serverless"
    )


def parse_es_version(version: str) -> Tuple[int, int, int]:
    """
    Parse the semantic version from a string e.g. '8.8.0'
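A quick sketch of the two helpers in use (the version string is an example; the
serverless check needs a live client because it calls client.info()):

    from eland.common import parse_es_version

    assert parse_es_version("8.8.0") == (8, 8, 0)
    # is_serverless_es(client) inspects info()["version"]["build_flavor"], which is
    # why check_cluster_version() can skip the semantic-version gate on serverless.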
@@ -34,7 +34,7 @@ from pandas.io.formats.printing import pprint_thing  # type: ignore
from pandas.util._validators import validate_bool_kwarg  # type: ignore

import eland.plotting as gfx
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
from eland.filter import BooleanFilter
from eland.groupby import DataFrameGroupBy
from eland.ndframe import NDFrame
@@ -83,7 +83,7 @@ class DataFrame(NDFrame):
    3 181.694216 True ... 0 2018-01-01 10:33:28
    4 730.041778 False ... 0 2018-01-01 05:13:00
    <BLANKLINE>
    [5 rows x 27 columns]
    [5 rows x 28 columns]

    Constructing DataFrame from an Elasticsearch client and an Elasticsearch index
@@ -173,13 +173,13 @@ class DataFrame(NDFrame):
        >>> df = ed.DataFrame('http://localhost:9200', 'flights')
        >>> assert isinstance(df.columns, pd.Index)
        >>> df.columns
        Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
        ... 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
        ... 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
        ... 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
        ... 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
        ... 'timestamp'],
        ... dtype='object')
        Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Cities', 'Dest', 'DestAirportID', 'DestCityName',
               'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
               'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
               'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
               'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
               'timestamp'],
              dtype='object')
        """
        return self._query_compiler.columns

@@ -411,9 +411,7 @@ class DataFrame(NDFrame):
            axis = pd.DataFrame._get_axis_name(axis)
            axes = {axis: labels}
        elif index is not None or columns is not None:
            axes, _ = pd.DataFrame()._construct_axes_from_arguments(
                (index, columns), {}
            )
            axes = {"columns": columns, "index": index}
        else:
            raise ValueError(
                "Need to specify at least one of 'labels', 'index' or 'columns'"
@@ -956,8 +954,10 @@ class DataFrame(NDFrame):
        elif verbose is False:  # specifically set to False, not necessarily None
            _non_verbose_repr()
        else:
            _non_verbose_repr() if exceeds_info_cols else _verbose_repr(
                number_of_columns
            (
                _non_verbose_repr()
                if exceeds_info_cols
                else _verbose_repr(number_of_columns)
            )

        # pandas 0.25.1 uses get_dtype_counts() here. This
@@ -1303,6 +1303,7 @@ class DataFrame(NDFrame):
        quoting=None,
        quotechar='"',
        line_terminator=None,
        lineterminator=None,
        chunksize=None,
        tupleize_cols=None,
        date_format=None,
@@ -1317,6 +1318,13 @@ class DataFrame(NDFrame):
        --------
        :pandas_api_docs:`pandas.DataFrame.to_csv`
        """
        if line_terminator:
            warnings.warn(
                "The line_terminator argument will be replaced by lineterminator",
                PendingDeprecationWarning,
                stacklevel=2,
            )

        kwargs = {
            "path_or_buf": path_or_buf,
            "sep": sep,
@@ -1331,7 +1339,7 @@ class DataFrame(NDFrame):
            "compression": compression,
            "quoting": quoting,
            "quotechar": quotechar,
            "line_terminator": line_terminator,
            "lineterminator": lineterminator or line_terminator,
            "chunksize": chunksize,
            "date_format": date_format,
            "doublequote": doublequote,
@@ -1340,6 +1348,50 @@
        }
        return self._query_compiler.to_csv(**kwargs)

    def to_json(
        self,
        path_or_buf=None,
        orient=None,
        date_format=None,
        double_precision=10,
        force_ascii=True,
        date_unit="ms",
        default_handler=None,
        lines=False,
        compression="infer",
        index=None,
        indent=None,
        storage_options=None,
    ):
        """Write Elasticsearch data to a JSON file.

        By setting the ``lines`` parameter to ``True``, and ``orient`` to ``'records'``,
        the entire DataFrame can be written in a streaming manner.
        Doing so avoids the need to have the entire DataFrame in memory.
        This format is known as JSON lines and can use the file extension ``.jsonl``.

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.to_json`
        """
        if index is None and PANDAS_VERSION[0] == 1:
            index = True  # switch to the pandas 1 default
        kwargs = {
            "path_or_buf": path_or_buf,
            "orient": orient,
            "date_format": date_format,
            "double_precision": double_precision,
            "force_ascii": force_ascii,
            "date_unit": date_unit,
            "default_handler": default_handler,
            "lines": lines,
            "compression": compression,
            "index": index,
            "indent": indent,
            "storage_options": storage_options,
        }
        return self._query_compiler.to_json(**kwargs)
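A usage sketch of the streaming path described in the docstring (assumes a local
cluster holding the example 'flights' index):

    import eland as ed

    df = ed.DataFrame("http://localhost:9200", "flights")
    # One JSON document per line; the frame is never fully materialised in memory.
    df.to_json("flights.jsonl", orient="records", lines=True)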

    def to_pandas(self, show_progress: bool = False) -> pd.DataFrame:
        """
        Utility method to convert eland.DataFrame to pandas.DataFrame
@@ -1962,9 +2014,9 @@ class DataFrame(NDFrame):
        --------
        >>> df = ed.DataFrame('http://localhost:9200', 'flights')
        >>> df.shape
        (13059, 27)
        (13059, 28)
        >>> df.query('FlightDelayMin > 60').shape
        (2730, 27)
        (2730, 28)
        """
        if isinstance(expr, BooleanFilter):
            return DataFrame(
eland/etl.py (43 changes)
@@ -16,6 +16,7 @@
# under the License.

import csv
import warnings
from collections import deque
from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union

@@ -110,15 +111,15 @@ def pandas_to_eland(
    2 3.141 1 ... 3 Long text - to be indexed as es type text
    <BLANKLINE>
    [3 rows x 8 columns]
    >>> pd_df.dtypes
    A float64
    B int64
    C object
    D datetime64[ns]
    E float64
    F bool
    G int64
    H object
    >>> pd_df.dtypes  # doctest skip required for pandas < 2 # doctest: +SKIP
    A float64
    B int64
    C object
    D datetime64[s]
    E float64
    F bool
    G int64
    H object
    dtype: object

    Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`.
@@ -262,7 +263,7 @@ def eland_to_pandas(ed_df: DataFrame, show_progress: bool = False) -> pd.DataFrame:
    3 181.694216 True ... 0 2018-01-01 10:33:28
    4 730.041778 False ... 0 2018-01-01 05:13:00
    <BLANKLINE>
    [5 rows x 27 columns]
    [5 rows x 28 columns]

    Convert `eland.DataFrame` to `pandas.DataFrame` (Note: this loads the entire Elasticsearch index into core memory)

@@ -277,7 +278,7 @@ def eland_to_pandas(ed_df: DataFrame, show_progress: bool = False) -> pd.DataFrame:
    3 181.694216 True ... 0 2018-01-01 10:33:28
    4 730.041778 False ... 0 2018-01-01 05:13:00
    <BLANKLINE>
    [5 rows x 27 columns]
    [5 rows x 28 columns]

    Convert `eland.DataFrame` to `pandas.DataFrame` and show progress every 10000 rows

@@ -307,9 +308,9 @@ def csv_to_eland(  # type: ignore
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    squeeze=None,
    prefix=None,
    mangle_dupe_cols=True,
    mangle_dupe_cols=None,
    # General Parsing Configuration
    dtype=None,
    engine=None,
@@ -357,6 +358,7 @@ def csv_to_eland(  # type: ignore
    low_memory: bool = _DEFAULT_LOW_MEMORY,
    memory_map=False,
    float_precision=None,
    **extra_kwargs,
) -> "DataFrame":
    """
    Read a comma-separated values (csv) file into eland.DataFrame (i.e. an Elasticsearch index).
@@ -485,7 +487,6 @@ def csv_to_eland(  # type: ignore
        "usecols": usecols,
        "verbose": verbose,
        "encoding": encoding,
        "squeeze": squeeze,
        "memory_map": memory_map,
        "float_precision": float_precision,
        "na_filter": na_filter,
@@ -494,9 +495,9 @@ def csv_to_eland(  # type: ignore
        "error_bad_lines": error_bad_lines,
        "on_bad_lines": on_bad_lines,
        "low_memory": low_memory,
        "mangle_dupe_cols": mangle_dupe_cols,
        "infer_datetime_format": infer_datetime_format,
        "skip_blank_lines": skip_blank_lines,
        **extra_kwargs,
    }

    if chunksize is None:
@@ -525,6 +526,18 @@ def csv_to_eland(  # type: ignore

        kwargs.pop("on_bad_lines")

    if "squeeze" in kwargs:
        kwargs.pop("squeeze")
        warnings.warn(
            "This argument no longer works, use .squeeze('columns') on your DataFrame instead"
        )

    if "mangle_dupe_cols" in kwargs:
        kwargs.pop("mangle_dupe_cols")
        warnings.warn(
            "The mangle_dupe_cols argument no longer works. Furthermore, "
            "duplicate columns will automatically get a number suffix."
        )
    # read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch)
    reader = pd.read_csv(filepath_or_buffer, **kwargs)
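A sketch of the replacement the squeeze warning points to (the file name is a
placeholder; shown with pandas since the squeeze now happens client-side):

    import pandas as pd

    pd_df = pd.read_csv("data.csv")    # hypothetical one-column CSV
    series = pd_df.squeeze("columns")  # replaces the old read_csv(..., squeeze=True)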
@@ -36,10 +36,10 @@ import pandas as pd  # type: ignore
from pandas.core.dtypes.common import (  # type: ignore
    is_bool_dtype,
    is_datetime64_any_dtype,
    is_datetime_or_timedelta_dtype,
    is_float_dtype,
    is_integer_dtype,
    is_string_dtype,
    is_timedelta64_dtype,
)
from pandas.core.dtypes.inference import is_list_like

@@ -87,7 +87,9 @@ class Field(NamedTuple):

    @property
    def is_timestamp(self) -> bool:
        return is_datetime_or_timedelta_dtype(self.pd_dtype)
        return is_datetime64_any_dtype(self.pd_dtype) or is_timedelta64_dtype(
            self.pd_dtype
        )

    @property
    def is_bool(self) -> bool:
@@ -441,9 +443,9 @@ class FieldMappings:
        try:
            series = df.loc[df.es_field_name == es_field_name_keyword]
            if not series.empty and series.is_aggregatable.squeeze():
                row_as_dict[
                    "aggregatable_es_field_name"
                ] = es_field_name_keyword
                row_as_dict["aggregatable_es_field_name"] = (
                    es_field_name_keyword
                )
            else:
                row_as_dict["aggregatable_es_field_name"] = None
        except KeyError:
@@ -507,7 +509,7 @@ class FieldMappings:
            es_dtype = "boolean"
        elif is_string_dtype(pd_dtype):
            es_dtype = "keyword"
        elif is_datetime_or_timedelta_dtype(pd_dtype):
        elif is_timedelta64_dtype(pd_dtype):
            es_dtype = "date"
        elif is_datetime64_any_dtype(pd_dtype):
            es_dtype = "date"
@@ -710,8 +712,11 @@ class FieldMappings:
            capabilities, orient="index", columns=FieldMappings.column_labels
        )

        self._mappings_capabilities = self._mappings_capabilities.append(
            capability_matrix_row
        self._mappings_capabilities = pd.concat(
            [
                self._mappings_capabilities,
                capability_matrix_row,
            ]
        )

    def numeric_source_fields(self) -> List[str]:
@@ -792,7 +797,9 @@ class FieldMappings:
                pd_dtypes.append(np.dtype(pd_dtype))
                es_field_names.append(es_field_name)
                es_date_formats.append(es_date_format)
            elif include_timestamp and is_datetime_or_timedelta_dtype(pd_dtype):
            elif include_timestamp and (
                is_datetime64_any_dtype(pd_dtype) or is_timedelta64_dtype(pd_dtype)
            ):
                pd_dtypes.append(np.dtype(pd_dtype))
                es_field_names.append(es_field_name)
                es_date_formats.append(es_date_format)
@@ -50,10 +50,7 @@ class Index:
        # index_field.setter
        self._is_source_field = False

        # The type:ignore is due to mypy not being smart enough
        # to recognize the property.setter has a different type
        # than the property.getter.
        self.es_index_field = es_index_field  # type: ignore
        self.es_index_field = es_index_field

    @property
    def sort_field(self) -> str:
@@ -19,7 +19,7 @@ import base64
import gzip
import json
from abc import ABC
from typing import Any, Dict, List, Optional, Sequence
from typing import Any, Dict, List, Optional, Sequence, Tuple


def add_if_exists(d: Dict[str, Any], k: str, v: Any) -> None:
@@ -58,6 +58,9 @@ class ModelSerializer(ABC):
            "ascii"
        )

    def bounds(self) -> Tuple[float, float]:
        raise NotImplementedError


class TreeNode:
    def __init__(
@@ -96,6 +99,7 @@ class TreeNode:
            add_if_exists(d, "split_feature", self._split_feature)
            add_if_exists(d, "threshold", self._threshold)
            add_if_exists(d, "number_samples", self._number_samples)
            add_if_exists(d, "default_left", self._default_left)
        else:
            if len(self._leaf_value) == 1:
                # Support Elasticsearch 7.6 which only
@@ -128,6 +132,14 @@ class Tree(ModelSerializer):
        add_if_exists(d, "tree_structure", [t.to_dict() for t in self._tree_structure])
        return {"tree": d}

    def bounds(self) -> Tuple[float, float]:
        leaf_values = [
            tree_node._leaf_value[0]
            for tree_node in self._tree_structure
            if tree_node._leaf_value is not None
        ]
        return min(leaf_values), max(leaf_values)


class Ensemble(ModelSerializer):
    def __init__(
@@ -157,3 +169,9 @@ class Ensemble(ModelSerializer):
        add_if_exists(d, "classification_weights", self._classification_weights)
        add_if_exists(d, "aggregate_output", self._output_aggregator)
        return {"ensemble": d}

    def bounds(self) -> Tuple[float, float]:
        min_bound, max_bound = tuple(
            map(sum, zip(*[model.bounds() for model in self._trained_models]))
        )
        return min_bound, max_bound
@@ -16,4 +16,5 @@
# under the License.

TYPE_CLASSIFICATION = "classification"
TYPE_LEARNING_TO_RANK = "learning_to_rank"
TYPE_REGRESSION = "regression"
@@ -1,222 +0,0 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from typing import Any, Dict

import numpy as np

from .._optional import import_optional_dependency

import_optional_dependency("sklearn", on_version="warn")

import sklearn
from sklearn.preprocessing import FunctionTransformer


class Tree:
    """Wrapper to create sklearn Tree objects from the Elastic ML tree
    description in JSON format.
    """

    def __init__(
        self,
        json_tree: Dict[str, Any],
        feature_names_map: Dict[str, int],
    ):
        tree_leaf = -1

        node_count = len(json_tree["tree_structure"])
        children_left = np.ones((node_count,), dtype=int) * tree_leaf
        children_right = np.ones((node_count,), dtype=int) * tree_leaf
        feature = np.ones((node_count,), dtype=int) * -2
        threshold = np.ones((node_count,), dtype=float) * -2
        impurity = np.zeros((node_count,), dtype=float)
        # value works only for regression and binary classification
        value = np.zeros((node_count, 1, 1), dtype="<f8")
        n_node_samples = np.zeros((node_count,), dtype=int)

        # parse values from the JSON tree
        feature_names = json_tree["feature_names"]
        for json_node in json_tree["tree_structure"]:
            node_id = json_node["node_index"]
            if "number_samples" in json_node:
                n_node_samples[node_id] = json_node["number_samples"]
            else:
                n_node_samples[node_id] = 0

            if "leaf_value" not in json_node:
                children_left[node_id] = json_node["left_child"]
                children_right[node_id] = json_node["right_child"]
                feature[node_id] = feature_names_map[
                    feature_names[json_node["split_feature"]]
                ]
                threshold[node_id] = json_node["threshold"]
                if "split_gain" in json_node:
                    impurity[node_id] = json_node["split_gain"]
                else:
                    impurity[node_id] = -1
            else:
                value[node_id, 0, 0] = json_node["leaf_value"]

        # iterate through tree to get max depth and expected values
        weighted_n_node_samples = n_node_samples.copy()
        self.max_depth = Tree._compute_expectations(
            children_left=children_left,
            children_right=children_right,
            node_sample_weight=weighted_n_node_samples,
            values=value,
            node_index=0,
        )
        self.n_outputs = value.shape[-1]

        # initialize the sklearn tree
        self.tree = sklearn.tree._tree.Tree(
            len(feature_names), np.array([1], dtype=int), 1
        )
        node_state = np.array(
            [
                (
                    children_left[i],
                    children_right[i],
                    feature[i],
                    threshold[i],
                    impurity[i],
                    n_node_samples[i],
                    weighted_n_node_samples[i],
                    True,
                )
                for i in range(node_count)
            ],
            dtype={
                "names": [
                    "left_child",
                    "right_child",
                    "feature",
                    "threshold",
                    "impurity",
                    "n_node_samples",
                    "weighted_n_node_samples",
                    "missing_go_to_left",
                ],
                "formats": ["<i8", "<i8", "<i8", "<f8", "<f8", "<i8", "<f8", "u1"],
            },
        )
        state = {
            "max_depth": self.max_depth,
            "node_count": node_count,
            "nodes": node_state,
            "values": value,
        }
        self.tree.__setstate__(state)

    @staticmethod
    def _compute_expectations(
        children_left, children_right, node_sample_weight, values, node_index
    ) -> int:
        if children_right[node_index] == -1:
            return 0

        left_index = children_left[node_index]
        right_index = children_right[node_index]
        depth_left = Tree._compute_expectations(
            children_left, children_right, node_sample_weight, values, left_index
        )
        depth_right = Tree._compute_expectations(
            children_left, children_right, node_sample_weight, values, right_index
        )
        left_weight = node_sample_weight[left_index]
        right_weight = node_sample_weight[right_index]

        v = (
            (
                left_weight * values[left_index, :]
                + right_weight * values[right_index, :]
            )
            / (left_weight + right_weight)
            if left_weight + right_weight > 0
            else 0
        )
        values[node_index, :] = v
        return max(depth_left, depth_right) + 1


class TargetMeanEncoder(FunctionTransformer):
    """FunctionTransformer implementation of the target mean encoder, which is
    deserialized from the Elastic ML preprocessor description in JSON format.
    """

    def __init__(self, preprocessor: Dict[str, Any]):
        self.preprocessor = preprocessor
        target_map = self.preprocessor["target_mean_encoding"]["target_map"]
        feature_name_out = self.preprocessor["target_mean_encoding"]["feature_name"]
        self.field_name_in = self.preprocessor["target_mean_encoding"]["field"]
        fallback_value = self.preprocessor["target_mean_encoding"]["default_value"]

        def func(column):
            return np.array(
                [
                    target_map[str(category)]
                    if category in target_map
                    else fallback_value
                    for category in column
                ]
            ).reshape(-1, 1)

        def feature_names_out(ft, carr):
            return [feature_name_out if c == self.field_name_in else c for c in carr]

        super().__init__(func=func, feature_names_out=feature_names_out)


class FrequencyEncoder(FunctionTransformer):
    """FunctionTransformer implementation of the frequency encoder, which is
    deserialized from the Elastic ML preprocessor description in JSON format.
    """

    def __init__(self, preprocessor: Dict[str, Any]):
        self.preprocessor = preprocessor
        frequency_map = self.preprocessor["frequency_encoding"]["frequency_map"]
        feature_name_out = self.preprocessor["frequency_encoding"]["feature_name"]
        self.field_name_in = self.preprocessor["frequency_encoding"]["field"]
        fallback_value = 0.0

        def func(column):
            return np.array(
                [
                    frequency_map[str(category)]
                    if category in frequency_map
                    else fallback_value
                    for category in column
                ]
            ).reshape(-1, 1)

        def feature_names_out(ft, carr):
            return [feature_name_out if c == self.field_name_in else c for c in carr]

        super().__init__(func=func, feature_names_out=feature_names_out)


class OneHotEncoder(sklearn.preprocessing.OneHotEncoder):
    """Wrapper for the sklearn one-hot encoder, which is deserialized from the
    Elastic ML preprocessor description in JSON format.
    """

    def __init__(self, preprocessor: Dict[str, Any]):
        self.preprocessor = preprocessor
        self.field_name_in = self.preprocessor["one_hot_encoding"]["field"]
        self.cats = [list(self.preprocessor["one_hot_encoding"]["hot_map"].keys())]
        super().__init__(categories=self.cats, handle_unknown="ignore")
@@ -1,46 +0,0 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import eland


class ModelDefinitionKeyError(Exception):
    """
    This exception is raised when a key is not found in the model definition.

    Attributes:
        missed_key (str): The key that was not found in the model definition.
        available_keys (List[str]): The list of keys that are available in the model definition.

    Examples:
        model_definition = {"key1": "value1", "key2": "value2"}
        try:
            model_definition["key3"]
        except KeyError as ex:
            raise ModelDefinitionKeyError(ex) from ex
    """

    def __init__(self, ex: KeyError):
        self.missed_key = ex.args[0]

    def __str__(self):
        return (
            f'Key "{self.missed_key}" is not available. '
            + "The model definition may have changed. "
            + "Make sure you are using an Elasticsearch version compatible "
            + f"with Eland {eland.__version__}."
        )
@@ -1,472 +0,0 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from abc import ABC
from typing import Any, List, Literal, Mapping, Optional, Set, Tuple, Union

import numpy as np
from elasticsearch import Elasticsearch
from numpy.typing import ArrayLike

from .._optional import import_optional_dependency

import_optional_dependency("sklearn", on_version="warn")

from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble._gb_losses import (
    BinomialDeviance,
    HuberLossFunction,
    LeastSquaresError,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils.validation import check_array

from eland.common import ensure_es_client
from eland.ml.common import TYPE_CLASSIFICATION, TYPE_REGRESSION

from ._sklearn_deserializers import Tree
from .common import ModelDefinitionKeyError


class ESGradientBoostingModel(ABC):
    """
    Abstract class for converting an Elastic ML model into an sklearn Pipeline.
    """

    def __init__(
        self,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
    ) -> None:
        """
        Parameters
        ----------
        es_client : Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance
        model_id : str
            The unique identifier of the trained inference model in Elasticsearch.

        Raises
        ------
        RuntimeError
            On failure to retrieve trained model information for the specified model ID.
        ValueError
            The model is expected to be trained in Elastic Stack. Models initially imported
            from xgboost, lgbm, or sklearn are not supported.
        """
        self.es_client: Elasticsearch = ensure_es_client(es_client)
        self.model_id = model_id

        self._trained_model_result = self.es_client.ml.get_trained_models(
            model_id=self.model_id,
            decompress_definition=True,
            include=["hyperparameters", "definition"],
        )

        if (
            "trained_model_configs" not in self._trained_model_result
            or len(self._trained_model_result["trained_model_configs"]) == 0
        ):
            raise RuntimeError(
                f"Failed to retrieve the trained model for model ID {self.model_id!r}"
            )

        if "metadata" not in self._trained_model_result["trained_model_configs"][0]:
            raise ValueError(
                "Error initializing sklearn classifier. Incorrect prior class probability. "
                + "Note: only export of models trained in the Elastic Stack is supported."
            )
        preprocessors = []
        if "preprocessors" in self._definition:
            preprocessors = self._definition["preprocessors"]
        (
            self.feature_names_in_,
            self.input_field_names,
        ) = ESGradientBoostingModel._get_feature_names_in_(
            preprocessors,
            self._definition["trained_model"]["ensemble"]["feature_names"],
            self._trained_model_result["trained_model_configs"][0]["input"][
                "field_names"
            ],
        )

        feature_names_map = {name: i for i, name in enumerate(self.feature_names_in_)}

        trained_models = self._definition["trained_model"]["ensemble"]["trained_models"]
        self._trees = []
        for trained_model in trained_models:
            self._trees.append(Tree(trained_model["tree"], feature_names_map))

        # 0's tree is the constant estimator
        self.n_estimators = len(trained_models) - 1

    def _initialize_estimators(self, decision_tree_type) -> None:
        self.estimators_ = np.ndarray(
            (len(self._trees) - 1, 1), dtype=decision_tree_type
        )
        self.n_estimators_ = self.estimators_.shape[0]

        for i in range(self.n_estimators_):
            estimator = decision_tree_type()
            estimator.tree_ = self._trees[i + 1].tree
            estimator.n_features_in_ = self.n_features_in_
            estimator.max_depth = self._max_depth
            estimator.max_features_ = self.max_features_
            self.estimators_[i, 0] = estimator

    def _extract_common_parameters(self) -> None:
        self.n_features_in_ = len(self.feature_names_in_)
        self.max_features_ = self.n_features_in_

    @property
    def _max_depth(self) -> int:
        return max(map(lambda x: x.max_depth, self._trees))

    @property
    def _n_outputs(self) -> int:
        return self._trees[0].n_outputs

    @property
    def _definition(self) -> Mapping[Union[str, int], Any]:
        return self._trained_model_result["trained_model_configs"][0]["definition"]

    @staticmethod
    def _get_feature_names_in_(
        preprocessors, feature_names, field_names
    ) -> Tuple[List[str], Set[str]]:
        input_field_names = set()

        def add_input_field_name(preprocessor_type: str, feature_name: str) -> None:
            if feature_name in feature_names:
                input_field_names.add(preprocessor[preprocessor_type]["field"])

        for preprocessor in preprocessors:
            if "target_mean_encoding" in preprocessor:
                add_input_field_name(
                    "target_mean_encoding",
                    preprocessor["target_mean_encoding"]["feature_name"],
                )
            elif "frequency_encoding" in preprocessor:
                add_input_field_name(
                    "frequency_encoding",
                    preprocessor["frequency_encoding"]["feature_name"],
                )
            elif "one_hot_encoding" in preprocessor:
                for feature_name in preprocessor["one_hot_encoding"][
                    "hot_map"
                ].values():
                    add_input_field_name("one_hot_encoding", feature_name)

        for field_name in field_names:
            if field_name in feature_names and field_name not in input_field_names:
                input_field_names.add(field_name)

        return feature_names, input_field_names

    @property
    def preprocessors(self) -> List[Any]:
        """
        Returns the list of preprocessor JSON definitions.

        Returns
        -------
        List[Any]
            List of preprocessor definitions or [].
        """
        if "preprocessors" in self._definition:
            return self._definition["preprocessors"]
        return []

    def fit(self, X, y, sample_weight=None, monitor=None) -> None:
        """
        Override of the sklearn fit() method. It does nothing since Elastic ML models are
        trained in the Elastic Stack or imported.
        """
        # Do nothing, the model is fitted using the Elasticsearch API
        pass


class ESGradientBoostingClassifier(ESGradientBoostingModel, GradientBoostingClassifier):
    """
    Elastic ML model wrapper compatible with sklearn GradientBoostingClassifier.
    """

    def __init__(
        self,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
    ) -> None:
        """
        Parameters
        ----------
        es_client : Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance
        model_id : str
            The unique identifier of the trained inference model in Elasticsearch.

        Raises
        ------
        NotImplementedError
            Multi-class classification is not supported at the moment.
        ValueError
            The classifier should be defined for at least 2 classes.
        ModelDefinitionKeyError
            If required data cannot be extracted from the model definition due to a schema change.
        """

        try:
            ESGradientBoostingModel.__init__(self, es_client, model_id)
            self._extract_common_parameters()
            GradientBoostingClassifier.__init__(
                self,
                learning_rate=1.0,
                n_estimators=self.n_estimators,
                max_depth=self._max_depth,
            )

            if "classification_labels" in self._definition["trained_model"]["ensemble"]:
                self.classes_ = np.array(
                    self._definition["trained_model"]["ensemble"][
                        "classification_labels"
                    ]
                )
            else:
                self.classes_ = None

            self.n_outputs = self._n_outputs
            if self.classes_ is not None:
                self.n_classes_ = len(self.classes_)
            elif self.n_outputs <= 2:
                self.n_classes_ = 2
            else:
                self.n_classes_ = self.n_outputs

            if self.n_classes_ == 2:
                self._loss = BinomialDeviance(self.n_classes_)
                # self.n_outputs = 1
            elif self.n_classes_ > 2:
                raise NotImplementedError("Only binary classification is implemented.")
            else:
                raise ValueError(f"At least 2 classes required. got {self.n_classes_}.")

            self.init_ = self._initialize_init_()
            self._initialize_estimators(DecisionTreeClassifier)
        except KeyError as ex:
            raise ModelDefinitionKeyError(ex) from ex

    @property
    def analysis_type(self) -> Literal["classification"]:
        return TYPE_CLASSIFICATION

    def _initialize_init_(self) -> DummyClassifier:
        estimator = DummyClassifier(strategy="prior")

        estimator.n_classes_ = self.n_classes_
        estimator.n_outputs_ = self.n_outputs
        estimator.classes_ = np.arange(self.n_classes_)
        estimator._strategy = estimator.strategy

        if self.n_classes_ == 2:
            log_odds = self._trees[0].tree.value.flatten()[0]
            if np.isnan(log_odds):
                raise ValueError(
                    "Error initializing sklearn classifier. Incorrect prior class probability. "
                    + "Note: only export of models trained in the Elastic Stack is supported."
                )
            class_prior = 1 / (1 + np.exp(-log_odds))
            estimator.class_prior_ = np.array([1 - class_prior, class_prior])
        else:
            raise NotImplementedError("Only binary classification is implemented.")

        return estimator

    def predict_proba(
        self, X, feature_names_in: Optional[Union["ArrayLike", List[str]]] = None
    ) -> "ArrayLike":
        """Predict class probabilities for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        feature_names_in : {array of string, list of string} of length n_features.
            Feature names of the corresponding columns in X. Important, since the column list
            can be extended by ColumnTransformer through the pipeline. By default None.

        Returns
        -------
        ArrayLike of shape (n_samples, n_classes)
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
        """
        if feature_names_in is not None:
            if X.shape[1] != len(feature_names_in):
                raise ValueError(
                    f"Dimension mismatch: X with {X.shape[1]} columns has to be the same size as feature_names_in with {len(feature_names_in)}."
                )
            if isinstance(feature_names_in, np.ndarray):
                feature_names_in = feature_names_in.tolist()
            # select columns used by the model in the correct order
            X = X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]

        X = check_array(X)
        return GradientBoostingClassifier.predict_proba(self, X)

    def predict(
        self,
        X: "ArrayLike",
        feature_names_in: Optional[Union["ArrayLike", List[str]]] = None,
    ) -> "ArrayLike":
        """Predict class for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        feature_names_in : {array of string, list of string} of length n_features.
            Feature names of the corresponding columns in X. Important, since the column list
            can be extended by ColumnTransformer through the pipeline. By default None.

        Returns
        -------
        ArrayLike of shape (n_samples,)
            The predicted values.
        """
        if feature_names_in is not None:
            if X.shape[1] != len(feature_names_in):
                raise ValueError(
                    f"Dimension mismatch: X with {X.shape[1]} columns has to be the same size as feature_names_in with {len(feature_names_in)}."
                )
            if isinstance(feature_names_in, np.ndarray):
                feature_names_in = feature_names_in.tolist()
            # select columns used by the model in the correct order
            X = X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]

        X = check_array(X)
        return GradientBoostingClassifier.predict(self, X)


class ESGradientBoostingRegressor(ESGradientBoostingModel, GradientBoostingRegressor):
    """
    Elastic ML model wrapper compatible with sklearn GradientBoostingRegressor.
    """

    def __init__(
        self,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
    ) -> None:
        """
        Parameters
        ----------
        es_client : Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance
        model_id : str
            The unique identifier of the trained inference model in Elasticsearch.

        Raises
        ------
        NotImplementedError
            Only MSE, MSLE, and Huber loss functions are supported.
        ModelDefinitionKeyError
            If required data cannot be extracted from the model definition due to a schema change.
        """
        try:
            ESGradientBoostingModel.__init__(self, es_client, model_id)
            self._extract_common_parameters()
            GradientBoostingRegressor.__init__(
                self,
                learning_rate=1.0,
                n_estimators=self.n_estimators,
                max_depth=self._max_depth,
            )

            self.n_outputs = 1
            loss_function = self._trained_model_result["trained_model_configs"][0][
                "metadata"
            ]["analytics_config"]["analysis"][self.analysis_type]["loss_function"]
            if loss_function == "mse" or loss_function == "msle":
                self.criterion = "squared_error"
                self._loss = LeastSquaresError()
            elif loss_function == "huber":
                loss_parameter = loss_function = self._trained_model_result[
                    "trained_model_configs"
                ][0]["metadata"]["analytics_config"]["analysis"][self.analysis_type][
                    "loss_function_parameter"
                ]
                self.criterion = "huber"
                self._loss = HuberLossFunction(loss_parameter)
            else:
                raise NotImplementedError(
                    "Only MSE, MSLE and Huber loss functions are supported."
                )

            self.init_ = self._initialize_init_()
            self._initialize_estimators(DecisionTreeRegressor)
        except KeyError as ex:
            raise ModelDefinitionKeyError(ex) from ex

    @property
    def analysis_type(self) -> Literal["regression"]:
        return TYPE_REGRESSION

    def _initialize_init_(self) -> DummyRegressor:
        constant = self._trees[0].tree.value[0]
        estimator = DummyRegressor(
            strategy="constant",
            constant=constant,
        )
        estimator.constant_ = np.array([constant])
        estimator.n_outputs_ = 1
        return estimator

    def predict(
        self,
        X: "ArrayLike",
        feature_names_in: Optional[Union["ArrayLike", List[str]]] = None,
    ) -> "ArrayLike":
        """Predict targets for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        feature_names_in : {array of string, list of string} of length n_features.
            Feature names of the corresponding columns in X. Important, since the column list
            can be extended by ColumnTransformer through the pipeline. By default None.

        Returns
        -------
        ArrayLike of shape (n_samples,)
            The predicted values.
        """
        if feature_names_in is not None:
            if X.shape[1] != len(feature_names_in):
                raise ValueError(
                    f"Dimension mismatch: X with {X.shape[1]} columns has to be the same size as feature_names_in with {len(feature_names_in)}."
                )
            if isinstance(X, np.ndarray):
                feature_names_in = feature_names_in.tolist()
            # select columns used by the model in the correct order
            X = X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]

        X = check_array(X)
        return GradientBoostingRegressor.predict(self, X)
@@ -14,3 +14,13 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from .feature_logger import FeatureLogger
from .ltr_model_config import FeatureExtractor, LTRModelConfig, QueryFeatureExtractor

__all__ = [
    "LTRModelConfig",
    "QueryFeatureExtractor",
    "FeatureExtractor",
    "FeatureLogger",
]
eland/ml/ltr/feature_logger.py (new file, 162 lines)
@@ -0,0 +1,162 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import json
from functools import cached_property
from typing import TYPE_CHECKING, Any, List, Mapping, Tuple, Union

from eland.common import ensure_es_client
from eland.ml.ltr.ltr_model_config import LTRModelConfig

if TYPE_CHECKING:
    from elasticsearch import Elasticsearch


class FeatureLogger:
    """
    A class that is used during model training to extract features from the judgment list.
    """

    def __init__(
        self,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        es_index: str,
        ltr_model_config: LTRModelConfig,
    ):
        """
        Parameters
        ----------
        es_client: Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance

        es_index: str
            Name of the Elasticsearch index used for feature extraction.

        ltr_model_config: LTRModelConfig
            LTR model config used to extract features.
        """
        self._model_config = ltr_model_config
        self._client: Elasticsearch = ensure_es_client(es_client)
        self._index_name = es_index

    def extract_features(
        self, query_params: Mapping[str, Any], doc_ids: List[str]
    ) -> Mapping[str, List[float]]:
        """
        Extract document features.

        Parameters
        ----------
        query_params: Mapping[str, Any]
            List of template params used during feature extraction.

        doc_ids: List[str]
            List of doc ids.

        Example
        -------
        >>> from eland.ml.ltr import FeatureLogger, LTRModelConfig, QueryFeatureExtractor

        >>> ltr_model_config=LTRModelConfig(
        ...     feature_extractors=[
        ...         QueryFeatureExtractor(
        ...             feature_name='title_bm25',
        ...             query={ "match": { "title": "{{query}}" } }
        ...         ),
        ...         QueryFeatureExtractor(
        ...             feature_name='description_bm25',
        ...             query={ "match": { "description": "{{query}}" } }
        ...         )
        ...     ]
        ... )

        >>> feature_logger = FeatureLogger(
        ...     es_client='http://localhost:9200',
        ...     es_index='national_parks',
        ...     ltr_model_config=ltr_model_config
        ... )

        >>> doc_features = feature_logger.extract_features(query_params={"query": "yosemite"}, doc_ids=["park-yosemite", "park-everglade"])
        """

        doc_features = {
            doc_id: [float("nan")] * len(self._model_config.feature_extractors)
            for doc_id in doc_ids
        }

        for doc_id, query_features in self._extract_query_features(
            query_params, doc_ids
        ).items():
            for feature_name, feature_value in query_features.items():
                doc_features[doc_id][
                    self._model_config.feature_index(feature_name)
                ] = feature_value

        return doc_features

    def _to_named_query(
        self, query: Mapping[str, Mapping[str, any]], query_name: str
    ) -> Mapping[str, Mapping[str, any]]:
        return {"bool": {"must": query, "_name": query_name}}

    @cached_property
    def _script_source(self) -> str:
        query_extractors = self._model_config.query_feature_extractors
        queries = [
            self._to_named_query(extractor.query, extractor.feature_name)
            for extractor in query_extractors
        ]

        return (
            json.dumps(
                {
                    "query": {
                        "bool": {
                            "should": queries,
                            "filter": {"ids": {"values": "##DOC_IDS_JSON##"}},
                        }
                    },
                    "size": "##DOC_IDS_SIZE##",
                    "_source": False,
                }
            )
            .replace('"##DOC_IDS_JSON##"', "{{#toJson}}__doc_ids{{/toJson}}")
            .replace('"##DOC_IDS_SIZE##"', "{{__size}}")
        )

    def _extract_query_features(
        self, query_params: Mapping[str, Any], doc_ids: List[str]
    ):
        # When support for include_named_queries_score is added,
        # this will be replaced by a call to the client's search_template method.
        from elasticsearch._sync.client import _quote

        __path = f"/{_quote(self._index_name)}/_search/template"
        __query = {"include_named_queries_score": True}
        __headers = {"accept": "application/json", "content-type": "application/json"}
        __body = {
            "source": self._script_source,
            "params": {**query_params, "__doc_ids": doc_ids, "__size": len(doc_ids)},
        }

        return {
            hit["_id"]: hit["matched_queries"] if "matched_queries" in hit else {}
            for hit in self._client.perform_request(
                "GET", __path, params=__query, headers=__headers, body=__body
            )["hits"]["hits"]
        }
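A minimal sketch of the placeholder trick used in _script_source above: the search
template must contain unquoted mustache expressions, which json.dumps cannot emit
directly, so quoted sentinels are swapped out after serialisation (the "##SIZE##"
sentinel here is illustrative).

    import json

    template = json.dumps({"size": "##SIZE##"}).replace('"##SIZE##"', "{{__size}}")
    print(template)  # {"size": {{__size}}} - valid mustache, not valid JSON until rendered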
eland/ml/ltr/ltr_model_config.py (new file, 156 lines)
@@ -0,0 +1,156 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from functools import cached_property
from typing import Any, Dict, List, Mapping, Optional

from eland.ml.common import TYPE_LEARNING_TO_RANK


class FeatureExtractor:
    """
    A base class representing a generic feature extractor.
    """

    def __init__(self, type: str, feature_name: str):
        """
        Parameters
        ----------
        type: str
            Type of the feature extractor.

        feature_name: str
            Name of the extracted features.
        """
        self.feature_name = feature_name
        self.type = type

    def to_dict(self) -> Dict[str, Any]:
        """Convert the feature extractor into a dict that can be sent to ES as part of the inference config."""
        return {
            self.type: {
                k: v.to_dict() if hasattr(v, "to_dict") else v
                for k, v in self.__dict__.items()
                if v is not None and k != "type"
            }
        }


class QueryFeatureExtractor(FeatureExtractor):
    """
    A class used to define a query feature extractor.
    """

    def __init__(
        self,
        feature_name: str,
        query: Mapping[str, Any],
        default_score: Optional[float] = None,
    ):
        """
        Parameters
        ----------
        feature_name: str
            Name of the extracted features.

        query: Mapping[str, Any]
            Templated query used to extract the feature.

        default_score: Optional[float]
            Score used by default when the document does not match the query.

        Examples
        --------
        >>> from eland.ml.ltr import QueryFeatureExtractor

        >>> query_feature_extractor = QueryFeatureExtractor(
        ...     feature_name='title_bm25',
        ...     query={ "match": { "title": "{{query}}" } }
        ... )
        """
        super().__init__(feature_name=feature_name, type="query_extractor")
        self.query = query
        self.default_score = default_score


class LTRModelConfig:
    """
    A class representing LTR model configuration.
    """

    def __init__(self, feature_extractors: List[FeatureExtractor]):
        """
        Parameters
        ----------
        feature_extractors: List[FeatureExtractor]
            List of the feature extractors for the LTR model.

        Examples
        --------
        >>> from eland.ml.ltr import LTRModelConfig, QueryFeatureExtractor

        >>> ltr_model_config = LTRModelConfig(
        ...     feature_extractors=[
        ...         QueryFeatureExtractor(
        ...             feature_name='title_bm25',
        ...             query={ "match": { "title": "{{query}}" } }
        ...         ),
        ...         QueryFeatureExtractor(
        ...             feature_name='description_bm25',
        ...             query={ "match": { "description": "{{query}}" } }
        ...         )
        ...     ]
        ... )
        """
        self.feature_extractors = feature_extractors

    def to_dict(self) -> Mapping[str, Any]:
        """
        Convert the model configuration into a dict that can be sent to ES as an inference config.
        """
        return {
            TYPE_LEARNING_TO_RANK: {
                "feature_extractors": [
                    feature_extractor.to_dict()
                    for feature_extractor in self.feature_extractors
                ]
            }
        }

    @cached_property
    def feature_names(self) -> List[str]:
        """
        List of the feature names for the model.
        """
        return [extractor.feature_name for extractor in self.feature_extractors]

    @cached_property
    def query_feature_extractors(self) -> List[QueryFeatureExtractor]:
        """
        List of query feature extractors for the model.
        """
        return [
            extractor
            for extractor in self.feature_extractors
            if isinstance(extractor, QueryFeatureExtractor)
        ]

    def feature_index(self, feature_name: str) -> int:
        """Returns the index of the feature in the feature list."""
        return self.feature_names.index(feature_name)
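For reference, serializing a config like the docstring example yields the nested mapping Elasticsearch expects. A sketch of the expected shape, assuming TYPE_LEARNING_TO_RANK is the string "learning_to_rank":

# Sketch of LTRModelConfig.to_dict() output for a single query extractor
# (assumes TYPE_LEARNING_TO_RANK == "learning_to_rank"):
expected = {
    "learning_to_rank": {
        "feature_extractors": [
            {
                "query_extractor": {
                    "feature_name": "title_bm25",
                    "query": {"match": {"title": "{{query}}"}},
                }
            }
        ]
    }
}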
@@ -20,10 +20,11 @@ from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union

import elasticsearch
import numpy as np

from eland.common import ensure_es_client, es_version
from eland.common import ensure_es_client, es_version, is_serverless_es
from eland.utils import deprecated_api

from .common import TYPE_CLASSIFICATION, TYPE_REGRESSION
from .common import TYPE_CLASSIFICATION, TYPE_LEARNING_TO_RANK, TYPE_REGRESSION
from .ltr import LTRModelConfig
from .transformers import get_model_transformer

if TYPE_CHECKING:

@@ -37,7 +38,6 @@ if TYPE_CHECKING:
        RandomForestClassifier,
        RandomForestRegressor,
    )
    from sklearn.pipeline import Pipeline  # type: ignore # noqa: F401
    from sklearn.tree import (  # type: ignore # noqa: F401
        DecisionTreeClassifier,
        DecisionTreeRegressor,

@@ -45,7 +45,11 @@ if TYPE_CHECKING:
    except ImportError:
        pass
    try:
        from xgboost import XGBClassifier, XGBRegressor  # type: ignore # noqa: F401
        from xgboost import (  # type: ignore # noqa: F401
            XGBClassifier,
            XGBRanker,
            XGBRegressor,
        )
    except ImportError:
        pass
    try:

@@ -130,6 +134,11 @@ class MLModel:
        >>> # Delete model from Elasticsearch
        >>> es_model.delete_model()
        """
        if self.model_type not in (TYPE_CLASSIFICATION, TYPE_REGRESSION):
            raise NotImplementedError(
                f"Prediction for type {self.model_type} is not supported."
            )

        docs: List[Mapping[str, Any]] = []
        if isinstance(X, np.ndarray):

@@ -215,6 +224,8 @@ class MLModel:
        inference_config = self._trained_model_config["inference_config"]
        if "classification" in inference_config:
            return TYPE_CLASSIFICATION
        elif "learning_to_rank" in inference_config:
            return TYPE_LEARNING_TO_RANK
        elif "regression" in inference_config:
            return TYPE_REGRESSION
        raise ValueError("Unable to determine 'model_type' for MLModel")

@@ -245,6 +256,7 @@ class MLModel:
            "RandomForestRegressor",
            "RandomForestClassifier",
            "XGBClassifier",
            "XGBRanker",
            "XGBRegressor",
            "LGBMRegressor",
            "LGBMClassifier",

@@ -296,6 +308,11 @@ class MLModel:
                - "binary:logistic"
                - "multi:softmax"
                - "multi:softprob"
            - xgboost.XGBRanker
                - only the following objectives are supported:
                    - "rank:map"
                    - "rank:ndcg"
                    - "rank:pairwise"
            - xgboost.XGBRegressor
                - only the following objectives are supported:
                    - "reg:squarederror"
@@ -358,6 +375,125 @@ class MLModel:
        >>> # Delete model from Elasticsearch
        >>> es_model.delete_model()
        """

        return cls._import_model(
            es_client=es_client,
            model_id=model_id,
            model=model,
            feature_names=feature_names,
            classification_labels=classification_labels,
            classification_weights=classification_weights,
            es_if_exists=es_if_exists,
            es_compress_model_definition=es_compress_model_definition,
        )

    @classmethod
    def import_ltr_model(
        cls,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
        model: Union[
            "DecisionTreeRegressor",
            "RandomForestRegressor",
            "XGBRanker",
            "XGBRegressor",
            "LGBMRegressor",
        ],
        ltr_model_config: LTRModelConfig,
        es_if_exists: Optional[str] = None,
        es_compress_model_definition: bool = True,
    ) -> "MLModel":
        """
        Transform and serialize a trained 3rd party model into Elasticsearch.
        This model can then be used as a learning_to_rank rescorer in the Elastic Stack.

        Parameters
        ----------
        es_client: Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance

        model_id: str
            The unique identifier of the trained inference model in Elasticsearch.

        model: An instance of a supported python model. We support the following model types for LTR prediction:
            - sklearn.tree.DecisionTreeRegressor
            - sklearn.ensemble.RandomForestRegressor
            - xgboost.XGBRanker
                - only the following objectives are supported:
                    - "rank:map"
                    - "rank:ndcg"
                    - "rank:pairwise"
            - xgboost.XGBRegressor
                - only the following objectives are supported:
                    - "reg:squarederror"
                    - "reg:linear"
                    - "reg:squaredlogerror"
                    - "reg:logistic"
                    - "reg:pseudohubererror"
            - lightgbm.LGBMRegressor
                - Categorical fields are expected to already be processed
                - Only the following objectives are supported
                    - "regression"
                    - "regression_l1"
                    - "huber"
                    - "fair"
                    - "quantile"
                    - "mape"

        ltr_model_config: LTRModelConfig
            The LTR model configuration is used to configure feature extractors for the LTR model.
            Feature names are automatically inferred from the feature extractors.

        es_if_exists: {'fail', 'replace'} default 'fail'
            How to behave if the model already exists

            - fail: Raise a ValueError
            - replace: Overwrite the existing model

        es_compress_model_definition: bool
            If True will use 'compressed_definition' which uses gzipped
            JSON instead of raw JSON to reduce the amount of data sent
            over the wire in HTTP requests. Defaults to 'True'.
        """

        return cls._import_model(
            es_client=es_client,
            model_id=model_id,
            model=model,
            feature_names=ltr_model_config.feature_names,
            inference_config=ltr_model_config.to_dict(),
            es_if_exists=es_if_exists,
            es_compress_model_definition=es_compress_model_definition,
        )

    @classmethod
    def _import_model(
        cls,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
        model: Union[
            "DecisionTreeClassifier",
            "DecisionTreeRegressor",
            "RandomForestRegressor",
            "RandomForestClassifier",
            "XGBClassifier",
            "XGBRanker",
            "XGBRegressor",
            "LGBMRegressor",
            "LGBMClassifier",
        ],
        feature_names: List[str],
        classification_labels: Optional[List[str]] = None,
        classification_weights: Optional[List[float]] = None,
        es_if_exists: Optional[str] = None,
        es_compress_model_definition: bool = True,
        inference_config: Optional[Mapping[str, Mapping[str, Any]]] = None,
    ) -> "MLModel":
        """
        Actual implementation of model import used by public API methods.
        """

        es_client = ensure_es_client(es_client)
        transformer = get_model_transformer(
            model,
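To make the new entry point concrete, a minimal usage sketch of import_ltr_model; the Elasticsearch URL, model id, and synthetic training data are illustrative assumptions:

# Minimal sketch (URL, model id and training data are illustrative):
import numpy as np
from xgboost import XGBRanker
from eland.ml import MLModel
from eland.ml.ltr import LTRModelConfig, QueryFeatureExtractor

ltr_config = LTRModelConfig(
    feature_extractors=[
        QueryFeatureExtractor(
            feature_name="title_bm25",
            query={"match": {"title": "{{query}}"}},
        ),
    ]
)

# One feature column to match the single extractor above; two query groups of 4 docs.
X = np.random.rand(8, 1)
y = [0, 1, 0, 1, 0, 1, 0, 1]
ranker = XGBRanker(objective="rank:ndcg")
ranker.fit(X, y, group=[4, 4])

es_model = MLModel.import_ltr_model(
    es_client="http://localhost:9200",
    model_id="my-ltr-model",
    model=ranker,
    ltr_model_config=ltr_config,
    es_if_exists="replace",
)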
@@ -368,6 +504,9 @@ class MLModel:
        serializer = transformer.transform()
        model_type = transformer.model_type

        if inference_config is None:
            inference_config = {model_type: {}}

        if es_if_exists is None:
            es_if_exists = "fail"

@@ -385,18 +524,25 @@ class MLModel:
        elif es_if_exists == "replace":
            ml_model.delete_model()

        trained_model_input = None
        is_ltr = next(iter(inference_config)) == TYPE_LEARNING_TO_RANK
        if not is_ltr or (
            es_version(es_client) < (8, 15) and not is_serverless_es(es_client)
        ):
            trained_model_input = {"field_names": feature_names}

        if es_compress_model_definition:
            ml_model._client.ml.put_trained_model(
                model_id=model_id,
                input={"field_names": feature_names},
                inference_config={model_type: {}},
                inference_config=inference_config,
                input=trained_model_input,
                compressed_definition=serializer.serialize_and_compress_model(),
            )
        else:
            ml_model._client.ml.put_trained_model(
                model_id=model_id,
                input={"field_names": feature_names},
                inference_config={model_type: {}},
                inference_config=inference_config,
                input=trained_model_input,
                definition=serializer.serialize_model(),
            )
@@ -425,83 +571,6 @@ class MLModel:
            return False
        return True

    def export_model(self) -> "Pipeline":
        """Export Elastic ML model as sklearn Pipeline.

        Returns
        -------
        sklearn.pipeline.Pipeline
            _description_

        Raises
        ------
        AssertionError
            If preprocessors JSON definition has unexpected schema.
        ValueError
            The model is expected to be trained in Elastic Stack. Models initially imported
            from xgboost, lgbm, or sklearn are not supported.
        ValueError
            If unexpected categorical encoding is found in the list of preprocessors.
        NotImplementedError
            Only regression and binary classification models are supported currently.
        """
        from sklearn.compose import ColumnTransformer  # type: ignore # noqa: F401
        from sklearn.pipeline import Pipeline

        from .exporters._sklearn_deserializers import (
            FrequencyEncoder,
            OneHotEncoder,
            TargetMeanEncoder,
        )
        from .exporters.es_gb_models import (
            ESGradientBoostingClassifier,
            ESGradientBoostingRegressor,
        )

        if self.model_type == TYPE_CLASSIFICATION:
            model = ESGradientBoostingClassifier(
                es_client=self._client, model_id=self._model_id
            )
        elif self.model_type == TYPE_REGRESSION:
            model = ESGradientBoostingRegressor(
                es_client=self._client, model_id=self._model_id
            )
        else:
            raise NotImplementedError(
                "Only regression and binary classification models are supported currently."
            )

        transformers = []
        for p in model.preprocessors:
            assert (
                len(p) == 1
            ), f"Unexpected preprocessor data structure: {p}. One-key mapping expected."
            encoding_type = list(p.keys())[0]
            field = p[encoding_type]["field"]
            if encoding_type == "frequency_encoding":
                transform = FrequencyEncoder(p)
                transformers.append((f"{field}_{encoding_type}", transform, field))
            elif encoding_type == "target_mean_encoding":
                transform = TargetMeanEncoder(p)
                transformers.append((f"{field}_{encoding_type}", transform, field))
            elif encoding_type == "one_hot_encoding":
                transform = OneHotEncoder(p)
                transformers.append((f"{field}_{encoding_type}", transform, [field]))
            else:
                raise ValueError(
                    f"Unexpected categorical encoding type {encoding_type} found. "
                    + "Expected encodings: frequency_encoding, target_mean_encoding, one_hot_encoding."
                )
        preprocessor = ColumnTransformer(
            transformers=transformers,
            remainder="passthrough",
            verbose_feature_names_out=False,
        )

        pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("es_model", model)])

        return pipeline

    @property
    def _trained_model_config(self) -> Dict[str, Any]:
        """Lazily loads an ML model's 'trained_model_config' information"""
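A minimal usage sketch of export_model, assuming a model that was trained in the Elastic Stack (the URL and model id are illustrative):

# Sketch: export an Elastic-trained model as an sklearn Pipeline
# (URL and model id are illustrative assumptions):
from eland.ml import MLModel

es_model = MLModel("http://localhost:9200", "my-es-trained-model")
pipeline = es_model.export_model()  # sklearn.pipeline.Pipeline
# predictions = pipeline.predict(feature_frame)  # feature_frame is hypothetical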
@@ -17,21 +17,40 @@

from eland.ml.pytorch._pytorch_model import PyTorchModel  # noqa: F401
from eland.ml.pytorch.nlp_ml_model import (
    FillMaskInferenceOptions,
    NerInferenceOptions,
    NlpBertTokenizationConfig,
    NlpMPNetTokenizationConfig,
    NlpRobertaTokenizationConfig,
    NlpTrainedModelConfig,
    NlpXLMRobertaTokenizationConfig,
    QuestionAnsweringInferenceOptions,
    TextClassificationInferenceOptions,
    TextEmbeddingInferenceOptions,
    TextSimilarityInferenceOptions,
    ZeroShotClassificationInferenceOptions,
)
from eland.ml.pytorch.traceable_model import TraceableModel  # noqa: F401
from eland.ml.pytorch.transformers import task_type_from_model_config
from eland.ml.pytorch.transformers import (
    UnknownModelInputSizeError,
    task_type_from_model_config,
)

__all__ = [
    "PyTorchModel",
    "TraceableModel",
    "FillMaskInferenceOptions",
    "NerInferenceOptions",
    "NlpTrainedModelConfig",
    "NlpBertTokenizationConfig",
    "NlpRobertaTokenizationConfig",
    "NlpXLMRobertaTokenizationConfig",
    "NlpMPNetTokenizationConfig",
    "QuestionAnsweringInferenceOptions",
    "TextClassificationInferenceOptions",
    "TextEmbeddingInferenceOptions",
    "TextSimilarityInferenceOptions",
    "ZeroShotClassificationInferenceOptions",
    "task_type_from_model_config",
    "UnknownModelInputSizeError",
]
@@ -126,6 +126,7 @@ class PyTorchModel:
    def infer(
        self,
        docs: List[Mapping[str, str]],
        inference_config: Optional[Mapping[str, Any]] = None,
        timeout: str = DEFAULT_TIMEOUT,
    ) -> Any:
        if docs is None:

@@ -133,6 +134,8 @@ class PyTorchModel:

        __body: Dict[str, Any] = {}
        __body["docs"] = docs
        if inference_config is not None:
            __body["inference_config"] = inference_config

        __path = f"/_ml/trained_models/{_quote(self.model_id)}/_infer"
        __query: Dict[str, Any] = {}
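A brief usage sketch of the extended signature; the document text and the per-request config override are illustrative assumptions:

# Sketch: passing a per-request inference_config override to infer()
# (the text and the num_top_classes override are illustrative):
result = pytorch_model.infer(
    docs=[{"text_field": "The quick brown fox"}],
    inference_config={"text_classification": {"num_top_classes": 3}},
)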
@@ -86,6 +86,27 @@ class NlpXLMRobertaTokenizationConfig(NlpTokenizationConfig):
        )


class NlpDebertaV2TokenizationConfig(NlpTokenizationConfig):
    def __init__(
        self,
        *,
        do_lower_case: t.Optional[bool] = None,
        with_special_tokens: t.Optional[bool] = None,
        max_sequence_length: t.Optional[int] = None,
        truncate: t.Optional[
            t.Union["t.Literal['first', 'none', 'second']", str]
        ] = None,
        span: t.Optional[int] = None,
    ):
        super().__init__(
            configuration_type="deberta_v2",
            with_special_tokens=with_special_tokens,
            max_sequence_length=max_sequence_length,
            truncate=truncate,
            span=span,
        )


class NlpBertTokenizationConfig(NlpTokenizationConfig):
    def __init__(
        self,

@@ -308,6 +329,23 @@ class TrainedModelInput:
        return self.__dict__


class PrefixStrings:
    def __init__(
        self, *, ingest_prefix: t.Optional[str], search_prefix: t.Optional[str]
    ):
        self.ingest_prefix = ingest_prefix
        self.search_prefix = search_prefix

    def to_dict(self) -> t.Dict[str, t.Any]:
        config = {}
        if self.ingest_prefix is not None:
            config["ingest"] = self.ingest_prefix
        if self.search_prefix is not None:
            config["search"] = self.search_prefix

        return config


class NlpTrainedModelConfig:
    def __init__(
        self,
@@ -317,16 +355,16 @@ class NlpTrainedModelConfig:
        input: TrainedModelInput = TrainedModelInput(field_names=["text_field"]),
        metadata: t.Optional[dict] = None,
        model_type: t.Union["t.Literal['pytorch']", str] = "pytorch",
        default_field_map: t.Optional[t.Mapping[str, str]] = None,
        tags: t.Optional[t.Union[t.List[str], t.Tuple[str, ...]]] = None,
        prefix_strings: t.Optional[PrefixStrings],
    ):
        self.tags = tags
        self.default_field_map = default_field_map
        self.description = description
        self.inference_config = inference_config
        self.input = input
        self.metadata = metadata
        self.model_type = model_type
        self.prefix_strings = prefix_strings

    def to_dict(self) -> t.Dict[str, t.Any]:
        return {
@@ -50,12 +50,10 @@ class TraceableModel(ABC):
        return self._trace()

    @abstractmethod
    def sample_output(self) -> torch.Tensor:
        ...
    def sample_output(self) -> torch.Tensor: ...

    @abstractmethod
    def _trace(self) -> TracedModelTypes:
        ...
    def _trace(self) -> TracedModelTypes: ...

    def classification_labels(self) -> Optional[List[str]]:
        return None

@@ -66,3 +64,7 @@ class TraceableModel(ABC):
        trace_model = torch.jit.freeze(trace_model)
        torch.jit.save(trace_model, model_path)
        return model_path

    @property
    def model(self) -> nn.Module:
        return self._model
@@ -22,18 +22,17 @@ libraries such as sentence-transformers.

import json
import os.path
import random
import re
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Set, Tuple, Union
from typing import Dict, List, Optional, Set, Tuple, Union

import torch  # type: ignore
import transformers  # type: ignore
from sentence_transformers import SentenceTransformer  # type: ignore
from torch import Tensor, nn
from torch import Tensor
from torch.profiler import profile  # type: ignore
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForQuestionAnswering,
    BertTokenizer,
    PretrainedConfig,
    PreTrainedModel,
    PreTrainedTokenizer,

@@ -45,12 +44,14 @@ from eland.ml.pytorch.nlp_ml_model import (
    NerInferenceOptions,
    NlpBertJapaneseTokenizationConfig,
    NlpBertTokenizationConfig,
    NlpDebertaV2TokenizationConfig,
    NlpMPNetTokenizationConfig,
    NlpRobertaTokenizationConfig,
    NlpTokenizationConfig,
    NlpTrainedModelConfig,
    NlpXLMRobertaTokenizationConfig,
    PassThroughInferenceOptions,
    PrefixStrings,
    QuestionAnsweringInferenceOptions,
    TextClassificationInferenceOptions,
    TextEmbeddingInferenceOptions,

@@ -60,8 +61,13 @@ from eland.ml.pytorch.nlp_ml_model import (
    ZeroShotClassificationInferenceOptions,
)
from eland.ml.pytorch.traceable_model import TraceableModel
from eland.ml.pytorch.wrappers import (
    _DistilBertWrapper,
    _DPREncoderWrapper,
    _QuestionAnsweringWrapperModule,
    _SentenceTransformerWrapperModule,
)

DEFAULT_OUTPUT_KEY = "sentence_embedding"
SUPPORTED_TASK_TYPES = {
    "fill_mask",
    "ner",

@@ -112,6 +118,7 @@ SUPPORTED_TOKENIZERS = (
    transformers.BartTokenizer,
    transformers.SqueezeBertTokenizer,
    transformers.XLMRobertaTokenizer,
    transformers.DebertaV2Tokenizer,
)
SUPPORTED_TOKENIZERS_NAMES = ", ".join(sorted([str(x) for x in SUPPORTED_TOKENIZERS]))

@@ -127,6 +134,10 @@ class TaskTypeError(Exception):
    pass


class UnknownModelInputSizeError(Exception):
    pass


def task_type_from_model_config(model_config: PretrainedConfig) -> Optional[str]:
    if model_config.architectures is None:
        if model_config.name_or_path.startswith("sentence-transformers/"):

@@ -162,277 +173,6 @@ def task_type_from_model_config(model_config: PretrainedConfig) -> Optional[str]
    return potential_task_types.pop()


class _QuestionAnsweringWrapperModule(nn.Module):  # type: ignore
    """
    A wrapper around a question answering model.
    Our inference engine only takes the first tuple if the inference response
    is a tuple.

    This wrapper transforms the output to be a stacked tensor if it's a tuple.

    Otherwise it passes it through.
    """

    def __init__(self, model: PreTrainedModel):
        super().__init__()
        self._hf_model = model
        self.config = model.config

    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        model = AutoModelForQuestionAnswering.from_pretrained(
            model_id, token=token, torchscript=True
        )
        if isinstance(
            model.config,
            (
                transformers.MPNetConfig,
                transformers.XLMRobertaConfig,
                transformers.RobertaConfig,
                transformers.BartConfig,
            ),
        ):
            return _TwoParameterQuestionAnsweringWrapper(model)
        else:
            return _QuestionAnsweringWrapper(model)


class _QuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
    def __init__(self, model: PreTrainedModel):
        super().__init__(model=model)

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "position_ids": position_ids,
        }

        # remove inputs for specific model types
        if isinstance(self._hf_model.config, transformers.DistilBertConfig):
            del inputs["token_type_ids"]
            del inputs["position_ids"]
        response = self._hf_model(**inputs)
        if isinstance(response, tuple):
            return torch.stack(list(response), dim=0)
        return response


class _TwoParameterQuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
    def __init__(self, model: PreTrainedModel):
        super().__init__(model=model)

    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        response = self._hf_model(**inputs)
        if isinstance(response, tuple):
            return torch.stack(list(response), dim=0)
        return response


class _DistilBertWrapper(nn.Module):  # type: ignore
    """
    A simple wrapper around DistilBERT model which makes the model inputs
    conform to Elasticsearch's native inference processor interface.
    """

    def __init__(self, model: transformers.PreTrainedModel):
        super().__init__()
        self._model = model
        self.config = model.config

    @staticmethod
    def try_wrapping(model: PreTrainedModel) -> Optional[Any]:
        if isinstance(model.config, transformers.DistilBertConfig):
            return _DistilBertWrapper(model)
        else:
            return model

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        _token_type_ids: Tensor,
        _position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        return self._model(input_ids=input_ids, attention_mask=attention_mask)


class _SentenceTransformerWrapperModule(nn.Module):  # type: ignore
    """
    A wrapper around sentence-transformer models to provide pooling,
    normalization and other graph layers that are not defined in the base
    HuggingFace transformer model.
    """

    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__()
        self._hf_model = model
        self._st_model = SentenceTransformer(model.config.name_or_path)
        self._output_key = output_key
        self.config = model.config

        self._remove_pooling_layer()
        self._replace_transformer_layer()

    @staticmethod
    def from_pretrained(
        model_id: str,
        *,
        token: Optional[str] = None,
        output_key: str = DEFAULT_OUTPUT_KEY,
    ) -> Optional[Any]:
        model = AutoModel.from_pretrained(model_id, token=token, torchscript=True)
        if isinstance(
            model.config,
            (
                transformers.MPNetConfig,
                transformers.XLMRobertaConfig,
                transformers.RobertaConfig,
                transformers.BartConfig,
            ),
        ):
            return _TwoParameterSentenceTransformerWrapper(model, output_key)
        else:
            return _SentenceTransformerWrapper(model, output_key)

    def _remove_pooling_layer(self) -> None:
        """
        Removes any last pooling layer which is not used to create embeddings.
        Leaving this layer in will cause it to return a NoneType which in turn
        will fail to load in libtorch. Alternatively, we can just use the output
        of the pooling layer as a dummy but this also affects (if only in a
        minor way) the performance of inference, so we're better off removing
        the layer if we can.
        """

        if hasattr(self._hf_model, "pooler"):
            self._hf_model.pooler = None

    def _replace_transformer_layer(self) -> None:
        """
        Replaces the HuggingFace Transformer layer in the SentenceTransformer
        modules so we can set it with one that has the pooling layer removed and
        was loaded ready for TorchScript export.
        """

        self._st_model._modules["0"].auto_model = self._hf_model


class _SentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "position_ids": position_ids,
        }

        # remove inputs for specific model types
        if isinstance(self._hf_model.config, transformers.DistilBertConfig):
            del inputs["token_type_ids"]

        return self._st_model(inputs)[self._output_key]


class _TwoParameterSentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)

    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        return self._st_model(inputs)[self._output_key]


class _DPREncoderWrapper(nn.Module):  # type: ignore
    """
    AutoModel loading does not work for DPRContextEncoders; this only exists as
    a workaround. This may never be fixed so this is likely permanent.
    See: https://github.com/huggingface/transformers/issues/13670
    """

    _SUPPORTED_MODELS = {
        transformers.DPRContextEncoder,
        transformers.DPRQuestionEncoder,
    }
    _SUPPORTED_MODELS_NAMES = set([x.__name__ for x in _SUPPORTED_MODELS])

    def __init__(
        self,
        model: Union[transformers.DPRContextEncoder, transformers.DPRQuestionEncoder],
    ):
        super().__init__()
        self._model = model
        self.config = model.config

    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        config = AutoConfig.from_pretrained(model_id, token=token)

        def is_compatible() -> bool:
            is_dpr_model = config.model_type == "dpr"
            has_architectures = (
                config.architectures is not None and len(config.architectures) == 1
            )
            is_supported_architecture = has_architectures and (
                config.architectures[0] in _DPREncoderWrapper._SUPPORTED_MODELS_NAMES
            )
            return is_dpr_model and is_supported_architecture

        if is_compatible():
            model = getattr(transformers, config.architectures[0]).from_pretrained(
                model_id, torchscript=True
            )
            return _DPREncoderWrapper(model)
        else:
            return None

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        _position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        return self._model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )


class _TransformerTraceableModel(TraceableModel):
    """A base class representing a HuggingFace transformer model that can be traced."""
@@ -451,7 +191,7 @@ class _TransformerTraceableModel(TraceableModel):

    def _trace(self) -> TracedModelTypes:
        inputs = self._compatible_inputs()
        return torch.jit.trace(self._model, inputs)
        return torch.jit.trace(self._model, example_inputs=inputs)

    def sample_output(self) -> Tensor:
        inputs = self._compatible_inputs()

@@ -466,17 +206,23 @@ class _TransformerTraceableModel(TraceableModel):
                inputs["input_ids"].size(1), dtype=torch.long
            )
        if isinstance(
            self._model.config,
            self._tokenizer,
            (
                transformers.MPNetConfig,
                transformers.XLMRobertaConfig,
                transformers.RobertaConfig,
                transformers.BartConfig,
                transformers.BartTokenizer,
                transformers.MPNetTokenizer,
                transformers.RobertaTokenizer,
                transformers.XLMRobertaTokenizer,
            ),
        ):
            del inputs["token_type_ids"]
            return (inputs["input_ids"], inputs["attention_mask"])

        if isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
            return (
                inputs["input_ids"],
                inputs["attention_mask"],
                inputs["token_type_ids"],
            )

        position_ids = torch.arange(inputs["input_ids"].size(1), dtype=torch.long)
        inputs["position_ids"] = position_ids
        return (

@@ -487,8 +233,7 @@ class _TransformerTraceableModel(TraceableModel):
        )

    @abstractmethod
    def _prepare_inputs(self) -> transformers.BatchEncoding:
        ...
    def _prepare_inputs(self) -> transformers.BatchEncoding: ...


class _TraceableClassificationModel(_TransformerTraceableModel, ABC):

@@ -510,6 +255,15 @@ class _TraceableFillMaskModel(_TransformerTraceableModel):
        )


class _TraceableTextExpansionModel(_TransformerTraceableModel):
    def _prepare_inputs(self) -> transformers.BatchEncoding:
        return self._tokenizer(
            "This is an example sentence.",
            padding="max_length",
            return_tensors="pt",
        )


class _TraceableNerModel(_TraceableClassificationModel):
    def _prepare_inputs(self) -> transformers.BatchEncoding:
        return self._tokenizer(

@@ -544,7 +298,7 @@ class _TraceableTextEmbeddingModel(_TransformerTraceableModel):
    def _prepare_inputs(self) -> transformers.BatchEncoding:
        return self._tokenizer(
            "This is an example sentence.",
            padding="max_length",
            padding="longest",
            return_tensors="pt",
        )
@@ -584,10 +338,13 @@ class TransformerModel:
        self,
        *,
        model_id: str,
        access_token: Optional[str],
        task_type: str,
        es_version: Optional[Tuple[int, int, int]] = None,
        quantize: bool = False,
        access_token: Optional[str] = None,
        ingest_prefix: Optional[str] = None,
        search_prefix: Optional[str] = None,
        max_model_input_size: Optional[int] = None,
    ):
        """
        Loads a model from the Hugging Face repository or local file and creates

@@ -610,11 +367,29 @@ class TransformerModel:

        quantize: bool, default False
            Quantize the model.

        access_token: Optional[str]
            For the HuggingFace Hub private model access

        ingest_prefix: Optional[str]
            Prefix string to prepend to input at ingest

        search_prefix: Optional[str]
            Prefix string to prepend to input at search

        max_model_input_size: Optional[int]
            The max model input size counted in tokens.
            Usually this value should be extracted from the model configuration
            but if that is not possible or the data is missing it can be
            explicitly set with this parameter.
        """

        self._model_id = model_id
        self._access_token = access_token
        self._task_type = task_type.replace("-", "_")
        self._ingest_prefix = ingest_prefix
        self._search_prefix = search_prefix
        self._max_model_input_size = max_model_input_size

        # load Hugging Face model and tokenizer
        # use padding in the tokenizer to ensure max length sequences are used for tracing (at call time)

@@ -647,7 +422,12 @@ class TransformerModel:
            " ".join(m) for m, _ in sorted(ranks.items(), key=lambda kv: kv[1])
        ]
        vocab_obj["merges"] = merges
        sp_model = getattr(self._tokenizer, "sp_model", None)

        if isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
            sp_model = self._tokenizer._tokenizer.spm
        else:
            sp_model = getattr(self._tokenizer, "sp_model", None)

        if sp_model:
            id_correction = getattr(self._tokenizer, "fairseq_offset", 0)
            scores = []

@@ -664,27 +444,31 @@ class TransformerModel:
        return vocab_obj

    def _create_tokenization_config(self) -> NlpTokenizationConfig:
        if self._max_model_input_size:
            _max_sequence_length = self._max_model_input_size
        else:
            _max_sequence_length = self._find_max_sequence_length()

        if isinstance(self._tokenizer, transformers.MPNetTokenizer):
            return NlpMPNetTokenizationConfig(
                do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
                max_sequence_length=getattr(
                    self._tokenizer, "max_model_input_sizes", dict()
                ).get(self._model_id),
                max_sequence_length=_max_sequence_length,
            )
        elif isinstance(
            self._tokenizer, (transformers.RobertaTokenizer, transformers.BartTokenizer)
        ):
            return NlpRobertaTokenizationConfig(
                add_prefix_space=getattr(self._tokenizer, "add_prefix_space", None),
                max_sequence_length=getattr(
                    self._tokenizer, "max_model_input_sizes", dict()
                ).get(self._model_id),
                max_sequence_length=_max_sequence_length,
            )
        elif isinstance(self._tokenizer, transformers.XLMRobertaTokenizer):
            return NlpXLMRobertaTokenizationConfig(
                max_sequence_length=getattr(
                    self._tokenizer, "max_model_input_sizes", dict()
                ).get(self._model_id),
                max_sequence_length=_max_sequence_length,
            )
        elif isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
            return NlpDebertaV2TokenizationConfig(
                max_sequence_length=_max_sequence_length,
                do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
            )
        else:
            japanese_morphological_tokenizers = ["mecab"]

@@ -695,18 +479,41 @@ class TransformerModel:
            ):
                return NlpBertJapaneseTokenizationConfig(
                    do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
                    max_sequence_length=getattr(
                        self._tokenizer, "max_model_input_sizes", dict()
                    ).get(self._model_id),
                    max_sequence_length=_max_sequence_length,
                )
            else:
                return NlpBertTokenizationConfig(
                    do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
                    max_sequence_length=getattr(
                        self._tokenizer, "max_model_input_sizes", dict()
                    ).get(self._model_id),
                    max_sequence_length=_max_sequence_length,
                )

    def _find_max_sequence_length(self) -> int:
        # Sometimes the max_... values are present but contain
        # a random or very large value.
        REASONABLE_MAX_LENGTH = 8192
        max_len = getattr(self._tokenizer, "model_max_length", None)
        if max_len is not None and max_len <= REASONABLE_MAX_LENGTH:
            return int(max_len)

        max_sizes = getattr(self._tokenizer, "max_model_input_sizes", dict())
        max_len = max_sizes.get(self._model_id)
        if max_len is not None and max_len < REASONABLE_MAX_LENGTH:
            return int(max_len)

        if max_sizes:
            # The model id wasn't found in the max sizes dict but
            # if all the values correspond then take that value
            sizes = {size for size in max_sizes.values()}
            if len(sizes) == 1:
                max_len = sizes.pop()
                if max_len is not None and max_len < REASONABLE_MAX_LENGTH:
                    return int(max_len)

        if isinstance(self._tokenizer, BertTokenizer):
            return 512

        raise UnknownModelInputSizeError("Cannot determine model max input length")
    def _create_config(
        self, es_version: Optional[Tuple[int, int, int]]
    ) -> NlpTrainedModelConfig:

@@ -718,6 +525,9 @@ class TransformerModel:
            tokenization_config.span = 128
            tokenization_config.truncate = "none"

        if self._task_type == "text_similarity":
            tokenization_config.truncate = "second"

        if self._traceable_model.classification_labels():
            inference_config = TASK_TYPE_TO_INFERENCE_CONFIG[self._task_type](
                tokenization=tokenization_config,

@@ -747,6 +557,31 @@ class TransformerModel:
                tokenization=tokenization_config
            )

        # add static and dynamic memory state size to metadata
        per_deployment_memory_bytes = self._get_per_deployment_memory()

        per_allocation_memory_bytes = self._get_per_allocation_memory(
            tokenization_config.max_sequence_length, 1
        )

        metadata = {
            "per_deployment_memory_bytes": per_deployment_memory_bytes,
            "per_allocation_memory_bytes": per_allocation_memory_bytes,
        }

        prefix_strings = (
            PrefixStrings(
                ingest_prefix=self._ingest_prefix, search_prefix=self._search_prefix
            )
            if self._ingest_prefix or self._search_prefix
            else None
        )
        prefix_strings_supported = es_version is None or es_version >= (8, 12, 0)
        if not prefix_strings_supported and prefix_strings:
            raise Exception(
                f"The Elasticsearch cluster version {es_version} does not support prefix strings. Support was added in version 8.12.0"
            )

        return NlpTrainedModelConfig(
            description=f"Model {self._model_id} for task type '{self._task_type}'",
            model_type="pytorch",

@@ -754,9 +589,131 @@ class TransformerModel:
            input=TrainedModelInput(
                field_names=["text_field"],
            ),
            metadata=metadata,
            prefix_strings=prefix_strings,
        )

    def _create_traceable_model(self) -> TraceableModel:
    def _get_per_deployment_memory(self) -> float:
        """
        Returns the static memory size of the model in bytes.
        """
        psize: float = sum(
            param.nelement() * param.element_size()
            for param in self._traceable_model.model.parameters()
        )
        bsize: float = sum(
            buffer.nelement() * buffer.element_size()
            for buffer in self._traceable_model.model.buffers()
        )
        return psize + bsize

    def _get_per_allocation_memory(
        self, max_seq_length: Optional[int], batch_size: int
    ) -> float:
        """
        Returns the transient memory size of the model in bytes.

        Parameters
        ----------
        max_seq_length : Optional[int]
            Maximum sequence length to use for the model.
        batch_size : int
            Batch size to use for the model.
        """
        activities = [torch.profiler.ProfilerActivity.CPU]

        # Get the memory usage of the model with a batch size of 1.
        inputs_1 = self._get_model_inputs(max_seq_length, 1)
        with profile(activities=activities, profile_memory=True) as prof:
            self._traceable_model.model(*inputs_1)
        mem1: float = prof.key_averages().total_average().cpu_memory_usage

        # This is measuring memory usage of the model with a batch size of 2 and
        # then linearly extrapolating it to get the memory usage of the model for
        # a batch size of batch_size.
        if batch_size == 1:
            return mem1
        inputs_2 = self._get_model_inputs(max_seq_length, 2)
        with profile(activities=activities, profile_memory=True) as prof:
            self._traceable_model.model(*inputs_2)
        mem2: float = prof.key_averages().total_average().cpu_memory_usage
        return mem1 + (mem2 - mem1) * (batch_size - 1)

    def _get_model_inputs(
        self,
        max_length: Optional[int],
        batch_size: int,
    ) -> Tuple[Tensor, ...]:
        """
        Returns a random batch of inputs for the model.

        Parameters
        ----------
        max_length : Optional[int]
            Maximum sequence length to use for the model. Default is 512.
        batch_size : int
            Batch size to use for the model.
        """
        vocab: list[str] = list(self._tokenizer.get_vocab().keys())

        # if optional max_length is not set, set it to 512
        if max_length is None:
            max_length = 512

        # generate random text
        texts: list[str] = [
            " ".join(random.choices(vocab, k=max_length)) for _ in range(batch_size)
        ]

        # tokenize text
        inputs: transformers.BatchEncoding = self._tokenizer(
            texts,
            padding="max_length",
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
        )

        return self._make_inputs_compatible(inputs)

    def _make_inputs_compatible(
        self, inputs: transformers.BatchEncoding
    ) -> Tuple[Tensor, ...]:
        """
        Make the input batch format compatible with the model's requirements.

        Parameters
        ----------
        inputs : transformers.BatchEncoding
            The input batch to make compatible.
        """
        # Add params when not provided by the tokenizer (e.g. DistilBERT), to conform to BERT interface
        if "token_type_ids" not in inputs:
            inputs["token_type_ids"] = torch.zeros(
                inputs["input_ids"].size(1), dtype=torch.long
            )
        if isinstance(
            self._tokenizer,
            (
                transformers.BartTokenizer,
                transformers.MPNetTokenizer,
                transformers.RobertaTokenizer,
                transformers.XLMRobertaTokenizer,
            ),
        ):
            del inputs["token_type_ids"]
            return (inputs["input_ids"], inputs["attention_mask"])

        position_ids = torch.arange(inputs["input_ids"].size(1), dtype=torch.long)
        inputs["position_ids"] = position_ids
        return (
            inputs["input_ids"],
            inputs["attention_mask"],
            inputs["token_type_ids"],
            inputs["position_ids"],
        )

    def _create_traceable_model(self) -> _TransformerTraceableModel:
        if self._task_type == "auto":
            model = transformers.AutoModel.from_pretrained(
                self._model_id, token=self._access_token, torchscript=True
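A quick worked example of the linear extrapolation in _get_per_allocation_memory; the numbers below are illustrative, not measurements:

# Illustrative numbers only: if profiling reports 200 MB at batch size 1
# and 350 MB at batch size 2, the per-batch slope is 150 MB, so batch
# size 4 is estimated at 650 MB.
mem1 = 200e6  # bytes measured at batch size 1
mem2 = 350e6  # bytes measured at batch size 2
batch_size = 4
estimate = mem1 + (mem2 - mem1) * (batch_size - 1)  # 650e6 bytes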
@@ -769,6 +726,13 @@ class TransformerModel:
        else:
            self._task_type = maybe_task_type

        if self._task_type == "text_expansion":
            model = transformers.AutoModelForMaskedLM.from_pretrained(
                self._model_id, token=self._access_token, torchscript=True
            )
            model = _DistilBertWrapper.try_wrapping(model)
            return _TraceableTextExpansionModel(self._tokenizer, model)

        if self._task_type == "fill_mask":
            model = transformers.AutoModelForMaskedLM.from_pretrained(
                self._model_id, token=self._access_token, torchscript=True

@@ -796,7 +760,7 @@ class TransformerModel:
            )
            if not model:
                model = _SentenceTransformerWrapperModule.from_pretrained(
                    self._model_id, token=self._access_token
                    self._model_id, self._tokenizer, token=self._access_token
                )
            return _TraceableTextEmbeddingModel(self._tokenizer, model)

@@ -828,7 +792,7 @@ class TransformerModel:

        else:
            raise TypeError(
                f"Unknown task type {self._task_type}, must be one of: {SUPPORTED_TASK_TYPES_NAMES}"
                f"Task {self._task_type} is not supported, must be one of: {SUPPORTED_TASK_TYPES_NAMES}"
            )

    def elasticsearch_model_id(self) -> str:

@@ -859,6 +823,5 @@ def elasticsearch_model_id(model_id: str) -> str:
    """

    id = re.sub(r"[\s\\/]", "__", model_id).lower()[-64:]
    if id.startswith("__"):
        id = id.removeprefix("__")
    id = id.removeprefix("__")
    return id
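To illustrate the sanitization above (the model id is an arbitrary example):

# Example of elasticsearch_model_id() behaviour: whitespace, "\" and "/"
# become "__", the result is lowercased and trimmed to the last 64 chars.
elasticsearch_model_id("sentence-transformers/all-MiniLM-L6-v2")
# -> "sentence-transformers__all-minilm-l6-v2"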
317  eland/ml/pytorch/wrappers.py  Normal file

@@ -0,0 +1,317 @@
|
||||
# Licensed to Elasticsearch B.V. under one or more contributor
|
||||
# license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright
|
||||
# ownership. Elasticsearch B.V. licenses this file to you under
|
||||
# the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
"""
|
||||
This module contains the wrapper classes for the Hugging Face models.
|
||||
Wrapping is necessary to ensure that the forward method of the model
|
||||
is called with the same arguments the ml-cpp pytorch_inference process
|
||||
uses.
|
||||
"""
|
||||
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch # type: ignore
|
||||
import transformers # type: ignore
|
||||
from sentence_transformers import SentenceTransformer # type: ignore
|
||||
from torch import Tensor, nn
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModel,
|
||||
AutoModelForQuestionAnswering,
|
||||
PreTrainedModel,
|
||||
PreTrainedTokenizer,
|
||||
)
|
||||
|
||||
DEFAULT_OUTPUT_KEY = "sentence_embedding"
|
||||
|
||||
|
||||
class _QuestionAnsweringWrapperModule(nn.Module): # type: ignore
|
||||
"""
|
||||
A wrapper around a question answering model.
|
||||
Our inference engine only takes the first tuple if the inference response
|
||||
is a tuple.
|
||||
|
||||
This wrapper transforms the output to be a stacked tensor if its a tuple.
|
||||
|
||||
Otherwise it passes it through
|
||||
"""
|
||||
|
||||
def __init__(self, model: PreTrainedModel):
|
||||
super().__init__()
|
||||
self._hf_model = model
|
||||
self.config = model.config
|
||||
|
||||
@staticmethod
|
||||
def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
|
||||
model = AutoModelForQuestionAnswering.from_pretrained(
|
||||
model_id, token=token, torchscript=True
|
||||
)
|
||||
if isinstance(
|
||||
model.config,
|
||||
(
|
||||
transformers.MPNetConfig,
|
||||
transformers.XLMRobertaConfig,
|
||||
transformers.RobertaConfig,
|
||||
transformers.BartConfig,
|
||||
),
|
||||
):
|
||||
return _TwoParameterQuestionAnsweringWrapper(model)
|
||||
else:
|
||||
return _QuestionAnsweringWrapper(model)
|
||||
|
||||
|
||||
class _QuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
|
||||
def __init__(self, model: PreTrainedModel):
|
||||
super().__init__(model=model)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Tensor,
|
||||
attention_mask: Tensor,
|
||||
token_type_ids: Tensor,
|
||||
position_ids: Tensor,
|
||||
) -> Tensor:
|
||||
"""Wrap the input and output to conform to the native process interface."""
|
||||
|
||||
inputs = {
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_mask,
|
||||
"token_type_ids": token_type_ids,
|
||||
"position_ids": position_ids,
|
||||
}
|
||||
|
||||
# remove inputs for specific model types
|
||||
if isinstance(self._hf_model.config, transformers.DistilBertConfig):
|
||||
del inputs["token_type_ids"]
|
||||
del inputs["position_ids"]
|
||||
response = self._hf_model(**inputs)
|
||||
if isinstance(response, tuple):
|
||||
return torch.stack(list(response), dim=0)
|
||||
return response
|
||||
|
||||
|
||||
class _TwoParameterQuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
|
||||
def __init__(self, model: PreTrainedModel):
|
||||
super().__init__(model=model)
|
||||
|
||||
def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
|
||||
"""Wrap the input and output to conform to the native process interface."""
|
||||
inputs = {
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_mask,
|
||||
}
|
||||
response = self._hf_model(**inputs)
|
||||
if isinstance(response, tuple):
|
||||
return torch.stack(list(response), dim=0)
|
||||
return response
|
||||
|
||||
|
||||
class _DistilBertWrapper(nn.Module): # type: ignore
|
||||
"""
|
||||
In Elasticsearch the BERT tokenizer is used for DistilBERT models but
|
||||
the BERT tokenizer produces 4 inputs where DistilBERT models expect 2.
|
||||
|
||||
Wrap the model's forward function in a method that accepts the 4
|
||||
arguments passed to a BERT model then discard the token_type_ids
|
||||
and the position_ids to match the wrapped DistilBERT model forward
|
||||
function
|
||||
"""
|
||||
|
||||
def __init__(self, model: transformers.PreTrainedModel):
|
||||
super().__init__()
|
||||
self._model = model
|
||||
self.config = model.config
|
||||
|
||||
@staticmethod
|
||||
def try_wrapping(model: PreTrainedModel) -> Optional[Any]:
|
||||
if isinstance(model.config, transformers.DistilBertConfig):
|
||||
return _DistilBertWrapper(model)
|
||||
else:
|
||||
return model
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Tensor,
|
||||
attention_mask: Tensor,
|
||||
_token_type_ids: Tensor = None,
|
||||
_position_ids: Tensor = None,
|
||||
) -> Tensor:
|
||||
"""Wrap the input and output to conform to the native process interface."""
|
||||
|
||||
return self._model(input_ids=input_ids, attention_mask=attention_mask)
|
||||
|
||||
|
||||
class _SentenceTransformerWrapperModule(nn.Module): # type: ignore
|
||||
"""
|
||||
A wrapper around sentence-transformer models to provide pooling,
|
||||
normalization and other graph layers that are not defined in the base
|
||||
HuggingFace transformer model.
|
||||
"""
|
||||
|
||||
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__()
        self._hf_model = model
        self._st_model = SentenceTransformer(model.config.name_or_path)
        self._output_key = output_key
        self.config = model.config

        self._remove_pooling_layer()
        self._replace_transformer_layer()

    @staticmethod
    def from_pretrained(
        model_id: str,
        tokenizer: PreTrainedTokenizer,
        *,
        token: Optional[str] = None,
        output_key: str = DEFAULT_OUTPUT_KEY,
    ) -> Optional[Any]:
        model = AutoModel.from_pretrained(model_id, token=token, torchscript=True)
        if isinstance(
            tokenizer,
            (
                transformers.BartTokenizer,
                transformers.MPNetTokenizer,
                transformers.RobertaTokenizer,
                transformers.XLMRobertaTokenizer,
                transformers.DebertaV2Tokenizer,
            ),
        ):
            return _TwoParameterSentenceTransformerWrapper(model, output_key)
        else:
            return _SentenceTransformerWrapper(model, output_key)

    def _remove_pooling_layer(self) -> None:
        """
        Removes any last pooling layer which is not used to create embeddings.
        Leaving this layer in will cause it to return a NoneType which in turn
        will fail to load in libtorch. Alternatively, we can just use the output
        of the pooling layer as a dummy but this also affects (if only in a
        minor way) the performance of inference, so we're better off removing
        the layer if we can.
        """

        if hasattr(self._hf_model, "pooler"):
            self._hf_model.pooler = None

    def _replace_transformer_layer(self) -> None:
        """
        Replaces the HuggingFace Transformer layer in the SentenceTransformer
        modules so we can set it with one that has pooling layer removed and
        was loaded ready for TorchScript export.
        """

        self._st_model._modules["0"].auto_model = self._hf_model


class _SentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "position_ids": position_ids,
        }

        # remove inputs for specific model types
        if isinstance(self._hf_model.config, transformers.DistilBertConfig):
            del inputs["token_type_ids"]

        return self._st_model(inputs)[self._output_key]


class _TwoParameterSentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model=model, output_key=output_key)

    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        return self._st_model(inputs)[self._output_key]


class _DPREncoderWrapper(nn.Module):  # type: ignore
    """
    AutoModel loading does not work for DPRContextEncoders, this only exists as
    a workaround. This may never be fixed so this is likely permanent.
    See: https://github.com/huggingface/transformers/issues/13670
    """

    _SUPPORTED_MODELS = {
        transformers.DPRContextEncoder,
        transformers.DPRQuestionEncoder,
    }
    _SUPPORTED_MODELS_NAMES = set([x.__name__ for x in _SUPPORTED_MODELS])

    def __init__(
        self,
        model: Union[transformers.DPRContextEncoder, transformers.DPRQuestionEncoder],
    ):
        super().__init__()
        self._model = model
        self.config = model.config

    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        config = AutoConfig.from_pretrained(model_id, token=token)

        def is_compatible() -> bool:
            is_dpr_model = config.model_type == "dpr"
            has_architectures = (
                config.architectures is not None and len(config.architectures) == 1
            )
            is_supported_architecture = has_architectures and (
                config.architectures[0] in _DPREncoderWrapper._SUPPORTED_MODELS_NAMES
            )
            return is_dpr_model and is_supported_architecture

        if is_compatible():
            model = getattr(transformers, config.architectures[0]).from_pretrained(
                model_id, torchscript=True
            )
            return _DPREncoderWrapper(model)
        else:
            return None

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        _position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        return self._model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
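For background on the _remove_pooling_layer docstring above: TorchScript export fails when a traced submodule returns None. A minimal, self-contained sketch of the same idea (the Wrapper class and the nn.Linear stand-in are illustrative only, not eland code):

import torch
from torch import nn


class Wrapper(nn.Module):
    def __init__(self, model: nn.Module):
        super().__init__()
        self._model = model
        # Same idea as _remove_pooling_layer(): drop an unused pooler
        # attribute before tracing, if the wrapped model has one.
        if hasattr(self._model, "pooler"):
            self._model.pooler = None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self._model(x)


# torch.jit.trace records the forward pass so the module can later be
# loaded without Python, which is how Elasticsearch runs models in libtorch.
traced = torch.jit.trace(Wrapper(nn.Linear(4, 2)), torch.randn(1, 4))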
@@ -97,9 +97,11 @@ class LGBMForestTransformer(ModelTransformer):
         return TreeNode(
             node_idx=node_id,
             leaf_value=[float(tree_node_json_obj["leaf_value"])],
-            number_samples=int(tree_node_json_obj["leaf_count"])
-            if "leaf_count" in tree_node_json_obj
-            else None,
+            number_samples=(
+                int(tree_node_json_obj["leaf_count"])
+                if "leaf_count" in tree_node_json_obj
+                else None
+            ),
         )

     def build_tree(self, tree_id: int, tree_json_obj: Dict[str, Any]) -> Tree:
@@ -235,9 +237,11 @@ class LGBMClassifierTransformer(LGBMForestTransformer):
         return TreeNode(
             node_idx=node_id,
             leaf_value=leaf_val,
-            number_samples=int(tree_node_json_obj["leaf_count"])
-            if "leaf_count" in tree_node_json_obj
-            else None,
+            number_samples=(
+                int(tree_node_json_obj["leaf_count"])
+                if "leaf_count" in tree_node_json_obj
+                else None
+            ),
         )

     def check_model_booster(self) -> None:
@@ -27,7 +27,13 @@ from .base import ModelTransformer

 import_optional_dependency("xgboost", on_version="warn")

-from xgboost import Booster, XGBClassifier, XGBModel, XGBRegressor  # type: ignore
+from xgboost import (  # type: ignore
+    Booster,
+    XGBClassifier,
+    XGBModel,
+    XGBRanker,
+    XGBRegressor,
+)


 class XGBoostForestTransformer(ModelTransformer):
@@ -101,6 +107,7 @@ class XGBoostForestTransformer(ModelTransformer):
             decision_type=self._node_decision_type,
             left_child=self.extract_node_id(row["Yes"], curr_tree),
             right_child=self.extract_node_id(row["No"], curr_tree),
+            default_left=row["Yes"] == row["Missing"],
             threshold=float(row["Split"]),
             split_feature=self.get_feature_id(row["Feature"]),
         )
@@ -140,7 +147,7 @@ class XGBoostForestTransformer(ModelTransformer):
         if len(tree_nodes) > 0:
             transformed_trees.append(self.build_tree(tree_nodes))
         # We add this stump as XGBoost adds the base_score to the regression outputs
-        if self._objective.partition(":")[0] == "reg":
+        if self._objective.partition(":")[0] in ["reg", "rank"]:
             transformed_trees.append(self.build_base_score_stump())
         return transformed_trees

@@ -184,6 +191,7 @@ class XGBoostForestTransformer(ModelTransformer):

 class XGBoostRegressorTransformer(XGBoostForestTransformer):
     def __init__(self, model: XGBRegressor, feature_names: List[str]):
+        self._regressor_model = model
         # XGBRegressor.base_score defaults to 0.5.
         base_score = model.base_score
         if base_score is None:
@@ -197,6 +205,13 @@ class XGBoostRegressorTransformer(XGBoostForestTransformer):
         return "regression"

     def is_objective_supported(self) -> bool:
+        if isinstance(self._regressor_model, XGBRanker):
+            return self._objective in {
+                "rank:pairwise",
+                "rank:ndcg",
+                "rank:map",
+            }
+
         return self._objective in {
             "reg:squarederror",
             "reg:squaredlogerror",
@@ -264,5 +279,6 @@ class XGBoostClassifierTransformer(XGBoostForestTransformer):

 _MODEL_TRANSFORMERS: Dict[type, Type[ModelTransformer]] = {
     XGBRegressor: XGBoostRegressorTransformer,
+    XGBRanker: XGBoostRegressorTransformer,
     XGBClassifier: XGBoostClassifierTransformer,
 }
@@ -16,6 +16,7 @@
 # under the License.

 import copy
+import os
 import warnings
 from collections import defaultdict
 from datetime import datetime
@@ -1156,9 +1157,11 @@ class Operations:
         # piggy-back on that single aggregation.
         if extended_stats_calls >= 2:
             es_aggs = [
-                ("extended_stats", es_agg)
-                if es_agg in extended_stats_es_aggs
-                else es_agg
+                (
+                    ("extended_stats", es_agg)
+                    if es_agg in extended_stats_es_aggs
+                    else es_agg
+                )
                 for es_agg in es_aggs
             ]

@@ -1218,6 +1221,76 @@ class Operations:
             ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
         )

+    def to_csv(  # type: ignore
+        self,
+        query_compiler: "QueryCompiler",
+        path_or_buf=None,
+        header: bool = True,
+        mode: str = "w",
+        show_progress: bool = False,
+        **kwargs,
+    ) -> Optional[str]:
+        result = []
+        processed = 0
+        for i, df in enumerate(
+            self.search_yield_pandas_dataframes(query_compiler=query_compiler)
+        ):
+            processed += df.shape[0]
+            if show_progress and processed % DEFAULT_PROGRESS_REPORTING_NUM_ROWS == 0:
+                print(f"{datetime.now()}: read {processed} rows")
+            result.append(
+                df.to_csv(
+                    path_or_buf=path_or_buf,
+                    # start appending after the first batch
+                    mode=mode if i == 0 else "a",
+                    # only write the header for the first batch, if wanted at all
+                    header=header if i == 0 else False,
+                    **kwargs,
+                )
+            )
+        if path_or_buf is None:
+            return "".join(result)
+
+    def to_json(  # type: ignore
+        self,
+        query_compiler: "QueryCompiler",
+        path_or_buf=None,
+        orient=None,
+        lines=False,
+        **kwargs,
+    ):
+        if orient == "records" and lines is True:
+            result: List[str] = []
+            our_filehandle = False
+            if isinstance(path_or_buf, os.PathLike):
+                buf = open(path_or_buf, "w")
+                our_filehandle = True
+            elif isinstance(path_or_buf, str):
+                buf = open(path_or_buf, "w")
+                our_filehandle = True
+            else:
+                buf = path_or_buf
+            for i, df in enumerate(
+                self.search_yield_pandas_dataframes(query_compiler=query_compiler)
+            ):
+                output = df.to_json(
+                    orient=orient,
+                    lines=lines,
+                    **kwargs,
+                )
+                if buf is None:
+                    result.append(output)
+                else:
+                    buf.write(output)
+            # If we opened the file ourselves, we should close it
+            if our_filehandle:
+                buf.close()
+            return "".join(result) or None
+        else:
+            return self.to_pandas(query_compiler=query_compiler).to_json(
+                path_or_buf, orient=orient, lines=lines, **kwargs
+            )
+
     def to_pandas(
         self, query_compiler: "QueryCompiler", show_progress: bool = False
     ) -> pd.DataFrame:
@@ -1239,16 +1312,6 @@ class Operations:
             return query_compiler._empty_pd_ef()
         return pd.concat(df_list)

-    def to_csv(
-        self,
-        query_compiler: "QueryCompiler",
-        show_progress: bool = False,
-        **kwargs: Union[bool, str],
-    ) -> Optional[str]:
-        return self.to_pandas(  # type: ignore[no-any-return]
-            query_compiler=query_compiler, show_progress=show_progress
-        ).to_csv(**kwargs)
-
     def search_yield_pandas_dataframes(
         self, query_compiler: "QueryCompiler"
     ) -> Generator["pd.DataFrame", None, None]:
@@ -1480,6 +1543,24 @@ def quantile_to_percentile(quantile: Union[int, float]) -> float:
     return float(min(100, max(0, quantile * 100)))


+def is_field_already_present(
+    key: str, data: Union[Dict[str, Any], List[Dict[str, Any]]]
+) -> bool:
+    if "." in key:
+        splitted = key.split(".")
+        if isinstance(data, dict):
+            return is_field_already_present(
+                ".".join(splitted[1:]), data.get(splitted[0], {})
+            )
+        if isinstance(data, list):
+            return any(
+                is_field_already_present(".".join(splitted[1:]), x.get(splitted[0], {}))
+                for x in data
+            )
+    else:
+        return key in data
+
+
 def _search_yield_hits(
     query_compiler: "QueryCompiler",
     body: Dict[str, Any],
@@ -1537,10 +1618,24 @@ def _search_yield_hits(

     # Modify the search with the new point in time ID and keep-alive time.
     body["pit"] = {"id": pit_id, "keep_alive": DEFAULT_PIT_KEEP_ALIVE}
+    if isinstance(body["_source"], list):
+        body["fields"] = body["_source"]

     while max_number_of_hits is None or hits_yielded < max_number_of_hits:
         resp = client.search(**body)
-        hits: List[Dict[str, Any]] = resp["hits"]["hits"]
+        hits: List[Dict[str, Any]] = []
+        for hit in resp["hits"]["hits"]:
+            # Copy some of the fields to _source if they are missing there.
+            if "fields" in hit and "_source" in hit:
+                fields = hit["fields"]
+                del hit["fields"]
+                for k, v in fields.items():
+                    if not is_field_already_present(k, hit["_source"]):
+                        if isinstance(v, list):
+                            hit["_source"][k] = list(sorted(v))
+                        else:
+                            hit["_source"][k] = v
+            hits.append(hit)

     # The point in time ID can change between searches so we
     # need to keep the next search up-to-date
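A quick worked illustration (my own example, not part of the change) of the dotted-key lookup that is_field_already_present adds above — a key such as products.price.usd descends through nested dicts and through lists of dicts:

doc = {
    "geo": {"city": "Berlin"},
    "products": [{"price": {"usd": 10}}, {"price": {"usd": 20}}],
}
is_field_already_present("geo.city", doc)            # True: nested dict
is_field_already_present("products.price.usd", doc)  # True: list of dicts
is_field_already_present("geo.country", doc)         # False: leaf missing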
@@ -497,7 +497,7 @@ class QueryCompiler:
         return self._update_query(QueryFilter(query))

     # To/From Pandas
-    def to_pandas(self, show_progress: bool = False):
+    def to_pandas(self, show_progress: bool = False) -> pd.DataFrame:
         """Converts Eland DataFrame to Pandas DataFrame.

         Returns:
@@ -512,7 +512,15 @@ class QueryCompiler:
         Returns:
             If path_or_buf is None, returns the resulting csv format as a string. Otherwise returns None.
         """
-        return self._operations.to_csv(self, **kwargs)
+        return self._operations.to_csv(query_compiler=self, **kwargs)
+
+    def to_json(self, **kwargs) -> Optional[str]:
+        """Serialises Eland Dataframe to CSV
+
+        Returns:
+            If path_or_buf is None, returns the resulting json as a string.
+        """
+        return self._operations.to_json(query_compiler=self, **kwargs)

     def search_yield_pandas_dataframes(self) -> Generator["pd.DataFrame", None, None]:
         return self._operations.search_yield_pandas_dataframes(self)
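In use (a hedged sketch: the host URL and index name are placeholders, and the public entry point is the DataFrame method that delegates to the query compiler above), the streaming branch is taken only for orient="records" with lines=True; any other combination falls back to to_pandas():

import eland as ed

df = ed.DataFrame("http://localhost:9200", "flights")
# Streams each search batch straight to disk as JSON Lines:
df.to_json("flights.jsonl", orient="records", lines=True)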
@@ -40,11 +40,12 @@ from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union

 import numpy as np
 import pandas as pd  # type: ignore
+from pandas.core.indexes.frozen import FrozenList
 from pandas.io.common import _expand_user, stringify_path  # type: ignore

 import eland.plotting
 from eland.arithmetics import ArithmeticNumber, ArithmeticSeries, ArithmeticString
-from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
+from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
 from eland.filter import (
     BooleanFilter,
     Equal,
@@ -292,18 +293,26 @@ class Series(NDFrame):
         Examples
         --------
         >>> df = ed.DataFrame('http://localhost:9200', 'flights')
-        >>> df['Carrier'].value_counts()
+        >>> df['Carrier'].value_counts()  # doctest: +SKIP
+        Carrier
         Logstash Airways    3331
         JetBeats            3274
         Kibana Airlines     3234
         ES-Air              3220
-        Name: Carrier, dtype: int64
+        Name: count, dtype: int64
         """
         if not isinstance(es_size, int):
             raise TypeError("es_size must be a positive integer.")
         elif es_size <= 0:
             raise ValueError("es_size must be a positive integer.")
-        return self._query_compiler.value_counts(es_size)
+        value_counts = self._query_compiler.value_counts(es_size)
+        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#value-counts-sets-the-resulting-name-to-count
+        if PANDAS_VERSION[0] == 2:
+            value_counts.name = "count"
+            value_counts.index.names = FrozenList([self.es_field_name])
+            value_counts.index.name = self.es_field_name
+
+        return value_counts

         # dtype not implemented for Series as causes query to fail
         # in pandas.core.computation.ops.Term.type
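The pandas 2 renaming being compensated for above, shown in isolation (plain pandas, my own example):

import pandas as pd

s = pd.Series(["a", "a", "b"], name="Carrier")
vc = s.value_counts()
# On pandas >= 2 the result is named "count" and the index keeps the
# series name; on pandas 1.x the result itself was named "Carrier".
print(vc.name, vc.index.name)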
noxfile.py (63 changes)

@@ -16,7 +16,6 @@
 # under the License.

 import os
-import subprocess
 from pathlib import Path

 import nox
@@ -56,52 +55,48 @@ TYPED_FILES = (
 )


-@nox.session(reuse_venv=True)
+@nox.session(reuse_venv=True, python="3.11")
 def format(session):
-    session.install("black", "isort", "flynt")
+    session.install("black ~= 25.0", "isort", "flynt")
     session.run("python", "utils/license-headers.py", "fix", *SOURCE_FILES)
     session.run("flynt", *SOURCE_FILES)
-    session.run("black", "--target-version=py38", *SOURCE_FILES)
+    session.run("black", "--target-version=py39", *SOURCE_FILES)
     session.run("isort", "--profile=black", *SOURCE_FILES)
     lint(session)


-@nox.session(reuse_venv=True)
+@nox.session(reuse_venv=True, python="3.11")
 def lint(session):
     # Install numpy to use its mypy plugin
     # https://numpy.org/devdocs/reference/typing.html#mypy-plugin
-    session.install("black", "flake8", "mypy", "isort", "numpy")
-    session.install("--pre", "elasticsearch>=8.3,<9")
+    session.install("black ~= 25.0", "flake8", "mypy", "isort", "numpy")
     session.install(".")
     session.run("python", "utils/license-headers.py", "check", *SOURCE_FILES)
-    session.run("black", "--check", "--target-version=py38", *SOURCE_FILES)
+    session.run("black", "--check", "--target-version=py39", *SOURCE_FILES)
     session.run("isort", "--check", "--profile=black", *SOURCE_FILES)
-    session.run("flake8", "--ignore=E501,W503,E402,E712,E203", *SOURCE_FILES)
+    session.run("flake8", "--extend-ignore=E203,E402,E501,E704,E712", *SOURCE_FILES)

     # TODO: When all files are typed we can change this to .run("mypy", "--strict", "eland/")
     session.log("mypy --show-error-codes --strict eland/")
     for typed_file in TYPED_FILES:
         if not os.path.isfile(typed_file):
             session.error(f"The file {typed_file!r} couldn't be found")
-        process = subprocess.run(
-            ["mypy", "--show-error-codes", "--strict", typed_file],
-            env=session.env,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-        )
-        # Ensure that mypy itself ran successfully
-        assert process.returncode in (0, 1)
+    stdout = session.run(
+        "mypy",
+        "--show-error-codes",
+        "--strict",
+        *TYPED_FILES,
+        success_codes=(0, 1),
+        silent=True,
+    )

-    errors = []
-    for line in process.stdout.decode().split("\n"):
-        filepath = line.partition(":")[0]
-        if filepath in TYPED_FILES:
-            errors.append(line)
-    if errors:
-        session.error("\n" + "\n".join(sorted(set(errors))))
+    errors = []
+    for line in stdout.splitlines():
+        filepath = line.partition(":")[0]
+        if filepath in TYPED_FILES:
+            errors.append(line)
+    if errors:
+        session.error("\n" + "\n".join(sorted(set(errors))))


-@nox.session(python=["3.8", "3.9", "3.10"])
-@nox.parametrize("pandas_version", ["1.5.0"])
+@nox.session(python=["3.9", "3.10", "3.11", "3.12"])
+@nox.parametrize("pandas_version", ["1.5.0", "2.2.3"])
 def test(session, pandas_version: str):
     session.install("-r", "requirements-dev.txt")
     session.install(".")
@@ -112,6 +107,8 @@ def test(session, pandas_version: str):
         "python",
         "-m",
         "pytest",
+        "-ra",
+        "--tb=native",
         "--cov-report=term-missing",
         "--cov=eland/",
         "--cov-config=setup.cfg",
@@ -119,9 +116,6 @@ def test(session, pandas_version: str):
         "--nbval",
     )

-    # PyTorch doesn't support Python 3.11 yet
-    if session.python == "3.11":
-        pytest_args += ("--ignore=eland/ml/pytorch",)
     session.run(
         *pytest_args,
         *(session.posargs or ("eland/", "tests/")),
@@ -138,7 +132,6 @@ def test(session, pandas_version: str):
         "scikit-learn",
         "xgboost",
         "lightgbm",
-        "shap",
     )
     session.run("pytest", "tests/ml/")

@@ -148,8 +141,8 @@ def docs(session):
     # Run this so users get an error if they don't have Pandoc installed.
     session.run("pandoc", "--version", external=True)

-    session.install("-r", "docs/requirements-docs.txt")
     session.install(".")
+    session.install("-r", "docs/requirements-docs.txt")

     # See if we have an Elasticsearch cluster active
     # to rebuild the Jupyter notebooks with.
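The pattern the reworked lint session relies on, reduced to its essentials (the session name and file tuple here are hypothetical): with silent=True, nox's session.run returns the command's captured output, so mypy can run once over all typed files and its report can be filtered in-process rather than shelling out per file via subprocess:

import nox


@nox.session(reuse_venv=True)
def typecheck(session):  # hypothetical session, not part of eland's noxfile
    session.install("mypy")
    typed_files = ("eland/common.py",)  # illustrative subset
    out = session.run(
        "mypy", "--strict", *typed_files, success_codes=(0, 1), silent=True
    )
    errors = [ln for ln in out.splitlines() if ln.partition(":")[0] in typed_files]
    if errors:
        session.error("\n".join(sorted(set(errors))))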
@@ -1,28 +1,7 @@
 #
-# Basic requirements
+# Basic requirements with extras
 #
-elasticsearch>=8.3,<9
-pandas>=1.5,<2
-matplotlib>=3.6
-# Shap is incompatible with NumPy >= 1.24 (elastic/eland#539)
-# Fix NumPy to a known good range of versions
-numpy>=1.2.0,<1.24
-tqdm<5
-
-#
-# Extras
-#
-scikit-learn>=1.3,<2
-xgboost>=0.90,<2
-lightgbm>=2,<4
-
-# PyTorch doesn't support Python 3.11 yet (pytorch/pytorch#86566)
-
-# Elasticsearch uses v1.13.1 of PyTorch
-torch>=1.13.1,<2.0; python_version<'3.11'
-# Versions known to be compatible with PyTorch 1.13.1
-sentence-transformers>=2.1.0,<=2.2.2; python_version<'3.11'
-transformers[torch]>=4.12.0,<=4.27.4; python_version<'3.11'
+.[all]

 #
 # Testing
@@ -31,7 +10,6 @@ pytest>=5.2.1
 pytest-mock
 pytest-cov
 nbval
-shap==0.41.0

 #
 # Docs
@@ -1,9 +0,0 @@
-#
-# Basic requirements
-#
-elasticsearch>=8.3,<9
-pandas>=1.5,<2
-matplotlib>=3.6
-# Shap is incompatible with NumPy >= 1.24 (elastic/eland#539)
-# Fix NumPy to a known good range of versions
-numpy>=1.2.0,<1.24
setup.py (25 changes)

@@ -38,9 +38,10 @@ CLASSIFIERS = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
     "Topic :: Scientific/Engineering",
 ]

@@ -55,12 +56,16 @@ with open(path.join(here, "README.md"), "r", "utf-8") as f:

 extras = {
     "xgboost": ["xgboost>=0.90,<2"],
-    "scikit-learn": ["scikit-learn>=1.3,<2"],
-    "lightgbm": ["lightgbm>=2,<4"],
+    "scikit-learn": ["scikit-learn>=1.3,<1.6"],
+    "lightgbm": ["lightgbm>=3,<5"],
     "pytorch": [
-        "torch>=1.13.1,<2.0",
-        "sentence-transformers>=2.1.0,<=2.2.2",
-        "transformers[torch]>=4.31.0,<=4.33.2",
+        "requests<3",
+        "torch==2.5.1",
+        "tqdm",
+        "sentence-transformers>=2.1.0,<=2.7.0",
+        # sentencepiece is a required dependency for the slow tokenizers
+        # https://huggingface.co/transformers/v4.4.2/migration.html#sentencepiece-is-removed-from-the-required-dependencies
+        "transformers[sentencepiece]>=4.47.0",
     ],
 }
 extras["all"] = list({dep for deps in extras.values() for dep in deps})
@@ -81,16 +86,16 @@ setup(
     keywords="elastic eland pandas python",
     packages=find_packages(include=["eland", "eland.*"]),
     install_requires=[
-        "elasticsearch>=8.3,<9",
-        "pandas>=1.5,<2",
+        "elasticsearch>=9,<10",
+        "pandas>=1.5,<3",
         "matplotlib>=3.6",
-        "numpy>=1.2.0,<1.24",
+        "numpy>=1.2.0,<2",
+        "packaging",
     ],
     entry_points={
         "console_scripts": "eland_import_hub_model=eland.cli.eland_import_hub_model:main"
     },
-    python_requires=">=3.8",
+    python_requires=">=3.9,<3.13",
     package_data={"eland": ["py.typed"]},
     include_package_data=True,
     zip_safe=False,
@@ -20,7 +20,7 @@ import os
 import pandas as pd
 from elasticsearch import Elasticsearch

-from eland.common import es_version
+from eland.common import es_version, is_serverless_es

 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

@@ -33,6 +33,7 @@ ELASTICSEARCH_HOST = os.environ.get(
 ES_TEST_CLIENT = Elasticsearch(ELASTICSEARCH_HOST)

 ES_VERSION = es_version(ES_TEST_CLIENT)
+ES_IS_SERVERLESS = is_serverless_es(ES_TEST_CLIENT)

 FLIGHTS_INDEX_NAME = "flights"
 FLIGHTS_MAPPING = {
@@ -43,7 +44,7 @@ FLIGHTS_MAPPING = {
         "Carrier": {"type": "keyword"},
         "Dest": {"type": "keyword"},
         "DestAirportID": {"type": "keyword"},
-        "DestCityName": {"type": "keyword"},
+        "DestCityName": {"type": "keyword", "copy_to": "Cities"},
         "DestCountry": {"type": "keyword"},
         "DestLocation": {"type": "geo_point"},
         "DestRegion": {"type": "keyword"},
@@ -58,11 +59,12 @@ FLIGHTS_MAPPING = {
         "FlightTimeMin": {"type": "float"},
         "Origin": {"type": "keyword"},
         "OriginAirportID": {"type": "keyword"},
-        "OriginCityName": {"type": "keyword"},
+        "OriginCityName": {"type": "keyword", "copy_to": "Cities"},
         "OriginCountry": {"type": "keyword"},
         "OriginLocation": {"type": "geo_point"},
         "OriginRegion": {"type": "keyword"},
         "OriginWeather": {"type": "keyword"},
+        "Cities": {"type": "text"},
         "dayOfWeek": {"type": "byte"},
         "timestamp": {"type": "date", "format": "strict_date_hour_minute_second"},
     }
@@ -163,6 +165,31 @@ ECOMMERCE_MAPPING = {
 ECOMMERCE_FILE_NAME = ROOT_DIR + "/ecommerce.json.gz"
 ECOMMERCE_DF_FILE_NAME = ROOT_DIR + "/ecommerce_df.json.gz"

+NATIONAL_PARKS_INDEX_NAME = "national_parks"
+NATIONAL_PARKS_FILE_NAME = ROOT_DIR + "/national-parks.json.gz"
+NATIONAL_PARKS_MAPPING = {
+    "mappings": {
+        "properties": {
+            "id": {"type": "keyword"},
+            "title": {"type": "text"},
+            "description": {"type": "text"},
+            "nps_link": {"type": "text", "index": False},
+            "date_established": {"type": "date"},
+            "location": {"type": "geo_point"},
+            "states": {
+                "type": "text",
+                "fields": {
+                    "keyword": {"type": "keyword"},
+                },
+            },
+            "visitors": {"type": "integer"},
+            "world_heritage_site": {"type": "boolean"},
+            "acres": {"type": "float"},
+            "square_km": {"type": "float"},
+        }
+    }
+}
+
 TEST_MAPPING1 = {
     "mappings": {
         "properties": {
@@ -24,6 +24,7 @@ import pandas as pd
 from pandas.testing import assert_frame_equal, assert_series_equal

 import eland as ed
+from eland.common import PANDAS_VERSION

 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

@@ -45,7 +46,14 @@ with gzip.open(FLIGHTS_FILE_NAME) as f:
 _pd_flights = pd.DataFrame.from_records(flight_records).reindex(
     _ed_flights.columns, axis=1
 )
-_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
+if PANDAS_VERSION[0] >= 2:
+    _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"], format="mixed")
+else:
+    _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
+# Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list
+_pd_flights["Cities"] = _pd_flights.apply(
+    lambda x: list(sorted([x["OriginCityName"], x["DestCityName"]])), axis=1
+)
 _pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int

 _pd_flights_small = _pd_flights.head(48)
@@ -58,7 +66,7 @@ _pd_ecommerce["products.created_on"] = _pd_ecommerce["products.created_on"].appl
 )
 _pd_ecommerce.insert(2, "customer_birth_date", None)
 _pd_ecommerce.index = _pd_ecommerce.index.map(str)  # make index 'object' not int
-_pd_ecommerce["customer_birth_date"].astype("datetime64")
+_pd_ecommerce["customer_birth_date"].astype("datetime64[ns]")
 _ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)

@@ -77,7 +77,16 @@ class SymmetricAPIChecker:
                 pd_exc = e

             self.check_exception(ed_exc, pd_exc)
-            self.check_values(ed_obj, pd_obj)
+            try:
+                self.check_values(ed_obj, pd_obj)
+            except AssertionError as e:
+                # This is an attribute we allow to differ when comparing zero-length objects
+                if (
+                    'Attribute "inferred_type" are different' in repr(e)
+                    and len(ed_obj) == 0
+                    and len(pd_obj) == 0
+                ):
+                    self.check_values(ed_obj, pd_obj, check_index_type=False)

             if isinstance(ed_obj, (ed.DataFrame, ed.Series)):
                 return SymmetricAPIChecker(ed_obj, pd_obj)
@@ -85,16 +94,16 @@ class SymmetricAPIChecker:

         return f

-    def check_values(self, ed_obj, pd_obj):
+    def check_values(self, ed_obj, pd_obj, **kwargs):
         """Checks that any two values coming from eland and pandas are equal"""
         if isinstance(ed_obj, ed.DataFrame):
-            assert_pandas_eland_frame_equal(pd_obj, ed_obj)
+            assert_pandas_eland_frame_equal(pd_obj, ed_obj, **kwargs)
         elif isinstance(ed_obj, ed.Series):
-            assert_pandas_eland_series_equal(pd_obj, ed_obj)
+            assert_pandas_eland_series_equal(pd_obj, ed_obj, **kwargs)
         elif isinstance(ed_obj, pd.DataFrame):
-            assert_frame_equal(ed_obj, pd_obj)
+            assert_frame_equal(ed_obj, pd_obj, **kwargs)
         elif isinstance(ed_obj, pd.Series):
-            assert_series_equal(ed_obj, pd_obj)
+            assert_series_equal(ed_obj, pd_obj, **kwargs)
         elif isinstance(ed_obj, pd.Index):
             assert ed_obj.equals(pd_obj)
         else:
@@ -87,6 +87,8 @@ class TestDataFrameDateTime(TestData):
             },
             index=["0", "1", "2"],
         )
+        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
+        df["D"] = df["D"].astype("datetime64[ns]")

         expected_mappings = {
             "mappings": {
@@ -33,9 +33,17 @@ class TestDataFrameDescribe(TestData):
             ["Cancelled", "FlightDelay"], axis="columns"
         )

+        # Pandas >= 2 calculates aggregations such as min and max for timestamps too
+        # This could be implemented in eland, but as of yet this is not the case
+        # We therefore remove it before the comparison
+        if "timestamp" in pd_describe.columns:
+            pd_describe = pd_describe.drop(["timestamp"], axis="columns")
+
+        # Pandas >= 2 orders the aggregations differently than Pandas < 2
+        # A sort_index is applied so tests will succeed in both environments
         assert_frame_equal(
-            pd_describe.drop(["25%", "50%", "75%"], axis="index"),
-            ed_describe.drop(["25%", "50%", "75%"], axis="index"),
+            pd_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
+            ed_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
             check_exact=False,
             rtol=True,
         )
@@ -43,6 +43,7 @@ class TestDataFrameDtypes:
             "AvgTicketPrice": "float",
             "Cancelled": "boolean",
             "Carrier": "keyword",
+            "Cities": "text",
             "Dest": "keyword",
             "DestAirportID": "keyword",
             "DestCityName": "keyword",
@@ -23,6 +23,17 @@ from pandas.testing import assert_frame_equal, assert_index_equal, assert_series

 from tests.common import TestData

+PANDAS_MAJOR_VERSION = int(pd.__version__.split(".")[0])
+
+
+# The mean absolute difference (mad) aggregation has been removed from
+# pandas with major version 2:
+# https://github.com/pandas-dev/pandas/issues/11787
+# To compare whether eland's version of it works, we need to implement
+# it here ourselves.
+def mad(x):
+    return abs(x - x.mean()).mean()
+
+
 class TestGroupbyDataFrame(TestData):
     funcs = ["max", "min", "mean", "sum"]
@@ -71,7 +82,7 @@ class TestGroupbyDataFrame(TestData):
     @pytest.mark.parametrize("dropna", [True, False])
     @pytest.mark.parametrize("pd_agg", ["max", "min", "mean", "sum", "median"])
     def test_groupby_aggs_numeric_only_true(self, pd_agg, dropna):
-        # Pandas has numeric_only applicable for the above aggs with groupby only.
+        # Pandas has numeric_only applicable for the above aggs with groupby only.

         pd_flights = self.pd_flights().filter(self.filter_data)
         ed_flights = self.ed_flights().filter(self.filter_data)
@@ -95,7 +106,14 @@ class TestGroupbyDataFrame(TestData):
         pd_flights = self.pd_flights().filter(self.filter_data)
         ed_flights = self.ed_flights().filter(self.filter_data)

-        pd_groupby = getattr(pd_flights.groupby("Cancelled", dropna=dropna), pd_agg)()
+        # The mad aggregation has been removed in Pandas 2, so we need to use
+        # our own implementation if we run the tests with Pandas 2 or higher
+        if PANDAS_MAJOR_VERSION >= 2 and pd_agg == "mad":
+            pd_groupby = pd_flights.groupby("Cancelled", dropna=dropna).aggregate(mad)
+        else:
+            pd_groupby = getattr(
+                pd_flights.groupby("Cancelled", dropna=dropna), pd_agg
+            )()
         ed_groupby = getattr(ed_flights.groupby("Cancelled", dropna=dropna), pd_agg)(
             numeric_only=True
         )
@@ -211,14 +229,20 @@ class TestGroupbyDataFrame(TestData):
         pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"])
         ed_flights = self.ed_flights().filter(self.filter_data + ["DestCountry"])

-        pd_mad = pd_flights.groupby("DestCountry").mad()
+        if PANDAS_MAJOR_VERSION < 2:
+            pd_mad = pd_flights.groupby("DestCountry").mad()
+        else:
+            pd_mad = pd_flights.groupby("DestCountry").aggregate(mad)
         ed_mad = ed_flights.groupby("DestCountry").mad()

         assert_index_equal(pd_mad.columns, ed_mad.columns)
         assert_index_equal(pd_mad.index, ed_mad.index)
         assert_series_equal(pd_mad.dtypes, ed_mad.dtypes)

-        pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"])
+        if PANDAS_MAJOR_VERSION < 2:
+            pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"])
+        else:
+            pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", mad])
         ed_min_mad = ed_flights.groupby("DestCountry").aggregate(["min", "mad"])

         assert_index_equal(pd_min_mad.columns, ed_min_mad.columns)
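A worked check of the mad shim defined above (my own numbers, not from the test suite): for the values 1, 2 and 6 the mean is 3, the absolute deviations are 2, 1 and 3, so the mean absolute deviation is 2.

import pandas as pd


def mad(x):
    return abs(x - x.mean()).mean()


assert mad(pd.Series([1.0, 2.0, 6.0])) == 2.0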
@@ -99,7 +99,7 @@ class TestDataFrameHeadTail(TestData):

         ed_head_0 = ed_flights.head(0)
         pd_head_0 = pd_flights.head(0)
-        assert_pandas_eland_frame_equal(pd_head_0, ed_head_0)
+        assert_pandas_eland_frame_equal(pd_head_0, ed_head_0, check_index_type=False)

     def test_doc_test_tail(self):
         df = self.ed_flights()
@@ -54,9 +54,13 @@ class TestDataFrameIterrowsItertuples(TestData):
         # Shim which uses pytest.approx() for floating point values inside tuples.
         assert len(left) == len(right)
         assert all(
-            (lt == rt)  # Not floats? Use ==
-            if not isinstance(lt, float) and not isinstance(rt, float)
-            else (lt == pytest.approx(rt))  # If both are floats use pytest.approx()
+            (
+                # Not floats? Use ==
+                (lt == rt)
+                if not isinstance(lt, float) and not isinstance(rt, float)
+                # If both are floats use pytest.approx()
+                else (lt == pytest.approx(rt))
+            )
             for lt, rt in zip(left, right)
         )

@@ -22,6 +22,7 @@ import pandas as pd
 import pytest
 from pandas.testing import assert_frame_equal, assert_series_equal

+from eland.common import PANDAS_VERSION
 from tests.common import TestData, assert_almost_equal


@@ -74,6 +75,8 @@ class TestDataFrameMetrics(TestData):
         logger.setLevel(logging.DEBUG)

         for func in self.extended_funcs:
+            if PANDAS_VERSION[0] >= 2 and func == "mad":
+                continue
             pd_metric = getattr(pd_flights, func)(
                 **({"numeric_only": True} if func != "mad" else {})
             )
@@ -92,6 +95,8 @@ class TestDataFrameMetrics(TestData):
         ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]

         for func in self.extended_funcs:
+            if PANDAS_VERSION[0] >= 2 and func == "mad":
+                continue
             pd_metric = getattr(pd_flights_1, func)()
             ed_metric = getattr(ed_flights_1, func)(numeric_only=False)

@@ -102,6 +107,8 @@ class TestDataFrameMetrics(TestData):
         ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]

         for func in self.extended_funcs:
+            if PANDAS_VERSION[0] >= 2 and func == "mad":
+                continue
             pd_metric = getattr(pd_flights_0, func)()
             ed_metric = getattr(ed_flights_0, func)(numeric_only=False)

@@ -491,8 +498,13 @@ class TestDataFrameMetrics(TestData):
             ["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
         )

-        pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
-        ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
+        if PANDAS_VERSION[0] == 1:
+            pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
+            ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
+
+        else:  # numeric_only is no longer available for pandas > 2
+            pd_quantile = pd_flights.agg(["quantile", "min"])
+            ed_quantile = ed_flights.agg(["quantile", "min"])

         assert_frame_equal(
             pd_quantile, ed_quantile, check_exact=False, rtol=4, check_dtype=False
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.

-# File called _pytest for PyCharm compatability
+# File called _pytest for PyCharm compatibility

 import ast
 import time
@@ -41,8 +41,9 @@ class TestDataFrameToCSV(TestData):
             results_file,
             index_col=0,
             converters={
-                "DestLocation": lambda x: ast.literal_eval(x),
-                "OriginLocation": lambda x: ast.literal_eval(x),
+                "DestLocation": ast.literal_eval,
+                "OriginLocation": ast.literal_eval,
+                "Cities": ast.literal_eval,
             },
         )
         pd_from_csv.index = pd_from_csv.index.map(str)
@@ -63,8 +64,9 @@ class TestDataFrameToCSV(TestData):
             results_file,
             index_col=0,
             converters={
-                "DestLocation": lambda x: ast.literal_eval(x),
-                "OriginLocation": lambda x: ast.literal_eval(x),
+                "DestLocation": ast.literal_eval,
+                "OriginLocation": ast.literal_eval,
+                "Cities": ast.literal_eval,
             },
         )
         pd_from_csv.index = pd_from_csv.index.map(str)
@@ -112,8 +114,9 @@ class TestDataFrameToCSV(TestData):
             results,
             index_col=0,
             converters={
-                "DestLocation": lambda x: ast.literal_eval(x),
-                "OriginLocation": lambda x: ast.literal_eval(x),
+                "DestLocation": ast.literal_eval,
+                "OriginLocation": ast.literal_eval,
+                "Cities": ast.literal_eval,
             },
         )
         pd_from_csv.index = pd_from_csv.index.map(str)
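On the converters used above: ast.literal_eval safely parses the stringified Python lists that to_csv writes for the location and Cities columns back into real lists, e.g.:

import ast

ast.literal_eval("['Berlin', 'London']")  # -> ['Berlin', 'London']
ast.literal_eval("[53.63, 9.99]")         # -> [53.63, 9.99]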
tests/dataframe/test_to_json_pytest.py (new file, 139 lines)

@@ -0,0 +1,139 @@
+# Licensed to Elasticsearch B.V. under one or more contributor
+# license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright
+# ownership. Elasticsearch B.V. licenses this file to you under
+# the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# File called _pytest for PyCharm compatibility
+
+from io import StringIO
+from pathlib import Path
+
+import pandas
+from pandas.testing import assert_frame_equal
+
+from tests.common import ROOT_DIR, TestData
+
+
+class TestDataFrameToJSON(TestData):
+    def test_to_json_default_arguments(self):
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()
+        ed_flights.to_json(ROOT_DIR + "/dataframe/results/eland_to_json.jsonl")
+        pd_flights.to_json(ROOT_DIR + "/dataframe/results/pandas_to_json.jsonl")
+
+        assert_frame_equal(
+            pandas.read_json(ROOT_DIR + "/dataframe/results/eland_to_json.jsonl"),
+            pandas.read_json(ROOT_DIR + "/dataframe/results/pandas_to_json.jsonl"),
+        )
+
+    def test_to_json_streaming_mode(self):
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()
+        ed_flights.to_json(
+            ROOT_DIR + "/dataframe/results/streaming_eland_to_json.jsonl",
+            lines=True,
+            orient="records",
+        )
+        pd_flights.to_json(
+            ROOT_DIR + "/dataframe/results/streaming_pandas_to_json.jsonl",
+            lines=True,
+            orient="records",
+        )
+
+        assert_frame_equal(
+            pandas.read_json(
+                ROOT_DIR + "/dataframe/results/streaming_eland_to_json.jsonl",
+                lines=True,
+                orient="records",
+            ),
+            pandas.read_json(
+                ROOT_DIR + "/dataframe/results/streaming_pandas_to_json.jsonl",
+                lines=True,
+                orient="records",
+            ),
+        )
+
+    def test_to_json_streaming_mode_pathlib(self):
+        root_dir = Path(ROOT_DIR)
+
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()
+        ed_flights.to_json(
+            root_dir / "dataframe" / "results" / "pathlib_eland_to_json.jsonl",
+            lines=True,
+            orient="records",
+        )
+        pd_flights.to_json(
+            root_dir / "dataframe" / "results" / "pathlib_pandas_to_json.jsonl",
+            lines=True,
+            orient="records",
+        )
+
+        assert_frame_equal(
+            pandas.read_json(
+                root_dir / "dataframe" / "results" / "pathlib_eland_to_json.jsonl",
+                lines=True,
+                orient="records",
+            ),
+            pandas.read_json(
+                root_dir / "dataframe" / "results" / "pathlib_pandas_to_json.jsonl",
+                lines=True,
+                orient="records",
+            ),
+        )
+
+    def test_to_json_with_other_buffer(self):
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()
+        output_buffer = StringIO()
+        ed_flights.to_json(output_buffer, lines=True, orient="records")
+        output_string = pd_flights.to_json(lines=True, orient="records")
+
+        output_buffer.seek(0)  # rewind our StringIO object
+
+        assert_frame_equal(
+            pandas.read_json(output_buffer, lines=True, orient="records"),
+            pandas.read_json(
+                StringIO(output_string),
+                lines=True,
+                orient="records",
+            ),
+        )
+
+    def test_to_json_with_file_handle(self):
+        root_dir = Path(ROOT_DIR)
+
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()
+        with open(
+            root_dir / "dataframe" / "results" / "fh_eland_to_json.jsonl", "w"
+        ) as w:
+            ed_flights.to_json(w)
+        pd_flights.to_json(
+            root_dir / "dataframe" / "results" / "check_pandas_to_json.jsonl"
+        )
+
+        assert_frame_equal(
+            pandas.read_json(
+                ROOT_DIR + "/dataframe/results/fh_eland_to_json.jsonl",
+                lines=True,
+                orient="records",
+            ),
+            pandas.read_json(
+                ROOT_DIR + "/dataframe/results/check_pandas_to_json.jsonl",
+                lines=True,
+                orient="records",
+            ),
+        )
@@ -69,6 +69,12 @@ class TestDataFrameUtils(TestData):
         )
         ed_df_head = ed_df.head()

+        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
+        df["D"] = df["D"].astype("datetime64[ns]")
+        df["H"] = (
+            df["H"].dt.tz_localize(None).astype("datetime64[ns]").dt.tz_localize("UTC")
+        )
+
         assert_pandas_eland_frame_equal(df, ed_df_head)

         ES_TEST_CLIENT.indices.delete(index=index_name)
@@ -134,7 +134,8 @@ class TestPandasToEland:

         # Assert that the second pandas dataframe is actually appended
         assert df2.shape == (6, 4)
-        pd_df3 = pd_df.append(pd_df2)
+        # use the "private" append method that's still available in pandas 2.0
+        pd_df3 = pd_df._append(pd_df2)
         assert_pandas_eland_frame_equal(pd_df3, df2)

     def test_es_if_exists_append_mapping_mismatch_schema_enforcement(self):
tests/ml/ltr/test_feature_logger_pytest.py (new file, 79 lines)

@@ -0,0 +1,79 @@
+# Licensed to Elasticsearch B.V. under one or more contributor
+# license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright
+# ownership. Elasticsearch B.V. licenses this file to you under
+# the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import math
+
+from eland.ml.ltr import FeatureLogger, LTRModelConfig, QueryFeatureExtractor
+from tests import ES_TEST_CLIENT, NATIONAL_PARKS_INDEX_NAME
+
+
+class TestFeatureLogger:
+    def test_extract_feature(self):
+        # Create the feature logger and some document extract features for a query.
+        ltr_model_config = self._ltr_model_config()
+        feature_logger = FeatureLogger(
+            ES_TEST_CLIENT, NATIONAL_PARKS_INDEX_NAME, ltr_model_config
+        )
+
+        doc_ids = ["park_yosemite", "park_hawaii-volcanoes", "park_death-valley"]
+
+        doc_features = feature_logger.extract_features(
+            query_params={"query": "yosemite"}, doc_ids=doc_ids
+        )
+
+        # Assert all docs are presents.
+        assert len(doc_features) == len(doc_ids) and all(
+            doc_id in doc_ids for doc_id in doc_features.keys()
+        )
+
+        # Check all features are extracted for all docs
+        assert all(
+            len(features) == len(ltr_model_config.feature_extractors)
+            for features in doc_features.values()
+        )
+        print(doc_features)
+
+        # "park_yosemite" document matches for title and is a world heritage site
+        assert (
+            doc_features["park_yosemite"][0] > 0
+            and doc_features["park_yosemite"][1] > 1
+        )
+
+        # "park_hawaii-volcanoes" document does not matches for title but is a world heritage site
+        assert (
+            math.isnan(doc_features["park_hawaii-volcanoes"][0])
+            and doc_features["park_hawaii-volcanoes"][1] > 1
+        )
+
+        # "park_hawaii-volcanoes" document does not matches for title and is not a world heritage site
+        assert all(math.isnan(feature) for feature in doc_features["park_death-valley"])
+
+    def _ltr_model_config(self):
+        # Returns an LTR config with 2 query feature extractors:
+        # - title_bm25: BM25 score of the match query on the title field
+        # - popularity: Value of the popularity field
+        return LTRModelConfig(
+            [
+                QueryFeatureExtractor(
+                    feature_name="title_bm25", query={"match": {"title": "{{query}}"}}
+                ),
+                QueryFeatureExtractor(
+                    feature_name="world_heritage_site",
+                    query={"term": {"world_heritage_site": True}},
+                ),
+            ]
+        )
tests/ml/pytorch/test_pytorch_model_config_pytest.py (new file, 257 lines)

@@ -0,0 +1,257 @@
+# Licensed to Elasticsearch B.V. under one or more contributor
+# license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright
+# ownership. Elasticsearch B.V. licenses this file to you under
+# the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tempfile
+
+import pytest
+
+try:
+    import sklearn  # noqa: F401
+
+    HAS_SKLEARN = True
+except ImportError:
+    HAS_SKLEARN = False
+
+try:
+    from eland.ml.pytorch.transformers import TransformerModel
+
+    HAS_TRANSFORMERS = True
+except ImportError:
+    HAS_TRANSFORMERS = False
+
+try:
+    import torch  # noqa: F401
+
+    from eland.ml.pytorch import (
+        FillMaskInferenceOptions,
+        NlpBertTokenizationConfig,
+        NlpDebertaV2TokenizationConfig,
+        NlpMPNetTokenizationConfig,
+        NlpRobertaTokenizationConfig,
+        NlpXLMRobertaTokenizationConfig,
+        QuestionAnsweringInferenceOptions,
+        TextClassificationInferenceOptions,
+        TextEmbeddingInferenceOptions,
+        TextSimilarityInferenceOptions,
+        ZeroShotClassificationInferenceOptions,
+    )
+
+    HAS_PYTORCH = True
+except ImportError:
+    HAS_PYTORCH = False
+
+
+from tests import ES_VERSION
+
+pytestmark = [
+    pytest.mark.skipif(
+        not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
+    ),
+    pytest.mark.skipif(
+        not HAS_TRANSFORMERS, reason="This test requires 'transformers' package to run"
+    ),
+    pytest.mark.skipif(
+        not HAS_PYTORCH, reason="This test requires 'torch' package to run"
+    ),
+]
+
+# If the required imports are missing the test will be skipped.
+# Only define th test configurations if the referenced classes
+# have been imported
+if HAS_PYTORCH and HAS_SKLEARN and HAS_TRANSFORMERS:
+    MODEL_CONFIGURATIONS = [
+        (
+            "sentence-transformers/all-distilroberta-v1",
+            "text_embedding",
+            TextEmbeddingInferenceOptions,
+            NlpRobertaTokenizationConfig,
+            512,
+            768,
+        ),
+        (
+            "intfloat/multilingual-e5-small",
+            "text_embedding",
+            TextEmbeddingInferenceOptions,
+            NlpXLMRobertaTokenizationConfig,
+            512,
+            384,
+        ),
+        (
+            "sentence-transformers/all-mpnet-base-v2",
+            "text_embedding",
+            TextEmbeddingInferenceOptions,
+            NlpMPNetTokenizationConfig,
+            512,
+            768,
+        ),
+        (
+            "facebook/dpr-ctx_encoder-multiset-base",
+            "text_embedding",
+            TextEmbeddingInferenceOptions,
+            NlpBertTokenizationConfig,
+            512,
+            768,
+        ),
+        (
+            "distilbert-base-uncased",
+            "fill_mask",
+            FillMaskInferenceOptions,
+            NlpBertTokenizationConfig,
+            512,
+            None,
+        ),
+        (
+            "SamLowe/roberta-base-go_emotions",
+            "text_classification",
+            TextClassificationInferenceOptions,
+            NlpRobertaTokenizationConfig,
+            512,
+            None,
+        ),
+        (
+            "distilbert-base-cased-distilled-squad",
+            "question_answering",
+            QuestionAnsweringInferenceOptions,
+            NlpBertTokenizationConfig,
+            386,
+            None,
+        ),
+        (
+            "cross-encoder/ms-marco-TinyBERT-L-2-v2",
+            "text_similarity",
+            TextSimilarityInferenceOptions,
+            NlpBertTokenizationConfig,
+            512,
+            None,
+        ),
+        (
+            "valhalla/distilbart-mnli-12-6",
+            "zero_shot_classification",
+            ZeroShotClassificationInferenceOptions,
+            NlpRobertaTokenizationConfig,
+            1024,
+            None,
+        ),
+        (
+            "microsoft/deberta-v3-xsmall",
+            "fill_mask",
+            FillMaskInferenceOptions,
+            NlpDebertaV2TokenizationConfig,
+            512,
+            None,
+        ),
+    ]
+else:
+    MODEL_CONFIGURATIONS = []
+
+
+class TestModelConfguration:
+    @pytest.mark.parametrize(
+        "model_id,task_type,config_type,tokenizer_type,max_sequence_len,embedding_size",
+        MODEL_CONFIGURATIONS,
+    )
+    def test_model_config(
+        self,
+        model_id,
+        task_type,
+        config_type,
+        tokenizer_type,
+        max_sequence_len,
+        embedding_size,
+    ):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tm = TransformerModel(
+                model_id=model_id,
+                task_type=task_type,
+                es_version=ES_VERSION,
+                quantize=False,
+            )
+            _, config, _ = tm.save(tmp_dir)
+            assert "pytorch" == config.model_type
+            assert ["text_field"] == config.input.field_names
+            assert isinstance(config.inference_config, config_type)
+            tokenization = config.inference_config.tokenization
+            assert isinstance(config.metadata, dict)
+            assert config.prefix_strings is None
+            assert (
+                "per_deployment_memory_bytes" in config.metadata
+                and config.metadata["per_deployment_memory_bytes"] > 0
+            )
+            assert (
+                "per_allocation_memory_bytes" in config.metadata
+                and config.metadata["per_allocation_memory_bytes"] > 0
+            )
+            assert isinstance(tokenization, tokenizer_type)
+            assert max_sequence_len == tokenization.max_sequence_length
+
+            if task_type == "text_classification":
+                assert isinstance(config.inference_config.classification_labels, list)
+                assert len(config.inference_config.classification_labels) > 0
+
+            if task_type == "text_embedding":
+                assert embedding_size == config.inference_config.embedding_size
+
+            if task_type == "question_answering":
+                assert tokenization.truncate == "none"
+                assert tokenization.span > 0
+
+            if task_type == "zero_shot_classification":
+                assert isinstance(config.inference_config.classification_labels, list)
+                assert len(config.inference_config.classification_labels) > 0
+
+            if task_type == "text_similarity":
+                assert tokenization.truncate == "second"
+
+            del tm
+
+    def test_model_config_with_prefix_string(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tm = TransformerModel(
+                model_id="sentence-transformers/all-distilroberta-v1",
+                task_type="text_embedding",
+                es_version=(8, 12, 0),
+                quantize=False,
+                ingest_prefix="INGEST:",
+                search_prefix="SEARCH:",
+            )
+            _, config, _ = tm.save(tmp_dir)
+            assert config.prefix_strings.to_dict()["ingest"] == "INGEST:"
+            assert config.prefix_strings.to_dict()["search"] == "SEARCH:"
+
+    def test_model_config_with_prefix_string_not_supported(self):
+        with pytest.raises(Exception):
+            TransformerModel(
+                model_id="sentence-transformers/all-distilroberta-v1",
+                task_type="text_embedding",
+                es_version=(8, 11, 0),
+                quantize=False,
+                ingest_prefix="INGEST:",
+                search_prefix="SEARCH:",
+            )
+
+    def test_model_config_with_user_specified_input_length(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tm = TransformerModel(
+                model_id="sentence-transformers/all-distilroberta-v1",
+                task_type="text_embedding",
+                es_version=(8, 13, 0),
+                quantize=False,
+                max_model_input_size=213,
+            )
+            _, config, _ = tm.save(tmp_dir)
+            tokenization = config.inference_config.tokenization
+            assert tokenization.max_sequence_length == 213
@@ -14,6 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import platform
 import tempfile
 
 import pytest
@@ -37,10 +38,6 @@ except ImportError:
 from tests import ES_TEST_CLIENT, ES_VERSION
 
 pytestmark = [
-    pytest.mark.skipif(
-        ES_VERSION < (8, 7, 0),
-        reason="Eland uses Pytorch 1.13.1, versions of Elasticsearch prior to 8.7.0 are incompatible with PyTorch 1.13.1",
-    ),
     pytest.mark.skipif(
         not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
     ),
@@ -66,6 +63,8 @@ TEXT_EMBEDDING_MODELS = [
     )
 ]
 
+TEXT_SIMILARITY_MODELS = ["mixedbread-ai/mxbai-rerank-xsmall-v1"]
+
 
 @pytest.fixture(scope="function", autouse=True)
 def setup_and_tear_down():
@@ -82,6 +81,14 @@ def setup_and_tear_down():
         pass
 
 
+@pytest.fixture(scope="session")
+def quantize():
+    # quantization does not work on ARM processors
+    # TODO: It seems that PyTorch 2.0 supports OneDNN for aarch64. We should
+    # revisit this when we upgrade to PyTorch 2.0.
+    return platform.machine() not in ["arm64", "aarch64"]
+
+
 def download_model_and_start_deployment(tmp_dir, quantize, model_id, task):
     print("Loading HuggingFace transformer tokenizer and model")
     tm = TransformerModel(
@@ -103,31 +110,17 @@ def download_model_and_start_deployment(tmp_dir, quantize, model_id, task):
 
 
 class TestPytorchModel:
-    def __init__(self):
-        # quantization does not work on ARM processors
-        # TODO: It seems that PyTorch 2.0 supports OneDNN for aarch64. We should
-        # revisit this when we upgrade to PyTorch 2.0.
-        import platform
-
-        self.quantize = (
-            True if platform.machine() not in ["arm64", "aarch64"] else False
-        )
-
     @pytest.mark.parametrize("model_id,task,text_input,value", TEXT_PREDICTION_MODELS)
-    def test_text_prediction(self, model_id, task, text_input, value):
+    def test_text_prediction(self, model_id, task, text_input, value, quantize):
         with tempfile.TemporaryDirectory() as tmp_dir:
-            ptm = download_model_and_start_deployment(
-                tmp_dir, self.quantize, model_id, task
-            )
-            result = ptm.infer(docs=[{"text_field": text_input}])
-            assert result["predicted_value"] == value
+            ptm = download_model_and_start_deployment(tmp_dir, quantize, model_id, task)
+            results = ptm.infer(docs=[{"text_field": text_input}])
+            assert results.body["inference_results"][0]["predicted_value"] == value
 
     @pytest.mark.parametrize("model_id,task,text_input", TEXT_EMBEDDING_MODELS)
-    def test_text_embedding(self, model_id, task, text_input):
+    def test_text_embedding(self, model_id, task, text_input, quantize):
         with tempfile.TemporaryDirectory() as tmp_dir:
-            ptm = download_model_and_start_deployment(
-                tmp_dir, self.quantize, model_id, task
-            )
+            ptm = download_model_and_start_deployment(tmp_dir, quantize, model_id, task)
             ptm.infer(docs=[{"text_field": text_input}])
 
         if ES_VERSION >= (8, 8, 0):
@@ -140,3 +133,25 @@ class TestPytorchModel:
                 )
                 > 0
             )
+
+    @pytest.mark.skipif(
+        ES_VERSION < (8, 16, 0), reason="requires 8.16.0 for DeBERTa models"
+    )
+    @pytest.mark.parametrize("model_id", TEXT_SIMILARITY_MODELS)
+    def test_text_similarity(self, model_id):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            ptm = download_model_and_start_deployment(
+                tmp_dir, False, model_id, "text_similarity"
+            )
+            result = ptm.infer(
+                docs=[
+                    {
+                        "text_field": "The Amazon rainforest covers most of the Amazon basin in South America"
+                    },
+                    {"text_field": "Paris is the capital of France"},
+                ],
+                inference_config={"text_similarity": {"text": "France"}},
+            )
+
+            assert result.body["inference_results"][0]["predicted_value"] < 0
+            assert result.body["inference_results"][1]["predicted_value"] > 0
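Note on the diff above: pytest does not collect test classes that define __init__, so the old TestPytorchModel.__init__ never actually ran; the commit moves the shared flag into a session-scoped fixture that pytest resolves by argument name. A minimal standalone sketch of the pattern (not part of the diff; names are illustrative):

    import platform

    import pytest


    @pytest.fixture(scope="session")
    def quantize():
        # Evaluated once per test session, shared by every test that requests it.
        return platform.machine() not in ["arm64", "aarch64"]


    class TestExample:
        def test_uses_fixture(self, quantize):
            # pytest injects the fixture's return value by matching the argument name.
            assert isinstance(quantize, bool)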
@@ -24,13 +24,6 @@ import numpy as np
 import pytest
 from elasticsearch import NotFoundError
 
-try:
-    import sklearn  # noqa: F401
-
-    HAS_SKLEARN = True
-except ImportError:
-    HAS_SKLEARN = False
-
 try:
     import torch  # noqa: F401
     from torch import Tensor, nn  # noqa: F401
@@ -67,9 +60,6 @@ pytestmark = [
         ES_VERSION < (8, 0, 0),
         reason="This test requires at least Elasticsearch version 8.0.0",
    ),
-    pytest.mark.skipif(
-        not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
-    ),
     pytest.mark.skipif(
         not HAS_PYTORCH, reason="This test requires 'pytorch' package to run"
     ),
@@ -15,14 +15,20 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from operator import itemgetter
+from typing import Tuple
 
 import numpy as np
 import pytest
 
-import eland as ed
 from eland.ml import MLModel
-from tests import ES_TEST_CLIENT, ES_VERSION, FLIGHTS_SMALL_INDEX_NAME
+from eland.ml.ltr import FeatureLogger, LTRModelConfig, QueryFeatureExtractor
+from eland.ml.transformers import get_model_transformer
+from tests import (
+    ES_IS_SERVERLESS,
+    ES_TEST_CLIENT,
+    ES_VERSION,
+    NATIONAL_PARKS_INDEX_NAME,
+)
 
 try:
     from sklearn import datasets
@@ -34,7 +40,7 @@ except ImportError:
     HAS_SKLEARN = False
 
 try:
-    from xgboost import XGBClassifier, XGBRegressor
+    from xgboost import XGBClassifier, XGBRanker, XGBRegressor
 
     HAS_XGBOOST = True
 except ImportError:
@@ -47,33 +53,30 @@ try:
 except ImportError:
     HAS_LIGHTGBM = False
 
-try:
-    import shap
-
-    HAS_SHAP = True
-except ImportError:
-    HAS_SHAP = False
-
 
 requires_sklearn = pytest.mark.skipif(
-    not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run."
+    not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
 )
 requires_xgboost = pytest.mark.skipif(
-    not HAS_XGBOOST, reason="This test requires 'xgboost' package to run."
-)
-requires_shap = pytest.mark.skipif(
-    not HAS_SHAP, reason="This tests requries 'shap' package to run."
+    not HAS_XGBOOST, reason="This test requires 'xgboost' package to run"
 )
 requires_no_ml_extras = pytest.mark.skipif(
     HAS_SKLEARN or HAS_XGBOOST,
-    reason="This test requires 'scikit-learn' and 'xgboost' to not be installed.",
+    reason="This test requires 'scikit-learn' and 'xgboost' to not be installed",
 )
 
 requires_lightgbm = pytest.mark.skipif(
-    not HAS_LIGHTGBM, reason="This test requires 'lightgbm' package to run"
+    not HAS_LIGHTGBM, reason="This test requires 'lightgbm' package to run."
 )
 
 
+def requires_elasticsearch_version(minimum_version: Tuple[int, int, int]):
+    return pytest.mark.skipif(
+        ES_VERSION < minimum_version,
+        reason=f"This test requires Elasticsearch version {'.'.join(str(v) for v in minimum_version)} or later.",
+    )
+
+
 def skip_if_multiclass_classifition():
     if ES_VERSION < (7, 7):
         raise pytest.skip(
@@ -93,100 +96,11 @@ def check_prediction_equality(es_model: MLModel, py_model, test_data):
     np.testing.assert_almost_equal(test_results, es_results, decimal=2)
 
 
-def yield_model_id(analysis, analyzed_fields):
+def randomize_model_id(prefix, suffix_size=10):
     import random
     import string
-    import time
 
-    suffix = "".join(random.choices(string.ascii_lowercase, k=4))
-    job_id = "test-flights-regression-" + suffix
-    dest = job_id + "-dest"
-
-    response = ES_TEST_CLIENT.ml.put_data_frame_analytics(
-        id=job_id,
-        analysis=analysis,
-        dest={"index": dest},
-        source={"index": [FLIGHTS_SMALL_INDEX_NAME]},
-        analyzed_fields=analyzed_fields,
-    )
-    assert response.meta.status == 200
-    response = ES_TEST_CLIENT.ml.start_data_frame_analytics(id=job_id)
-    assert response.meta.status == 200
-
-    time.sleep(2)
-    response = ES_TEST_CLIENT.ml.get_trained_models(model_id=job_id + "*")
-    assert response.meta.status == 200
-    assert response.body["count"] == 1
-    model_id = response.body["trained_model_configs"][0]["model_id"]
-
-    yield model_id
-
-    ES_TEST_CLIENT.ml.delete_data_frame_analytics(id=job_id)
-    ES_TEST_CLIENT.indices.delete(index=dest)
-    ES_TEST_CLIENT.ml.delete_trained_model(model_id=model_id)
-
-
-@pytest.fixture(params=[[0, 4], [0, 1], range(5)])
-def regression_model_id(request):
-    analysis = {
-        "regression": {
-            "dependent_variable": "FlightDelayMin",
-            "max_trees": 3,
-            "num_top_feature_importance_values": 0,
-            "max_optimization_rounds_per_hyperparameter": 1,
-            "prediction_field_name": "FlightDelayMin_prediction",
-            "training_percent": 30,
-            "randomize_seed": 1000,
-            "loss_function": "mse",
-            "early_stopping_enabled": True,
-        }
-    }
-    all_includes = [
-        "FlightDelayMin",
-        "FlightDelayType",
-        "FlightTimeMin",
-        "DistanceMiles",
-        "OriginAirportID",
-    ]
-    includes = [all_includes[i] for i in request.param]
-    analyzed_fields = {
-        "includes": includes,
-        "excludes": [],
-    }
-    yield from yield_model_id(analysis=analysis, analyzed_fields=analyzed_fields)
-
-
-@pytest.fixture(params=[[0, 6], [5, 6], range(7)])
-def classification_model_id(request):
-    analysis = {
-        "classification": {
-            "dependent_variable": "Cancelled",
-            "max_trees": 5,
-            "num_top_feature_importance_values": 0,
-            "max_optimization_rounds_per_hyperparameter": 1,
-            "prediction_field_name": "Cancelled_prediction",
-            "training_percent": 50,
-            "randomize_seed": 1000,
-            "num_top_classes": -1,
-            "class_assignment_objective": "maximize_accuracy",
-            "early_stopping_enabled": True,
-        }
-    }
-    all_includes = [
-        "OriginWeather",
-        "OriginAirportID",
-        "DestCityName",
-        "DestWeather",
-        "DestRegion",
-        "AvgTicketPrice",
-        "Cancelled",
-    ]
-    includes = [all_includes[i] for i in request.param]
-    analyzed_fields = {
-        "includes": includes,
-        "excludes": [],
-    }
-    yield from yield_model_id(analysis=analysis, analyzed_fields=analyzed_fields)
+    return f"{prefix}-{''.join(random.choices(string.ascii_lowercase, k=suffix_size))}"
 
 
 class TestMLModel:
@@ -306,6 +220,176 @@ class TestMLModel:
         # Clean up
         es_model.delete_model()
 
+    def _normalize_ltr_score_from_XGBRanker(self, ranker, ltr_model_config, scores):
+        """Normalize the scores of an XGBRanker model as the ES implementation of LTR would do.
+
+        Parameters
+        ----------
+        ranker : XGBRanker
+            The XGBRanker model to retrieve the minimum score from.
+
+        ltr_model_config : LTRModelConfig
+            LTR model config.
+
+        scores : List[float]
+            Scores to normalize.
+
+        Returns
+        -------
+        scores : List[float]
+            Normalized scores for the model.
+        """
+
+        should_rescore = (
+            (ES_VERSION[0] == 8 and ES_VERSION >= (8, 19))
+            or (
+                ES_VERSION[0] == 9
+                and (ES_VERSION[1] >= 1 or (ES_VERSION[1] == 0 and ES_VERSION[2] >= 1))
+            )
+            or ES_IS_SERVERLESS
+        )
+
+        if should_rescore:
+            # In 8.19+, 9.0.1 and 9.1, the scores are normalized if there are negative scores
+            min_model_score, _ = (
+                get_model_transformer(
+                    ranker, feature_names=ltr_model_config.feature_names
+                )
+                .transform()
+                .bounds()
+            )
+            if min_model_score < 0:
+                scores = [score - min_model_score for score in scores]
+
+        return scores
+
+    @requires_elasticsearch_version((8, 12))
+    @requires_xgboost
+    @pytest.mark.parametrize("compress_model_definition", [True, False])
+    @pytest.mark.parametrize(
+        "objective",
+        ["rank:ndcg", "rank:map", "rank:pairwise"],
+    )
+    def test_learning_to_rank(self, objective, compress_model_definition):
+        X, y = datasets.make_classification(
+            n_features=3, n_informative=2, n_redundant=1
+        )
+        rng = np.random.default_rng()
+        qid = rng.integers(0, 3, size=X.shape[0])
+
+        # Sort the inputs based on query index
+        sorted_idx = np.argsort(qid)
+        X = X[sorted_idx, :]
+        y = y[sorted_idx]
+        qid = qid[sorted_idx]
+
+        ranker = XGBRanker(objective=objective)
+        ranker.fit(X, y, qid=qid)
+
+        # Serialise the models to Elasticsearch
+        model_id = randomize_model_id("test_learning_to_rank")
+        ltr_model_config = LTRModelConfig(
+            feature_extractors=[
+                QueryFeatureExtractor(
+                    feature_name="title_bm25",
+                    query={"match": {"title": "{{query_string}}"}},
+                ),
+                QueryFeatureExtractor(
+                    feature_name="description_bm25",
+                    query={"match": {"description_bm25": "{{query_string}}"}},
+                ),
+                QueryFeatureExtractor(
+                    feature_name="visitors",
+                    query={
+                        "script_score": {
+                            "query": {"exists": {"field": "visitors"}},
+                            "script": {"source": 'return doc["visitors"].value;'},
+                        }
+                    },
+                ),
+            ]
+        )
+
+        es_model = MLModel.import_ltr_model(
+            ES_TEST_CLIENT,
+            model_id,
+            ranker,
+            ltr_model_config,
+            es_compress_model_definition=compress_model_definition,
+        )
+
+        # Verify the saved inference config contains the passed LTR config
+        response = ES_TEST_CLIENT.ml.get_trained_models(model_id=model_id)
+        assert response.meta.status == 200
+        assert response.body["count"] == 1
+
+        saved_trained_model_config = response.body["trained_model_configs"][0]
+
+        assert "input" in saved_trained_model_config
+        assert "field_names" in saved_trained_model_config["input"]
+
+        if not ES_IS_SERVERLESS and ES_VERSION < (8, 15):
+            assert len(saved_trained_model_config["input"]["field_names"]) == 3
+        else:
+            assert not len(saved_trained_model_config["input"]["field_names"])
+
+        saved_inference_config = saved_trained_model_config["inference_config"]
+
+        assert "learning_to_rank" in saved_inference_config
+        assert "feature_extractors" in saved_inference_config["learning_to_rank"]
+        saved_feature_extractors = saved_inference_config["learning_to_rank"][
+            "feature_extractors"
+        ]
+
+        assert all(
+            feature_extractor.to_dict() in saved_feature_extractors
+            for feature_extractor in ltr_model_config.feature_extractors
+        )
+
+        # Execute search with rescoring
+        search_result = ES_TEST_CLIENT.search(
+            index=NATIONAL_PARKS_INDEX_NAME,
+            query={"terms": {"_id": ["park_yosemite", "park_everglades"]}},
+            rescore={
+                "learning_to_rank": {
+                    "model_id": model_id,
+                    "params": {"query_string": "yosemite"},
+                },
+                "window_size": 2,
+            },
+        )
+
+        # Assert that the rescored search results match the prediction.
+        doc_scores = [hit["_score"] for hit in search_result["hits"]["hits"]]
+
+        feature_logger = FeatureLogger(
+            ES_TEST_CLIENT, NATIONAL_PARKS_INDEX_NAME, ltr_model_config
+        )
+        expected_scores = sorted(
+            [
+                ranker.predict(np.asarray([doc_features]))[0]
+                for _, doc_features in feature_logger.extract_features(
+                    {"query_string": "yosemite"}, ["park_yosemite", "park_everglades"]
+                ).items()
+            ],
+            reverse=True,
+        )
+
+        expected_scores = self._normalize_ltr_score_from_XGBRanker(
+            ranker, ltr_model_config, expected_scores
+        )
+
+        np.testing.assert_almost_equal(expected_scores, doc_scores, decimal=2)
+
+        # Verify prediction is not supported for LTR
+        try:
+            es_model.predict([0])
+        except NotImplementedError:
+            pass
+
+        # Clean up
+        ES_TEST_CLIENT.cluster.health(
+            index=".ml-*", wait_for_active_shards="all"
+        )  # Added to prevent flakiness in the test
+        es_model.delete_model()
+
     @requires_sklearn
     @pytest.mark.parametrize("compress_model_definition", [True, False])
     def test_random_forest_classifier(self, compress_model_definition):
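For readers unfamiliar with the LTR API exercised above: each QueryFeatureExtractor maps a templated Elasticsearch query to one model feature, and the whole config is passed alongside the ranker at import time. A minimal sketch under assumed placeholder names (the "title"/"body" fields, "my-ltr-model" id, and the es_client connection are illustrative, not from the diff):

    import numpy as np
    from xgboost import XGBRanker

    from eland.ml import MLModel
    from eland.ml.ltr import LTRModelConfig, QueryFeatureExtractor

    # Train a tiny ranker; rows must be grouped (sorted) by query id.
    X = np.random.rand(20, 2)
    y = np.random.randint(0, 2, size=20)
    qid = np.sort(np.random.randint(0, 3, size=20))
    ranker = XGBRanker(objective="rank:ndcg")
    ranker.fit(X, y, qid=qid)

    # One extractor per model feature, in feature order.
    ltr_config = LTRModelConfig(
        feature_extractors=[
            QueryFeatureExtractor(
                feature_name="title_bm25",
                query={"match": {"title": "{{query_string}}"}},
            ),
            QueryFeatureExtractor(
                feature_name="body_bm25",
                query={"match": {"body": "{{query_string}}"}},
            ),
        ]
    )

    # es_client would be an elasticsearch.Elasticsearch instance:
    # MLModel.import_ltr_model(es_client, "my-ltr-model", ranker, ltr_config)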
@@ -332,6 +416,7 @@ class TestMLModel:
         )
 
         # Clean up
+
         es_model.delete_model()
 
     @requires_sklearn
@@ -452,6 +537,45 @@ class TestMLModel:
         # Clean up
         es_model.delete_model()
 
+    @requires_xgboost
+    @pytest.mark.parametrize("compress_model_definition", [True, False])
+    @pytest.mark.parametrize(
+        "objective",
+        ["rank:ndcg", "rank:map", "rank:pairwise"],
+    )
+    def test_xgb_ranker(self, compress_model_definition, objective):
+        X, y = datasets.make_classification(n_features=5)
+        rng = np.random.default_rng()
+        qid = rng.integers(0, 3, size=X.shape[0])
+
+        # Sort the inputs based on query index
+        sorted_idx = np.argsort(qid)
+        X = X[sorted_idx, :]
+        y = y[sorted_idx]
+        qid = qid[sorted_idx]
+
+        ranker = XGBRanker(objective=objective)
+        ranker.fit(X, y, qid=qid)
+
+        # Serialise the models to Elasticsearch
+        feature_names = ["f0", "f1", "f2", "f3", "f4"]
+        model_id = "test_xgb_ranker"
+
+        es_model = MLModel.import_model(
+            ES_TEST_CLIENT,
+            model_id,
+            ranker,
+            feature_names,
+            es_if_exists="replace",
+            es_compress_model_definition=compress_model_definition,
+        )
+
+        # Get some test results
+        check_prediction_equality(es_model, ranker, random_rows(X, 20))
+
+        # Clean up
+        es_model.delete_model()
+
     @requires_xgboost
     @pytest.mark.parametrize("compress_model_definition", [True, False])
     @pytest.mark.parametrize(
@@ -603,172 +727,3 @@ class TestMLModel:
 
         # Clean up
         es_model.delete_model()
-
-    @requires_sklearn
-    @requires_shap
-    def test_export_regressor(self, regression_model_id):
-        ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME).head(10)
-        types = dict(ed_flights.dtypes)
-        X = ed_flights.to_pandas().astype(types)
-
-        model = MLModel(es_client=ES_TEST_CLIENT, model_id=regression_model_id)
-        pipeline = model.export_model()
-        pipeline.fit(X)
-
-        predictions_sklearn = pipeline.predict(
-            X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
-        )
-        response = ES_TEST_CLIENT.ml.infer_trained_model(
-            model_id=regression_model_id,
-            docs=X[pipeline["es_model"].input_field_names].to_dict("records"),
-        )
-        predictions_es = np.array(
-            list(
-                map(
-                    itemgetter("FlightDelayMin_prediction"),
-                    response.body["inference_results"],
-                )
-            )
-        )
-        np.testing.assert_array_almost_equal(predictions_sklearn, predictions_es)
-
-        import pandas as pd
-
-        X_transformed = pipeline["preprocessor"].transform(X=X)
-        X_transformed = pd.DataFrame(
-            X_transformed, columns=pipeline["preprocessor"].get_feature_names_out()
-        )
-        explainer = shap.TreeExplainer(pipeline["es_model"])
-        shap_values = explainer.shap_values(
-            X_transformed[pipeline["es_model"].feature_names_in_]
-        )
-        np.testing.assert_array_almost_equal(
-            predictions_sklearn, shap_values.sum(axis=1) + explainer.expected_value
-        )
-
-    @requires_sklearn
-    def test_export_classification(self, classification_model_id):
-        ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME).head(10)
-        X = ed.eland_to_pandas(ed_flights)
-
-        model = MLModel(es_client=ES_TEST_CLIENT, model_id=classification_model_id)
-        pipeline = model.export_model()
-        pipeline.fit(X)
-
-        predictions_sklearn = pipeline.predict(
-            X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
-        )
-        prediction_proba_sklearn = pipeline.predict_proba(
-            X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
-        ).max(axis=1)
-
-        response = ES_TEST_CLIENT.ml.infer_trained_model(
-            model_id=classification_model_id,
-            docs=X[pipeline["es_model"].input_field_names].to_dict("records"),
-        )
-        predictions_es = np.array(
-            list(
-                map(
-                    lambda x: str(int(x["Cancelled_prediction"])),
-                    response.body["inference_results"],
-                )
-            )
-        )
-        prediction_proba_es = np.array(
-            list(
-                map(
-                    itemgetter("prediction_probability"),
-                    response.body["inference_results"],
-                )
-            )
-        )
-        np.testing.assert_array_almost_equal(
-            prediction_proba_sklearn, prediction_proba_es
-        )
-        np.testing.assert_array_equal(predictions_sklearn, predictions_es)
-
-        import pandas as pd
-
-        X_transformed = pipeline["preprocessor"].transform(X=X)
-        X_transformed = pd.DataFrame(
-            X_transformed, columns=pipeline["preprocessor"].get_feature_names_out()
-        )
-        explainer = shap.TreeExplainer(pipeline["es_model"])
-        shap_values = explainer.shap_values(
-            X_transformed[pipeline["es_model"].feature_names_in_]
-        )
-        log_odds = shap_values.sum(axis=1) + explainer.expected_value
-        prediction_proba_shap = 1 / (1 + np.exp(-log_odds))
-        # use probability of the predicted class
-        prediction_proba_shap[prediction_proba_shap < 0.5] = (
-            1 - prediction_proba_shap[prediction_proba_shap < 0.5]
-        )
-        np.testing.assert_array_almost_equal(
-            prediction_proba_sklearn, prediction_proba_shap
-        )
-
-    @requires_xgboost
-    @requires_sklearn
-    @pytest.mark.parametrize("objective", ["binary:logistic", "reg:squarederror"])
-    def test_xgb_import_export(self, objective):
-        booster = "gbtree"
-
-        if objective.startswith("binary:"):
-            training_data = datasets.make_classification(n_features=5)
-            xgb_model = XGBClassifier(
-                booster=booster, objective=objective, use_label_encoder=False
-            )
-        else:
-            training_data = datasets.make_regression(n_features=5)
-            xgb_model = XGBRegressor(
-                booster=booster, objective=objective, use_label_encoder=False
-            )
-
-        # Train model
-        xgb_model.fit(training_data[0], training_data[1])
-
-        # Serialise the models to Elasticsearch
-        feature_names = ["feature0", "feature1", "feature2", "feature3", "feature4"]
-        model_id = "test_xgb_model"
-
-        es_model = MLModel.import_model(
-            ES_TEST_CLIENT, model_id, xgb_model, feature_names, es_if_exists="replace"
-        )
-
-        # Export is supposed to fail
-        with pytest.raises(ValueError) as ex:
-            es_model.export_model()
-        assert ex.match("Error initializing sklearn classifier.")
-
-        # Clean up
-        es_model.delete_model()
-
-    @requires_lightgbm
-    @pytest.mark.parametrize("objective", ["regression", "binary"])
-    def test_lgbm_import_export(self, objective):
-        booster = "gbdt"
-        if objective == "binary":
-            training_data = datasets.make_classification(n_features=5)
-            lgbm_model = LGBMClassifier(boosting_type=booster, objective=objective)
-        else:
-            training_data = datasets.make_regression(n_features=5)
-            lgbm_model = LGBMRegressor(boosting_type=booster, objective=objective)
-
-        # Train model
-        lgbm_model.fit(training_data[0], training_data[1])
-
-        # Serialise the models to Elasticsearch
-        feature_names = ["feature0", "feature1", "feature2", "feature3", "feature4"]
-        model_id = "test_lgbm_model"
-
-        es_model = MLModel.import_model(
-            ES_TEST_CLIENT, model_id, lgbm_model, feature_names, es_if_exists="replace"
-        )
-
-        # Export is supposed to fail
-        with pytest.raises(ValueError) as ex:
-            es_model.export_model()
-        assert ex.match("Error initializing sklearn classifier.")
-
-        # Clean up
-        es_model.delete_model()
BIN tests/national-parks.json.gz (new file; binary file not shown)
File diff suppressed because one or more lines are too long
@@ -19,7 +19,7 @@
    {
     "data": {
      "text/plain": [
-      "False"
+      "HeadApiResponse(False)"
      ]
     },
     "execution_count": 2,
@@ -43,8 +43,8 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "2021-03-30 11:57:39.116425: read 10000 rows\n",
-     "2021-03-30 11:57:39.522722: read 13059 rows\n"
+     "2024-05-21 09:07:17.882569: read 10000 rows\n",
+     "2024-05-21 09:07:18.375305: read 13059 rows\n"
     ]
    }
   ],
@@ -78,6 +78,18 @@
   "execution_count": 5,
   "metadata": {},
   "outputs": [
+   {
+    "name": "stderr",
+    "output_type": "stream",
+    "text": [
+     "/home/codespace/.python/current/lib/python3.10/site-packages/eland/etl.py:529: FutureWarning: the 'mangle_dupe_cols' keyword is deprecated and will be removed in a future version. Please take steps to stop the use of 'mangle_dupe_cols'\n",
+     "  reader = pd.read_csv(filepath_or_buffer, **kwargs)\n",
+     "/home/codespace/.python/current/lib/python3.10/site-packages/eland/etl.py:529: FutureWarning: The squeeze argument has been deprecated and will be removed in a future version. Append .squeeze(\"columns\") to the call to squeeze.\n",
+     "\n",
+     "\n",
+     "  reader = pd.read_csv(filepath_or_buffer, **kwargs)\n"
+    ]
+   },
    {
     "data": {
     "text/html": [
@@ -218,35 +230,7 @@
    {
     "data": {
      "text/plain": [
-      "{'took': 0,\n",
-      " 'timed_out': False,\n",
-      " '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},\n",
-      " 'hits': {'total': {'value': 2, 'relation': 'eq'},\n",
-      "  'max_score': 1.0,\n",
-      "  'hits': [{'_index': 'churn',\n",
-      "    '_id': '0',\n",
-      "    '_score': 1.0,\n",
-      "    '_source': {'state': 'KS',\n",
-      "     'account length': 128,\n",
-      "     'area code': 415,\n",
-      "     'phone number': '382-4657',\n",
-      "     'international plan': 'no',\n",
-      "     'voice mail plan': 'yes',\n",
-      "     'number vmail messages': 25,\n",
-      "     'total day minutes': 265.1,\n",
-      "     'total day calls': 110,\n",
-      "     'total day charge': 45.07,\n",
-      "     'total eve minutes': 197.4,\n",
-      "     'total eve calls': 99,\n",
-      "     'total eve charge': 16.78,\n",
-      "     'total night minutes': 244.7,\n",
-      "     'total night calls': 91,\n",
-      "     'total night charge': 11.01,\n",
-      "     'total intl minutes': 10.0,\n",
-      "     'total intl calls': 3,\n",
-      "     'total intl charge': 2.7,\n",
-      "     'customer service calls': 1,\n",
-      "     'churn': 0}}]}}"
+      "ObjectApiResponse({'took': 0, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'churn', '_id': '0', '_score': 1.0, '_source': {'state': 'KS', 'account length': 128, 'area code': 415, 'phone number': '382-4657', 'international plan': 'no', 'voice mail plan': 'yes', 'number vmail messages': 25, 'total day minutes': 265.1, 'total day calls': 110, 'total day charge': 45.07, 'total eve minutes': 197.4, 'total eve calls': 99, 'total eve charge': 16.78, 'total night minutes': 244.7, 'total night calls': 91, 'total night charge': 11.01, 'total intl minutes': 10.0, 'total intl calls': 3, 'total intl charge': 2.7, 'customer service calls': 1, 'churn': 0}}]}})"
      ]
     },
     "execution_count": 6,
@@ -267,7 +251,7 @@
    {
     "data": {
      "text/plain": [
-      "{'acknowledged': True}"
+      "ObjectApiResponse({'acknowledged': True})"
      ]
     },
     "execution_count": 7,
@@ -297,7 +281,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.8.5"
+  "version": "3.10.13"
  }
 },
 "nbformat": 4,
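The notebook output churn above is cosmetic: elasticsearch-py 8.x wraps responses in HeadApiResponse/ObjectApiResponse objects, whose repr differs from the raw bool/dict that 7.x returned, but the wrappers still behave like the old values. A short sketch (the localhost endpoint and 'churn' index are placeholders):

    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://localhost:9200")

    exists = es.indices.exists(index="churn")  # HeadApiResponse; repr "HeadApiResponse(True)"
    if exists:  # still usable as a boolean
        resp = es.search(index="churn", size=1)  # ObjectApiResponse
        hits = resp.body["hits"]["hits"]  # .body unwraps to a plain dict
        print(len(hits))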
@@ -33,10 +33,10 @@
    {
     "data": {
      "text/plain": [
-      "AvgTicketPrice 640.387285\n",
+      "AvgTicketPrice 639.433214\n",
       "Cancelled False\n",
-      "dayOfWeek 3\n",
-      "timestamp 2018-01-21 23:43:19.256498944\n",
+      "dayOfWeek 2\n",
+      "timestamp 2018-01-21 20:23:15.159835648\n",
       "dtype: object"
      ]
     },
@@ -58,9 +58,9 @@
    {
     "data": {
      "text/plain": [
-      "AvgTicketPrice 640.387285\n",
+      "AvgTicketPrice 639.433214\n",
       "Cancelled 0.000000\n",
-      "dayOfWeek 3.000000\n",
+      "dayOfWeek 2.935777\n",
       "dtype: float64"
      ]
     },
@@ -82,10 +82,10 @@
    {
     "data": {
      "text/plain": [
-      "AvgTicketPrice 640.387285\n",
+      "AvgTicketPrice 639.433214\n",
       "Cancelled False\n",
-      "dayOfWeek 3\n",
-      "timestamp 2018-01-21 23:43:19.256498944\n",
+      "dayOfWeek 2\n",
+      "timestamp 2018-01-21 20:23:15.159835648\n",
       "DestCountry NaN\n",
       "dtype: object"
      ]
@@ -108,7 +108,7 @@
    {
     "data": {
      "text/plain": [
-      "AvgTicketPrice 213.430365\n",
+      "AvgTicketPrice 213.453156\n",
       "dayOfWeek 2.000000\n",
       "dtype: float64"
      ]
@@ -131,7 +131,7 @@
    {
     "data": {
      "text/plain": [
-      "AvgTicketPrice 213.430365\n",
+      "AvgTicketPrice 213.453156\n",
       "dayOfWeek 2.000000\n",
       "dtype: float64"
      ]
@@ -154,7 +154,7 @@
    {
     "data": {
      "text/plain": [
-      "AvgTicketPrice 213.430365\n",
+      "AvgTicketPrice 213.453156\n",
       "Cancelled NaN\n",
       "dayOfWeek 2.0\n",
       "timestamp NaT\n",
@@ -189,7 +189,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.8.5"
+  "version": "3.10.13"
  }
 },
 "nbformat": 4,
File diff suppressed because one or more lines are too long
@@ -58,7 +58,9 @@ class TestSeriesFilter(TestData):
         ed_ser = ed_flights_small.filter(items=items, axis=0)
         pd_ser = pd_flights_small.filter(items=items, axis=0)
 
-        assert_pandas_eland_series_equal(pd_ser, ed_ser)
+        # For an empty Series, eland reports the dtype it knows from the Elasticsearch
+        # index, whereas pandas reports an empty dtype.
+        assert_pandas_eland_series_equal(pd_ser, ed_ser, check_index_type=False)
 
     def test_flights_filter_index_like_and_regex(self):
         ed_flights_small = self.ed_flights_small()["FlightDelayType"]
@@ -24,6 +24,7 @@ import pandas as pd
 import pytest
 from pandas.testing import assert_series_equal
 
+from eland.common import PANDAS_VERSION
 from tests.common import TestData, assert_almost_equal
 
 
@@ -42,6 +43,8 @@ class TestSeriesMetrics(TestData):
         ed_flights = self.ed_flights()["AvgTicketPrice"]
 
         for func in self.all_funcs:
+            if PANDAS_VERSION[0] >= 2 and func == "mad":
+                continue
             pd_metric = getattr(pd_flights, func)()
             ed_metric = getattr(ed_flights, func)()
 
@@ -87,6 +90,8 @@ class TestSeriesMetrics(TestData):
         ed_ecommerce = self.ed_ecommerce()[column]
 
         for func in self.all_funcs:
+            if PANDAS_VERSION[0] >= 2 and func == "mad":
+                continue
             pd_metric = getattr(pd_ecommerce, func)()
             ed_metric = getattr(ed_ecommerce, func)(
                 **({"numeric_only": True} if (func != "nunique") else {})
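The PANDAS_VERSION guard added above exists because Series.mad() was removed in pandas 2.0, so there is no pandas reference value to compare eland's result against there. If an equivalent reference were needed, mean absolute deviation can be computed directly; a minimal sketch (the helper name is ours, not eland's):

    import pandas as pd


    def mad(series: pd.Series) -> float:
        # Mean absolute deviation about the mean, as pandas < 2.0 defined .mad().
        return float((series - series.mean()).abs().mean())


    s = pd.Series([1.0, 2.0, 4.0])
    print(mad(s))  # 1.111...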
@@ -30,6 +30,9 @@ from tests import (
     FLIGHTS_MAPPING,
     FLIGHTS_SMALL_FILE_NAME,
     FLIGHTS_SMALL_INDEX_NAME,
+    NATIONAL_PARKS_FILE_NAME,
+    NATIONAL_PARKS_INDEX_NAME,
+    NATIONAL_PARKS_MAPPING,
     TEST_MAPPING1,
     TEST_MAPPING1_INDEX_NAME,
     TEST_NESTED_USER_GROUP_DOCS,
@@ -41,6 +44,7 @@ DATA_LIST = [
     (FLIGHTS_FILE_NAME, FLIGHTS_INDEX_NAME, FLIGHTS_MAPPING),
     (FLIGHTS_SMALL_FILE_NAME, FLIGHTS_SMALL_INDEX_NAME, FLIGHTS_MAPPING),
     (ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING),
+    (NATIONAL_PARKS_FILE_NAME, NATIONAL_PARKS_INDEX_NAME, NATIONAL_PARKS_MAPPING),
 ]
 
 
@@ -58,18 +62,20 @@ def _setup_data(es):
         es.indices.create(index=index_name, **mapping)
 
         df = pd.read_json(json_file_name, lines=True)
 
         actions = []
         n = 0
 
         print("Adding", df.shape[0], "items to index:", index_name)
         for index, row in df.iterrows():
-            values = row.to_dict()
+            values = row.dropna().to_dict()
             # make timestamp datetime 2018-01-01T12:09:35
             # values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S')
 
-            # Use integer as id field for repeatable results
-            action = {"_index": index_name, "_source": values, "_id": str(n)}
+            # Use the row's id field as the document id if the field exists.
+            # Else, use an integer id for repeatable results.
+            # document_id = values['id'] if 'id' in values else str(n)
+            document_id = values["id"] if "id" in values else str(n)
+            action = {"_index": index_name, "_source": values, "_id": document_id}
 
             actions.append(action)
 
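The action dicts assembled in _setup_data follow the elasticsearch-py bulk helper format: _index and _id route each operation and _source is the document body. A minimal sketch of how such a list is indexed (endpoint, index name, and documents are placeholders):

    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    es = Elasticsearch("http://localhost:9200")

    actions = [
        {"_index": "flights_small", "_id": "0", "_source": {"Carrier": "Kibana Airlines"}},
        {"_index": "flights_small", "_id": "1", "_source": {"Carrier": "ES-Air"}},
    ]

    # Sends chunked _bulk requests; returns (number of successes, list of errors).
    successes, errors = bulk(es, actions)
    print(successes)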
@@ -65,7 +65,7 @@ def find_files_to_fix(sources: List[str]) -> Iterator[str]:
 def does_file_need_fix(filepath: str) -> bool:
     if not filepath.endswith(".py"):
         return False
-    with open(filepath, mode="r") as f:
+    with open(filepath) as f:
         first_license_line = None
         for line in f:
             if line == license_header_lines[0]:
@@ -82,7 +82,7 @@ def does_file_need_fix(filepath: str) -> bool:
 
 
 def add_header_to_file(filepath: str) -> None:
-    with open(filepath, mode="r") as f:
+    with open(filepath) as f:
         lines = list(f)
     i = 0
     for i, line in enumerate(lines):
|
Loading…
x
Reference in New Issue
Block a user