mirror of
https://github.com/elastic/eland.git
synced 2025-07-24 00:00:39 +08:00
Compare commits
86 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
bebb9d52e5 | ||
|
117f61b010 | ||
|
cef4710695 | ||
|
44ead02b05 | ||
|
cb7c4fb122 | ||
|
9e8f164677 | ||
|
3c3ffd7403 | ||
|
f5c2dcfc9d | ||
|
878cde6126 | ||
|
ec45c395fd | ||
|
00dc55b3bd | ||
|
8147eb517a | ||
|
4728d9b648 | ||
|
51a2b9cc19 | ||
|
a9c36927f6 | ||
|
87380ef716 | ||
|
9ca76d7888 | ||
|
ced3cdfe32 | ||
|
87379c53de | ||
|
1ddae81769 | ||
|
9302bef7db | ||
|
ca64672fd7 | ||
|
6692251d9e | ||
|
ee4d701aa4 | ||
|
acdeeeded2 | ||
|
8350f06ea8 | ||
|
e846fb7697 | ||
|
c4ac64e3a0 | ||
|
214c4645e9 | ||
|
871e52b37a | ||
|
aa5196edee | ||
|
75c57b0775 | ||
|
77589b26b8 | ||
|
9b5badb941 | ||
|
f99adce23f | ||
|
7774a506ae | ||
|
82492fe771 | ||
|
04102f2a4e | ||
|
9aec8fc751 | ||
|
79d9a6ae29 | ||
|
939f4d672c | ||
|
1312e96220 | ||
|
2916b51fa7 | ||
|
5dabe9c099 | ||
|
06b65e211e | ||
|
a45c7bc357 | ||
|
d1e533ffb9 | ||
|
a83ce20fcc | ||
|
03af8a6319 | ||
|
5253501704 | ||
|
ec66b5f320 | ||
|
64d05e4c68 | ||
|
f79180be42 | ||
|
0ce3db26e8 | ||
|
5a76f826df | ||
|
fd8886da6a | ||
|
bee6d0e1f7 | ||
|
f18aa35e8e | ||
|
56a46d0f85 | ||
|
c497683064 | ||
|
0ddc21b895 | ||
|
5a3e7d78b3 | ||
|
1014ecdb39 | ||
|
632074c0f0 | ||
|
35a96ab3f0 | ||
|
116416b3e8 | ||
|
5b728c29c1 | ||
|
e76b32eee2 | ||
|
fd38e26df1 | ||
|
f7f6e0aba9 | ||
|
9cea2385e6 | ||
|
1921792df8 | ||
|
c16e36c051 | ||
|
ae0bba34c6 | ||
|
aaec995b1b | ||
|
de83f3f905 | ||
|
8e8c49ddbf | ||
|
5d34dc3cc4 | ||
|
9b335315bb | ||
|
28eda95ba9 | ||
|
f4b30753ad | ||
|
33cf029efe | ||
|
9d492b03aa | ||
|
fd2ceab846 | ||
|
02190e74e7 | ||
|
2a6a4b1f06 |
@ -1,6 +1,8 @@
|
|||||||
ARG PYTHON_VERSION=3.9
|
ARG PYTHON_VERSION=3.9
|
||||||
FROM python:${PYTHON_VERSION}
|
FROM python:${PYTHON_VERSION}
|
||||||
|
|
||||||
|
ENV FORCE_COLOR=1
|
||||||
|
|
||||||
WORKDIR /code/eland
|
WORKDIR /code/eland
|
||||||
RUN python -m pip install nox
|
RUN python -m pip install nox
|
||||||
|
|
||||||
|
11
.buildkite/build-docker-images.sh
Normal file
11
.buildkite/build-docker-images.sh
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -eo pipefail
|
||||||
|
export LC_ALL=en_US.UTF-8
|
||||||
|
|
||||||
|
echo "--- Building the Wolfi image"
|
||||||
|
# Building the linux/arm64 image takes about one hour on Buildkite, which is too slow
|
||||||
|
docker build --file Dockerfile.wolfi .
|
||||||
|
|
||||||
|
echo "--- Building the public image"
|
||||||
|
docker build .
|
@ -1,15 +1,8 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y pandoc python3 python3-pip
|
|
||||||
python3 -m pip install nox
|
|
||||||
/opt/buildkite-agent/.local/bin/nox -s docs
|
|
||||||
|
|
||||||
# I couldn't make this work, for some reason pandoc is not found in the docker container repository:
|
docker build --file .buildkite/Dockerfile --tag elastic/eland --build-arg PYTHON_VERSION=${PYTHON_VERSION} .
|
||||||
# docker build --file .buildkite/Dockerfile --tag elastic/eland --build-arg PYTHON_VERSION=${PYTHON_VERSION} .
|
docker run \
|
||||||
# docker run \
|
--name doc_build \
|
||||||
# --name doc_build \
|
--rm \
|
||||||
# --rm \
|
elastic/eland \
|
||||||
# elastic/eland \
|
bash -c "apt-get update && apt-get install --yes pandoc && nox -s docs"
|
||||||
# apt-get update && \
|
|
||||||
# sudo apt-get install --yes pandoc && \
|
|
||||||
# nox -s docs
|
|
||||||
|
@ -15,22 +15,36 @@ steps:
|
|||||||
machineType: "n2-standard-2"
|
machineType: "n2-standard-2"
|
||||||
commands:
|
commands:
|
||||||
- ./.buildkite/build-documentation.sh
|
- ./.buildkite/build-documentation.sh
|
||||||
- label: "Eland :python: {{ matrix.python }} :elasticsearch: {{ matrix.stack }}"
|
- label: ":docker: Build Wolfi image"
|
||||||
|
env:
|
||||||
|
PYTHON_VERSION: 3.11-bookworm
|
||||||
|
agents:
|
||||||
|
provider: "gcp"
|
||||||
|
machineType: "n2-standard-2"
|
||||||
|
commands:
|
||||||
|
- ./.buildkite/build-docker-images.sh
|
||||||
|
- label: ":python: {{ matrix.python }} :elasticsearch: {{ matrix.stack }} :pandas: {{ matrix.pandas }}"
|
||||||
agents:
|
agents:
|
||||||
provider: "gcp"
|
provider: "gcp"
|
||||||
machineType: "n2-standard-4"
|
machineType: "n2-standard-4"
|
||||||
env:
|
env:
|
||||||
PYTHON_VERSION: "{{ matrix.python }}"
|
PYTHON_VERSION: "{{ matrix.python }}"
|
||||||
PANDAS_VERSION: '1.5.0'
|
PANDAS_VERSION: "{{ matrix.pandas }}"
|
||||||
TEST_SUITE: "xpack"
|
TEST_SUITE: "xpack"
|
||||||
ELASTICSEARCH_VERSION: "{{ matrix.stack }}"
|
ELASTICSEARCH_VERSION: "{{ matrix.stack }}"
|
||||||
matrix:
|
matrix:
|
||||||
setup:
|
setup:
|
||||||
|
# Python and pandas versions need to be added to the nox configuration too
|
||||||
|
# (in the decorators of the test method in noxfile.py)
|
||||||
|
pandas:
|
||||||
|
- '1.5.0'
|
||||||
|
- '2.2.3'
|
||||||
python:
|
python:
|
||||||
|
- '3.12'
|
||||||
|
- '3.11'
|
||||||
- '3.10'
|
- '3.10'
|
||||||
- '3.9'
|
- '3.9'
|
||||||
- '3.8'
|
|
||||||
stack:
|
stack:
|
||||||
- '8.11-SNAPSHOT'
|
- '9.0.0'
|
||||||
- '8.12-SNAPSHOT'
|
- '9.1.0-SNAPSHOT'
|
||||||
command: ./.buildkite/run-tests
|
command: ./.buildkite/run-tests
|
||||||
|
@ -11,6 +11,18 @@
|
|||||||
"always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
|
"always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
|
||||||
"skip_ci_labels": ["skip-ci"],
|
"skip_ci_labels": ["skip-ci"],
|
||||||
"skip_ci_on_only_changed": ["\\.md$"]
|
"skip_ci_on_only_changed": ["\\.md$"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"enabled": true,
|
||||||
|
"pipeline_slug": "docs-build-pr",
|
||||||
|
"allow_org_users": true,
|
||||||
|
"allowed_repo_permissions": ["admin", "write"],
|
||||||
|
"build_on_commit": true,
|
||||||
|
"build_on_comment": true,
|
||||||
|
"trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
|
||||||
|
"always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))",
|
||||||
|
"skip_ci_labels": ["skip-ci"],
|
||||||
|
"skip_ci_on_only_changed": ["\\.md$"]
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
@ -26,6 +26,7 @@ git --no-pager show
|
|||||||
docker buildx rm --force eland-multiarch-builder || true
|
docker buildx rm --force eland-multiarch-builder || true
|
||||||
docker buildx create --name eland-multiarch-builder --bootstrap --use
|
docker buildx create --name eland-multiarch-builder --bootstrap --use
|
||||||
docker buildx build --push \
|
docker buildx build --push \
|
||||||
|
--file Dockerfile.wolfi \
|
||||||
--tag "$docker_registry/eland/eland:$RELEASE_VERSION" \
|
--tag "$docker_registry/eland/eland:$RELEASE_VERSION" \
|
||||||
--tag "$docker_registry/eland/eland:latest" \
|
--tag "$docker_registry/eland/eland:latest" \
|
||||||
--platform linux/amd64,linux/arm64 \
|
--platform linux/amd64,linux/arm64 \
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
# docs and example
|
# docs and example
|
||||||
docs/*
|
|
||||||
example/*
|
example/*
|
||||||
|
|
||||||
# Git
|
# Git
|
||||||
@ -18,9 +17,6 @@ dist/
|
|||||||
# Build folder
|
# Build folder
|
||||||
build/
|
build/
|
||||||
|
|
||||||
# docs
|
|
||||||
docs/*
|
|
||||||
|
|
||||||
# pytest results
|
# pytest results
|
||||||
tests/dataframe/results/*csv
|
tests/dataframe/results/*csv
|
||||||
result_images/
|
result_images/
|
||||||
|
26
.github/workflows/backport.yml
vendored
Normal file
26
.github/workflows/backport.yml
vendored
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
name: Backport
|
||||||
|
on:
|
||||||
|
pull_request_target:
|
||||||
|
types:
|
||||||
|
- closed
|
||||||
|
- labeled
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
backport:
|
||||||
|
name: Backport
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
# Only react to merged PRs for security reasons.
|
||||||
|
# See https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request_target.
|
||||||
|
if: >
|
||||||
|
github.event.pull_request.merged
|
||||||
|
&& (
|
||||||
|
github.event.action == 'closed'
|
||||||
|
|| (
|
||||||
|
github.event.action == 'labeled'
|
||||||
|
&& contains(github.event.label.name, 'backport')
|
||||||
|
)
|
||||||
|
)
|
||||||
|
steps:
|
||||||
|
- uses: tibdex/backport@9565281eda0731b1d20c4025c43339fb0a23812e # v2.0.4
|
||||||
|
with:
|
||||||
|
github_token: ${{ secrets.GITHUB_TOKEN }}
|
19
.github/workflows/docs-build.yml
vendored
Normal file
19
.github/workflows/docs-build.yml
vendored
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
name: docs-build
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
pull_request_target: ~
|
||||||
|
merge_group: ~
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
docs-preview:
|
||||||
|
uses: elastic/docs-builder/.github/workflows/preview-build.yml@main
|
||||||
|
with:
|
||||||
|
path-pattern: docs/**
|
||||||
|
permissions:
|
||||||
|
deployments: write
|
||||||
|
id-token: write
|
||||||
|
contents: read
|
||||||
|
pull-requests: write
|
14
.github/workflows/docs-cleanup.yml
vendored
Normal file
14
.github/workflows/docs-cleanup.yml
vendored
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
name: docs-cleanup
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request_target:
|
||||||
|
types:
|
||||||
|
- closed
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
docs-preview:
|
||||||
|
uses: elastic/docs-builder/.github/workflows/preview-cleanup.yml@main
|
||||||
|
permissions:
|
||||||
|
contents: none
|
||||||
|
id-token: write
|
||||||
|
deployments: write
|
@ -3,9 +3,12 @@ version: 2
|
|||||||
build:
|
build:
|
||||||
os: ubuntu-22.04
|
os: ubuntu-22.04
|
||||||
tools:
|
tools:
|
||||||
python: "3.10"
|
python: "3.11"
|
||||||
|
|
||||||
python:
|
python:
|
||||||
install:
|
install:
|
||||||
- requirements: docs/requirements-docs.txt
|
|
||||||
- path: .
|
- path: .
|
||||||
|
- requirements: docs/requirements-docs.txt
|
||||||
|
|
||||||
|
sphinx:
|
||||||
|
configuration: docs/sphinx/conf.py
|
||||||
|
123
CHANGELOG.rst
123
CHANGELOG.rst
@ -2,6 +2,129 @@
|
|||||||
Changelog
|
Changelog
|
||||||
=========
|
=========
|
||||||
|
|
||||||
|
9.0.1 (2025-04-30)
|
||||||
|
------------------
|
||||||
|
|
||||||
|
* Forbid Elasticsearch 8 client or server (`#780 <https://github.com/elastic/eland/pull/780>`_)
|
||||||
|
* Fix DeBERTa tokenization (`#769 <https://github.com/elastic/eland/pull/769>`_)
|
||||||
|
* Upgrade PyTorch to 2.5.1 (`#785 <https://github.com/elastic/eland/pull/785>`_)
|
||||||
|
* Upgrade LightGBM to 4.6.0 (`#782 <https://github.com/elastic/eland/pull/782>`_)
|
||||||
|
|
||||||
|
9.0.0 (2025-04-15)
|
||||||
|
------------------
|
||||||
|
|
||||||
|
* Drop Python 3.8, Support Python 3.12 (`#743 <https://github.com/elastic/eland/pull/743>`_)
|
||||||
|
* Support Pandas 2 (`#742 <https://github.com/elastic/eland/pull/742>`_)
|
||||||
|
* Upgrade transformers to 4.47 (`#752 <https://github.com/elastic/eland/pull/752>`_)
|
||||||
|
* Remove ML model export as sklearn Pipeline (`#744 <https://github.com/elastic/eland/pull/744>`_)
|
||||||
|
* Allow scikit-learn 1.5 (`#729 <https://github.com/elastic/eland/pull/729>`_)
|
||||||
|
* Migrate docs from AsciiDoc to Markdown (`#762 <https://github.com/elastic/eland/pull/762>`_)
|
||||||
|
|
||||||
|
8.17.0 (2025-01-07)
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
* Support sparse embedding models such as SPLADE-v3-DistilBERT (`#740 <https://github.com/elastic/eland/pull/740>`_)
|
||||||
|
|
||||||
|
8.16.0 (2024-11-13)
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
* Add deprecation warning for ESGradientBoostingModel subclasses (`#738 <https://github.com/elastic/eland/pull/738>`_)
|
||||||
|
|
||||||
|
8.15.4 (2024-10-17)
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
* Revert "Allow reading Elasticsearch certs in Wolfi image" (`#734 <https://github.com/elastic/eland/pull/734>`_)
|
||||||
|
|
||||||
|
8.15.3 (2024-10-09)
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
* Added support for DeBERTa-V2 tokenizer (`#717 <https://github.com/elastic/eland/pull/717>`_)
|
||||||
|
* Fixed ``--ca-cert`` with a shared Elasticsearch Docker volume (`#732 <https://github.com/elastic/eland/pull/732>`_)
|
||||||
|
|
||||||
|
8.15.2 (2024-10-02)
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
* Fixed Docker image build (`#728 <https://github.com/elastic/eland/pull/728>`_)
|
||||||
|
|
||||||
|
8.15.1 (2024-10-01)
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
* Upgraded PyTorch to version 2.3.1, which is compatible with Elasticsearch 8.15.2 or above (`#718 <https://github.com/elastic/eland/pull/718>`_)
|
||||||
|
* Migrated to distroless Wolfi base Docker image (`#720 <https://github.com/elastic/eland/pull/720>`_)
|
||||||
|
|
||||||
|
|
||||||
|
8.15.0 (2024-08-12)
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
* Added a default truncation of ``second`` for text similarity (`#713 <https://github.com/elastic/eland/pull/713>`_)
|
||||||
|
* Added note about using text_similarity for rerank in the CLI (`#716 <https://github.com/elastic/eland/pull/716>`_)
|
||||||
|
* Added support for lists in result hits (`#707 <https://github.com/elastic/eland/pull/707>`_)
|
||||||
|
* Removed input fields from exported LTR models (`#708 <https://github.com/elastic/eland/pull/708>`_)
|
||||||
|
|
||||||
|
8.14.0 (2024-06-10)
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
Added
|
||||||
|
^^^^^
|
||||||
|
|
||||||
|
* Added Elasticsearch Serverless support in DataFrames (`#690`_, contributed by `@AshokChoudhary11`_) and eland_import_hub_model (`#698`_)
|
||||||
|
|
||||||
|
Fixed
|
||||||
|
^^^^^
|
||||||
|
|
||||||
|
* Fixed Python 3.8 support (`#695`_, contributed by `@bartbroere`_)
|
||||||
|
* Fixed non _source fields missing from the results hits (`#693`_, contributed by `@bartbroere`_)
|
||||||
|
|
||||||
|
.. _@AshokChoudhary11: https://github.com/AshokChoudhary11
|
||||||
|
.. _#690: https://github.com/elastic/eland/pull/690
|
||||||
|
.. _#693: https://github.com/elastic/eland/pull/693
|
||||||
|
.. _#695: https://github.com/elastic/eland/pull/695
|
||||||
|
.. _#698: https://github.com/elastic/eland/pull/698
|
||||||
|
|
||||||
|
8.13.1 (2024-05-03)
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
Added
|
||||||
|
^^^^^
|
||||||
|
|
||||||
|
* Added support for HTTP proxies in eland_import_hub_model (`#688`_)
|
||||||
|
|
||||||
|
.. _#688: https://github.com/elastic/eland/pull/688
|
||||||
|
|
||||||
|
8.13.0 (2024-03-27)
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
Added
|
||||||
|
^^^^^
|
||||||
|
|
||||||
|
* Added support for Python 3.11 (`#681`_)
|
||||||
|
* Added ``eland.DataFrame.to_json`` function (`#661`_, contributed by `@bartbroere`_)
|
||||||
|
* Added override option to specify the model's max input size (`#674`_)
|
||||||
|
|
||||||
|
Changed
|
||||||
|
^^^^^^^
|
||||||
|
|
||||||
|
* Upgraded torch to 2.1.2 (`#671`_)
|
||||||
|
* Mirrored pandas' ``lineterminator`` instead of ``line_terminator`` in ``to_csv`` (`#595`_, contributed by `@bartbroere`_)
|
||||||
|
|
||||||
|
.. _#595: https://github.com/elastic/eland/pull/595
|
||||||
|
.. _#661: https://github.com/elastic/eland/pull/661
|
||||||
|
.. _#671: https://github.com/elastic/eland/pull/671
|
||||||
|
.. _#674: https://github.com/elastic/eland/pull/674
|
||||||
|
.. _#681: https://github.com/elastic/eland/pull/681
|
||||||
|
|
||||||
|
|
||||||
|
8.12.1 (2024-01-30)
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
Fixed
|
||||||
|
^^^^^
|
||||||
|
|
||||||
|
* Fix missing value support for XGBRanker (`#654`_)
|
||||||
|
|
||||||
|
.. _#654: https://github.com/elastic/eland/pull/654
|
||||||
|
|
||||||
|
|
||||||
8.12.0 (2024-01-18)
|
8.12.0 (2024-01-18)
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
|
@ -78,9 +78,15 @@ Once your changes and tests are ready to submit for review:
|
|||||||
# Run Auto-format, lint, mypy type checker for your changes
|
# Run Auto-format, lint, mypy type checker for your changes
|
||||||
$ nox -s format
|
$ nox -s format
|
||||||
|
|
||||||
# Run the test suite
|
# Launch Elasticsearch with a trial licence and ML enabled
|
||||||
$ pytest --doctest-modules eland/ tests/
|
$ docker run --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" -e "xpack.security.enabled=false" -e "xpack.license.self_generated.type=trial" docker.elastic.co/elasticsearch/elasticsearch:9.0.0
|
||||||
$ pytest --nbval tests/notebook/
|
|
||||||
|
# See all test suites
|
||||||
|
$ nox -l
|
||||||
|
# Run a specific test suite
|
||||||
|
$ nox -rs "test-3.12(pandas_version='2.2.3')"
|
||||||
|
# Run a specific test
|
||||||
|
$ nox -rs "test-3.12(pandas_version='2.2.3')" -- -k test_learning_to_rank
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -169,7 +175,7 @@ currently using a minimum version of PyCharm 2019.2.4.
|
|||||||
* Setup Elasticsearch instance with docker
|
* Setup Elasticsearch instance with docker
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
> ELASTICSEARCH_VERSION=elasticsearch:8.x-SNAPSHOT BUILDKITE=false .buildkite/run-elasticsearch.sh
|
> ELASTICSEARCH_VERSION=elasticsearch:8.17.0 BUILDKITE=false .buildkite/run-elasticsearch.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
* Now check `http://localhost:9200`
|
* Now check `http://localhost:9200`
|
||||||
@ -203,7 +209,7 @@ currently using a minimum version of PyCharm 2019.2.4.
|
|||||||
* To test specific versions of Python run
|
* To test specific versions of Python run
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
> nox -s test-3.8
|
> nox -s test-3.12
|
||||||
```
|
```
|
||||||
|
|
||||||
### Documentation
|
### Documentation
|
||||||
|
@ -18,7 +18,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|||||||
if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
|
if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
|
||||||
python3 -m pip install \
|
python3 -m pip install \
|
||||||
--no-cache-dir --disable-pip-version-check --extra-index-url https://download.pytorch.org/whl/cpu \
|
--no-cache-dir --disable-pip-version-check --extra-index-url https://download.pytorch.org/whl/cpu \
|
||||||
torch==1.13.1+cpu .[all]; \
|
torch==2.5.1+cpu .[all]; \
|
||||||
else \
|
else \
|
||||||
python3 -m pip install \
|
python3 -m pip install \
|
||||||
--no-cache-dir --disable-pip-version-check \
|
--no-cache-dir --disable-pip-version-check \
|
||||||
|
42
Dockerfile.wolfi
Normal file
42
Dockerfile.wolfi
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
# syntax=docker/dockerfile:1
|
||||||
|
FROM docker.elastic.co/wolfi/python:3.10-dev AS builder
|
||||||
|
|
||||||
|
WORKDIR /eland
|
||||||
|
ENV VIRTUAL_ENV=/eland/venv
|
||||||
|
RUN python3 -m venv $VIRTUAL_ENV
|
||||||
|
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
|
||||||
|
|
||||||
|
ADD . /eland
|
||||||
|
|
||||||
|
ARG TARGETPLATFORM
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
|
||||||
|
python3 -m pip install \
|
||||||
|
--no-cache-dir --disable-pip-version-check --extra-index-url https://download.pytorch.org/whl/cpu \
|
||||||
|
torch==2.5.1+cpu .[all]; \
|
||||||
|
else \
|
||||||
|
python3 -m pip install \
|
||||||
|
--no-cache-dir --disable-pip-version-check \
|
||||||
|
.[all]; \
|
||||||
|
fi
|
||||||
|
|
||||||
|
FROM docker.elastic.co/wolfi/python:3.10
|
||||||
|
|
||||||
|
WORKDIR /eland
|
||||||
|
ENV VIRTUAL_ENV=/eland/venv
|
||||||
|
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
|
||||||
|
|
||||||
|
COPY --from=builder /eland /eland
|
||||||
|
|
||||||
|
# The eland_import_hub_model script is intended to be executed by a shell,
|
||||||
|
# which will see its shebang line and then execute it with the Python
|
||||||
|
# interpreter of the virtual environment. We want to keep this behavior even
|
||||||
|
# with Wolfi so that users can use the image as before. To do that, we use two
|
||||||
|
# tricks:
|
||||||
|
#
|
||||||
|
# * copy /bin/sh (that is, busybox's ash) from the builder image
|
||||||
|
# * revert to Docker's default entrypoint, which is the only way to pass
|
||||||
|
# parameters to `eland_import_hub_model` without needing quotes.
|
||||||
|
#
|
||||||
|
COPY --from=builder /bin/sh /bin/sh
|
||||||
|
ENTRYPOINT []
|
14
README.md
14
README.md
@ -12,8 +12,7 @@
|
|||||||
<a href="https://pepy.tech/project/eland"><img src="https://static.pepy.tech/badge/eland" alt="Downloads"></a>
|
<a href="https://pepy.tech/project/eland"><img src="https://static.pepy.tech/badge/eland" alt="Downloads"></a>
|
||||||
<a href="https://pypi.org/project/eland"><img src="https://img.shields.io/pypi/status/eland.svg"
|
<a href="https://pypi.org/project/eland"><img src="https://img.shields.io/pypi/status/eland.svg"
|
||||||
alt="Package Status"></a>
|
alt="Package Status"></a>
|
||||||
<a href="https://clients-ci.elastic.co/job/elastic+eland+main"><img
|
<a href="https://buildkite.com/elastic/eland"><img src="https://badge.buildkite.com/d92340e800bc06a7c7c02a71b8d42fcb958bd18c25f99fe2d9.svg" alt="Build Status"></a>
|
||||||
src="https://clients-ci.elastic.co/buildStatus/icon?job=elastic%2Beland%2Bmain" alt="Build Status"></a>
|
|
||||||
<a href="https://github.com/elastic/eland/blob/main/LICENSE.txt"><img src="https://img.shields.io/pypi/l/eland.svg"
|
<a href="https://github.com/elastic/eland/blob/main/LICENSE.txt"><img src="https://img.shields.io/pypi/l/eland.svg"
|
||||||
alt="License"></a>
|
alt="License"></a>
|
||||||
<a href="https://eland.readthedocs.io"><img
|
<a href="https://eland.readthedocs.io"><img
|
||||||
@ -43,7 +42,7 @@ $ python -m pip install eland
|
|||||||
|
|
||||||
If using Eland to upload NLP models to Elasticsearch install the PyTorch extras:
|
If using Eland to upload NLP models to Elasticsearch install the PyTorch extras:
|
||||||
```bash
|
```bash
|
||||||
$ python -m pip install eland[pytorch]
|
$ python -m pip install 'eland[pytorch]'
|
||||||
```
|
```
|
||||||
|
|
||||||
Eland can also be installed from [Conda Forge](https://anaconda.org/conda-forge/eland) with Conda:
|
Eland can also be installed from [Conda Forge](https://anaconda.org/conda-forge/eland) with Conda:
|
||||||
@ -54,13 +53,14 @@ $ conda install -c conda-forge eland
|
|||||||
|
|
||||||
### Compatibility
|
### Compatibility
|
||||||
|
|
||||||
- Supports Python 3.8, 3.9, 3.10 and Pandas 1.5
|
- Supports Python 3.9, 3.10, 3.11 and 3.12.
|
||||||
- Supports Elasticsearch clusters that are 7.11+, recommended 8.3 or later for all features to work.
|
- Supports Pandas 1.5 and 2.
|
||||||
|
- Supports Elasticsearch 8+ clusters, recommended 8.16 or later for all features to work.
|
||||||
If you are using the NLP with PyTorch feature make sure your Eland minor version matches the minor
|
If you are using the NLP with PyTorch feature make sure your Eland minor version matches the minor
|
||||||
version of your Elasticsearch cluster. For all other features it is sufficient for the major versions
|
version of your Elasticsearch cluster. For all other features it is sufficient for the major versions
|
||||||
to match.
|
to match.
|
||||||
- You need to use PyTorch `1.13.1` or earlier to import an NLP model.
|
- You need to install the appropriate version of PyTorch to import an NLP model. Run `python -m pip
|
||||||
Run `pip install torch==1.13.1` to install the appropriate version of PyTorch.
|
install 'eland[pytorch]'` to install that version.
|
||||||
|
|
||||||
|
|
||||||
### Prerequisites
|
### Prerequisites
|
||||||
|
@ -55,7 +55,7 @@ spec:
|
|||||||
repository: elastic/eland
|
repository: elastic/eland
|
||||||
teams:
|
teams:
|
||||||
ml-core: {}
|
ml-core: {}
|
||||||
clients-team: {}
|
devtools-team: {}
|
||||||
es-docs: {}
|
es-docs: {}
|
||||||
everyone:
|
everyone:
|
||||||
access_level: READ_ONLY
|
access_level: READ_ONLY
|
||||||
@ -89,6 +89,6 @@ spec:
|
|||||||
repository: elastic/eland
|
repository: elastic/eland
|
||||||
teams:
|
teams:
|
||||||
ml-core: {}
|
ml-core: {}
|
||||||
clients-team: {}
|
devtools-team: {}
|
||||||
everyone:
|
everyone:
|
||||||
access_level: READ_ONLY
|
access_level: READ_ONLY
|
||||||
|
10
docs/docset.yml
Normal file
10
docs/docset.yml
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
project: 'Eland Python client'
|
||||||
|
products:
|
||||||
|
- id: elasticsearch-client
|
||||||
|
cross_links:
|
||||||
|
- docs-content
|
||||||
|
toc:
|
||||||
|
- toc: reference
|
||||||
|
subs:
|
||||||
|
es: "Elasticsearch"
|
||||||
|
ml: "machine learning"
|
@ -1,14 +0,0 @@
|
|||||||
= Eland Python Client
|
|
||||||
|
|
||||||
:doctype: book
|
|
||||||
|
|
||||||
include::{asciidoc-dir}/../../shared/versions/stack/{source_branch}.asciidoc[]
|
|
||||||
include::{asciidoc-dir}/../../shared/attributes.asciidoc[]
|
|
||||||
|
|
||||||
include::overview.asciidoc[]
|
|
||||||
|
|
||||||
include::installation.asciidoc[]
|
|
||||||
|
|
||||||
include::dataframes.asciidoc[]
|
|
||||||
|
|
||||||
include::machine-learning.asciidoc[]
|
|
@ -1,16 +0,0 @@
|
|||||||
[[installation]]
|
|
||||||
== Installation
|
|
||||||
|
|
||||||
Eland can be installed with https://pip.pypa.io[pip] from https://pypi.org/project/eland[PyPI]. We recommend https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/[using a virtual environment] when installing with pip:
|
|
||||||
|
|
||||||
[source,sh]
|
|
||||||
-----------------------------
|
|
||||||
$ python -m pip install eland
|
|
||||||
-----------------------------
|
|
||||||
|
|
||||||
Alternatively, Eland can be installed with https://docs.conda.io[Conda] from https://anaconda.org/conda-forge/eland[Conda Forge]:
|
|
||||||
|
|
||||||
[source,sh]
|
|
||||||
------------------------------------
|
|
||||||
$ conda install -c conda-forge eland
|
|
||||||
------------------------------------
|
|
@ -1,194 +0,0 @@
|
|||||||
[[machine-learning]]
|
|
||||||
== Machine Learning
|
|
||||||
|
|
||||||
[discrete]
|
|
||||||
[[ml-trained-models]]
|
|
||||||
=== Trained models
|
|
||||||
|
|
||||||
Eland allows transforming trained models from scikit-learn, XGBoost,
|
|
||||||
and LightGBM libraries to be serialized and used as an inference
|
|
||||||
model in {es}.
|
|
||||||
|
|
||||||
[source,python]
|
|
||||||
------------------------
|
|
||||||
>>> from xgboost import XGBClassifier
|
|
||||||
>>> from eland.ml import MLModel
|
|
||||||
|
|
||||||
# Train and exercise an XGBoost ML model locally
|
|
||||||
>>> xgb_model = XGBClassifier(booster="gbtree")
|
|
||||||
>>> xgb_model.fit(training_data[0], training_data[1])
|
|
||||||
|
|
||||||
>>> xgb_model.predict(training_data[0])
|
|
||||||
[0 1 1 0 1 0 0 0 1 0]
|
|
||||||
|
|
||||||
# Import the model into Elasticsearch
|
|
||||||
>>> es_model = MLModel.import_model(
|
|
||||||
es_client="http://localhost:9200",
|
|
||||||
model_id="xgb-classifier",
|
|
||||||
model=xgb_model,
|
|
||||||
feature_names=["f0", "f1", "f2", "f3", "f4"],
|
|
||||||
)
|
|
||||||
|
|
||||||
# Exercise the ML model in Elasticsearch with the training data
|
|
||||||
>>> es_model.predict(training_data[0])
|
|
||||||
[0 1 1 0 1 0 0 0 1 0]
|
|
||||||
------------------------
|
|
||||||
|
|
||||||
[discrete]
|
|
||||||
[[ml-nlp-pytorch]]
|
|
||||||
=== Natural language processing (NLP) with PyTorch
|
|
||||||
|
|
||||||
|
|
||||||
IMPORTANT: You need to use PyTorch `1.13` or earlier to import an NLP model.
|
|
||||||
Run `pip install torch==1.13` to install the appropriate version of PyTorch.
|
|
||||||
|
|
||||||
For NLP tasks, Eland enables you to import PyTorch models into {es}. Use the
|
|
||||||
`eland_import_hub_model` script to download and install supported
|
|
||||||
https://huggingface.co/transformers[transformer models] from the
|
|
||||||
https://huggingface.co/models[Hugging Face model hub]. For example:
|
|
||||||
|
|
||||||
[source,bash]
|
|
||||||
------------------------
|
|
||||||
$ eland_import_hub_model <authentication> \ <1>
|
|
||||||
--url http://localhost:9200/ \ <2>
|
|
||||||
--hub-model-id elastic/distilbert-base-cased-finetuned-conll03-english \ <3>
|
|
||||||
--task-type ner \ <4>
|
|
||||||
--start
|
|
||||||
------------------------
|
|
||||||
<1> Use an authentication method to access your cluster. Refer to <<ml-nlp-pytorch-auth>>.
|
|
||||||
<2> The cluster URL. Alternatively, use `--cloud-id`.
|
|
||||||
<3> Specify the identifier for the model in the Hugging Face model hub.
|
|
||||||
<4> Specify the type of NLP task. Supported values are `fill_mask`, `ner`,
|
|
||||||
`question_answering`, `text_classification`, `text_embedding`, and `zero_shot_classification`.
|
|
||||||
|
|
||||||
|
|
||||||
[discrete]
|
|
||||||
[[ml-nlp-pytorch-docker]]
|
|
||||||
==== Import model with Docker
|
|
||||||
|
|
||||||
IMPORTANT: To use the Docker container, you need to clone the Eland repository: https://github.com/elastic/eland
|
|
||||||
|
|
||||||
If you want to use Eland without installing it, you can use the Docker image:
|
|
||||||
|
|
||||||
You can use the container interactively:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
$ docker run -it --rm --network host docker.elastic.co/eland/eland
|
|
||||||
```
|
|
||||||
|
|
||||||
Running installed scripts is also possible without an interactive shell, for example:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
docker run -it --rm docker.elastic.co/eland/eland \
|
|
||||||
eland_import_hub_model \
|
|
||||||
--url $ELASTICSEARCH_URL \
|
|
||||||
--hub-model-id elastic/distilbert-base-uncased-finetuned-conll03-english \
|
|
||||||
--start
|
|
||||||
```
|
|
||||||
|
|
||||||
Replace the `$ELASTICSEARCH_URL` with the URL for your Elasticsearch cluster. For authentication purposes, include an administrator username and password in the URL in the following format: `https://username:password@host:port`.
|
|
||||||
|
|
||||||
[discrete]
|
|
||||||
[[ml-nlp-pytorch-air-gapped]]
|
|
||||||
==== Install models in an air-gapped environment
|
|
||||||
|
|
||||||
You can install models in a restricted or closed network by pointing the
|
|
||||||
`eland_import_hub_model` script to local files.
|
|
||||||
|
|
||||||
For an offline install of a Hugging Face model, the model first needs to be
|
|
||||||
cloned locally. Git and https://git-lfs.com/[Git Large File Storage] are
|
|
||||||
required to be installed in your system.
|
|
||||||
|
|
||||||
1. Select a model you want to use from Hugging Face. Refer to the
|
|
||||||
{ml-docs}/ml-nlp-model-ref.html[compatible third party model] list for more
|
|
||||||
information on the supported architectures.
|
|
||||||
|
|
||||||
2. Clone the selected model from Hugging Face by using the model URL. For
|
|
||||||
example:
|
|
||||||
+
|
|
||||||
--
|
|
||||||
[source,bash]
|
|
||||||
----
|
|
||||||
git clone https://huggingface.co/dslim/bert-base-NER
|
|
||||||
----
|
|
||||||
This command results in a local copy of
|
|
||||||
the model in the directory `bert-base-NER`.
|
|
||||||
--
|
|
||||||
|
|
||||||
3. Use the `eland_import_hub_model` script with the `--hub-model-id` set to the
|
|
||||||
directory of the cloned model to install it:
|
|
||||||
+
|
|
||||||
--
|
|
||||||
[source,bash]
|
|
||||||
----
|
|
||||||
eland_import_hub_model \
|
|
||||||
--url 'XXXX' \
|
|
||||||
--hub-model-id /PATH/TO/MODEL \
|
|
||||||
--task-type ner \
|
|
||||||
--es-username elastic --es-password XXX \
|
|
||||||
--es-model-id bert-base-ner
|
|
||||||
----
|
|
||||||
|
|
||||||
If you use the Docker image to run `eland_import_hub_model` you must bind mount
|
|
||||||
the model directory, so the container can read the files:
|
|
||||||
|
|
||||||
[source,bash]
|
|
||||||
----
|
|
||||||
docker run --mount type=bind,source=/PATH/TO/MODELS,destination=/models,readonly -it --rm docker.elastic.co/eland/eland \
|
|
||||||
eland_import_hub_model \
|
|
||||||
--url 'XXXX' \
|
|
||||||
--hub-model-id /models/bert-base-NER \
|
|
||||||
--task-type ner \
|
|
||||||
--es-username elastic --es-password XXX \
|
|
||||||
--es-model-id bert-base-ner
|
|
||||||
----
|
|
||||||
Once it's uploaded to {es}, the model will have the ID specified by
|
|
||||||
`--es-model-id`. If it is not set, the model ID is derived from
|
|
||||||
`--hub-model-id`; spaces and path delimiters are converted to double
|
|
||||||
underscores `__`.
|
|
||||||
|
|
||||||
--
|
|
||||||
|
|
||||||
|
|
||||||
[discrete]
|
|
||||||
[[ml-nlp-pytorch-auth]]
|
|
||||||
==== Authentication methods
|
|
||||||
|
|
||||||
The following authentication options are available when using the import script:
|
|
||||||
|
|
||||||
* Elasticsearch username and password authentication (specified with the `-u` and `-p` options):
|
|
||||||
+
|
|
||||||
--
|
|
||||||
[source,bash]
|
|
||||||
--------------------------------------------------
|
|
||||||
eland_import_hub_model -u <username> -p <password> --cloud-id <cloud-id> ...
|
|
||||||
--------------------------------------------------
|
|
||||||
These `-u` and `-p` options also work when you use `--url`.
|
|
||||||
--
|
|
||||||
|
|
||||||
* Elasticsearch username and password authentication (embedded in the URL):
|
|
||||||
+
|
|
||||||
--
|
|
||||||
[source,bash]
|
|
||||||
--------------------------------------------------
|
|
||||||
eland_import_hub_model --url https://<user>:<password>@<hostname>:<port> ...
|
|
||||||
--------------------------------------------------
|
|
||||||
--
|
|
||||||
|
|
||||||
* Elasticsearch API key authentication:
|
|
||||||
+
|
|
||||||
--
|
|
||||||
[source,bash]
|
|
||||||
--------------------------------------------------
|
|
||||||
eland_import_hub_model --es-api-key <api-key> --url https://<hostname>:<port> ...
|
|
||||||
--------------------------------------------------
|
|
||||||
--
|
|
||||||
|
|
||||||
* HuggingFace Hub access token (for private models):
|
|
||||||
+
|
|
||||||
--
|
|
||||||
[source,bash]
|
|
||||||
--------------------------------------------------
|
|
||||||
eland_import_hub_model --hub-access-token <access-token> ...
|
|
||||||
--------------------------------------------------
|
|
||||||
--
|
|
@ -1,16 +1,16 @@
|
|||||||
[[dataframes]]
|
---
|
||||||
== Data Frames
|
mapped_pages:
|
||||||
|
- https://www.elastic.co/guide/en/elasticsearch/client/eland/current/dataframes.html
|
||||||
|
---
|
||||||
|
|
||||||
`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API
|
# Data Frames [dataframes]
|
||||||
and defers all processing and filtering of data to Elasticsearch
|
|
||||||
instead of your local machine. This means you can process large
|
|
||||||
amounts of data within Elasticsearch from a Jupyter Notebook
|
|
||||||
without overloading your machine.
|
|
||||||
|
|
||||||
[source,python]
|
`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API and defers all processing and filtering of data to Elasticsearch instead of your local machine. This means you can process large amounts of data within Elasticsearch from a Jupyter Notebook without overloading your machine.
|
||||||
-------------------------------------
|
|
||||||
|
```python
|
||||||
>>> import eland as ed
|
>>> import eland as ed
|
||||||
>>> # Connect to 'flights' index via localhost Elasticsearch node
|
>>>
|
||||||
|
# Connect to 'flights' index via localhost Elasticsearch node
|
||||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||||
|
|
||||||
# eland.DataFrame instance has the same API as pandas.DataFrame
|
# eland.DataFrame instance has the same API as pandas.DataFrame
|
||||||
@ -59,4 +59,5 @@ Elasticsearch storage usage: 5.043 MB
|
|||||||
sum 9.261629e+07 8.204365e+06
|
sum 9.261629e+07 8.204365e+06
|
||||||
min 0.000000e+00 1.000205e+02
|
min 0.000000e+00 1.000205e+02
|
||||||
std 4.578263e+03 2.663867e+02
|
std 4.578263e+03 2.663867e+02
|
||||||
-------------------------------------
|
```
|
||||||
|
|
@ -1,33 +1,36 @@
|
|||||||
[[overview]]
|
---
|
||||||
== Overview
|
mapped_pages:
|
||||||
|
- https://www.elastic.co/guide/en/elasticsearch/client/eland/current/index.html
|
||||||
|
- https://www.elastic.co/guide/en/elasticsearch/client/eland/current/overview.html
|
||||||
|
navigation_title: Eland
|
||||||
|
---
|
||||||
|
|
||||||
Eland is a Python client and toolkit for DataFrames and {ml} in {es}.
|
# Eland Python client [overview]
|
||||||
Full documentation is available on https://eland.readthedocs.io[Read the Docs].
|
|
||||||
Source code is available on https://github.com/elastic/eland[GitHub].
|
|
||||||
|
|
||||||
[discrete]
|
Eland is a Python client and toolkit for DataFrames and {{ml}} in {{es}}. Full documentation is available on [Read the Docs](https://eland.readthedocs.io). Source code is available on [GitHub](https://github.com/elastic/eland).
|
||||||
=== Compatibility
|
|
||||||
|
|
||||||
- Supports Python 3.8+ and Pandas 1.5
|
|
||||||
- Supports {es} clusters that are 7.11+, recommended 7.14 or later for all features to work.
|
|
||||||
Make sure your Eland major version matches the major version of your Elasticsearch cluster.
|
|
||||||
|
|
||||||
The recommended way to set your requirements in your `setup.py` or
|
## Compatibility [_compatibility]
|
||||||
`requirements.txt` is::
|
|
||||||
|
|
||||||
# Elasticsearch 8.x
|
* Supports Python 3.9+ and Pandas 1.5
|
||||||
eland>=8,<9
|
* Supports {{es}} 8+ clusters, recommended 8.16 or later for all features to work. Make sure your Eland major version matches the major version of your Elasticsearch cluster.
|
||||||
|
|
||||||
# Elasticsearch 7.x
|
The recommended way to set your requirements in your `setup.py` or `requirements.txt` is:
|
||||||
eland>=7,<8
|
|
||||||
|
|
||||||
[discrete]
|
```
|
||||||
=== Getting Started
|
# Elasticsearch 8.x
|
||||||
|
eland>=8,<9
|
||||||
|
```
|
||||||
|
```
|
||||||
|
# Elasticsearch 7.x
|
||||||
|
eland>=7,<8
|
||||||
|
```
|
||||||
|
|
||||||
Create a `DataFrame` object connected to an {es} cluster running on `http://localhost:9200`:
|
## Getting Started [_getting_started]
|
||||||
|
|
||||||
[source,python]
|
Create a `DataFrame` object connected to an {{es}} cluster running on `http://localhost:9200`:
|
||||||
------------------------------------
|
|
||||||
|
```python
|
||||||
>>> import eland as ed
|
>>> import eland as ed
|
||||||
>>> df = ed.DataFrame(
|
>>> df = ed.DataFrame(
|
||||||
... es_client="http://localhost:9200",
|
... es_client="http://localhost:9200",
|
||||||
@ -48,20 +51,19 @@ Create a `DataFrame` object connected to an {es} cluster running on `http://loca
|
|||||||
13058 858.144337 False ... 6 2018-02-11 14:54:34
|
13058 858.144337 False ... 6 2018-02-11 14:54:34
|
||||||
|
|
||||||
[13059 rows x 27 columns]
|
[13059 rows x 27 columns]
|
||||||
------------------------------------
|
```
|
||||||
|
|
||||||
[discrete]
|
|
||||||
==== Elastic Cloud
|
### Elastic Cloud [_elastic_cloud]
|
||||||
|
|
||||||
You can also connect Eland to an Elasticsearch instance in Elastic Cloud:
|
You can also connect Eland to an Elasticsearch instance in Elastic Cloud:
|
||||||
|
|
||||||
[source,python]
|
```python
|
||||||
------------------------------------
|
|
||||||
>>> import eland as ed
|
>>> import eland as ed
|
||||||
>>> from elasticsearch import Elasticsearch
|
>>> from elasticsearch import Elasticsearch
|
||||||
|
|
||||||
# First instantiate an 'Elasticsearch' instance connected to Elastic Cloud
|
# First instantiate an 'Elasticsearch' instance connected to Elastic Cloud
|
||||||
>>> es = Elasticsearch(cloud_id="...", api_key=("...", "..."))
|
>>> es = Elasticsearch(cloud_id="...", api_key="...")
|
||||||
|
|
||||||
# then wrap the client in an Eland DataFrame:
|
# then wrap the client in an Eland DataFrame:
|
||||||
>>> df = ed.DataFrame(es, es_index_pattern="flights")
|
>>> df = ed.DataFrame(es, es_index_pattern="flights")
|
||||||
@ -73,16 +75,16 @@ You can also connect Eland to an Elasticsearch instance in Elastic Cloud:
|
|||||||
3 181.694216 True ... 0 2018-01-01 10:33:28
|
3 181.694216 True ... 0 2018-01-01 10:33:28
|
||||||
4 730.041778 False ... 0 2018-01-01 05:13:00
|
4 730.041778 False ... 0 2018-01-01 05:13:00
|
||||||
[5 rows x 27 columns]
|
[5 rows x 27 columns]
|
||||||
------------------------------------
|
```
|
||||||
|
|
||||||
Eland can be used for complex queries and aggregations:
|
Eland can be used for complex queries and aggregations:
|
||||||
|
|
||||||
[source,python]
|
```python
|
||||||
------------------------------------
|
|
||||||
>>> df[df.Carrier != "Kibana Airlines"].groupby("Carrier").mean(numeric_only=False)
|
>>> df[df.Carrier != "Kibana Airlines"].groupby("Carrier").mean(numeric_only=False)
|
||||||
AvgTicketPrice Cancelled timestamp
|
AvgTicketPrice Cancelled timestamp
|
||||||
Carrier
|
Carrier
|
||||||
ES-Air 630.235816 0.129814 2018-01-21 20:45:00.200000000
|
ES-Air 630.235816 0.129814 2018-01-21 20:45:00.200000000
|
||||||
JetBeats 627.457373 0.134698 2018-01-21 14:43:18.112400635
|
JetBeats 627.457373 0.134698 2018-01-21 14:43:18.112400635
|
||||||
Logstash Airways 624.581974 0.125188 2018-01-21 16:14:50.711798340
|
Logstash Airways 624.581974 0.125188 2018-01-21 16:14:50.711798340
|
||||||
------------------------------------
|
```
|
||||||
|
|
19
docs/reference/installation.md
Normal file
19
docs/reference/installation.md
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
---
|
||||||
|
mapped_pages:
|
||||||
|
- https://www.elastic.co/guide/en/elasticsearch/client/eland/current/installation.html
|
||||||
|
---
|
||||||
|
|
||||||
|
# Installation [installation]
|
||||||
|
|
||||||
|
Eland can be installed with [pip](https://pip.pypa.io) from [PyPI](https://pypi.org/project/eland). We recommend [using a virtual environment](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) when installing with pip:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
$ python -m pip install eland
|
||||||
|
```
|
||||||
|
|
||||||
|
Alternatively, Eland can be installed with [Conda](https://docs.conda.io) from [Conda Forge](https://anaconda.org/conda-forge/eland):
|
||||||
|
|
||||||
|
```sh
|
||||||
|
$ conda install -c conda-forge eland
|
||||||
|
```
|
||||||
|
|
199
docs/reference/machine-learning.md
Normal file
199
docs/reference/machine-learning.md
Normal file
@ -0,0 +1,199 @@
|
|||||||
|
---
|
||||||
|
mapped_pages:
|
||||||
|
- https://www.elastic.co/guide/en/elasticsearch/client/eland/current/machine-learning.html
|
||||||
|
---
|
||||||
|
|
||||||
|
# Machine Learning [machine-learning]
|
||||||
|
|
||||||
|
|
||||||
|
## Trained models [ml-trained-models]
|
||||||
|
|
||||||
|
Eland allows transforming *some*
|
||||||
|
[trained models](https://eland.readthedocs.io/en/latest/reference/api/eland.ml.MLModel.import_model.html#parameters) from scikit-learn, XGBoost,
|
||||||
|
and LightGBM libraries to be serialized and used as an inference model in {{es}}.
|
||||||
|
|
||||||
|
```python
|
||||||
|
>>> from xgboost import XGBClassifier
|
||||||
|
>>> from eland.ml import MLModel
|
||||||
|
|
||||||
|
# Train and exercise an XGBoost ML model locally
|
||||||
|
>>> xgb_model = XGBClassifier(booster="gbtree")
|
||||||
|
>>> xgb_model.fit(training_data[0], training_data[1])
|
||||||
|
|
||||||
|
>>> xgb_model.predict(training_data[0])
|
||||||
|
[0 1 1 0 1 0 0 0 1 0]
|
||||||
|
|
||||||
|
# Import the model into Elasticsearch
|
||||||
|
>>> es_model = MLModel.import_model(
|
||||||
|
es_client="http://localhost:9200",
|
||||||
|
model_id="xgb-classifier",
|
||||||
|
model=xgb_model,
|
||||||
|
feature_names=["f0", "f1", "f2", "f3", "f4"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Exercise the ML model in Elasticsearch with the training data
|
||||||
|
>>> es_model.predict(training_data[0])
|
||||||
|
[0 1 1 0 1 0 0 0 1 0]
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Natural language processing (NLP) with PyTorch [ml-nlp-pytorch]
|
||||||
|
|
||||||
|
::::{important}
|
||||||
|
You need to install the appropriate version of PyTorch to import an NLP model. Run `python -m pip install 'eland[pytorch]'` to install that version.
|
||||||
|
::::
|
||||||
|
|
||||||
|
|
||||||
|
For NLP tasks, Eland enables you to import PyTorch models into {{es}}. Use the `eland_import_hub_model` script to download and install supported [transformer models](https://huggingface.co/transformers) from the [Hugging Face model hub](https://huggingface.co/models). For example:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
eland_import_hub_model <authentication> \ <1>
|
||||||
|
--url http://localhost:9200/ \ <2>
|
||||||
|
--hub-model-id elastic/distilbert-base-cased-finetuned-conll03-english \ <3>
|
||||||
|
--task-type ner \ <4>
|
||||||
|
--start
|
||||||
|
```
|
||||||
|
|
||||||
|
1. Use an authentication method to access your cluster. Refer to [Authentication methods](machine-learning.md#ml-nlp-pytorch-auth).
|
||||||
|
2. The cluster URL. Alternatively, use `--cloud-id`.
|
||||||
|
3. Specify the identifier for the model in the Hugging Face model hub.
|
||||||
|
4. Specify the type of NLP task. Supported values are `fill_mask`, `ner`, `question_answering`, `text_classification`, `text_embedding`, `text_expansion`, `text_similarity` and `zero_shot_classification`.
|
||||||
|
|
||||||
|
|
||||||
|
For more information about the available options, run `eland_import_hub_model` with the `--help` option.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
eland_import_hub_model --help
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Import model with Docker [ml-nlp-pytorch-docker]
|
||||||
|
|
||||||
|
::::{important}
|
||||||
|
To use the Docker container, you need to clone the Eland repository: [https://github.com/elastic/eland](https://github.com/elastic/eland)
|
||||||
|
::::
|
||||||
|
|
||||||
|
|
||||||
|
If you want to use Eland without installing it, you can use the Docker image:
|
||||||
|
|
||||||
|
You can use the container interactively:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -it --rm --network host docker.elastic.co/eland/eland
|
||||||
|
```
|
||||||
|
|
||||||
|
Running installed scripts is also possible without an interactive shell, for example:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -it --rm docker.elastic.co/eland/eland \
|
||||||
|
eland_import_hub_model \
|
||||||
|
--url $ELASTICSEARCH_URL \
|
||||||
|
--hub-model-id elastic/distilbert-base-uncased-finetuned-conll03-english \
|
||||||
|
--start
|
||||||
|
```
|
||||||
|
|
||||||
|
Replace the `$ELASTICSEARCH_URL` with the URL for your Elasticsearch cluster. For authentication purposes, include an administrator username and password in the URL in the following format: `https://username:password@host:port`.
|
||||||
|
|
||||||
|
|
||||||
|
### Install models in an air-gapped environment [ml-nlp-pytorch-air-gapped]
|
||||||
|
|
||||||
|
You can install models in a restricted or closed network by pointing the `eland_import_hub_model` script to local files.
|
||||||
|
|
||||||
|
For an offline install of a Hugging Face model, the model first needs to be cloned locally. Git and [Git Large File Storage](https://git-lfs.com/) must be installed on your system.
|
||||||
|
|
||||||
|
1. Select a model you want to use from Hugging Face. Refer to the [compatible third party model](docs-content://explore-analyze/machine-learning/nlp/ml-nlp-model-ref.md) list for more information on the supported architectures.
|
||||||
|
2. Clone the selected model from Hugging Face by using the model URL. For example:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://huggingface.co/dslim/bert-base-NER
|
||||||
|
```
|
||||||
|
|
||||||
|
This command results in a local copy of the model in the directory `bert-base-NER`.
|
||||||
|
|
||||||
|
3. Use the `eland_import_hub_model` script with the `--hub-model-id` set to the directory of the cloned model to install it:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
eland_import_hub_model \
|
||||||
|
--url 'XXXX' \
|
||||||
|
--hub-model-id /PATH/TO/MODEL \
|
||||||
|
--task-type ner \
|
||||||
|
--es-username elastic --es-password XXX \
|
||||||
|
--es-model-id bert-base-ner
|
||||||
|
```
|
||||||
|
|
||||||
|
If you use the Docker image to run `eland_import_hub_model` you must bind mount the model directory, so the container can read the files:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run --mount type=bind,source=/PATH/TO/MODEL,destination=/model,readonly -it --rm docker.elastic.co/eland/eland \
|
||||||
|
eland_import_hub_model \
|
||||||
|
--url 'XXXX' \
|
||||||
|
--hub-model-id /model \
|
||||||
|
--task-type ner \
|
||||||
|
--es-username elastic --es-password XXX \
|
||||||
|
--es-model-id bert-base-ner
|
||||||
|
```
|
||||||
|
|
||||||
|
Once it’s uploaded to {{es}}, the model will have the ID specified by `--es-model-id`. If it is not set, the model ID is derived from `--hub-model-id`; spaces and path delimiters are converted to double underscores `__`.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Connect to Elasticsearch through a proxy [ml-nlp-pytorch-proxy]
|
||||||
|
|
||||||
|
Behind the scenes, Eland uses the `requests` Python library, which [allows configuring proxies through an environment variable](https://requests.readthedocs.io/en/latest/user/advanced/#proxies). For example, to use an HTTP proxy to connect to an HTTPS Elasticsearch cluster, you need to set the `HTTPS_PROXY` environment variable when invoking Eland:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
HTTPS_PROXY=http://proxy-host:proxy-port eland_import_hub_model ...
|
||||||
|
```
|
||||||
|
|
||||||
|
If you disabled security on your Elasticsearch cluster, you should use `HTTP_PROXY` instead.
|
||||||
|
|
||||||
|
|
||||||
|
### Authentication methods [ml-nlp-pytorch-auth]
|
||||||
|
|
||||||
|
The following authentication options are available when using the import script:
|
||||||
|
|
||||||
|
* Elasticsearch username and password authentication (specified with the `-u` and `-p` options):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
eland_import_hub_model -u <username> -p <password> --cloud-id <cloud-id> ...
|
||||||
|
```
|
||||||
|
|
||||||
|
These `-u` and `-p` options also work when you use `--url`.
|
||||||
|
|
||||||
|
* Elasticsearch username and password authentication (embedded in the URL):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
eland_import_hub_model --url https://<user>:<password>@<hostname>:<port> ...
|
||||||
|
```
|
||||||
|
|
||||||
|
* Elasticsearch API key authentication:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
eland_import_hub_model --es-api-key <api-key> --url https://<hostname>:<port> ...
|
||||||
|
```
|
||||||
|
|
||||||
|
* HuggingFace Hub access token (for private models):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
eland_import_hub_model --hub-access-token <access-token> ...
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### TLS/SSL [ml-nlp-pytorch-tls]
|
||||||
|
|
||||||
|
The following TLS/SSL options for Elasticsearch are available when using the import script:
|
||||||
|
|
||||||
|
* Specify alternate CA bundle to verify the cluster certificate:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
eland_import_hub_model --ca-certs CA_CERTS ...
|
||||||
|
```
|
||||||
|
|
||||||
|
* Disable TLS/SSL verification altogether (strongly discouraged):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
eland_import_hub_model --insecure ...
|
||||||
|
```
|
||||||
|
|
||||||
|
|
6
docs/reference/toc.yml
Normal file
6
docs/reference/toc.yml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
project: 'Eland reference'
|
||||||
|
toc:
|
||||||
|
- file: index.md
|
||||||
|
- file: installation.md
|
||||||
|
- file: dataframes.md
|
||||||
|
- file: machine-learning.md
|
@ -1,13 +1,5 @@
|
|||||||
elasticsearch>=7.7
|
matplotlib
|
||||||
pandas>=1.5
|
|
||||||
matplotlib>=3.6
|
|
||||||
nbval
|
nbval
|
||||||
scikit-learn>=0.22.1
|
|
||||||
xgboost>=1
|
|
||||||
lightgbm
|
|
||||||
sphinx==5.3.0
|
sphinx==5.3.0
|
||||||
nbsphinx
|
nbsphinx
|
||||||
furo
|
furo
|
||||||
|
|
||||||
# traitlets has been having all sorts of release problems lately.
|
|
||||||
traitlets<5.1
|
|
||||||
|
@ -200,7 +200,7 @@ Configuring PyCharm And Running Tests
|
|||||||
- To test specific versions of Python run
|
- To test specific versions of Python run
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
nox -s test-3.8
|
nox -s test-3.12
|
||||||
|
|
||||||
|
|
||||||
Documentation
|
Documentation
|
||||||
|
@ -24,7 +24,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"For this example, you will need:\n",
|
"For this example, you will need:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"- Python 3.8 or later\n",
|
"- Python 3.9 or later\n",
|
||||||
"- An Elastic deployment\n",
|
"- An Elastic deployment\n",
|
||||||
" - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration))\n",
|
" - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration))\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -49,6 +49,7 @@
|
|||||||
~DataFrame.tail
|
~DataFrame.tail
|
||||||
~DataFrame.to_csv
|
~DataFrame.to_csv
|
||||||
~DataFrame.to_html
|
~DataFrame.to_html
|
||||||
|
~DataFrame.to_json
|
||||||
~DataFrame.to_numpy
|
~DataFrame.to_numpy
|
||||||
~DataFrame.to_pandas
|
~DataFrame.to_pandas
|
||||||
~DataFrame.to_string
|
~DataFrame.to_string
|
||||||
|
6
docs/sphinx/reference/api/eland.DataFrame.to_json.rst
Normal file
6
docs/sphinx/reference/api/eland.DataFrame.to_json.rst
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
eland.DataFrame.to\_json
|
||||||
|
========================
|
||||||
|
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
.. automethod:: DataFrame.to_json
|
@ -17,6 +17,7 @@
|
|||||||
~MLModel.delete_model
|
~MLModel.delete_model
|
||||||
~MLModel.exists_model
|
~MLModel.exists_model
|
||||||
~MLModel.export_model
|
~MLModel.export_model
|
||||||
|
~MLModel.import_ltr_model
|
||||||
~MLModel.import_model
|
~MLModel.import_model
|
||||||
~MLModel.predict
|
~MLModel.predict
|
||||||
|
|
||||||
|
@ -140,5 +140,6 @@ Serialization / IO / Conversion
|
|||||||
DataFrame.to_numpy
|
DataFrame.to_numpy
|
||||||
DataFrame.to_csv
|
DataFrame.to_csv
|
||||||
DataFrame.to_html
|
DataFrame.to_html
|
||||||
|
DataFrame.to_json
|
||||||
DataFrame.to_string
|
DataFrame.to_string
|
||||||
DataFrame.to_pandas
|
DataFrame.to_pandas
|
||||||
|
@ -395,7 +395,7 @@ script instead of being modified manually.
|
|||||||
+---------------------------------------+------------+
|
+---------------------------------------+------------+
|
||||||
| ``ed.DataFrame.to_html()`` | **Yes** |
|
| ``ed.DataFrame.to_html()`` | **Yes** |
|
||||||
+---------------------------------------+------------+
|
+---------------------------------------+------------+
|
||||||
| ``ed.DataFrame.to_json()`` | No |
|
| ``ed.DataFrame.to_json()`` | **Yes** |
|
||||||
+---------------------------------------+------------+
|
+---------------------------------------+------------+
|
||||||
| ``ed.DataFrame.to_latex()`` | No |
|
| ``ed.DataFrame.to_latex()`` | No |
|
||||||
+---------------------------------------+------------+
|
+---------------------------------------+------------+
|
||||||
|
@ -15,6 +15,8 @@
|
|||||||
# specific language governing permissions and limitations
|
# specific language governing permissions and limitations
|
||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
|
||||||
from ._version import ( # noqa: F401
|
from ._version import ( # noqa: F401
|
||||||
__author__,
|
__author__,
|
||||||
__author_email__,
|
__author_email__,
|
||||||
@ -25,13 +27,16 @@ from ._version import ( # noqa: F401
|
|||||||
__url__,
|
__url__,
|
||||||
__version__,
|
__version__,
|
||||||
)
|
)
|
||||||
from .common import SortOrder
|
from .common import ElandDeprecationWarning, SortOrder
|
||||||
from .dataframe import DataFrame
|
from .dataframe import DataFrame
|
||||||
from .etl import csv_to_eland, eland_to_pandas, pandas_to_eland
|
from .etl import csv_to_eland, eland_to_pandas, pandas_to_eland
|
||||||
from .index import Index
|
from .index import Index
|
||||||
from .ndframe import NDFrame
|
from .ndframe import NDFrame
|
||||||
from .series import Series
|
from .series import Series
|
||||||
|
|
||||||
|
# Display Eland deprecation warnings by default
|
||||||
|
warnings.simplefilter("default", category=ElandDeprecationWarning)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"DataFrame",
|
"DataFrame",
|
||||||
"Series",
|
"Series",
|
||||||
|
@ -18,7 +18,7 @@
|
|||||||
__title__ = "eland"
|
__title__ = "eland"
|
||||||
__description__ = "Python Client and Toolkit for DataFrames, Big Data, Machine Learning and ETL in Elasticsearch"
|
__description__ = "Python Client and Toolkit for DataFrames, Big Data, Machine Learning and ETL in Elasticsearch"
|
||||||
__url__ = "https://github.com/elastic/eland"
|
__url__ = "https://github.com/elastic/eland"
|
||||||
__version__ = "8.12.0"
|
__version__ = "9.0.1"
|
||||||
__author__ = "Steve Dodson"
|
__author__ = "Steve Dodson"
|
||||||
__author_email__ = "steve.dodson@elastic.co"
|
__author_email__ = "steve.dodson@elastic.co"
|
||||||
__maintainer__ = "Elastic Client Library Maintainers"
|
__maintainer__ = "Elastic Client Library Maintainers"
|
||||||
|
@ -32,7 +32,8 @@ import textwrap
|
|||||||
from elastic_transport.client_utils import DEFAULT
|
from elastic_transport.client_utils import DEFAULT
|
||||||
from elasticsearch import AuthenticationException, Elasticsearch
|
from elasticsearch import AuthenticationException, Elasticsearch
|
||||||
|
|
||||||
from eland.common import parse_es_version
|
from eland._version import __version__
|
||||||
|
from eland.common import is_serverless_es, parse_es_version
|
||||||
|
|
||||||
MODEL_HUB_URL = "https://huggingface.co"
|
MODEL_HUB_URL = "https://huggingface.co"
|
||||||
|
|
||||||
@ -40,7 +41,9 @@ MODEL_HUB_URL = "https://huggingface.co"
|
|||||||
def get_arg_parser():
|
def get_arg_parser():
|
||||||
from eland.ml.pytorch.transformers import SUPPORTED_TASK_TYPES
|
from eland.ml.pytorch.transformers import SUPPORTED_TASK_TYPES
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser(
|
||||||
|
exit_on_error=False
|
||||||
|
) # throw exception rather than exit
|
||||||
location_args = parser.add_mutually_exclusive_group(required=True)
|
location_args = parser.add_mutually_exclusive_group(required=True)
|
||||||
location_args.add_argument(
|
location_args.add_argument(
|
||||||
"--url",
|
"--url",
|
||||||
@ -96,7 +99,7 @@ def get_arg_parser():
|
|||||||
"--task-type",
|
"--task-type",
|
||||||
required=False,
|
required=False,
|
||||||
choices=SUPPORTED_TASK_TYPES,
|
choices=SUPPORTED_TASK_TYPES,
|
||||||
help="The task type for the model usage. Will attempt to auto-detect task type for the model if not provided. "
|
help="The task type for the model usage. Use text_similarity for rerank tasks. Will attempt to auto-detect task type for the model if not provided. "
|
||||||
"Default: auto",
|
"Default: auto",
|
||||||
default="auto",
|
default="auto",
|
||||||
)
|
)
|
||||||
@ -141,15 +144,47 @@ def get_arg_parser():
|
|||||||
help="String to prepend to model input at search",
|
help="String to prepend to model input at search",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-model-input-length",
|
||||||
|
required=False,
|
||||||
|
default=None,
|
||||||
|
help="""Set the model's max input length.
|
||||||
|
Usually the max input length is derived from the Hugging Face
|
||||||
|
model configuration. Use this option to explicitly set the model's
|
||||||
|
max input length if the value can not be found in the Hugging
|
||||||
|
Face configuration. Max input length should never exceed the
|
||||||
|
model's true max length, setting a smaller max length is valid.
|
||||||
|
""",
|
||||||
|
type=int,
|
||||||
|
)
|
||||||
|
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = get_arg_parser()
|
||||||
|
try:
|
||||||
|
return parser.parse_args()
|
||||||
|
except argparse.ArgumentError as argument_error:
|
||||||
|
if argument_error.argument_name == "--task-type":
|
||||||
|
message = (
|
||||||
|
argument_error.message
|
||||||
|
+ "\n\nUse 'text_similarity' for rerank tasks in Elasticsearch"
|
||||||
|
)
|
||||||
|
parser.error(message=message)
|
||||||
|
else:
|
||||||
|
parser.error(message=argument_error.message)
|
||||||
|
except argparse.ArgumentTypeError as type_error:
|
||||||
|
parser.error(str(type_error))
|
||||||
|
|
||||||
|
|
||||||
def get_es_client(cli_args, logger):
|
def get_es_client(cli_args, logger):
|
||||||
try:
|
try:
|
||||||
es_args = {
|
es_args = {
|
||||||
"request_timeout": 300,
|
"request_timeout": 300,
|
||||||
"verify_certs": cli_args.insecure,
|
"verify_certs": cli_args.insecure,
|
||||||
"ca_certs": cli_args.ca_certs,
|
"ca_certs": cli_args.ca_certs,
|
||||||
|
"node_class": "requests",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Deployment location
|
# Deployment location
|
||||||
@ -180,13 +215,20 @@ def get_es_client(cli_args, logger):
|
|||||||
|
|
||||||
def check_cluster_version(es_client, logger):
|
def check_cluster_version(es_client, logger):
|
||||||
es_info = es_client.info()
|
es_info = es_client.info()
|
||||||
|
|
||||||
|
if is_serverless_es(es_client):
|
||||||
|
logger.info(f"Connected to serverless cluster '{es_info['cluster_name']}'")
|
||||||
|
# Serverless is compatible
|
||||||
|
# Return the latest known semantic version, i.e. this version
|
||||||
|
return parse_es_version(__version__)
|
||||||
|
|
||||||
|
# check the semantic version for non-serverless clusters
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Connected to cluster named '{es_info['cluster_name']}' (version: {es_info['version']['number']})"
|
f"Connected to cluster named '{es_info['cluster_name']}' (version: {es_info['version']['number']})"
|
||||||
)
|
)
|
||||||
|
|
||||||
sem_ver = parse_es_version(es_info["version"]["number"])
|
sem_ver = parse_es_version(es_info["version"]["number"])
|
||||||
major_version = sem_ver[0]
|
major_version = sem_ver[0]
|
||||||
minor_version = sem_ver[1]
|
|
||||||
|
|
||||||
# NLP models added in 8
|
# NLP models added in 8
|
||||||
if major_version < 8:
|
if major_version < 8:
|
||||||
@ -194,14 +236,9 @@ def check_cluster_version(es_client, logger):
|
|||||||
f"Elasticsearch version {major_version} does not support NLP models. Please upgrade Elasticsearch to the latest version"
|
f"Elasticsearch version {major_version} does not support NLP models. Please upgrade Elasticsearch to the latest version"
|
||||||
)
|
)
|
||||||
exit(1)
|
exit(1)
|
||||||
|
elif major_version < 9:
|
||||||
# PyTorch was upgraded to version 1.13.1 in 8.7.
|
|
||||||
# and is incompatible with earlier versions
|
|
||||||
if major_version == 8 and minor_version < 7:
|
|
||||||
import torch
|
|
||||||
|
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Eland uses PyTorch version {torch.__version__} which is incompatible with Elasticsearch versions prior to 8.7. Please upgrade Elasticsearch to at least version 8.7"
|
"Eland 9.x does not support Elasticsearch 8.x. Please upgrade Elasticsearch first."
|
||||||
)
|
)
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
@ -220,6 +257,7 @@ def main():
|
|||||||
SUPPORTED_TASK_TYPES,
|
SUPPORTED_TASK_TYPES,
|
||||||
TaskTypeError,
|
TaskTypeError,
|
||||||
TransformerModel,
|
TransformerModel,
|
||||||
|
UnknownModelInputSizeError,
|
||||||
)
|
)
|
||||||
except ModuleNotFoundError as e:
|
except ModuleNotFoundError as e:
|
||||||
logger.error(
|
logger.error(
|
||||||
@ -237,7 +275,7 @@ def main():
|
|||||||
assert SUPPORTED_TASK_TYPES
|
assert SUPPORTED_TASK_TYPES
|
||||||
|
|
||||||
# Parse arguments
|
# Parse arguments
|
||||||
args = get_arg_parser().parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
# Connect to ES
|
# Connect to ES
|
||||||
logger.info("Establishing connection to Elasticsearch")
|
logger.info("Establishing connection to Elasticsearch")
|
||||||
@ -259,6 +297,7 @@ def main():
|
|||||||
quantize=args.quantize,
|
quantize=args.quantize,
|
||||||
ingest_prefix=args.ingest_prefix,
|
ingest_prefix=args.ingest_prefix,
|
||||||
search_prefix=args.search_prefix,
|
search_prefix=args.search_prefix,
|
||||||
|
max_model_input_size=args.max_model_input_length,
|
||||||
)
|
)
|
||||||
model_path, config, vocab_path = tm.save(tmp_dir)
|
model_path, config, vocab_path = tm.save(tmp_dir)
|
||||||
except TaskTypeError as err:
|
except TaskTypeError as err:
|
||||||
@ -266,6 +305,12 @@ def main():
|
|||||||
f"Failed to get model for task type, please provide valid task type via '--task-type' parameter. Caused by {err}"
|
f"Failed to get model for task type, please provide valid task type via '--task-type' parameter. Caused by {err}"
|
||||||
)
|
)
|
||||||
exit(1)
|
exit(1)
|
||||||
|
except UnknownModelInputSizeError as err:
|
||||||
|
logger.error(
|
||||||
|
f"""Could not automatically determine the model's max input size from the model configuration.
|
||||||
|
Please provide the max input size via the --max-model-input-length parameter. Caused by {err}"""
|
||||||
|
)
|
||||||
|
exit(1)
|
||||||
|
|
||||||
ptm = PyTorchModel(
|
ptm = PyTorchModel(
|
||||||
es, args.es_model_id if args.es_model_id else tm.elasticsearch_model_id()
|
es, args.es_model_id if args.es_model_id else tm.elasticsearch_model_id()
|
||||||
|
@ -52,6 +52,10 @@ PANDAS_VERSION: Tuple[int, ...] = tuple(
|
|||||||
_ELAND_MAJOR_VERSION = int(_eland_version.split(".")[0])
|
_ELAND_MAJOR_VERSION = int(_eland_version.split(".")[0])
|
||||||
|
|
||||||
|
|
||||||
|
class ElandDeprecationWarning(DeprecationWarning):
|
||||||
|
"""Warning for deprecation functionality in Eland"""
|
||||||
|
|
||||||
|
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter("ignore")
|
warnings.simplefilter("ignore")
|
||||||
EMPTY_SERIES_DTYPE = pd.Series().dtype
|
EMPTY_SERIES_DTYPE = pd.Series().dtype
|
||||||
@ -305,11 +309,15 @@ def elasticsearch_date_to_pandas_date(
|
|||||||
|
|
||||||
|
|
||||||
def ensure_es_client(
|
def ensure_es_client(
|
||||||
es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch]
|
es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch],
|
||||||
) -> Elasticsearch:
|
) -> Elasticsearch:
|
||||||
if isinstance(es_client, tuple):
|
if isinstance(es_client, tuple):
|
||||||
es_client = list(es_client)
|
es_client = list(es_client)
|
||||||
if not isinstance(es_client, Elasticsearch):
|
if (
|
||||||
|
isinstance(es_client, str)
|
||||||
|
or isinstance(es_client, list)
|
||||||
|
or isinstance(es_client, tuple)
|
||||||
|
):
|
||||||
es_client = Elasticsearch(es_client)
|
es_client = Elasticsearch(es_client)
|
||||||
return es_client
|
return es_client
|
||||||
|
|
||||||
@ -340,6 +348,17 @@ def es_version(es_client: Elasticsearch) -> Tuple[int, int, int]:
|
|||||||
return eland_es_version
|
return eland_es_version
|
||||||
|
|
||||||
|
|
||||||
|
def is_serverless_es(es_client: Elasticsearch) -> bool:
|
||||||
|
"""
|
||||||
|
Returns true if the client is connected to a serverless instance of Elasticsearch.
|
||||||
|
"""
|
||||||
|
es_info = es_client.info()
|
||||||
|
return (
|
||||||
|
"build_flavor" in es_info["version"]
|
||||||
|
and es_info["version"]["build_flavor"] == "serverless"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def parse_es_version(version: str) -> Tuple[int, int, int]:
|
def parse_es_version(version: str) -> Tuple[int, int, int]:
|
||||||
"""
|
"""
|
||||||
Parse the semantic version from a string e.g. '8.8.0'
|
Parse the semantic version from a string e.g. '8.8.0'
|
||||||
|
@ -34,7 +34,7 @@ from pandas.io.formats.printing import pprint_thing # type: ignore
|
|||||||
from pandas.util._validators import validate_bool_kwarg # type: ignore
|
from pandas.util._validators import validate_bool_kwarg # type: ignore
|
||||||
|
|
||||||
import eland.plotting as gfx
|
import eland.plotting as gfx
|
||||||
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
|
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
|
||||||
from eland.filter import BooleanFilter
|
from eland.filter import BooleanFilter
|
||||||
from eland.groupby import DataFrameGroupBy
|
from eland.groupby import DataFrameGroupBy
|
||||||
from eland.ndframe import NDFrame
|
from eland.ndframe import NDFrame
|
||||||
@ -83,7 +83,7 @@ class DataFrame(NDFrame):
|
|||||||
3 181.694216 True ... 0 2018-01-01 10:33:28
|
3 181.694216 True ... 0 2018-01-01 10:33:28
|
||||||
4 730.041778 False ... 0 2018-01-01 05:13:00
|
4 730.041778 False ... 0 2018-01-01 05:13:00
|
||||||
<BLANKLINE>
|
<BLANKLINE>
|
||||||
[5 rows x 27 columns]
|
[5 rows x 28 columns]
|
||||||
|
|
||||||
|
|
||||||
Constructing DataFrame from an Elasticsearch client and an Elasticsearch index
|
Constructing DataFrame from an Elasticsearch client and an Elasticsearch index
|
||||||
@ -173,13 +173,13 @@ class DataFrame(NDFrame):
|
|||||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||||
>>> assert isinstance(df.columns, pd.Index)
|
>>> assert isinstance(df.columns, pd.Index)
|
||||||
>>> df.columns
|
>>> df.columns
|
||||||
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
|
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Cities', 'Dest', 'DestAirportID', 'DestCityName',
|
||||||
... 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
|
'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
|
||||||
... 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
|
'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
|
||||||
... 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
|
'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
|
||||||
... 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
|
'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
|
||||||
... 'timestamp'],
|
'timestamp'],
|
||||||
... dtype='object')
|
dtype='object')
|
||||||
"""
|
"""
|
||||||
return self._query_compiler.columns
|
return self._query_compiler.columns
|
||||||
|
|
||||||
@ -411,9 +411,7 @@ class DataFrame(NDFrame):
|
|||||||
axis = pd.DataFrame._get_axis_name(axis)
|
axis = pd.DataFrame._get_axis_name(axis)
|
||||||
axes = {axis: labels}
|
axes = {axis: labels}
|
||||||
elif index is not None or columns is not None:
|
elif index is not None or columns is not None:
|
||||||
axes, _ = pd.DataFrame()._construct_axes_from_arguments(
|
axes = {"columns": columns, "index": index}
|
||||||
(index, columns), {}
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Need to specify at least one of 'labels', 'index' or 'columns'"
|
"Need to specify at least one of 'labels', 'index' or 'columns'"
|
||||||
@ -956,8 +954,10 @@ class DataFrame(NDFrame):
|
|||||||
elif verbose is False: # specifically set to False, not nesc None
|
elif verbose is False: # specifically set to False, not nesc None
|
||||||
_non_verbose_repr()
|
_non_verbose_repr()
|
||||||
else:
|
else:
|
||||||
_non_verbose_repr() if exceeds_info_cols else _verbose_repr(
|
(
|
||||||
number_of_columns
|
_non_verbose_repr()
|
||||||
|
if exceeds_info_cols
|
||||||
|
else _verbose_repr(number_of_columns)
|
||||||
)
|
)
|
||||||
|
|
||||||
# pandas 0.25.1 uses get_dtype_counts() here. This
|
# pandas 0.25.1 uses get_dtype_counts() here. This
|
||||||
@ -1303,6 +1303,7 @@ class DataFrame(NDFrame):
|
|||||||
quoting=None,
|
quoting=None,
|
||||||
quotechar='"',
|
quotechar='"',
|
||||||
line_terminator=None,
|
line_terminator=None,
|
||||||
|
lineterminator=None,
|
||||||
chunksize=None,
|
chunksize=None,
|
||||||
tupleize_cols=None,
|
tupleize_cols=None,
|
||||||
date_format=None,
|
date_format=None,
|
||||||
@ -1317,6 +1318,13 @@ class DataFrame(NDFrame):
|
|||||||
--------
|
--------
|
||||||
:pandas_api_docs:`pandas.DataFrame.to_csv`
|
:pandas_api_docs:`pandas.DataFrame.to_csv`
|
||||||
"""
|
"""
|
||||||
|
if line_terminator:
|
||||||
|
warnings.warn(
|
||||||
|
"The line_terminator argument will be replaced by lineterminator",
|
||||||
|
PendingDeprecationWarning,
|
||||||
|
stacklevel=2,
|
||||||
|
)
|
||||||
|
|
||||||
kwargs = {
|
kwargs = {
|
||||||
"path_or_buf": path_or_buf,
|
"path_or_buf": path_or_buf,
|
||||||
"sep": sep,
|
"sep": sep,
|
||||||
@ -1331,7 +1339,7 @@ class DataFrame(NDFrame):
|
|||||||
"compression": compression,
|
"compression": compression,
|
||||||
"quoting": quoting,
|
"quoting": quoting,
|
||||||
"quotechar": quotechar,
|
"quotechar": quotechar,
|
||||||
"line_terminator": line_terminator,
|
"lineterminator": lineterminator or line_terminator,
|
||||||
"chunksize": chunksize,
|
"chunksize": chunksize,
|
||||||
"date_format": date_format,
|
"date_format": date_format,
|
||||||
"doublequote": doublequote,
|
"doublequote": doublequote,
|
||||||
@ -1340,6 +1348,50 @@ class DataFrame(NDFrame):
|
|||||||
}
|
}
|
||||||
return self._query_compiler.to_csv(**kwargs)
|
return self._query_compiler.to_csv(**kwargs)
|
||||||
|
|
||||||
|
def to_json(
|
||||||
|
self,
|
||||||
|
path_or_buf=None,
|
||||||
|
orient=None,
|
||||||
|
date_format=None,
|
||||||
|
double_precision=10,
|
||||||
|
force_ascii=True,
|
||||||
|
date_unit="ms",
|
||||||
|
default_handler=None,
|
||||||
|
lines=False,
|
||||||
|
compression="infer",
|
||||||
|
index=None,
|
||||||
|
indent=None,
|
||||||
|
storage_options=None,
|
||||||
|
):
|
||||||
|
"""Write Elasticsearch data to a json file.
|
||||||
|
|
||||||
|
By setting the ``lines`` parameter to ``True``, and ``orient`` to ``'records'``,
|
||||||
|
the entire DataFrame can be written in a streaming manner.
|
||||||
|
Doing so avoids the need to have the entire DataFrame in memory.
|
||||||
|
This format is known as JSON lines and can use the file extension ``.jsonl``.
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
:pandas_api_docs:`pandas.DataFrame.to_json`
|
||||||
|
"""
|
||||||
|
if index is None and PANDAS_VERSION[0] == 1:
|
||||||
|
index = True # switch to the pandas 1 default
|
||||||
|
kwargs = {
|
||||||
|
"path_or_buf": path_or_buf,
|
||||||
|
"orient": orient,
|
||||||
|
"date_format": date_format,
|
||||||
|
"double_precision": double_precision,
|
||||||
|
"force_ascii": force_ascii,
|
||||||
|
"date_unit": date_unit,
|
||||||
|
"default_handler": default_handler,
|
||||||
|
"lines": lines,
|
||||||
|
"compression": compression,
|
||||||
|
"index": index,
|
||||||
|
"indent": indent,
|
||||||
|
"storage_options": storage_options,
|
||||||
|
}
|
||||||
|
return self._query_compiler.to_json(**kwargs)
|
||||||
|
|
||||||
def to_pandas(self, show_progress: bool = False) -> pd.DataFrame:
|
def to_pandas(self, show_progress: bool = False) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Utility method to convert eland.Dataframe to pandas.Dataframe
|
Utility method to convert eland.Dataframe to pandas.Dataframe
|
||||||
@ -1962,9 +2014,9 @@ class DataFrame(NDFrame):
|
|||||||
--------
|
--------
|
||||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||||
>>> df.shape
|
>>> df.shape
|
||||||
(13059, 27)
|
(13059, 28)
|
||||||
>>> df.query('FlightDelayMin > 60').shape
|
>>> df.query('FlightDelayMin > 60').shape
|
||||||
(2730, 27)
|
(2730, 28)
|
||||||
"""
|
"""
|
||||||
if isinstance(expr, BooleanFilter):
|
if isinstance(expr, BooleanFilter):
|
||||||
return DataFrame(
|
return DataFrame(
|
||||||
|
43
eland/etl.py
43
eland/etl.py
@ -16,6 +16,7 @@
|
|||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
import csv
|
import csv
|
||||||
|
import warnings
|
||||||
from collections import deque
|
from collections import deque
|
||||||
from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union
|
from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union
|
||||||
|
|
||||||
@ -110,15 +111,15 @@ def pandas_to_eland(
|
|||||||
2 3.141 1 ... 3 Long text - to be indexed as es type text
|
2 3.141 1 ... 3 Long text - to be indexed as es type text
|
||||||
<BLANKLINE>
|
<BLANKLINE>
|
||||||
[3 rows x 8 columns]
|
[3 rows x 8 columns]
|
||||||
>>> pd_df.dtypes
|
>>> pd_df.dtypes # doctest skip required for pandas < 2 # doctest: +SKIP
|
||||||
A float64
|
A float64
|
||||||
B int64
|
B int64
|
||||||
C object
|
C object
|
||||||
D datetime64[ns]
|
D datetime64[s]
|
||||||
E float64
|
E float64
|
||||||
F bool
|
F bool
|
||||||
G int64
|
G int64
|
||||||
H object
|
H object
|
||||||
dtype: object
|
dtype: object
|
||||||
|
|
||||||
Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`.
|
Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`.
|
||||||
@ -262,7 +263,7 @@ def eland_to_pandas(ed_df: DataFrame, show_progress: bool = False) -> pd.DataFra
|
|||||||
3 181.694216 True ... 0 2018-01-01 10:33:28
|
3 181.694216 True ... 0 2018-01-01 10:33:28
|
||||||
4 730.041778 False ... 0 2018-01-01 05:13:00
|
4 730.041778 False ... 0 2018-01-01 05:13:00
|
||||||
<BLANKLINE>
|
<BLANKLINE>
|
||||||
[5 rows x 27 columns]
|
[5 rows x 28 columns]
|
||||||
|
|
||||||
Convert `eland.DataFrame` to `pandas.DataFrame` (Note: this loads entire Elasticsearch index into core memory)
|
Convert `eland.DataFrame` to `pandas.DataFrame` (Note: this loads entire Elasticsearch index into core memory)
|
||||||
|
|
||||||
@ -277,7 +278,7 @@ def eland_to_pandas(ed_df: DataFrame, show_progress: bool = False) -> pd.DataFra
|
|||||||
3 181.694216 True ... 0 2018-01-01 10:33:28
|
3 181.694216 True ... 0 2018-01-01 10:33:28
|
||||||
4 730.041778 False ... 0 2018-01-01 05:13:00
|
4 730.041778 False ... 0 2018-01-01 05:13:00
|
||||||
<BLANKLINE>
|
<BLANKLINE>
|
||||||
[5 rows x 27 columns]
|
[5 rows x 28 columns]
|
||||||
|
|
||||||
Convert `eland.DataFrame` to `pandas.DataFrame` and show progress every 10000 rows
|
Convert `eland.DataFrame` to `pandas.DataFrame` and show progress every 10000 rows
|
||||||
|
|
||||||
@ -307,9 +308,9 @@ def csv_to_eland( # type: ignore
|
|||||||
names=None,
|
names=None,
|
||||||
index_col=None,
|
index_col=None,
|
||||||
usecols=None,
|
usecols=None,
|
||||||
squeeze=False,
|
squeeze=None,
|
||||||
prefix=None,
|
prefix=None,
|
||||||
mangle_dupe_cols=True,
|
mangle_dupe_cols=None,
|
||||||
# General Parsing Configuration
|
# General Parsing Configuration
|
||||||
dtype=None,
|
dtype=None,
|
||||||
engine=None,
|
engine=None,
|
||||||
@ -357,6 +358,7 @@ def csv_to_eland( # type: ignore
|
|||||||
low_memory: bool = _DEFAULT_LOW_MEMORY,
|
low_memory: bool = _DEFAULT_LOW_MEMORY,
|
||||||
memory_map=False,
|
memory_map=False,
|
||||||
float_precision=None,
|
float_precision=None,
|
||||||
|
**extra_kwargs,
|
||||||
) -> "DataFrame":
|
) -> "DataFrame":
|
||||||
"""
|
"""
|
||||||
Read a comma-separated values (csv) file into eland.DataFrame (i.e. an Elasticsearch index).
|
Read a comma-separated values (csv) file into eland.DataFrame (i.e. an Elasticsearch index).
|
||||||
@ -485,7 +487,6 @@ def csv_to_eland( # type: ignore
|
|||||||
"usecols": usecols,
|
"usecols": usecols,
|
||||||
"verbose": verbose,
|
"verbose": verbose,
|
||||||
"encoding": encoding,
|
"encoding": encoding,
|
||||||
"squeeze": squeeze,
|
|
||||||
"memory_map": memory_map,
|
"memory_map": memory_map,
|
||||||
"float_precision": float_precision,
|
"float_precision": float_precision,
|
||||||
"na_filter": na_filter,
|
"na_filter": na_filter,
|
||||||
@ -494,9 +495,9 @@ def csv_to_eland( # type: ignore
|
|||||||
"error_bad_lines": error_bad_lines,
|
"error_bad_lines": error_bad_lines,
|
||||||
"on_bad_lines": on_bad_lines,
|
"on_bad_lines": on_bad_lines,
|
||||||
"low_memory": low_memory,
|
"low_memory": low_memory,
|
||||||
"mangle_dupe_cols": mangle_dupe_cols,
|
|
||||||
"infer_datetime_format": infer_datetime_format,
|
"infer_datetime_format": infer_datetime_format,
|
||||||
"skip_blank_lines": skip_blank_lines,
|
"skip_blank_lines": skip_blank_lines,
|
||||||
|
**extra_kwargs,
|
||||||
}
|
}
|
||||||
|
|
||||||
if chunksize is None:
|
if chunksize is None:
|
||||||
@ -525,6 +526,18 @@ def csv_to_eland( # type: ignore
|
|||||||
|
|
||||||
kwargs.pop("on_bad_lines")
|
kwargs.pop("on_bad_lines")
|
||||||
|
|
||||||
|
if "squeeze" in kwargs:
|
||||||
|
kwargs.pop("squeeze")
|
||||||
|
warnings.warn(
|
||||||
|
"This argument no longer works, use .squeeze('columns') on your DataFrame instead"
|
||||||
|
)
|
||||||
|
|
||||||
|
if "mangle_dupe_cols" in kwargs:
|
||||||
|
kwargs.pop("mangle_dupe_cols")
|
||||||
|
warnings.warn(
|
||||||
|
"The mangle_dupe_cols argument no longer works. Furthermore, "
|
||||||
|
"duplicate columns will automatically get a number suffix."
|
||||||
|
)
|
||||||
# read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch)
|
# read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch)
|
||||||
reader = pd.read_csv(filepath_or_buffer, **kwargs)
|
reader = pd.read_csv(filepath_or_buffer, **kwargs)
|
||||||
|
|
||||||
|
@ -443,9 +443,9 @@ class FieldMappings:
|
|||||||
try:
|
try:
|
||||||
series = df.loc[df.es_field_name == es_field_name_keyword]
|
series = df.loc[df.es_field_name == es_field_name_keyword]
|
||||||
if not series.empty and series.is_aggregatable.squeeze():
|
if not series.empty and series.is_aggregatable.squeeze():
|
||||||
row_as_dict[
|
row_as_dict["aggregatable_es_field_name"] = (
|
||||||
"aggregatable_es_field_name"
|
es_field_name_keyword
|
||||||
] = es_field_name_keyword
|
)
|
||||||
else:
|
else:
|
||||||
row_as_dict["aggregatable_es_field_name"] = None
|
row_as_dict["aggregatable_es_field_name"] = None
|
||||||
except KeyError:
|
except KeyError:
|
||||||
@ -712,8 +712,11 @@ class FieldMappings:
|
|||||||
capabilities, orient="index", columns=FieldMappings.column_labels
|
capabilities, orient="index", columns=FieldMappings.column_labels
|
||||||
)
|
)
|
||||||
|
|
||||||
self._mappings_capabilities = self._mappings_capabilities.append(
|
self._mappings_capabilities = pd.concat(
|
||||||
capability_matrix_row
|
[
|
||||||
|
self._mappings_capabilities,
|
||||||
|
capability_matrix_row,
|
||||||
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
def numeric_source_fields(self) -> List[str]:
|
def numeric_source_fields(self) -> List[str]:
|
||||||
|
@ -50,10 +50,7 @@ class Index:
|
|||||||
# index_field.setter
|
# index_field.setter
|
||||||
self._is_source_field = False
|
self._is_source_field = False
|
||||||
|
|
||||||
# The type:ignore is due to mypy not being smart enough
|
self.es_index_field = es_index_field
|
||||||
# to recognize the property.setter has a different type
|
|
||||||
# than the property.getter.
|
|
||||||
self.es_index_field = es_index_field # type: ignore
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def sort_field(self) -> str:
|
def sort_field(self) -> str:
|
||||||
|
@ -19,7 +19,7 @@ import base64
|
|||||||
import gzip
|
import gzip
|
||||||
import json
|
import json
|
||||||
from abc import ABC
|
from abc import ABC
|
||||||
from typing import Any, Dict, List, Optional, Sequence
|
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
||||||
|
|
||||||
|
|
||||||
def add_if_exists(d: Dict[str, Any], k: str, v: Any) -> None:
|
def add_if_exists(d: Dict[str, Any], k: str, v: Any) -> None:
|
||||||
@ -58,6 +58,9 @@ class ModelSerializer(ABC):
|
|||||||
"ascii"
|
"ascii"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def bounds(self) -> Tuple[float, float]:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
class TreeNode:
|
class TreeNode:
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -96,6 +99,7 @@ class TreeNode:
|
|||||||
add_if_exists(d, "split_feature", self._split_feature)
|
add_if_exists(d, "split_feature", self._split_feature)
|
||||||
add_if_exists(d, "threshold", self._threshold)
|
add_if_exists(d, "threshold", self._threshold)
|
||||||
add_if_exists(d, "number_samples", self._number_samples)
|
add_if_exists(d, "number_samples", self._number_samples)
|
||||||
|
add_if_exists(d, "default_left", self._default_left)
|
||||||
else:
|
else:
|
||||||
if len(self._leaf_value) == 1:
|
if len(self._leaf_value) == 1:
|
||||||
# Support Elasticsearch 7.6 which only
|
# Support Elasticsearch 7.6 which only
|
||||||
@ -128,6 +132,14 @@ class Tree(ModelSerializer):
|
|||||||
add_if_exists(d, "tree_structure", [t.to_dict() for t in self._tree_structure])
|
add_if_exists(d, "tree_structure", [t.to_dict() for t in self._tree_structure])
|
||||||
return {"tree": d}
|
return {"tree": d}
|
||||||
|
|
||||||
|
def bounds(self) -> Tuple[float, float]:
|
||||||
|
leaf_values = [
|
||||||
|
tree_node._leaf_value[0]
|
||||||
|
for tree_node in self._tree_structure
|
||||||
|
if tree_node._leaf_value is not None
|
||||||
|
]
|
||||||
|
return min(leaf_values), max(leaf_values)
|
||||||
|
|
||||||
|
|
||||||
class Ensemble(ModelSerializer):
|
class Ensemble(ModelSerializer):
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -157,3 +169,9 @@ class Ensemble(ModelSerializer):
|
|||||||
add_if_exists(d, "classification_weights", self._classification_weights)
|
add_if_exists(d, "classification_weights", self._classification_weights)
|
||||||
add_if_exists(d, "aggregate_output", self._output_aggregator)
|
add_if_exists(d, "aggregate_output", self._output_aggregator)
|
||||||
return {"ensemble": d}
|
return {"ensemble": d}
|
||||||
|
|
||||||
|
def bounds(self) -> Tuple[float, float]:
|
||||||
|
min_bound, max_bound = tuple(
|
||||||
|
map(sum, zip(*[model.bounds() for model in self._trained_models]))
|
||||||
|
)
|
||||||
|
return min_bound, max_bound
|
||||||
|
@ -1,16 +0,0 @@
|
|||||||
# Licensed to Elasticsearch B.V. under one or more contributor
|
|
||||||
# license agreements. See the NOTICE file distributed with
|
|
||||||
# this work for additional information regarding copyright
|
|
||||||
# ownership. Elasticsearch B.V. licenses this file to you under
|
|
||||||
# the Apache License, Version 2.0 (the "License"); you may
|
|
||||||
# not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing,
|
|
||||||
# software distributed under the License is distributed on an
|
|
||||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
||||||
# KIND, either express or implied. See the License for the
|
|
||||||
# specific language governing permissions and limitations
|
|
||||||
# under the License.
|
|
@ -1,222 +0,0 @@
|
|||||||
# Licensed to Elasticsearch B.V. under one or more contributor
|
|
||||||
# license agreements. See the NOTICE file distributed with
|
|
||||||
# this work for additional information regarding copyright
|
|
||||||
# ownership. Elasticsearch B.V. licenses this file to you under
|
|
||||||
# the Apache License, Version 2.0 (the "License"); you may
|
|
||||||
# not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing,
|
|
||||||
# software distributed under the License is distributed on an
|
|
||||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
||||||
# KIND, either express or implied. See the License for the
|
|
||||||
# specific language governing permissions and limitations
|
|
||||||
# under the License.
|
|
||||||
|
|
||||||
from typing import Any, Dict
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from .._optional import import_optional_dependency
|
|
||||||
|
|
||||||
import_optional_dependency("sklearn", on_version="warn")
|
|
||||||
|
|
||||||
import sklearn
|
|
||||||
from sklearn.preprocessing import FunctionTransformer
|
|
||||||
|
|
||||||
|
|
||||||
class Tree:
|
|
||||||
"""Wrapper to create sklearn Tree objects from Elastic ML tree
|
|
||||||
description in JSON format.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
json_tree: Dict[str, Any],
|
|
||||||
feature_names_map: Dict[str, int],
|
|
||||||
):
|
|
||||||
tree_leaf = -1
|
|
||||||
|
|
||||||
node_count = len(json_tree["tree_structure"])
|
|
||||||
children_left = np.ones((node_count,), dtype=int) * tree_leaf
|
|
||||||
children_right = np.ones((node_count,), dtype=int) * tree_leaf
|
|
||||||
feature = np.ones((node_count,), dtype=int) * -2
|
|
||||||
threshold = np.ones((node_count,), dtype=float) * -2
|
|
||||||
impurity = np.zeros((node_count,), dtype=float)
|
|
||||||
# value works only for regression and binary classification
|
|
||||||
value = np.zeros((node_count, 1, 1), dtype="<f8")
|
|
||||||
n_node_samples = np.zeros((node_count,), dtype=int)
|
|
||||||
|
|
||||||
# parse values from the JSON tree
|
|
||||||
feature_names = json_tree["feature_names"]
|
|
||||||
for json_node in json_tree["tree_structure"]:
|
|
||||||
node_id = json_node["node_index"]
|
|
||||||
if "number_samples" in json_node:
|
|
||||||
n_node_samples[node_id] = json_node["number_samples"]
|
|
||||||
else:
|
|
||||||
n_node_samples[node_id] = 0
|
|
||||||
|
|
||||||
if "leaf_value" not in json_node:
|
|
||||||
children_left[node_id] = json_node["left_child"]
|
|
||||||
children_right[node_id] = json_node["right_child"]
|
|
||||||
feature[node_id] = feature_names_map[
|
|
||||||
feature_names[json_node["split_feature"]]
|
|
||||||
]
|
|
||||||
threshold[node_id] = json_node["threshold"]
|
|
||||||
if "split_gain" in json_node:
|
|
||||||
impurity[node_id] = json_node["split_gain"]
|
|
||||||
else:
|
|
||||||
impurity[node_id] = -1
|
|
||||||
else:
|
|
||||||
value[node_id, 0, 0] = json_node["leaf_value"]
|
|
||||||
|
|
||||||
# iterate through tree to get max depth and expected values
|
|
||||||
weighted_n_node_samples = n_node_samples.copy()
|
|
||||||
self.max_depth = Tree._compute_expectations(
|
|
||||||
children_left=children_left,
|
|
||||||
children_right=children_right,
|
|
||||||
node_sample_weight=weighted_n_node_samples,
|
|
||||||
values=value,
|
|
||||||
node_index=0,
|
|
||||||
)
|
|
||||||
self.n_outputs = value.shape[-1]
|
|
||||||
|
|
||||||
# initialize the sklearn tree
|
|
||||||
self.tree = sklearn.tree._tree.Tree(
|
|
||||||
len(feature_names), np.array([1], dtype=int), 1
|
|
||||||
)
|
|
||||||
node_state = np.array(
|
|
||||||
[
|
|
||||||
(
|
|
||||||
children_left[i],
|
|
||||||
children_right[i],
|
|
||||||
feature[i],
|
|
||||||
threshold[i],
|
|
||||||
impurity[i],
|
|
||||||
n_node_samples[i],
|
|
||||||
weighted_n_node_samples[i],
|
|
||||||
True,
|
|
||||||
)
|
|
||||||
for i in range(node_count)
|
|
||||||
],
|
|
||||||
dtype={
|
|
||||||
"names": [
|
|
||||||
"left_child",
|
|
||||||
"right_child",
|
|
||||||
"feature",
|
|
||||||
"threshold",
|
|
||||||
"impurity",
|
|
||||||
"n_node_samples",
|
|
||||||
"weighted_n_node_samples",
|
|
||||||
"missing_go_to_left",
|
|
||||||
],
|
|
||||||
"formats": ["<i8", "<i8", "<i8", "<f8", "<f8", "<i8", "<f8", "u1"],
|
|
||||||
},
|
|
||||||
)
|
|
||||||
state = {
|
|
||||||
"max_depth": self.max_depth,
|
|
||||||
"node_count": node_count,
|
|
||||||
"nodes": node_state,
|
|
||||||
"values": value,
|
|
||||||
}
|
|
||||||
self.tree.__setstate__(state)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _compute_expectations(
|
|
||||||
children_left, children_right, node_sample_weight, values, node_index
|
|
||||||
) -> int:
|
|
||||||
if children_right[node_index] == -1:
|
|
||||||
return 0
|
|
||||||
|
|
||||||
left_index = children_left[node_index]
|
|
||||||
right_index = children_right[node_index]
|
|
||||||
depth_left = Tree._compute_expectations(
|
|
||||||
children_left, children_right, node_sample_weight, values, left_index
|
|
||||||
)
|
|
||||||
depth_right = Tree._compute_expectations(
|
|
||||||
children_left, children_right, node_sample_weight, values, right_index
|
|
||||||
)
|
|
||||||
left_weight = node_sample_weight[left_index]
|
|
||||||
right_weight = node_sample_weight[right_index]
|
|
||||||
|
|
||||||
v = (
|
|
||||||
(
|
|
||||||
left_weight * values[left_index, :]
|
|
||||||
+ right_weight * values[right_index, :]
|
|
||||||
)
|
|
||||||
/ (left_weight + right_weight)
|
|
||||||
if left_weight + right_weight > 0
|
|
||||||
else 0
|
|
||||||
)
|
|
||||||
values[node_index, :] = v
|
|
||||||
return max(depth_left, depth_right) + 1
|
|
||||||
|
|
||||||
|
|
||||||
class TargetMeanEncoder(FunctionTransformer):
|
|
||||||
"""FunctionTransformer implementation of the target mean encoder, which is
|
|
||||||
deserialized from the Elastic ML preprocessor description in JSON formats.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, preprocessor: Dict[str, Any]):
|
|
||||||
self.preprocessor = preprocessor
|
|
||||||
target_map = self.preprocessor["target_mean_encoding"]["target_map"]
|
|
||||||
feature_name_out = self.preprocessor["target_mean_encoding"]["feature_name"]
|
|
||||||
self.field_name_in = self.preprocessor["target_mean_encoding"]["field"]
|
|
||||||
fallback_value = self.preprocessor["target_mean_encoding"]["default_value"]
|
|
||||||
|
|
||||||
def func(column):
|
|
||||||
return np.array(
|
|
||||||
[
|
|
||||||
target_map[str(category)]
|
|
||||||
if category in target_map
|
|
||||||
else fallback_value
|
|
||||||
for category in column
|
|
||||||
]
|
|
||||||
).reshape(-1, 1)
|
|
||||||
|
|
||||||
def feature_names_out(ft, carr):
|
|
||||||
return [feature_name_out if c == self.field_name_in else c for c in carr]
|
|
||||||
|
|
||||||
super().__init__(func=func, feature_names_out=feature_names_out)
|
|
||||||
|
|
||||||
|
|
||||||
class FrequencyEncoder(FunctionTransformer):
|
|
||||||
"""FunctionTransformer implementation of the frequency encoder, which is
|
|
||||||
deserialized from the Elastic ML preprocessor description in JSON format.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, preprocessor: Dict[str, Any]):
|
|
||||||
self.preprocessor = preprocessor
|
|
||||||
frequency_map = self.preprocessor["frequency_encoding"]["frequency_map"]
|
|
||||||
feature_name_out = self.preprocessor["frequency_encoding"]["feature_name"]
|
|
||||||
self.field_name_in = self.preprocessor["frequency_encoding"]["field"]
|
|
||||||
fallback_value = 0.0
|
|
||||||
|
|
||||||
def func(column):
|
|
||||||
return np.array(
|
|
||||||
[
|
|
||||||
frequency_map[str(category)]
|
|
||||||
if category in frequency_map
|
|
||||||
else fallback_value
|
|
||||||
for category in column
|
|
||||||
]
|
|
||||||
).reshape(-1, 1)
|
|
||||||
|
|
||||||
def feature_names_out(ft, carr):
|
|
||||||
return [feature_name_out if c == self.field_name_in else c for c in carr]
|
|
||||||
|
|
||||||
super().__init__(func=func, feature_names_out=feature_names_out)
|
|
||||||
|
|
||||||
|
|
||||||
class OneHotEncoder(sklearn.preprocessing.OneHotEncoder):
|
|
||||||
"""Wrapper for sklearn one-hot encoder, which is deserialized from the
|
|
||||||
Elastic ML preprocessor description in JSON format.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, preprocessor: Dict[str, Any]):
|
|
||||||
self.preprocessor = preprocessor
|
|
||||||
self.field_name_in = self.preprocessor["one_hot_encoding"]["field"]
|
|
||||||
self.cats = [list(self.preprocessor["one_hot_encoding"]["hot_map"].keys())]
|
|
||||||
super().__init__(categories=self.cats, handle_unknown="ignore")
|
|
@ -1,46 +0,0 @@
|
|||||||
# Licensed to Elasticsearch B.V. under one or more contributor
|
|
||||||
# license agreements. See the NOTICE file distributed with
|
|
||||||
# this work for additional information regarding copyright
|
|
||||||
# ownership. Elasticsearch B.V. licenses this file to you under
|
|
||||||
# the Apache License, Version 2.0 (the "License"); you may
|
|
||||||
# not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing,
|
|
||||||
# software distributed under the License is distributed on an
|
|
||||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
||||||
# KIND, either express or implied. See the License for the
|
|
||||||
# specific language governing permissions and limitations
|
|
||||||
# under the License.
|
|
||||||
|
|
||||||
import eland
|
|
||||||
|
|
||||||
|
|
||||||
class ModelDefinitionKeyError(Exception):
|
|
||||||
"""
|
|
||||||
This exception is raised when a key is not found in the model definition.
|
|
||||||
|
|
||||||
Attributes:
|
|
||||||
missed_key (str): The key that was not found in the model definition.
|
|
||||||
available_keys (List[str]): The list of keys that are available in the model definition.
|
|
||||||
|
|
||||||
Examples:
|
|
||||||
model_definition = {"key1": "value1", "key2": "value2"}
|
|
||||||
try:
|
|
||||||
model_definition["key3"]
|
|
||||||
except KeyError as ex:
|
|
||||||
raise ModelDefinitionKeyError(ex) from ex
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, ex: KeyError):
|
|
||||||
self.missed_key = ex.args[0]
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return (
|
|
||||||
f'Key "{self.missed_key}" is not available. '
|
|
||||||
+ "The model definition may have changed. "
|
|
||||||
+ "Make sure you are using an Elasticsearch version compatible "
|
|
||||||
+ f"with Eland {eland.__version__}."
|
|
||||||
)
|
|
@ -1,472 +0,0 @@
|
|||||||
# Licensed to Elasticsearch B.V. under one or more contributor
|
|
||||||
# license agreements. See the NOTICE file distributed with
|
|
||||||
# this work for additional information regarding copyright
|
|
||||||
# ownership. Elasticsearch B.V. licenses this file to you under
|
|
||||||
# the Apache License, Version 2.0 (the "License"); you may
|
|
||||||
# not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing,
|
|
||||||
# software distributed under the License is distributed on an
|
|
||||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
||||||
# KIND, either express or implied. See the License for the
|
|
||||||
# specific language governing permissions and limitations
|
|
||||||
# under the License.
|
|
||||||
|
|
||||||
from abc import ABC
|
|
||||||
from typing import Any, List, Literal, Mapping, Optional, Set, Tuple, Union
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from elasticsearch import Elasticsearch
|
|
||||||
from numpy.typing import ArrayLike
|
|
||||||
|
|
||||||
from .._optional import import_optional_dependency
|
|
||||||
|
|
||||||
import_optional_dependency("sklearn", on_version="warn")
|
|
||||||
|
|
||||||
from sklearn.dummy import DummyClassifier, DummyRegressor
|
|
||||||
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
|
|
||||||
from sklearn.ensemble._gb_losses import (
|
|
||||||
BinomialDeviance,
|
|
||||||
HuberLossFunction,
|
|
||||||
LeastSquaresError,
|
|
||||||
)
|
|
||||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
|
||||||
from sklearn.utils.validation import check_array
|
|
||||||
|
|
||||||
from eland.common import ensure_es_client
|
|
||||||
from eland.ml.common import TYPE_CLASSIFICATION, TYPE_REGRESSION
|
|
||||||
|
|
||||||
from ._sklearn_deserializers import Tree
|
|
||||||
from .common import ModelDefinitionKeyError
|
|
||||||
|
|
||||||
|
|
||||||
class ESGradientBoostingModel(ABC):
    """
    Abstract class for converting Elastic ML model into sklearn Pipeline.
    """

    def __init__(
        self,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
    ) -> None:
        """
        Parameters
        ----------
        es_client : Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance
        model_id : str
            The unique identifier of the trained inference model in Elasticsearch.

        Raises
        ------
        RuntimeError
            On failure to retrieve trained model information to the specified model ID.
        ValueError
            The model is expected to be trained in Elastic Stack. Models initially imported
            from xgboost, lgbm, or sklearn are not supported.
        """
        self.es_client: Elasticsearch = ensure_es_client(es_client)
        self.model_id = model_id

        self._trained_model_result = self.es_client.ml.get_trained_models(
            model_id=self.model_id,
            decompress_definition=True,
            include=["hyperparameters", "definition"],
        )

        if (
            "trained_model_configs" not in self._trained_model_result
            or not self._trained_model_result["trained_model_configs"]
        ):
            raise RuntimeError(
                f"Failed to retrieve the trained model for model ID {self.model_id!r}"
            )

        model_config = self._trained_model_result["trained_model_configs"][0]
        if "metadata" not in model_config:
            # This base class serves classifiers and regressors alike, so the
            # message describes the actual failure (missing metadata).
            raise ValueError(
                "Error initializing sklearn model. Trained model metadata is missing. "
                + "Note: only export of models trained in the Elastic Stack is supported."
            )

        ensemble = self._definition["trained_model"]["ensemble"]
        (
            self.feature_names_in_,
            self.input_field_names,
        ) = ESGradientBoostingModel._get_feature_names_in_(
            self.preprocessors,
            ensemble["feature_names"],
            model_config["input"]["field_names"],
        )

        feature_names_map = {name: i for i, name in enumerate(self.feature_names_in_)}

        trained_models = ensemble["trained_models"]
        self._trees = [
            Tree(trained_model["tree"], feature_names_map)
            for trained_model in trained_models
        ]

        # 0's tree is the constant estimator
        self.n_estimators = len(trained_models) - 1

    def _initialize_estimators(self, decision_tree_type) -> None:
        """
        Build ``estimators_`` with one ``decision_tree_type`` instance per tree,
        skipping tree 0 (the constant estimator).
        """
        self.estimators_ = np.ndarray(
            (len(self._trees) - 1, 1), dtype=decision_tree_type
        )
        self.n_estimators_ = self.estimators_.shape[0]

        for i in range(self.n_estimators_):
            estimator = decision_tree_type()
            # Offset by one: tree 0 is the constant estimator.
            estimator.tree_ = self._trees[i + 1].tree
            estimator.n_features_in_ = self.n_features_in_
            estimator.max_depth = self._max_depth
            estimator.max_features_ = self.max_features_
            self.estimators_[i, 0] = estimator

    def _extract_common_parameters(self) -> None:
        """Derive ``n_features_in_`` and ``max_features_`` from the feature names."""
        self.n_features_in_ = len(self.feature_names_in_)
        self.max_features_ = self.n_features_in_

    @property
    def _max_depth(self) -> int:
        """Largest ``max_depth`` over all deserialized trees."""
        return max(tree.max_depth for tree in self._trees)

    @property
    def _n_outputs(self) -> int:
        """``n_outputs`` of tree 0 (the constant estimator)."""
        return self._trees[0].n_outputs

    @property
    def _definition(self) -> Mapping[Union[str, int], Any]:
        """The ``definition`` section of the first retrieved trained model config."""
        return self._trained_model_result["trained_model_configs"][0]["definition"]

    @staticmethod
    def _get_feature_names_in_(
        preprocessors, feature_names, field_names
    ) -> Tuple[List[str], Set[str]]:
        """
        Determine which input fields feed the features used by the model.

        Parameters
        ----------
        preprocessors
            Preprocessor definitions; entries keyed by ``target_mean_encoding``,
            ``frequency_encoding`` or ``one_hot_encoding`` are inspected, any
            other entry is ignored.
        feature_names
            Feature names used by the ensemble.
        field_names
            Input field names of the trained model.

        Returns
        -------
        Tuple[List[str], Set[str]]
            ``feature_names`` unchanged, and the set of input fields that are
            either used directly as a feature or encoded into one.
        """
        input_field_names = set()

        def add_input_field_name(encoding: Mapping[str, Any], feature_name: str) -> None:
            # The encoding is passed explicitly instead of being read from the
            # enclosing loop variable through the closure.
            if feature_name in feature_names:
                input_field_names.add(encoding["field"])

        for preprocessor in preprocessors:
            if "target_mean_encoding" in preprocessor:
                encoding = preprocessor["target_mean_encoding"]
                add_input_field_name(encoding, encoding["feature_name"])
            elif "frequency_encoding" in preprocessor:
                encoding = preprocessor["frequency_encoding"]
                add_input_field_name(encoding, encoding["feature_name"])
            elif "one_hot_encoding" in preprocessor:
                encoding = preprocessor["one_hot_encoding"]
                for feature_name in encoding["hot_map"].values():
                    add_input_field_name(encoding, feature_name)

        # Fields that are passed to the model without any encoding.
        input_field_names.update(
            field_name for field_name in field_names if field_name in feature_names
        )

        return feature_names, input_field_names

    @property
    def preprocessors(self) -> List[Any]:
        """
        Returns the list of preprocessor JSON definitions.

        Returns
        -------
        List[Any]
            List of preprocessors definitions or [].
        """
        if "preprocessors" in self._definition:
            return self._definition["preprocessors"]
        return []

    def fit(self, X, y, sample_weight=None, monitor=None) -> None:
        """
        Override of the sklearn fit() method. It does nothing since Elastic ML models are
        trained in the Elastic Stack or imported.
        """
        # Do nothing, model is fitted using Elasticsearch API
        pass
|
|
||||||
|
|
||||||
|
|
||||||
class ESGradientBoostingClassifier(ESGradientBoostingModel, GradientBoostingClassifier):
    """
    Elastic ML model wrapper compatible with sklearn GradientBoostingClassifier.
    """

    def __init__(
        self,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
    ) -> None:
        """
        Parameters
        ----------
        es_client : Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance
        model_id : str
            The unique identifier of the trained inference model in Elasticsearch.

        Raises
        ------
        NotImplementedError
            Multi-class classification is not supported at the moment.
        ValueError
            The classifier should be defined for at least 2 classes.
        ModelDefinitionKeyError
            If required data cannot be extracted from the model definition due to a schema change.
        """

        try:
            ESGradientBoostingModel.__init__(self, es_client, model_id)
            self._extract_common_parameters()
            GradientBoostingClassifier.__init__(
                self,
                learning_rate=1.0,
                n_estimators=self.n_estimators,
                max_depth=self._max_depth,
            )

            ensemble = self._definition["trained_model"]["ensemble"]
            if "classification_labels" in ensemble:
                self.classes_ = np.array(ensemble["classification_labels"])
            else:
                self.classes_ = None

            self.n_outputs = self._n_outputs
            if self.classes_ is not None:
                self.n_classes_ = len(self.classes_)
            elif self.n_outputs <= 2:
                # Without explicit labels, up to two outputs is treated as binary.
                self.n_classes_ = 2
            else:
                self.n_classes_ = self.n_outputs

            if self.n_classes_ == 2:
                self._loss = BinomialDeviance(self.n_classes_)
            elif self.n_classes_ > 2:
                raise NotImplementedError("Only binary classification is implemented.")
            else:
                raise ValueError(f"At least 2 classes required. got {self.n_classes_}.")

            self.init_ = self._initialize_init_()
            self._initialize_estimators(DecisionTreeClassifier)
        except KeyError as ex:
            raise ModelDefinitionKeyError(ex) from ex

    @property
    def analysis_type(self) -> Literal["classification"]:
        return TYPE_CLASSIFICATION

    def _initialize_init_(self) -> DummyClassifier:
        """
        Build the prior ``DummyClassifier`` from the value stored in tree 0,
        which is interpreted as the log-odds of the second class.

        Raises
        ------
        ValueError
            If the stored log-odds value is NaN.
        NotImplementedError
            If the model is not a binary classifier.
        """
        estimator = DummyClassifier(strategy="prior")

        estimator.n_classes_ = self.n_classes_
        estimator.n_outputs_ = self.n_outputs
        estimator.classes_ = np.arange(self.n_classes_)
        estimator._strategy = estimator.strategy

        if self.n_classes_ == 2:
            log_odds = self._trees[0].tree.value.flatten()[0]
            if np.isnan(log_odds):
                raise ValueError(
                    "Error initializing sklearn classifier. Incorrect prior class probability. "
                    + "Note: only export of models trained in the Elastic Stack is supported."
                )
            # Logistic function turns the log-odds into a probability.
            class_prior = 1 / (1 + np.exp(-log_odds))
            estimator.class_prior_ = np.array([1 - class_prior, class_prior])
        else:
            raise NotImplementedError("Only binary classification is implemented.")

        return estimator

    def _select_model_columns(
        self, X, feature_names_in: Optional[Union["ArrayLike", List[str]]]
    ):
        """
        Return X restricted to the model's features, in the model's order.

        When ``feature_names_in`` is None, X is returned unchanged. Otherwise
        the names must match the columns of X one to one.

        Raises
        ------
        ValueError
            If the number of columns in X differs from ``len(feature_names_in)``,
            or a feature used by the model is missing from ``feature_names_in``.
        """
        if feature_names_in is None:
            return X
        if X.shape[1] != len(feature_names_in):
            raise ValueError(
                f"Dimension mismatch: X with {X.shape[1]} columns has to be the same size as feature_names_in with {len(feature_names_in)}."
            )
        # Normalise to a plain list so ``.index`` is available for any
        # sequence of names, not only lists and numpy arrays.
        if isinstance(feature_names_in, np.ndarray):
            feature_names_in = feature_names_in.tolist()
        else:
            feature_names_in = list(feature_names_in)
        # select columns used by the model in the correct order
        return X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]

    def predict_proba(
        self, X, feature_names_in: Optional[Union["ArrayLike", List[str]]] = None
    ) -> "ArrayLike":
        """Predict class probabilities for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        feature_names_in : {array of string, list of string} of length n_features.
            Feature names of the corresponding columns in X. Important, since the column list
            can be extended by ColumnTransformer through the pipeline. By default None.

        Returns
        -------
        ArrayLike of shape (n_samples, n_classes)
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
        """
        X = check_array(self._select_model_columns(X, feature_names_in))
        return GradientBoostingClassifier.predict_proba(self, X)

    def predict(
        self,
        X: "ArrayLike",
        feature_names_in: Optional[Union["ArrayLike", List[str]]] = None,
    ) -> "ArrayLike":
        """Predict class for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        feature_names_in : {array of string, list of string} of length n_features.
            Feature names of the corresponding columns in X. Important, since the column list
            can be extended by ColumnTransformer through the pipeline. By default None.

        Returns
        -------
        ArrayLike of shape (n_samples,)
            The predicted values.
        """
        X = check_array(self._select_model_columns(X, feature_names_in))
        return GradientBoostingClassifier.predict(self, X)
|
|
||||||
|
|
||||||
|
|
||||||
class ESGradientBoostingRegressor(ESGradientBoostingModel, GradientBoostingRegressor):
    """
    Elastic ML model wrapper compatible with sklearn GradientBoostingRegressor.
    """

    def __init__(
        self,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
    ) -> None:
        """
        Parameters
        ----------
        es_client : Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance
        model_id : str
            The unique identifier of the trained inference model in Elasticsearch.

        Raises
        ------
        NotImplementedError
            Only MSE, MSLE, and Huber loss functions are supported.
        ModelDefinitionKeyError
            If required data cannot be extracted from the model definition due to a schema change.
        """
        try:
            ESGradientBoostingModel.__init__(self, es_client, model_id)
            self._extract_common_parameters()
            GradientBoostingRegressor.__init__(
                self,
                learning_rate=1.0,
                n_estimators=self.n_estimators,
                max_depth=self._max_depth,
            )

            self.n_outputs = 1
            analysis_config = self._trained_model_result["trained_model_configs"][0][
                "metadata"
            ]["analytics_config"]["analysis"][self.analysis_type]
            loss_function = analysis_config["loss_function"]
            if loss_function == "mse" or loss_function == "msle":
                self.criterion = "squared_error"
                self._loss = LeastSquaresError()
            elif loss_function == "huber":
                # Keep the loss name and its parameter in separate variables.
                loss_parameter = analysis_config["loss_function_parameter"]
                self.criterion = "huber"
                self._loss = HuberLossFunction(loss_parameter)
            else:
                raise NotImplementedError(
                    "Only MSE, MSLE and Huber loss functions are supported."
                )

            self.init_ = self._initialize_init_()
            self._initialize_estimators(DecisionTreeRegressor)
        except KeyError as ex:
            raise ModelDefinitionKeyError(ex) from ex

    @property
    def analysis_type(self) -> Literal["regression"]:
        return TYPE_REGRESSION

    def _initialize_init_(self) -> DummyRegressor:
        """Build the constant ``DummyRegressor`` from the value stored in tree 0."""
        constant = self._trees[0].tree.value[0]
        estimator = DummyRegressor(
            strategy="constant",
            constant=constant,
        )
        estimator.constant_ = np.array([constant])
        estimator.n_outputs_ = 1
        return estimator

    def predict(
        self,
        X: "ArrayLike",
        feature_names_in: Optional[Union["ArrayLike", List[str]]] = None,
    ) -> "ArrayLike":
        """Predict targets for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        feature_names_in : {array of string, list of string} of length n_features.
            Feature names of the corresponding columns in X. Important, since the column list
            can be extended by ColumnTransformer through the pipeline. By default None.

        Returns
        -------
        ArrayLike of shape (n_samples,)
            The predicted values.

        Raises
        ------
        ValueError
            If the number of columns in X differs from ``len(feature_names_in)``,
            or a feature used by the model is missing from ``feature_names_in``.
        """
        if feature_names_in is not None:
            if X.shape[1] != len(feature_names_in):
                raise ValueError(
                    f"Dimension mismatch: X with {X.shape[1]} columns has to be the same size as feature_names_in with {len(feature_names_in)}."
                )
            # The type check must look at ``feature_names_in``, not ``X``:
            # only the names need converting so that ``.index`` is available.
            if isinstance(feature_names_in, np.ndarray):
                feature_names_in = feature_names_in.tolist()
            else:
                feature_names_in = list(feature_names_in)
            # select columns used by the model in the correct order
            X = X[:, [feature_names_in.index(fn) for fn in self.feature_names_in_]]

        X = check_array(X)
        return GradientBoostingRegressor.predict(self, X)
|
|
@ -20,7 +20,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Uni
|
|||||||
import elasticsearch
|
import elasticsearch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from eland.common import ensure_es_client, es_version
|
from eland.common import ensure_es_client, es_version, is_serverless_es
|
||||||
from eland.utils import deprecated_api
|
from eland.utils import deprecated_api
|
||||||
|
|
||||||
from .common import TYPE_CLASSIFICATION, TYPE_LEARNING_TO_RANK, TYPE_REGRESSION
|
from .common import TYPE_CLASSIFICATION, TYPE_LEARNING_TO_RANK, TYPE_REGRESSION
|
||||||
@ -38,7 +38,6 @@ if TYPE_CHECKING:
|
|||||||
RandomForestClassifier,
|
RandomForestClassifier,
|
||||||
RandomForestRegressor,
|
RandomForestRegressor,
|
||||||
)
|
)
|
||||||
from sklearn.pipeline import Pipeline # type: ignore # noqa: F401
|
|
||||||
from sklearn.tree import ( # type: ignore # noqa: F401
|
from sklearn.tree import ( # type: ignore # noqa: F401
|
||||||
DecisionTreeClassifier,
|
DecisionTreeClassifier,
|
||||||
DecisionTreeRegressor,
|
DecisionTreeRegressor,
|
||||||
@ -504,7 +503,9 @@ class MLModel:
|
|||||||
)
|
)
|
||||||
serializer = transformer.transform()
|
serializer = transformer.transform()
|
||||||
model_type = transformer.model_type
|
model_type = transformer.model_type
|
||||||
default_inference_config: Mapping[str, Mapping[str, Any]] = {model_type: {}}
|
|
||||||
|
if inference_config is None:
|
||||||
|
inference_config = {model_type: {}}
|
||||||
|
|
||||||
if es_if_exists is None:
|
if es_if_exists is None:
|
||||||
es_if_exists = "fail"
|
es_if_exists = "fail"
|
||||||
@ -523,18 +524,25 @@ class MLModel:
|
|||||||
elif es_if_exists == "replace":
|
elif es_if_exists == "replace":
|
||||||
ml_model.delete_model()
|
ml_model.delete_model()
|
||||||
|
|
||||||
|
trained_model_input = None
|
||||||
|
is_ltr = next(iter(inference_config)) is TYPE_LEARNING_TO_RANK
|
||||||
|
if not is_ltr or (
|
||||||
|
es_version(es_client) < (8, 15) and not is_serverless_es(es_client)
|
||||||
|
):
|
||||||
|
trained_model_input = {"field_names": feature_names}
|
||||||
|
|
||||||
if es_compress_model_definition:
|
if es_compress_model_definition:
|
||||||
ml_model._client.ml.put_trained_model(
|
ml_model._client.ml.put_trained_model(
|
||||||
model_id=model_id,
|
model_id=model_id,
|
||||||
input={"field_names": feature_names},
|
inference_config=inference_config,
|
||||||
inference_config=inference_config or default_inference_config,
|
input=trained_model_input,
|
||||||
compressed_definition=serializer.serialize_and_compress_model(),
|
compressed_definition=serializer.serialize_and_compress_model(),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
ml_model._client.ml.put_trained_model(
|
ml_model._client.ml.put_trained_model(
|
||||||
model_id=model_id,
|
model_id=model_id,
|
||||||
input={"field_names": feature_names},
|
inference_config=inference_config,
|
||||||
inference_config=inference_config or default_inference_config,
|
input=trained_model_input,
|
||||||
definition=serializer.serialize_model(),
|
definition=serializer.serialize_model(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -563,83 +571,6 @@ class MLModel:
|
|||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def export_model(self) -> "Pipeline":
    """Export Elastic ML model as sklearn Pipeline.

    The pipeline has two steps: a ``ColumnTransformer`` named
    ``"preprocessor"`` built from the model's categorical encodings, and the
    gradient boosting wrapper named ``"es_model"``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline of the preprocessing step followed by the model.

    Raises
    ------
    AssertionError
        If preprocessors JSON definition has unexpected schema.
    ValueError
        The model is expected to be trained in Elastic Stack. Models initially imported
        from xgboost, lgbm, or sklearn are not supported.
    ValueError
        If unexpected categorical encoding is found in the list of preprocessors.
    NotImplementedError
        Only regression and binary classification models are supported currently.
    """
    from sklearn.compose import ColumnTransformer  # type: ignore # noqa: F401
    from sklearn.pipeline import Pipeline

    from .exporters._sklearn_deserializers import (
        FrequencyEncoder,
        OneHotEncoder,
        TargetMeanEncoder,
    )
    from .exporters.es_gb_models import (
        ESGradientBoostingClassifier,
        ESGradientBoostingRegressor,
    )

    if self.model_type == TYPE_CLASSIFICATION:
        wrapper_cls = ESGradientBoostingClassifier
    elif self.model_type == TYPE_REGRESSION:
        wrapper_cls = ESGradientBoostingRegressor
    else:
        raise NotImplementedError(
            "Only regression and binary classification models are supported currently."
        )
    model = wrapper_cls(es_client=self._client, model_id=self._model_id)

    # Encoding type -> (encoder class, whether the column selector is a list).
    encoders = {
        "frequency_encoding": (FrequencyEncoder, False),
        "target_mean_encoding": (TargetMeanEncoder, False),
        "one_hot_encoding": (OneHotEncoder, True),
    }

    transformers = []
    for definition in model.preprocessors:
        assert (
            len(definition) == 1
        ), f"Unexpected preprocessor data structure: {definition}. One-key mapping expected."
        encoding_type = next(iter(definition))
        field = definition[encoding_type]["field"]
        if encoding_type not in encoders:
            raise ValueError(
                f"Unexpected categorical encoding type {encoding_type} found. "
                + "Expected encodings: frequency_encoding, target_mean_encoding, one_hot_encoding."
            )
        encoder_cls, columns_as_list = encoders[encoding_type]
        transform = encoder_cls(definition)
        columns = [field] if columns_as_list else field
        transformers.append((f"{field}_{encoding_type}", transform, columns))

    preprocessor = ColumnTransformer(
        transformers=transformers,
        remainder="passthrough",
        verbose_feature_names_out=False,
    )

    return Pipeline(steps=[("preprocessor", preprocessor), ("es_model", model)])
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def _trained_model_config(self) -> Dict[str, Any]:
|
def _trained_model_config(self) -> Dict[str, Any]:
|
||||||
"""Lazily loads an ML models 'trained_model_config' information"""
|
"""Lazily loads an ML models 'trained_model_config' information"""
|
||||||
|
@ -31,7 +31,10 @@ from eland.ml.pytorch.nlp_ml_model import (
|
|||||||
ZeroShotClassificationInferenceOptions,
|
ZeroShotClassificationInferenceOptions,
|
||||||
)
|
)
|
||||||
from eland.ml.pytorch.traceable_model import TraceableModel # noqa: F401
|
from eland.ml.pytorch.traceable_model import TraceableModel # noqa: F401
|
||||||
from eland.ml.pytorch.transformers import task_type_from_model_config
|
from eland.ml.pytorch.transformers import (
|
||||||
|
UnknownModelInputSizeError,
|
||||||
|
task_type_from_model_config,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"PyTorchModel",
|
"PyTorchModel",
|
||||||
@ -49,4 +52,5 @@ __all__ = [
|
|||||||
"TextSimilarityInferenceOptions",
|
"TextSimilarityInferenceOptions",
|
||||||
"ZeroShotClassificationInferenceOptions",
|
"ZeroShotClassificationInferenceOptions",
|
||||||
"task_type_from_model_config",
|
"task_type_from_model_config",
|
||||||
|
"UnknownModelInputSizeError",
|
||||||
]
|
]
|
||||||
|
@ -126,6 +126,7 @@ class PyTorchModel:
|
|||||||
def infer(
|
def infer(
|
||||||
self,
|
self,
|
||||||
docs: List[Mapping[str, str]],
|
docs: List[Mapping[str, str]],
|
||||||
|
inference_config: Optional[Mapping[str, Any]] = None,
|
||||||
timeout: str = DEFAULT_TIMEOUT,
|
timeout: str = DEFAULT_TIMEOUT,
|
||||||
) -> Any:
|
) -> Any:
|
||||||
if docs is None:
|
if docs is None:
|
||||||
@ -133,6 +134,8 @@ class PyTorchModel:
|
|||||||
|
|
||||||
__body: Dict[str, Any] = {}
|
__body: Dict[str, Any] = {}
|
||||||
__body["docs"] = docs
|
__body["docs"] = docs
|
||||||
|
if inference_config is not None:
|
||||||
|
__body["inference_config"] = inference_config
|
||||||
|
|
||||||
__path = f"/_ml/trained_models/{_quote(self.model_id)}/_infer"
|
__path = f"/_ml/trained_models/{_quote(self.model_id)}/_infer"
|
||||||
__query: Dict[str, Any] = {}
|
__query: Dict[str, Any] = {}
|
||||||
|
@ -86,6 +86,27 @@ class NlpXLMRobertaTokenizationConfig(NlpTokenizationConfig):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class NlpDebertaV2TokenizationConfig(NlpTokenizationConfig):
    # Tokenization configuration whose ``configuration_type`` is "deberta_v2".
    def __init__(
        self,
        *,
        do_lower_case: t.Optional[bool] = None,
        with_special_tokens: t.Optional[bool] = None,
        max_sequence_length: t.Optional[int] = None,
        truncate: t.Optional[
            t.Union["t.Literal['first', 'none', 'second']", str]
        ] = None,
        span: t.Optional[int] = None,
    ):
        # NOTE(review): ``do_lower_case`` is accepted but is not forwarded to
        # the base class below and is not stored here - confirm this is intended.
        super().__init__(
            configuration_type="deberta_v2",
            with_special_tokens=with_special_tokens,
            max_sequence_length=max_sequence_length,
            truncate=truncate,
            span=span,
        )
|
||||||
|
|
||||||
|
|
||||||
class NlpBertTokenizationConfig(NlpTokenizationConfig):
|
class NlpBertTokenizationConfig(NlpTokenizationConfig):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -50,12 +50,10 @@ class TraceableModel(ABC):
|
|||||||
return self._trace()
|
return self._trace()
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def sample_output(self) -> torch.Tensor:
|
def sample_output(self) -> torch.Tensor: ...
|
||||||
...
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def _trace(self) -> TracedModelTypes:
|
def _trace(self) -> TracedModelTypes: ...
|
||||||
...
|
|
||||||
|
|
||||||
def classification_labels(self) -> Optional[List[str]]:
|
def classification_labels(self) -> Optional[List[str]]:
|
||||||
return None
|
return None
|
||||||
|
@ -25,17 +25,14 @@ import os.path
|
|||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
from typing import Dict, List, Optional, Set, Tuple, Union
|
||||||
|
|
||||||
import torch # type: ignore
|
import torch # type: ignore
|
||||||
import transformers # type: ignore
|
import transformers # type: ignore
|
||||||
from sentence_transformers import SentenceTransformer # type: ignore
|
from torch import Tensor
|
||||||
from torch import Tensor, nn
|
|
||||||
from torch.profiler import profile # type: ignore
|
from torch.profiler import profile # type: ignore
|
||||||
from transformers import (
|
from transformers import (
|
||||||
AutoConfig,
|
BertTokenizer,
|
||||||
AutoModel,
|
|
||||||
AutoModelForQuestionAnswering,
|
|
||||||
PretrainedConfig,
|
PretrainedConfig,
|
||||||
PreTrainedModel,
|
PreTrainedModel,
|
||||||
PreTrainedTokenizer,
|
PreTrainedTokenizer,
|
||||||
@ -47,6 +44,7 @@ from eland.ml.pytorch.nlp_ml_model import (
|
|||||||
NerInferenceOptions,
|
NerInferenceOptions,
|
||||||
NlpBertJapaneseTokenizationConfig,
|
NlpBertJapaneseTokenizationConfig,
|
||||||
NlpBertTokenizationConfig,
|
NlpBertTokenizationConfig,
|
||||||
|
NlpDebertaV2TokenizationConfig,
|
||||||
NlpMPNetTokenizationConfig,
|
NlpMPNetTokenizationConfig,
|
||||||
NlpRobertaTokenizationConfig,
|
NlpRobertaTokenizationConfig,
|
||||||
NlpTokenizationConfig,
|
NlpTokenizationConfig,
|
||||||
@ -63,8 +61,13 @@ from eland.ml.pytorch.nlp_ml_model import (
|
|||||||
ZeroShotClassificationInferenceOptions,
|
ZeroShotClassificationInferenceOptions,
|
||||||
)
|
)
|
||||||
from eland.ml.pytorch.traceable_model import TraceableModel
|
from eland.ml.pytorch.traceable_model import TraceableModel
|
||||||
|
from eland.ml.pytorch.wrappers import (
|
||||||
|
_DistilBertWrapper,
|
||||||
|
_DPREncoderWrapper,
|
||||||
|
_QuestionAnsweringWrapperModule,
|
||||||
|
_SentenceTransformerWrapperModule,
|
||||||
|
)
|
||||||
|
|
||||||
DEFAULT_OUTPUT_KEY = "sentence_embedding"
|
|
||||||
SUPPORTED_TASK_TYPES = {
|
SUPPORTED_TASK_TYPES = {
|
||||||
"fill_mask",
|
"fill_mask",
|
||||||
"ner",
|
"ner",
|
||||||
@ -115,6 +118,7 @@ SUPPORTED_TOKENIZERS = (
|
|||||||
transformers.BartTokenizer,
|
transformers.BartTokenizer,
|
||||||
transformers.SqueezeBertTokenizer,
|
transformers.SqueezeBertTokenizer,
|
||||||
transformers.XLMRobertaTokenizer,
|
transformers.XLMRobertaTokenizer,
|
||||||
|
transformers.DebertaV2Tokenizer,
|
||||||
)
|
)
|
||||||
SUPPORTED_TOKENIZERS_NAMES = ", ".join(sorted([str(x) for x in SUPPORTED_TOKENIZERS]))
|
SUPPORTED_TOKENIZERS_NAMES = ", ".join(sorted([str(x) for x in SUPPORTED_TOKENIZERS]))
|
||||||
|
|
||||||
@ -130,6 +134,10 @@ class TaskTypeError(Exception):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class UnknownModelInputSizeError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def task_type_from_model_config(model_config: PretrainedConfig) -> Optional[str]:
|
def task_type_from_model_config(model_config: PretrainedConfig) -> Optional[str]:
|
||||||
if model_config.architectures is None:
|
if model_config.architectures is None:
|
||||||
if model_config.name_or_path.startswith("sentence-transformers/"):
|
if model_config.name_or_path.startswith("sentence-transformers/"):
|
||||||
@ -165,283 +173,6 @@ def task_type_from_model_config(model_config: PretrainedConfig) -> Optional[str]
|
|||||||
return potential_task_types.pop()
|
return potential_task_types.pop()
|
||||||
|
|
||||||
|
|
||||||
class _QuestionAnsweringWrapperModule(nn.Module): # type: ignore
|
|
||||||
"""
|
|
||||||
A wrapper around a question answering model.
|
|
||||||
Our inference engine only takes the first tuple if the inference response
|
|
||||||
is a tuple.
|
|
||||||
|
|
||||||
This wrapper transforms the output to be a stacked tensor if its a tuple.
|
|
||||||
|
|
||||||
Otherwise it passes it through
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, model: PreTrainedModel):
|
|
||||||
super().__init__()
|
|
||||||
self._hf_model = model
|
|
||||||
self.config = model.config
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
|
|
||||||
model = AutoModelForQuestionAnswering.from_pretrained(
|
|
||||||
model_id, token=token, torchscript=True
|
|
||||||
)
|
|
||||||
if isinstance(
|
|
||||||
model.config,
|
|
||||||
(
|
|
||||||
transformers.MPNetConfig,
|
|
||||||
transformers.XLMRobertaConfig,
|
|
||||||
transformers.RobertaConfig,
|
|
||||||
transformers.BartConfig,
|
|
||||||
),
|
|
||||||
):
|
|
||||||
return _TwoParameterQuestionAnsweringWrapper(model)
|
|
||||||
else:
|
|
||||||
return _QuestionAnsweringWrapper(model)
|
|
||||||
|
|
||||||
|
|
||||||
class _QuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
|
|
||||||
def __init__(self, model: PreTrainedModel):
|
|
||||||
super().__init__(model=model)
|
|
||||||
|
|
||||||
def forward(
|
|
||||||
self,
|
|
||||||
input_ids: Tensor,
|
|
||||||
attention_mask: Tensor,
|
|
||||||
token_type_ids: Tensor,
|
|
||||||
position_ids: Tensor,
|
|
||||||
) -> Tensor:
|
|
||||||
"""Wrap the input and output to conform to the native process interface."""
|
|
||||||
|
|
||||||
inputs = {
|
|
||||||
"input_ids": input_ids,
|
|
||||||
"attention_mask": attention_mask,
|
|
||||||
"token_type_ids": token_type_ids,
|
|
||||||
"position_ids": position_ids,
|
|
||||||
}
|
|
||||||
|
|
||||||
# remove inputs for specific model types
|
|
||||||
if isinstance(self._hf_model.config, transformers.DistilBertConfig):
|
|
||||||
del inputs["token_type_ids"]
|
|
||||||
del inputs["position_ids"]
|
|
||||||
response = self._hf_model(**inputs)
|
|
||||||
if isinstance(response, tuple):
|
|
||||||
return torch.stack(list(response), dim=0)
|
|
||||||
return response
|
|
||||||
|
|
||||||
|
|
||||||
class _TwoParameterQuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
|
|
||||||
def __init__(self, model: PreTrainedModel):
|
|
||||||
super().__init__(model=model)
|
|
||||||
|
|
||||||
def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
|
|
||||||
"""Wrap the input and output to conform to the native process interface."""
|
|
||||||
inputs = {
|
|
||||||
"input_ids": input_ids,
|
|
||||||
"attention_mask": attention_mask,
|
|
||||||
}
|
|
||||||
response = self._hf_model(**inputs)
|
|
||||||
if isinstance(response, tuple):
|
|
||||||
return torch.stack(list(response), dim=0)
|
|
||||||
return response
|
|
||||||
|
|
||||||
|
|
||||||
class _DistilBertWrapper(nn.Module): # type: ignore
|
|
||||||
"""
|
|
||||||
In Elasticsearch the BERT tokenizer is used for DistilBERT models but
|
|
||||||
the BERT tokenizer produces 4 inputs where DistilBERT models expect 2.
|
|
||||||
|
|
||||||
Wrap the model's forward function in a method that accepts the 4
|
|
||||||
arguments passed to a BERT model then discard the token_type_ids
|
|
||||||
and the position_ids to match the wrapped DistilBERT model forward
|
|
||||||
function
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, model: transformers.PreTrainedModel):
|
|
||||||
super().__init__()
|
|
||||||
self._model = model
|
|
||||||
self.config = model.config
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def try_wrapping(model: PreTrainedModel) -> Optional[Any]:
|
|
||||||
if isinstance(model.config, transformers.DistilBertConfig):
|
|
||||||
return _DistilBertWrapper(model)
|
|
||||||
else:
|
|
||||||
return model
|
|
||||||
|
|
||||||
def forward(
|
|
||||||
self,
|
|
||||||
input_ids: Tensor,
|
|
||||||
attention_mask: Tensor,
|
|
||||||
_token_type_ids: Tensor = None,
|
|
||||||
_position_ids: Tensor = None,
|
|
||||||
) -> Tensor:
|
|
||||||
"""Wrap the input and output to conform to the native process interface."""
|
|
||||||
|
|
||||||
return self._model(input_ids=input_ids, attention_mask=attention_mask)
|
|
||||||
|
|
||||||
|
|
||||||
class _SentenceTransformerWrapperModule(nn.Module): # type: ignore
|
|
||||||
"""
|
|
||||||
A wrapper around sentence-transformer models to provide pooling,
|
|
||||||
normalization and other graph layers that are not defined in the base
|
|
||||||
HuggingFace transformer model.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
|
|
||||||
super().__init__()
|
|
||||||
self._hf_model = model
|
|
||||||
self._st_model = SentenceTransformer(model.config.name_or_path)
|
|
||||||
self._output_key = output_key
|
|
||||||
self.config = model.config
|
|
||||||
|
|
||||||
self._remove_pooling_layer()
|
|
||||||
self._replace_transformer_layer()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def from_pretrained(
|
|
||||||
model_id: str,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
*,
|
|
||||||
token: Optional[str] = None,
|
|
||||||
output_key: str = DEFAULT_OUTPUT_KEY,
|
|
||||||
) -> Optional[Any]:
|
|
||||||
model = AutoModel.from_pretrained(model_id, token=token, torchscript=True)
|
|
||||||
if isinstance(
|
|
||||||
tokenizer,
|
|
||||||
(
|
|
||||||
transformers.BartTokenizer,
|
|
||||||
transformers.MPNetTokenizer,
|
|
||||||
transformers.RobertaTokenizer,
|
|
||||||
transformers.XLMRobertaTokenizer,
|
|
||||||
),
|
|
||||||
):
|
|
||||||
return _TwoParameterSentenceTransformerWrapper(model, output_key)
|
|
||||||
else:
|
|
||||||
return _SentenceTransformerWrapper(model, output_key)
|
|
||||||
|
|
||||||
def _remove_pooling_layer(self) -> None:
|
|
||||||
"""
|
|
||||||
Removes any last pooling layer which is not used to create embeddings.
|
|
||||||
Leaving this layer in will cause it to return a NoneType which in turn
|
|
||||||
will fail to load in libtorch. Alternatively, we can just use the output
|
|
||||||
of the pooling layer as a dummy but this also affects (if only in a
|
|
||||||
minor way) the performance of inference, so we're better off removing
|
|
||||||
the layer if we can.
|
|
||||||
"""
|
|
||||||
|
|
||||||
if hasattr(self._hf_model, "pooler"):
|
|
||||||
self._hf_model.pooler = None
|
|
||||||
|
|
||||||
def _replace_transformer_layer(self) -> None:
|
|
||||||
"""
|
|
||||||
Replaces the HuggingFace Transformer layer in the SentenceTransformer
|
|
||||||
modules so we can set it with one that has pooling layer removed and
|
|
||||||
was loaded ready for TorchScript export.
|
|
||||||
"""
|
|
||||||
|
|
||||||
self._st_model._modules["0"].auto_model = self._hf_model
|
|
||||||
|
|
||||||
|
|
||||||
class _SentenceTransformerWrapper(_SentenceTransformerWrapperModule):
|
|
||||||
def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
|
|
||||||
super().__init__(model=model, output_key=output_key)
|
|
||||||
|
|
||||||
def forward(
|
|
||||||
self,
|
|
||||||
input_ids: Tensor,
|
|
||||||
attention_mask: Tensor,
|
|
||||||
token_type_ids: Tensor,
|
|
||||||
position_ids: Tensor,
|
|
||||||
) -> Tensor:
|
|
||||||
"""Wrap the input and output to conform to the native process interface."""
|
|
||||||
|
|
||||||
inputs = {
|
|
||||||
"input_ids": input_ids,
|
|
||||||
"attention_mask": attention_mask,
|
|
||||||
"token_type_ids": token_type_ids,
|
|
||||||
"position_ids": position_ids,
|
|
||||||
}
|
|
||||||
|
|
||||||
# remove inputs for specific model types
|
|
||||||
if isinstance(self._hf_model.config, transformers.DistilBertConfig):
|
|
||||||
del inputs["token_type_ids"]
|
|
||||||
|
|
||||||
return self._st_model(inputs)[self._output_key]
|
|
||||||
|
|
||||||
|
|
||||||
class _TwoParameterSentenceTransformerWrapper(_SentenceTransformerWrapperModule):
|
|
||||||
def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
|
|
||||||
super().__init__(model=model, output_key=output_key)
|
|
||||||
|
|
||||||
def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
|
|
||||||
"""Wrap the input and output to conform to the native process interface."""
|
|
||||||
inputs = {
|
|
||||||
"input_ids": input_ids,
|
|
||||||
"attention_mask": attention_mask,
|
|
||||||
}
|
|
||||||
return self._st_model(inputs)[self._output_key]
|
|
||||||
|
|
||||||
|
|
||||||
class _DPREncoderWrapper(nn.Module): # type: ignore
|
|
||||||
"""
|
|
||||||
AutoModel loading does not work for DPRContextEncoders, this only exists as
|
|
||||||
a workaround. This may never be fixed so this is likely permanent.
|
|
||||||
See: https://github.com/huggingface/transformers/issues/13670
|
|
||||||
"""
|
|
||||||
|
|
||||||
_SUPPORTED_MODELS = {
|
|
||||||
transformers.DPRContextEncoder,
|
|
||||||
transformers.DPRQuestionEncoder,
|
|
||||||
}
|
|
||||||
_SUPPORTED_MODELS_NAMES = set([x.__name__ for x in _SUPPORTED_MODELS])
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
model: Union[transformers.DPRContextEncoder, transformers.DPRQuestionEncoder],
|
|
||||||
):
|
|
||||||
super().__init__()
|
|
||||||
self._model = model
|
|
||||||
self.config = model.config
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
|
|
||||||
config = AutoConfig.from_pretrained(model_id, token=token)
|
|
||||||
|
|
||||||
def is_compatible() -> bool:
|
|
||||||
is_dpr_model = config.model_type == "dpr"
|
|
||||||
has_architectures = (
|
|
||||||
config.architectures is not None and len(config.architectures) == 1
|
|
||||||
)
|
|
||||||
is_supported_architecture = has_architectures and (
|
|
||||||
config.architectures[0] in _DPREncoderWrapper._SUPPORTED_MODELS_NAMES
|
|
||||||
)
|
|
||||||
return is_dpr_model and is_supported_architecture
|
|
||||||
|
|
||||||
if is_compatible():
|
|
||||||
model = getattr(transformers, config.architectures[0]).from_pretrained(
|
|
||||||
model_id, torchscript=True
|
|
||||||
)
|
|
||||||
return _DPREncoderWrapper(model)
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def forward(
|
|
||||||
self,
|
|
||||||
input_ids: Tensor,
|
|
||||||
attention_mask: Tensor,
|
|
||||||
token_type_ids: Tensor,
|
|
||||||
_position_ids: Tensor,
|
|
||||||
) -> Tensor:
|
|
||||||
"""Wrap the input and output to conform to the native process interface."""
|
|
||||||
|
|
||||||
return self._model(
|
|
||||||
input_ids=input_ids,
|
|
||||||
attention_mask=attention_mask,
|
|
||||||
token_type_ids=token_type_ids,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class _TransformerTraceableModel(TraceableModel):
|
class _TransformerTraceableModel(TraceableModel):
|
||||||
"""A base class representing a HuggingFace transformer model that can be traced."""
|
"""A base class representing a HuggingFace transformer model that can be traced."""
|
||||||
|
|
||||||
@ -460,7 +191,7 @@ class _TransformerTraceableModel(TraceableModel):
|
|||||||
|
|
||||||
def _trace(self) -> TracedModelTypes:
|
def _trace(self) -> TracedModelTypes:
|
||||||
inputs = self._compatible_inputs()
|
inputs = self._compatible_inputs()
|
||||||
return torch.jit.trace(self._model, inputs)
|
return torch.jit.trace(self._model, example_inputs=inputs)
|
||||||
|
|
||||||
def sample_output(self) -> Tensor:
|
def sample_output(self) -> Tensor:
|
||||||
inputs = self._compatible_inputs()
|
inputs = self._compatible_inputs()
|
||||||
@ -483,9 +214,15 @@ class _TransformerTraceableModel(TraceableModel):
|
|||||||
transformers.XLMRobertaTokenizer,
|
transformers.XLMRobertaTokenizer,
|
||||||
),
|
),
|
||||||
):
|
):
|
||||||
del inputs["token_type_ids"]
|
|
||||||
return (inputs["input_ids"], inputs["attention_mask"])
|
return (inputs["input_ids"], inputs["attention_mask"])
|
||||||
|
|
||||||
|
if isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
|
||||||
|
return (
|
||||||
|
inputs["input_ids"],
|
||||||
|
inputs["attention_mask"],
|
||||||
|
inputs["token_type_ids"],
|
||||||
|
)
|
||||||
|
|
||||||
position_ids = torch.arange(inputs["input_ids"].size(1), dtype=torch.long)
|
position_ids = torch.arange(inputs["input_ids"].size(1), dtype=torch.long)
|
||||||
inputs["position_ids"] = position_ids
|
inputs["position_ids"] = position_ids
|
||||||
return (
|
return (
|
||||||
@ -496,8 +233,7 @@ class _TransformerTraceableModel(TraceableModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def _prepare_inputs(self) -> transformers.BatchEncoding:
|
def _prepare_inputs(self) -> transformers.BatchEncoding: ...
|
||||||
...
|
|
||||||
|
|
||||||
|
|
||||||
class _TraceableClassificationModel(_TransformerTraceableModel, ABC):
|
class _TraceableClassificationModel(_TransformerTraceableModel, ABC):
|
||||||
@ -519,6 +255,15 @@ class _TraceableFillMaskModel(_TransformerTraceableModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class _TraceableTextExpansionModel(_TransformerTraceableModel):
|
||||||
|
def _prepare_inputs(self) -> transformers.BatchEncoding:
|
||||||
|
return self._tokenizer(
|
||||||
|
"This is an example sentence.",
|
||||||
|
padding="max_length",
|
||||||
|
return_tensors="pt",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class _TraceableNerModel(_TraceableClassificationModel):
|
class _TraceableNerModel(_TraceableClassificationModel):
|
||||||
def _prepare_inputs(self) -> transformers.BatchEncoding:
|
def _prepare_inputs(self) -> transformers.BatchEncoding:
|
||||||
return self._tokenizer(
|
return self._tokenizer(
|
||||||
@ -553,7 +298,7 @@ class _TraceableTextEmbeddingModel(_TransformerTraceableModel):
|
|||||||
def _prepare_inputs(self) -> transformers.BatchEncoding:
|
def _prepare_inputs(self) -> transformers.BatchEncoding:
|
||||||
return self._tokenizer(
|
return self._tokenizer(
|
||||||
"This is an example sentence.",
|
"This is an example sentence.",
|
||||||
padding="max_length",
|
padding="longest",
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -599,6 +344,7 @@ class TransformerModel:
|
|||||||
access_token: Optional[str] = None,
|
access_token: Optional[str] = None,
|
||||||
ingest_prefix: Optional[str] = None,
|
ingest_prefix: Optional[str] = None,
|
||||||
search_prefix: Optional[str] = None,
|
search_prefix: Optional[str] = None,
|
||||||
|
max_model_input_size: Optional[int] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Loads a model from the Hugging Face repository or local file and creates
|
Loads a model from the Hugging Face repository or local file and creates
|
||||||
@ -630,6 +376,12 @@ class TransformerModel:
|
|||||||
|
|
||||||
search_prefix: Optional[str]
|
search_prefix: Optional[str]
|
||||||
Prefix string to prepend to input at search
|
Prefix string to prepend to input at search
|
||||||
|
|
||||||
|
max_model_input_size: Optional[int]
|
||||||
|
The max model input size counted in tokens.
|
||||||
|
Usually this value should be extracted from the model configuration
|
||||||
|
but if that is not possible or the data is missing it can be
|
||||||
|
explicitly set with this parameter.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
self._model_id = model_id
|
self._model_id = model_id
|
||||||
@ -637,6 +389,7 @@ class TransformerModel:
|
|||||||
self._task_type = task_type.replace("-", "_")
|
self._task_type = task_type.replace("-", "_")
|
||||||
self._ingest_prefix = ingest_prefix
|
self._ingest_prefix = ingest_prefix
|
||||||
self._search_prefix = search_prefix
|
self._search_prefix = search_prefix
|
||||||
|
self._max_model_input_size = max_model_input_size
|
||||||
|
|
||||||
# load Hugging Face model and tokenizer
|
# load Hugging Face model and tokenizer
|
||||||
# use padding in the tokenizer to ensure max length sequences are used for tracing (at call time)
|
# use padding in the tokenizer to ensure max length sequences are used for tracing (at call time)
|
||||||
@ -669,7 +422,12 @@ class TransformerModel:
|
|||||||
" ".join(m) for m, _ in sorted(ranks.items(), key=lambda kv: kv[1])
|
" ".join(m) for m, _ in sorted(ranks.items(), key=lambda kv: kv[1])
|
||||||
]
|
]
|
||||||
vocab_obj["merges"] = merges
|
vocab_obj["merges"] = merges
|
||||||
sp_model = getattr(self._tokenizer, "sp_model", None)
|
|
||||||
|
if isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
|
||||||
|
sp_model = self._tokenizer._tokenizer.spm
|
||||||
|
else:
|
||||||
|
sp_model = getattr(self._tokenizer, "sp_model", None)
|
||||||
|
|
||||||
if sp_model:
|
if sp_model:
|
||||||
id_correction = getattr(self._tokenizer, "fairseq_offset", 0)
|
id_correction = getattr(self._tokenizer, "fairseq_offset", 0)
|
||||||
scores = []
|
scores = []
|
||||||
@ -686,7 +444,10 @@ class TransformerModel:
|
|||||||
return vocab_obj
|
return vocab_obj
|
||||||
|
|
||||||
def _create_tokenization_config(self) -> NlpTokenizationConfig:
|
def _create_tokenization_config(self) -> NlpTokenizationConfig:
|
||||||
_max_sequence_length = self._find_max_sequence_length()
|
if self._max_model_input_size:
|
||||||
|
_max_sequence_length = self._max_model_input_size
|
||||||
|
else:
|
||||||
|
_max_sequence_length = self._find_max_sequence_length()
|
||||||
|
|
||||||
if isinstance(self._tokenizer, transformers.MPNetTokenizer):
|
if isinstance(self._tokenizer, transformers.MPNetTokenizer):
|
||||||
return NlpMPNetTokenizationConfig(
|
return NlpMPNetTokenizationConfig(
|
||||||
@ -704,6 +465,11 @@ class TransformerModel:
|
|||||||
return NlpXLMRobertaTokenizationConfig(
|
return NlpXLMRobertaTokenizationConfig(
|
||||||
max_sequence_length=_max_sequence_length
|
max_sequence_length=_max_sequence_length
|
||||||
)
|
)
|
||||||
|
elif isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
|
||||||
|
return NlpDebertaV2TokenizationConfig(
|
||||||
|
max_sequence_length=_max_sequence_length,
|
||||||
|
do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
japanese_morphological_tokenizers = ["mecab"]
|
japanese_morphological_tokenizers = ["mecab"]
|
||||||
if (
|
if (
|
||||||
@ -725,25 +491,28 @@ class TransformerModel:
|
|||||||
# Sometimes the max_... values are present but contain
|
# Sometimes the max_... values are present but contain
|
||||||
# a random or very large value.
|
# a random or very large value.
|
||||||
REASONABLE_MAX_LENGTH = 8192
|
REASONABLE_MAX_LENGTH = 8192
|
||||||
max_len = getattr(self._tokenizer, "max_model_input_sizes", dict()).get(
|
|
||||||
self._model_id
|
|
||||||
)
|
|
||||||
if max_len is not None and max_len < REASONABLE_MAX_LENGTH:
|
|
||||||
return int(max_len)
|
|
||||||
|
|
||||||
max_len = getattr(self._tokenizer, "model_max_length", None)
|
max_len = getattr(self._tokenizer, "model_max_length", None)
|
||||||
|
if max_len is not None and max_len <= REASONABLE_MAX_LENGTH:
|
||||||
|
return int(max_len)
|
||||||
|
|
||||||
|
max_sizes = getattr(self._tokenizer, "max_model_input_sizes", dict())
|
||||||
|
max_len = max_sizes.get(self._model_id)
|
||||||
if max_len is not None and max_len < REASONABLE_MAX_LENGTH:
|
if max_len is not None and max_len < REASONABLE_MAX_LENGTH:
|
||||||
return int(max_len)
|
return int(max_len)
|
||||||
|
|
||||||
model_config = getattr(self._traceable_model._model, "config", None)
|
if max_sizes:
|
||||||
if model_config is None:
|
# The model id wasn't found in the max sizes dict but
|
||||||
raise ValueError("Cannot determine model max input length")
|
# if all the values correspond then take that value
|
||||||
|
sizes = {size for size in max_sizes.values()}
|
||||||
|
if len(sizes) == 1:
|
||||||
|
max_len = sizes.pop()
|
||||||
|
if max_len is not None and max_len < REASONABLE_MAX_LENGTH:
|
||||||
|
return int(max_len)
|
||||||
|
|
||||||
max_len = getattr(model_config, "max_position_embeddings", None)
|
if isinstance(self._tokenizer, BertTokenizer):
|
||||||
if max_len is not None and max_len < REASONABLE_MAX_LENGTH:
|
return 512
|
||||||
return int(max_len)
|
|
||||||
|
|
||||||
raise ValueError("Cannot determine model max input length")
|
raise UnknownModelInputSizeError("Cannot determine model max input length")
|
||||||
|
|
||||||
def _create_config(
|
def _create_config(
|
||||||
self, es_version: Optional[Tuple[int, int, int]]
|
self, es_version: Optional[Tuple[int, int, int]]
|
||||||
@ -756,6 +525,9 @@ class TransformerModel:
|
|||||||
tokenization_config.span = 128
|
tokenization_config.span = 128
|
||||||
tokenization_config.truncate = "none"
|
tokenization_config.truncate = "none"
|
||||||
|
|
||||||
|
if self._task_type == "text_similarity":
|
||||||
|
tokenization_config.truncate = "second"
|
||||||
|
|
||||||
if self._traceable_model.classification_labels():
|
if self._traceable_model.classification_labels():
|
||||||
inference_config = TASK_TYPE_TO_INFERENCE_CONFIG[self._task_type](
|
inference_config = TASK_TYPE_TO_INFERENCE_CONFIG[self._task_type](
|
||||||
tokenization=tokenization_config,
|
tokenization=tokenization_config,
|
||||||
@ -954,6 +726,13 @@ class TransformerModel:
|
|||||||
else:
|
else:
|
||||||
self._task_type = maybe_task_type
|
self._task_type = maybe_task_type
|
||||||
|
|
||||||
|
if self._task_type == "text_expansion":
|
||||||
|
model = transformers.AutoModelForMaskedLM.from_pretrained(
|
||||||
|
self._model_id, token=self._access_token, torchscript=True
|
||||||
|
)
|
||||||
|
model = _DistilBertWrapper.try_wrapping(model)
|
||||||
|
return _TraceableTextExpansionModel(self._tokenizer, model)
|
||||||
|
|
||||||
if self._task_type == "fill_mask":
|
if self._task_type == "fill_mask":
|
||||||
model = transformers.AutoModelForMaskedLM.from_pretrained(
|
model = transformers.AutoModelForMaskedLM.from_pretrained(
|
||||||
self._model_id, token=self._access_token, torchscript=True
|
self._model_id, token=self._access_token, torchscript=True
|
||||||
@ -1013,7 +792,7 @@ class TransformerModel:
|
|||||||
|
|
||||||
else:
|
else:
|
||||||
raise TypeError(
|
raise TypeError(
|
||||||
f"Unknown task type {self._task_type}, must be one of: {SUPPORTED_TASK_TYPES_NAMES}"
|
f"Task {self._task_type} is not supported, must be one of: {SUPPORTED_TASK_TYPES_NAMES}"
|
||||||
)
|
)
|
||||||
|
|
||||||
def elasticsearch_model_id(self) -> str:
|
def elasticsearch_model_id(self) -> str:
|
||||||
@ -1044,6 +823,5 @@ def elasticsearch_model_id(model_id: str) -> str:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
id = re.sub(r"[\s\\/]", "__", model_id).lower()[-64:]
|
id = re.sub(r"[\s\\/]", "__", model_id).lower()[-64:]
|
||||||
if id.startswith("__"):
|
id = id.removeprefix("__")
|
||||||
id = id.removeprefix("__")
|
|
||||||
return id
|
return id
|
||||||
|
317
eland/ml/pytorch/wrappers.py
Normal file
317
eland/ml/pytorch/wrappers.py
Normal file
@ -0,0 +1,317 @@
|
|||||||
|
# Licensed to Elasticsearch B.V. under one or more contributor
|
||||||
|
# license agreements. See the NOTICE file distributed with
|
||||||
|
# this work for additional information regarding copyright
|
||||||
|
# ownership. Elasticsearch B.V. licenses this file to you under
|
||||||
|
# the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing,
|
||||||
|
# software distributed under the License is distributed on an
|
||||||
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, either express or implied. See the License for the
|
||||||
|
# specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
This module contains the wrapper classes for the Hugging Face models.
|
||||||
|
Wrapping is necessary to ensure that the forward method of the model
|
||||||
|
is called with the same arguments the ml-cpp pytorch_inference process
|
||||||
|
uses.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
|
import torch # type: ignore
|
||||||
|
import transformers # type: ignore
|
||||||
|
from sentence_transformers import SentenceTransformer # type: ignore
|
||||||
|
from torch import Tensor, nn
|
||||||
|
from transformers import (
|
||||||
|
AutoConfig,
|
||||||
|
AutoModel,
|
||||||
|
AutoModelForQuestionAnswering,
|
||||||
|
PreTrainedModel,
|
||||||
|
PreTrainedTokenizer,
|
||||||
|
)
|
||||||
|
|
||||||
|
DEFAULT_OUTPUT_KEY = "sentence_embedding"
|
||||||
|
|
||||||
|
|
||||||
|
class _QuestionAnsweringWrapperModule(nn.Module): # type: ignore
|
||||||
|
"""
|
||||||
|
A wrapper around a question answering model.
|
||||||
|
Our inference engine only takes the first tuple if the inference response
|
||||||
|
is a tuple.
|
||||||
|
|
||||||
|
This wrapper transforms the output to be a stacked tensor if its a tuple.
|
||||||
|
|
||||||
|
Otherwise it passes it through
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, model: PreTrainedModel):
|
||||||
|
super().__init__()
|
||||||
|
self._hf_model = model
|
||||||
|
self.config = model.config
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
|
||||||
|
model = AutoModelForQuestionAnswering.from_pretrained(
|
||||||
|
model_id, token=token, torchscript=True
|
||||||
|
)
|
||||||
|
if isinstance(
|
||||||
|
model.config,
|
||||||
|
(
|
||||||
|
transformers.MPNetConfig,
|
||||||
|
transformers.XLMRobertaConfig,
|
||||||
|
transformers.RobertaConfig,
|
||||||
|
transformers.BartConfig,
|
||||||
|
),
|
||||||
|
):
|
||||||
|
return _TwoParameterQuestionAnsweringWrapper(model)
|
||||||
|
else:
|
||||||
|
return _QuestionAnsweringWrapper(model)
|
||||||
|
|
||||||
|
|
||||||
|
class _QuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
|
||||||
|
def __init__(self, model: PreTrainedModel):
|
||||||
|
super().__init__(model=model)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
input_ids: Tensor,
|
||||||
|
attention_mask: Tensor,
|
||||||
|
token_type_ids: Tensor,
|
||||||
|
position_ids: Tensor,
|
||||||
|
) -> Tensor:
|
||||||
|
"""Wrap the input and output to conform to the native process interface."""
|
||||||
|
|
||||||
|
inputs = {
|
||||||
|
"input_ids": input_ids,
|
||||||
|
"attention_mask": attention_mask,
|
||||||
|
"token_type_ids": token_type_ids,
|
||||||
|
"position_ids": position_ids,
|
||||||
|
}
|
||||||
|
|
||||||
|
# remove inputs for specific model types
|
||||||
|
if isinstance(self._hf_model.config, transformers.DistilBertConfig):
|
||||||
|
del inputs["token_type_ids"]
|
||||||
|
del inputs["position_ids"]
|
||||||
|
response = self._hf_model(**inputs)
|
||||||
|
if isinstance(response, tuple):
|
||||||
|
return torch.stack(list(response), dim=0)
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
class _TwoParameterQuestionAnsweringWrapper(_QuestionAnsweringWrapperModule):
|
||||||
|
def __init__(self, model: PreTrainedModel):
|
||||||
|
super().__init__(model=model)
|
||||||
|
|
||||||
|
def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
|
||||||
|
"""Wrap the input and output to conform to the native process interface."""
|
||||||
|
inputs = {
|
||||||
|
"input_ids": input_ids,
|
||||||
|
"attention_mask": attention_mask,
|
||||||
|
}
|
||||||
|
response = self._hf_model(**inputs)
|
||||||
|
if isinstance(response, tuple):
|
||||||
|
return torch.stack(list(response), dim=0)
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
class _DistilBertWrapper(nn.Module): # type: ignore
|
||||||
|
"""
|
||||||
|
In Elasticsearch the BERT tokenizer is used for DistilBERT models but
|
||||||
|
the BERT tokenizer produces 4 inputs where DistilBERT models expect 2.
|
||||||
|
|
||||||
|
Wrap the model's forward function in a method that accepts the 4
|
||||||
|
arguments passed to a BERT model then discard the token_type_ids
|
||||||
|
and the position_ids to match the wrapped DistilBERT model forward
|
||||||
|
function
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, model: transformers.PreTrainedModel):
|
||||||
|
super().__init__()
|
||||||
|
self._model = model
|
||||||
|
self.config = model.config
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def try_wrapping(model: PreTrainedModel) -> Optional[Any]:
|
||||||
|
if isinstance(model.config, transformers.DistilBertConfig):
|
||||||
|
return _DistilBertWrapper(model)
|
||||||
|
else:
|
||||||
|
return model
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
input_ids: Tensor,
|
||||||
|
attention_mask: Tensor,
|
||||||
|
_token_type_ids: Tensor = None,
|
||||||
|
_position_ids: Tensor = None,
|
||||||
|
) -> Tensor:
|
||||||
|
"""Wrap the input and output to conform to the native process interface."""
|
||||||
|
|
||||||
|
return self._model(input_ids=input_ids, attention_mask=attention_mask)
|
||||||
|
|
||||||
|
|
||||||
|
class _SentenceTransformerWrapperModule(nn.Module):  # type: ignore
    """
    Base wrapper pairing a HuggingFace transformer with the corresponding
    sentence-transformers pipeline, so that pooling, normalization and the
    other graph layers that the bare HuggingFace model does not define are
    part of the wrapped module.
    """

    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__()
        hf_config = model.config
        self._hf_model = model
        # The sentence-transformers pipeline is loaded from the same
        # name/path the HuggingFace model was loaded from.
        self._st_model = SentenceTransformer(hf_config.name_or_path)
        self._output_key = output_key
        self.config = hf_config

        self._remove_pooling_layer()
        self._replace_transformer_layer()

    @staticmethod
    def from_pretrained(
        model_id: str,
        tokenizer: PreTrainedTokenizer,
        *,
        token: Optional[str] = None,
        output_key: str = DEFAULT_OUTPUT_KEY,
    ) -> Optional[Any]:
        """
        Load ``model_id`` with ``torchscript=True`` and wrap it.

        The wrapper type depends on the tokenizer class: the tokenizers
        listed below get the two-input wrapper, every other tokenizer gets
        the four-input wrapper.
        """
        hf_model = AutoModel.from_pretrained(model_id, token=token, torchscript=True)

        two_parameter_tokenizers = (
            transformers.BartTokenizer,
            transformers.MPNetTokenizer,
            transformers.RobertaTokenizer,
            transformers.XLMRobertaTokenizer,
            transformers.DebertaV2Tokenizer,
        )
        if isinstance(tokenizer, two_parameter_tokenizers):
            wrapper_cls = _TwoParameterSentenceTransformerWrapper
        else:
            wrapper_cls = _SentenceTransformerWrapper
        return wrapper_cls(hf_model, output_key)

    def _remove_pooling_layer(self) -> None:
        """
        Drop the trailing pooling layer of the HuggingFace model, if any.

        That layer is not used to create embeddings. If left in place it
        returns a NoneType, which then fails to load in libtorch. Using its
        output as a dummy would work too, but it slightly affects inference
        performance, so removing the layer is preferred when possible.
        """
        if not hasattr(self._hf_model, "pooler"):
            return
        self._hf_model.pooler = None

    def _replace_transformer_layer(self) -> None:
        """
        Point the first SentenceTransformer module at our HuggingFace model,
        i.e. the one that had its pooling layer removed and was loaded ready
        for TorchScript export.
        """
        transformer_module = self._st_model._modules["0"]
        transformer_module.auto_model = self._hf_model
|
||||||
|
|
||||||
|
|
||||||
|
class _SentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    """Sentence-transformer wrapper whose ``forward`` takes four tensors."""

    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model, output_key)

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""

        features = {"input_ids": input_ids, "attention_mask": attention_mask}

        # token_type_ids is left out for DistilBERT configurations
        if not isinstance(self._hf_model.config, transformers.DistilBertConfig):
            features["token_type_ids"] = token_type_ids

        features["position_ids"] = position_ids

        outputs = self._st_model(features)
        return outputs[self._output_key]
|
||||||
|
|
||||||
|
|
||||||
|
class _TwoParameterSentenceTransformerWrapper(_SentenceTransformerWrapperModule):
    """Sentence-transformer wrapper whose ``forward`` takes two tensors."""

    def __init__(self, model: PreTrainedModel, output_key: str = DEFAULT_OUTPUT_KEY):
        super().__init__(model, output_key)

    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """Wrap the input and output to conform to the native process interface."""
        outputs = self._st_model(
            {"input_ids": input_ids, "attention_mask": attention_mask}
        )
        return outputs[self._output_key]
|
||||||
|
|
||||||
|
|
||||||
|
class _DPREncoderWrapper(nn.Module):  # type: ignore
    """
    AutoModel loading does not work for DPRContextEncoders, this only exists as
    a workaround. This may never be fixed so this is likely permanent.
    See: https://github.com/huggingface/transformers/issues/13670
    """

    # Encoder classes this wrapper knows how to load.
    _SUPPORTED_MODELS = {
        transformers.DPRContextEncoder,
        transformers.DPRQuestionEncoder,
    }
    # Class names, compared against ``config.architectures``.
    _SUPPORTED_MODELS_NAMES = {x.__name__ for x in _SUPPORTED_MODELS}

    def __init__(
        self,
        model: Union[transformers.DPRContextEncoder, transformers.DPRQuestionEncoder],
    ):
        super().__init__()
        self._model = model
        self.config = model.config

    @staticmethod
    def from_pretrained(model_id: str, *, token: Optional[str] = None) -> Optional[Any]:
        """
        Load a DPR encoder and wrap it, or return ``None`` if unsupported.

        Parameters
        ----------
        model_id:
            Identifier passed to the ``from_pretrained`` loaders.
        token:
            Optional access token, used for both the configuration and the
            model weights.

        Returns
        -------
        A ``_DPREncoderWrapper`` when the configuration has model type
        ``"dpr"`` and exactly one architecture that is one of the supported
        encoder classes; ``None`` otherwise.
        """
        config = AutoConfig.from_pretrained(model_id, token=token)

        def is_compatible() -> bool:
            is_dpr_model = config.model_type == "dpr"
            has_architectures = (
                config.architectures is not None and len(config.architectures) == 1
            )
            is_supported_architecture = has_architectures and (
                config.architectures[0] in _DPREncoderWrapper._SUPPORTED_MODELS_NAMES
            )
            return is_dpr_model and is_supported_architecture

        if is_compatible():
            # Forward the token here as well: without it the configuration
            # of a private/gated model resolves but the weights cannot be
            # downloaded.
            model = getattr(transformers, config.architectures[0]).from_pretrained(
                model_id, token=token, torchscript=True
            )
            return _DPREncoderWrapper(model)
        else:
            return None

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor,
        _position_ids: Tensor,
    ) -> Tensor:
        """Wrap the input and output to conform to the native process interface.

        ``_position_ids`` is accepted but not passed to the wrapped encoder.
        """

        return self._model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
|
@ -97,9 +97,11 @@ class LGBMForestTransformer(ModelTransformer):
|
|||||||
return TreeNode(
|
return TreeNode(
|
||||||
node_idx=node_id,
|
node_idx=node_id,
|
||||||
leaf_value=[float(tree_node_json_obj["leaf_value"])],
|
leaf_value=[float(tree_node_json_obj["leaf_value"])],
|
||||||
number_samples=int(tree_node_json_obj["leaf_count"])
|
number_samples=(
|
||||||
if "leaf_count" in tree_node_json_obj
|
int(tree_node_json_obj["leaf_count"])
|
||||||
else None,
|
if "leaf_count" in tree_node_json_obj
|
||||||
|
else None
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
def build_tree(self, tree_id: int, tree_json_obj: Dict[str, Any]) -> Tree:
|
def build_tree(self, tree_id: int, tree_json_obj: Dict[str, Any]) -> Tree:
|
||||||
@ -235,9 +237,11 @@ class LGBMClassifierTransformer(LGBMForestTransformer):
|
|||||||
return TreeNode(
|
return TreeNode(
|
||||||
node_idx=node_id,
|
node_idx=node_id,
|
||||||
leaf_value=leaf_val,
|
leaf_value=leaf_val,
|
||||||
number_samples=int(tree_node_json_obj["leaf_count"])
|
number_samples=(
|
||||||
if "leaf_count" in tree_node_json_obj
|
int(tree_node_json_obj["leaf_count"])
|
||||||
else None,
|
if "leaf_count" in tree_node_json_obj
|
||||||
|
else None
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
def check_model_booster(self) -> None:
|
def check_model_booster(self) -> None:
|
||||||
|
@ -107,6 +107,7 @@ class XGBoostForestTransformer(ModelTransformer):
|
|||||||
decision_type=self._node_decision_type,
|
decision_type=self._node_decision_type,
|
||||||
left_child=self.extract_node_id(row["Yes"], curr_tree),
|
left_child=self.extract_node_id(row["Yes"], curr_tree),
|
||||||
right_child=self.extract_node_id(row["No"], curr_tree),
|
right_child=self.extract_node_id(row["No"], curr_tree),
|
||||||
|
default_left=row["Yes"] == row["Missing"],
|
||||||
threshold=float(row["Split"]),
|
threshold=float(row["Split"]),
|
||||||
split_feature=self.get_feature_id(row["Feature"]),
|
split_feature=self.get_feature_id(row["Feature"]),
|
||||||
)
|
)
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
import copy
|
import copy
|
||||||
|
import os
|
||||||
import warnings
|
import warnings
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@ -1156,9 +1157,11 @@ class Operations:
|
|||||||
# piggy-back on that single aggregation.
|
# piggy-back on that single aggregation.
|
||||||
if extended_stats_calls >= 2:
|
if extended_stats_calls >= 2:
|
||||||
es_aggs = [
|
es_aggs = [
|
||||||
("extended_stats", es_agg)
|
(
|
||||||
if es_agg in extended_stats_es_aggs
|
("extended_stats", es_agg)
|
||||||
else es_agg
|
if es_agg in extended_stats_es_aggs
|
||||||
|
else es_agg
|
||||||
|
)
|
||||||
for es_agg in es_aggs
|
for es_agg in es_aggs
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -1248,6 +1251,46 @@ class Operations:
|
|||||||
if path_or_buf is None:
|
if path_or_buf is None:
|
||||||
return "".join(result)
|
return "".join(result)
|
||||||
|
|
||||||
|
def to_json(  # type: ignore
    self,
    query_compiler: "QueryCompiler",
    path_or_buf=None,
    orient=None,
    lines=False,
    **kwargs,
):
    """Serialise the query result to JSON.

    When ``orient == "records"`` and ``lines is True`` the result is
    streamed: each DataFrame yielded by ``search_yield_pandas_dataframes``
    is converted on its own and appended to the output. Any other
    combination materialises the full DataFrame with ``to_pandas`` and
    delegates to ``pandas.DataFrame.to_json``.

    Parameters
    ----------
    query_compiler:
        Query compiler handed to the search / ``to_pandas`` calls.
    path_or_buf:
        ``None`` to get the JSON back as a string, a ``str`` or
        ``os.PathLike`` path to write a file, or an object with a
        ``write`` method.
    orient, lines, **kwargs:
        Passed through to ``pandas.DataFrame.to_json``.

    Returns
    -------
    In the streaming case: the concatenated JSON string when
    ``path_or_buf`` is ``None`` (``None`` if nothing was produced), and
    ``None`` when output went to a path or buffer. Otherwise whatever
    ``pandas.DataFrame.to_json`` returns.
    """
    if not (orient == "records" and lines is True):
        return self.to_pandas(query_compiler=query_compiler).to_json(
            path_or_buf, orient=orient, lines=lines, **kwargs
        )

    result: List[str] = []
    # Only a handle we opened ourselves is ours to close; a caller-supplied
    # buffer is left open.
    our_filehandle = isinstance(path_or_buf, (str, os.PathLike))
    if our_filehandle:
        # JSON text is written as UTF-8 regardless of the platform default.
        buf = open(path_or_buf, "w", encoding="utf-8")
    else:
        buf = path_or_buf

    try:
        for df in self.search_yield_pandas_dataframes(query_compiler=query_compiler):
            output = df.to_json(
                orient=orient,
                lines=lines,
                **kwargs,
            )
            if buf is None:
                result.append(output)
            else:
                buf.write(output)
    finally:
        # Close the file even if the search or a write fails part-way.
        if our_filehandle:
            buf.close()

    return "".join(result) or None
|
||||||
|
|
||||||
def to_pandas(
|
def to_pandas(
|
||||||
self, query_compiler: "QueryCompiler", show_progress: bool = False
|
self, query_compiler: "QueryCompiler", show_progress: bool = False
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
@ -1500,6 +1543,24 @@ def quantile_to_percentile(quantile: Union[int, float]) -> float:
|
|||||||
return float(min(100, max(0, quantile * 100)))
|
return float(min(100, max(0, quantile * 100)))
|
||||||
|
|
||||||
|
|
||||||
|
def is_field_already_present(
    key: str, data: Union[Dict[str, Any], List[Dict[str, Any]]]
) -> bool:
    """Return whether the (possibly dotted) field ``key`` exists in ``data``.

    A dotted key such as ``"a.b.c"`` is resolved one segment at a time:
    dictionaries are descended by key, and for a list every dictionary
    element is checked, the field counting as present if any element has
    it. For the final segment a dict is tested by key and a list by plain
    membership.

    Any other value met along the way (``None``, a string, a number, ...)
    cannot contain a sub-field, so the result is ``False``. This avoids a
    ``TypeError`` on non-iterable values and a substring match on strings.

    Parameters
    ----------
    key:
        Field name, with ``.`` separating nesting levels.
    data:
        Dictionary, or list of dictionaries, to search.
    """
    head, dot, rest = key.partition(".")

    if not dot:
        # Final segment: only real containers can hold the field.
        if isinstance(data, (dict, list)):
            return key in data
        return False

    if isinstance(data, dict):
        return is_field_already_present(rest, data.get(head, {}))
    if isinstance(data, list):
        # Non-dict elements (e.g. None) cannot hold sub-fields; skip them.
        return any(
            is_field_already_present(rest, item.get(head, {}))
            for item in data
            if isinstance(item, dict)
        )
    return False
|
||||||
|
|
||||||
|
|
||||||
def _search_yield_hits(
|
def _search_yield_hits(
|
||||||
query_compiler: "QueryCompiler",
|
query_compiler: "QueryCompiler",
|
||||||
body: Dict[str, Any],
|
body: Dict[str, Any],
|
||||||
@ -1557,10 +1618,24 @@ def _search_yield_hits(
|
|||||||
|
|
||||||
# Modify the search with the new point in time ID and keep-alive time.
|
# Modify the search with the new point in time ID and keep-alive time.
|
||||||
body["pit"] = {"id": pit_id, "keep_alive": DEFAULT_PIT_KEEP_ALIVE}
|
body["pit"] = {"id": pit_id, "keep_alive": DEFAULT_PIT_KEEP_ALIVE}
|
||||||
|
if isinstance(body["_source"], list):
|
||||||
|
body["fields"] = body["_source"]
|
||||||
|
|
||||||
while max_number_of_hits is None or hits_yielded < max_number_of_hits:
|
while max_number_of_hits is None or hits_yielded < max_number_of_hits:
|
||||||
resp = client.search(**body)
|
resp = client.search(**body)
|
||||||
hits: List[Dict[str, Any]] = resp["hits"]["hits"]
|
hits: List[Dict[str, Any]] = []
|
||||||
|
for hit in resp["hits"]["hits"]:
|
||||||
|
# Copy some of the fields to _source if they are missing there.
|
||||||
|
if "fields" in hit and "_source" in hit:
|
||||||
|
fields = hit["fields"]
|
||||||
|
del hit["fields"]
|
||||||
|
for k, v in fields.items():
|
||||||
|
if not is_field_already_present(k, hit["_source"]):
|
||||||
|
if isinstance(v, list):
|
||||||
|
hit["_source"][k] = list(sorted(v))
|
||||||
|
else:
|
||||||
|
hit["_source"][k] = v
|
||||||
|
hits.append(hit)
|
||||||
|
|
||||||
# The point in time ID can change between searches so we
|
# The point in time ID can change between searches so we
|
||||||
# need to keep the next search up-to-date
|
# need to keep the next search up-to-date
|
||||||
|
@ -514,6 +514,14 @@ class QueryCompiler:
|
|||||||
"""
|
"""
|
||||||
return self._operations.to_csv(query_compiler=self, **kwargs)
|
return self._operations.to_csv(query_compiler=self, **kwargs)
|
||||||
|
|
||||||
|
def to_json(self, **kwargs) -> Optional[str]:
    """Serialises Eland Dataframe to JSON

    All keyword arguments are forwarded to the operations layer's
    ``to_json``.

    Returns:
        If path_or_buf is None, returns the resulting json as a string.
    """
    return self._operations.to_json(query_compiler=self, **kwargs)
|
||||||
|
|
||||||
def search_yield_pandas_dataframes(self) -> Generator["pd.DataFrame", None, None]:
|
def search_yield_pandas_dataframes(self) -> Generator["pd.DataFrame", None, None]:
|
||||||
return self._operations.search_yield_pandas_dataframes(self)
|
return self._operations.search_yield_pandas_dataframes(self)
|
||||||
|
|
||||||
|
@ -40,11 +40,12 @@ from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd # type: ignore
|
import pandas as pd # type: ignore
|
||||||
|
from pandas.core.indexes.frozen import FrozenList
|
||||||
from pandas.io.common import _expand_user, stringify_path # type: ignore
|
from pandas.io.common import _expand_user, stringify_path # type: ignore
|
||||||
|
|
||||||
import eland.plotting
|
import eland.plotting
|
||||||
from eland.arithmetics import ArithmeticNumber, ArithmeticSeries, ArithmeticString
|
from eland.arithmetics import ArithmeticNumber, ArithmeticSeries, ArithmeticString
|
||||||
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
|
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
|
||||||
from eland.filter import (
|
from eland.filter import (
|
||||||
BooleanFilter,
|
BooleanFilter,
|
||||||
Equal,
|
Equal,
|
||||||
@ -292,18 +293,26 @@ class Series(NDFrame):
|
|||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||||
>>> df['Carrier'].value_counts()
|
>>> df['Carrier'].value_counts() # doctest: +SKIP
|
||||||
|
Carrier
|
||||||
Logstash Airways 3331
|
Logstash Airways 3331
|
||||||
JetBeats 3274
|
JetBeats 3274
|
||||||
Kibana Airlines 3234
|
Kibana Airlines 3234
|
||||||
ES-Air 3220
|
ES-Air 3220
|
||||||
Name: Carrier, dtype: int64
|
Name: count, dtype: int64
|
||||||
"""
|
"""
|
||||||
if not isinstance(es_size, int):
|
if not isinstance(es_size, int):
|
||||||
raise TypeError("es_size must be a positive integer.")
|
raise TypeError("es_size must be a positive integer.")
|
||||||
elif es_size <= 0:
|
elif es_size <= 0:
|
||||||
raise ValueError("es_size must be a positive integer.")
|
raise ValueError("es_size must be a positive integer.")
|
||||||
return self._query_compiler.value_counts(es_size)
|
value_counts = self._query_compiler.value_counts(es_size)
|
||||||
|
# https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#value-counts-sets-the-resulting-name-to-count
|
||||||
|
if PANDAS_VERSION[0] == 2:
|
||||||
|
value_counts.name = "count"
|
||||||
|
value_counts.index.names = FrozenList([self.es_field_name])
|
||||||
|
value_counts.index.name = self.es_field_name
|
||||||
|
|
||||||
|
return value_counts
|
||||||
|
|
||||||
# dtype not implemented for Series as causes query to fail
|
# dtype not implemented for Series as causes query to fail
|
||||||
# in pandas.core.computation.ops.Term.type
|
# in pandas.core.computation.ops.Term.type
|
||||||
|
61
noxfile.py
61
noxfile.py
@ -16,7 +16,6 @@
|
|||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import subprocess
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import nox
|
import nox
|
||||||
@ -56,52 +55,48 @@ TYPED_FILES = (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@nox.session(reuse_venv=True)
|
@nox.session(reuse_venv=True, python="3.11")
|
||||||
def format(session):
|
def format(session):
|
||||||
session.install("black", "isort", "flynt")
|
session.install("black ~= 25.0", "isort", "flynt")
|
||||||
session.run("python", "utils/license-headers.py", "fix", *SOURCE_FILES)
|
session.run("python", "utils/license-headers.py", "fix", *SOURCE_FILES)
|
||||||
session.run("flynt", *SOURCE_FILES)
|
session.run("flynt", *SOURCE_FILES)
|
||||||
session.run("black", "--target-version=py38", *SOURCE_FILES)
|
session.run("black", "--target-version=py39", *SOURCE_FILES)
|
||||||
session.run("isort", "--profile=black", *SOURCE_FILES)
|
session.run("isort", "--profile=black", *SOURCE_FILES)
|
||||||
lint(session)
|
lint(session)
|
||||||
|
|
||||||
|
|
||||||
@nox.session(reuse_venv=True)
|
@nox.session(reuse_venv=True, python="3.11")
|
||||||
def lint(session):
|
def lint(session):
|
||||||
# Install numpy to use its mypy plugin
|
# Install numpy to use its mypy plugin
|
||||||
# https://numpy.org/devdocs/reference/typing.html#mypy-plugin
|
# https://numpy.org/devdocs/reference/typing.html#mypy-plugin
|
||||||
session.install("black", "flake8", "mypy", "isort", "numpy")
|
session.install("black ~= 25.0", "flake8", "mypy", "isort", "numpy")
|
||||||
session.install("--pre", "elasticsearch>=8.3,<9")
|
session.install(".")
|
||||||
session.run("python", "utils/license-headers.py", "check", *SOURCE_FILES)
|
session.run("python", "utils/license-headers.py", "check", *SOURCE_FILES)
|
||||||
session.run("black", "--check", "--target-version=py38", *SOURCE_FILES)
|
session.run("black", "--check", "--target-version=py39", *SOURCE_FILES)
|
||||||
session.run("isort", "--check", "--profile=black", *SOURCE_FILES)
|
session.run("isort", "--check", "--profile=black", *SOURCE_FILES)
|
||||||
session.run("flake8", "--ignore=E501,W503,E402,E712,E203", *SOURCE_FILES)
|
session.run("flake8", "--extend-ignore=E203,E402,E501,E704,E712", *SOURCE_FILES)
|
||||||
|
|
||||||
# TODO: When all files are typed we can change this to .run("mypy", "--strict", "eland/")
|
# TODO: When all files are typed we can change this to .run("mypy", "--strict", "eland/")
|
||||||
session.log("mypy --show-error-codes --strict eland/")
|
stdout = session.run(
|
||||||
for typed_file in TYPED_FILES:
|
"mypy",
|
||||||
if not os.path.isfile(typed_file):
|
"--show-error-codes",
|
||||||
session.error(f"The file {typed_file!r} couldn't be found")
|
"--strict",
|
||||||
process = subprocess.run(
|
*TYPED_FILES,
|
||||||
["mypy", "--show-error-codes", "--strict", typed_file],
|
success_codes=(0, 1),
|
||||||
env=session.env,
|
silent=True,
|
||||||
stdout=subprocess.PIPE,
|
)
|
||||||
stderr=subprocess.STDOUT,
|
|
||||||
)
|
|
||||||
# Ensure that mypy itself ran successfully
|
|
||||||
assert process.returncode in (0, 1)
|
|
||||||
|
|
||||||
errors = []
|
errors = []
|
||||||
for line in process.stdout.decode().split("\n"):
|
for line in stdout.splitlines():
|
||||||
filepath = line.partition(":")[0]
|
filepath = line.partition(":")[0]
|
||||||
if filepath in TYPED_FILES:
|
if filepath in TYPED_FILES:
|
||||||
errors.append(line)
|
errors.append(line)
|
||||||
if errors:
|
if errors:
|
||||||
session.error("\n" + "\n".join(sorted(set(errors))))
|
session.error("\n" + "\n".join(sorted(set(errors))))
|
||||||
|
|
||||||
|
|
||||||
@nox.session(python=["3.8", "3.9", "3.10"])
|
@nox.session(python=["3.9", "3.10", "3.11", "3.12"])
|
||||||
@nox.parametrize("pandas_version", ["1.5.0"])
|
@nox.parametrize("pandas_version", ["1.5.0", "2.2.3"])
|
||||||
def test(session, pandas_version: str):
|
def test(session, pandas_version: str):
|
||||||
session.install("-r", "requirements-dev.txt")
|
session.install("-r", "requirements-dev.txt")
|
||||||
session.install(".")
|
session.install(".")
|
||||||
@ -121,9 +116,6 @@ def test(session, pandas_version: str):
|
|||||||
"--nbval",
|
"--nbval",
|
||||||
)
|
)
|
||||||
|
|
||||||
# PyTorch doesn't support Python 3.11 yet
|
|
||||||
if session.python == "3.11":
|
|
||||||
pytest_args += ("--ignore=eland/ml/pytorch",)
|
|
||||||
session.run(
|
session.run(
|
||||||
*pytest_args,
|
*pytest_args,
|
||||||
*(session.posargs or ("eland/", "tests/")),
|
*(session.posargs or ("eland/", "tests/")),
|
||||||
@ -140,7 +132,6 @@ def test(session, pandas_version: str):
|
|||||||
"scikit-learn",
|
"scikit-learn",
|
||||||
"xgboost",
|
"xgboost",
|
||||||
"lightgbm",
|
"lightgbm",
|
||||||
"shap",
|
|
||||||
)
|
)
|
||||||
session.run("pytest", "tests/ml/")
|
session.run("pytest", "tests/ml/")
|
||||||
|
|
||||||
@ -150,8 +141,8 @@ def docs(session):
|
|||||||
# Run this so users get an error if they don't have Pandoc installed.
|
# Run this so users get an error if they don't have Pandoc installed.
|
||||||
session.run("pandoc", "--version", external=True)
|
session.run("pandoc", "--version", external=True)
|
||||||
|
|
||||||
session.install("-r", "docs/requirements-docs.txt")
|
|
||||||
session.install(".")
|
session.install(".")
|
||||||
|
session.install("-r", "docs/requirements-docs.txt")
|
||||||
|
|
||||||
# See if we have an Elasticsearch cluster active
|
# See if we have an Elasticsearch cluster active
|
||||||
# to rebuild the Jupyter notebooks with.
|
# to rebuild the Jupyter notebooks with.
|
||||||
|
@ -1,26 +1,7 @@
|
|||||||
#
|
#
|
||||||
# Basic requirements
|
# Basic requirements with extras
|
||||||
#
|
#
|
||||||
elasticsearch>=8.3,<9
|
.[all]
|
||||||
pandas>=1.5,<2
|
|
||||||
matplotlib>=3.6
|
|
||||||
numpy>=1.2.0,<2
|
|
||||||
tqdm<5
|
|
||||||
|
|
||||||
#
|
|
||||||
# Extras
|
|
||||||
#
|
|
||||||
scikit-learn>=1.3,<1.4
|
|
||||||
xgboost>=0.90,<2
|
|
||||||
lightgbm>=2,<4
|
|
||||||
|
|
||||||
# PyTorch doesn't support Python 3.11 yet (pytorch/pytorch#86566)
|
|
||||||
|
|
||||||
# Elasticsearch uses v1.13.1 of PyTorch
|
|
||||||
torch>=1.13.1,<2.0; python_version<'3.11'
|
|
||||||
# Versions known to be compatible with PyTorch 1.13.1
|
|
||||||
sentence-transformers>=2.1.0,<=2.2.2; python_version<'3.11'
|
|
||||||
transformers[torch]>=4.31.0,<=4.33.2; python_version<'3.11'
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Testing
|
# Testing
|
||||||
@ -29,7 +10,6 @@ pytest>=5.2.1
|
|||||||
pytest-mock
|
pytest-mock
|
||||||
pytest-cov
|
pytest-cov
|
||||||
nbval
|
nbval
|
||||||
shap==0.43.0
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Docs
|
# Docs
|
||||||
|
@ -1,7 +0,0 @@
|
|||||||
#
|
|
||||||
# Basic requirements
|
|
||||||
#
|
|
||||||
elasticsearch>=8.3,<9
|
|
||||||
pandas>=1.5,<2
|
|
||||||
matplotlib>=3.6
|
|
||||||
numpy>=1.2.0,<2
|
|
23
setup.py
23
setup.py
@ -38,9 +38,10 @@ CLASSIFIERS = [
|
|||||||
"Programming Language :: Python",
|
"Programming Language :: Python",
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
"Programming Language :: Python :: 3 :: Only",
|
"Programming Language :: Python :: 3 :: Only",
|
||||||
"Programming Language :: Python :: 3.8",
|
|
||||||
"Programming Language :: Python :: 3.9",
|
"Programming Language :: Python :: 3.9",
|
||||||
"Programming Language :: Python :: 3.10",
|
"Programming Language :: Python :: 3.10",
|
||||||
|
"Programming Language :: Python :: 3.11",
|
||||||
|
"Programming Language :: Python :: 3.12",
|
||||||
"Topic :: Scientific/Engineering",
|
"Topic :: Scientific/Engineering",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -55,12 +56,16 @@ with open(path.join(here, "README.md"), "r", "utf-8") as f:
|
|||||||
|
|
||||||
extras = {
|
extras = {
|
||||||
"xgboost": ["xgboost>=0.90,<2"],
|
"xgboost": ["xgboost>=0.90,<2"],
|
||||||
"scikit-learn": ["scikit-learn>=1.3,<1.4"],
|
"scikit-learn": ["scikit-learn>=1.3,<1.6"],
|
||||||
"lightgbm": ["lightgbm>=2,<4"],
|
"lightgbm": ["lightgbm>=3,<5"],
|
||||||
"pytorch": [
|
"pytorch": [
|
||||||
"torch>=1.13.1,<2.0",
|
"requests<3",
|
||||||
"sentence-transformers>=2.1.0,<=2.2.2",
|
"torch==2.5.1",
|
||||||
"transformers[torch]>=4.31.0,<=4.33.2",
|
"tqdm",
|
||||||
|
"sentence-transformers>=5.0.0,<6.0.0",
|
||||||
|
# sentencepiece is a required dependency for the slow tokenizers
|
||||||
|
# https://huggingface.co/transformers/v4.4.2/migration.html#sentencepiece-is-removed-from-the-required-dependencies
|
||||||
|
"transformers[sentencepiece]>=4.47.0,<4.50.3",
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
extras["all"] = list({dep for deps in extras.values() for dep in deps})
|
extras["all"] = list({dep for deps in extras.values() for dep in deps})
|
||||||
@ -81,8 +86,8 @@ setup(
|
|||||||
keywords="elastic eland pandas python",
|
keywords="elastic eland pandas python",
|
||||||
packages=find_packages(include=["eland", "eland.*"]),
|
packages=find_packages(include=["eland", "eland.*"]),
|
||||||
install_requires=[
|
install_requires=[
|
||||||
"elasticsearch>=8.3,<9",
|
"elasticsearch>=9,<10",
|
||||||
"pandas>=1.5,<2",
|
"pandas>=1.5,<3",
|
||||||
"matplotlib>=3.6",
|
"matplotlib>=3.6",
|
||||||
"numpy>=1.2.0,<2",
|
"numpy>=1.2.0,<2",
|
||||||
"packaging",
|
"packaging",
|
||||||
@ -90,7 +95,7 @@ setup(
|
|||||||
entry_points={
|
entry_points={
|
||||||
"console_scripts": "eland_import_hub_model=eland.cli.eland_import_hub_model:main"
|
"console_scripts": "eland_import_hub_model=eland.cli.eland_import_hub_model:main"
|
||||||
},
|
},
|
||||||
python_requires=">=3.8",
|
python_requires=">=3.9,<3.13",
|
||||||
package_data={"eland": ["py.typed"]},
|
package_data={"eland": ["py.typed"]},
|
||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
|
@ -20,7 +20,7 @@ import os
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from elasticsearch import Elasticsearch
|
from elasticsearch import Elasticsearch
|
||||||
|
|
||||||
from eland.common import es_version
|
from eland.common import es_version, is_serverless_es
|
||||||
|
|
||||||
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
@ -33,6 +33,7 @@ ELASTICSEARCH_HOST = os.environ.get(
|
|||||||
ES_TEST_CLIENT = Elasticsearch(ELASTICSEARCH_HOST)
|
ES_TEST_CLIENT = Elasticsearch(ELASTICSEARCH_HOST)
|
||||||
|
|
||||||
ES_VERSION = es_version(ES_TEST_CLIENT)
|
ES_VERSION = es_version(ES_TEST_CLIENT)
|
||||||
|
ES_IS_SERVERLESS = is_serverless_es(ES_TEST_CLIENT)
|
||||||
|
|
||||||
FLIGHTS_INDEX_NAME = "flights"
|
FLIGHTS_INDEX_NAME = "flights"
|
||||||
FLIGHTS_MAPPING = {
|
FLIGHTS_MAPPING = {
|
||||||
@ -43,7 +44,7 @@ FLIGHTS_MAPPING = {
|
|||||||
"Carrier": {"type": "keyword"},
|
"Carrier": {"type": "keyword"},
|
||||||
"Dest": {"type": "keyword"},
|
"Dest": {"type": "keyword"},
|
||||||
"DestAirportID": {"type": "keyword"},
|
"DestAirportID": {"type": "keyword"},
|
||||||
"DestCityName": {"type": "keyword"},
|
"DestCityName": {"type": "keyword", "copy_to": "Cities"},
|
||||||
"DestCountry": {"type": "keyword"},
|
"DestCountry": {"type": "keyword"},
|
||||||
"DestLocation": {"type": "geo_point"},
|
"DestLocation": {"type": "geo_point"},
|
||||||
"DestRegion": {"type": "keyword"},
|
"DestRegion": {"type": "keyword"},
|
||||||
@ -58,11 +59,12 @@ FLIGHTS_MAPPING = {
|
|||||||
"FlightTimeMin": {"type": "float"},
|
"FlightTimeMin": {"type": "float"},
|
||||||
"Origin": {"type": "keyword"},
|
"Origin": {"type": "keyword"},
|
||||||
"OriginAirportID": {"type": "keyword"},
|
"OriginAirportID": {"type": "keyword"},
|
||||||
"OriginCityName": {"type": "keyword"},
|
"OriginCityName": {"type": "keyword", "copy_to": "Cities"},
|
||||||
"OriginCountry": {"type": "keyword"},
|
"OriginCountry": {"type": "keyword"},
|
||||||
"OriginLocation": {"type": "geo_point"},
|
"OriginLocation": {"type": "geo_point"},
|
||||||
"OriginRegion": {"type": "keyword"},
|
"OriginRegion": {"type": "keyword"},
|
||||||
"OriginWeather": {"type": "keyword"},
|
"OriginWeather": {"type": "keyword"},
|
||||||
|
"Cities": {"type": "text"},
|
||||||
"dayOfWeek": {"type": "byte"},
|
"dayOfWeek": {"type": "byte"},
|
||||||
"timestamp": {"type": "date", "format": "strict_date_hour_minute_second"},
|
"timestamp": {"type": "date", "format": "strict_date_hour_minute_second"},
|
||||||
}
|
}
|
||||||
|
@ -24,6 +24,7 @@ import pandas as pd
|
|||||||
from pandas.testing import assert_frame_equal, assert_series_equal
|
from pandas.testing import assert_frame_equal, assert_series_equal
|
||||||
|
|
||||||
import eland as ed
|
import eland as ed
|
||||||
|
from eland.common import PANDAS_VERSION
|
||||||
|
|
||||||
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
@ -45,7 +46,14 @@ with gzip.open(FLIGHTS_FILE_NAME) as f:
|
|||||||
_pd_flights = pd.DataFrame.from_records(flight_records).reindex(
|
_pd_flights = pd.DataFrame.from_records(flight_records).reindex(
|
||||||
_ed_flights.columns, axis=1
|
_ed_flights.columns, axis=1
|
||||||
)
|
)
|
||||||
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
|
if PANDAS_VERSION[0] >= 2:
|
||||||
|
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"], format="mixed")
|
||||||
|
else:
|
||||||
|
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
|
||||||
|
# Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list
|
||||||
|
_pd_flights["Cities"] = _pd_flights.apply(
|
||||||
|
lambda x: list(sorted([x["OriginCityName"], x["DestCityName"]])), axis=1
|
||||||
|
)
|
||||||
_pd_flights.index = _pd_flights.index.map(str) # make index 'object' not int
|
_pd_flights.index = _pd_flights.index.map(str) # make index 'object' not int
|
||||||
|
|
||||||
_pd_flights_small = _pd_flights.head(48)
|
_pd_flights_small = _pd_flights.head(48)
|
||||||
@ -58,7 +66,7 @@ _pd_ecommerce["products.created_on"] = _pd_ecommerce["products.created_on"].appl
|
|||||||
)
|
)
|
||||||
_pd_ecommerce.insert(2, "customer_birth_date", None)
|
_pd_ecommerce.insert(2, "customer_birth_date", None)
|
||||||
_pd_ecommerce.index = _pd_ecommerce.index.map(str) # make index 'object' not int
|
_pd_ecommerce.index = _pd_ecommerce.index.map(str) # make index 'object' not int
|
||||||
_pd_ecommerce["customer_birth_date"].astype("datetime64")
|
_pd_ecommerce["customer_birth_date"].astype("datetime64[ns]")
|
||||||
_ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)
|
_ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)
|
||||||
|
|
||||||
|
|
||||||
|
@ -77,7 +77,16 @@ class SymmetricAPIChecker:
|
|||||||
pd_exc = e
|
pd_exc = e
|
||||||
|
|
||||||
self.check_exception(ed_exc, pd_exc)
|
self.check_exception(ed_exc, pd_exc)
|
||||||
self.check_values(ed_obj, pd_obj)
|
try:
|
||||||
|
self.check_values(ed_obj, pd_obj)
|
||||||
|
except AssertionError as e:
|
||||||
|
# This is an attribute we allow to differ when comparing zero-length objects
|
||||||
|
if (
|
||||||
|
'Attribute "inferred_type" are different' in repr(e)
|
||||||
|
and len(ed_obj) == 0
|
||||||
|
and len(pd_obj) == 0
|
||||||
|
):
|
||||||
|
self.check_values(ed_obj, pd_obj, check_index_type=False)
|
||||||
|
|
||||||
if isinstance(ed_obj, (ed.DataFrame, ed.Series)):
|
if isinstance(ed_obj, (ed.DataFrame, ed.Series)):
|
||||||
return SymmetricAPIChecker(ed_obj, pd_obj)
|
return SymmetricAPIChecker(ed_obj, pd_obj)
|
||||||
@ -85,16 +94,16 @@ class SymmetricAPIChecker:
|
|||||||
|
|
||||||
return f
|
return f
|
||||||
|
|
||||||
def check_values(self, ed_obj, pd_obj):
|
def check_values(self, ed_obj, pd_obj, **kwargs):
|
||||||
"""Checks that any two values coming from eland and pandas are equal"""
|
"""Checks that any two values coming from eland and pandas are equal"""
|
||||||
if isinstance(ed_obj, ed.DataFrame):
|
if isinstance(ed_obj, ed.DataFrame):
|
||||||
assert_pandas_eland_frame_equal(pd_obj, ed_obj)
|
assert_pandas_eland_frame_equal(pd_obj, ed_obj, **kwargs)
|
||||||
elif isinstance(ed_obj, ed.Series):
|
elif isinstance(ed_obj, ed.Series):
|
||||||
assert_pandas_eland_series_equal(pd_obj, ed_obj)
|
assert_pandas_eland_series_equal(pd_obj, ed_obj, **kwargs)
|
||||||
elif isinstance(ed_obj, pd.DataFrame):
|
elif isinstance(ed_obj, pd.DataFrame):
|
||||||
assert_frame_equal(ed_obj, pd_obj)
|
assert_frame_equal(ed_obj, pd_obj, **kwargs)
|
||||||
elif isinstance(ed_obj, pd.Series):
|
elif isinstance(ed_obj, pd.Series):
|
||||||
assert_series_equal(ed_obj, pd_obj)
|
assert_series_equal(ed_obj, pd_obj, **kwargs)
|
||||||
elif isinstance(ed_obj, pd.Index):
|
elif isinstance(ed_obj, pd.Index):
|
||||||
assert ed_obj.equals(pd_obj)
|
assert ed_obj.equals(pd_obj)
|
||||||
else:
|
else:
|
||||||
|
@ -87,6 +87,8 @@ class TestDataFrameDateTime(TestData):
|
|||||||
},
|
},
|
||||||
index=["0", "1", "2"],
|
index=["0", "1", "2"],
|
||||||
)
|
)
|
||||||
|
# https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
|
||||||
|
df["D"] = df["D"].astype("datetime64[ns]")
|
||||||
|
|
||||||
expected_mappings = {
|
expected_mappings = {
|
||||||
"mappings": {
|
"mappings": {
|
||||||
|
@ -33,9 +33,17 @@ class TestDataFrameDescribe(TestData):
|
|||||||
["Cancelled", "FlightDelay"], axis="columns"
|
["Cancelled", "FlightDelay"], axis="columns"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Pandas >= 2 calculates aggregations such as min and max for timestamps too
|
||||||
|
# This could be implemented in eland, but as of yet this is not the case
|
||||||
|
# We therefore remove it before the comparison
|
||||||
|
if "timestamp" in pd_describe.columns:
|
||||||
|
pd_describe = pd_describe.drop(["timestamp"], axis="columns")
|
||||||
|
|
||||||
|
# Pandas >= 2 orders the aggregations differently than Pandas < 2
|
||||||
|
# A sort_index is applied so tests will succeed in both environments
|
||||||
assert_frame_equal(
|
assert_frame_equal(
|
||||||
pd_describe.drop(["25%", "50%", "75%"], axis="index"),
|
pd_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
|
||||||
ed_describe.drop(["25%", "50%", "75%"], axis="index"),
|
ed_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
|
||||||
check_exact=False,
|
check_exact=False,
|
||||||
rtol=True,
|
rtol=True,
|
||||||
)
|
)
|
||||||
|
@ -43,6 +43,7 @@ class TestDataFrameDtypes:
|
|||||||
"AvgTicketPrice": "float",
|
"AvgTicketPrice": "float",
|
||||||
"Cancelled": "boolean",
|
"Cancelled": "boolean",
|
||||||
"Carrier": "keyword",
|
"Carrier": "keyword",
|
||||||
|
"Cities": "text",
|
||||||
"Dest": "keyword",
|
"Dest": "keyword",
|
||||||
"DestAirportID": "keyword",
|
"DestAirportID": "keyword",
|
||||||
"DestCityName": "keyword",
|
"DestCityName": "keyword",
|
||||||
|
@ -99,7 +99,7 @@ class TestDataFrameHeadTail(TestData):
|
|||||||
|
|
||||||
ed_head_0 = ed_flights.head(0)
|
ed_head_0 = ed_flights.head(0)
|
||||||
pd_head_0 = pd_flights.head(0)
|
pd_head_0 = pd_flights.head(0)
|
||||||
assert_pandas_eland_frame_equal(pd_head_0, ed_head_0)
|
assert_pandas_eland_frame_equal(pd_head_0, ed_head_0, check_index_type=False)
|
||||||
|
|
||||||
def test_doc_test_tail(self):
|
def test_doc_test_tail(self):
|
||||||
df = self.ed_flights()
|
df = self.ed_flights()
|
||||||
|
@ -54,9 +54,13 @@ class TestDataFrameIterrowsItertuples(TestData):
|
|||||||
# Shim which uses pytest.approx() for floating point values inside tuples.
|
# Shim which uses pytest.approx() for floating point values inside tuples.
|
||||||
assert len(left) == len(right)
|
assert len(left) == len(right)
|
||||||
assert all(
|
assert all(
|
||||||
(lt == rt) # Not floats? Use ==
|
(
|
||||||
if not isinstance(lt, float) and not isinstance(rt, float)
|
# Not floats? Use ==
|
||||||
else (lt == pytest.approx(rt)) # If both are floats use pytest.approx()
|
(lt == rt)
|
||||||
|
if not isinstance(lt, float) and not isinstance(rt, float)
|
||||||
|
# If both are floats use pytest.approx()
|
||||||
|
else (lt == pytest.approx(rt))
|
||||||
|
)
|
||||||
for lt, rt in zip(left, right)
|
for lt, rt in zip(left, right)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -22,6 +22,7 @@ import pandas as pd
|
|||||||
import pytest
|
import pytest
|
||||||
from pandas.testing import assert_frame_equal, assert_series_equal
|
from pandas.testing import assert_frame_equal, assert_series_equal
|
||||||
|
|
||||||
|
from eland.common import PANDAS_VERSION
|
||||||
from tests.common import TestData, assert_almost_equal
|
from tests.common import TestData, assert_almost_equal
|
||||||
|
|
||||||
|
|
||||||
@ -74,6 +75,8 @@ class TestDataFrameMetrics(TestData):
|
|||||||
logger.setLevel(logging.DEBUG)
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
for func in self.extended_funcs:
|
for func in self.extended_funcs:
|
||||||
|
if PANDAS_VERSION[0] >= 2 and func == "mad":
|
||||||
|
continue
|
||||||
pd_metric = getattr(pd_flights, func)(
|
pd_metric = getattr(pd_flights, func)(
|
||||||
**({"numeric_only": True} if func != "mad" else {})
|
**({"numeric_only": True} if func != "mad" else {})
|
||||||
)
|
)
|
||||||
@ -92,6 +95,8 @@ class TestDataFrameMetrics(TestData):
|
|||||||
ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]
|
ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]
|
||||||
|
|
||||||
for func in self.extended_funcs:
|
for func in self.extended_funcs:
|
||||||
|
if PANDAS_VERSION[0] >= 2 and func == "mad":
|
||||||
|
continue
|
||||||
pd_metric = getattr(pd_flights_1, func)()
|
pd_metric = getattr(pd_flights_1, func)()
|
||||||
ed_metric = getattr(ed_flights_1, func)(numeric_only=False)
|
ed_metric = getattr(ed_flights_1, func)(numeric_only=False)
|
||||||
|
|
||||||
@ -102,6 +107,8 @@ class TestDataFrameMetrics(TestData):
|
|||||||
ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
|
ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
|
||||||
|
|
||||||
for func in self.extended_funcs:
|
for func in self.extended_funcs:
|
||||||
|
if PANDAS_VERSION[0] >= 2 and func == "mad":
|
||||||
|
continue
|
||||||
pd_metric = getattr(pd_flights_0, func)()
|
pd_metric = getattr(pd_flights_0, func)()
|
||||||
ed_metric = getattr(ed_flights_0, func)(numeric_only=False)
|
ed_metric = getattr(ed_flights_0, func)(numeric_only=False)
|
||||||
|
|
||||||
@ -491,8 +498,13 @@ class TestDataFrameMetrics(TestData):
|
|||||||
["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
|
["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
|
||||||
)
|
)
|
||||||
|
|
||||||
pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
|
if PANDAS_VERSION[0] == 1:
|
||||||
ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
|
pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
|
||||||
|
ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
|
||||||
|
|
||||||
|
else: # numeric_only is no longer available for pandas >= 2
|
||||||
|
pd_quantile = pd_flights.agg(["quantile", "min"])
|
||||||
|
ed_quantile = ed_flights.agg(["quantile", "min"])
|
||||||
|
|
||||||
assert_frame_equal(
|
assert_frame_equal(
|
||||||
pd_quantile, ed_quantile, check_exact=False, rtol=4, check_dtype=False
|
pd_quantile, ed_quantile, check_exact=False, rtol=4, check_dtype=False
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
# specific language governing permissions and limitations
|
# specific language governing permissions and limitations
|
||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
# File called _pytest for PyCharm compatability
|
# File called _pytest for PyCharm compatibility
|
||||||
|
|
||||||
import ast
|
import ast
|
||||||
import time
|
import time
|
||||||
@ -41,8 +41,9 @@ class TestDataFrameToCSV(TestData):
|
|||||||
results_file,
|
results_file,
|
||||||
index_col=0,
|
index_col=0,
|
||||||
converters={
|
converters={
|
||||||
"DestLocation": lambda x: ast.literal_eval(x),
|
"DestLocation": ast.literal_eval,
|
||||||
"OriginLocation": lambda x: ast.literal_eval(x),
|
"OriginLocation": ast.literal_eval,
|
||||||
|
"Cities": ast.literal_eval,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
pd_from_csv.index = pd_from_csv.index.map(str)
|
pd_from_csv.index = pd_from_csv.index.map(str)
|
||||||
@ -63,8 +64,9 @@ class TestDataFrameToCSV(TestData):
|
|||||||
results_file,
|
results_file,
|
||||||
index_col=0,
|
index_col=0,
|
||||||
converters={
|
converters={
|
||||||
"DestLocation": lambda x: ast.literal_eval(x),
|
"DestLocation": ast.literal_eval,
|
||||||
"OriginLocation": lambda x: ast.literal_eval(x),
|
"OriginLocation": ast.literal_eval,
|
||||||
|
"Cities": ast.literal_eval,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
pd_from_csv.index = pd_from_csv.index.map(str)
|
pd_from_csv.index = pd_from_csv.index.map(str)
|
||||||
@ -112,8 +114,9 @@ class TestDataFrameToCSV(TestData):
|
|||||||
results,
|
results,
|
||||||
index_col=0,
|
index_col=0,
|
||||||
converters={
|
converters={
|
||||||
"DestLocation": lambda x: ast.literal_eval(x),
|
"DestLocation": ast.literal_eval,
|
||||||
"OriginLocation": lambda x: ast.literal_eval(x),
|
"OriginLocation": ast.literal_eval,
|
||||||
|
"Cities": ast.literal_eval,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
pd_from_csv.index = pd_from_csv.index.map(str)
|
pd_from_csv.index = pd_from_csv.index.map(str)
|
||||||
|
139
tests/dataframe/test_to_json_pytest.py
Normal file
139
tests/dataframe/test_to_json_pytest.py
Normal file
@ -0,0 +1,139 @@
|
|||||||
|
# Licensed to Elasticsearch B.V. under one or more contributor
|
||||||
|
# license agreements. See the NOTICE file distributed with
|
||||||
|
# this work for additional information regarding copyright
|
||||||
|
# ownership. Elasticsearch B.V. licenses this file to you under
|
||||||
|
# the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing,
|
||||||
|
# software distributed under the License is distributed on an
|
||||||
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, either express or implied. See the License for the
|
||||||
|
# specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
# File called _pytest for PyCharm compatibility
|
||||||
|
|
||||||
|
from io import StringIO
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas
|
||||||
|
from pandas.testing import assert_frame_equal
|
||||||
|
|
||||||
|
from tests.common import ROOT_DIR, TestData
|
||||||
|
|
||||||
|
|
||||||
|
class TestDataFrameToJSON(TestData):
|
||||||
|
def test_to_json_default_arguments(self):
|
||||||
|
ed_flights = self.ed_flights()
|
||||||
|
pd_flights = self.pd_flights()
|
||||||
|
ed_flights.to_json(ROOT_DIR + "/dataframe/results/eland_to_json.jsonl")
|
||||||
|
pd_flights.to_json(ROOT_DIR + "/dataframe/results/pandas_to_json.jsonl")
|
||||||
|
|
||||||
|
assert_frame_equal(
|
||||||
|
pandas.read_json(ROOT_DIR + "/dataframe/results/eland_to_json.jsonl"),
|
||||||
|
pandas.read_json(ROOT_DIR + "/dataframe/results/pandas_to_json.jsonl"),
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_to_json_streaming_mode(self):
|
||||||
|
ed_flights = self.ed_flights()
|
||||||
|
pd_flights = self.pd_flights()
|
||||||
|
ed_flights.to_json(
|
||||||
|
ROOT_DIR + "/dataframe/results/streaming_eland_to_json.jsonl",
|
||||||
|
lines=True,
|
||||||
|
orient="records",
|
||||||
|
)
|
||||||
|
pd_flights.to_json(
|
||||||
|
ROOT_DIR + "/dataframe/results/streaming_pandas_to_json.jsonl",
|
||||||
|
lines=True,
|
||||||
|
orient="records",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert_frame_equal(
|
||||||
|
pandas.read_json(
|
||||||
|
ROOT_DIR + "/dataframe/results/streaming_eland_to_json.jsonl",
|
||||||
|
lines=True,
|
||||||
|
orient="records",
|
||||||
|
),
|
||||||
|
pandas.read_json(
|
||||||
|
ROOT_DIR + "/dataframe/results/streaming_pandas_to_json.jsonl",
|
||||||
|
lines=True,
|
||||||
|
orient="records",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_to_json_streaming_mode_pathlib(self):
|
||||||
|
root_dir = Path(ROOT_DIR)
|
||||||
|
|
||||||
|
ed_flights = self.ed_flights()
|
||||||
|
pd_flights = self.pd_flights()
|
||||||
|
ed_flights.to_json(
|
||||||
|
root_dir / "dataframe" / "results" / "pathlib_eland_to_json.jsonl",
|
||||||
|
lines=True,
|
||||||
|
orient="records",
|
||||||
|
)
|
||||||
|
pd_flights.to_json(
|
||||||
|
root_dir / "dataframe" / "results" / "pathlib_pandas_to_json.jsonl",
|
||||||
|
lines=True,
|
||||||
|
orient="records",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert_frame_equal(
|
||||||
|
pandas.read_json(
|
||||||
|
root_dir / "dataframe" / "results" / "pathlib_eland_to_json.jsonl",
|
||||||
|
lines=True,
|
||||||
|
orient="records",
|
||||||
|
),
|
||||||
|
pandas.read_json(
|
||||||
|
root_dir / "dataframe" / "results" / "pathlib_pandas_to_json.jsonl",
|
||||||
|
lines=True,
|
||||||
|
orient="records",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_to_json_with_other_buffer(self):
|
||||||
|
ed_flights = self.ed_flights()
|
||||||
|
pd_flights = self.pd_flights()
|
||||||
|
output_buffer = StringIO()
|
||||||
|
ed_flights.to_json(output_buffer, lines=True, orient="records")
|
||||||
|
output_string = pd_flights.to_json(lines=True, orient="records")
|
||||||
|
|
||||||
|
output_buffer.seek(0) # rewind our StringIO object
|
||||||
|
|
||||||
|
assert_frame_equal(
|
||||||
|
pandas.read_json(output_buffer, lines=True, orient="records"),
|
||||||
|
pandas.read_json(
|
||||||
|
StringIO(output_string),
|
||||||
|
lines=True,
|
||||||
|
orient="records",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_to_json_with_file_handle(self):
|
||||||
|
root_dir = Path(ROOT_DIR)
|
||||||
|
|
||||||
|
ed_flights = self.ed_flights()
|
||||||
|
pd_flights = self.pd_flights()
|
||||||
|
with open(
|
||||||
|
root_dir / "dataframe" / "results" / "fh_eland_to_json.jsonl", "w"
|
||||||
|
) as w:
|
||||||
|
ed_flights.to_json(w)
|
||||||
|
pd_flights.to_json(
|
||||||
|
root_dir / "dataframe" / "results" / "check_pandas_to_json.jsonl"
|
||||||
|
)
|
||||||
|
|
||||||
|
assert_frame_equal(
|
||||||
|
pandas.read_json(
|
||||||
|
ROOT_DIR + "/dataframe/results/fh_eland_to_json.jsonl",
|
||||||
|
lines=True,
|
||||||
|
orient="records",
|
||||||
|
),
|
||||||
|
pandas.read_json(
|
||||||
|
ROOT_DIR + "/dataframe/results/check_pandas_to_json.jsonl",
|
||||||
|
lines=True,
|
||||||
|
orient="records",
|
||||||
|
),
|
||||||
|
)
|
@ -69,6 +69,12 @@ class TestDataFrameUtils(TestData):
|
|||||||
)
|
)
|
||||||
ed_df_head = ed_df.head()
|
ed_df_head = ed_df.head()
|
||||||
|
|
||||||
|
# https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
|
||||||
|
df["D"] = df["D"].astype("datetime64[ns]")
|
||||||
|
df["H"] = (
|
||||||
|
df["H"].dt.tz_localize(None).astype("datetime64[ns]").dt.tz_localize("UTC")
|
||||||
|
)
|
||||||
|
|
||||||
assert_pandas_eland_frame_equal(df, ed_df_head)
|
assert_pandas_eland_frame_equal(df, ed_df_head)
|
||||||
|
|
||||||
ES_TEST_CLIENT.indices.delete(index=index_name)
|
ES_TEST_CLIENT.indices.delete(index=index_name)
|
||||||
|
@ -39,6 +39,7 @@ try:
|
|||||||
from eland.ml.pytorch import (
|
from eland.ml.pytorch import (
|
||||||
FillMaskInferenceOptions,
|
FillMaskInferenceOptions,
|
||||||
NlpBertTokenizationConfig,
|
NlpBertTokenizationConfig,
|
||||||
|
NlpDebertaV2TokenizationConfig,
|
||||||
NlpMPNetTokenizationConfig,
|
NlpMPNetTokenizationConfig,
|
||||||
NlpRobertaTokenizationConfig,
|
NlpRobertaTokenizationConfig,
|
||||||
NlpXLMRobertaTokenizationConfig,
|
NlpXLMRobertaTokenizationConfig,
|
||||||
@ -57,10 +58,6 @@ except ImportError:
|
|||||||
from tests import ES_VERSION
|
from tests import ES_VERSION
|
||||||
|
|
||||||
pytestmark = [
|
pytestmark = [
|
||||||
pytest.mark.skipif(
|
|
||||||
ES_VERSION < (8, 7, 0),
|
|
||||||
reason="Eland uses Pytorch 1.13.1, versions of Elasticsearch prior to 8.7.0 are incompatible with PyTorch 1.13.1",
|
|
||||||
),
|
|
||||||
pytest.mark.skipif(
|
pytest.mark.skipif(
|
||||||
not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
|
not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
|
||||||
),
|
),
|
||||||
@ -149,13 +146,20 @@ if HAS_PYTORCH and HAS_SKLEARN and HAS_TRANSFORMERS:
|
|||||||
1024,
|
1024,
|
||||||
None,
|
None,
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
"microsoft/deberta-v3-xsmall",
|
||||||
|
"fill_mask",
|
||||||
|
FillMaskInferenceOptions,
|
||||||
|
NlpDebertaV2TokenizationConfig,
|
||||||
|
512,
|
||||||
|
None,
|
||||||
|
),
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
MODEL_CONFIGURATIONS = []
|
MODEL_CONFIGURATIONS = []
|
||||||
|
|
||||||
|
|
||||||
class TestModelConfguration:
|
class TestModelConfguration:
|
||||||
@pytest.mark.skip(reason="https://github.com/elastic/eland/issues/633")
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"model_id,task_type,config_type,tokenizer_type,max_sequence_len,embedding_size",
|
"model_id,task_type,config_type,tokenizer_type,max_sequence_len,embedding_size",
|
||||||
MODEL_CONFIGURATIONS,
|
MODEL_CONFIGURATIONS,
|
||||||
@ -209,6 +213,9 @@ class TestModelConfguration:
|
|||||||
assert isinstance(config.inference_config.classification_labels, list)
|
assert isinstance(config.inference_config.classification_labels, list)
|
||||||
assert len(config.inference_config.classification_labels) > 0
|
assert len(config.inference_config.classification_labels) > 0
|
||||||
|
|
||||||
|
if task_type == "text_similarity":
|
||||||
|
assert tokenization.truncate == "second"
|
||||||
|
|
||||||
del tm
|
del tm
|
||||||
|
|
||||||
def test_model_config_with_prefix_string(self):
|
def test_model_config_with_prefix_string(self):
|
||||||
@ -235,3 +242,16 @@ class TestModelConfguration:
|
|||||||
ingest_prefix="INGEST:",
|
ingest_prefix="INGEST:",
|
||||||
search_prefix="SEARCH:",
|
search_prefix="SEARCH:",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_model_config_with_user_specified_input_length(self):
|
||||||
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||||
|
tm = TransformerModel(
|
||||||
|
model_id="sentence-transformers/all-distilroberta-v1",
|
||||||
|
task_type="text_embedding",
|
||||||
|
es_version=(8, 13, 0),
|
||||||
|
quantize=False,
|
||||||
|
max_model_input_size=213,
|
||||||
|
)
|
||||||
|
_, config, _ = tm.save(tmp_dir)
|
||||||
|
tokenization = config.inference_config.tokenization
|
||||||
|
assert tokenization.max_sequence_length == 213
|
||||||
|
@ -38,10 +38,6 @@ except ImportError:
|
|||||||
from tests import ES_TEST_CLIENT, ES_VERSION
|
from tests import ES_TEST_CLIENT, ES_VERSION
|
||||||
|
|
||||||
pytestmark = [
|
pytestmark = [
|
||||||
pytest.mark.skipif(
|
|
||||||
ES_VERSION < (8, 7, 0),
|
|
||||||
reason="Eland uses Pytorch 1.13.1, versions of Elasticsearch prior to 8.7.0 are incompatible with PyTorch 1.13.1",
|
|
||||||
),
|
|
||||||
pytest.mark.skipif(
|
pytest.mark.skipif(
|
||||||
not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
|
not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
|
||||||
),
|
),
|
||||||
@ -67,6 +63,10 @@ TEXT_EMBEDDING_MODELS = [
|
|||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
TEXT_SIMILARITY_MODELS = ["mixedbread-ai/mxbai-rerank-xsmall-v1"]
|
||||||
|
|
||||||
|
TEXT_EXPANSION_MODELS = ["naver/splade-v3-distilbert"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
@pytest.fixture(scope="function", autouse=True)
|
||||||
def setup_and_tear_down():
|
def setup_and_tear_down():
|
||||||
@ -135,3 +135,44 @@ class TestPytorchModel:
|
|||||||
)
|
)
|
||||||
> 0
|
> 0
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
ES_VERSION < (8, 16, 0), reason="requires 8.16.0 for DeBERTa models"
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize("model_id", TEXT_SIMILARITY_MODELS)
|
||||||
|
def test_text_similarity(self, model_id):
|
||||||
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||||
|
ptm = download_model_and_start_deployment(
|
||||||
|
tmp_dir, False, model_id, "text_similarity"
|
||||||
|
)
|
||||||
|
result = ptm.infer(
|
||||||
|
docs=[
|
||||||
|
{
|
||||||
|
"text_field": "The Amazon rainforest covers most of the Amazon basin in South America"
|
||||||
|
},
|
||||||
|
{"text_field": "Paris is the capital of France"},
|
||||||
|
],
|
||||||
|
inference_config={"text_similarity": {"text": "France"}},
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.body["inference_results"][0]["predicted_value"] < 0
|
||||||
|
assert result.body["inference_results"][1]["predicted_value"] > 0
|
||||||
|
|
||||||
|
@pytest.mark.skipif(ES_VERSION < (9, 0, 0), reason="requires current major version")
|
||||||
|
@pytest.mark.parametrize("model_id", TEXT_EXPANSION_MODELS)
|
||||||
|
def test_text_expansion(self, model_id):
|
||||||
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||||
|
ptm = download_model_and_start_deployment(
|
||||||
|
tmp_dir, False, model_id, "text_expansion"
|
||||||
|
)
|
||||||
|
result = ptm.infer(
|
||||||
|
docs=[
|
||||||
|
{
|
||||||
|
"text_field": "The Amazon rainforest covers most of the Amazon basin in South America"
|
||||||
|
},
|
||||||
|
{"text_field": "Paris is the capital of France"},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(result.body["inference_results"][0]["predicted_value"]) > 0
|
||||||
|
assert len(result.body["inference_results"][1]["predicted_value"]) > 0
|
||||||
|
@ -15,19 +15,18 @@
|
|||||||
# specific language governing permissions and limitations
|
# specific language governing permissions and limitations
|
||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
from operator import itemgetter
|
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import eland as ed
|
|
||||||
from eland.ml import MLModel
|
from eland.ml import MLModel
|
||||||
from eland.ml.ltr import LTRModelConfig, QueryFeatureExtractor
|
from eland.ml.ltr import FeatureLogger, LTRModelConfig, QueryFeatureExtractor
|
||||||
|
from eland.ml.transformers import get_model_transformer
|
||||||
from tests import (
|
from tests import (
|
||||||
|
ES_IS_SERVERLESS,
|
||||||
ES_TEST_CLIENT,
|
ES_TEST_CLIENT,
|
||||||
ES_VERSION,
|
ES_VERSION,
|
||||||
FLIGHTS_SMALL_INDEX_NAME,
|
|
||||||
NATIONAL_PARKS_INDEX_NAME,
|
NATIONAL_PARKS_INDEX_NAME,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -54,26 +53,16 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
HAS_LIGHTGBM = False
|
HAS_LIGHTGBM = False
|
||||||
|
|
||||||
try:
|
|
||||||
import shap
|
|
||||||
|
|
||||||
HAS_SHAP = True
|
|
||||||
except ImportError:
|
|
||||||
HAS_SHAP = False
|
|
||||||
|
|
||||||
|
|
||||||
requires_sklearn = pytest.mark.skipif(
|
requires_sklearn = pytest.mark.skipif(
|
||||||
not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run."
|
not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
|
||||||
)
|
)
|
||||||
requires_xgboost = pytest.mark.skipif(
|
requires_xgboost = pytest.mark.skipif(
|
||||||
not HAS_XGBOOST, reason="This test requires 'xgboost' package to run."
|
not HAS_XGBOOST, reason="This test requires 'xgboost' package to run"
|
||||||
)
|
|
||||||
requires_shap = pytest.mark.skipif(
|
|
||||||
not HAS_SHAP, reason="This tests requries 'shap' package to run."
|
|
||||||
)
|
)
|
||||||
requires_no_ml_extras = pytest.mark.skipif(
|
requires_no_ml_extras = pytest.mark.skipif(
|
||||||
HAS_SKLEARN or HAS_XGBOOST,
|
HAS_SKLEARN or HAS_XGBOOST,
|
||||||
reason="This test requires 'scikit-learn' and 'xgboost' to not be installed.",
|
reason="This test requires 'scikit-learn' and 'xgboost' to not be installed",
|
||||||
)
|
)
|
||||||
|
|
||||||
requires_lightgbm = pytest.mark.skipif(
|
requires_lightgbm = pytest.mark.skipif(
|
||||||
@ -107,100 +96,11 @@ def check_prediction_equality(es_model: MLModel, py_model, test_data):
|
|||||||
np.testing.assert_almost_equal(test_results, es_results, decimal=2)
|
np.testing.assert_almost_equal(test_results, es_results, decimal=2)
|
||||||
|
|
||||||
|
|
||||||
def yield_model_id(analysis, analyzed_fields):
|
def randomize_model_id(prefix, suffix_size=10):
|
||||||
import random
|
import random
|
||||||
import string
|
import string
|
||||||
import time
|
|
||||||
|
|
||||||
suffix = "".join(random.choices(string.ascii_lowercase, k=4))
|
return f"{prefix}-{''.join(random.choices(string.ascii_lowercase, k=suffix_size))}"
|
||||||
job_id = "test-flights-regression-" + suffix
|
|
||||||
dest = job_id + "-dest"
|
|
||||||
|
|
||||||
response = ES_TEST_CLIENT.ml.put_data_frame_analytics(
|
|
||||||
id=job_id,
|
|
||||||
analysis=analysis,
|
|
||||||
dest={"index": dest},
|
|
||||||
source={"index": [FLIGHTS_SMALL_INDEX_NAME]},
|
|
||||||
analyzed_fields=analyzed_fields,
|
|
||||||
)
|
|
||||||
assert response.meta.status == 200
|
|
||||||
response = ES_TEST_CLIENT.ml.start_data_frame_analytics(id=job_id)
|
|
||||||
assert response.meta.status == 200
|
|
||||||
|
|
||||||
time.sleep(2)
|
|
||||||
response = ES_TEST_CLIENT.ml.get_trained_models(model_id=job_id + "*")
|
|
||||||
assert response.meta.status == 200
|
|
||||||
assert response.body["count"] == 1
|
|
||||||
model_id = response.body["trained_model_configs"][0]["model_id"]
|
|
||||||
|
|
||||||
yield model_id
|
|
||||||
|
|
||||||
ES_TEST_CLIENT.ml.delete_data_frame_analytics(id=job_id)
|
|
||||||
ES_TEST_CLIENT.indices.delete(index=dest)
|
|
||||||
ES_TEST_CLIENT.ml.delete_trained_model(model_id=model_id)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(params=[[0, 4], [0, 1], range(5)])
|
|
||||||
def regression_model_id(request):
|
|
||||||
analysis = {
|
|
||||||
"regression": {
|
|
||||||
"dependent_variable": "FlightDelayMin",
|
|
||||||
"max_trees": 3,
|
|
||||||
"num_top_feature_importance_values": 0,
|
|
||||||
"max_optimization_rounds_per_hyperparameter": 1,
|
|
||||||
"prediction_field_name": "FlightDelayMin_prediction",
|
|
||||||
"training_percent": 30,
|
|
||||||
"randomize_seed": 1000,
|
|
||||||
"loss_function": "mse",
|
|
||||||
"early_stopping_enabled": True,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
all_includes = [
|
|
||||||
"FlightDelayMin",
|
|
||||||
"FlightDelayType",
|
|
||||||
"FlightTimeMin",
|
|
||||||
"DistanceMiles",
|
|
||||||
"OriginAirportID",
|
|
||||||
]
|
|
||||||
includes = [all_includes[i] for i in request.param]
|
|
||||||
analyzed_fields = {
|
|
||||||
"includes": includes,
|
|
||||||
"excludes": [],
|
|
||||||
}
|
|
||||||
yield from yield_model_id(analysis=analysis, analyzed_fields=analyzed_fields)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(params=[[0, 6], [5, 6], range(7)])
|
|
||||||
def classification_model_id(request):
|
|
||||||
analysis = {
|
|
||||||
"classification": {
|
|
||||||
"dependent_variable": "Cancelled",
|
|
||||||
"max_trees": 5,
|
|
||||||
"num_top_feature_importance_values": 0,
|
|
||||||
"max_optimization_rounds_per_hyperparameter": 1,
|
|
||||||
"prediction_field_name": "Cancelled_prediction",
|
|
||||||
"training_percent": 50,
|
|
||||||
"randomize_seed": 1000,
|
|
||||||
"num_top_classes": -1,
|
|
||||||
"class_assignment_objective": "maximize_accuracy",
|
|
||||||
"early_stopping_enabled": True,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
all_includes = [
|
|
||||||
"OriginWeather",
|
|
||||||
"OriginAirportID",
|
|
||||||
"DestCityName",
|
|
||||||
"DestWeather",
|
|
||||||
"DestRegion",
|
|
||||||
"AvgTicketPrice",
|
|
||||||
"Cancelled",
|
|
||||||
]
|
|
||||||
includes = [all_includes[i] for i in request.param]
|
|
||||||
analyzed_fields = {
|
|
||||||
"includes": includes,
|
|
||||||
"excludes": [],
|
|
||||||
}
|
|
||||||
yield from yield_model_id(analysis=analysis, analyzed_fields=analyzed_fields)
|
|
||||||
|
|
||||||
|
|
||||||
class TestMLModel:
|
class TestMLModel:
|
||||||
@ -320,17 +220,71 @@ class TestMLModel:
|
|||||||
# Clean up
|
# Clean up
|
||||||
es_model.delete_model()
|
es_model.delete_model()
|
||||||
|
|
||||||
|
def _normalize_ltr_score_from_XGBRanker(self, ranker, ltr_model_config, scores):
|
||||||
|
"""Normalize the scores of an XGBRanker model as the ES implementation of LTR would do.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
ranker : XGBRanker
|
||||||
|
The XGBRanker model to retrieve the minimum score from.
|
||||||
|
|
||||||
|
ltr_model_config : LTRModelConfig
|
||||||
|
LTR model config.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
scores : List[float]
|
||||||
|
Normalized scores for the model.
|
||||||
|
"""
|
||||||
|
|
||||||
|
should_rescore = (
|
||||||
|
(ES_VERSION[0] == 8 and ES_VERSION >= (8, 19))
|
||||||
|
or (
|
||||||
|
ES_VERSION[0] == 9
|
||||||
|
and (ES_VERSION[1] >= 1 or (ES_VERSION[1] == 0 and ES_VERSION[2] >= 1))
|
||||||
|
)
|
||||||
|
or ES_IS_SERVERLESS
|
||||||
|
)
|
||||||
|
|
||||||
|
if should_rescore:
|
||||||
|
# In 8.19+, 9.0.1 and 9.1, the scores are normalized if there are negative scores
|
||||||
|
min_model_score, _ = (
|
||||||
|
get_model_transformer(
|
||||||
|
ranker, feature_names=ltr_model_config.feature_names
|
||||||
|
)
|
||||||
|
.transform()
|
||||||
|
.bounds()
|
||||||
|
)
|
||||||
|
if min_model_score < 0:
|
||||||
|
scores = [score - min_model_score for score in scores]
|
||||||
|
|
||||||
|
return scores
|
||||||
|
|
||||||
@requires_elasticsearch_version((8, 12))
|
@requires_elasticsearch_version((8, 12))
|
||||||
@requires_sklearn
|
@requires_xgboost
|
||||||
@pytest.mark.parametrize("compress_model_definition", [True, False])
|
@pytest.mark.parametrize("compress_model_definition", [True, False])
|
||||||
def test_learning_to_rank(self, compress_model_definition):
|
@pytest.mark.parametrize(
|
||||||
# Train model
|
"objective",
|
||||||
training_data = datasets.make_regression(n_features=2)
|
["rank:ndcg", "rank:map", "rank:pairwise"],
|
||||||
regressor = DecisionTreeRegressor()
|
)
|
||||||
regressor.fit(training_data[0], training_data[1])
|
def test_learning_to_rank(self, objective, compress_model_definition):
|
||||||
|
X, y = datasets.make_classification(
|
||||||
|
n_features=3, n_informative=2, n_redundant=1
|
||||||
|
)
|
||||||
|
rng = np.random.default_rng()
|
||||||
|
qid = rng.integers(0, 3, size=X.shape[0])
|
||||||
|
|
||||||
|
# Sort the inputs based on query index
|
||||||
|
sorted_idx = np.argsort(qid)
|
||||||
|
X = X[sorted_idx, :]
|
||||||
|
y = y[sorted_idx]
|
||||||
|
qid = qid[sorted_idx]
|
||||||
|
|
||||||
|
ranker = XGBRanker(objective=objective)
|
||||||
|
ranker.fit(X, y, qid=qid)
|
||||||
|
|
||||||
# Serialise the models to Elasticsearch
|
# Serialise the models to Elasticsearch
|
||||||
model_id = "test_learning_to_rank"
|
model_id = randomize_model_id("test_learning_to_rank")
|
||||||
ltr_model_config = LTRModelConfig(
|
ltr_model_config = LTRModelConfig(
|
||||||
feature_extractors=[
|
feature_extractors=[
|
||||||
QueryFeatureExtractor(
|
QueryFeatureExtractor(
|
||||||
@ -356,9 +310,8 @@ class TestMLModel:
|
|||||||
es_model = MLModel.import_ltr_model(
|
es_model = MLModel.import_ltr_model(
|
||||||
ES_TEST_CLIENT,
|
ES_TEST_CLIENT,
|
||||||
model_id,
|
model_id,
|
||||||
regressor,
|
ranker,
|
||||||
ltr_model_config,
|
ltr_model_config,
|
||||||
es_if_exists="replace",
|
|
||||||
es_compress_model_definition=compress_model_definition,
|
es_compress_model_definition=compress_model_definition,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -366,9 +319,19 @@ class TestMLModel:
|
|||||||
response = ES_TEST_CLIENT.ml.get_trained_models(model_id=model_id)
|
response = ES_TEST_CLIENT.ml.get_trained_models(model_id=model_id)
|
||||||
assert response.meta.status == 200
|
assert response.meta.status == 200
|
||||||
assert response.body["count"] == 1
|
assert response.body["count"] == 1
|
||||||
saved_inference_config = response.body["trained_model_configs"][0][
|
|
||||||
"inference_config"
|
saved_trained_model_config = response.body["trained_model_configs"][0]
|
||||||
]
|
|
||||||
|
assert "input" in saved_trained_model_config
|
||||||
|
assert "field_names" in saved_trained_model_config["input"]
|
||||||
|
|
||||||
|
if not ES_IS_SERVERLESS and ES_VERSION < (8, 15):
|
||||||
|
assert len(saved_trained_model_config["input"]["field_names"]) == 3
|
||||||
|
else:
|
||||||
|
assert not len(saved_trained_model_config["input"]["field_names"])
|
||||||
|
|
||||||
|
saved_inference_config = saved_trained_model_config["inference_config"]
|
||||||
|
|
||||||
assert "learning_to_rank" in saved_inference_config
|
assert "learning_to_rank" in saved_inference_config
|
||||||
assert "feature_extractors" in saved_inference_config["learning_to_rank"]
|
assert "feature_extractors" in saved_inference_config["learning_to_rank"]
|
||||||
saved_feature_extractors = saved_inference_config["learning_to_rank"][
|
saved_feature_extractors = saved_inference_config["learning_to_rank"][
|
||||||
@ -388,16 +351,32 @@ class TestMLModel:
|
|||||||
"learning_to_rank": {
|
"learning_to_rank": {
|
||||||
"model_id": model_id,
|
"model_id": model_id,
|
||||||
"params": {"query_string": "yosemite"},
|
"params": {"query_string": "yosemite"},
|
||||||
}
|
},
|
||||||
|
"window_size": 2,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Assert that:
|
# Assert that rescored search result match predition.
|
||||||
# - all documents from the query are present
|
|
||||||
# - all documents have been rescored (score != 1.0)
|
|
||||||
doc_scores = [hit["_score"] for hit in search_result["hits"]["hits"]]
|
doc_scores = [hit["_score"] for hit in search_result["hits"]["hits"]]
|
||||||
assert len(search_result["hits"]["hits"]) == 2
|
|
||||||
assert all(score != float(1) for score in doc_scores)
|
feature_logger = FeatureLogger(
|
||||||
|
ES_TEST_CLIENT, NATIONAL_PARKS_INDEX_NAME, ltr_model_config
|
||||||
|
)
|
||||||
|
expected_scores = sorted(
|
||||||
|
[
|
||||||
|
ranker.predict(np.asarray([doc_features]))[0]
|
||||||
|
for _, doc_features in feature_logger.extract_features(
|
||||||
|
{"query_string": "yosemite"}, ["park_yosemite", "park_everglades"]
|
||||||
|
).items()
|
||||||
|
],
|
||||||
|
reverse=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
expected_scores = self._normalize_ltr_score_from_XGBRanker(
|
||||||
|
ranker, ltr_model_config, expected_scores
|
||||||
|
)
|
||||||
|
|
||||||
|
np.testing.assert_almost_equal(expected_scores, doc_scores, decimal=2)
|
||||||
|
|
||||||
# Verify prediction is not supported for LTR
|
# Verify prediction is not supported for LTR
|
||||||
try:
|
try:
|
||||||
@ -406,6 +385,9 @@ class TestMLModel:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
# Clean up
|
# Clean up
|
||||||
|
ES_TEST_CLIENT.cluster.health(
|
||||||
|
index=".ml-*", wait_for_active_shards="all"
|
||||||
|
) # Added to prevent flakiness in the test
|
||||||
es_model.delete_model()
|
es_model.delete_model()
|
||||||
|
|
||||||
@requires_sklearn
|
@requires_sklearn
|
||||||
@ -434,6 +416,7 @@ class TestMLModel:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Clean up
|
# Clean up
|
||||||
|
|
||||||
es_model.delete_model()
|
es_model.delete_model()
|
||||||
|
|
||||||
@requires_sklearn
|
@requires_sklearn
|
||||||
@ -744,172 +727,3 @@ class TestMLModel:
|
|||||||
|
|
||||||
# Clean up
|
# Clean up
|
||||||
es_model.delete_model()
|
es_model.delete_model()
|
||||||
|
|
||||||
@requires_sklearn
|
|
||||||
@requires_shap
|
|
||||||
def test_export_regressor(self, regression_model_id):
|
|
||||||
ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME).head(10)
|
|
||||||
types = dict(ed_flights.dtypes)
|
|
||||||
X = ed_flights.to_pandas().astype(types)
|
|
||||||
|
|
||||||
model = MLModel(es_client=ES_TEST_CLIENT, model_id=regression_model_id)
|
|
||||||
pipeline = model.export_model()
|
|
||||||
pipeline.fit(X)
|
|
||||||
|
|
||||||
predictions_sklearn = pipeline.predict(
|
|
||||||
X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
|
|
||||||
)
|
|
||||||
response = ES_TEST_CLIENT.ml.infer_trained_model(
|
|
||||||
model_id=regression_model_id,
|
|
||||||
docs=X[pipeline["es_model"].input_field_names].to_dict("records"),
|
|
||||||
)
|
|
||||||
predictions_es = np.array(
|
|
||||||
list(
|
|
||||||
map(
|
|
||||||
itemgetter("FlightDelayMin_prediction"),
|
|
||||||
response.body["inference_results"],
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
np.testing.assert_array_almost_equal(predictions_sklearn, predictions_es)
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
X_transformed = pipeline["preprocessor"].transform(X=X)
|
|
||||||
X_transformed = pd.DataFrame(
|
|
||||||
X_transformed, columns=pipeline["preprocessor"].get_feature_names_out()
|
|
||||||
)
|
|
||||||
explainer = shap.TreeExplainer(pipeline["es_model"])
|
|
||||||
shap_values = explainer.shap_values(
|
|
||||||
X_transformed[pipeline["es_model"].feature_names_in_]
|
|
||||||
)
|
|
||||||
np.testing.assert_array_almost_equal(
|
|
||||||
predictions_sklearn, shap_values.sum(axis=1) + explainer.expected_value
|
|
||||||
)
|
|
||||||
|
|
||||||
@requires_sklearn
|
|
||||||
def test_export_classification(self, classification_model_id):
|
|
||||||
ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME).head(10)
|
|
||||||
X = ed.eland_to_pandas(ed_flights)
|
|
||||||
|
|
||||||
model = MLModel(es_client=ES_TEST_CLIENT, model_id=classification_model_id)
|
|
||||||
pipeline = model.export_model()
|
|
||||||
pipeline.fit(X)
|
|
||||||
|
|
||||||
predictions_sklearn = pipeline.predict(
|
|
||||||
X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
|
|
||||||
)
|
|
||||||
prediction_proba_sklearn = pipeline.predict_proba(
|
|
||||||
X, feature_names_in=pipeline["preprocessor"].get_feature_names_out()
|
|
||||||
).max(axis=1)
|
|
||||||
|
|
||||||
response = ES_TEST_CLIENT.ml.infer_trained_model(
|
|
||||||
model_id=classification_model_id,
|
|
||||||
docs=X[pipeline["es_model"].input_field_names].to_dict("records"),
|
|
||||||
)
|
|
||||||
predictions_es = np.array(
|
|
||||||
list(
|
|
||||||
map(
|
|
||||||
lambda x: str(int(x["Cancelled_prediction"])),
|
|
||||||
response.body["inference_results"],
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
prediction_proba_es = np.array(
|
|
||||||
list(
|
|
||||||
map(
|
|
||||||
itemgetter("prediction_probability"),
|
|
||||||
response.body["inference_results"],
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
np.testing.assert_array_almost_equal(
|
|
||||||
prediction_proba_sklearn, prediction_proba_es
|
|
||||||
)
|
|
||||||
np.testing.assert_array_equal(predictions_sklearn, predictions_es)
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
X_transformed = pipeline["preprocessor"].transform(X=X)
|
|
||||||
X_transformed = pd.DataFrame(
|
|
||||||
X_transformed, columns=pipeline["preprocessor"].get_feature_names_out()
|
|
||||||
)
|
|
||||||
explainer = shap.TreeExplainer(pipeline["es_model"])
|
|
||||||
shap_values = explainer.shap_values(
|
|
||||||
X_transformed[pipeline["es_model"].feature_names_in_]
|
|
||||||
)
|
|
||||||
log_odds = shap_values.sum(axis=1) + explainer.expected_value
|
|
||||||
prediction_proba_shap = 1 / (1 + np.exp(-log_odds))
|
|
||||||
# use probability of the predicted class
|
|
||||||
prediction_proba_shap[prediction_proba_shap < 0.5] = (
|
|
||||||
1 - prediction_proba_shap[prediction_proba_shap < 0.5]
|
|
||||||
)
|
|
||||||
np.testing.assert_array_almost_equal(
|
|
||||||
prediction_proba_sklearn, prediction_proba_shap
|
|
||||||
)
|
|
||||||
|
|
||||||
@requires_xgboost
|
|
||||||
@requires_sklearn
|
|
||||||
@pytest.mark.parametrize("objective", ["binary:logistic", "reg:squarederror"])
|
|
||||||
def test_xgb_import_export(self, objective):
|
|
||||||
booster = "gbtree"
|
|
||||||
|
|
||||||
if objective.startswith("binary:"):
|
|
||||||
training_data = datasets.make_classification(n_features=5)
|
|
||||||
xgb_model = XGBClassifier(
|
|
||||||
booster=booster, objective=objective, use_label_encoder=False
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
training_data = datasets.make_regression(n_features=5)
|
|
||||||
xgb_model = XGBRegressor(
|
|
||||||
booster=booster, objective=objective, use_label_encoder=False
|
|
||||||
)
|
|
||||||
|
|
||||||
# Train model
|
|
||||||
xgb_model.fit(training_data[0], training_data[1])
|
|
||||||
|
|
||||||
# Serialise the models to Elasticsearch
|
|
||||||
feature_names = ["feature0", "feature1", "feature2", "feature3", "feature4"]
|
|
||||||
model_id = "test_xgb_model"
|
|
||||||
|
|
||||||
es_model = MLModel.import_model(
|
|
||||||
ES_TEST_CLIENT, model_id, xgb_model, feature_names, es_if_exists="replace"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Export suppose to fail
|
|
||||||
with pytest.raises(ValueError) as ex:
|
|
||||||
es_model.export_model()
|
|
||||||
assert ex.match("Error initializing sklearn classifier.")
|
|
||||||
|
|
||||||
# Clean up
|
|
||||||
es_model.delete_model()
|
|
||||||
|
|
||||||
@requires_lightgbm
|
|
||||||
@pytest.mark.parametrize("objective", ["regression", "binary"])
|
|
||||||
def test_lgbm_import_export(self, objective):
|
|
||||||
booster = "gbdt"
|
|
||||||
if objective == "binary":
|
|
||||||
training_data = datasets.make_classification(n_features=5)
|
|
||||||
lgbm_model = LGBMClassifier(boosting_type=booster, objective=objective)
|
|
||||||
else:
|
|
||||||
training_data = datasets.make_regression(n_features=5)
|
|
||||||
lgbm_model = LGBMRegressor(boosting_type=booster, objective=objective)
|
|
||||||
|
|
||||||
# Train model
|
|
||||||
lgbm_model.fit(training_data[0], training_data[1])
|
|
||||||
|
|
||||||
# Serialise the models to Elasticsearch
|
|
||||||
feature_names = ["feature0", "feature1", "feature2", "feature3", "feature4"]
|
|
||||||
model_id = "test_lgbm_model"
|
|
||||||
|
|
||||||
es_model = MLModel.import_model(
|
|
||||||
ES_TEST_CLIENT, model_id, lgbm_model, feature_names, es_if_exists="replace"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Export suppose to fail
|
|
||||||
with pytest.raises(ValueError) as ex:
|
|
||||||
es_model.export_model()
|
|
||||||
assert ex.match("Error initializing sklearn classifier.")
|
|
||||||
|
|
||||||
# Clean up
|
|
||||||
es_model.delete_model()
|
|
||||||
|
File diff suppressed because one or more lines are too long
@ -19,7 +19,7 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"False"
|
"HeadApiResponse(False)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 2,
|
"execution_count": 2,
|
||||||
@ -43,8 +43,8 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"2021-03-30 11:57:39.116425: read 10000 rows\n",
|
"2024-05-21 09:07:17.882569: read 10000 rows\n",
|
||||||
"2021-03-30 11:57:39.522722: read 13059 rows\n"
|
"2024-05-21 09:07:18.375305: read 13059 rows\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -78,6 +78,18 @@
|
|||||||
"execution_count": 5,
|
"execution_count": 5,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/home/codespace/.python/current/lib/python3.10/site-packages/eland/etl.py:529: FutureWarning: the 'mangle_dupe_cols' keyword is deprecated and will be removed in a future version. Please take steps to stop the use of 'mangle_dupe_cols'\n",
|
||||||
|
" reader = pd.read_csv(filepath_or_buffer, **kwargs)\n",
|
||||||
|
"/home/codespace/.python/current/lib/python3.10/site-packages/eland/etl.py:529: FutureWarning: The squeeze argument has been deprecated and will be removed in a future version. Append .squeeze(\"columns\") to the call to squeeze.\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" reader = pd.read_csv(filepath_or_buffer, **kwargs)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/html": [
|
"text/html": [
|
||||||
@ -218,35 +230,7 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"{'took': 0,\n",
|
"ObjectApiResponse({'took': 0, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'churn', '_id': '0', '_score': 1.0, '_source': {'state': 'KS', 'account length': 128, 'area code': 415, 'phone number': '382-4657', 'international plan': 'no', 'voice mail plan': 'yes', 'number vmail messages': 25, 'total day minutes': 265.1, 'total day calls': 110, 'total day charge': 45.07, 'total eve minutes': 197.4, 'total eve calls': 99, 'total eve charge': 16.78, 'total night minutes': 244.7, 'total night calls': 91, 'total night charge': 11.01, 'total intl minutes': 10.0, 'total intl calls': 3, 'total intl charge': 2.7, 'customer service calls': 1, 'churn': 0}}]}})"
|
||||||
" 'timed_out': False,\n",
|
|
||||||
" '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},\n",
|
|
||||||
" 'hits': {'total': {'value': 2, 'relation': 'eq'},\n",
|
|
||||||
" 'max_score': 1.0,\n",
|
|
||||||
" 'hits': [{'_index': 'churn',\n",
|
|
||||||
" '_id': '0',\n",
|
|
||||||
" '_score': 1.0,\n",
|
|
||||||
" '_source': {'state': 'KS',\n",
|
|
||||||
" 'account length': 128,\n",
|
|
||||||
" 'area code': 415,\n",
|
|
||||||
" 'phone number': '382-4657',\n",
|
|
||||||
" 'international plan': 'no',\n",
|
|
||||||
" 'voice mail plan': 'yes',\n",
|
|
||||||
" 'number vmail messages': 25,\n",
|
|
||||||
" 'total day minutes': 265.1,\n",
|
|
||||||
" 'total day calls': 110,\n",
|
|
||||||
" 'total day charge': 45.07,\n",
|
|
||||||
" 'total eve minutes': 197.4,\n",
|
|
||||||
" 'total eve calls': 99,\n",
|
|
||||||
" 'total eve charge': 16.78,\n",
|
|
||||||
" 'total night minutes': 244.7,\n",
|
|
||||||
" 'total night calls': 91,\n",
|
|
||||||
" 'total night charge': 11.01,\n",
|
|
||||||
" 'total intl minutes': 10.0,\n",
|
|
||||||
" 'total intl calls': 3,\n",
|
|
||||||
" 'total intl charge': 2.7,\n",
|
|
||||||
" 'customer service calls': 1,\n",
|
|
||||||
" 'churn': 0}}]}}"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 6,
|
"execution_count": 6,
|
||||||
@ -267,7 +251,7 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"{'acknowledged': True}"
|
"ObjectApiResponse({'acknowledged': True})"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 7,
|
"execution_count": 7,
|
||||||
@ -297,7 +281,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.5"
|
"version": "3.10.13"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -33,10 +33,10 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"AvgTicketPrice 640.387285\n",
|
"AvgTicketPrice 639.433214\n",
|
||||||
"Cancelled False\n",
|
"Cancelled False\n",
|
||||||
"dayOfWeek 3\n",
|
"dayOfWeek 2\n",
|
||||||
"timestamp 2018-01-21 23:43:19.256498944\n",
|
"timestamp 2018-01-21 20:23:15.159835648\n",
|
||||||
"dtype: object"
|
"dtype: object"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -58,9 +58,9 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"AvgTicketPrice 640.387285\n",
|
"AvgTicketPrice 639.433214\n",
|
||||||
"Cancelled 0.000000\n",
|
"Cancelled 0.000000\n",
|
||||||
"dayOfWeek 3.000000\n",
|
"dayOfWeek 2.935777\n",
|
||||||
"dtype: float64"
|
"dtype: float64"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -82,10 +82,10 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"AvgTicketPrice 640.387285\n",
|
"AvgTicketPrice 639.433214\n",
|
||||||
"Cancelled False\n",
|
"Cancelled False\n",
|
||||||
"dayOfWeek 3\n",
|
"dayOfWeek 2\n",
|
||||||
"timestamp 2018-01-21 23:43:19.256498944\n",
|
"timestamp 2018-01-21 20:23:15.159835648\n",
|
||||||
"DestCountry NaN\n",
|
"DestCountry NaN\n",
|
||||||
"dtype: object"
|
"dtype: object"
|
||||||
]
|
]
|
||||||
@ -108,7 +108,7 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"AvgTicketPrice 213.430365\n",
|
"AvgTicketPrice 213.453156\n",
|
||||||
"dayOfWeek 2.000000\n",
|
"dayOfWeek 2.000000\n",
|
||||||
"dtype: float64"
|
"dtype: float64"
|
||||||
]
|
]
|
||||||
@ -131,7 +131,7 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"AvgTicketPrice 213.430365\n",
|
"AvgTicketPrice 213.453156\n",
|
||||||
"dayOfWeek 2.000000\n",
|
"dayOfWeek 2.000000\n",
|
||||||
"dtype: float64"
|
"dtype: float64"
|
||||||
]
|
]
|
||||||
@ -154,7 +154,7 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"AvgTicketPrice 213.430365\n",
|
"AvgTicketPrice 213.453156\n",
|
||||||
"Cancelled NaN\n",
|
"Cancelled NaN\n",
|
||||||
"dayOfWeek 2.0\n",
|
"dayOfWeek 2.0\n",
|
||||||
"timestamp NaT\n",
|
"timestamp NaT\n",
|
||||||
@ -189,7 +189,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.5"
|
"version": "3.10.13"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
File diff suppressed because one or more lines are too long
@ -58,7 +58,9 @@ class TestSeriesFilter(TestData):
|
|||||||
ed_ser = ed_flights_small.filter(items=items, axis=0)
|
ed_ser = ed_flights_small.filter(items=items, axis=0)
|
||||||
pd_ser = pd_flights_small.filter(items=items, axis=0)
|
pd_ser = pd_flights_small.filter(items=items, axis=0)
|
||||||
|
|
||||||
assert_pandas_eland_series_equal(pd_ser, ed_ser)
|
# For an empty Series, eland will say the datatype it knows from the Elastic index
|
||||||
|
# Pandas however will state empty as the datatype
|
||||||
|
assert_pandas_eland_series_equal(pd_ser, ed_ser, check_index_type=False)
|
||||||
|
|
||||||
def test_flights_filter_index_like_and_regex(self):
|
def test_flights_filter_index_like_and_regex(self):
|
||||||
ed_flights_small = self.ed_flights_small()["FlightDelayType"]
|
ed_flights_small = self.ed_flights_small()["FlightDelayType"]
|
||||||
|
@ -24,6 +24,7 @@ import pandas as pd
|
|||||||
import pytest
|
import pytest
|
||||||
from pandas.testing import assert_series_equal
|
from pandas.testing import assert_series_equal
|
||||||
|
|
||||||
|
from eland.common import PANDAS_VERSION
|
||||||
from tests.common import TestData, assert_almost_equal
|
from tests.common import TestData, assert_almost_equal
|
||||||
|
|
||||||
|
|
||||||
@ -42,6 +43,8 @@ class TestSeriesMetrics(TestData):
|
|||||||
ed_flights = self.ed_flights()["AvgTicketPrice"]
|
ed_flights = self.ed_flights()["AvgTicketPrice"]
|
||||||
|
|
||||||
for func in self.all_funcs:
|
for func in self.all_funcs:
|
||||||
|
if PANDAS_VERSION[0] >= 2 and func == "mad":
|
||||||
|
continue
|
||||||
pd_metric = getattr(pd_flights, func)()
|
pd_metric = getattr(pd_flights, func)()
|
||||||
ed_metric = getattr(ed_flights, func)()
|
ed_metric = getattr(ed_flights, func)()
|
||||||
|
|
||||||
@ -87,6 +90,8 @@ class TestSeriesMetrics(TestData):
|
|||||||
ed_ecommerce = self.ed_ecommerce()[column]
|
ed_ecommerce = self.ed_ecommerce()[column]
|
||||||
|
|
||||||
for func in self.all_funcs:
|
for func in self.all_funcs:
|
||||||
|
if PANDAS_VERSION[0] >= 2 and func == "mad":
|
||||||
|
continue
|
||||||
pd_metric = getattr(pd_ecommerce, func)()
|
pd_metric = getattr(pd_ecommerce, func)()
|
||||||
ed_metric = getattr(ed_ecommerce, func)(
|
ed_metric = getattr(ed_ecommerce, func)(
|
||||||
**({"numeric_only": True} if (func != "nunique") else {})
|
**({"numeric_only": True} if (func != "nunique") else {})
|
||||||
|
@ -65,7 +65,7 @@ def find_files_to_fix(sources: List[str]) -> Iterator[str]:
|
|||||||
def does_file_need_fix(filepath: str) -> bool:
|
def does_file_need_fix(filepath: str) -> bool:
|
||||||
if not filepath.endswith(".py"):
|
if not filepath.endswith(".py"):
|
||||||
return False
|
return False
|
||||||
with open(filepath, mode="r") as f:
|
with open(filepath) as f:
|
||||||
first_license_line = None
|
first_license_line = None
|
||||||
for line in f:
|
for line in f:
|
||||||
if line == license_header_lines[0]:
|
if line == license_header_lines[0]:
|
||||||
@ -82,7 +82,7 @@ def does_file_need_fix(filepath: str) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
def add_header_to_file(filepath: str) -> None:
|
def add_header_to_file(filepath: str) -> None:
|
||||||
with open(filepath, mode="r") as f:
|
with open(filepath) as f:
|
||||||
lines = list(f)
|
lines = list(f)
|
||||||
i = 0
|
i = 0
|
||||||
for i, line in enumerate(lines):
|
for i, line in enumerate(lines):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user