diff --git a/.github/workflows/comment-on-asciidoc-changes.yml b/.github/workflows/comment-on-asciidoc-changes.yml deleted file mode 100644 index 8e5f836..0000000 --- a/.github/workflows/comment-on-asciidoc-changes.yml +++ /dev/null @@ -1,21 +0,0 @@ ---- -name: Comment on PR for .asciidoc changes - -on: - # We need to use pull_request_target to be able to comment on PRs from forks - pull_request_target: - types: - - synchronize - - opened - - reopened - branches: - - main - - master - - "9.0" - -jobs: - comment-on-asciidoc-change: - permissions: - contents: read - pull-requests: write - uses: elastic/docs-builder/.github/workflows/comment-on-asciidoc-changes.yml@main diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml new file mode 100644 index 0000000..bb46616 --- /dev/null +++ b/.github/workflows/docs-build.yml @@ -0,0 +1,19 @@ +name: docs-build + +on: + push: + branches: + - main + pull_request_target: ~ + merge_group: ~ + +jobs: + docs-preview: + uses: elastic/docs-builder/.github/workflows/preview-build.yml@main + with: + path-pattern: docs/** + permissions: + deployments: write + id-token: write + contents: read + pull-requests: read diff --git a/.github/workflows/docs-cleanup.yml b/.github/workflows/docs-cleanup.yml new file mode 100644 index 0000000..f83e017 --- /dev/null +++ b/.github/workflows/docs-cleanup.yml @@ -0,0 +1,14 @@ +name: docs-cleanup + +on: + pull_request_target: + types: + - closed + +jobs: + docs-preview: + uses: elastic/docs-builder/.github/workflows/preview-cleanup.yml@main + permissions: + contents: none + id-token: write + deployments: write diff --git a/docs/docset.yml b/docs/docset.yml new file mode 100644 index 0000000..782b57b --- /dev/null +++ b/docs/docset.yml @@ -0,0 +1,8 @@ +project: 'Eland Python client' +cross_links: + - docs-content +toc: + - toc: reference +subs: + es: "Elasticsearch" + ml: "machine learning" diff --git a/docs/guide/index.asciidoc b/docs/guide/index.asciidoc deleted file mode 100644 index f2a8029..0000000 --- a/docs/guide/index.asciidoc +++ /dev/null @@ -1,14 +0,0 @@ -= Eland Python Client - -:doctype: book - -include::{asciidoc-dir}/../../shared/versions/stack/{source_branch}.asciidoc[] -include::{asciidoc-dir}/../../shared/attributes.asciidoc[] - -include::overview.asciidoc[] - -include::installation.asciidoc[] - -include::dataframes.asciidoc[] - -include::machine-learning.asciidoc[] diff --git a/docs/guide/installation.asciidoc b/docs/guide/installation.asciidoc deleted file mode 100644 index 69763cf..0000000 --- a/docs/guide/installation.asciidoc +++ /dev/null @@ -1,16 +0,0 @@ -[[installation]] -== Installation - -Eland can be installed with https://pip.pypa.io[pip] from https://pypi.org/project/eland[PyPI]. 
We recommend https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/[using a virtual environment] when installing with pip: - -[source,sh] ------------------------------ -$ python -m pip install eland ------------------------------ - -Alternatively, Eland can be installed with https://docs.conda.io[Conda] from https://anaconda.org/conda-forge/eland[Conda Forge]: - -[source,sh] ------------------------------------- -$ conda install -c conda-forge eland ------------------------------------- diff --git a/docs/guide/dataframes.asciidoc b/docs/reference/dataframes.md similarity index 69% rename from docs/guide/dataframes.asciidoc rename to docs/reference/dataframes.md index 616a31d..d8560bd 100644 --- a/docs/guide/dataframes.asciidoc +++ b/docs/reference/dataframes.md @@ -1,16 +1,16 @@ -[[dataframes]] -== Data Frames +--- +mapped_pages: + - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/dataframes.html +--- -`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API -and defers all processing and filtering of data to Elasticsearch -instead of your local machine. This means you can process large -amounts of data within Elasticsearch from a Jupyter Notebook -without overloading your machine. +# Data Frames [dataframes] -[source,python] -------------------------------------- +`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API and defers all processing and filtering of data to Elasticsearch instead of your local machine. This means you can process large amounts of data within Elasticsearch from a Jupyter Notebook without overloading your machine. + +```python >>> import eland as ed ->>> # Connect to 'flights' index via localhost Elasticsearch node +>>> +# Connect to 'flights' index via localhost Elasticsearch node >>> df = ed.DataFrame('http://localhost:9200', 'flights') # eland.DataFrame instance has the same API as pandas.DataFrame @@ -29,14 +29,14 @@ without overloading your machine. Index: 13059 entries, 0 to 13058 Data columns (total 27 columns): - # Column Non-Null Count Dtype ---- ------ -------------- ----- - 0 AvgTicketPrice 13059 non-null float64 - 1 Cancelled 13059 non-null bool - 2 Carrier 13059 non-null object -... - 24 OriginWeather 13059 non-null object - 25 dayOfWeek 13059 non-null int64 + # Column Non-Null Count Dtype +--- ------ -------------- ----- + 0 AvgTicketPrice 13059 non-null float64 + 1 Cancelled 13059 non-null bool + 2 Carrier 13059 non-null object +... + 24 OriginWeather 13059 non-null object + 25 dayOfWeek 13059 non-null int64 26 timestamp 13059 non-null datetime64[ns] dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17) memory usage: 80.0 bytes @@ -59,4 +59,5 @@ Elasticsearch storage usage: 5.043 MB sum 9.261629e+07 8.204365e+06 min 0.000000e+00 1.000205e+02 std 4.578263e+03 2.663867e+02 -------------------------------------- +``` + diff --git a/docs/guide/overview.asciidoc b/docs/reference/index.md similarity index 67% rename from docs/guide/overview.asciidoc rename to docs/reference/index.md index 49c32a0..be902ae 100644 --- a/docs/guide/overview.asciidoc +++ b/docs/reference/index.md @@ -1,33 +1,36 @@ -[[overview]] -== Overview +--- +mapped_pages: + - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/index.html + - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/overview.html +navigation_title: Eland +--- -Eland is a Python client and toolkit for DataFrames and {ml} in {es}. 
-Full documentation is available on https://eland.readthedocs.io[Read the Docs]. -Source code is available on https://github.com/elastic/eland[GitHub]. +# Eland Python client [overview] -[discrete] -=== Compatibility +Eland is a Python client and toolkit for DataFrames and {{ml}} in {{es}}. Full documentation is available on [Read the Docs](https://eland.readthedocs.io). Source code is available on [GitHub](https://github.com/elastic/eland). -- Supports Python 3.9+ and Pandas 1.5 -- Supports {es} 8+ clusters, recommended 8.16 or later for all features to work. - Make sure your Eland major version matches the major version of your Elasticsearch cluster. -The recommended way to set your requirements in your `setup.py` or -`requirements.txt` is:: +## Compatibility [_compatibility] - # Elasticsearch 8.x - eland>=8,<9 +* Supports Python 3.9+ and Pandas 1.5 +* Supports {{es}} 8+ clusters, recommended 8.16 or later for all features to work. Make sure your Eland major version matches the major version of your Elasticsearch cluster. - # Elasticsearch 7.x - eland>=7,<8 +The recommended way to set your requirements in your `setup.py` or `requirements.txt` is:: -[discrete] -=== Getting Started +``` +# Elasticsearch 8.x +eland>=8,<9 +``` +``` +# Elasticsearch 7.x +eland>=7,<8 +``` -Create a `DataFrame` object connected to an {es} cluster running on `http://localhost:9200`: +## Getting Started [_getting_started] -[source,python] ------------------------------------- +Create a `DataFrame` object connected to an {{es}} cluster running on `http://localhost:9200`: + +```python >>> import eland as ed >>> df = ed.DataFrame( ... es_client="http://localhost:9200", @@ -48,15 +51,14 @@ Create a `DataFrame` object connected to an {es} cluster running on `http://loca 13058 858.144337 False ... 6 2018-02-11 14:54:34 [13059 rows x 27 columns] ------------------------------------- +``` -[discrete] -==== Elastic Cloud + +### Elastic Cloud [_elastic_cloud] You can also connect Eland to an Elasticsearch instance in Elastic Cloud: -[source,python] ------------------------------------- +```python >>> import eland as ed >>> from elasticsearch import Elasticsearch @@ -73,16 +75,16 @@ You can also connect Eland to an Elasticsearch instance in Elastic Cloud: 3 181.694216 True ... 0 2018-01-01 10:33:28 4 730.041778 False ... 0 2018-01-01 05:13:00 [5 rows x 27 columns] ------------------------------------- +``` Eland can be used for complex queries and aggregations: -[source,python] ------------------------------------- +```python >>> df[df.Carrier != "Kibana Airlines"].groupby("Carrier").mean(numeric_only=False) AvgTicketPrice Cancelled timestamp -Carrier +Carrier ES-Air 630.235816 0.129814 2018-01-21 20:45:00.200000000 JetBeats 627.457373 0.134698 2018-01-21 14:43:18.112400635 Logstash Airways 624.581974 0.125188 2018-01-21 16:14:50.711798340 ------------------------------------- +``` + diff --git a/docs/reference/installation.md b/docs/reference/installation.md new file mode 100644 index 0000000..550eec3 --- /dev/null +++ b/docs/reference/installation.md @@ -0,0 +1,19 @@ +--- +mapped_pages: + - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/installation.html +--- + +# Installation [installation] + +Eland can be installed with [pip](https://pip.pypa.io) from [PyPI](https://pypi.org/project/eland). 
We recommend [using a virtual environment](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) when installing with pip:
+
+```sh
+$ python -m pip install eland
+```
+
+Alternatively, Eland can be installed with [Conda](https://docs.conda.io) from [Conda Forge](https://anaconda.org/conda-forge/eland):
+
+```sh
+$ conda install -c conda-forge eland
+```
+
diff --git a/docs/reference/machine-learning.md b/docs/reference/machine-learning.md
new file mode 100644
index 0000000..1e69e88
--- /dev/null
+++ b/docs/reference/machine-learning.md
@@ -0,0 +1,199 @@
+---
+mapped_pages:
+  - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/machine-learning.html
+---
+
+# Machine Learning [machine-learning]
+
+
+## Trained models [ml-trained-models]
+
+Eland allows transforming *some*
+[trained models](https://eland.readthedocs.io/en/latest/reference/api/eland.ml.MLModel.import_model.html#parameters) from scikit-learn, XGBoost,
+and LightGBM libraries to be serialized and used as an inference model in {{es}}.
+
+```python
+>>> from xgboost import XGBClassifier
+>>> from eland.ml import MLModel
+
+# Train and exercise an XGBoost ML model locally
+>>> xgb_model = XGBClassifier(booster="gbtree")
+>>> xgb_model.fit(training_data[0], training_data[1])
+
+>>> xgb_model.predict(training_data[0])
+[0 1 1 0 1 0 0 0 1 0]
+
+# Import the model into Elasticsearch
+>>> es_model = MLModel.import_model(
+    es_client="http://localhost:9200",
+    model_id="xgb-classifier",
+    model=xgb_model,
+    feature_names=["f0", "f1", "f2", "f3", "f4"],
+)
+
+# Exercise the ML model in Elasticsearch with the training data
+>>> es_model.predict(training_data[0])
+[0 1 1 0 1 0 0 0 1 0]
+```
+
+
+## Natural language processing (NLP) with PyTorch [ml-nlp-pytorch]
+
+::::{important}
+You need to install the appropriate version of PyTorch to import an NLP model. Run `python -m pip install 'eland[pytorch]'` to install that version.
+::::
+
+
+For NLP tasks, Eland enables you to import PyTorch models into {{es}}. Use the `eland_import_hub_model` script to download and install supported [transformer models](https://huggingface.co/transformers) from the [Hugging Face model hub](https://huggingface.co/models). For example:
+
+```bash
+$ eland_import_hub_model \ <1>
+  --url http://localhost:9200/ \ <2>
+  --hub-model-id elastic/distilbert-base-cased-finetuned-conll03-english \ <3>
+  --task-type ner \ <4>
+  --start
+```
+
+1. Use an authentication method to access your cluster. Refer to [Authentication methods](machine-learning.md#ml-nlp-pytorch-auth).
+2. The cluster URL. Alternatively, use `--cloud-id`.
+3. Specify the identifier for the model in the Hugging Face model hub.
+4. Specify the type of NLP task. Supported values are `fill_mask`, `ner`, `question_answering`, `text_classification`, `text_embedding`, `text_expansion`, `text_similarity` and `zero_shot_classification`.
+
+
+For more information about the available options, run `eland_import_hub_model` with the `--help` option.
+
+```bash
+$ eland_import_hub_model --help
+```
+
+
+### Import model with Docker [ml-nlp-pytorch-docker]
+
+::::{important}
+To use the Docker container, you need to clone the Eland repository: [https://github.com/elastic/eland](https://github.com/elastic/eland)
+::::
+
+
+If you want to use Eland without installing it, you can use the Docker image.
+
+You can use the container interactively:
+
+```bash
+$ docker run -it --rm --network host docker.elastic.co/eland/eland
+```
+
+Running installed scripts is also possible without an interactive shell, for example:
+
+```bash
+docker run -it --rm docker.elastic.co/eland/eland \
+    eland_import_hub_model \
+    --url $ELASTICSEARCH_URL \
+    --hub-model-id elastic/distilbert-base-uncased-finetuned-conll03-english \
+    --start
+```
+
+Replace the `$ELASTICSEARCH_URL` with the URL for your Elasticsearch cluster. For authentication purposes, include an administrator username and password in the URL in the following format: `https://username:password@host:port`.
+
+
+### Install models in an air-gapped environment [ml-nlp-pytorch-air-gapped]
+
+You can install models in a restricted or closed network by pointing the `eland_import_hub_model` script to local files.
+
+For an offline install of a Hugging Face model, the model first needs to be cloned locally. Git and [Git Large File Storage](https://git-lfs.com/) must be installed on your system.
+
+1. Select a model you want to use from Hugging Face. Refer to the [compatible third party model](docs-content://explore-analyze/machine-learning/nlp/ml-nlp-model-ref.md) list for more information on the supported architectures.
+2. Clone the selected model from Hugging Face by using the model URL. For example:
+
+    ```bash
+    git clone https://huggingface.co/dslim/bert-base-NER
+    ```
+
+    This command results in a local copy of the model in the directory `bert-base-NER`.
+
+3. Use the `eland_import_hub_model` script with the `--hub-model-id` set to the directory of the cloned model to install it:
+
+    ```bash
+    eland_import_hub_model \
+      --url 'XXXX' \
+      --hub-model-id /PATH/TO/MODEL \
+      --task-type ner \
+      --es-username elastic --es-password XXX \
+      --es-model-id bert-base-ner
+    ```
+
+    If you use the Docker image to run `eland_import_hub_model`, you must bind mount the model directory so the container can read the files:
+
+    ```bash
+    docker run --mount type=bind,source=/PATH/TO/MODEL,destination=/model,readonly -it --rm docker.elastic.co/eland/eland \
+      eland_import_hub_model \
+      --url 'XXXX' \
+      --hub-model-id /model \
+      --task-type ner \
+      --es-username elastic --es-password XXX \
+      --es-model-id bert-base-ner
+    ```
+
+    Once it’s uploaded to {{es}}, the model will have the ID specified by `--es-model-id`. If it is not set, the model ID is derived from `--hub-model-id`; spaces and path delimiters are converted to double underscores `__`.
+
+
+
+### Connect to Elasticsearch through a proxy [ml-nlp-pytorch-proxy]
+
+Behind the scenes, Eland uses the `requests` Python library, which [allows configuring proxies through an environment variable](https://requests.readthedocs.io/en/latest/user/advanced/#proxies). For example, to use an HTTP proxy to connect to an HTTPS Elasticsearch cluster, you need to set the `HTTPS_PROXY` environment variable when invoking Eland:
+
+```bash
+HTTPS_PROXY=http://proxy-host:proxy-port eland_import_hub_model ...
+```
+
+If you disabled security on your Elasticsearch cluster, you should use `HTTP_PROXY` instead.
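+
+For example, to reach a cluster without TLS through the same proxy (this reuses the `proxy-host:proxy-port` placeholder from the example above):
+
+```bash
+HTTP_PROXY=http://proxy-host:proxy-port eland_import_hub_model ...
+```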
+
+
+### Authentication methods [ml-nlp-pytorch-auth]
+
+The following authentication options are available when using the import script:
+
+* Elasticsearch username and password authentication (specified with the `-u` and `-p` options):
+
+    ```bash
+    eland_import_hub_model -u <username> -p <password> --cloud-id <cloud id> ...
+    ```
+
+    These `-u` and `-p` options also work when you use `--url`.
+
+* Elasticsearch username and password authentication (embedded in the URL):
+
+    ```bash
+    eland_import_hub_model --url https://<user>:<password>@<hostname>:<port> ...
+    ```
+
+* Elasticsearch API key authentication:
+
+    ```bash
+    eland_import_hub_model --es-api-key <api key> --url https://<hostname>:<port> ...
+    ```
+
+* HuggingFace Hub access token (for private models):
+
+    ```bash
+    eland_import_hub_model --hub-access-token <access token> ...
+    ```
+
+
+
+### TLS/SSL [ml-nlp-pytorch-tls]
+
+The following TLS/SSL options for Elasticsearch are available when using the import script:
+
+* Specify an alternate CA bundle to verify the cluster certificate:
+
+    ```bash
+    eland_import_hub_model --ca-certs CA_CERTS ...
+    ```
+
+* Disable TLS/SSL verification altogether (strongly discouraged):
+
+    ```bash
+    eland_import_hub_model --insecure ...
+    ```
+
+
diff --git a/docs/reference/toc.yml b/docs/reference/toc.yml
new file mode 100644
index 0000000..a1e325d
--- /dev/null
+++ b/docs/reference/toc.yml
@@ -0,0 +1,6 @@
+project: 'Eland reference'
+toc:
+  - file: index.md
+  - file: installation.md
+  - file: dataframes.md
+  - file: machine-learning.md
\ No newline at end of file