diff --git a/.github/workflows/comment-on-asciidoc-changes.yml b/.github/workflows/comment-on-asciidoc-changes.yml new file mode 100644 index 0000000..8e5f836 --- /dev/null +++ b/.github/workflows/comment-on-asciidoc-changes.yml @@ -0,0 +1,21 @@ +--- +name: Comment on PR for .asciidoc changes + +on: + # We need to use pull_request_target to be able to comment on PRs from forks + pull_request_target: + types: + - synchronize + - opened + - reopened + branches: + - main + - master + - "9.0" + +jobs: + comment-on-asciidoc-change: + permissions: + contents: read + pull-requests: write + uses: elastic/docs-builder/.github/workflows/comment-on-asciidoc-changes.yml@main diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml deleted file mode 100644 index bb46616..0000000 --- a/.github/workflows/docs-build.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: docs-build - -on: - push: - branches: - - main - pull_request_target: ~ - merge_group: ~ - -jobs: - docs-preview: - uses: elastic/docs-builder/.github/workflows/preview-build.yml@main - with: - path-pattern: docs/** - permissions: - deployments: write - id-token: write - contents: read - pull-requests: read diff --git a/.github/workflows/docs-cleanup.yml b/.github/workflows/docs-cleanup.yml deleted file mode 100644 index f83e017..0000000 --- a/.github/workflows/docs-cleanup.yml +++ /dev/null @@ -1,14 +0,0 @@ -name: docs-cleanup - -on: - pull_request_target: - types: - - closed - -jobs: - docs-preview: - uses: elastic/docs-builder/.github/workflows/preview-cleanup.yml@main - permissions: - contents: none - id-token: write - deployments: write diff --git a/docs/docset.yml b/docs/docset.yml deleted file mode 100644 index 782b57b..0000000 --- a/docs/docset.yml +++ /dev/null @@ -1,8 +0,0 @@ -project: 'Eland Python client' -cross_links: - - docs-content -toc: - - toc: reference -subs: - es: "Elasticsearch" - ml: "machine learning" diff --git a/docs/reference/dataframes.md b/docs/guide/dataframes.asciidoc 
similarity index 69% rename from docs/reference/dataframes.md rename to docs/guide/dataframes.asciidoc index d8560bd..616a31d 100644 --- a/docs/reference/dataframes.md +++ b/docs/guide/dataframes.asciidoc @@ -1,16 +1,16 @@ ---- -mapped_pages: - - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/dataframes.html ---- +[[dataframes]] +== Data Frames -# Data Frames [dataframes] +`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API +and defers all processing and filtering of data to Elasticsearch +instead of your local machine. This means you can process large +amounts of data within Elasticsearch from a Jupyter Notebook +without overloading your machine. -`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API and defers all processing and filtering of data to Elasticsearch instead of your local machine. This means you can process large amounts of data within Elasticsearch from a Jupyter Notebook without overloading your machine. - -```python +[source,python] +------------------------------------- >>> import eland as ed ->>> -# Connect to 'flights' index via localhost Elasticsearch node +>>> # Connect to 'flights' index via localhost Elasticsearch node >>> df = ed.DataFrame('http://localhost:9200', 'flights') # eland.DataFrame instance has the same API as pandas.DataFrame @@ -29,14 +29,14 @@ mapped_pages: Index: 13059 entries, 0 to 13058 Data columns (total 27 columns): - # Column Non-Null Count Dtype ---- ------ -------------- ----- - 0 AvgTicketPrice 13059 non-null float64 - 1 Cancelled 13059 non-null bool - 2 Carrier 13059 non-null object -... - 24 OriginWeather 13059 non-null object - 25 dayOfWeek 13059 non-null int64 + # Column Non-Null Count Dtype +--- ------ -------------- ----- + 0 AvgTicketPrice 13059 non-null float64 + 1 Cancelled 13059 non-null bool + 2 Carrier 13059 non-null object +... 
+ 24 OriginWeather 13059 non-null object + 25 dayOfWeek 13059 non-null int64 26 timestamp 13059 non-null datetime64[ns] dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17) memory usage: 80.0 bytes @@ -59,5 +59,4 @@ Elasticsearch storage usage: 5.043 MB sum 9.261629e+07 8.204365e+06 min 0.000000e+00 1.000205e+02 std 4.578263e+03 2.663867e+02 -``` - +------------------------------------- diff --git a/docs/guide/index.asciidoc b/docs/guide/index.asciidoc new file mode 100644 index 0000000..f2a8029 --- /dev/null +++ b/docs/guide/index.asciidoc @@ -0,0 +1,14 @@ += Eland Python Client + +:doctype: book + +include::{asciidoc-dir}/../../shared/versions/stack/{source_branch}.asciidoc[] +include::{asciidoc-dir}/../../shared/attributes.asciidoc[] + +include::overview.asciidoc[] + +include::installation.asciidoc[] + +include::dataframes.asciidoc[] + +include::machine-learning.asciidoc[] diff --git a/docs/guide/installation.asciidoc b/docs/guide/installation.asciidoc new file mode 100644 index 0000000..69763cf --- /dev/null +++ b/docs/guide/installation.asciidoc @@ -0,0 +1,16 @@ +[[installation]] +== Installation + +Eland can be installed with https://pip.pypa.io[pip] from https://pypi.org/project/eland[PyPI]. 
We recommend https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/[using a virtual environment] when installing with pip: + +[source,sh] +----------------------------- +$ python -m pip install eland +----------------------------- + +Alternatively, Eland can be installed with https://docs.conda.io[Conda] from https://anaconda.org/conda-forge/eland[Conda Forge]: + +[source,sh] +------------------------------------ +$ conda install -c conda-forge eland +------------------------------------ diff --git a/docs/guide/machine-learning.asciidoc b/docs/guide/machine-learning.asciidoc index c91ae8d..46100e0 100644 --- a/docs/guide/machine-learning.asciidoc +++ b/docs/guide/machine-learning.asciidoc @@ -5,8 +5,7 @@ [[ml-trained-models]] === Trained models -Eland allows transforming *some* -https://eland.readthedocs.io/en/latest/reference/api/eland.ml.MLModel.import_model.html#parameters[trained models] from scikit-learn, XGBoost, +Eland allows transforming trained models from scikit-learn, XGBoost, and LightGBM libraries to be serialized and used as an inference model in {es}. diff --git a/docs/reference/index.md b/docs/guide/overview.asciidoc similarity index 67% rename from docs/reference/index.md rename to docs/guide/overview.asciidoc index be902ae..49c32a0 100644 --- a/docs/reference/index.md +++ b/docs/guide/overview.asciidoc @@ -1,36 +1,33 @@ ---- -mapped_pages: - - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/index.html - - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/overview.html -navigation_title: Eland ---- +[[overview]] +== Overview -# Eland Python client [overview] +Eland is a Python client and toolkit for DataFrames and {ml} in {es}. +Full documentation is available on https://eland.readthedocs.io[Read the Docs]. +Source code is available on https://github.com/elastic/eland[GitHub]. -Eland is a Python client and toolkit for DataFrames and {{ml}} in {{es}}. 
Full documentation is available on [Read the Docs](https://eland.readthedocs.io). Source code is available on [GitHub](https://github.com/elastic/eland). +[discrete] +=== Compatibility +- Supports Python 3.9+ and Pandas 1.5 +- Supports {es} 8+ clusters, recommended 8.16 or later for all features to work. + Make sure your Eland major version matches the major version of your Elasticsearch cluster. -## Compatibility [_compatibility] +The recommended way to set your requirements in your `setup.py` or +`requirements.txt` is: -* Supports Python 3.9+ and Pandas 1.5 -* Supports {{es}} 8+ clusters, recommended 8.16 or later for all features to work. Make sure your Eland major version matches the major version of your Elasticsearch cluster. + # Elasticsearch 8.x + eland>=8,<9 -The recommended way to set your requirements in your `setup.py` or `requirements.txt` is:: + # Elasticsearch 7.x + eland>=7,<8 -``` -# Elasticsearch 8.x -eland>=8,<9 -``` -``` -# Elasticsearch 7.x -eland>=7,<8 -``` +[discrete] +=== Getting Started -## Getting Started [_getting_started] +Create a `DataFrame` object connected to an {es} cluster running on `http://localhost:9200`: -Create a `DataFrame` object connected to an {{es}} cluster running on `http://localhost:9200`: - -```python +[source,python] +------------------------------------ >>> import eland as ed >>> df = ed.DataFrame( ... es_client="http://localhost:9200", @@ -51,14 +48,15 @@ Create a `DataFrame` object connected to an {{es}} cluster running on `http://lo 13058 858.144337 False ... 
6 2018-02-11 14:54:34 [13059 rows x 27 columns] -``` +------------------------------------ - -### Elastic Cloud [_elastic_cloud] +[discrete] +==== Elastic Cloud You can also connect Eland to an Elasticsearch instance in Elastic Cloud: -```python +[source,python] +------------------------------------ >>> import eland as ed >>> from elasticsearch import Elasticsearch @@ -75,16 +73,16 @@ You can also connect Eland to an Elasticsearch instance in Elastic Cloud: 3 181.694216 True ... 0 2018-01-01 10:33:28 4 730.041778 False ... 0 2018-01-01 05:13:00 [5 rows x 27 columns] -``` +------------------------------------ Eland can be used for complex queries and aggregations: -```python +[source,python] +------------------------------------ >>> df[df.Carrier != "Kibana Airlines"].groupby("Carrier").mean(numeric_only=False) AvgTicketPrice Cancelled timestamp -Carrier +Carrier ES-Air 630.235816 0.129814 2018-01-21 20:45:00.200000000 JetBeats 627.457373 0.134698 2018-01-21 14:43:18.112400635 Logstash Airways 624.581974 0.125188 2018-01-21 16:14:50.711798340 -``` - +------------------------------------ diff --git a/docs/reference/installation.md b/docs/reference/installation.md deleted file mode 100644 index 550eec3..0000000 --- a/docs/reference/installation.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -mapped_pages: - - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/installation.html ---- - -# Installation [installation] - -Eland can be installed with [pip](https://pip.pypa.io) from [PyPI](https://pypi.org/project/eland). 
We recommend [using a virtual environment](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) when installing with pip: - -```sh -$ python -m pip install eland -``` - -Alternatively, Eland can be installed with [Conda](https://docs.conda.io) from [Conda Forge](https://anaconda.org/conda-forge/eland): - -```sh -$ conda install -c conda-forge eland -``` - diff --git a/docs/reference/machine-learning.md b/docs/reference/machine-learning.md deleted file mode 100644 index 1e69e88..0000000 --- a/docs/reference/machine-learning.md +++ /dev/null @@ -1,199 +0,0 @@ ---- -mapped_pages: - - https://www.elastic.co/guide/en/elasticsearch/client/eland/current/machine-learning.html ---- - -# Machine Learning [machine-learning] - - -## Trained models [ml-trained-models] - -Eland allows transforming *some* -https://eland.readthedocs.io/en/latest/reference/api/eland.ml.MLModel.import_model.html#parameters[trained models] from scikit-learn, XGBoost, -and LightGBM libraries to be serialized and used as an inference model in {{es}}. - -```python ->>> from xgboost import XGBClassifier ->>> from eland.ml import MLModel - -# Train and exercise an XGBoost ML model locally ->>> xgb_model = XGBClassifier(booster="gbtree") ->>> xgb_model.fit(training_data[0], training_data[1]) - ->>> xgb_model.predict(training_data[0]) -[0 1 1 0 1 0 0 0 1 0] - -# Import the model into Elasticsearch ->>> es_model = MLModel.import_model( - es_client="http://localhost:9200", - model_id="xgb-classifier", - model=xgb_model, - feature_names=["f0", "f1", "f2", "f3", "f4"], -) - -# Exercise the ML model in Elasticsearch with the training data ->>> es_model.predict(training_data[0]) -[0 1 1 0 1 0 0 0 1 0] -``` - - -## Natural language processing (NLP) with PyTorch [ml-nlp-pytorch] - -::::{important} -You need to install the appropriate version of PyTorch to import an NLP model. Run `python -m pip install 'eland[pytorch]'` to install that version. 
-:::: - - -For NLP tasks, Eland enables you to import PyTorch models into {{es}}. Use the `eland_import_hub_model` script to download and install supported [transformer models](https://huggingface.co/transformers) from the [Hugging Face model hub](https://huggingface.co/models). For example: - -```bash -$ eland_import_hub_model \ <1> - --url http://localhost:9200/ \ <2> - --hub-model-id elastic/distilbert-base-cased-finetuned-conll03-english \ <3> - --task-type ner \ <4> - --start -``` - -1. Use an authentication method to access your cluster. Refer to [Authentication methods](machine-learning.md#ml-nlp-pytorch-auth). -2. The cluster URL. Alternatively, use `--cloud-id`. -3. Specify the identifier for the model in the Hugging Face model hub. -4. Specify the type of NLP task. Supported values are `fill_mask`, `ner`, `question_answering`, `text_classification`, `text_embedding`, `text_expansion`, `text_similarity` and `zero_shot_classification`. - - -For more information about the available options, run `eland_import_hub_model` with the `--help` option. - -```bash -$ eland_import_hub_model --help -``` - - -### Import model with Docker [ml-nlp-pytorch-docker] - -::::{important} -To use the Docker container, you need to clone the Eland repository: [https://github.com/elastic/eland](https://github.com/elastic/eland) -:::: - - -If you want to use Eland without installing it, you can use the Docker image: - -You can use the container interactively: - -```bash -$ docker run -it --rm --network host docker.elastic.co/eland/eland -``` - -Running installed scripts is also possible without an interactive shell, for example: - -```bash -docker run -it --rm docker.elastic.co/eland/eland \ - eland_import_hub_model \ - --url $ELASTICSEARCH_URL \ - --hub-model-id elastic/distilbert-base-uncased-finetuned-conll03-english \ - --start -``` - -Replace the `$ELASTICSEARCH_URL` with the URL for your Elasticsearch cluster. 
For authentication purposes, include an administrator username and password in the URL in the following format: `https://username:password@host:port`. - - -### Install models in an air-gapped environment [ml-nlp-pytorch-air-gapped] - -You can install models in a restricted or closed network by pointing the `eland_import_hub_model` script to local files. - -For an offline install of a Hugging Face model, the model first needs to be cloned locally, Git and [Git Large File Storage](https://git-lfs.com/) are required to be installed in your system. - -1. Select a model you want to use from Hugging Face. Refer to the [compatible third party model](docs-content://explore-analyze/machine-learning/nlp/ml-nlp-model-ref.md) list for more information on the supported architectures. -2. Clone the selected model from Hugging Face by using the model URL. For example: - - ```bash - git clone https://huggingface.co/dslim/bert-base-NER - ``` - - This command results in a local copy of of the model in the directory `bert-base-NER`. - -3. Use the `eland_import_hub_model` script with the `--hub-model-id` set to the directory of the cloned model to install it: - - ```bash - eland_import_hub_model \ - --url 'XXXX' \ - --hub-model-id /PATH/TO/MODEL \ - --task-type ner \ - --es-username elastic --es-password XXX \ - --es-model-id bert-base-ner - ``` - - If you use the Docker image to run `eland_import_hub_model` you must bind mount the model directory, so the container can read the files: - - ```bash - docker run --mount type=bind,source=/PATH/TO/MODEL,destination=/model,readonly -it --rm docker.elastic.co/eland/eland \ - eland_import_hub_model \ - --url 'XXXX' \ - --hub-model-id /model \ - --task-type ner \ - --es-username elastic --es-password XXX \ - --es-model-id bert-base-ner - ``` - - Once it’s uploaded to {{es}}, the model will have the ID specified by `--es-model-id`. 
If it is not set, the model ID is derived from `--hub-model-id`; spaces and path delimiters are converted to double underscores `__`. - - - -### Connect to Elasticsearch through a proxy [ml-nlp-pytorch-proxy] - -Behind the scenes, Eland uses the `requests` Python library, which [allows configuring proxies through an environment variable](https://requests.readthedocs.io/en/latest/user/advanced/#proxies). For example, to use an HTTP proxy to connect to an HTTPS Elasticsearch cluster, you need to set the `HTTPS_PROXY` environment variable when invoking Eland: - -```bash -HTTPS_PROXY=http://proxy-host:proxy-port eland_import_hub_model ... -``` - -If you disabled security on your Elasticsearch cluster, you should use `HTTP_PROXY` instead. - - -### Authentication methods [ml-nlp-pytorch-auth] - -The following authentication options are available when using the import script: - -* Elasticsearch username and password authentication (specified with the `-u` and `-p` options): - - ```bash - eland_import_hub_model -u -p --cloud-id ... - ``` - - These `-u` and `-p` options also work when you use `--url`. - -* Elasticsearch username and password authentication (embedded in the URL): - - ```bash - eland_import_hub_model --url https://:@: ... - ``` - -* Elasticsearch API key authentication: - - ```bash - eland_import_hub_model --es-api-key --url https://: ... - ``` - -* HuggingFace Hub access token (for private models): - - ```bash - eland_import_hub_model --hub-access-token ... - ``` - - - -### TLS/SSL [ml-nlp-pytorch-tls] - -The following TLS/SSL options for Elasticsearch are available when using the import script: - -* Specify alternate CA bundle to verify the cluster certificate: - - ```bash - eland_import_hub_model --ca-certs CA_CERTS ... - ``` - -* Disable TLS/SSL verification altogether (strongly discouraged): - - ```bash - eland_import_hub_model --insecure ... 
- ``` - - diff --git a/docs/reference/toc.yml b/docs/reference/toc.yml deleted file mode 100644 index a1e325d..0000000 --- a/docs/reference/toc.yml +++ /dev/null @@ -1,6 +0,0 @@ -project: 'Eland reference' -toc: - - file: index.md - - file: installation.md - - file: dataframes.md - - file: machine-learning.md \ No newline at end of file