From 26354622b569d02f7e00c973177d4725fccc252a Mon Sep 17 00:00:00 2001
From: Seth Michael Larson
Date: Tue, 12 Jan 2021 10:26:01 -0600
Subject: [PATCH] Add more sections for elastic.co/guide

---
 README.md                            |  2 +-
 docs/guide/dataframes.asciidoc       | 62 ++++++++++++++++++++++++++++
 docs/guide/index.asciidoc            |  6 ++-
 docs/guide/machine-learning.asciidoc | 31 ++++++++++++++
 docs/guide/overview.asciidoc         | 31 ++++++++++++--
 docs/sphinx/reference/dataframe.rst  |  9 ++++
 6 files changed, 136 insertions(+), 5 deletions(-)
 create mode 100644 docs/guide/dataframes.asciidoc
 create mode 100644 docs/guide/machine-learning.asciidoc

diff --git a/README.md b/README.md
index 8268be0..bafc35a 100644
--- a/README.md
+++ b/README.md
@@ -163,7 +163,7 @@ to be serialized and used as an inference model in Elasticsearch
 [0 1 1 0 1 0 0 0 1 0]

 # Import the model into Elasticsearch
->>> es_model = MLModel(
+>>> es_model = MLModel.import_model(
     es_client="localhost:9200",
     model_id="xgb-classifier",
     model=xgb_model,
     feature_names=["f0", "f1", "f2", "f3", "f4"],
 )
diff --git a/docs/guide/dataframes.asciidoc b/docs/guide/dataframes.asciidoc
new file mode 100644
index 0000000..ddd58a3
--- /dev/null
+++ b/docs/guide/dataframes.asciidoc
@@ -0,0 +1,62 @@
+[[dataframes]]
+== Data Frames
+
+`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API
+and defers all processing and filtering of data to Elasticsearch
+instead of your local machine. This means you can process large
+amounts of data within Elasticsearch from a Jupyter Notebook
+without overloading your machine.
+
+[source,python]
+-------------------------------------
+>>> import eland as ed
+>>> # Connect to 'flights' index via localhost Elasticsearch node
+>>> df = ed.DataFrame('localhost:9200', 'flights')
+
+# eland.DataFrame instance has the same API as pandas.DataFrame
+# except all data is in Elasticsearch. See .info() memory usage.
+>>> df.head()
+       AvgTicketPrice  Cancelled  ... dayOfWeek           timestamp
+0          841.265642      False  ...         0 2018-01-01 00:00:00
+1          882.982662      False  ...         0 2018-01-01 18:27:00
+2          190.636904      False  ...         0 2018-01-01 17:11:14
+3          181.694216       True  ...         0 2018-01-01 10:33:28
+4          730.041778      False  ...         0 2018-01-01 05:13:00
+
+[5 rows x 27 columns]
+
+>>> df.info()
+<class 'eland.dataframe.DataFrame'>
+Index: 13059 entries, 0 to 13058
+Data columns (total 27 columns):
+ #   Column          Non-Null Count  Dtype
+---  ------          --------------  -----
+ 0   AvgTicketPrice  13059 non-null  float64
+ 1   Cancelled       13059 non-null  bool
+ 2   Carrier         13059 non-null  object
+...
+ 24  OriginWeather   13059 non-null  object
+ 25  dayOfWeek       13059 non-null  int64
+ 26  timestamp       13059 non-null  datetime64[ns]
+dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)
+memory usage: 80.0 bytes
+Elasticsearch storage usage: 5.043 MB
+
+# Filtering of rows using comparisons
+>>> df[(df.Carrier=="Kibana Airlines") & (df.AvgTicketPrice > 900.0) & (df.Cancelled == True)].head()
+     AvgTicketPrice  Cancelled  ... dayOfWeek           timestamp
+8        960.869736       True  ...         0 2018-01-01 12:09:35
+26       975.812632       True  ...         0 2018-01-01 15:38:32
+311      946.358410       True  ...         0 2018-01-01 11:51:12
+651      975.383864       True  ...         2 2018-01-03 21:13:17
+950      907.836523       True  ...         2 2018-01-03 05:14:51
+
+[5 rows x 27 columns]
+
+# Running aggregations across an index
+>>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
+     DistanceKilometers  AvgTicketPrice
+sum        9.261629e+07    8.204365e+06
+min        0.000000e+00    1.000205e+02
+std        4.578263e+03    2.663867e+02
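+
+# A quick sketch of pulling a small result set into plain pandas,
+# e.g. for plotting or for libraries that only accept pandas objects.
+# eland.eland_to_pandas() executes the query and materializes the
+# selected rows client-side, so keep the subset small.
+>>> pd_df = ed.eland_to_pandas(df[df.Cancelled == True].head(100))
+>>> type(pd_df)
+<class 'pandas.core.frame.DataFrame'>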
+-------------------------------------
diff --git a/docs/guide/index.asciidoc b/docs/guide/index.asciidoc
index bacb324..7633eb2 100644
--- a/docs/guide/index.asciidoc
+++ b/docs/guide/index.asciidoc
@@ -1,4 +1,4 @@
-= eland
+= Eland

 :doctype: book

@@ -7,3 +7,7 @@ include::{asciidoc-dir}/../../shared/attributes.asciidoc[]
 include::overview.asciidoc[]

 include::installation.asciidoc[]
+
+include::dataframes.asciidoc[]
+
+include::machine-learning.asciidoc[]
diff --git a/docs/guide/machine-learning.asciidoc b/docs/guide/machine-learning.asciidoc
new file mode 100644
index 0000000..317d3d8
--- /dev/null
+++ b/docs/guide/machine-learning.asciidoc
@@ -0,0 +1,31 @@
+[[machine-learning]]
+== Machine Learning
+
+Eland allows transforming trained models from the scikit-learn,
+XGBoost, and LightGBM libraries so they can be serialized and
+used as inference models in Elasticsearch.
+
+[source,python]
+------------------------
+>>> from xgboost import XGBClassifier
+>>> from eland.ml import MLModel
+
+# Train and exercise an XGBoost ML model locally
+>>> xgb_model = XGBClassifier(booster="gbtree")
+>>> xgb_model.fit(training_data[0], training_data[1])
+
+>>> xgb_model.predict(training_data[0])
+[0 1 1 0 1 0 0 0 1 0]
+
+# Import the model into Elasticsearch
+>>> es_model = MLModel.import_model(
+    es_client="localhost:9200",
+    model_id="xgb-classifier",
+    model=xgb_model,
+    feature_names=["f0", "f1", "f2", "f3", "f4"],
+)
+
+# Exercise the ML model in Elasticsearch with the training data
+>>> es_model.predict(training_data[0])
+[0 1 1 0 1 0 0 0 1 0]
+------------------------
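+
+Other supported model types follow the same pattern. Below is a
+minimal sketch using a scikit-learn `DecisionTreeClassifier`; it
+assumes the same `training_data` feature/label pair as above, and
+the `model_id` value is only an example.
+
+[source,python]
+------------------------
+>>> from sklearn.tree import DecisionTreeClassifier
+>>> from eland.ml import MLModel
+
+# Train a scikit-learn decision tree on the same training data
+>>> sklearn_model = DecisionTreeClassifier()
+>>> sklearn_model.fit(training_data[0], training_data[1])
+
+# Serialize and import the trained model into Elasticsearch
+>>> es_model = MLModel.import_model(
+    es_client="localhost:9200",
+    model_id="decision-tree-classifier",
+    model=sklearn_model,
+    feature_names=["f0", "f1", "f2", "f3", "f4"],
+)
+
+# Run inference in Elasticsearch against the training data
+>>> es_model.predict(training_data[0])
+------------------------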
diff --git a/docs/guide/overview.asciidoc b/docs/guide/overview.asciidoc
index f99f2bc..4e328a2 100644
--- a/docs/guide/overview.asciidoc
+++ b/docs/guide/overview.asciidoc
@@ -8,8 +8,8 @@ Source code is available on https://github.com/elastic/eland[GitHub].
 [discrete]
 === Compatibility

-The library is compatible with all {es} versions since `7.6.x` but you
-**have to use a matching major version**:
+The library is compatible with Python 3.6 and later, and with all
+{es} versions since `7.6.x`, but you **have to use a matching major version**:

 The recommended way to set your requirements in your `setup.py` or
 `requirements.txt` is::
@@ -53,7 +53,32 @@ Create a `DataFrame` object connected to an {es} cluster running on `localhost:9200`
 [13059 rows x 27 columns]
 ------------------------------------

-Eland can also be used for complex queries and aggregations:
+[discrete]
+==== Elastic Cloud
+
+You can also connect Eland to an Elasticsearch instance in Elastic Cloud:
+
+[source,python]
+------------------------------------
+>>> import eland as ed
+>>> from elasticsearch import Elasticsearch
+
+# First instantiate an 'Elasticsearch' client connected to Elastic Cloud
+>>> es = Elasticsearch(cloud_id="...", api_key=("...", "..."))
+
+# Then wrap the client in an Eland DataFrame:
+>>> df = ed.DataFrame(es, es_index_pattern="flights")
+>>> df.head(5)
+       AvgTicketPrice  Cancelled  ... dayOfWeek           timestamp
+0          841.265642      False  ...         0 2018-01-01 00:00:00
+1          882.982662      False  ...         0 2018-01-01 18:27:00
+2          190.636904      False  ...         0 2018-01-01 17:11:14
+3          181.694216       True  ...         0 2018-01-01 10:33:28
+4          730.041778      False  ...         0 2018-01-01 05:13:00
+[5 rows x 27 columns]
+------------------------------------
+
+Eland can be used for complex queries and aggregations:

 [source,python]
 ------------------------------------
diff --git a/docs/sphinx/reference/dataframe.rst b/docs/sphinx/reference/dataframe.rst
index 391c66a..c795f65 100644
--- a/docs/sphinx/reference/dataframe.rst
+++ b/docs/sphinx/reference/dataframe.rst
@@ -41,6 +41,15 @@ Indexing, Iteration

 Function Application, GroupBy & Window
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. note::
+
+   Elasticsearch aggregations using cardinality (``count``) are
+   approximations computed with the `HyperLogLog++ algorithm`_ and
+   so may not be exact.
+
+.. _HyperLogLog++ algorithm: https://static.googleusercontent.com/media/research.google.com/fr//pubs/archive/40671.pdf
+
 .. autosummary::
    :toctree: api/