From 661b33dd0a13740d1522e08aa1a910b6f8631953 Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Mon, 17 Aug 2020 15:20:32 -0500 Subject: [PATCH] Update and rearrange documentation --- MANIFEST.in | 1 + README.md | 361 ++++++------------ docs/source/development/contributing.rst | 2 +- .../implementation.rst} | 1 - docs/source/development/index.rst | 1 + docs/source/examples/demo_notebook.ipynb | 4 +- .../examples/online_retail_analysis.ipynb | 18 +- docs/source/implementation/index.rst | 10 - docs/source/index.rst | 23 +- .../reference/api/eland.DataFrame.ndim.rst | 6 + .../reference/api/eland.DataFrame.size.rst | 6 + .../reference/api/eland.Series.dtype.rst | 6 + .../reference/api/eland.Series.dtypes.rst | 6 + .../reference/api/eland.Series.ndim.rst | 6 + .../reference/api/eland.Series.size.rst | 6 + .../api/eland.ml.MLModel.delete_model.rst | 6 + .../api/eland.ml.MLModel.exists_model.rst | 6 + docs/source/reference/dataframe.rst | 4 +- docs/source/reference/index.rst | 8 +- docs/source/reference/ml.rst | 16 +- docs/source/reference/series.rst | 6 +- setup.py | 144 +------ utils/generate-supported-apis.py | 19 + 23 files changed, 252 insertions(+), 414 deletions(-) rename docs/source/{implementation/details.rst => development/implementation.rst} (99%) delete mode 100644 docs/source/implementation/index.rst create mode 100644 docs/source/reference/api/eland.DataFrame.ndim.rst create mode 100644 docs/source/reference/api/eland.DataFrame.size.rst create mode 100644 docs/source/reference/api/eland.Series.dtype.rst create mode 100644 docs/source/reference/api/eland.Series.dtypes.rst create mode 100644 docs/source/reference/api/eland.Series.ndim.rst create mode 100644 docs/source/reference/api/eland.Series.size.rst create mode 100644 docs/source/reference/api/eland.ml.MLModel.delete_model.rst create mode 100644 docs/source/reference/api/eland.ml.MLModel.exists_model.rst diff --git a/MANIFEST.in b/MANIFEST.in index 42eb410..6006776 100644 --- a/MANIFEST.in +++ 
b/MANIFEST.in @@ -1 +1,2 @@ include LICENSE.txt +include README.md diff --git a/README.md b/README.md index 48edff9..59b0b98 100644 --- a/README.md +++ b/README.md @@ -1,179 +1,31 @@ -_Note, this project is still very much a work in progress and in an alpha state; input and contributions welcome!_ -

- - eland - + + Eland + +

+

+PyPI Version +Conda Version +Downloads +Package Status +Build Status +License

- - - - - - - - - - - - - - - - - - - - - -
PyPI - - latest release - -
Conda Forge - - latest release - -
Package Status - - status - -
License - - license - -
Build Status - - Build Status - -
-# What is it? - -Eland is a Python Elasticsearch client for exploring and analyzing data -residing in Elasticsearch with a familiar Pandas-compatible API. +Eland is a Python Elasticsearch client for exploring and +analyzing data in Elasticsearch with a familiar Pandas-compatible API. Where possible the package uses existing Python APIs and data structures to make it easy to switch between numpy, pandas, scikit-learn to their Elasticsearch powered equivalents. In general, the data resides in Elasticsearch and not in memory, which allows Eland to access large datasets stored in Elasticsearch. -For example, to explore data in a large Elasticsearch index, simply create an eland DataFrame from an Elasticsearch -index pattern, and explore using an API that mirrors a subset of the pandas.DataFrame API: +Eland also provides tools to upload trained machine learning models from your +common libraries like [scikit-learn](https://scikit-learn.org), [XGBoost](https://xgboost.readthedocs.io), +and [LightGBM](https://lightgbm.readthedocs.io) into Elasticsearch. -``` ->>> import eland as ed +## Getting Started ->>> # Connect to 'flights' index via localhost Elasticsearch node ->>> df = ed.DataFrame('localhost:9200', 'flights') - ->>> df.head() - AvgTicketPrice Cancelled ... dayOfWeek timestamp -0 841.265642 False ... 0 2018-01-01 00:00:00 -1 882.982662 False ... 0 2018-01-01 18:27:00 -2 190.636904 False ... 0 2018-01-01 17:11:14 -3 181.694216 True ... 0 2018-01-01 10:33:28 -4 730.041778 False ... 0 2018-01-01 05:13:00 - -[5 rows x 27 columns] - ->>> df.describe() - AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek -count 13059.000000 13059.000000 ... 13059.000000 13059.000000 -mean 628.253689 7092.142457 ... 511.127842 2.835975 -std 266.386661 4578.263193 ... 334.741135 1.939365 -min 100.020531 0.000000 ... 0.000000 0.000000 -25% 410.008918 2470.545974 ... 251.739008 1.000000 -50% 640.387285 7612.072403 ... 503.148975 3.000000 -75% 842.262193 9735.660463 ... 
720.505705 4.239865 -max 1199.729004 19881.482422 ... 1902.901978 6.000000 - -[8 rows x 7 columns] ->>> df[['Carrier', 'AvgTicketPrice', 'Cancelled']] - Carrier AvgTicketPrice Cancelled -0 Kibana Airlines 841.265642 False -1 Logstash Airways 882.982662 False -2 Logstash Airways 190.636904 False -3 Kibana Airlines 181.694216 True -4 Kibana Airlines 730.041778 False -... ... ... ... -13054 Logstash Airways 1080.446279 False -13055 Logstash Airways 646.612941 False -13056 Logstash Airways 997.751876 False -13057 JetBeats 1102.814465 False -13058 JetBeats 858.144337 False - -[13059 rows x 3 columns] - ->>> df[(df.Carrier=="Kibana Airlines") & (df.AvgTicketPrice > 900.0) & (df.Cancelled == True)].head() - AvgTicketPrice Cancelled ... dayOfWeek timestamp -8 960.869736 True ... 0 2018-01-01 12:09:35 -26 975.812632 True ... 0 2018-01-01 15:38:32 -311 946.358410 True ... 0 2018-01-01 11:51:12 -651 975.383864 True ... 2 2018-01-03 21:13:17 -950 907.836523 True ... 2 2018-01-03 05:14:51 - -[5 rows x 27 columns] - ->>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']) - DistanceKilometers AvgTicketPrice -sum 9.261629e+07 8.204365e+06 -min 0.000000e+00 1.000205e+02 -std 4.578263e+03 2.663867e+02 - ->>> df[['Carrier', 'Origin', 'Dest']].nunique() -Carrier 4 -Origin 156 -Dest 156 -dtype: int64 - ->>> s = df.AvgTicketPrice * 2 + df.DistanceKilometers - df.FlightDelayMin ->>> s -0 18174.857422 -1 10589.365723 -2 381.273804 -3 739.126221 -4 14818.327637 - ... 
-13054 10219.474121 -13055 8381.823975 -13056 12661.157104 -13057 20819.488281 -13058 18315.431274 -Length: 13059, dtype: float64 ->>> print(s.es_info()) -index_pattern: flights -Index: - index_field: _id - is_source_field: False -Mappings: - capabilities: - es_field_name is_source es_dtype es_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_es_field_name -NaN script_field_None False double None float64 True True True script_field_None -Operations: - tasks: [] - size: None - sort_params: None - _source: ['script_field_None'] - body: {'script_fields': {'script_field_None': {'script': {'source': "(((doc['AvgTicketPrice'].value * 2) + doc['DistanceKilometers'].value) - doc['FlightDelayMin'].value)"}}}} - post_processing: [] - ->>> pd_df = ed.eland_to_pandas(df) ->>> pd_df.head() - AvgTicketPrice Cancelled ... dayOfWeek timestamp -0 841.265642 False ... 0 2018-01-01 00:00:00 -1 882.982662 False ... 0 2018-01-01 18:27:00 -2 190.636904 False ... 0 2018-01-01 17:11:14 -3 181.694216 True ... 0 2018-01-01 10:33:28 -4 730.041778 False ... 0 2018-01-01 05:13:00 - -[5 rows x 27 columns] -``` - -See [docs](https://eland.readthedocs.io/en/latest) and [demo_notebook.ipynb](https://eland.readthedocs.io/en/latest/examples/demo_notebook.html) for more examples. - -## Where to get it - -Eland can be installed from [PyPI](https://pypi.org/project/eland) via pip: +Eland can be installed from [PyPI](https://pypi.org/project/eland) with Pip: ```bash $ python -m pip install eland @@ -185,88 +37,129 @@ Eland can also be installed from [Conda Forge](https://anaconda.org/conda-forge/ $ conda install -c conda-forge eland ``` -The [source code](https://github.com/elastic/eland) is currently available on GitHub. +### Supported Versions -## Versions and Compatibility +- Supports Python 3.6+ and Pandas 1.0.0+ +- Supports Elasticsearch clusters that are 7.x+, recommended 7.6 or later for all features to work. 
-### Python Version Support +### Connecting to Elasticsearch -Officially Python 3.6 and above. +Eland uses the [Elasticsearch low level client](https://elasticsearch-py.readthedocs.io) to connect to Elasticsearch. +This client supports a range of [connection options and authentication options](https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch). -eland depends on pandas version 1.0.0+. +You can pass either an instance of `elasticsearch.Elasticsearch` to Eland APIs +or a string containing the host to connect to: -### Elasticsearch Versions +```python +import eland as ed -eland is versioned like the Elastic stack (eland 7.5.1 is compatible with Elasticsearch 7.x up to 7.5.1) +# Connecting to an Elasticsearch instance running on 'localhost:9200' +df = ed.DataFrame("localhost:9200", es_index_pattern="flights") -A major version of the client is compatible with the same major version of Elasticsearch. - -No compatibility assurances are given between different major versions of the client and Elasticsearch. -Major differences likely exist between major versions of Elasticsearch, -particularly around request and response object formats, but also around API urls and behaviour. - -## Connecting to Elasticsearch - -eland uses the [Elasticsearch low level client](https://elasticsearch-py.readthedocs.io/) to connect to Elasticsearch. -This client supports a range of [connection options and authentication mechanisms] -(https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch). - -### Basic Connection Options +# Connecting to an Elastic Cloud instance +from elasticsearch import Elasticsearch +es = Elasticsearch( + cloud_id="cluster-name:...", + http_auth=("elastic", "") +) +df = ed.DataFrame(es, es_index_pattern="flights") ``` + +## DataFrames in Eland + +`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API +and defers all processing and filtering of data to Elasticsearch +instead of your local machine. 
This means you can process large +amounts of data within Elasticsearch from a Jupyter Notebook +without overloading your machine. + +➤ [Eland DataFrame API documentation](https://eland.readthedocs.io/en/latest/reference/dataframe.html) + +➤ [Advanced examples in a Jupyter Notebook](https://eland.readthedocs.io/en/latest/examples/demo_notebook.html) + +```python >>> import eland as ed ->>> # Connect to flights index via localhost Elasticsearch node ->>> ed.DataFrame('localhost', 'flights') +>>> # Connect to 'flights' index via localhost Elasticsearch node +>>> df = ed.DataFrame('localhost:9200', 'flights') ->>> # Connect to flights index via localhost Elasticsearch node on port 9200 ->>> ed.DataFrame('localhost:9200', 'flights') +# eland.DataFrame instance has the same API as pandas.DataFrame +# except all data is in Elasticsearch. See .info() memory usage. +>>> df.head() + AvgTicketPrice Cancelled ... dayOfWeek timestamp +0 841.265642 False ... 0 2018-01-01 00:00:00 +1 882.982662 False ... 0 2018-01-01 18:27:00 +2 190.636904 False ... 0 2018-01-01 17:11:14 +3 181.694216 True ... 0 2018-01-01 10:33:28 +4 730.041778 False ... 0 2018-01-01 05:13:00 ->>> # Connect to flights index via localhost Elasticsearch node on port 9200 with : credentials ->>> ed.DataFrame('http://:@localhost:9200', 'flights') +[5 rows x 27 columns] ->>> # Connect to flights index via ssl ->>> es = Elasticsearch( - 'https://:@localhost:443', - use_ssl=True, - verify_certs=True, - ca_certs='/path/to/ca.crt' +>>> df.info() + +Index: 13059 entries, 0 to 13058 +Data columns (total 27 columns): + # Column Non-Null Count Dtype +--- ------ -------------- ----- + 0 AvgTicketPrice 13059 non-null float64 + 1 Cancelled 13059 non-null bool + 2 Carrier 13059 non-null object +... 
+ 24 OriginWeather 13059 non-null object + 25 dayOfWeek 13059 non-null int64 + 26 timestamp 13059 non-null datetime64[ns] +dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17) +memory usage: 80.0 bytes + +# Filtering of rows using comparisons +>>> df[(df.Carrier=="Kibana Airlines") & (df.AvgTicketPrice > 900.0) & (df.Cancelled == True)].head() + AvgTicketPrice Cancelled ... dayOfWeek timestamp +8 960.869736 True ... 0 2018-01-01 12:09:35 +26 975.812632 True ... 0 2018-01-01 15:38:32 +311 946.358410 True ... 0 2018-01-01 11:51:12 +651 975.383864 True ... 2 2018-01-03 21:13:17 +950 907.836523 True ... 2 2018-01-03 05:14:51 + +[5 rows x 27 columns] + +# Running aggregations across an index +>>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']) + DistanceKilometers AvgTicketPrice +sum 9.261629e+07 8.204365e+06 +min 0.000000e+00 1.000205e+02 +std 4.578263e+03 2.663867e+02 +``` + +## Machine Learning in Eland + +Eland allows trained models from the scikit-learn, XGBoost, and LightGBM libraries +to be serialized and used as inference models in Elasticsearch. + +➤ [Eland Machine Learning API documentation](https://eland.readthedocs.io/en/latest/reference/ml.html) + +➤ [Read more about Machine Learning in Elasticsearch](https://www.elastic.co/guide/en/machine-learning/current/ml-getting-started.html) + +```python +>>> from xgboost import XGBClassifier +>>> from eland.ml import ImportedMLModel + +# Train and exercise an XGBoost ML model locally +>>> xgb_model = XGBClassifier(booster="gbtree") +>>> xgb_model.fit(training_data[0], training_data[1]) + +>>> xgb_model.predict(training_data[0]) +[0 1 1 0 1 0 0 0 1 0] + +# Import the model into Elasticsearch +>>> es_model = ImportedMLModel( + es_client="localhost:9200", + model_id="xgb-classifier", + model=xgb_model, + feature_names=["f0", "f1", "f2", "f3", "f4"], ) ->>> ed.DataFrame(es, 'flights') ->>> # Connect to flights index via ssl using Urllib3HttpConnection options ->>> 
es = Elasticsearch( - ['localhost:443', 'other_host:443'], - use_ssl=True, - verify_certs=True, - ca_certs='/path/to/CA_certs', - client_cert='/path/to/clientcert.pem', - client_key='/path/to/clientkey.pem' -) ->>> ed.DataFrame(es, 'flights') +# Exercise the ML model in Elasticsearch with the training data +>>> es_model.predict(training_data[0]) +[0 1 1 0 1 0 0 0 1 0] ``` - -### Connecting to an Elasticsearch Cloud Cluster - -``` ->>> import eland as ed ->>> from elasticsearch import Elasticsearch - ->>> es = Elasticsearch(cloud_id="", http_auth=('','')) - ->>> es.info() -{'name': 'instance-0000000000', 'cluster_name': 'bf900cfce5684a81bca0be0cce5913bc', 'cluster_uuid': 'xLPvrV3jQNeadA7oM4l1jA', 'version': {'number': '7.4.2', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '2f90bbf7b93631e52bafb59b3b049cb44ec25e96', 'build_date': '2019-10-28T20:40:44.881551Z', 'build_snapshot': False, 'lucene_version': '8.2.0', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'} - ->>> df = ed.read_es(es, 'reviews') -``` - -## Why eland? 
- -Naming is difficult, but as we had to call it something: - -* eland: elastic and data -* eland: 'Elk/Moose' in Dutch (Alces alces) -* [Elandsgracht](https://goo.gl/maps/3hGBMqeGRcsBJfKx8): Amsterdam street near Elastic's Amsterdam office - -[Pronunciation](https://commons.wikimedia.org/wiki/File:Nl-eland.ogg): /ˈeːlɑnt/ - diff --git a/docs/source/development/contributing.rst b/docs/source/development/contributing.rst index 9f628a9..1ccd2f1 100644 --- a/docs/source/development/contributing.rst +++ b/docs/source/development/contributing.rst @@ -1,5 +1,5 @@ ===================== -Contributing to eland +Contributing to Eland ===================== Eland is an open source project and we love to receive contributions diff --git a/docs/source/implementation/details.rst b/docs/source/development/implementation.rst similarity index 99% rename from docs/source/implementation/details.rst rename to docs/source/development/implementation.rst index 7149bfd..9cb1ab1 100644 --- a/docs/source/implementation/details.rst +++ b/docs/source/development/implementation.rst @@ -58,4 +58,3 @@ the ``pandas.DataFrame`` API. This resolves some of the issues above as: * Creating a new ``eland.DataFrame`` API gives us full flexibility in terms of implementation. However, it does create a large amount of work which may duplicate a lot of the ``pandas`` code - for example, printing objects etc. - this creates maintenance issues etc. 
- diff --git a/docs/source/development/index.rst b/docs/source/development/index.rst index 6347536..7feb0a4 100644 --- a/docs/source/development/index.rst +++ b/docs/source/development/index.rst @@ -8,3 +8,4 @@ Development :maxdepth: 2 contributing.rst + implementation.rst diff --git a/docs/source/examples/demo_notebook.ipynb b/docs/source/examples/demo_notebook.ipynb index 2c55123..624cbaf 100644 --- a/docs/source/examples/demo_notebook.ipynb +++ b/docs/source/examples/demo_notebook.ipynb @@ -32,7 +32,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Compare eland DataFrame vs pandas DataFrame" + "## Compare Eland DataFrame vs pandas DataFrame" ] }, { @@ -52,7 +52,7 @@ }, "outputs": [], "source": [ - "ed_flights = ed.read_es('localhost', 'flights')" + "ed_flights = ed.DataFrame('localhost', 'flights')" ] }, { diff --git a/docs/source/examples/online_retail_analysis.ipynb b/docs/source/examples/online_retail_analysis.ipynb index 7b0bd2a..53652b2 100644 --- a/docs/source/examples/online_retail_analysis.ipynb +++ b/docs/source/examples/online_retail_analysis.ipynb @@ -38,14 +38,14 @@ "metadata": {}, "outputs": [], "source": [ - "df = ed.read_csv(\"data/online-retail.csv.gz\",\n", - " es_client='localhost', \n", - " es_dest_index='online-retail', \n", - " es_if_exists='replace', \n", - " es_dropna=True,\n", - " es_refresh=True,\n", - " compression='gzip',\n", - " index_col=0)" + "df = ed.csv_to_eland(\"data/online-retail.csv.gz\",\n", + " es_client='localhost', \n", + " es_dest_index='online-retail', \n", + " es_if_exists='replace', \n", + " es_dropna=True,\n", + " es_refresh=True,\n", + " compression='gzip',\n", + " index_col=0)" ] }, { @@ -390,7 +390,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### selecting columns\n", + "### Selecting columns\n", "\n", "you can also pass a list of columns to select columns from the data frame in a specified order." 
] diff --git a/docs/source/implementation/index.rst b/docs/source/implementation/index.rst deleted file mode 100644 index 49447a4..0000000 --- a/docs/source/implementation/index.rst +++ /dev/null @@ -1,10 +0,0 @@ -.. _implementation: - -==================== -Implementation Notes -==================== - -.. toctree:: - :maxdepth: 2 - - details.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 2d225f5..011d2b9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,5 +1,3 @@ -.. eland documentation master file, created by - .. module:: eland ************************************************************** @@ -11,10 +9,10 @@ Eland: DataFrames and Machine Learning backed by Elasticsearch **Useful links**: `Source Repository `__ | `Issues & Ideas `__ | -`Q&A Support `__ | +`Q&A Support `__ Eland is a Python Elasticsearch client for exploring and analyzing data -residing in Elasticsearch with a familiar Pandas-compatible API. +in Elasticsearch with a familiar Pandas-compatible API. Where possible the package uses existing Python APIs and data structures to make it easy to switch between numpy, pandas, scikit-learn to their Elasticsearch powered equivalents. In general, the data resides in Elasticsearch and @@ -48,30 +46,27 @@ If you're new to Elasticsearch we recommend `reading the documentation `_. -See https://www.elastic.co/guide/en/machine-learning/current/setup.html and other documentation for more detail. +See `Elasticsearch Machine Learning documentation `_ for more details. ImportedMLModel ~~~~~~~~~~~~~~~ @@ -28,10 +29,17 @@ Constructor ImportedMLModel -Learning API -^^^^^^^^^^^^ +Predictions +^^^^^^^^^^^ .. autosummary:: :toctree: api/ ImportedMLModel.predict +Manage Models +^^^^^^^^^^^^^ +.. 
autosummary:: + :toctree: api/ + + MLModel.exists_model + MLModel.delete_model diff --git a/docs/source/reference/series.rst b/docs/source/reference/series.rst index 3b646ab..b030ada 100644 --- a/docs/source/reference/series.rst +++ b/docs/source/reference/series.rst @@ -14,15 +14,17 @@ Constructor Attributes and underlying data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -**Axes** - .. autosummary:: :toctree: api/ Series.index + Series.dtype + Series.dtypes Series.shape Series.name Series.empty + Series.ndim + Series.size Indexing, iteration ~~~~~~~~~~~~~~~~~~~ diff --git a/setup.py b/setup.py index ec24e27..450a7be 100644 --- a/setup.py +++ b/setup.py @@ -32,152 +32,34 @@ CLASSIFIERS = [ "License :: OSI Approved :: Apache Software License", "Environment :: Console", "Operating System :: OS Independent", + "Intended Audience :: Developers", "Intended Audience :: Science/Research", + "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", "Topic :: Scientific/Engineering", ] -LONG_DESCRIPTION = """ -eland is a Elasticsearch client Python package to analyse, explore and manipulate data that resides in Elasticsearch. -Where possible the package uses existing Python APIs and data structures to make it easy to switch between numpy, -pandas, scikit-learn to their Elasticsearch powered equivalents. In general, the data resides in Elasticsearch and -not in memory, which allows eland to access large datasets stored in Elasticsearch. +# Remove all raw HTML from README for long description +with open(path.join(here, "README.md"), "r", "utf-8") as f: + lines = f.read().split("\n") + last_html_index = 0 + for i, line in enumerate(lines): + if line == "

": + last_html_index = i + 1 + long_description = "\n".join(lines[last_html_index:]) -For example, to explore data in a large Elasticsearch index, simply create an eland DataFrame from an Elasticsearch -index pattern, and explore using an API that mirrors a subset of the pandas.DataFrame API: - -``` ->>> import eland as ed - ->>> # Connect to 'flights' index via localhost Elasticsearch node ->>> df = ed.DataFrame('localhost:9200', 'flights') - ->>> df.head() - AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp -0 841.265642 False Kibana Airlines ... Sunny 0 2018-01-01 00:00:00 -1 882.982662 False Logstash Airways ... Clear 0 2018-01-01 18:27:00 -2 190.636904 False Logstash Airways ... Rain 0 2018-01-01 17:11:14 -3 181.694216 True Kibana Airlines ... Thunder & Lightning 0 2018-01-01 10:33:28 -4 730.041778 False Kibana Airlines ... Damaging Wind 0 2018-01-01 05:13:00 - -[5 rows x 27 columns] - ->>> df.describe() - AvgTicketPrice DistanceKilometers DistanceMiles FlightDelayMin FlightTimeHour FlightTimeMin dayOfWeek -count 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000 -mean 628.253689 7092.142457 4406.853010 47.335171 8.518797 511.127842 2.835975 -std 266.386661 4578.263193 2844.800855 96.743006 5.579019 334.741135 1.939365 -min 100.020531 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -25% 410.008918 2470.545974 1535.126118 0.000000 4.194976 251.738513 1.000000 -50% 640.362667 7612.072403 4729.922470 0.000000 8.385816 503.148975 3.000000 -75% 842.254990 9735.082407 6049.459005 15.000000 12.009396 720.534532 4.141221 -max 1199.729004 19881.482422 12353.780273 360.000000 31.715034 1902.901978 6.000000 - ->>> df[['Carrier', 'AvgTicketPrice', 'Cancelled']] - Carrier AvgTicketPrice Cancelled -0 Kibana Airlines 841.265642 False -1 Logstash Airways 882.982662 False -2 Logstash Airways 190.636904 False -3 Kibana Airlines 181.694216 True -4 Kibana Airlines 730.041778 False -... ... ... ... 
-13054 Logstash Airways 1080.446279 False -13055 Logstash Airways 646.612941 False -13056 Logstash Airways 997.751876 False -13057 JetBeats 1102.814465 False -13058 JetBeats 858.144337 False - -[13059 rows x 3 columns] - ->>> df[(df.Carrier=="Kibana Airlines") & (df.AvgTicketPrice > 900.0) & (df.Cancelled == True)].head() - AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp -8 960.869736 True Kibana Airlines ... Heavy Fog 0 2018-01-01 12:09:35 -26 975.812632 True Kibana Airlines ... Rain 0 2018-01-01 15:38:32 -311 946.358410 True Kibana Airlines ... Heavy Fog 0 2018-01-01 11:51:12 -651 975.383864 True Kibana Airlines ... Rain 2 2018-01-03 21:13:17 -950 907.836523 True Kibana Airlines ... Thunder & Lightning 2 2018-01-03 05:14:51 - -[5 rows x 27 columns] - ->>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']) - DistanceKilometers AvgTicketPrice -sum 9.261629e+07 8.204365e+06 -min 0.000000e+00 1.000205e+02 -std 4.578263e+03 2.663867e+02 - ->>> df[['Carrier', 'Origin', 'Dest']].nunique() -Carrier 4 -Origin 156 -Dest 156 -dtype: int64 - ->>> s = df.AvgTicketPrice * 2 + df.DistanceKilometers - df.FlightDelayMin ->>> s -0 18174.857422 -1 10589.365723 -2 381.273804 -3 739.126221 -4 14818.327637 - ... 
-13054 10219.474121 -13055 8381.823975 -13056 12661.157104 -13057 20819.488281 -13058 18315.431274 -Length: 13059, dtype: float64 - ->>> print(s.info_es()) -index_pattern: flights -Index: - index_field: _id - is_source_field: False -Mappings: - capabilities: - es_field_name is_source es_dtype es_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_es_field_name -NaN script_field_None False double None float64 True True True script_field_None -Operations: - tasks: [] - size: None - sort_params: None - _source: ['script_field_None'] - body: {'script_fields': {'script_field_None': {'script': {'source': "(((doc['AvgTicketPrice'].value * 2) + doc['DistanceKilometers'].value) - doc['FlightDelayMin'].value)"}}}} - post_processing: [] - ->>> pd_df = ed.eland_to_pandas(df) ->>> pd_df.head() - AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp -0 841.265642 False Kibana Airlines ... Sunny 0 2018-01-01 00:00:00 -1 882.982662 False Logstash Airways ... Clear 0 2018-01-01 18:27:00 -2 190.636904 False Logstash Airways ... Rain 0 2018-01-01 17:11:14 -3 181.694216 True Kibana Airlines ... Thunder & Lightning 0 2018-01-01 10:33:28 -4 730.041778 False Kibana Airlines ... Damaging Wind 0 2018-01-01 05:13:00 - -[5 rows x 27 columns] -``` - -See [docs](https://eland.readthedocs.io/en/latest) and [demo_notebook.ipynb](https://eland.readthedocs.io/en/latest/examples/demo_notebook.html) for more examples. - -## Where to get it -The source code is currently hosted on GitHub at: -https://github.com/elastic/eland - -Binary installers for the latest released version are available at the [Python -package index](https://pypi.org/project/eland). 
- -```sh -pip install eland -``` -""" setup( name=about["__title__"], version=about["__version__"], description=about["__description__"], - long_description=LONG_DESCRIPTION, + long_description=long_description, long_description_content_type="text/markdown", url=about["__url__"], author=about["__author__"], diff --git a/utils/generate-supported-apis.py b/utils/generate-supported-apis.py index d5f9119..58020da 100644 --- a/utils/generate-supported-apis.py +++ b/utils/generate-supported-apis.py @@ -21,8 +21,10 @@ import re import eland import pandas import inspect +from pathlib import Path +api_docs_dir = Path(__file__).absolute().parent.parent / "docs/source/reference/api" is_supported = [] supported_attr = re.compile( r"(?:[a-zA-Z0-9][a-zA-Z0-9_]*|__[a-zA-Z0-9][a-zA-Z0-9_]*__)" @@ -68,6 +70,23 @@ def main(): ) print(row_delimiter) + for attr, supported in is_supported: + if supported and "__" not in attr: + attr = attr.replace("ed.", "eland.").rstrip("()") + attr_doc_path = api_docs_dir / f"{attr}.rst" + if not attr_doc_path.exists(): + with attr_doc_path.open(mode="w") as f: + f.truncate() + f.write( + f"""{attr} +{'=' * len(attr)} + +.. currentmodule:: eland + +.. automethod:: { attr.replace('eland.', '') } +""" + ) + if __name__ == "__main__": main()