diff --git a/MANIFEST.in b/MANIFEST.in
index 42eb410..6006776 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1,2 @@
include LICENSE.txt
+include README.md
diff --git a/README.md b/README.md
index 48edff9..59b0b98 100644
--- a/README.md
+++ b/README.md
@@ -1,179 +1,31 @@
-_Note, this project is still very much a work in progress and in an alpha state; input and contributions welcome!_
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
-
-
- PyPI |
-
-
-
-
- |
-
-
- Conda Forge |
-
-
-
-
- |
-
-
- Package Status |
-
-
-
-
- |
-
-
- License |
-
-
-
-
- |
-
-
- Build Status |
-
-
-
-
- |
-
-
-# What is it?
-
-Eland is a Python Elasticsearch client for exploring and analyzing data
-residing in Elasticsearch with a familiar Pandas-compatible API.
+Eland is a Python Elasticsearch client for exploring and
+analyzing data in Elasticsearch with a familiar Pandas-compatible API.
Where possible the package uses existing Python APIs and data structures to make it easy to switch between numpy,
pandas, scikit-learn to their Elasticsearch powered equivalents. In general, the data resides in Elasticsearch and
not in memory, which allows Eland to access large datasets stored in Elasticsearch.
-For example, to explore data in a large Elasticsearch index, simply create an eland DataFrame from an Elasticsearch
-index pattern, and explore using an API that mirrors a subset of the pandas.DataFrame API:
+Eland also provides tools to upload trained machine learning models from
+common libraries like [scikit-learn](https://scikit-learn.org), [XGBoost](https://xgboost.readthedocs.io),
+and [LightGBM](https://lightgbm.readthedocs.io) into Elasticsearch.
-```
->>> import eland as ed
+## Getting Started
->>> # Connect to 'flights' index via localhost Elasticsearch node
->>> df = ed.DataFrame('localhost:9200', 'flights')
-
->>> df.head()
- AvgTicketPrice Cancelled ... dayOfWeek timestamp
-0 841.265642 False ... 0 2018-01-01 00:00:00
-1 882.982662 False ... 0 2018-01-01 18:27:00
-2 190.636904 False ... 0 2018-01-01 17:11:14
-3 181.694216 True ... 0 2018-01-01 10:33:28
-4 730.041778 False ... 0 2018-01-01 05:13:00
-
-[5 rows x 27 columns]
-
->>> df.describe()
- AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek
-count 13059.000000 13059.000000 ... 13059.000000 13059.000000
-mean 628.253689 7092.142457 ... 511.127842 2.835975
-std 266.386661 4578.263193 ... 334.741135 1.939365
-min 100.020531 0.000000 ... 0.000000 0.000000
-25% 410.008918 2470.545974 ... 251.739008 1.000000
-50% 640.387285 7612.072403 ... 503.148975 3.000000
-75% 842.262193 9735.660463 ... 720.505705 4.239865
-max 1199.729004 19881.482422 ... 1902.901978 6.000000
-
-[8 rows x 7 columns]
->>> df[['Carrier', 'AvgTicketPrice', 'Cancelled']]
- Carrier AvgTicketPrice Cancelled
-0 Kibana Airlines 841.265642 False
-1 Logstash Airways 882.982662 False
-2 Logstash Airways 190.636904 False
-3 Kibana Airlines 181.694216 True
-4 Kibana Airlines 730.041778 False
-... ... ... ...
-13054 Logstash Airways 1080.446279 False
-13055 Logstash Airways 646.612941 False
-13056 Logstash Airways 997.751876 False
-13057 JetBeats 1102.814465 False
-13058 JetBeats 858.144337 False
-
-[13059 rows x 3 columns]
-
->>> df[(df.Carrier=="Kibana Airlines") & (df.AvgTicketPrice > 900.0) & (df.Cancelled == True)].head()
- AvgTicketPrice Cancelled ... dayOfWeek timestamp
-8 960.869736 True ... 0 2018-01-01 12:09:35
-26 975.812632 True ... 0 2018-01-01 15:38:32
-311 946.358410 True ... 0 2018-01-01 11:51:12
-651 975.383864 True ... 2 2018-01-03 21:13:17
-950 907.836523 True ... 2 2018-01-03 05:14:51
-
-[5 rows x 27 columns]
-
->>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
- DistanceKilometers AvgTicketPrice
-sum 9.261629e+07 8.204365e+06
-min 0.000000e+00 1.000205e+02
-std 4.578263e+03 2.663867e+02
-
->>> df[['Carrier', 'Origin', 'Dest']].nunique()
-Carrier 4
-Origin 156
-Dest 156
-dtype: int64
-
->>> s = df.AvgTicketPrice * 2 + df.DistanceKilometers - df.FlightDelayMin
->>> s
-0 18174.857422
-1 10589.365723
-2 381.273804
-3 739.126221
-4 14818.327637
- ...
-13054 10219.474121
-13055 8381.823975
-13056 12661.157104
-13057 20819.488281
-13058 18315.431274
-Length: 13059, dtype: float64
->>> print(s.es_info())
-index_pattern: flights
-Index:
- index_field: _id
- is_source_field: False
-Mappings:
- capabilities:
- es_field_name is_source es_dtype es_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_es_field_name
-NaN script_field_None False double None float64 True True True script_field_None
-Operations:
- tasks: []
- size: None
- sort_params: None
- _source: ['script_field_None']
- body: {'script_fields': {'script_field_None': {'script': {'source': "(((doc['AvgTicketPrice'].value * 2) + doc['DistanceKilometers'].value) - doc['FlightDelayMin'].value)"}}}}
- post_processing: []
-
->>> pd_df = ed.eland_to_pandas(df)
->>> pd_df.head()
- AvgTicketPrice Cancelled ... dayOfWeek timestamp
-0 841.265642 False ... 0 2018-01-01 00:00:00
-1 882.982662 False ... 0 2018-01-01 18:27:00
-2 190.636904 False ... 0 2018-01-01 17:11:14
-3 181.694216 True ... 0 2018-01-01 10:33:28
-4 730.041778 False ... 0 2018-01-01 05:13:00
-
-[5 rows x 27 columns]
-```
-
-See [docs](https://eland.readthedocs.io/en/latest) and [demo_notebook.ipynb](https://eland.readthedocs.io/en/latest/examples/demo_notebook.html) for more examples.
-
-## Where to get it
-
-Eland can be installed from [PyPI](https://pypi.org/project/eland) via pip:
+Eland can be installed from [PyPI](https://pypi.org/project/eland) with Pip:
```bash
$ python -m pip install eland
@@ -185,88 +37,129 @@ Eland can also be installed from [Conda Forge](https://anaconda.org/conda-forge/
$ conda install -c conda-forge eland
```
-The [source code](https://github.com/elastic/eland) is currently available on GitHub.
+### Supported Versions
-## Versions and Compatibility
+- Supports Python 3.6+ and Pandas 1.0.0+
+- Supports Elasticsearch 7.x+ clusters; 7.6 or later is recommended for all features to work.
-### Python Version Support
+### Connecting to Elasticsearch
-Officially Python 3.6 and above.
+Eland uses the [Elasticsearch low level client](https://elasticsearch-py.readthedocs.io) to connect to Elasticsearch.
+This client supports a range of [connection options and authentication options](https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch).
-eland depends on pandas version 1.0.0+.
+You can pass either an instance of `elasticsearch.Elasticsearch` to Eland APIs
+or a string containing the host to connect to:
-### Elasticsearch Versions
+```python
+import eland as ed
-eland is versioned like the Elastic stack (eland 7.5.1 is compatible with Elasticsearch 7.x up to 7.5.1)
+# Connecting to an Elasticsearch instance running on 'localhost:9200'
+df = ed.DataFrame("localhost:9200", es_index_pattern="flights")
-A major version of the client is compatible with the same major version of Elasticsearch.
-
-No compatibility assurances are given between different major versions of the client and Elasticsearch.
-Major differences likely exist between major versions of Elasticsearch,
-particularly around request and response object formats, but also around API urls and behaviour.
-
-## Connecting to Elasticsearch
-
-eland uses the [Elasticsearch low level client](https://elasticsearch-py.readthedocs.io/) to connect to Elasticsearch.
-This client supports a range of [connection options and authentication mechanisms]
-(https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch).
-
-### Basic Connection Options
+# Connecting to an Elastic Cloud instance
+from elasticsearch import Elasticsearch
+es = Elasticsearch(
+ cloud_id="cluster-name:...",
+ http_auth=("elastic", "")
+)
+df = ed.DataFrame(es, es_index_pattern="flights")
```
+
+## DataFrames in Eland
+
+`eland.DataFrame` wraps an Elasticsearch index in a Pandas-like API
+and defers all processing and filtering of data to Elasticsearch
+instead of your local machine. This means you can process large
+amounts of data within Elasticsearch from a Jupyter Notebook
+without overloading your machine.
+
+➤ [Eland DataFrame API documentation](https://eland.readthedocs.io/en/latest/reference/dataframe.html)
+
+➤ [Advanced examples in a Jupyter Notebook](https://eland.readthedocs.io/en/latest/examples/demo_notebook.html)
+
+```python
>>> import eland as ed
->>> # Connect to flights index via localhost Elasticsearch node
->>> ed.DataFrame('localhost', 'flights')
+>>> # Connect to 'flights' index via localhost Elasticsearch node
+>>> df = ed.DataFrame('localhost:9200', 'flights')
->>> # Connect to flights index via localhost Elasticsearch node on port 9200
->>> ed.DataFrame('localhost:9200', 'flights')
+# eland.DataFrame instance has the same API as pandas.DataFrame
+# except all data is in Elasticsearch. See .info() memory usage.
+>>> df.head()
+ AvgTicketPrice Cancelled ... dayOfWeek timestamp
+0 841.265642 False ... 0 2018-01-01 00:00:00
+1 882.982662 False ... 0 2018-01-01 18:27:00
+2 190.636904 False ... 0 2018-01-01 17:11:14
+3 181.694216 True ... 0 2018-01-01 10:33:28
+4 730.041778 False ... 0 2018-01-01 05:13:00
->>> # Connect to flights index via localhost Elasticsearch node on port 9200 with : credentials
->>> ed.DataFrame('http://:@localhost:9200', 'flights')
+[5 rows x 27 columns]
->>> # Connect to flights index via ssl
->>> es = Elasticsearch(
- 'https://:@localhost:443',
- use_ssl=True,
- verify_certs=True,
- ca_certs='/path/to/ca.crt'
+>>> df.info()
+
+Index: 13059 entries, 0 to 13058
+Data columns (total 27 columns):
+ # Column Non-Null Count Dtype
+--- ------ -------------- -----
+ 0 AvgTicketPrice 13059 non-null float64
+ 1 Cancelled 13059 non-null bool
+ 2 Carrier 13059 non-null object
+...
+ 24 OriginWeather 13059 non-null object
+ 25 dayOfWeek 13059 non-null int64
+ 26 timestamp 13059 non-null datetime64[ns]
+dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)
+memory usage: 80.0 bytes
+
+# Filtering of rows using comparisons
+>>> df[(df.Carrier=="Kibana Airlines") & (df.AvgTicketPrice > 900.0) & (df.Cancelled == True)].head()
+ AvgTicketPrice Cancelled ... dayOfWeek timestamp
+8 960.869736 True ... 0 2018-01-01 12:09:35
+26 975.812632 True ... 0 2018-01-01 15:38:32
+311 946.358410 True ... 0 2018-01-01 11:51:12
+651 975.383864 True ... 2 2018-01-03 21:13:17
+950 907.836523 True ... 2 2018-01-03 05:14:51
+
+[5 rows x 27 columns]
+
+# Running aggregations across an index
+>>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
+ DistanceKilometers AvgTicketPrice
+sum 9.261629e+07 8.204365e+06
+min 0.000000e+00 1.000205e+02
+std 4.578263e+03 2.663867e+02
+```
+
+## Machine Learning in Eland
+
+Eland allows trained models from the scikit-learn, XGBoost, and LightGBM libraries
+to be serialized and used as inference models in Elasticsearch.
+
+➤ [Eland Machine Learning API documentation](https://eland.readthedocs.io/en/latest/reference/ml.html)
+
+➤ [Read more about Machine Learning in Elasticsearch](https://www.elastic.co/guide/en/machine-learning/current/ml-getting-started.html)
+
+```python
+>>> from xgboost import XGBClassifier
+>>> from eland.ml import ImportedMLModel
+
+# Train and exercise an XGBoost ML model locally
+>>> xgb_model = XGBClassifier(booster="gbtree")
+>>> xgb_model.fit(training_data[0], training_data[1])
+
+>>> xgb_model.predict(training_data[0])
+[0 1 1 0 1 0 0 0 1 0]
+
+# Import the model into Elasticsearch
+>>> es_model = ImportedMLModel(
+ es_client="localhost:9200",
+ model_id="xgb-classifier",
+ model=xgb_model,
+ feature_names=["f0", "f1", "f2", "f3", "f4"],
)
->>> ed.DataFrame(es, 'flights')
->>> # Connect to flights index via ssl using Urllib3HttpConnection options
->>> es = Elasticsearch(
- ['localhost:443', 'other_host:443'],
- use_ssl=True,
- verify_certs=True,
- ca_certs='/path/to/CA_certs',
- client_cert='/path/to/clientcert.pem',
- client_key='/path/to/clientkey.pem'
-)
->>> ed.DataFrame(es, 'flights')
+# Exercise the ML model in Elasticsearch with the training data
+>>> es_model.predict(training_data[0])
+[0 1 1 0 1 0 0 0 1 0]
```
-
-### Connecting to an Elasticsearch Cloud Cluster
-
-```
->>> import eland as ed
->>> from elasticsearch import Elasticsearch
-
->>> es = Elasticsearch(cloud_id="", http_auth=('',''))
-
->>> es.info()
-{'name': 'instance-0000000000', 'cluster_name': 'bf900cfce5684a81bca0be0cce5913bc', 'cluster_uuid': 'xLPvrV3jQNeadA7oM4l1jA', 'version': {'number': '7.4.2', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '2f90bbf7b93631e52bafb59b3b049cb44ec25e96', 'build_date': '2019-10-28T20:40:44.881551Z', 'build_snapshot': False, 'lucene_version': '8.2.0', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}
-
->>> df = ed.read_es(es, 'reviews')
-```
-
-## Why eland?
-
-Naming is difficult, but as we had to call it something:
-
-* eland: elastic and data
-* eland: 'Elk/Moose' in Dutch (Alces alces)
-* [Elandsgracht](https://goo.gl/maps/3hGBMqeGRcsBJfKx8): Amsterdam street near Elastic's Amsterdam office
-
-[Pronunciation](https://commons.wikimedia.org/wiki/File:Nl-eland.ogg): /ˈeːlɑnt/
-
diff --git a/docs/source/development/contributing.rst b/docs/source/development/contributing.rst
index 9f628a9..1ccd2f1 100644
--- a/docs/source/development/contributing.rst
+++ b/docs/source/development/contributing.rst
@@ -1,5 +1,5 @@
=====================
-Contributing to eland
+Contributing to Eland
=====================
Eland is an open source project and we love to receive contributions
diff --git a/docs/source/implementation/details.rst b/docs/source/development/implementation.rst
similarity index 99%
rename from docs/source/implementation/details.rst
rename to docs/source/development/implementation.rst
index 7149bfd..9cb1ab1 100644
--- a/docs/source/implementation/details.rst
+++ b/docs/source/development/implementation.rst
@@ -58,4 +58,3 @@ the ``pandas.DataFrame`` API. This resolves some of the issues above as:
* Creating a new ``eland.DataFrame`` API gives us full flexibility in terms of implementation. However,
it does create a large amount of work which may duplicate a lot of the ``pandas`` code - for example,
printing objects etc. - this creates maintenance issues etc.
-
diff --git a/docs/source/development/index.rst b/docs/source/development/index.rst
index 6347536..7feb0a4 100644
--- a/docs/source/development/index.rst
+++ b/docs/source/development/index.rst
@@ -8,3 +8,4 @@ Development
:maxdepth: 2
contributing.rst
+ implementation.rst
diff --git a/docs/source/examples/demo_notebook.ipynb b/docs/source/examples/demo_notebook.ipynb
index 2c55123..624cbaf 100644
--- a/docs/source/examples/demo_notebook.ipynb
+++ b/docs/source/examples/demo_notebook.ipynb
@@ -32,7 +32,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Compare eland DataFrame vs pandas DataFrame"
+ "## Compare Eland DataFrame vs pandas DataFrame"
]
},
{
@@ -52,7 +52,7 @@
},
"outputs": [],
"source": [
- "ed_flights = ed.read_es('localhost', 'flights')"
+ "ed_flights = ed.DataFrame('localhost', 'flights')"
]
},
{
diff --git a/docs/source/examples/online_retail_analysis.ipynb b/docs/source/examples/online_retail_analysis.ipynb
index 7b0bd2a..53652b2 100644
--- a/docs/source/examples/online_retail_analysis.ipynb
+++ b/docs/source/examples/online_retail_analysis.ipynb
@@ -38,14 +38,14 @@
"metadata": {},
"outputs": [],
"source": [
- "df = ed.read_csv(\"data/online-retail.csv.gz\",\n",
- " es_client='localhost', \n",
- " es_dest_index='online-retail', \n",
- " es_if_exists='replace', \n",
- " es_dropna=True,\n",
- " es_refresh=True,\n",
- " compression='gzip',\n",
- " index_col=0)"
+ "df = ed.csv_to_eland(\"data/online-retail.csv.gz\",\n",
+ " es_client='localhost', \n",
+ " es_dest_index='online-retail', \n",
+ " es_if_exists='replace', \n",
+ " es_dropna=True,\n",
+ " es_refresh=True,\n",
+ " compression='gzip',\n",
+ " index_col=0)"
]
},
{
@@ -390,7 +390,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### selecting columns\n",
+ "### Selecting columns\n",
"\n",
"you can also pass a list of columns to select columns from the data frame in a specified order."
]
diff --git a/docs/source/implementation/index.rst b/docs/source/implementation/index.rst
deleted file mode 100644
index 49447a4..0000000
--- a/docs/source/implementation/index.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-.. _implementation:
-
-====================
-Implementation Notes
-====================
-
-.. toctree::
- :maxdepth: 2
-
- details.rst
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 2d225f5..011d2b9 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,5 +1,3 @@
-.. eland documentation master file, created by
-
.. module:: eland
**************************************************************
@@ -11,10 +9,10 @@ Eland: DataFrames and Machine Learning backed by Elasticsearch
**Useful links**:
`Source Repository `__ |
`Issues & Ideas `__ |
-`Q&A Support `__ |
+`Q&A Support `__
Eland is a Python Elasticsearch client for exploring and analyzing data
-residing in Elasticsearch with a familiar Pandas-compatible API.
+in Elasticsearch with a familiar Pandas-compatible API.
Where possible the package uses existing Python APIs and data structures to make it easy to switch between numpy,
pandas, scikit-learn to their Elasticsearch powered equivalents. In general, the data resides in Elasticsearch and
@@ -48,30 +46,27 @@ If you're new to Elasticsearch we recommend `reading the documentation `_.
-See https://www.elastic.co/guide/en/machine-learning/current/setup.html and other documentation for more detail.
+See `Elasticsearch Machine Learning documentation `_ for more details.
ImportedMLModel
~~~~~~~~~~~~~~~
@@ -28,10 +29,17 @@ Constructor
ImportedMLModel
-Learning API
-^^^^^^^^^^^^
+Predictions
+^^^^^^^^^^^
.. autosummary::
:toctree: api/
ImportedMLModel.predict
+Manage Models
+^^^^^^^^^^^^^
+.. autosummary::
+ :toctree: api/
+
+ MLModel.exists_model
+ MLModel.delete_model
diff --git a/docs/source/reference/series.rst b/docs/source/reference/series.rst
index 3b646ab..b030ada 100644
--- a/docs/source/reference/series.rst
+++ b/docs/source/reference/series.rst
@@ -14,15 +14,17 @@ Constructor
Attributes and underlying data
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-**Axes**
-
.. autosummary::
:toctree: api/
Series.index
+ Series.dtype
+ Series.dtypes
Series.shape
Series.name
Series.empty
+ Series.ndim
+ Series.size
Indexing, iteration
~~~~~~~~~~~~~~~~~~~
diff --git a/setup.py b/setup.py
index ec24e27..450a7be 100644
--- a/setup.py
+++ b/setup.py
@@ -32,152 +32,34 @@ CLASSIFIERS = [
"License :: OSI Approved :: Apache Software License",
"Environment :: Console",
"Operating System :: OS Independent",
+ "Intended Audience :: Developers",
"Intended Audience :: Science/Research",
+ "Operating System :: OS Independent",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
"Topic :: Scientific/Engineering",
]
-LONG_DESCRIPTION = """
-eland is a Elasticsearch client Python package to analyse, explore and manipulate data that resides in Elasticsearch.
-Where possible the package uses existing Python APIs and data structures to make it easy to switch between numpy,
-pandas, scikit-learn to their Elasticsearch powered equivalents. In general, the data resides in Elasticsearch and
-not in memory, which allows eland to access large datasets stored in Elasticsearch.
+# Remove all raw HTML from README for long description
+with open(path.join(here, "README.md"), "r", "utf-8") as f:
+ lines = f.read().split("\n")
+ last_html_index = 0
+ for i, line in enumerate(lines):
+ if line == "":
+ last_html_index = i + 1
+ long_description = "\n".join(lines[last_html_index:])
-For example, to explore data in a large Elasticsearch index, simply create an eland DataFrame from an Elasticsearch
-index pattern, and explore using an API that mirrors a subset of the pandas.DataFrame API:
-
-```
->>> import eland as ed
-
->>> # Connect to 'flights' index via localhost Elasticsearch node
->>> df = ed.DataFrame('localhost:9200', 'flights')
-
->>> df.head()
- AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp
-0 841.265642 False Kibana Airlines ... Sunny 0 2018-01-01 00:00:00
-1 882.982662 False Logstash Airways ... Clear 0 2018-01-01 18:27:00
-2 190.636904 False Logstash Airways ... Rain 0 2018-01-01 17:11:14
-3 181.694216 True Kibana Airlines ... Thunder & Lightning 0 2018-01-01 10:33:28
-4 730.041778 False Kibana Airlines ... Damaging Wind 0 2018-01-01 05:13:00
-
-[5 rows x 27 columns]
-
->>> df.describe()
- AvgTicketPrice DistanceKilometers DistanceMiles FlightDelayMin FlightTimeHour FlightTimeMin dayOfWeek
-count 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000
-mean 628.253689 7092.142457 4406.853010 47.335171 8.518797 511.127842 2.835975
-std 266.386661 4578.263193 2844.800855 96.743006 5.579019 334.741135 1.939365
-min 100.020531 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
-25% 410.008918 2470.545974 1535.126118 0.000000 4.194976 251.738513 1.000000
-50% 640.362667 7612.072403 4729.922470 0.000000 8.385816 503.148975 3.000000
-75% 842.254990 9735.082407 6049.459005 15.000000 12.009396 720.534532 4.141221
-max 1199.729004 19881.482422 12353.780273 360.000000 31.715034 1902.901978 6.000000
-
->>> df[['Carrier', 'AvgTicketPrice', 'Cancelled']]
- Carrier AvgTicketPrice Cancelled
-0 Kibana Airlines 841.265642 False
-1 Logstash Airways 882.982662 False
-2 Logstash Airways 190.636904 False
-3 Kibana Airlines 181.694216 True
-4 Kibana Airlines 730.041778 False
-... ... ... ...
-13054 Logstash Airways 1080.446279 False
-13055 Logstash Airways 646.612941 False
-13056 Logstash Airways 997.751876 False
-13057 JetBeats 1102.814465 False
-13058 JetBeats 858.144337 False
-
-[13059 rows x 3 columns]
-
->>> df[(df.Carrier=="Kibana Airlines") & (df.AvgTicketPrice > 900.0) & (df.Cancelled == True)].head()
- AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp
-8 960.869736 True Kibana Airlines ... Heavy Fog 0 2018-01-01 12:09:35
-26 975.812632 True Kibana Airlines ... Rain 0 2018-01-01 15:38:32
-311 946.358410 True Kibana Airlines ... Heavy Fog 0 2018-01-01 11:51:12
-651 975.383864 True Kibana Airlines ... Rain 2 2018-01-03 21:13:17
-950 907.836523 True Kibana Airlines ... Thunder & Lightning 2 2018-01-03 05:14:51
-
-[5 rows x 27 columns]
-
->>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
- DistanceKilometers AvgTicketPrice
-sum 9.261629e+07 8.204365e+06
-min 0.000000e+00 1.000205e+02
-std 4.578263e+03 2.663867e+02
-
->>> df[['Carrier', 'Origin', 'Dest']].nunique()
-Carrier 4
-Origin 156
-Dest 156
-dtype: int64
-
->>> s = df.AvgTicketPrice * 2 + df.DistanceKilometers - df.FlightDelayMin
->>> s
-0 18174.857422
-1 10589.365723
-2 381.273804
-3 739.126221
-4 14818.327637
- ...
-13054 10219.474121
-13055 8381.823975
-13056 12661.157104
-13057 20819.488281
-13058 18315.431274
-Length: 13059, dtype: float64
-
->>> print(s.info_es())
-index_pattern: flights
-Index:
- index_field: _id
- is_source_field: False
-Mappings:
- capabilities:
- es_field_name is_source es_dtype es_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_es_field_name
-NaN script_field_None False double None float64 True True True script_field_None
-Operations:
- tasks: []
- size: None
- sort_params: None
- _source: ['script_field_None']
- body: {'script_fields': {'script_field_None': {'script': {'source': "(((doc['AvgTicketPrice'].value * 2) + doc['DistanceKilometers'].value) - doc['FlightDelayMin'].value)"}}}}
- post_processing: []
-
->>> pd_df = ed.eland_to_pandas(df)
->>> pd_df.head()
- AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp
-0 841.265642 False Kibana Airlines ... Sunny 0 2018-01-01 00:00:00
-1 882.982662 False Logstash Airways ... Clear 0 2018-01-01 18:27:00
-2 190.636904 False Logstash Airways ... Rain 0 2018-01-01 17:11:14
-3 181.694216 True Kibana Airlines ... Thunder & Lightning 0 2018-01-01 10:33:28
-4 730.041778 False Kibana Airlines ... Damaging Wind 0 2018-01-01 05:13:00
-
-[5 rows x 27 columns]
-```
-
-See [docs](https://eland.readthedocs.io/en/latest) and [demo_notebook.ipynb](https://eland.readthedocs.io/en/latest/examples/demo_notebook.html) for more examples.
-
-## Where to get it
-The source code is currently hosted on GitHub at:
-https://github.com/elastic/eland
-
-Binary installers for the latest released version are available at the [Python
-package index](https://pypi.org/project/eland).
-
-```sh
-pip install eland
-```
-"""
setup(
name=about["__title__"],
version=about["__version__"],
description=about["__description__"],
- long_description=LONG_DESCRIPTION,
+ long_description=long_description,
long_description_content_type="text/markdown",
url=about["__url__"],
author=about["__author__"],
diff --git a/utils/generate-supported-apis.py b/utils/generate-supported-apis.py
index d5f9119..58020da 100644
--- a/utils/generate-supported-apis.py
+++ b/utils/generate-supported-apis.py
@@ -21,8 +21,10 @@ import re
import eland
import pandas
import inspect
+from pathlib import Path
+api_docs_dir = Path(__file__).absolute().parent.parent / "docs/source/reference/api"
is_supported = []
supported_attr = re.compile(
r"(?:[a-zA-Z0-9][a-zA-Z0-9_]*|__[a-zA-Z0-9][a-zA-Z0-9_]*__)"
@@ -68,6 +70,23 @@ def main():
)
print(row_delimiter)
+ for attr, supported in is_supported:
+ if supported and "__" not in attr:
+ attr = attr.replace("ed.", "eland.").rstrip("()")
+ attr_doc_path = api_docs_dir / f"{attr}.rst"
+ if not attr_doc_path.exists():
+ with attr_doc_path.open(mode="w") as f:
+ f.truncate()
+ f.write(
+ f"""{attr}
+{'=' * len(attr)}
+
+.. currentmodule:: eland
+
+.. automethod:: { attr.replace('eland.', '') }
+"""
+ )
+
if __name__ == "__main__":
main()