mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
199 lines
8.7 KiB
Python
199 lines
8.7 KiB
Python
# Licensed to Elasticsearch B.V. under one or more contributor
|
|
# license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright
|
|
# ownership. Elasticsearch B.V. licenses this file to you under
|
|
# the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# flake8: noqa
|
|
|
|
from codecs import open
|
|
from os import path
|
|
|
|
from setuptools import setup, find_packages
|
|
|
|
here = path.abspath(path.dirname(__file__))
|
|
about = {}
|
|
with open(path.join(here, "eland", "_version.py"), "r", "utf-8") as f:
|
|
exec(f.read(), about)
|
|
|
|
CLASSIFIERS = [
|
|
"Development Status :: 3 - Alpha",
|
|
"License :: OSI Approved :: Apache Software License",
|
|
"Environment :: Console",
|
|
"Operating System :: OS Independent",
|
|
"Intended Audience :: Science/Research",
|
|
"Programming Language :: Python",
|
|
"Programming Language :: Python :: 3",
|
|
"Programming Language :: Python :: 3.6",
|
|
"Programming Language :: Python :: 3.7",
|
|
"Programming Language :: Python :: 3.8",
|
|
"Topic :: Scientific/Engineering",
|
|
]
|
|
|
|
LONG_DESCRIPTION = """
|
|
eland is a Elasticsearch client Python package to analyse, explore and manipulate data that resides in Elasticsearch.
|
|
Where possible the package uses existing Python APIs and data structures to make it easy to switch between numpy,
|
|
pandas, scikit-learn to their Elasticsearch powered equivalents. In general, the data resides in Elasticsearch and
|
|
not in memory, which allows eland to access large datasets stored in Elasticsearch.
|
|
|
|
For example, to explore data in a large Elasticsearch index, simply create an eland DataFrame from an Elasticsearch
|
|
index pattern, and explore using an API that mirrors a subset of the pandas.DataFrame API:
|
|
|
|
```
|
|
>>> import eland as ed
|
|
|
|
>>> # Connect to 'flights' index via localhost Elasticsearch node
|
|
>>> df = ed.DataFrame('localhost:9200', 'flights')
|
|
|
|
>>> df.head()
|
|
AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp
|
|
0 841.265642 False Kibana Airlines ... Sunny 0 2018-01-01 00:00:00
|
|
1 882.982662 False Logstash Airways ... Clear 0 2018-01-01 18:27:00
|
|
2 190.636904 False Logstash Airways ... Rain 0 2018-01-01 17:11:14
|
|
3 181.694216 True Kibana Airlines ... Thunder & Lightning 0 2018-01-01 10:33:28
|
|
4 730.041778 False Kibana Airlines ... Damaging Wind 0 2018-01-01 05:13:00
|
|
|
|
[5 rows x 27 columns]
|
|
|
|
>>> df.describe()
|
|
AvgTicketPrice DistanceKilometers DistanceMiles FlightDelayMin FlightTimeHour FlightTimeMin dayOfWeek
|
|
count 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000
|
|
mean 628.253689 7092.142457 4406.853010 47.335171 8.518797 511.127842 2.835975
|
|
std 266.386661 4578.263193 2844.800855 96.743006 5.579019 334.741135 1.939365
|
|
min 100.020531 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
|
|
25% 410.008918 2470.545974 1535.126118 0.000000 4.194976 251.738513 1.000000
|
|
50% 640.362667 7612.072403 4729.922470 0.000000 8.385816 503.148975 3.000000
|
|
75% 842.254990 9735.082407 6049.459005 15.000000 12.009396 720.534532 4.141221
|
|
max 1199.729004 19881.482422 12353.780273 360.000000 31.715034 1902.901978 6.000000
|
|
|
|
>>> df[['Carrier', 'AvgTicketPrice', 'Cancelled']]
|
|
Carrier AvgTicketPrice Cancelled
|
|
0 Kibana Airlines 841.265642 False
|
|
1 Logstash Airways 882.982662 False
|
|
2 Logstash Airways 190.636904 False
|
|
3 Kibana Airlines 181.694216 True
|
|
4 Kibana Airlines 730.041778 False
|
|
... ... ... ...
|
|
13054 Logstash Airways 1080.446279 False
|
|
13055 Logstash Airways 646.612941 False
|
|
13056 Logstash Airways 997.751876 False
|
|
13057 JetBeats 1102.814465 False
|
|
13058 JetBeats 858.144337 False
|
|
|
|
[13059 rows x 3 columns]
|
|
|
|
>>> df[(df.Carrier=="Kibana Airlines") & (df.AvgTicketPrice > 900.0) & (df.Cancelled == True)].head()
|
|
AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp
|
|
8 960.869736 True Kibana Airlines ... Heavy Fog 0 2018-01-01 12:09:35
|
|
26 975.812632 True Kibana Airlines ... Rain 0 2018-01-01 15:38:32
|
|
311 946.358410 True Kibana Airlines ... Heavy Fog 0 2018-01-01 11:51:12
|
|
651 975.383864 True Kibana Airlines ... Rain 2 2018-01-03 21:13:17
|
|
950 907.836523 True Kibana Airlines ... Thunder & Lightning 2 2018-01-03 05:14:51
|
|
|
|
[5 rows x 27 columns]
|
|
|
|
>>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
|
|
DistanceKilometers AvgTicketPrice
|
|
sum 9.261629e+07 8.204365e+06
|
|
min 0.000000e+00 1.000205e+02
|
|
std 4.578263e+03 2.663867e+02
|
|
|
|
>>> df[['Carrier', 'Origin', 'Dest']].nunique()
|
|
Carrier 4
|
|
Origin 156
|
|
Dest 156
|
|
dtype: int64
|
|
|
|
>>> s = df.AvgTicketPrice * 2 + df.DistanceKilometers - df.FlightDelayMin
|
|
>>> s
|
|
0 18174.857422
|
|
1 10589.365723
|
|
2 381.273804
|
|
3 739.126221
|
|
4 14818.327637
|
|
...
|
|
13054 10219.474121
|
|
13055 8381.823975
|
|
13056 12661.157104
|
|
13057 20819.488281
|
|
13058 18315.431274
|
|
Length: 13059, dtype: float64
|
|
|
|
>>> print(s.info_es())
|
|
index_pattern: flights
|
|
Index:
|
|
index_field: _id
|
|
is_source_field: False
|
|
Mappings:
|
|
capabilities:
|
|
es_field_name is_source es_dtype es_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_es_field_name
|
|
NaN script_field_None False double None float64 True True True script_field_None
|
|
Operations:
|
|
tasks: []
|
|
size: None
|
|
sort_params: None
|
|
_source: ['script_field_None']
|
|
body: {'script_fields': {'script_field_None': {'script': {'source': "(((doc['AvgTicketPrice'].value * 2) + doc['DistanceKilometers'].value) - doc['FlightDelayMin'].value)"}}}}
|
|
post_processing: []
|
|
|
|
>>> pd_df = ed.eland_to_pandas(df)
|
|
>>> pd_df.head()
|
|
AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp
|
|
0 841.265642 False Kibana Airlines ... Sunny 0 2018-01-01 00:00:00
|
|
1 882.982662 False Logstash Airways ... Clear 0 2018-01-01 18:27:00
|
|
2 190.636904 False Logstash Airways ... Rain 0 2018-01-01 17:11:14
|
|
3 181.694216 True Kibana Airlines ... Thunder & Lightning 0 2018-01-01 10:33:28
|
|
4 730.041778 False Kibana Airlines ... Damaging Wind 0 2018-01-01 05:13:00
|
|
|
|
[5 rows x 27 columns]
|
|
```
|
|
|
|
See [docs](https://eland.readthedocs.io/en/latest) and [demo_notebook.ipynb](https://eland.readthedocs.io/en/latest/examples/demo_notebook.html) for more examples.
|
|
|
|
## Where to get it
|
|
The source code is currently hosted on GitHub at:
|
|
https://github.com/elastic/eland
|
|
|
|
Binary installers for the latest released version are available at the [Python
|
|
package index](https://pypi.org/project/eland).
|
|
|
|
```sh
|
|
pip install eland
|
|
```
|
|
"""
|
|
|
|
setup(
|
|
name=about["__title__"],
|
|
version=about["__version__"],
|
|
description=about["__description__"],
|
|
long_description=LONG_DESCRIPTION,
|
|
long_description_content_type="text/markdown",
|
|
url=about["__url__"],
|
|
author=about["__author__"],
|
|
author_email=about["__author_email__"],
|
|
maintainer=about["__maintainer__"],
|
|
maintainer_email=about["__maintainer_email__"],
|
|
license="Apache-2.0",
|
|
classifiers=CLASSIFIERS,
|
|
keywords="elastic eland pandas python",
|
|
packages=find_packages(include=["eland", "eland.*"]),
|
|
install_requires=["elasticsearch>=7.7", "pandas>=1", "matplotlib", "numpy"],
|
|
python_requires=">=3.6",
|
|
extras_require={
|
|
"xgboost": ["xgboost>=0.90,<2"],
|
|
"scikit-learn": ["scikit-learn>=0.22.1,<1"],
|
|
"lightgbm": ["lightgbm>=2,<2.4"],
|
|
},
|
|
)
|