First prototype code commit

Experimental prototype, for internal development use only!
This commit is contained in:
Stephen Dodson 2019-06-12 11:46:20 +00:00
parent 90649f66f0
commit f1e27f1dda
6 changed files with 562 additions and 0 deletions

19
eland/Client.py Normal file
View File

@ -0,0 +1,19 @@
from elasticsearch import Elasticsearch
# eland client - implement as facade to control access to Elasticsearch methods
class Client(object):
def __init__(self, es=None):
if isinstance(es, Elasticsearch):
self.es = es
else:
self.es = Elasticsearch(es)
def info(self):
return self.es.info()
def indices(self):
return self.es.indices
def search(self, **kwargs):
return self.es.search(**kwargs)

100
eland/DataFrame.py Normal file
View File

@ -0,0 +1,100 @@
import eland
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import pandas as pd
class DataFrame():
def __init__(self, client, index_pattern):
self.client = eland.Client(client)
self.index_pattern = index_pattern
self.client.indices().exists(index_pattern)
@staticmethod
def _es_results_to_pandas(results):
# TODO - resolve nested fields
rows = []
for hit in results['hits']['hits']:
row = {}
for k in hit.keys():
if k == '_source':
row.update(hit['_source'])
rows.append(row)
return pd.DataFrame(data=rows)
@staticmethod
def _flatten_mapping(prefix, properties, result):
for k, v in properties.items():
if 'properties' in v:
if(prefix == ''):
prefix = k
else:
prefix = prefix + '.' + k
DataFrame._flatten_mapping(prefix, v['properties'], result)
else:
if(prefix == ''):
key = k
else:
key = prefix + '.' + k
type = v['type']
result.append((key, type))
@staticmethod
def _es_mappings_to_pandas(mappings):
fields = []
for index in mappings:
if 'properties' in mappings[index]['mappings']:
properties = mappings[index]['mappings']['properties']
DataFrame._flatten_mapping('', properties, fields)
return pd.DataFrame(data=fields, columns=['field', 'datatype'])
def head(self, n=5):
results = self.client.search(index=self.index_pattern, size=n)
return DataFrame._es_results_to_pandas(results)
def describe(self):
# First get all types
#mapping = self.client.indices().get_mapping(index=self.index_pattern)
mapping = self.client.indices().get_mapping(index=self.index_pattern)
fields = DataFrame._es_mappings_to_pandas(mapping)
# Get numeric types (https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#the-where-method-and-masking)
# https://www.elastic.co/guide/en/elasticsearch/reference/current/number.html
# TODO refactor this list out of method
numeric_fields = fields.query('datatype == ["long", "integer", "short", "byte", "double", "float", "half_float", "scaled_float"]')
# for each field we copute:
# count, mean, std, min, 25%, 50%, 75%, max
search = Search(using=self.client, index=self.index_pattern).extra(size=0)
for field in numeric_fields.field:
search.aggs.metric('extended_stats_'+field, 'extended_stats', field=field)
search.aggs.metric('percentiles_'+field, 'percentiles', field=field)
response = search.execute()
results = pd.DataFrame(index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
for field in numeric_fields.field:
values = []
values.append(response.aggregations['extended_stats_'+field]['count'])
values.append(response.aggregations['extended_stats_'+field]['avg'])
values.append(response.aggregations['extended_stats_'+field]['std_deviation'])
values.append(response.aggregations['extended_stats_'+field]['min'])
values.append(response.aggregations['percentiles_'+field]['values']['25.0'])
values.append(response.aggregations['percentiles_'+field]['values']['50.0'])
values.append(response.aggregations['percentiles_'+field]['values']['75.0'])
values.append(response.aggregations['extended_stats_'+field]['max'])
# if not None
if (values.count(None) < len(values)):
results = results.assign(**{field: values})
return results

3
eland/__init__.py Normal file
View File

@ -0,0 +1,3 @@
from .utils import *
from .DataFrame import *
from .Client import *

4
eland/utils.py Normal file
View File

@ -0,0 +1,4 @@
import eland
def from_es(es_params, index_pattern):
return eland.DataFrame(es_params, index_pattern)

429
test.ipynb Normal file
View File

@ -0,0 +1,429 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import eland as ed"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df = ed.from_es('localhost', 'kibana_sample_data_flights')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>AvgTicketPrice</th>\n",
" <th>Cancelled</th>\n",
" <th>Carrier</th>\n",
" <th>Dest</th>\n",
" <th>DestAirportID</th>\n",
" <th>DestCityName</th>\n",
" <th>DestCountry</th>\n",
" <th>DestLocation</th>\n",
" <th>DestRegion</th>\n",
" <th>DestWeather</th>\n",
" <th>...</th>\n",
" <th>FlightTimeMin</th>\n",
" <th>Origin</th>\n",
" <th>OriginAirportID</th>\n",
" <th>OriginCityName</th>\n",
" <th>OriginCountry</th>\n",
" <th>OriginLocation</th>\n",
" <th>OriginRegion</th>\n",
" <th>OriginWeather</th>\n",
" <th>dayOfWeek</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>841.265642</td>\n",
" <td>False</td>\n",
" <td>Kibana Airlines</td>\n",
" <td>Sydney Kingsford Smith International Airport</td>\n",
" <td>SYD</td>\n",
" <td>Sydney</td>\n",
" <td>AU</td>\n",
" <td>{'lat': '-33.94609833', 'lon': '151.177002'}</td>\n",
" <td>SE-BD</td>\n",
" <td>Rain</td>\n",
" <td>...</td>\n",
" <td>1030.770416</td>\n",
" <td>Frankfurt am Main Airport</td>\n",
" <td>FRA</td>\n",
" <td>Frankfurt am Main</td>\n",
" <td>DE</td>\n",
" <td>{'lat': '50.033333', 'lon': '8.570556'}</td>\n",
" <td>DE-HE</td>\n",
" <td>Sunny</td>\n",
" <td>0</td>\n",
" <td>2019-05-27T00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>882.982662</td>\n",
" <td>False</td>\n",
" <td>Logstash Airways</td>\n",
" <td>Venice Marco Polo Airport</td>\n",
" <td>VE05</td>\n",
" <td>Venice</td>\n",
" <td>IT</td>\n",
" <td>{'lat': '45.505299', 'lon': '12.3519'}</td>\n",
" <td>IT-34</td>\n",
" <td>Sunny</td>\n",
" <td>...</td>\n",
" <td>464.389481</td>\n",
" <td>Cape Town International Airport</td>\n",
" <td>CPT</td>\n",
" <td>Cape Town</td>\n",
" <td>ZA</td>\n",
" <td>{'lat': '-33.96480179', 'lon': '18.60169983'}</td>\n",
" <td>SE-BD</td>\n",
" <td>Clear</td>\n",
" <td>0</td>\n",
" <td>2019-05-27T18:27:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>190.636904</td>\n",
" <td>False</td>\n",
" <td>Logstash Airways</td>\n",
" <td>Venice Marco Polo Airport</td>\n",
" <td>VE05</td>\n",
" <td>Venice</td>\n",
" <td>IT</td>\n",
" <td>{'lat': '45.505299', 'lon': '12.3519'}</td>\n",
" <td>IT-34</td>\n",
" <td>Cloudy</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>Venice Marco Polo Airport</td>\n",
" <td>VE05</td>\n",
" <td>Venice</td>\n",
" <td>IT</td>\n",
" <td>{'lat': '45.505299', 'lon': '12.3519'}</td>\n",
" <td>IT-34</td>\n",
" <td>Rain</td>\n",
" <td>0</td>\n",
" <td>2019-05-27T17:11:14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>181.694216</td>\n",
" <td>True</td>\n",
" <td>Kibana Airlines</td>\n",
" <td>Treviso-Sant'Angelo Airport</td>\n",
" <td>TV01</td>\n",
" <td>Treviso</td>\n",
" <td>IT</td>\n",
" <td>{'lat': '45.648399', 'lon': '12.1944'}</td>\n",
" <td>IT-34</td>\n",
" <td>Clear</td>\n",
" <td>...</td>\n",
" <td>222.749059</td>\n",
" <td>Naples International Airport</td>\n",
" <td>NA01</td>\n",
" <td>Naples</td>\n",
" <td>IT</td>\n",
" <td>{'lat': '40.886002', 'lon': '14.2908'}</td>\n",
" <td>IT-72</td>\n",
" <td>Thunder &amp; Lightning</td>\n",
" <td>0</td>\n",
" <td>2019-05-27T10:33:28</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>730.041778</td>\n",
" <td>False</td>\n",
" <td>Kibana Airlines</td>\n",
" <td>Xi'an Xianyang International Airport</td>\n",
" <td>XIY</td>\n",
" <td>Xi'an</td>\n",
" <td>CN</td>\n",
" <td>{'lat': '34.447102', 'lon': '108.751999'}</td>\n",
" <td>SE-BD</td>\n",
" <td>Clear</td>\n",
" <td>...</td>\n",
" <td>785.779071</td>\n",
" <td>Licenciado Benito Juarez International Airport</td>\n",
" <td>AICM</td>\n",
" <td>Mexico City</td>\n",
" <td>MX</td>\n",
" <td>{'lat': '19.4363', 'lon': '-99.072098'}</td>\n",
" <td>MX-DIF</td>\n",
" <td>Damaging Wind</td>\n",
" <td>0</td>\n",
" <td>2019-05-27T05:13:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 27 columns</p>\n",
"</div>"
],
"text/plain": [
" AvgTicketPrice Cancelled Carrier \\\n",
"0 841.265642 False Kibana Airlines \n",
"1 882.982662 False Logstash Airways \n",
"2 190.636904 False Logstash Airways \n",
"3 181.694216 True Kibana Airlines \n",
"4 730.041778 False Kibana Airlines \n",
"\n",
" Dest DestAirportID DestCityName \\\n",
"0 Sydney Kingsford Smith International Airport SYD Sydney \n",
"1 Venice Marco Polo Airport VE05 Venice \n",
"2 Venice Marco Polo Airport VE05 Venice \n",
"3 Treviso-Sant'Angelo Airport TV01 Treviso \n",
"4 Xi'an Xianyang International Airport XIY Xi'an \n",
"\n",
" DestCountry DestLocation DestRegion \\\n",
"0 AU {'lat': '-33.94609833', 'lon': '151.177002'} SE-BD \n",
"1 IT {'lat': '45.505299', 'lon': '12.3519'} IT-34 \n",
"2 IT {'lat': '45.505299', 'lon': '12.3519'} IT-34 \n",
"3 IT {'lat': '45.648399', 'lon': '12.1944'} IT-34 \n",
"4 CN {'lat': '34.447102', 'lon': '108.751999'} SE-BD \n",
"\n",
" DestWeather ... FlightTimeMin \\\n",
"0 Rain ... 1030.770416 \n",
"1 Sunny ... 464.389481 \n",
"2 Cloudy ... 0.000000 \n",
"3 Clear ... 222.749059 \n",
"4 Clear ... 785.779071 \n",
"\n",
" Origin OriginAirportID \\\n",
"0 Frankfurt am Main Airport FRA \n",
"1 Cape Town International Airport CPT \n",
"2 Venice Marco Polo Airport VE05 \n",
"3 Naples International Airport NA01 \n",
"4 Licenciado Benito Juarez International Airport AICM \n",
"\n",
" OriginCityName OriginCountry \\\n",
"0 Frankfurt am Main DE \n",
"1 Cape Town ZA \n",
"2 Venice IT \n",
"3 Naples IT \n",
"4 Mexico City MX \n",
"\n",
" OriginLocation OriginRegion \\\n",
"0 {'lat': '50.033333', 'lon': '8.570556'} DE-HE \n",
"1 {'lat': '-33.96480179', 'lon': '18.60169983'} SE-BD \n",
"2 {'lat': '45.505299', 'lon': '12.3519'} IT-34 \n",
"3 {'lat': '40.886002', 'lon': '14.2908'} IT-72 \n",
"4 {'lat': '19.4363', 'lon': '-99.072098'} MX-DIF \n",
"\n",
" OriginWeather dayOfWeek timestamp \n",
"0 Sunny 0 2019-05-27T00:00:00 \n",
"1 Clear 0 2019-05-27T18:27:00 \n",
"2 Rain 0 2019-05-27T17:11:14 \n",
"3 Thunder & Lightning 0 2019-05-27T10:33:28 \n",
"4 Damaging Wind 0 2019-05-27T05:13:00 \n",
"\n",
"[5 rows x 27 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>AvgTicketPrice</th>\n",
" <th>DistanceKilometers</th>\n",
" <th>DistanceMiles</th>\n",
" <th>FlightDelayMin</th>\n",
" <th>FlightTimeMin</th>\n",
" <th>dayOfWeek</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>13059.000000</td>\n",
" <td>13059.000000</td>\n",
" <td>13059.000000</td>\n",
" <td>13059.000000</td>\n",
" <td>13059.000000</td>\n",
" <td>13059.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>628.253689</td>\n",
" <td>7092.142457</td>\n",
" <td>4406.853010</td>\n",
" <td>47.335171</td>\n",
" <td>511.127842</td>\n",
" <td>2.835975</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>266.386661</td>\n",
" <td>4578.263193</td>\n",
" <td>2844.800855</td>\n",
" <td>96.743006</td>\n",
" <td>334.741135</td>\n",
" <td>1.939365</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>100.020531</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>410.008918</td>\n",
" <td>2470.545974</td>\n",
" <td>1535.126118</td>\n",
" <td>0.000000</td>\n",
" <td>251.773003</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>640.362667</td>\n",
" <td>7612.072403</td>\n",
" <td>4729.922470</td>\n",
" <td>0.000000</td>\n",
" <td>503.148975</td>\n",
" <td>3.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>842.233478</td>\n",
" <td>9735.887390</td>\n",
" <td>6049.459005</td>\n",
" <td>15.000000</td>\n",
" <td>720.534532</td>\n",
" <td>4.095833</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1199.729004</td>\n",
" <td>19881.482422</td>\n",
" <td>12353.780273</td>\n",
" <td>360.000000</td>\n",
" <td>1902.901978</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" AvgTicketPrice DistanceKilometers DistanceMiles FlightDelayMin \\\n",
"count 13059.000000 13059.000000 13059.000000 13059.000000 \n",
"mean 628.253689 7092.142457 4406.853010 47.335171 \n",
"std 266.386661 4578.263193 2844.800855 96.743006 \n",
"min 100.020531 0.000000 0.000000 0.000000 \n",
"25% 410.008918 2470.545974 1535.126118 0.000000 \n",
"50% 640.362667 7612.072403 4729.922470 0.000000 \n",
"75% 842.233478 9735.887390 6049.459005 15.000000 \n",
"max 1199.729004 19881.482422 12353.780273 360.000000 \n",
"\n",
" FlightTimeMin dayOfWeek \n",
"count 13059.000000 13059.000000 \n",
"mean 511.127842 2.835975 \n",
"std 334.741135 1.939365 \n",
"min 0.000000 0.000000 \n",
"25% 251.773003 1.000000 \n",
"50% 503.148975 3.000000 \n",
"75% 720.534532 4.095833 \n",
"max 1902.901978 6.000000 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

7
test.py Normal file
View File

@ -0,0 +1,7 @@
import eland as ed
df = ed.from_es('localhost', 'kibana_sample_data_flights')
print(df.head())
print(df.describe())