diff --git a/eland/Client.py b/eland/Client.py new file mode 100644 index 0000000..e951d30 --- /dev/null +++ b/eland/Client.py @@ -0,0 +1,19 @@ +from elasticsearch import Elasticsearch + +# eland client - implement as facade to control access to Elasticsearch methods +class Client(object): + + def __init__(self, es=None): + if isinstance(es, Elasticsearch): + self.es = es + else: + self.es = Elasticsearch(es) + + def info(self): + return self.es.info() + + def indices(self): + return self.es.indices + + def search(self, **kwargs): + return self.es.search(**kwargs) diff --git a/eland/DataFrame.py b/eland/DataFrame.py new file mode 100644 index 0000000..1f49216 --- /dev/null +++ b/eland/DataFrame.py @@ -0,0 +1,100 @@ +import eland + +from elasticsearch import Elasticsearch +from elasticsearch_dsl import Search + +import pandas as pd + +class DataFrame(): + + def __init__(self, client, index_pattern): + self.client = eland.Client(client) + self.index_pattern = index_pattern + + self.client.indices().exists(index_pattern) + + @staticmethod + def _es_results_to_pandas(results): + # TODO - resolve nested fields + rows = [] + for hit in results['hits']['hits']: + row = {} + for k in hit.keys(): + if k == '_source': + row.update(hit['_source']) + rows.append(row) + return pd.DataFrame(data=rows) + + @staticmethod + def _flatten_mapping(prefix, properties, result): + for k, v in properties.items(): + if 'properties' in v: + if(prefix == ''): + prefix = k + else: + prefix = prefix + '.' + k + DataFrame._flatten_mapping(prefix, v['properties'], result) + else: + if(prefix == ''): + key = k + else: + key = prefix + '.' + k + type = v['type'] + result.append((key, type)) + + @staticmethod + def _es_mappings_to_pandas(mappings): + fields = [] + for index in mappings: + if 'properties' in mappings[index]['mappings']: + properties = mappings[index]['mappings']['properties'] + + DataFrame._flatten_mapping('', properties, fields) + + return pd.DataFrame(data=fields, columns=['field', 'datatype']) + + def head(self, n=5): + results = self.client.search(index=self.index_pattern, size=n) + + return DataFrame._es_results_to_pandas(results) + + def describe(self): + # First get all types + #mapping = self.client.indices().get_mapping(index=self.index_pattern) + mapping = self.client.indices().get_mapping(index=self.index_pattern) + + fields = DataFrame._es_mappings_to_pandas(mapping) + + # Get numeric types (https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#the-where-method-and-masking) + # https://www.elastic.co/guide/en/elasticsearch/reference/current/number.html + # TODO refactor this list out of method + numeric_fields = fields.query('datatype == ["long", "integer", "short", "byte", "double", "float", "half_float", "scaled_float"]') + + # for each field we copute: + # count, mean, std, min, 25%, 50%, 75%, max + search = Search(using=self.client, index=self.index_pattern).extra(size=0) + + for field in numeric_fields.field: + search.aggs.metric('extended_stats_'+field, 'extended_stats', field=field) + search.aggs.metric('percentiles_'+field, 'percentiles', field=field) + + response = search.execute() + + results = pd.DataFrame(index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']) + + for field in numeric_fields.field: + values = [] + values.append(response.aggregations['extended_stats_'+field]['count']) + values.append(response.aggregations['extended_stats_'+field]['avg']) + values.append(response.aggregations['extended_stats_'+field]['std_deviation']) + values.append(response.aggregations['extended_stats_'+field]['min']) + values.append(response.aggregations['percentiles_'+field]['values']['25.0']) + values.append(response.aggregations['percentiles_'+field]['values']['50.0']) + values.append(response.aggregations['percentiles_'+field]['values']['75.0']) + values.append(response.aggregations['extended_stats_'+field]['max']) + + # if not None + if (values.count(None) < len(values)): + results = results.assign(**{field: values}) + + return results diff --git a/eland/__init__.py b/eland/__init__.py new file mode 100644 index 0000000..a4723d1 --- /dev/null +++ b/eland/__init__.py @@ -0,0 +1,3 @@ +from .utils import * +from .DataFrame import * +from .Client import * diff --git a/eland/utils.py b/eland/utils.py new file mode 100644 index 0000000..28ee93e --- /dev/null +++ b/eland/utils.py @@ -0,0 +1,4 @@ +import eland + +def from_es(es_params, index_pattern): + return eland.DataFrame(es_params, index_pattern) diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 0000000..2ba1dea --- /dev/null +++ b/test.ipynb @@ -0,0 +1,429 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import eland as ed" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df = ed.from_es('localhost', 'kibana_sample_data_flights')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | AvgTicketPrice | \n", + "Cancelled | \n", + "Carrier | \n", + "Dest | \n", + "DestAirportID | \n", + "DestCityName | \n", + "DestCountry | \n", + "DestLocation | \n", + "DestRegion | \n", + "DestWeather | \n", + "... | \n", + "FlightTimeMin | \n", + "Origin | \n", + "OriginAirportID | \n", + "OriginCityName | \n", + "OriginCountry | \n", + "OriginLocation | \n", + "OriginRegion | \n", + "OriginWeather | \n", + "dayOfWeek | \n", + "timestamp | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "841.265642 | \n", + "False | \n", + "Kibana Airlines | \n", + "Sydney Kingsford Smith International Airport | \n", + "SYD | \n", + "Sydney | \n", + "AU | \n", + "{'lat': '-33.94609833', 'lon': '151.177002'} | \n", + "SE-BD | \n", + "Rain | \n", + "... | \n", + "1030.770416 | \n", + "Frankfurt am Main Airport | \n", + "FRA | \n", + "Frankfurt am Main | \n", + "DE | \n", + "{'lat': '50.033333', 'lon': '8.570556'} | \n", + "DE-HE | \n", + "Sunny | \n", + "0 | \n", + "2019-05-27T00:00:00 | \n", + "
1 | \n", + "882.982662 | \n", + "False | \n", + "Logstash Airways | \n", + "Venice Marco Polo Airport | \n", + "VE05 | \n", + "Venice | \n", + "IT | \n", + "{'lat': '45.505299', 'lon': '12.3519'} | \n", + "IT-34 | \n", + "Sunny | \n", + "... | \n", + "464.389481 | \n", + "Cape Town International Airport | \n", + "CPT | \n", + "Cape Town | \n", + "ZA | \n", + "{'lat': '-33.96480179', 'lon': '18.60169983'} | \n", + "SE-BD | \n", + "Clear | \n", + "0 | \n", + "2019-05-27T18:27:00 | \n", + "
2 | \n", + "190.636904 | \n", + "False | \n", + "Logstash Airways | \n", + "Venice Marco Polo Airport | \n", + "VE05 | \n", + "Venice | \n", + "IT | \n", + "{'lat': '45.505299', 'lon': '12.3519'} | \n", + "IT-34 | \n", + "Cloudy | \n", + "... | \n", + "0.000000 | \n", + "Venice Marco Polo Airport | \n", + "VE05 | \n", + "Venice | \n", + "IT | \n", + "{'lat': '45.505299', 'lon': '12.3519'} | \n", + "IT-34 | \n", + "Rain | \n", + "0 | \n", + "2019-05-27T17:11:14 | \n", + "
3 | \n", + "181.694216 | \n", + "True | \n", + "Kibana Airlines | \n", + "Treviso-Sant'Angelo Airport | \n", + "TV01 | \n", + "Treviso | \n", + "IT | \n", + "{'lat': '45.648399', 'lon': '12.1944'} | \n", + "IT-34 | \n", + "Clear | \n", + "... | \n", + "222.749059 | \n", + "Naples International Airport | \n", + "NA01 | \n", + "Naples | \n", + "IT | \n", + "{'lat': '40.886002', 'lon': '14.2908'} | \n", + "IT-72 | \n", + "Thunder & Lightning | \n", + "0 | \n", + "2019-05-27T10:33:28 | \n", + "
4 | \n", + "730.041778 | \n", + "False | \n", + "Kibana Airlines | \n", + "Xi'an Xianyang International Airport | \n", + "XIY | \n", + "Xi'an | \n", + "CN | \n", + "{'lat': '34.447102', 'lon': '108.751999'} | \n", + "SE-BD | \n", + "Clear | \n", + "... | \n", + "785.779071 | \n", + "Licenciado Benito Juarez International Airport | \n", + "AICM | \n", + "Mexico City | \n", + "MX | \n", + "{'lat': '19.4363', 'lon': '-99.072098'} | \n", + "MX-DIF | \n", + "Damaging Wind | \n", + "0 | \n", + "2019-05-27T05:13:00 | \n", + "
5 rows × 27 columns
\n", + "\n", + " | AvgTicketPrice | \n", + "DistanceKilometers | \n", + "DistanceMiles | \n", + "FlightDelayMin | \n", + "FlightTimeMin | \n", + "dayOfWeek | \n", + "
---|---|---|---|---|---|---|
count | \n", + "13059.000000 | \n", + "13059.000000 | \n", + "13059.000000 | \n", + "13059.000000 | \n", + "13059.000000 | \n", + "13059.000000 | \n", + "
mean | \n", + "628.253689 | \n", + "7092.142457 | \n", + "4406.853010 | \n", + "47.335171 | \n", + "511.127842 | \n", + "2.835975 | \n", + "
std | \n", + "266.386661 | \n", + "4578.263193 | \n", + "2844.800855 | \n", + "96.743006 | \n", + "334.741135 | \n", + "1.939365 | \n", + "
min | \n", + "100.020531 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "
25% | \n", + "410.008918 | \n", + "2470.545974 | \n", + "1535.126118 | \n", + "0.000000 | \n", + "251.773003 | \n", + "1.000000 | \n", + "
50% | \n", + "640.362667 | \n", + "7612.072403 | \n", + "4729.922470 | \n", + "0.000000 | \n", + "503.148975 | \n", + "3.000000 | \n", + "
75% | \n", + "842.233478 | \n", + "9735.887390 | \n", + "6049.459005 | \n", + "15.000000 | \n", + "720.534532 | \n", + "4.095833 | \n", + "
max | \n", + "1199.729004 | \n", + "19881.482422 | \n", + "12353.780273 | \n", + "360.000000 | \n", + "1902.901978 | \n", + "6.000000 | \n", + "