"""eland: a small pandas-flavoured facade over Elasticsearch indices.

Usage, as exercised by the accompanying notebook and test script::

    import eland as ed

    df = ed.from_es('localhost', 'kibana_sample_data_flights')
    print(df.head())
    print(df.describe())
"""
import pandas as pd

# Elasticsearch numeric field types that describe() aggregates over.
# https://www.elastic.co/guide/en/elasticsearch/reference/current/number.html
_NUMERIC_ES_TYPES = (
    'long', 'integer', 'short', 'byte',
    'double', 'float', 'half_float', 'scaled_float',
)

# Row labels of the frame returned by describe(), in output order.
_DESCRIBE_ROWS = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']


class Client(object):
    """Facade that controls which Elasticsearch methods eland exposes.

    Attributes are limited to ``es``, the wrapped ``Elasticsearch`` instance.
    """

    def __init__(self, es=None):
        """Wrap an existing connection or create a new one.

        Args:
            es: an ``Elasticsearch`` instance (used as-is), another ``Client``
                (its connection is reused), or anything accepted by the
                ``Elasticsearch`` constructor (e.g. a host name, or ``None``).
        """
        # Imported here rather than at module level so that the pure mapping
        # helpers below stay importable without the client library installed.
        from elasticsearch import Elasticsearch

        if isinstance(es, Client):
            # Avoid wrapping a facade in Elasticsearch(...), which would treat
            # the facade object as host configuration.
            self.es = es.es
        elif isinstance(es, Elasticsearch):
            self.es = es
        else:
            self.es = Elasticsearch(es)

    def info(self):
        """Return the result of ``Elasticsearch.info()``."""
        return self.es.info()

    def indices(self):
        """Return the wrapped client's ``indices`` namespace object."""
        return self.es.indices

    def search(self, **kwargs):
        """Forward all keyword arguments to ``Elasticsearch.search``."""
        return self.es.search(**kwargs)


class DataFrame(object):
    """Read-only, pandas-like view of the indices matching ``index_pattern``."""

    def __init__(self, client, index_pattern):
        """Bind the view to a connection and an index pattern.

        Args:
            client: anything accepted by ``Client`` (see ``Client.__init__``).
            index_pattern: index name or pattern passed to every request.
        """
        self.client = Client(client)
        self.index_pattern = index_pattern

        # NOTE(review): the boolean result is discarded, so a missing index is
        # not reported here; the call only surfaces connection-level errors.
        self.client.indices().exists(index_pattern)

    @staticmethod
    def _es_results_to_pandas(results):
        """Convert a search response into a ``pandas.DataFrame``.

        One row is produced per hit, built from the hit's ``_source`` document;
        a hit without ``_source`` yields an empty row.

        Args:
            results: search response with a ``['hits']['hits']`` list.
        """
        # TODO - resolve nested fields
        rows = [dict(hit.get('_source') or {}) for hit in results['hits']['hits']]
        return pd.DataFrame(data=rows)

    @staticmethod
    def _flatten_mapping(prefix, properties, result):
        """Append ``(dotted_field_name, datatype)`` tuples to ``result``.

        Object fields (entries carrying their own ``properties``) are walked
        recursively and contribute only their leaf fields.

        Args:
            prefix: dotted path of the enclosing object, ``''`` at top level.
            properties: the ``properties`` dict of a mapping (or sub-object).
            result: list that is extended in place.
        """
        for name, definition in properties.items():
            # Build the path per entry; never overwrite ``prefix`` itself, or
            # later siblings would inherit an earlier sibling's path.
            path = name if prefix == '' else prefix + '.' + name
            if 'properties' in definition:
                DataFrame._flatten_mapping(path, definition['properties'], result)
            else:
                result.append((path, definition['type']))

    @staticmethod
    def _es_mappings_to_pandas(mappings):
        """Flatten a get-mapping response into a ``field``/``datatype`` frame.

        Args:
            mappings: dict keyed by index name, each value holding a
                ``'mappings'`` dict; indices without ``'properties'`` under it
                contribute nothing.

        Returns:
            ``pandas.DataFrame`` with columns ``field`` and ``datatype``; a
            (field, datatype) pair shared by several indices appears once.
        """
        fields = []
        for index in mappings:
            index_mappings = mappings[index]['mappings']
            if 'properties' in index_mappings:
                DataFrame._flatten_mapping('', index_mappings['properties'], fields)

        frame = pd.DataFrame(data=fields, columns=['field', 'datatype'])
        # A pattern can match many indices that share the same fields.
        return frame.drop_duplicates().reset_index(drop=True)

    def head(self, n=5):
        """Return the first ``n`` hits of an unfiltered search as a frame."""
        results = self.client.search(index=self.index_pattern, size=n)

        return DataFrame._es_results_to_pandas(results)

    def describe(self):
        """Summarise every numeric field, in the spirit of pandas ``describe``.

        Issues one search with ``size=0`` carrying an ``extended_stats`` and a
        ``percentiles`` aggregation per numeric field.

        Returns:
            ``pandas.DataFrame`` indexed by count, mean, std, min, 25%, 50%,
            75%, max with one column per numeric field. A field whose eight
            values are all ``None`` is omitted.
        """
        from elasticsearch_dsl import Search

        # First get all field types from the index mapping.
        mapping = self.client.indices().get_mapping(index=self.index_pattern)
        fields = DataFrame._es_mappings_to_pandas(mapping)

        numeric_fields = fields[fields.datatype.isin(_NUMERIC_ES_TYPES)]

        # For each field we compute:
        # count, mean, std, min, 25%, 50%, 75%, max
        search = Search(using=self.client, index=self.index_pattern).extra(size=0)

        for field in numeric_fields.field:
            search.aggs.metric('extended_stats_' + field, 'extended_stats', field=field)
            search.aggs.metric('percentiles_' + field, 'percentiles', field=field)

        response = search.execute()

        results = pd.DataFrame(index=_DESCRIBE_ROWS)

        for field in numeric_fields.field:
            stats = response.aggregations['extended_stats_' + field]
            percentiles = response.aggregations['percentiles_' + field]['values']
            # Order must match _DESCRIBE_ROWS.
            values = [
                stats['count'],
                stats['avg'],
                stats['std_deviation'],
                stats['min'],
                percentiles['25.0'],
                percentiles['50.0'],
                percentiles['75.0'],
                stats['max'],
            ]

            # Skip fields for which Elasticsearch returned nothing at all.
            if any(value is not None for value in values):
                results = results.assign(**{field: values})

        return results


def from_es(es_params, index_pattern):
    """Create a ``DataFrame`` over ``index_pattern`` using ``es_params``.

    Args:
        es_params: connection argument forwarded to ``Client``.
        index_pattern: index name or pattern to read from.
    """
    return DataFrame(es_params, index_pattern)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AvgTicketPriceCancelledCarrierDestDestAirportIDDestCityNameDestCountryDestLocationDestRegionDestWeather...FlightTimeMinOriginOriginAirportIDOriginCityNameOriginCountryOriginLocationOriginRegionOriginWeatherdayOfWeektimestamp
0841.265642FalseKibana AirlinesSydney Kingsford Smith International AirportSYDSydneyAU{'lat': '-33.94609833', 'lon': '151.177002'}SE-BDRain...1030.770416Frankfurt am Main AirportFRAFrankfurt am MainDE{'lat': '50.033333', 'lon': '8.570556'}DE-HESunny02019-05-27T00:00:00
1882.982662FalseLogstash AirwaysVenice Marco Polo AirportVE05VeniceIT{'lat': '45.505299', 'lon': '12.3519'}IT-34Sunny...464.389481Cape Town International AirportCPTCape TownZA{'lat': '-33.96480179', 'lon': '18.60169983'}SE-BDClear02019-05-27T18:27:00
2190.636904FalseLogstash AirwaysVenice Marco Polo AirportVE05VeniceIT{'lat': '45.505299', 'lon': '12.3519'}IT-34Cloudy...0.000000Venice Marco Polo AirportVE05VeniceIT{'lat': '45.505299', 'lon': '12.3519'}IT-34Rain02019-05-27T17:11:14
3181.694216TrueKibana AirlinesTreviso-Sant'Angelo AirportTV01TrevisoIT{'lat': '45.648399', 'lon': '12.1944'}IT-34Clear...222.749059Naples International AirportNA01NaplesIT{'lat': '40.886002', 'lon': '14.2908'}IT-72Thunder & Lightning02019-05-27T10:33:28
4730.041778FalseKibana AirlinesXi'an Xianyang International AirportXIYXi'anCN{'lat': '34.447102', 'lon': '108.751999'}SE-BDClear...785.779071Licenciado Benito Juarez International AirportAICMMexico CityMX{'lat': '19.4363', 'lon': '-99.072098'}MX-DIFDamaging Wind02019-05-27T05:13:00
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " AvgTicketPrice Cancelled Carrier \\\n", + "0 841.265642 False Kibana Airlines \n", + "1 882.982662 False Logstash Airways \n", + "2 190.636904 False Logstash Airways \n", + "3 181.694216 True Kibana Airlines \n", + "4 730.041778 False Kibana Airlines \n", + "\n", + " Dest DestAirportID DestCityName \\\n", + "0 Sydney Kingsford Smith International Airport SYD Sydney \n", + "1 Venice Marco Polo Airport VE05 Venice \n", + "2 Venice Marco Polo Airport VE05 Venice \n", + "3 Treviso-Sant'Angelo Airport TV01 Treviso \n", + "4 Xi'an Xianyang International Airport XIY Xi'an \n", + "\n", + " DestCountry DestLocation DestRegion \\\n", + "0 AU {'lat': '-33.94609833', 'lon': '151.177002'} SE-BD \n", + "1 IT {'lat': '45.505299', 'lon': '12.3519'} IT-34 \n", + "2 IT {'lat': '45.505299', 'lon': '12.3519'} IT-34 \n", + "3 IT {'lat': '45.648399', 'lon': '12.1944'} IT-34 \n", + "4 CN {'lat': '34.447102', 'lon': '108.751999'} SE-BD \n", + "\n", + " DestWeather ... FlightTimeMin \\\n", + "0 Rain ... 1030.770416 \n", + "1 Sunny ... 464.389481 \n", + "2 Cloudy ... 0.000000 \n", + "3 Clear ... 222.749059 \n", + "4 Clear ... 
785.779071 \n", + "\n", + " Origin OriginAirportID \\\n", + "0 Frankfurt am Main Airport FRA \n", + "1 Cape Town International Airport CPT \n", + "2 Venice Marco Polo Airport VE05 \n", + "3 Naples International Airport NA01 \n", + "4 Licenciado Benito Juarez International Airport AICM \n", + "\n", + " OriginCityName OriginCountry \\\n", + "0 Frankfurt am Main DE \n", + "1 Cape Town ZA \n", + "2 Venice IT \n", + "3 Naples IT \n", + "4 Mexico City MX \n", + "\n", + " OriginLocation OriginRegion \\\n", + "0 {'lat': '50.033333', 'lon': '8.570556'} DE-HE \n", + "1 {'lat': '-33.96480179', 'lon': '18.60169983'} SE-BD \n", + "2 {'lat': '45.505299', 'lon': '12.3519'} IT-34 \n", + "3 {'lat': '40.886002', 'lon': '14.2908'} IT-72 \n", + "4 {'lat': '19.4363', 'lon': '-99.072098'} MX-DIF \n", + "\n", + " OriginWeather dayOfWeek timestamp \n", + "0 Sunny 0 2019-05-27T00:00:00 \n", + "1 Clear 0 2019-05-27T18:27:00 \n", + "2 Rain 0 2019-05-27T17:11:14 \n", + "3 Thunder & Lightning 0 2019-05-27T10:33:28 \n", + "4 Damaging Wind 0 2019-05-27T05:13:00 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AvgTicketPriceDistanceKilometersDistanceMilesFlightDelayMinFlightTimeMindayOfWeek
count13059.00000013059.00000013059.00000013059.00000013059.00000013059.000000
mean628.2536897092.1424574406.85301047.335171511.1278422.835975
std266.3866614578.2631932844.80085596.743006334.7411351.939365
min100.0205310.0000000.0000000.0000000.0000000.000000
25%410.0089182470.5459741535.1261180.000000251.7730031.000000
50%640.3626677612.0724034729.9224700.000000503.1489753.000000
75%842.2334789735.8873906049.45900515.000000720.5345324.095833
max1199.72900419881.48242212353.780273360.0000001902.9019786.000000
\n", + "
" + ], + "text/plain": [ + " AvgTicketPrice DistanceKilometers DistanceMiles FlightDelayMin \\\n", + "count 13059.000000 13059.000000 13059.000000 13059.000000 \n", + "mean 628.253689 7092.142457 4406.853010 47.335171 \n", + "std 266.386661 4578.263193 2844.800855 96.743006 \n", + "min 100.020531 0.000000 0.000000 0.000000 \n", + "25% 410.008918 2470.545974 1535.126118 0.000000 \n", + "50% 640.362667 7612.072403 4729.922470 0.000000 \n", + "75% 842.233478 9735.887390 6049.459005 15.000000 \n", + "max 1199.729004 19881.482422 12353.780273 360.000000 \n", + "\n", + " FlightTimeMin dayOfWeek \n", + "count 13059.000000 13059.000000 \n", + "mean 511.127842 2.835975 \n", + "std 334.741135 1.939365 \n", + "min 0.000000 0.000000 \n", + "25% 251.773003 1.000000 \n", + "50% 503.148975 3.000000 \n", + "75% 720.534532 4.095833 \n", + "max 1902.901978 6.000000 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/test.py b/test.py new file mode 100644 index 0000000..30f2cf9 --- /dev/null +++ b/test.py @@ -0,0 +1,7 @@ +import eland as ed + +df = ed.from_es('localhost', 'kibana_sample_data_flights') + +print(df.head()) + +print(df.describe())