{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Eland Demo Notebook" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [ "import eland as ed\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "from elasticsearch import Elasticsearch\n", "\n", "# Import standard test settings for consistent results\n", "from eland.conftest import *" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Compare eland DataFrame vs pandas DataFrame" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create an eland.DataFrame from a `flights` index" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [ "ed_flights = ed.read_es('localhost', 'flights')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "eland.dataframe.DataFrame" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(ed_flights)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compare to pandas DataFrame (created from the same data)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [ "pd_flights = ed.eland_to_pandas(ed_flights)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "pandas.core.frame.DataFrame" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(pd_flights)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Attributes and underlying data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.columns" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { 
"pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',\n", " 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',\n", " 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',\n", " 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',\n", " 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',\n", " 'timestamp'],\n", " dtype='object')" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.columns" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',\n", " 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',\n", " 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',\n", " 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',\n", " 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',\n", " 'timestamp'],\n", " dtype='object')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.dtypes" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "AvgTicketPrice float64\n", "Cancelled bool\n", "Carrier object\n", "Dest object\n", "DestAirportID object\n", " ... 
\n", "OriginLocation object\n", "OriginRegion object\n", "OriginWeather object\n", "dayOfWeek int64\n", "timestamp datetime64[ns]\n", "Length: 27, dtype: object" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.dtypes" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "AvgTicketPrice float64\n", "Cancelled bool\n", "Carrier object\n", "Dest object\n", "DestAirportID object\n", " ... \n", "OriginLocation object\n", "OriginRegion object\n", "OriginWeather object\n", "dayOfWeek int64\n", "timestamp datetime64[ns]\n", "Length: 27, dtype: object" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.select_dtypes" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AvgTicketPriceDistanceKilometers...FlightTimeMindayOfWeek
0841.26564216492.326654...1030.7704160
1882.9826628823.400140...464.3894810
2190.6369040.000000...0.0000000
3181.694216555.737767...222.7490590
4730.04177813358.244200...785.7790710
..................
130541080.4462798058.581753...402.9290886
13055646.6129417088.598322...644.4180296
13056997.75187610920.652972...937.5408116
130571102.81446518748.859647...1697.4049716
13058858.14433716809.141923...1610.7618276
\n", "

13059 rows × 7 columns

\n", "
" ], "text/plain": [ " AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n", "0 841.265642 16492.326654 ... 1030.770416 0\n", "1 882.982662 8823.400140 ... 464.389481 0\n", "2 190.636904 0.000000 ... 0.000000 0\n", "3 181.694216 555.737767 ... 222.749059 0\n", "4 730.041778 13358.244200 ... 785.779071 0\n", "... ... ... ... ... ...\n", "13054 1080.446279 8058.581753 ... 402.929088 6\n", "13055 646.612941 7088.598322 ... 644.418029 6\n", "13056 997.751876 10920.652972 ... 937.540811 6\n", "13057 1102.814465 18748.859647 ... 1697.404971 6\n", "13058 858.144337 16809.141923 ... 1610.761827 6\n", "\n", "[13059 rows x 7 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.select_dtypes(include=np.number)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AvgTicketPriceDistanceKilometers...FlightTimeMindayOfWeek
0841.26564216492.326654...1030.7704160
1882.9826628823.400140...464.3894810
2190.6369040.000000...0.0000000
3181.694216555.737767...222.7490590
4730.04177813358.244200...785.7790710
..................
130541080.4462798058.581753...402.9290886
13055646.6129417088.598322...644.4180296
13056997.75187610920.652972...937.5408116
130571102.81446518748.859647...1697.4049716
13058858.14433716809.141923...1610.7618276
\n", "
\n", "

13059 rows × 7 columns

" ], "text/plain": [ " AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n", "0 841.265642 16492.326654 ... 1030.770416 0\n", "1 882.982662 8823.400140 ... 464.389481 0\n", "2 190.636904 0.000000 ... 0.000000 0\n", "3 181.694216 555.737767 ... 222.749059 0\n", "4 730.041778 13358.244200 ... 785.779071 0\n", "... ... ... ... ... ...\n", "13054 1080.446279 8058.581753 ... 402.929088 6\n", "13055 646.612941 7088.598322 ... 644.418029 6\n", "13056 997.751876 10920.652972 ... 937.540811 6\n", "13057 1102.814465 18748.859647 ... 1697.404971 6\n", "13058 858.144337 16809.141923 ... 1610.761827 6\n", "\n", "[13059 rows x 7 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.select_dtypes(include=np.number)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.empty" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.empty" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.empty" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.shape" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "(13059, 27)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.shape" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "(13059, 27)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], 
"source": [ "ed_flights.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.index\n", "\n", "Note, `eland.DataFrame.index` does not mirror `pandas.DataFrame.index`. " ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',\n", " ...\n", " '13049', '13050', '13051', '13052', '13053', '13054', '13055', '13056', '13057', '13058'],\n", " dtype='object', length=13059)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.index" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# NBVAL_IGNORE_OUTPUT\n", "ed_flights.index" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "'_id'" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.index.es_index_field" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.values\n", "\n", "Note, `eland.DataFrame.values` is not supported." 
] }, { "cell_type": "code", "execution_count": 19, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "array([[841.2656419677076, False, 'Kibana Airlines', ..., 'Sunny', 0,\n", " Timestamp('2018-01-01 00:00:00')],\n", " [882.9826615595518, False, 'Logstash Airways', ..., 'Clear', 0,\n", " Timestamp('2018-01-01 18:27:00')],\n", " [190.6369038508356, False, 'Logstash Airways', ..., 'Rain', 0,\n", " Timestamp('2018-01-01 17:11:14')],\n", " ...,\n", " [997.7518761454494, False, 'Logstash Airways', ..., 'Sunny', 6,\n", " Timestamp('2018-02-11 04:09:27')],\n", " [1102.8144645388556, False, 'JetBeats', ..., 'Hail', 6,\n", " Timestamp('2018-02-11 08:28:21')],\n", " [858.1443369038839, False, 'JetBeats', ..., 'Rain', 6,\n", " Timestamp('2018-02-11 14:54:34')]], dtype=object)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.values" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "This method would scan/scroll the entire Elasticsearch index(s) into memory. If this is explicitly required, and there is sufficient memory, call `ed.eland_to_pandas(ed_df).values`\n" ] } ], "source": [ "try:\n", " ed_flights.values\n", "except AttributeError as e:\n", " print(e)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Indexing, iteration" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.head" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AvgTicketPriceCancelled...dayOfWeektimestamp
0841.265642False...02018-01-01 00:00:00
1882.982662False...02018-01-01 18:27:00
2190.636904False...02018-01-01 17:11:14
3181.694216True...02018-01-01 10:33:28
4730.041778False...02018-01-01 05:13:00
\n", "

5 rows × 27 columns

\n", "
" ], "text/plain": [ " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", "0 841.265642 False ... 0 2018-01-01 00:00:00\n", "1 882.982662 False ... 0 2018-01-01 18:27:00\n", "2 190.636904 False ... 0 2018-01-01 17:11:14\n", "3 181.694216 True ... 0 2018-01-01 10:33:28\n", "4 730.041778 False ... 0 2018-01-01 05:13:00\n", "\n", "[5 rows x 27 columns]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.head()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AvgTicketPriceCancelled...dayOfWeektimestamp
0841.265642False...02018-01-01 00:00:00
1882.982662False...02018-01-01 18:27:00
2190.636904False...02018-01-01 17:11:14
3181.694216True...02018-01-01 10:33:28
4730.041778False...02018-01-01 05:13:00
\n", "
\n", "

5 rows × 27 columns

" ], "text/plain": [ " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", "0 841.265642 False ... 0 2018-01-01 00:00:00\n", "1 882.982662 False ... 0 2018-01-01 18:27:00\n", "2 190.636904 False ... 0 2018-01-01 17:11:14\n", "3 181.694216 True ... 0 2018-01-01 10:33:28\n", "4 730.041778 False ... 0 2018-01-01 05:13:00\n", "\n", "[5 rows x 27 columns]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.tail" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AvgTicketPriceCancelled...dayOfWeektimestamp
130541080.446279False...62018-02-11 20:42:25
13055646.612941False...62018-02-11 01:41:57
13056997.751876False...62018-02-11 04:09:27
130571102.814465False...62018-02-11 08:28:21
13058858.144337False...62018-02-11 14:54:34
\n", "

5 rows × 27 columns

\n", "
" ], "text/plain": [ " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", "13054 1080.446279 False ... 6 2018-02-11 20:42:25\n", "13055 646.612941 False ... 6 2018-02-11 01:41:57\n", "13056 997.751876 False ... 6 2018-02-11 04:09:27\n", "13057 1102.814465 False ... 6 2018-02-11 08:28:21\n", "13058 858.144337 False ... 6 2018-02-11 14:54:34\n", "\n", "[5 rows x 27 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.tail()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AvgTicketPriceCancelled...dayOfWeektimestamp
130541080.446279False...62018-02-11 20:42:25
13055646.612941False...62018-02-11 01:41:57
13056997.751876False...62018-02-11 04:09:27
130571102.814465False...62018-02-11 08:28:21
13058858.144337False...62018-02-11 14:54:34
\n", "
\n", "

5 rows × 27 columns

" ], "text/plain": [ " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", "13054 1080.446279 False ... 6 2018-02-11 20:42:25\n", "13055 646.612941 False ... 6 2018-02-11 01:41:57\n", "13056 997.751876 False ... 6 2018-02-11 04:09:27\n", "13057 1102.814465 False ... 6 2018-02-11 08:28:21\n", "13058 858.144337 False ... 6 2018-02-11 14:54:34\n", "\n", "[5 rows x 27 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.keys" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',\n", " 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',\n", " 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',\n", " 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',\n", " 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',\n", " 'timestamp'],\n", " dtype='object')" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.keys()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',\n", " 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',\n", " 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',\n", " 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',\n", " 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',\n", " 'timestamp'],\n", " dtype='object')" ] }, "execution_count": 26, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "ed_flights.keys()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.get" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "0 Kibana Airlines\n", "1 Logstash Airways\n", "2 Logstash Airways\n", "3 Kibana Airlines\n", "4 Kibana Airlines\n", " ... \n", "13054 Logstash Airways\n", "13055 Logstash Airways\n", "13056 Logstash Airways\n", "13057 JetBeats\n", "13058 JetBeats\n", "Name: Carrier, Length: 13059, dtype: object" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.get('Carrier')" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "0 Kibana Airlines\n", "1 Logstash Airways\n", "2 Logstash Airways\n", "3 Kibana Airlines\n", "4 Kibana Airlines\n", " ... \n", "13054 Logstash Airways\n", "13055 Logstash Airways\n", "13056 Logstash Airways\n", "13057 JetBeats\n", "13058 JetBeats\n", "Name: Carrier, Length: 13059, dtype: object" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.get('Carrier')" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CarrierOrigin
0Kibana AirlinesFrankfurt am Main Airport
1Logstash AirwaysCape Town International Airport
2Logstash AirwaysVenice Marco Polo Airport
3Kibana AirlinesNaples International Airport
4Kibana AirlinesLicenciado Benito Juarez International Airport
.........
13054Logstash AirwaysPisa International Airport
13055Logstash AirwaysWinnipeg / James Armstrong Richardson Internat...
13056Logstash AirwaysLicenciado Benito Juarez International Airport
13057JetBeatsItami Airport
13058JetBeatsAdelaide International Airport
\n", "

13059 rows × 2 columns

\n", "
" ], "text/plain": [ " Carrier Origin\n", "0 Kibana Airlines Frankfurt am Main Airport\n", "1 Logstash Airways Cape Town International Airport\n", "2 Logstash Airways Venice Marco Polo Airport\n", "3 Kibana Airlines Naples International Airport\n", "4 Kibana Airlines Licenciado Benito Juarez International Airport\n", "... ... ...\n", "13054 Logstash Airways Pisa International Airport\n", "13055 Logstash Airways Winnipeg / James Armstrong Richardson Internat...\n", "13056 Logstash Airways Licenciado Benito Juarez International Airport\n", "13057 JetBeats Itami Airport\n", "13058 JetBeats Adelaide International Airport\n", "\n", "[13059 rows x 2 columns]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.get(['Carrier', 'Origin'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "List input not currently supported by `eland.DataFrame.get`" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "unhashable type: 'list'\n" ] } ], "source": [ "try:\n", " ed_flights.get(['Carrier', 'Origin'])\n", "except TypeError as e:\n", " print(e)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.query" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AvgTicketPriceCancelled...dayOfWeektimestamp
8960.869736True...02018-01-01 12:09:35
26975.812632True...02018-01-01 15:38:32
311946.358410True...02018-01-01 11:51:12
651975.383864True...22018-01-03 21:13:17
950907.836523True...22018-01-03 05:14:51
..................
12820909.973606True...52018-02-10 05:11:35
12906983.429244True...62018-02-11 06:19:58
129181136.678150True...62018-02-11 16:03:10
129191105.211803True...62018-02-11 05:36:05
130131055.350213True...62018-02-11 13:20:16
\n", "

68 rows × 27 columns

\n", "
" ], "text/plain": [ " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", "8 960.869736 True ... 0 2018-01-01 12:09:35\n", "26 975.812632 True ... 0 2018-01-01 15:38:32\n", "311 946.358410 True ... 0 2018-01-01 11:51:12\n", "651 975.383864 True ... 2 2018-01-03 21:13:17\n", "950 907.836523 True ... 2 2018-01-03 05:14:51\n", "... ... ... ... ... ...\n", "12820 909.973606 True ... 5 2018-02-10 05:11:35\n", "12906 983.429244 True ... 6 2018-02-11 06:19:58\n", "12918 1136.678150 True ... 6 2018-02-11 16:03:10\n", "12919 1105.211803 True ... 6 2018-02-11 05:36:05\n", "13013 1055.350213 True ... 6 2018-02-11 13:20:16\n", "\n", "[68 rows x 27 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled == True')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`eland.DataFrame.query` requires an explicit comparison on boolean columns (e.g. `Cancelled == True`), i.e.\n", "\n", "`ed_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled')` fails" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AvgTicketPriceCancelled...dayOfWeektimestamp
8960.869736True...02018-01-01 12:09:35
26975.812632True...02018-01-01 15:38:32
311946.358410True...02018-01-01 11:51:12
651975.383864True...22018-01-03 21:13:17
950907.836523True...22018-01-03 05:14:51
..................
12820909.973606True...52018-02-10 05:11:35
12906983.429244True...62018-02-11 06:19:58
129181136.678150True...62018-02-11 16:03:10
129191105.211803True...62018-02-11 05:36:05
130131055.350213True...62018-02-11 13:20:16
\n", "
\n", "

68 rows × 27 columns

" ], "text/plain": [ " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", "8 960.869736 True ... 0 2018-01-01 12:09:35\n", "26 975.812632 True ... 0 2018-01-01 15:38:32\n", "311 946.358410 True ... 0 2018-01-01 11:51:12\n", "651 975.383864 True ... 2 2018-01-03 21:13:17\n", "950 907.836523 True ... 2 2018-01-03 05:14:51\n", "... ... ... ... ... ...\n", "12820 909.973606 True ... 5 2018-02-10 05:11:35\n", "12906 983.429244 True ... 6 2018-02-11 06:19:58\n", "12918 1136.678150 True ... 6 2018-02-11 16:03:10\n", "12919 1105.211803 True ... 6 2018-02-11 05:36:05\n", "13013 1055.350213 True ... 6 2018-02-11 13:20:16\n", "\n", "[68 rows x 27 columns]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled == True')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Boolean indexing query" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AvgTicketPriceCancelled...dayOfWeektimestamp
8960.869736True...02018-01-01 12:09:35
26975.812632True...02018-01-01 15:38:32
311946.358410True...02018-01-01 11:51:12
651975.383864True...22018-01-03 21:13:17
950907.836523True...22018-01-03 05:14:51
..................
12820909.973606True...52018-02-10 05:11:35
12906983.429244True...62018-02-11 06:19:58
129181136.678150True...62018-02-11 16:03:10
129191105.211803True...62018-02-11 05:36:05
130131055.350213True...62018-02-11 13:20:16
\n", "

68 rows × 27 columns

\n", "
" ], "text/plain": [ " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", "8 960.869736 True ... 0 2018-01-01 12:09:35\n", "26 975.812632 True ... 0 2018-01-01 15:38:32\n", "311 946.358410 True ... 0 2018-01-01 11:51:12\n", "651 975.383864 True ... 2 2018-01-03 21:13:17\n", "950 907.836523 True ... 2 2018-01-03 05:14:51\n", "... ... ... ... ... ...\n", "12820 909.973606 True ... 5 2018-02-10 05:11:35\n", "12906 983.429244 True ... 6 2018-02-11 06:19:58\n", "12918 1136.678150 True ... 6 2018-02-11 16:03:10\n", "12919 1105.211803 True ... 6 2018-02-11 05:36:05\n", "13013 1055.350213 True ... 6 2018-02-11 13:20:16\n", "\n", "[68 rows x 27 columns]" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights[(pd_flights.Carrier==\"Kibana Airlines\") & \n", " (pd_flights.AvgTicketPrice > 900.0) &\n", " (pd_flights.Cancelled == True)]" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AvgTicketPriceCancelled...dayOfWeektimestamp
8960.869736True...02018-01-01 12:09:35
26975.812632True...02018-01-01 15:38:32
311946.358410True...02018-01-01 11:51:12
651975.383864True...22018-01-03 21:13:17
950907.836523True...22018-01-03 05:14:51
..................
12820909.973606True...52018-02-10 05:11:35
12906983.429244True...62018-02-11 06:19:58
129181136.678150True...62018-02-11 16:03:10
129191105.211803True...62018-02-11 05:36:05
130131055.350213True...62018-02-11 13:20:16
\n", "
\n", "

68 rows × 27 columns

" ], "text/plain": [ " AvgTicketPrice Cancelled ... dayOfWeek timestamp\n", "8 960.869736 True ... 0 2018-01-01 12:09:35\n", "26 975.812632 True ... 0 2018-01-01 15:38:32\n", "311 946.358410 True ... 0 2018-01-01 11:51:12\n", "651 975.383864 True ... 2 2018-01-03 21:13:17\n", "950 907.836523 True ... 2 2018-01-03 05:14:51\n", "... ... ... ... ... ...\n", "12820 909.973606 True ... 5 2018-02-10 05:11:35\n", "12906 983.429244 True ... 6 2018-02-11 06:19:58\n", "12918 1136.678150 True ... 6 2018-02-11 16:03:10\n", "12919 1105.211803 True ... 6 2018-02-11 05:36:05\n", "13013 1055.350213 True ... 6 2018-02-11 13:20:16\n", "\n", "[68 rows x 27 columns]" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights[(ed_flights.Carrier==\"Kibana Airlines\") & \n", " (ed_flights.AvgTicketPrice > 900.0) &\n", " (ed_flights.Cancelled == True)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Function application, GroupBy & window" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.aggregate" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DistanceKilometersAvgTicketPrice
sum9.261629e+078.204365e+06
min0.000000e+001.000205e+02
std4.578438e+032.663969e+02
\n", "
" ], "text/plain": [ " DistanceKilometers AvgTicketPrice\n", "sum 9.261629e+07 8.204365e+06\n", "min 0.000000e+00 1.000205e+02\n", "std 4.578438e+03 2.663969e+02" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`eland.DataFrame.aggregate` currently only supports numeric columns" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DistanceKilometersAvgTicketPrice
sum9.261629e+078.204365e+06
min0.000000e+001.000205e+02
std4.578614e+032.664071e+02
\n", "
" ], "text/plain": [ " DistanceKilometers AvgTicketPrice\n", "sum 9.261629e+07 8.204365e+06\n", "min 0.000000e+00 1.000205e+02\n", "std 4.578614e+03 2.664071e+02" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Computations / descriptive stats" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.count" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "AvgTicketPrice 13059\n", "Cancelled 13059\n", "Carrier 13059\n", "Dest 13059\n", "DestAirportID 13059\n", " ... \n", "OriginLocation 13059\n", "OriginRegion 13059\n", "OriginWeather 13059\n", "dayOfWeek 13059\n", "timestamp 13059\n", "Length: 27, dtype: int64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.count()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "AvgTicketPrice 13059\n", "Cancelled 13059\n", "Carrier 13059\n", "Dest 13059\n", "DestAirportID 13059\n", " ... \n", "OriginLocation 13059\n", "OriginRegion 13059\n", "OriginWeather 13059\n", "dayOfWeek 13059\n", "timestamp 13059\n", "Length: 27, dtype: int64" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.describe" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AvgTicketPriceDistanceKilometers...FlightTimeMindayOfWeek
count13059.00000013059.000000...13059.00000013059.000000
mean628.2536897092.142455...511.1278422.835975
std266.3968614578.438497...334.7539521.939439
min100.0205280.000000...0.0000000.000000
25%409.8938162459.705673...252.3331921.000000
50%640.5566687610.330866...503.0451703.000000
75%842.1854709736.637600...720.4160364.000000
max1199.72905319881.482315...1902.9020326.000000
\n", "

8 rows × 7 columns

\n", "
" ], "text/plain": [ " AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n", "count 13059.000000 13059.000000 ... 13059.000000 13059.000000\n", "mean 628.253689 7092.142455 ... 511.127842 2.835975\n", "std 266.396861 4578.438497 ... 334.753952 1.939439\n", "min 100.020528 0.000000 ... 0.000000 0.000000\n", "25% 409.893816 2459.705673 ... 252.333192 1.000000\n", "50% 640.556668 7610.330866 ... 503.045170 3.000000\n", "75% 842.185470 9736.637600 ... 720.416036 4.000000\n", "max 1199.729053 19881.482315 ... 1902.902032 6.000000\n", "\n", "[8 rows x 7 columns]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Values returned from `eland.DataFrame.describe` may differ slightly from the pandas values because some Elasticsearch aggregations (such as percentiles) return approximate results." ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AvgTicketPriceDistanceKilometers...FlightTimeMindayOfWeek
count13059.00000013059.000000...13059.00000013059.000000
mean628.2536897092.142457...511.1278422.835975
std266.3866614578.263193...334.7411351.939365
min100.0205310.000000...0.0000000.000000
25%410.0089182470.545974...252.0641621.000000
50%640.3872857612.072403...503.1489753.000000
75%842.2275939735.860651...720.5119684.068548
max1199.72900419881.482422...1902.9019786.000000
\n", "

8 rows × 7 columns

\n", "
" ], "text/plain": [ " AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n", "count 13059.000000 13059.000000 ... 13059.000000 13059.000000\n", "mean 628.253689 7092.142457 ... 511.127842 2.835975\n", "std 266.386661 4578.263193 ... 334.741135 1.939365\n", "min 100.020531 0.000000 ... 0.000000 0.000000\n", "25% 410.008918 2470.545974 ... 252.064162 1.000000\n", "50% 640.387285 7612.072403 ... 503.148975 3.000000\n", "75% 842.227593 9735.860651 ... 720.511968 4.068548\n", "max 1199.729004 19881.482422 ... 1902.901978 6.000000\n", "\n", "[8 rows x 7 columns]" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# NBVAL_IGNORE_OUTPUT\n", "ed_flights.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.info" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 13059 entries, 0 to 13058\n", "Data columns (total 27 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 AvgTicketPrice 13059 non-null float64 \n", " 1 Cancelled 13059 non-null bool \n", " 2 Carrier 13059 non-null object \n", " 3 Dest 13059 non-null object \n", " 4 DestAirportID 13059 non-null object \n", " 5 DestCityName 13059 non-null object \n", " 6 DestCountry 13059 non-null object \n", " 7 DestLocation 13059 non-null object \n", " 8 DestRegion 13059 non-null object \n", " 9 DestWeather 13059 non-null object \n", " 10 DistanceKilometers 13059 non-null float64 \n", " 11 DistanceMiles 13059 non-null float64 \n", " 12 FlightDelay 13059 non-null bool \n", " 13 FlightDelayMin 13059 non-null int64 \n", " 14 FlightDelayType 13059 non-null object \n", " 15 FlightNum 13059 non-null object \n", " 16 FlightTimeHour 13059 non-null float64 \n", " 17 FlightTimeMin 13059 non-null float64 \n", " 18 Origin 13059 non-null object \n", " 19 OriginAirportID 13059 non-null 
object \n", " 20 OriginCityName 13059 non-null object \n", " 21 OriginCountry 13059 non-null object \n", " 22 OriginLocation 13059 non-null object \n", " 23 OriginRegion 13059 non-null object \n", " 24 OriginWeather 13059 non-null object \n", " 25 dayOfWeek 13059 non-null int64 \n", " 26 timestamp 13059 non-null datetime64[ns]\n", "dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)\n", "memory usage: 3.2+ MB\n" ] } ], "source": [ "pd_flights.info()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 13059 entries, 0 to 13058\n", "Data columns (total 27 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 AvgTicketPrice 13059 non-null float64 \n", " 1 Cancelled 13059 non-null bool \n", " 2 Carrier 13059 non-null object \n", " 3 Dest 13059 non-null object \n", " 4 DestAirportID 13059 non-null object \n", " 5 DestCityName 13059 non-null object \n", " 6 DestCountry 13059 non-null object \n", " 7 DestLocation 13059 non-null object \n", " 8 DestRegion 13059 non-null object \n", " 9 DestWeather 13059 non-null object \n", " 10 DistanceKilometers 13059 non-null float64 \n", " 11 DistanceMiles 13059 non-null float64 \n", " 12 FlightDelay 13059 non-null bool \n", " 13 FlightDelayMin 13059 non-null int64 \n", " 14 FlightDelayType 13059 non-null object \n", " 15 FlightNum 13059 non-null object \n", " 16 FlightTimeHour 13059 non-null float64 \n", " 17 FlightTimeMin 13059 non-null float64 \n", " 18 Origin 13059 non-null object \n", " 19 OriginAirportID 13059 non-null object \n", " 20 OriginCityName 13059 non-null object \n", " 21 OriginCountry 13059 non-null object \n", " 22 OriginLocation 13059 non-null object \n", " 23 OriginRegion 13059 non-null object \n", " 24 OriginWeather 13059 non-null object \n", " 25 dayOfWeek 13059 non-null int64 \n", " 26 timestamp 13059 non-null 
datetime64[ns]\n", "dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)\n", "memory usage: 80.0 bytes\n" ] } ], "source": [ "ed_flights.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.max, DataFrame.min, DataFrame.mean, DataFrame.sum" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### max" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "AvgTicketPrice 1199.73\n", "Cancelled True\n", "DistanceKilometers 19881.5\n", "DistanceMiles 12353.8\n", "FlightDelay True\n", "FlightDelayMin 360\n", "FlightTimeHour 31.715\n", "FlightTimeMin 1902.9\n", "dayOfWeek 6\n", "dtype: object" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.max(numeric_only=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`eland.DataFrame.max,min,mean,sum` only aggregate numeric columns" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "AvgTicketPrice 1199.73\n", "Cancelled True\n", "DistanceKilometers 19881.5\n", "DistanceMiles 12353.8\n", "FlightDelay True\n", "FlightDelayMin 360\n", "FlightTimeHour 31.715\n", "FlightTimeMin 1902.9\n", "dayOfWeek 6\n", "dtype: object" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.max(numeric_only=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### min" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "AvgTicketPrice 100.021\n", "Cancelled False\n", "DistanceKilometers 0\n", "DistanceMiles 0\n", "FlightDelay False\n", "FlightDelayMin 0\n", "FlightTimeHour 0\n", "FlightTimeMin 0\n", "dayOfWeek 0\n", "dtype: object" ] }, "execution_count": 45, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "pd_flights.min(numeric_only=True)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "AvgTicketPrice 100.021\n", "Cancelled False\n", "DistanceKilometers 0\n", "DistanceMiles 0\n", "FlightDelay False\n", "FlightDelayMin 0\n", "FlightTimeHour 0\n", "FlightTimeMin 0\n", "dayOfWeek 0\n", "dtype: object" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.min(numeric_only=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### mean" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "AvgTicketPrice 628.253689\n", "Cancelled 0.128494\n", "DistanceKilometers 7092.142455\n", "DistanceMiles 4406.853013\n", "FlightDelay 0.251168\n", "FlightDelayMin 47.335171\n", "FlightTimeHour 8.518797\n", "FlightTimeMin 511.127842\n", "dayOfWeek 2.835975\n", "dtype: float64" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.mean(numeric_only=True)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "AvgTicketPrice 628.253689\n", "Cancelled 0.128494\n", "DistanceKilometers 7092.142457\n", "DistanceMiles 4406.853010\n", "FlightDelay 0.251168\n", "FlightDelayMin 47.335171\n", "FlightTimeHour 8.518797\n", "FlightTimeMin 511.127842\n", "dayOfWeek 2.835975\n", "dtype: float64" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.mean(numeric_only=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### sum" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "AvgTicketPrice 
8.204365e+06\n", "Cancelled 1.678000e+03\n", "DistanceKilometers 9.261629e+07\n", "DistanceMiles 5.754909e+07\n", "FlightDelay 3.280000e+03\n", "FlightDelayMin 6.181500e+05\n", "FlightTimeHour 1.112470e+05\n", "FlightTimeMin 6.674818e+06\n", "dayOfWeek 3.703500e+04\n", "dtype: float64" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.sum(numeric_only=True)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "AvgTicketPrice 8.204365e+06\n", "Cancelled 1.678000e+03\n", "DistanceKilometers 9.261629e+07\n", "DistanceMiles 5.754909e+07\n", "FlightDelay 3.280000e+03\n", "FlightDelayMin 6.181500e+05\n", "FlightTimeHour 1.112470e+05\n", "FlightTimeMin 6.674818e+06\n", "dayOfWeek 3.703500e+04\n", "dtype: float64" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.sum(numeric_only=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.nunique" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "Carrier 4\n", "Origin 156\n", "Dest 156\n", "dtype: int64" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights[['Carrier', 'Origin', 'Dest']].nunique()" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": [ "Carrier 4\n", "Origin 156\n", "Dest 156\n", "dtype: int64" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights[['Carrier', 'Origin', 'Dest']].nunique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DataFrame.drop" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { 
"text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CarrierDestRegion...dayOfWeektimestamp
0Kibana AirlinesSE-BD...02018-01-01 00:00:00
1Logstash AirwaysIT-34...02018-01-01 18:27:00
2Logstash AirwaysIT-34...02018-01-01 17:11:14
3Kibana AirlinesIT-34...02018-01-01 10:33:28
4Kibana AirlinesSE-BD...02018-01-01 05:13:00
..................
13054Logstash AirwaysSE-BD...62018-02-11 20:42:25
13055Logstash AirwaysCH-ZH...62018-02-11 01:41:57
13056Logstash AirwaysRU-AMU...62018-02-11 04:09:27
13057JetBeatsSE-BD...62018-02-11 08:28:21
13058JetBeatsUS-DC...62018-02-11 14:54:34
\n", "

13059 rows × 20 columns

\n", "
" ], "text/plain": [ " Carrier DestRegion ... dayOfWeek timestamp\n", "0 Kibana Airlines SE-BD ... 0 2018-01-01 00:00:00\n", "1 Logstash Airways IT-34 ... 0 2018-01-01 18:27:00\n", "2 Logstash Airways IT-34 ... 0 2018-01-01 17:11:14\n", "3 Kibana Airlines IT-34 ... 0 2018-01-01 10:33:28\n", "4 Kibana Airlines SE-BD ... 0 2018-01-01 05:13:00\n", "... ... ... ... ... ...\n", "13054 Logstash Airways SE-BD ... 6 2018-02-11 20:42:25\n", "13055 Logstash Airways CH-ZH ... 6 2018-02-11 01:41:57\n", "13056 Logstash Airways RU-AMU ... 6 2018-02-11 04:09:27\n", "13057 JetBeats SE-BD ... 6 2018-02-11 08:28:21\n", "13058 JetBeats US-DC ... 6 2018-02-11 14:54:34\n", "\n", "[13059 rows x 20 columns]" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd_flights.drop(columns=['AvgTicketPrice', \n", " 'Cancelled', \n", " 'DestLocation',\n", " 'Dest', \n", " 'DestAirportID', \n", " 'DestCityName', \n", " 'DestCountry'])" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CarrierDestRegion...dayOfWeektimestamp
0Kibana AirlinesSE-BD...02018-01-01 00:00:00
1Logstash AirwaysIT-34...02018-01-01 18:27:00
2Logstash AirwaysIT-34...02018-01-01 17:11:14
3Kibana AirlinesIT-34...02018-01-01 10:33:28
4Kibana AirlinesSE-BD...02018-01-01 05:13:00
..................
13054Logstash AirwaysSE-BD...62018-02-11 20:42:25
13055Logstash AirwaysCH-ZH...62018-02-11 01:41:57
13056Logstash AirwaysRU-AMU...62018-02-11 04:09:27
13057JetBeatsSE-BD...62018-02-11 08:28:21
13058JetBeatsUS-DC...62018-02-11 14:54:34
\n", "
\n", "

13059 rows × 20 columns

" ], "text/plain": [ " Carrier DestRegion ... dayOfWeek timestamp\n", "0 Kibana Airlines SE-BD ... 0 2018-01-01 00:00:00\n", "1 Logstash Airways IT-34 ... 0 2018-01-01 18:27:00\n", "2 Logstash Airways IT-34 ... 0 2018-01-01 17:11:14\n", "3 Kibana Airlines IT-34 ... 0 2018-01-01 10:33:28\n", "4 Kibana Airlines SE-BD ... 0 2018-01-01 05:13:00\n", "... ... ... ... ... ...\n", "13054 Logstash Airways SE-BD ... 6 2018-02-11 20:42:25\n", "13055 Logstash Airways CH-ZH ... 6 2018-02-11 01:41:57\n", "13056 Logstash Airways RU-AMU ... 6 2018-02-11 04:09:27\n", "13057 JetBeats SE-BD ... 6 2018-02-11 08:28:21\n", "13058 JetBeats US-DC ... 6 2018-02-11 14:54:34\n", "\n", "[13059 rows x 20 columns]" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ed_flights.drop(columns=['AvgTicketPrice', \n", " 'Cancelled', \n", " 'DestLocation',\n", " 'Dest', \n", " 'DestAirportID', \n", " 'DestCityName', \n", " 'DestCountry'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Plotting" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "pd_flights.select_dtypes(include=np.number).hist(figsize=[10,10])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "ed_flights.select_dtypes(include=np.number).hist(figsize=[10,10])\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Elasticsearch utilities" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [ "ed_flights2 = ed_flights[(ed_flights.OriginAirportID == 'AMS') & (ed_flights.FlightDelayMin > 60)]\n", "ed_flights2 = ed_flights2[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']]\n", "ed_flights2 = ed_flights2.tail()" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "es_index_pattern: flights\n", "Index:\n", " es_index_field: _id\n", " is_source_field: False\n", "Mappings:\n", " capabilities:\n", " es_field_name is_source es_dtype es_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_es_field_name\n", "timestamp timestamp True date strict_date_hour_minute_second datetime64[ns] True True False timestamp\n", "OriginAirportID OriginAirportID True keyword None object True True False OriginAirportID\n", "DestAirportID DestAirportID True keyword None object True True False DestAirportID\n", "FlightDelayMin FlightDelayMin True integer None int64 True True False FlightDelayMin\n", "Operations:\n", " tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}})), ('tail': ('sort_field': '_doc', 'count': 5))]\n", " size: 5\n", " sort_params: _doc:desc\n", " _source: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']\n", " body: {'query': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}}\n", " post_processing: [('sort_index')]\n", "\n" ] } ], "source": [ 
"print(ed_flights2.info_es())" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" }, "pycharm": { "stem_cell": { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [] } } }, "nbformat": 4, "nbformat_minor": 2 }