Added json file for pandas comparison

+ renamed from_es to read_es
This commit is contained in:
Stephen Dodson 2019-06-12 12:12:40 +00:00
parent f1e27f1dda
commit 2b83edad69
4 changed files with 441 additions and 14 deletions

View File

@ -1,4 +1,4 @@
import eland
def from_es(es_params, index_pattern):
def read_es(es_params, index_pattern):
return eland.DataFrame(es_params, index_pattern)

BIN
flights.json.gz Normal file

Binary file not shown.

View File

@ -1,5 +1,12 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Eland"
]
},
{
"cell_type": "code",
"execution_count": 1,
@ -15,7 +22,7 @@
"metadata": {},
"outputs": [],
"source": [
"df = ed.from_es('localhost', 'kibana_sample_data_flights')"
"df = ed.read_es('localhost', 'kibana_sample_data_flights')"
]
},
{
@ -339,12 +346,12 @@
" <td>2470.545974</td>\n",
" <td>1535.126118</td>\n",
" <td>0.000000</td>\n",
" <td>251.773003</td>\n",
" <td>251.682199</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>640.362667</td>\n",
" <td>640.362374</td>\n",
" <td>7612.072403</td>\n",
" <td>4729.922470</td>\n",
" <td>0.000000</td>\n",
@ -353,12 +360,12 @@
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>842.233478</td>\n",
" <td>9735.887390</td>\n",
" <td>842.260482</td>\n",
" <td>9735.660463</td>\n",
" <td>6049.459005</td>\n",
" <td>15.000000</td>\n",
" <td>720.534532</td>\n",
" <td>4.095833</td>\n",
" <td>14.102113</td>\n",
" <td>720.569838</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
@ -380,8 +387,8 @@
"std 266.386661 4578.263193 2844.800855 96.743006 \n",
"min 100.020531 0.000000 0.000000 0.000000 \n",
"25% 410.008918 2470.545974 1535.126118 0.000000 \n",
"50% 640.362667 7612.072403 4729.922470 0.000000 \n",
"75% 842.233478 9735.887390 6049.459005 15.000000 \n",
"50% 640.362374 7612.072403 4729.922470 0.000000 \n",
"75% 842.260482 9735.660463 6049.459005 14.102113 \n",
"max 1199.729004 19881.482422 12353.780273 360.000000 \n",
"\n",
" FlightTimeMin dayOfWeek \n",
@ -389,9 +396,9 @@
"mean 511.127842 2.835975 \n",
"std 334.741135 1.939365 \n",
"min 0.000000 0.000000 \n",
"25% 251.773003 1.000000 \n",
"25% 251.682199 1.000000 \n",
"50% 503.148975 3.000000 \n",
"75% 720.534532 4.095833 \n",
"75% 720.569838 4.000000 \n",
"max 1902.901978 6.000000 "
]
},
@ -403,6 +410,426 @@
"source": [
"df.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pandas"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"pd_df = pd.read_json('flights.json.gz', lines=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>AvgTicketPrice</th>\n",
" <th>Cancelled</th>\n",
" <th>Carrier</th>\n",
" <th>Dest</th>\n",
" <th>DestAirportID</th>\n",
" <th>DestCityName</th>\n",
" <th>DestCountry</th>\n",
" <th>DestLocation</th>\n",
" <th>DestRegion</th>\n",
" <th>DestWeather</th>\n",
" <th>...</th>\n",
" <th>FlightTimeMin</th>\n",
" <th>Origin</th>\n",
" <th>OriginAirportID</th>\n",
" <th>OriginCityName</th>\n",
" <th>OriginCountry</th>\n",
" <th>OriginLocation</th>\n",
" <th>OriginRegion</th>\n",
" <th>OriginWeather</th>\n",
" <th>dayOfWeek</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>841.265642</td>\n",
" <td>False</td>\n",
" <td>Kibana Airlines</td>\n",
" <td>Sydney Kingsford Smith International Airport</td>\n",
" <td>SYD</td>\n",
" <td>Sydney</td>\n",
" <td>AU</td>\n",
" <td>{'lat': '-33.94609833', 'lon': '151.177002'}</td>\n",
" <td>SE-BD</td>\n",
" <td>Rain</td>\n",
" <td>...</td>\n",
" <td>1030.770416</td>\n",
" <td>Frankfurt am Main Airport</td>\n",
" <td>FRA</td>\n",
" <td>Frankfurt am Main</td>\n",
" <td>DE</td>\n",
" <td>{'lat': '50.033333', 'lon': '8.570556'}</td>\n",
" <td>DE-HE</td>\n",
" <td>Sunny</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 00:00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>882.982662</td>\n",
" <td>False</td>\n",
" <td>Logstash Airways</td>\n",
" <td>Venice Marco Polo Airport</td>\n",
" <td>VE05</td>\n",
" <td>Venice</td>\n",
" <td>IT</td>\n",
" <td>{'lat': '45.505299', 'lon': '12.3519'}</td>\n",
" <td>IT-34</td>\n",
" <td>Sunny</td>\n",
" <td>...</td>\n",
" <td>464.389481</td>\n",
" <td>Cape Town International Airport</td>\n",
" <td>CPT</td>\n",
" <td>Cape Town</td>\n",
" <td>ZA</td>\n",
" <td>{'lat': '-33.96480179', 'lon': '18.60169983'}</td>\n",
" <td>SE-BD</td>\n",
" <td>Clear</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 18:27:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>190.636904</td>\n",
" <td>False</td>\n",
" <td>Logstash Airways</td>\n",
" <td>Venice Marco Polo Airport</td>\n",
" <td>VE05</td>\n",
" <td>Venice</td>\n",
" <td>IT</td>\n",
" <td>{'lat': '45.505299', 'lon': '12.3519'}</td>\n",
" <td>IT-34</td>\n",
" <td>Cloudy</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>Venice Marco Polo Airport</td>\n",
" <td>VE05</td>\n",
" <td>Venice</td>\n",
" <td>IT</td>\n",
" <td>{'lat': '45.505299', 'lon': '12.3519'}</td>\n",
" <td>IT-34</td>\n",
" <td>Rain</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 17:11:14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>181.694216</td>\n",
" <td>True</td>\n",
" <td>Kibana Airlines</td>\n",
" <td>Treviso-Sant'Angelo Airport</td>\n",
" <td>TV01</td>\n",
" <td>Treviso</td>\n",
" <td>IT</td>\n",
" <td>{'lat': '45.648399', 'lon': '12.1944'}</td>\n",
" <td>IT-34</td>\n",
" <td>Clear</td>\n",
" <td>...</td>\n",
" <td>222.749059</td>\n",
" <td>Naples International Airport</td>\n",
" <td>NA01</td>\n",
" <td>Naples</td>\n",
" <td>IT</td>\n",
" <td>{'lat': '40.886002', 'lon': '14.2908'}</td>\n",
" <td>IT-72</td>\n",
" <td>Thunder &amp; Lightning</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 10:33:28</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>730.041778</td>\n",
" <td>False</td>\n",
" <td>Kibana Airlines</td>\n",
" <td>Xi'an Xianyang International Airport</td>\n",
" <td>XIY</td>\n",
" <td>Xi'an</td>\n",
" <td>CN</td>\n",
" <td>{'lat': '34.447102', 'lon': '108.751999'}</td>\n",
" <td>SE-BD</td>\n",
" <td>Clear</td>\n",
" <td>...</td>\n",
" <td>785.779071</td>\n",
" <td>Licenciado Benito Juarez International Airport</td>\n",
" <td>AICM</td>\n",
" <td>Mexico City</td>\n",
" <td>MX</td>\n",
" <td>{'lat': '19.4363', 'lon': '-99.072098'}</td>\n",
" <td>MX-DIF</td>\n",
" <td>Damaging Wind</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 05:13:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 27 columns</p>\n",
"</div>"
],
"text/plain": [
" AvgTicketPrice Cancelled Carrier \\\n",
"0 841.265642 False Kibana Airlines \n",
"1 882.982662 False Logstash Airways \n",
"2 190.636904 False Logstash Airways \n",
"3 181.694216 True Kibana Airlines \n",
"4 730.041778 False Kibana Airlines \n",
"\n",
" Dest DestAirportID DestCityName \\\n",
"0 Sydney Kingsford Smith International Airport SYD Sydney \n",
"1 Venice Marco Polo Airport VE05 Venice \n",
"2 Venice Marco Polo Airport VE05 Venice \n",
"3 Treviso-Sant'Angelo Airport TV01 Treviso \n",
"4 Xi'an Xianyang International Airport XIY Xi'an \n",
"\n",
" DestCountry DestLocation DestRegion \\\n",
"0 AU {'lat': '-33.94609833', 'lon': '151.177002'} SE-BD \n",
"1 IT {'lat': '45.505299', 'lon': '12.3519'} IT-34 \n",
"2 IT {'lat': '45.505299', 'lon': '12.3519'} IT-34 \n",
"3 IT {'lat': '45.648399', 'lon': '12.1944'} IT-34 \n",
"4 CN {'lat': '34.447102', 'lon': '108.751999'} SE-BD \n",
"\n",
" DestWeather ... FlightTimeMin \\\n",
"0 Rain ... 1030.770416 \n",
"1 Sunny ... 464.389481 \n",
"2 Cloudy ... 0.000000 \n",
"3 Clear ... 222.749059 \n",
"4 Clear ... 785.779071 \n",
"\n",
" Origin OriginAirportID \\\n",
"0 Frankfurt am Main Airport FRA \n",
"1 Cape Town International Airport CPT \n",
"2 Venice Marco Polo Airport VE05 \n",
"3 Naples International Airport NA01 \n",
"4 Licenciado Benito Juarez International Airport AICM \n",
"\n",
" OriginCityName OriginCountry \\\n",
"0 Frankfurt am Main DE \n",
"1 Cape Town ZA \n",
"2 Venice IT \n",
"3 Naples IT \n",
"4 Mexico City MX \n",
"\n",
" OriginLocation OriginRegion \\\n",
"0 {'lat': '50.033333', 'lon': '8.570556'} DE-HE \n",
"1 {'lat': '-33.96480179', 'lon': '18.60169983'} SE-BD \n",
"2 {'lat': '45.505299', 'lon': '12.3519'} IT-34 \n",
"3 {'lat': '40.886002', 'lon': '14.2908'} IT-72 \n",
"4 {'lat': '19.4363', 'lon': '-99.072098'} MX-DIF \n",
"\n",
" OriginWeather dayOfWeek timestamp \n",
"0 Sunny 0 2018-01-01 00:00:00 \n",
"1 Clear 0 2018-01-01 18:27:00 \n",
"2 Rain 0 2018-01-01 17:11:14 \n",
"3 Thunder & Lightning 0 2018-01-01 10:33:28 \n",
"4 Damaging Wind 0 2018-01-01 05:13:00 \n",
"\n",
"[5 rows x 27 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>AvgTicketPrice</th>\n",
" <th>DistanceKilometers</th>\n",
" <th>DistanceMiles</th>\n",
" <th>FlightDelayMin</th>\n",
" <th>FlightTimeHour</th>\n",
" <th>FlightTimeMin</th>\n",
" <th>dayOfWeek</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>13059.000000</td>\n",
" <td>13059.000000</td>\n",
" <td>13059.000000</td>\n",
" <td>13059.000000</td>\n",
" <td>13059.000000</td>\n",
" <td>13059.000000</td>\n",
" <td>13059.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>628.253689</td>\n",
" <td>7092.142455</td>\n",
" <td>4406.853013</td>\n",
" <td>47.335171</td>\n",
" <td>8.518797</td>\n",
" <td>511.127842</td>\n",
" <td>2.835975</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>266.396861</td>\n",
" <td>4578.438497</td>\n",
" <td>2844.909787</td>\n",
" <td>96.746711</td>\n",
" <td>5.579233</td>\n",
" <td>334.753952</td>\n",
" <td>1.939439</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>100.020528</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>409.893816</td>\n",
" <td>2459.705673</td>\n",
" <td>1528.390247</td>\n",
" <td>0.000000</td>\n",
" <td>4.205553</td>\n",
" <td>252.333192</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>640.556668</td>\n",
" <td>7610.330866</td>\n",
" <td>4728.840363</td>\n",
" <td>0.000000</td>\n",
" <td>8.384086</td>\n",
" <td>503.045170</td>\n",
" <td>3.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>842.185470</td>\n",
" <td>9736.637600</td>\n",
" <td>6050.066114</td>\n",
" <td>15.000000</td>\n",
" <td>12.006934</td>\n",
" <td>720.416036</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1199.729053</td>\n",
" <td>19881.482315</td>\n",
" <td>12353.780369</td>\n",
" <td>360.000000</td>\n",
" <td>31.715034</td>\n",
" <td>1902.902032</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" AvgTicketPrice DistanceKilometers DistanceMiles FlightDelayMin \\\n",
"count 13059.000000 13059.000000 13059.000000 13059.000000 \n",
"mean 628.253689 7092.142455 4406.853013 47.335171 \n",
"std 266.396861 4578.438497 2844.909787 96.746711 \n",
"min 100.020528 0.000000 0.000000 0.000000 \n",
"25% 409.893816 2459.705673 1528.390247 0.000000 \n",
"50% 640.556668 7610.330866 4728.840363 0.000000 \n",
"75% 842.185470 9736.637600 6050.066114 15.000000 \n",
"max 1199.729053 19881.482315 12353.780369 360.000000 \n",
"\n",
" FlightTimeHour FlightTimeMin dayOfWeek \n",
"count 13059.000000 13059.000000 13059.000000 \n",
"mean 8.518797 511.127842 2.835975 \n",
"std 5.579233 334.753952 1.939439 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 4.205553 252.333192 1.000000 \n",
"50% 8.384086 503.045170 3.000000 \n",
"75% 12.006934 720.416036 4.000000 \n",
"max 31.715034 1902.902032 6.000000 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd_df.describe()"
]
}
],
"metadata": {

View File

@ -1,6 +1,6 @@
import eland as ed
df = ed.from_es('localhost', 'kibana_sample_data_flights')
df = ed.read_es('localhost', 'kibana_sample_data_flights')
print(df.head())