mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
3753 lines
166 KiB
Plaintext
3753 lines
166 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Eland Demo Notebook"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import eland as ed\n",
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"\n",
|
||
"from elasticsearch import Elasticsearch\n",
|
||
"\n",
|
||
"# Import standard test settings for consistent results\n",
|
||
"from eland.conftest import *"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Compare eland DataFrame vs pandas DataFrame"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Create an `eland.DataFrame` from the `flights` index"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"ed_flights = ed.read_es('localhost', 'flights')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"eland.dataframe.DataFrame"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"type(ed_flights)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Compare to a `pandas.DataFrame` (created from the same data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"pd_flights = ed.eland_to_pandas(ed_flights)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"pandas.core.frame.DataFrame"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"type(pd_flights)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Attributes and underlying data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.columns"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',\n",
|
||
" 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',\n",
|
||
" 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',\n",
|
||
" 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',\n",
|
||
" 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',\n",
|
||
" 'timestamp'],\n",
|
||
" dtype='object')"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.columns"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',\n",
|
||
" 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',\n",
|
||
" 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',\n",
|
||
" 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',\n",
|
||
" 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',\n",
|
||
" 'timestamp'],\n",
|
||
" dtype='object')"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.columns"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.dtypes"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"AvgTicketPrice float64\n",
|
||
"Cancelled bool\n",
|
||
"Carrier object\n",
|
||
"Dest object\n",
|
||
"DestAirportID object\n",
|
||
" ... \n",
|
||
"OriginLocation object\n",
|
||
"OriginRegion object\n",
|
||
"OriginWeather object\n",
|
||
"dayOfWeek int64\n",
|
||
"timestamp datetime64[ns]\n",
|
||
"Length: 27, dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.dtypes"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"AvgTicketPrice float64\n",
|
||
"Cancelled bool\n",
|
||
"Carrier object\n",
|
||
"Dest object\n",
|
||
"DestAirportID object\n",
|
||
" ... \n",
|
||
"OriginLocation object\n",
|
||
"OriginRegion object\n",
|
||
"OriginWeather object\n",
|
||
"dayOfWeek int64\n",
|
||
"timestamp datetime64[ns]\n",
|
||
"Length: 27, dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.dtypes"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.select_dtypes"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>AvgTicketPrice</th>\n",
|
||
" <th>DistanceKilometers</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>FlightTimeMin</th>\n",
|
||
" <th>dayOfWeek</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>841.265642</td>\n",
|
||
" <td>16492.326654</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1030.770416</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>882.982662</td>\n",
|
||
" <td>8823.400140</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>464.389481</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>190.636904</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>181.694216</td>\n",
|
||
" <td>555.737767</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>222.749059</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>730.041778</td>\n",
|
||
" <td>13358.244200</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>785.779071</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13054</th>\n",
|
||
" <td>1080.446279</td>\n",
|
||
" <td>8058.581753</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>402.929088</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13055</th>\n",
|
||
" <td>646.612941</td>\n",
|
||
" <td>7088.598322</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>644.418029</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13056</th>\n",
|
||
" <td>997.751876</td>\n",
|
||
" <td>10920.652972</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>937.540811</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13057</th>\n",
|
||
" <td>1102.814465</td>\n",
|
||
" <td>18748.859647</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1697.404971</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13058</th>\n",
|
||
" <td>858.144337</td>\n",
|
||
" <td>16809.141923</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1610.761827</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>13059 rows × 7 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n",
|
||
"0 841.265642 16492.326654 ... 1030.770416 0\n",
|
||
"1 882.982662 8823.400140 ... 464.389481 0\n",
|
||
"2 190.636904 0.000000 ... 0.000000 0\n",
|
||
"3 181.694216 555.737767 ... 222.749059 0\n",
|
||
"4 730.041778 13358.244200 ... 785.779071 0\n",
|
||
"... ... ... ... ... ...\n",
|
||
"13054 1080.446279 8058.581753 ... 402.929088 6\n",
|
||
"13055 646.612941 7088.598322 ... 644.418029 6\n",
|
||
"13056 997.751876 10920.652972 ... 937.540811 6\n",
|
||
"13057 1102.814465 18748.859647 ... 1697.404971 6\n",
|
||
"13058 858.144337 16809.141923 ... 1610.761827 6\n",
|
||
"\n",
|
||
"[13059 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.select_dtypes(include=np.number)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>AvgTicketPrice</th>\n",
|
||
" <th>DistanceKilometers</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>FlightTimeMin</th>\n",
|
||
" <th>dayOfWeek</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>841.265642</td>\n",
|
||
" <td>16492.326654</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1030.770416</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>882.982662</td>\n",
|
||
" <td>8823.400140</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>464.389481</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>190.636904</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>181.694216</td>\n",
|
||
" <td>555.737767</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>222.749059</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>730.041778</td>\n",
|
||
" <td>13358.244200</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>785.779071</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13054</th>\n",
|
||
" <td>1080.446279</td>\n",
|
||
" <td>8058.581753</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>402.929088</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13055</th>\n",
|
||
" <td>646.612941</td>\n",
|
||
" <td>7088.598322</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>644.418029</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13056</th>\n",
|
||
" <td>997.751876</td>\n",
|
||
" <td>10920.652972</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>937.540811</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13057</th>\n",
|
||
" <td>1102.814465</td>\n",
|
||
" <td>18748.859647</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1697.404971</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13058</th>\n",
|
||
" <td>858.144337</td>\n",
|
||
" <td>16809.141923</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1610.761827</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>\n",
|
||
"<p>13059 rows × 7 columns</p>"
|
||
],
|
||
"text/plain": [
|
||
" AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n",
|
||
"0 841.265642 16492.326654 ... 1030.770416 0\n",
|
||
"1 882.982662 8823.400140 ... 464.389481 0\n",
|
||
"2 190.636904 0.000000 ... 0.000000 0\n",
|
||
"3 181.694216 555.737767 ... 222.749059 0\n",
|
||
"4 730.041778 13358.244200 ... 785.779071 0\n",
|
||
"... ... ... ... ... ...\n",
|
||
"13054 1080.446279 8058.581753 ... 402.929088 6\n",
|
||
"13055 646.612941 7088.598322 ... 644.418029 6\n",
|
||
"13056 997.751876 10920.652972 ... 937.540811 6\n",
|
||
"13057 1102.814465 18748.859647 ... 1697.404971 6\n",
|
||
"13058 858.144337 16809.141923 ... 1610.761827 6\n",
|
||
"\n",
|
||
"[13059 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.select_dtypes(include=np.number)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.empty"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"False"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.empty"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"False"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.empty"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(13059, 27)"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(13059, 27)"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.index\n",
|
||
"\n",
|
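||
"Note: `eland.DataFrame.index` does not mirror `pandas.DataFrame.index`. It returns an `eland.index.Index` object whose `index_field` attribute (`_id` here) identifies the Elasticsearch field backing the index."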
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',\n",
|
||
" ...\n",
|
||
" '13049', '13050', '13051', '13052', '13053', '13054', '13055', '13056', '13057', '13058'],\n",
|
||
" dtype='object', length=13059)"
|
||
]
|
||
},
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.index"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<eland.index.Index at 0x7fc6765aefd0>"
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# NBVAL_IGNORE_OUTPUT\n",
|
||
"ed_flights.index"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'_id'"
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.index.index_field"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.values\n",
|
||
"\n",
|
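||
"Note: `eland.DataFrame.values` is not supported, because it would scan/scroll the entire Elasticsearch index into memory. If this is explicitly required, and there is sufficient memory, use `ed.eland_to_pandas(ed_df).values`."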
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([[841.2656419677076, False, 'Kibana Airlines', ..., 'Sunny', 0,\n",
|
||
" Timestamp('2018-01-01 00:00:00')],\n",
|
||
" [882.9826615595518, False, 'Logstash Airways', ..., 'Clear', 0,\n",
|
||
" Timestamp('2018-01-01 18:27:00')],\n",
|
||
" [190.6369038508356, False, 'Logstash Airways', ..., 'Rain', 0,\n",
|
||
" Timestamp('2018-01-01 17:11:14')],\n",
|
||
" ...,\n",
|
||
" [997.7518761454494, False, 'Logstash Airways', ..., 'Sunny', 6,\n",
|
||
" Timestamp('2018-02-11 04:09:27')],\n",
|
||
" [1102.8144645388556, False, 'JetBeats', ..., 'Hail', 6,\n",
|
||
" Timestamp('2018-02-11 08:28:21')],\n",
|
||
" [858.1443369038839, False, 'JetBeats', ..., 'Rain', 6,\n",
|
||
" Timestamp('2018-02-11 14:54:34')]], dtype=object)"
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.values"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"This method would scan/scroll the entire Elasticsearch index(s) into memory. If this is explicitly required, and there is sufficient memory, call `ed.eland_to_pandas(ed_df).values`\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"try:\n",
|
||
" ed_flights.values\n",
|
||
"except AttributeError as e:\n",
|
||
" print(e)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Indexing, iteration"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.head"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>AvgTicketPrice</th>\n",
|
||
" <th>Cancelled</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>dayOfWeek</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>841.265642</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 00:00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>882.982662</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 18:27:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>190.636904</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 17:11:14</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>181.694216</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 10:33:28</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>730.041778</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 05:13:00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 27 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" AvgTicketPrice Cancelled ... dayOfWeek timestamp\n",
|
||
"0 841.265642 False ... 0 2018-01-01 00:00:00\n",
|
||
"1 882.982662 False ... 0 2018-01-01 18:27:00\n",
|
||
"2 190.636904 False ... 0 2018-01-01 17:11:14\n",
|
||
"3 181.694216 True ... 0 2018-01-01 10:33:28\n",
|
||
"4 730.041778 False ... 0 2018-01-01 05:13:00\n",
|
||
"\n",
|
||
"[5 rows x 27 columns]"
|
||
]
|
||
},
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>AvgTicketPrice</th>\n",
|
||
" <th>Cancelled</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>dayOfWeek</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>841.265642</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 00:00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>882.982662</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 18:27:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>190.636904</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 17:11:14</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>181.694216</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 10:33:28</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>730.041778</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 05:13:00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>\n",
|
||
"<p>5 rows × 27 columns</p>"
|
||
],
|
||
"text/plain": [
|
||
" AvgTicketPrice Cancelled ... dayOfWeek timestamp\n",
|
||
"0 841.265642 False ... 0 2018-01-01 00:00:00\n",
|
||
"1 882.982662 False ... 0 2018-01-01 18:27:00\n",
|
||
"2 190.636904 False ... 0 2018-01-01 17:11:14\n",
|
||
"3 181.694216 True ... 0 2018-01-01 10:33:28\n",
|
||
"4 730.041778 False ... 0 2018-01-01 05:13:00\n",
|
||
"\n",
|
||
"[5 rows x 27 columns]"
|
||
]
|
||
},
|
||
"execution_count": 22,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.tail"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>AvgTicketPrice</th>\n",
|
||
" <th>Cancelled</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>dayOfWeek</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>13054</th>\n",
|
||
" <td>1080.446279</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 20:42:25</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13055</th>\n",
|
||
" <td>646.612941</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 01:41:57</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13056</th>\n",
|
||
" <td>997.751876</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 04:09:27</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13057</th>\n",
|
||
" <td>1102.814465</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 08:28:21</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13058</th>\n",
|
||
" <td>858.144337</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 14:54:34</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 27 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" AvgTicketPrice Cancelled ... dayOfWeek timestamp\n",
|
||
"13054 1080.446279 False ... 6 2018-02-11 20:42:25\n",
|
||
"13055 646.612941 False ... 6 2018-02-11 01:41:57\n",
|
||
"13056 997.751876 False ... 6 2018-02-11 04:09:27\n",
|
||
"13057 1102.814465 False ... 6 2018-02-11 08:28:21\n",
|
||
"13058 858.144337 False ... 6 2018-02-11 14:54:34\n",
|
||
"\n",
|
||
"[5 rows x 27 columns]"
|
||
]
|
||
},
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.tail()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>AvgTicketPrice</th>\n",
|
||
" <th>Cancelled</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>dayOfWeek</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>13054</th>\n",
|
||
" <td>1080.446279</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 20:42:25</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13055</th>\n",
|
||
" <td>646.612941</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 01:41:57</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13056</th>\n",
|
||
" <td>997.751876</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 04:09:27</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13057</th>\n",
|
||
" <td>1102.814465</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 08:28:21</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13058</th>\n",
|
||
" <td>858.144337</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 14:54:34</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>\n",
|
||
"<p>5 rows × 27 columns</p>"
|
||
],
|
||
"text/plain": [
|
||
" AvgTicketPrice Cancelled ... dayOfWeek timestamp\n",
|
||
"13054 1080.446279 False ... 6 2018-02-11 20:42:25\n",
|
||
"13055 646.612941 False ... 6 2018-02-11 01:41:57\n",
|
||
"13056 997.751876 False ... 6 2018-02-11 04:09:27\n",
|
||
"13057 1102.814465 False ... 6 2018-02-11 08:28:21\n",
|
||
"13058 858.144337 False ... 6 2018-02-11 14:54:34\n",
|
||
"\n",
|
||
"[5 rows x 27 columns]"
|
||
]
|
||
},
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.tail()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.keys"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',\n",
|
||
" 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',\n",
|
||
" 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',\n",
|
||
" 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',\n",
|
||
" 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',\n",
|
||
" 'timestamp'],\n",
|
||
" dtype='object')"
|
||
]
|
||
},
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.keys()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',\n",
|
||
" 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',\n",
|
||
" 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',\n",
|
||
" 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',\n",
|
||
" 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',\n",
|
||
" 'timestamp'],\n",
|
||
" dtype='object')"
|
||
]
|
||
},
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.keys()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.get"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"0 Kibana Airlines\n",
|
||
"1 Logstash Airways\n",
|
||
"2 Logstash Airways\n",
|
||
"3 Kibana Airlines\n",
|
||
"4 Kibana Airlines\n",
|
||
" ... \n",
|
||
"13054 Logstash Airways\n",
|
||
"13055 Logstash Airways\n",
|
||
"13056 Logstash Airways\n",
|
||
"13057 JetBeats\n",
|
||
"13058 JetBeats\n",
|
||
"Name: Carrier, Length: 13059, dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 27,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.get('Carrier')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"0 Kibana Airlines\n",
|
||
"1 Logstash Airways\n",
|
||
"2 Logstash Airways\n",
|
||
"3 Kibana Airlines\n",
|
||
"4 Kibana Airlines\n",
|
||
" ... \n",
|
||
"13054 Logstash Airways\n",
|
||
"13055 Logstash Airways\n",
|
||
"13056 Logstash Airways\n",
|
||
"13057 JetBeats\n",
|
||
"13058 JetBeats\n",
|
||
"Name: Carrier, Length: 13059, dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 28,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.get('Carrier')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Carrier</th>\n",
|
||
" <th>Origin</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Kibana Airlines</td>\n",
|
||
" <td>Frankfurt am Main Airport</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>Cape Town International Airport</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>Venice Marco Polo Airport</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Kibana Airlines</td>\n",
|
||
" <td>Naples International Airport</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Kibana Airlines</td>\n",
|
||
" <td>Licenciado Benito Juarez International Airport</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13054</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>Pisa International Airport</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13055</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>Winnipeg / James Armstrong Richardson Internat...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13056</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>Licenciado Benito Juarez International Airport</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13057</th>\n",
|
||
" <td>JetBeats</td>\n",
|
||
" <td>Itami Airport</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13058</th>\n",
|
||
" <td>JetBeats</td>\n",
|
||
" <td>Adelaide International Airport</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>13059 rows × 2 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Carrier Origin\n",
|
||
"0 Kibana Airlines Frankfurt am Main Airport\n",
|
||
"1 Logstash Airways Cape Town International Airport\n",
|
||
"2 Logstash Airways Venice Marco Polo Airport\n",
|
||
"3 Kibana Airlines Naples International Airport\n",
|
||
"4 Kibana Airlines Licenciado Benito Juarez International Airport\n",
|
||
"... ... ...\n",
|
||
"13054 Logstash Airways Pisa International Airport\n",
|
||
"13055 Logstash Airways Winnipeg / James Armstrong Richardson Internat...\n",
|
||
"13056 Logstash Airways Licenciado Benito Juarez International Airport\n",
|
||
"13057 JetBeats Itami Airport\n",
|
||
"13058 JetBeats Adelaide International Airport\n",
|
||
"\n",
|
||
"[13059 rows x 2 columns]"
|
||
]
|
||
},
|
||
"execution_count": 29,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.get(['Carrier', 'Origin'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"List input is not currently supported by `eland.DataFrame.get`; it raises a `TypeError`"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 30,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"unhashable type: 'list'\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"try:\n",
|
||
" ed_flights.get(['Carrier', 'Origin'])\n",
|
||
"except TypeError as e:\n",
|
||
" print(e)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.query"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>AvgTicketPrice</th>\n",
|
||
" <th>Cancelled</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>dayOfWeek</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>960.869736</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 12:09:35</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>975.812632</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 15:38:32</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>311</th>\n",
|
||
" <td>946.358410</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 11:51:12</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>651</th>\n",
|
||
" <td>975.383864</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2018-01-03 21:13:17</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>950</th>\n",
|
||
" <td>907.836523</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2018-01-03 05:14:51</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12820</th>\n",
|
||
" <td>909.973606</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2018-02-10 05:11:35</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12906</th>\n",
|
||
" <td>983.429244</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 06:19:58</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12918</th>\n",
|
||
" <td>1136.678150</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 16:03:10</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12919</th>\n",
|
||
" <td>1105.211803</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 05:36:05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13013</th>\n",
|
||
" <td>1055.350213</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 13:20:16</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>68 rows × 27 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" AvgTicketPrice Cancelled ... dayOfWeek timestamp\n",
|
||
"8 960.869736 True ... 0 2018-01-01 12:09:35\n",
|
||
"26 975.812632 True ... 0 2018-01-01 15:38:32\n",
|
||
"311 946.358410 True ... 0 2018-01-01 11:51:12\n",
|
||
"651 975.383864 True ... 2 2018-01-03 21:13:17\n",
|
||
"950 907.836523 True ... 2 2018-01-03 05:14:51\n",
|
||
"... ... ... ... ... ...\n",
|
||
"12820 909.973606 True ... 5 2018-02-10 05:11:35\n",
|
||
"12906 983.429244 True ... 6 2018-02-11 06:19:58\n",
|
||
"12918 1136.678150 True ... 6 2018-02-11 16:03:10\n",
|
||
"12919 1105.211803 True ... 6 2018-02-11 05:36:05\n",
|
||
"13013 1055.350213 True ... 6 2018-02-11 13:20:16\n",
|
||
"\n",
|
||
"[68 rows x 27 columns]"
|
||
]
|
||
},
|
||
"execution_count": 31,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled == True')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"`eland.DataFrame.query` requires an explicit comparison on boolean columns (e.g. `Cancelled == True`), i.e.\n",
|
||
"\n",
|
||
"`ed_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled')` fails"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>AvgTicketPrice</th>\n",
|
||
" <th>Cancelled</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>dayOfWeek</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>960.869736</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 12:09:35</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>975.812632</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 15:38:32</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>311</th>\n",
|
||
" <td>946.358410</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 11:51:12</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>651</th>\n",
|
||
" <td>975.383864</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2018-01-03 21:13:17</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>950</th>\n",
|
||
" <td>907.836523</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2018-01-03 05:14:51</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12820</th>\n",
|
||
" <td>909.973606</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2018-02-10 05:11:35</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12906</th>\n",
|
||
" <td>983.429244</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 06:19:58</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12918</th>\n",
|
||
" <td>1136.678150</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 16:03:10</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12919</th>\n",
|
||
" <td>1105.211803</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 05:36:05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13013</th>\n",
|
||
" <td>1055.350213</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 13:20:16</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>\n",
|
||
"<p>68 rows × 27 columns</p>"
|
||
],
|
||
"text/plain": [
|
||
" AvgTicketPrice Cancelled ... dayOfWeek timestamp\n",
|
||
"8 960.869736 True ... 0 2018-01-01 12:09:35\n",
|
||
"26 975.812632 True ... 0 2018-01-01 15:38:32\n",
|
||
"311 946.358410 True ... 0 2018-01-01 11:51:12\n",
|
||
"651 975.383864 True ... 2 2018-01-03 21:13:17\n",
|
||
"950 907.836523 True ... 2 2018-01-03 05:14:51\n",
|
||
"... ... ... ... ... ...\n",
|
||
"12820 909.973606 True ... 5 2018-02-10 05:11:35\n",
|
||
"12906 983.429244 True ... 6 2018-02-11 06:19:58\n",
|
||
"12918 1136.678150 True ... 6 2018-02-11 16:03:10\n",
|
||
"12919 1105.211803 True ... 6 2018-02-11 05:36:05\n",
|
||
"13013 1055.350213 True ... 6 2018-02-11 13:20:16\n",
|
||
"\n",
|
||
"[68 rows x 27 columns]"
|
||
]
|
||
},
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled == True')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Boolean indexing query"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 33,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>AvgTicketPrice</th>\n",
|
||
" <th>Cancelled</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>dayOfWeek</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>960.869736</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 12:09:35</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>975.812632</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 15:38:32</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>311</th>\n",
|
||
" <td>946.358410</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 11:51:12</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>651</th>\n",
|
||
" <td>975.383864</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2018-01-03 21:13:17</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>950</th>\n",
|
||
" <td>907.836523</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2018-01-03 05:14:51</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12820</th>\n",
|
||
" <td>909.973606</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2018-02-10 05:11:35</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12906</th>\n",
|
||
" <td>983.429244</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 06:19:58</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12918</th>\n",
|
||
" <td>1136.678150</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 16:03:10</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12919</th>\n",
|
||
" <td>1105.211803</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 05:36:05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13013</th>\n",
|
||
" <td>1055.350213</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 13:20:16</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>68 rows × 27 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" AvgTicketPrice Cancelled ... dayOfWeek timestamp\n",
|
||
"8 960.869736 True ... 0 2018-01-01 12:09:35\n",
|
||
"26 975.812632 True ... 0 2018-01-01 15:38:32\n",
|
||
"311 946.358410 True ... 0 2018-01-01 11:51:12\n",
|
||
"651 975.383864 True ... 2 2018-01-03 21:13:17\n",
|
||
"950 907.836523 True ... 2 2018-01-03 05:14:51\n",
|
||
"... ... ... ... ... ...\n",
|
||
"12820 909.973606 True ... 5 2018-02-10 05:11:35\n",
|
||
"12906 983.429244 True ... 6 2018-02-11 06:19:58\n",
|
||
"12918 1136.678150 True ... 6 2018-02-11 16:03:10\n",
|
||
"12919 1105.211803 True ... 6 2018-02-11 05:36:05\n",
|
||
"13013 1055.350213 True ... 6 2018-02-11 13:20:16\n",
|
||
"\n",
|
||
"[68 rows x 27 columns]"
|
||
]
|
||
},
|
||
"execution_count": 33,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights[(pd_flights.Carrier==\"Kibana Airlines\") & \n",
|
||
" (pd_flights.AvgTicketPrice > 900.0) &\n",
|
||
" (pd_flights.Cancelled == True)]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>AvgTicketPrice</th>\n",
|
||
" <th>Cancelled</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>dayOfWeek</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>960.869736</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 12:09:35</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>975.812632</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 15:38:32</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>311</th>\n",
|
||
" <td>946.358410</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 11:51:12</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>651</th>\n",
|
||
" <td>975.383864</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2018-01-03 21:13:17</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>950</th>\n",
|
||
" <td>907.836523</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2018-01-03 05:14:51</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12820</th>\n",
|
||
" <td>909.973606</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2018-02-10 05:11:35</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12906</th>\n",
|
||
" <td>983.429244</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 06:19:58</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12918</th>\n",
|
||
" <td>1136.678150</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 16:03:10</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12919</th>\n",
|
||
" <td>1105.211803</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 05:36:05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13013</th>\n",
|
||
" <td>1055.350213</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 13:20:16</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>\n",
|
||
"<p>68 rows × 27 columns</p>"
|
||
],
|
||
"text/plain": [
|
||
" AvgTicketPrice Cancelled ... dayOfWeek timestamp\n",
|
||
"8 960.869736 True ... 0 2018-01-01 12:09:35\n",
|
||
"26 975.812632 True ... 0 2018-01-01 15:38:32\n",
|
||
"311 946.358410 True ... 0 2018-01-01 11:51:12\n",
|
||
"651 975.383864 True ... 2 2018-01-03 21:13:17\n",
|
||
"950 907.836523 True ... 2 2018-01-03 05:14:51\n",
|
||
"... ... ... ... ... ...\n",
|
||
"12820 909.973606 True ... 5 2018-02-10 05:11:35\n",
|
||
"12906 983.429244 True ... 6 2018-02-11 06:19:58\n",
|
||
"12918 1136.678150 True ... 6 2018-02-11 16:03:10\n",
|
||
"12919 1105.211803 True ... 6 2018-02-11 05:36:05\n",
|
||
"13013 1055.350213 True ... 6 2018-02-11 13:20:16\n",
|
||
"\n",
|
||
"[68 rows x 27 columns]"
|
||
]
|
||
},
|
||
"execution_count": 34,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights[(ed_flights.Carrier==\"Kibana Airlines\") & \n",
|
||
" (ed_flights.AvgTicketPrice > 900.0) &\n",
|
||
" (ed_flights.Cancelled == True)]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Function application, GroupBy & window"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.aggregate"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>DistanceKilometers</th>\n",
|
||
" <th>AvgTicketPrice</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>sum</th>\n",
|
||
" <td>9.261629e+07</td>\n",
|
||
" <td>8.204365e+06</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>min</th>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" <td>1.000205e+02</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>std</th>\n",
|
||
" <td>4.578438e+03</td>\n",
|
||
" <td>2.663969e+02</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" DistanceKilometers AvgTicketPrice\n",
|
||
"sum 9.261629e+07 8.204365e+06\n",
|
||
"min 0.000000e+00 1.000205e+02\n",
|
||
"std 4.578438e+03 2.663969e+02"
|
||
]
|
||
},
|
||
"execution_count": 35,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"`eland.DataFrame.aggregate` currently only supports numeric columns"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 36,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>DistanceKilometers</th>\n",
|
||
" <th>AvgTicketPrice</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>sum</th>\n",
|
||
" <td>9.261629e+07</td>\n",
|
||
" <td>8.204365e+06</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>min</th>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" <td>1.000205e+02</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>std</th>\n",
|
||
" <td>4.578263e+03</td>\n",
|
||
" <td>2.663867e+02</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" DistanceKilometers AvgTicketPrice\n",
|
||
"sum 9.261629e+07 8.204365e+06\n",
|
||
"min 0.000000e+00 1.000205e+02\n",
|
||
"std 4.578263e+03 2.663867e+02"
|
||
]
|
||
},
|
||
"execution_count": 36,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Computations / descriptive stats"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.count"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"AvgTicketPrice 13059\n",
|
||
"Cancelled 13059\n",
|
||
"Carrier 13059\n",
|
||
"Dest 13059\n",
|
||
"DestAirportID 13059\n",
|
||
" ... \n",
|
||
"OriginLocation 13059\n",
|
||
"OriginRegion 13059\n",
|
||
"OriginWeather 13059\n",
|
||
"dayOfWeek 13059\n",
|
||
"timestamp 13059\n",
|
||
"Length: 27, dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 37,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.count()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 38,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"AvgTicketPrice 13059\n",
|
||
"Cancelled 13059\n",
|
||
"Carrier 13059\n",
|
||
"Dest 13059\n",
|
||
"DestAirportID 13059\n",
|
||
" ... \n",
|
||
"OriginLocation 13059\n",
|
||
"OriginRegion 13059\n",
|
||
"OriginWeather 13059\n",
|
||
"dayOfWeek 13059\n",
|
||
"timestamp 13059\n",
|
||
"Length: 27, dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 38,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.count()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.describe"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 39,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>AvgTicketPrice</th>\n",
|
||
" <th>DistanceKilometers</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>FlightTimeMin</th>\n",
|
||
" <th>dayOfWeek</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>count</th>\n",
|
||
" <td>13059.000000</td>\n",
|
||
" <td>13059.000000</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>13059.000000</td>\n",
|
||
" <td>13059.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>mean</th>\n",
|
||
" <td>628.253689</td>\n",
|
||
" <td>7092.142455</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>511.127842</td>\n",
|
||
" <td>2.835975</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>std</th>\n",
|
||
" <td>266.396861</td>\n",
|
||
" <td>4578.438497</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>334.753952</td>\n",
|
||
" <td>1.939439</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>min</th>\n",
|
||
" <td>100.020528</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25%</th>\n",
|
||
" <td>409.893816</td>\n",
|
||
" <td>2459.705673</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>252.333192</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>50%</th>\n",
|
||
" <td>640.556668</td>\n",
|
||
" <td>7610.330866</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>503.045170</td>\n",
|
||
" <td>3.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>75%</th>\n",
|
||
" <td>842.185470</td>\n",
|
||
" <td>9736.637600</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>720.416036</td>\n",
|
||
" <td>4.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>max</th>\n",
|
||
" <td>1199.729053</td>\n",
|
||
" <td>19881.482315</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1902.902032</td>\n",
|
||
" <td>6.000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>8 rows × 7 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n",
|
||
"count 13059.000000 13059.000000 ... 13059.000000 13059.000000\n",
|
||
"mean 628.253689 7092.142455 ... 511.127842 2.835975\n",
|
||
"std 266.396861 4578.438497 ... 334.753952 1.939439\n",
|
||
"min 100.020528 0.000000 ... 0.000000 0.000000\n",
|
||
"25% 409.893816 2459.705673 ... 252.333192 1.000000\n",
|
||
"50% 640.556668 7610.330866 ... 503.045170 3.000000\n",
|
||
"75% 842.185470 9736.637600 ... 720.416036 4.000000\n",
|
||
"max 1199.729053 19881.482315 ... 1902.902032 6.000000\n",
|
||
"\n",
|
||
"[8 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 39,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
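||
"Values returned from `eland.DataFrame.describe` may differ slightly from the pandas results because they are computed by Elasticsearch aggregations (compare the `std` and `25%` rows in the two outputs)."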
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 40,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>AvgTicketPrice</th>\n",
|
||
" <th>DistanceKilometers</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>FlightTimeMin</th>\n",
|
||
" <th>dayOfWeek</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>count</th>\n",
|
||
" <td>13059.000000</td>\n",
|
||
" <td>13059.000000</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>13059.000000</td>\n",
|
||
" <td>13059.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>mean</th>\n",
|
||
" <td>628.253689</td>\n",
|
||
" <td>7092.142457</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>511.127842</td>\n",
|
||
" <td>2.835975</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>std</th>\n",
|
||
" <td>266.386661</td>\n",
|
||
" <td>4578.263193</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>334.741135</td>\n",
|
||
" <td>1.939365</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>min</th>\n",
|
||
" <td>100.020531</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25%</th>\n",
|
||
" <td>410.012798</td>\n",
|
||
" <td>2470.545974</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>251.682199</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>50%</th>\n",
|
||
" <td>640.362667</td>\n",
|
||
" <td>7612.072403</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>503.148975</td>\n",
|
||
" <td>3.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>75%</th>\n",
|
||
" <td>842.233478</td>\n",
|
||
" <td>9735.660463</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>720.572969</td>\n",
|
||
" <td>4.271242</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>max</th>\n",
|
||
" <td>1199.729004</td>\n",
|
||
" <td>19881.482422</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1902.901978</td>\n",
|
||
" <td>6.000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>8 rows × 7 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n",
|
||
"count 13059.000000 13059.000000 ... 13059.000000 13059.000000\n",
|
||
"mean 628.253689 7092.142457 ... 511.127842 2.835975\n",
|
||
"std 266.386661 4578.263193 ... 334.741135 1.939365\n",
|
||
"min 100.020531 0.000000 ... 0.000000 0.000000\n",
|
||
"25% 410.012798 2470.545974 ... 251.682199 1.000000\n",
|
||
"50% 640.362667 7612.072403 ... 503.148975 3.000000\n",
|
||
"75% 842.233478 9735.660463 ... 720.572969 4.271242\n",
|
||
"max 1199.729004 19881.482422 ... 1902.901978 6.000000\n",
|
||
"\n",
|
||
"[8 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 40,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# NBVAL_IGNORE_OUTPUT\n",
|
||
"ed_flights.describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.info"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 41,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||
"Index: 13059 entries, 0 to 13058\n",
|
||
"Data columns (total 27 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 AvgTicketPrice 13059 non-null float64 \n",
|
||
" 1 Cancelled 13059 non-null bool \n",
|
||
" 2 Carrier 13059 non-null object \n",
|
||
" 3 Dest 13059 non-null object \n",
|
||
" 4 DestAirportID 13059 non-null object \n",
|
||
" 5 DestCityName 13059 non-null object \n",
|
||
" 6 DestCountry 13059 non-null object \n",
|
||
" 7 DestLocation 13059 non-null object \n",
|
||
" 8 DestRegion 13059 non-null object \n",
|
||
" 9 DestWeather 13059 non-null object \n",
|
||
" 10 DistanceKilometers 13059 non-null float64 \n",
|
||
" 11 DistanceMiles 13059 non-null float64 \n",
|
||
" 12 FlightDelay 13059 non-null bool \n",
|
||
" 13 FlightDelayMin 13059 non-null int64 \n",
|
||
" 14 FlightDelayType 13059 non-null object \n",
|
||
" 15 FlightNum 13059 non-null object \n",
|
||
" 16 FlightTimeHour 13059 non-null float64 \n",
|
||
" 17 FlightTimeMin 13059 non-null float64 \n",
|
||
" 18 Origin 13059 non-null object \n",
|
||
" 19 OriginAirportID 13059 non-null object \n",
|
||
" 20 OriginCityName 13059 non-null object \n",
|
||
" 21 OriginCountry 13059 non-null object \n",
|
||
" 22 OriginLocation 13059 non-null object \n",
|
||
" 23 OriginRegion 13059 non-null object \n",
|
||
" 24 OriginWeather 13059 non-null object \n",
|
||
" 25 dayOfWeek 13059 non-null int64 \n",
|
||
" 26 timestamp 13059 non-null datetime64[ns]\n",
|
||
"dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)\n",
|
||
"memory usage: 3.2+ MB\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 42,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"<class 'eland.dataframe.DataFrame'>\n",
|
||
"Index: 13059 entries, 0 to 13058\n",
|
||
"Data columns (total 27 columns):\n",
|
||
" # Column Non-Null Count Dtype \n",
|
||
"--- ------ -------------- ----- \n",
|
||
" 0 AvgTicketPrice 13059 non-null float64 \n",
|
||
" 1 Cancelled 13059 non-null bool \n",
|
||
" 2 Carrier 13059 non-null object \n",
|
||
" 3 Dest 13059 non-null object \n",
|
||
" 4 DestAirportID 13059 non-null object \n",
|
||
" 5 DestCityName 13059 non-null object \n",
|
||
" 6 DestCountry 13059 non-null object \n",
|
||
" 7 DestLocation 13059 non-null object \n",
|
||
" 8 DestRegion 13059 non-null object \n",
|
||
" 9 DestWeather 13059 non-null object \n",
|
||
" 10 DistanceKilometers 13059 non-null float64 \n",
|
||
" 11 DistanceMiles 13059 non-null float64 \n",
|
||
" 12 FlightDelay 13059 non-null bool \n",
|
||
" 13 FlightDelayMin 13059 non-null int64 \n",
|
||
" 14 FlightDelayType 13059 non-null object \n",
|
||
" 15 FlightNum 13059 non-null object \n",
|
||
" 16 FlightTimeHour 13059 non-null float64 \n",
|
||
" 17 FlightTimeMin 13059 non-null float64 \n",
|
||
" 18 Origin 13059 non-null object \n",
|
||
" 19 OriginAirportID 13059 non-null object \n",
|
||
" 20 OriginCityName 13059 non-null object \n",
|
||
" 21 OriginCountry 13059 non-null object \n",
|
||
" 22 OriginLocation 13059 non-null object \n",
|
||
" 23 OriginRegion 13059 non-null object \n",
|
||
" 24 OriginWeather 13059 non-null object \n",
|
||
" 25 dayOfWeek 13059 non-null int64 \n",
|
||
" 26 timestamp 13059 non-null datetime64[ns]\n",
|
||
"dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)\n",
|
||
"memory usage: 80.0 bytes\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.max, DataFrame.min, DataFrame.mean, DataFrame.sum"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### max"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 43,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"AvgTicketPrice 1199.73\n",
|
||
"Cancelled True\n",
|
||
"DistanceKilometers 19881.5\n",
|
||
"DistanceMiles 12353.8\n",
|
||
"FlightDelay True\n",
|
||
"FlightDelayMin 360\n",
|
||
"FlightTimeHour 31.715\n",
|
||
"FlightTimeMin 1902.9\n",
|
||
"dayOfWeek 6\n",
|
||
"dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 43,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.max(numeric_only=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
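"`eland.DataFrame.max`, `eland.DataFrame.min`, `eland.DataFrame.mean` and `eland.DataFrame.sum` only aggregate numeric columns (boolean columns such as `Cancelled` are included in the results)."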
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 44,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"AvgTicketPrice 1199.73\n",
|
||
"Cancelled True\n",
|
||
"DistanceKilometers 19881.5\n",
|
||
"DistanceMiles 12353.8\n",
|
||
"FlightDelay True\n",
|
||
"FlightDelayMin 360\n",
|
||
"FlightTimeHour 31.715\n",
|
||
"FlightTimeMin 1902.9\n",
|
||
"dayOfWeek 6\n",
|
||
"dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 44,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.max(numeric_only=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### min"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 45,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"AvgTicketPrice 100.021\n",
|
||
"Cancelled False\n",
|
||
"DistanceKilometers 0\n",
|
||
"DistanceMiles 0\n",
|
||
"FlightDelay False\n",
|
||
"FlightDelayMin 0\n",
|
||
"FlightTimeHour 0\n",
|
||
"FlightTimeMin 0\n",
|
||
"dayOfWeek 0\n",
|
||
"dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 45,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.min(numeric_only=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 46,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"AvgTicketPrice 100.021\n",
|
||
"Cancelled False\n",
|
||
"DistanceKilometers 0\n",
|
||
"DistanceMiles 0\n",
|
||
"FlightDelay False\n",
|
||
"FlightDelayMin 0\n",
|
||
"FlightTimeHour 0\n",
|
||
"FlightTimeMin 0\n",
|
||
"dayOfWeek 0\n",
|
||
"dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 46,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.min(numeric_only=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### mean"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 47,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"AvgTicketPrice 628.253689\n",
|
||
"Cancelled 0.128494\n",
|
||
"DistanceKilometers 7092.142455\n",
|
||
"DistanceMiles 4406.853013\n",
|
||
"FlightDelay 0.251168\n",
|
||
"FlightDelayMin 47.335171\n",
|
||
"FlightTimeHour 8.518797\n",
|
||
"FlightTimeMin 511.127842\n",
|
||
"dayOfWeek 2.835975\n",
|
||
"dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 47,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.mean(numeric_only=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 48,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"AvgTicketPrice 628.253689\n",
|
||
"Cancelled 0.128494\n",
|
||
"DistanceKilometers 7092.142457\n",
|
||
"DistanceMiles 4406.853010\n",
|
||
"FlightDelay 0.251168\n",
|
||
"FlightDelayMin 47.335171\n",
|
||
"FlightTimeHour 8.518797\n",
|
||
"FlightTimeMin 511.127842\n",
|
||
"dayOfWeek 2.835975\n",
|
||
"dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 48,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.mean(numeric_only=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### sum"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 49,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"AvgTicketPrice 8.204365e+06\n",
|
||
"Cancelled 1.678000e+03\n",
|
||
"DistanceKilometers 9.261629e+07\n",
|
||
"DistanceMiles 5.754909e+07\n",
|
||
"FlightDelay 3.280000e+03\n",
|
||
"FlightDelayMin 6.181500e+05\n",
|
||
"FlightTimeHour 1.112470e+05\n",
|
||
"FlightTimeMin 6.674818e+06\n",
|
||
"dayOfWeek 3.703500e+04\n",
|
||
"dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 49,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.sum(numeric_only=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 50,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"AvgTicketPrice 8.204365e+06\n",
|
||
"Cancelled 1.678000e+03\n",
|
||
"DistanceKilometers 9.261629e+07\n",
|
||
"DistanceMiles 5.754909e+07\n",
|
||
"FlightDelay 3.280000e+03\n",
|
||
"FlightDelayMin 6.181500e+05\n",
|
||
"FlightTimeHour 1.112470e+05\n",
|
||
"FlightTimeMin 6.674818e+06\n",
|
||
"dayOfWeek 3.703500e+04\n",
|
||
"dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 50,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.sum(numeric_only=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.nunique"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 51,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Carrier 4\n",
|
||
"Origin 156\n",
|
||
"Dest 156\n",
|
||
"dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 51,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights[['Carrier', 'Origin', 'Dest']].nunique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 52,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Carrier 4\n",
|
||
"Origin 156\n",
|
||
"Dest 156\n",
|
||
"dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 52,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights[['Carrier', 'Origin', 'Dest']].nunique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### DataFrame.drop"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 53,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Carrier</th>\n",
|
||
" <th>DestRegion</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>dayOfWeek</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Kibana Airlines</td>\n",
|
||
" <td>SE-BD</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 00:00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>IT-34</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 18:27:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>IT-34</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 17:11:14</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Kibana Airlines</td>\n",
|
||
" <td>IT-34</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 10:33:28</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Kibana Airlines</td>\n",
|
||
" <td>SE-BD</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 05:13:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13054</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>SE-BD</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 20:42:25</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13055</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>CH-ZH</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 01:41:57</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13056</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>RU-AMU</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 04:09:27</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13057</th>\n",
|
||
" <td>JetBeats</td>\n",
|
||
" <td>SE-BD</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 08:28:21</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13058</th>\n",
|
||
" <td>JetBeats</td>\n",
|
||
" <td>US-DC</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 14:54:34</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>13059 rows × 20 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Carrier DestRegion ... dayOfWeek timestamp\n",
|
||
"0 Kibana Airlines SE-BD ... 0 2018-01-01 00:00:00\n",
|
||
"1 Logstash Airways IT-34 ... 0 2018-01-01 18:27:00\n",
|
||
"2 Logstash Airways IT-34 ... 0 2018-01-01 17:11:14\n",
|
||
"3 Kibana Airlines IT-34 ... 0 2018-01-01 10:33:28\n",
|
||
"4 Kibana Airlines SE-BD ... 0 2018-01-01 05:13:00\n",
|
||
"... ... ... ... ... ...\n",
|
||
"13054 Logstash Airways SE-BD ... 6 2018-02-11 20:42:25\n",
|
||
"13055 Logstash Airways CH-ZH ... 6 2018-02-11 01:41:57\n",
|
||
"13056 Logstash Airways RU-AMU ... 6 2018-02-11 04:09:27\n",
|
||
"13057 JetBeats SE-BD ... 6 2018-02-11 08:28:21\n",
|
||
"13058 JetBeats US-DC ... 6 2018-02-11 14:54:34\n",
|
||
"\n",
|
||
"[13059 rows x 20 columns]"
|
||
]
|
||
},
|
||
"execution_count": 53,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.drop(columns=['AvgTicketPrice', \n",
|
||
" 'Cancelled', \n",
|
||
" 'DestLocation',\n",
|
||
" 'Dest', \n",
|
||
" 'DestAirportID', \n",
|
||
" 'DestCityName', \n",
|
||
" 'DestCountry'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 54,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Carrier</th>\n",
|
||
" <th>DestRegion</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>dayOfWeek</th>\n",
|
||
" <th>timestamp</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Kibana Airlines</td>\n",
|
||
" <td>SE-BD</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 00:00:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>IT-34</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 18:27:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>IT-34</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 17:11:14</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Kibana Airlines</td>\n",
|
||
" <td>IT-34</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 10:33:28</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Kibana Airlines</td>\n",
|
||
" <td>SE-BD</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>2018-01-01 05:13:00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13054</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>SE-BD</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 20:42:25</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13055</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>CH-ZH</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 01:41:57</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13056</th>\n",
|
||
" <td>Logstash Airways</td>\n",
|
||
" <td>RU-AMU</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 04:09:27</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13057</th>\n",
|
||
" <td>JetBeats</td>\n",
|
||
" <td>SE-BD</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 08:28:21</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13058</th>\n",
|
||
" <td>JetBeats</td>\n",
|
||
" <td>US-DC</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>2018-02-11 14:54:34</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>\n",
|
||
"<p>13059 rows × 20 columns</p>"
|
||
],
|
||
"text/plain": [
|
||
" Carrier DestRegion ... dayOfWeek timestamp\n",
|
||
"0 Kibana Airlines SE-BD ... 0 2018-01-01 00:00:00\n",
|
||
"1 Logstash Airways IT-34 ... 0 2018-01-01 18:27:00\n",
|
||
"2 Logstash Airways IT-34 ... 0 2018-01-01 17:11:14\n",
|
||
"3 Kibana Airlines IT-34 ... 0 2018-01-01 10:33:28\n",
|
||
"4 Kibana Airlines SE-BD ... 0 2018-01-01 05:13:00\n",
|
||
"... ... ... ... ... ...\n",
|
||
"13054 Logstash Airways SE-BD ... 6 2018-02-11 20:42:25\n",
|
||
"13055 Logstash Airways CH-ZH ... 6 2018-02-11 01:41:57\n",
|
||
"13056 Logstash Airways RU-AMU ... 6 2018-02-11 04:09:27\n",
|
||
"13057 JetBeats SE-BD ... 6 2018-02-11 08:28:21\n",
|
||
"13058 JetBeats US-DC ... 6 2018-02-11 14:54:34\n",
|
||
"\n",
|
||
"[13059 rows x 20 columns]"
|
||
]
|
||
},
|
||
"execution_count": 54,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.drop(columns=['AvgTicketPrice', \n",
|
||
" 'Cancelled', \n",
|
||
" 'DestLocation',\n",
|
||
" 'Dest', \n",
|
||
" 'DestAirportID', \n",
|
||
" 'DestCityName', \n",
|
||
" 'DestCountry'])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Plotting"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 55,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/home/daniel/PycharmProjects/eland/venv/lib/python3.6/site-packages/pandas/plotting/_matplotlib/tools.py:298: MatplotlibDeprecationWarning: \n",
|
||
"The rowNum attribute was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use ax.get_subplotspec().rowspan.start instead.\n",
|
||
" layout[ax.rowNum, ax.colNum] = ax.get_visible()\n",
|
||
"/home/daniel/PycharmProjects/eland/venv/lib/python3.6/site-packages/pandas/plotting/_matplotlib/tools.py:298: MatplotlibDeprecationWarning: \n",
|
||
"The colNum attribute was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use ax.get_subplotspec().colspan.start instead.\n",
|
||
" layout[ax.rowNum, ax.colNum] = ax.get_visible()\n",
|
||
"/home/daniel/PycharmProjects/eland/venv/lib/python3.6/site-packages/pandas/plotting/_matplotlib/tools.py:304: MatplotlibDeprecationWarning: \n",
|
||
"The rowNum attribute was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use ax.get_subplotspec().rowspan.start instead.\n",
|
||
" if not layout[ax.rowNum + 1, ax.colNum]:\n",
|
||
"/home/daniel/PycharmProjects/eland/venv/lib/python3.6/site-packages/pandas/plotting/_matplotlib/tools.py:304: MatplotlibDeprecationWarning: \n",
|
||
"The colNum attribute was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use ax.get_subplotspec().colspan.start instead.\n",
|
||
" if not layout[ax.rowNum + 1, ax.colNum]:\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "\n",
|
||
"text/plain": [
|
||
"<Figure size 720x720 with 9 Axes>"
|
||
]
|
||
},
|
||
"metadata": {
|
||
"needs_background": "light"
|
||
},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"pd_flights.select_dtypes(include=np.number).hist(figsize=[10,10])\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 56,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"image/png": "\n",
|
||
"text/plain": [
|
||
"<Figure size 720x720 with 9 Axes>"
|
||
]
|
||
},
|
||
"metadata": {
|
||
"needs_background": "light"
|
||
},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"ed_flights.select_dtypes(include=np.number).hist(figsize=[10,10])\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Elasticsearch utilities"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 57,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"ed_flights2 = ed_flights[(ed_flights.OriginAirportID == 'AMS') & (ed_flights.FlightDelayMin > 60)]\n",
|
||
"ed_flights2 = ed_flights2[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']]\n",
|
||
"ed_flights2 = ed_flights2.tail()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 58,
|
||
"metadata": {
|
||
"pycharm": {
|
||
"is_executing": false
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"index_pattern: flights\n",
|
||
"Index:\n",
|
||
" index_field: _id\n",
|
||
" is_source_field: False\n",
|
||
"Mappings:\n",
|
||
" capabilities:\n",
|
||
" es_field_name is_source es_dtype es_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_es_field_name\n",
|
||
"timestamp timestamp True date strict_date_hour_minute_second datetime64[ns] True True False timestamp\n",
|
||
"OriginAirportID OriginAirportID True keyword None object True True False OriginAirportID\n",
|
||
"DestAirportID DestAirportID True keyword None object True True False DestAirportID\n",
|
||
"FlightDelayMin FlightDelayMin True integer None int64 True True False FlightDelayMin\n",
|
||
"Operations:\n",
|
||
" tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}})), ('tail': ('sort_field': '_doc', 'count': 5))]\n",
|
||
" size: 5\n",
|
||
" sort_params: _doc:desc\n",
|
||
" _source: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']\n",
|
||
" body: {'query': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}}\n",
|
||
" post_processing: [('sort_index')]\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(ed_flights2.info_es())"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.6.9"
|
||
},
|
||
"pycharm": {
|
||
"stem_cell": {
|
||
"cell_type": "raw",
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"source": []
|
||
}
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|