diff --git a/example/Online Retail Analysis.ipynb b/example/Online Retail Analysis.ipynb
new file mode 100644
index 0000000..1cdb001
--- /dev/null
+++ b/example/Online Retail Analysis.ipynb
@@ -0,0 +1,1105 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import eland as ed\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Exploratory Data Analysis with eland"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Getting Started\n",
+ "\n",
+ "To get started, let's explore the attributes of the `online-retail` index. First, we'll instantiate the data frame by pointing the constructor to a particular instance in our local elasticsearch cluster. \n",
+ "\n",
+ "The `online-retail` index was created by running `python load_data.py` from the `example` directory."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = ed.read_es(\"http://localhost:9200\", \"online-retail\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here we see that the `\"_id\"` field was used to index our data frame. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'_id'"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.index.index_field"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Next, we can check which fields from elasticsearch are available to our eland data frame. `columns` is available as a parameter when instantiating the data frame, which allows one to choose only a subset of fields from your index to be included in the data frame. Since we didn't set this parameter, we have access to all fields."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['country', 'customer_id', 'description', 'invoice_date', 'invoice_no',\n",
+ " 'quantity', 'stock_code', 'unit_price'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now, let's see the data types of our fields. Running `df.dtypes`, we can see that elasticsearch field types are mapped to pandas field types."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "country object\n",
+ "customer_id object\n",
+ "description object\n",
+ "invoice_date datetime64[ns]\n",
+ "invoice_no object\n",
+ "quantity int64\n",
+ "stock_code object\n",
+ "unit_price float64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We also offer a `.info_es()` data frame method that shows all info about the underlying index. It also contains information about operations being passed from data frame methods to elasticsearch. More on this later."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "index_pattern: online-retail\n",
+ "Index:\n",
+ "\tindex_field: _id\n",
+ "\tis_source_field: False\n",
+ "Mappings:\n",
+ "\tcapabilities: _source es_dtype pd_dtype searchable aggregatable\n",
+ "country True keyword object True True\n",
+ "customer_id True keyword object True True\n",
+ "description True keyword object True True\n",
+ "invoice_date True date datetime64[ns] True True\n",
+ "invoice_no True keyword object True True\n",
+ "quantity True integer int64 True True\n",
+ "stock_code True keyword object True True\n",
+ "unit_price True float float64 True True\n",
+ "Operations:\n",
+ "\ttasks: []\n",
+ "\tsize: None\n",
+ "\tsort_params: None\n",
+ "\tcolumns: None\n",
+ "\tpost_processing: []\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(df.info_es())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Selecting and Indexing Data\n",
+ "\n",
+ "Now that we understand how to create a data frame and get access to its underlying attributes, let's see how we can select subsets of our data."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### head and tail\n",
+ "\n",
+ "Much like pandas, eland data frames offer `.head(n)` and `.tail(n)` methods that return the first and last n rows, respectively."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " customer_id | \n",
+ " description | \n",
+ " invoice_date | \n",
+ " invoice_no | \n",
+ " quantity | \n",
+ " stock_code | \n",
+ " unit_price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " wXcVa24BUkfJ5hz0pRsL | \n",
+ " United Kingdom | \n",
+ " 17850 | \n",
+ " WHITE HANGING HEART T-LIGHT HOLDER | \n",
+ " 2010-12-01 08:26:00 | \n",
+ " 536365 | \n",
+ " 6 | \n",
+ " 85123A | \n",
+ " 2.55 | \n",
+ "
\n",
+ " \n",
+ " wncVa24BUkfJ5hz0pRsL | \n",
+ " United Kingdom | \n",
+ " 17850 | \n",
+ " WHITE METAL LANTERN | \n",
+ " 2010-12-01 08:26:00 | \n",
+ " 536365 | \n",
+ " 6 | \n",
+ " 71053 | \n",
+ " 3.39 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "2 rows x 8 columns
"
+ ],
+ "text/plain": [
+ " country customer_id \\\n",
+ "wXcVa24BUkfJ5hz0pRsL United Kingdom 17850 \n",
+ "wncVa24BUkfJ5hz0pRsL United Kingdom 17850 \n",
+ "\n",
+ " description invoice_date \\\n",
+ "wXcVa24BUkfJ5hz0pRsL WHITE HANGING HEART T-LIGHT HOLDER 2010-12-01 08:26:00 \n",
+ "wncVa24BUkfJ5hz0pRsL WHITE METAL LANTERN 2010-12-01 08:26:00 \n",
+ "\n",
+ " invoice_no quantity stock_code unit_price \n",
+ "wXcVa24BUkfJ5hz0pRsL 536365 6 85123A 2.55 \n",
+ "wncVa24BUkfJ5hz0pRsL 536365 6 71053 3.39 \n",
+ "\n",
+ "[2 rows x 8 columns]"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " customer_id | \n",
+ " description | \n",
+ " invoice_date | \n",
+ " invoice_no | \n",
+ " quantity | \n",
+ " stock_code | \n",
+ " unit_price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " vXgVa24BUkfJ5hz0txvj | \n",
+ " United Kingdom | \n",
+ " | \n",
+ " MULTICOLOUR HONEYCOMB FAN | \n",
+ " 2011-01-20 18:08:00 | \n",
+ " 541696 | \n",
+ " 1 | \n",
+ " 21209 | \n",
+ " 1.63 | \n",
+ "
\n",
+ " \n",
+ " vngVa24BUkfJ5hz0txvj | \n",
+ " United Kingdom | \n",
+ " | \n",
+ " PACK OF 72 RETROSPOT CAKE CASES | \n",
+ " 2011-01-20 18:08:00 | \n",
+ " 541696 | \n",
+ " 1 | \n",
+ " 21212 | \n",
+ " 1.25 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "2 rows x 8 columns
"
+ ],
+ "text/plain": [
+ " country customer_id \\\n",
+ "vXgVa24BUkfJ5hz0txvj United Kingdom \n",
+ "vngVa24BUkfJ5hz0txvj United Kingdom \n",
+ "\n",
+ " description invoice_date \\\n",
+ "vXgVa24BUkfJ5hz0txvj MULTICOLOUR HONEYCOMB FAN 2011-01-20 18:08:00 \n",
+ "vngVa24BUkfJ5hz0txvj PACK OF 72 RETROSPOT CAKE CASES 2011-01-20 18:08:00 \n",
+ "\n",
+ " invoice_no quantity stock_code unit_price \n",
+ "vXgVa24BUkfJ5hz0txvj 541696 1 21209 1.63 \n",
+ "vngVa24BUkfJ5hz0txvj 541696 1 21212 1.25 \n",
+ "\n",
+ "[2 rows x 8 columns]"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.tail(2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### selecting columns\n",
+ "\n",
+ "You can also pass a list of columns to select columns from the data frame in a specified order."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " invoice_date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " wXcVa24BUkfJ5hz0pRsL | \n",
+ " United Kingdom | \n",
+ " 2010-12-01 08:26:00 | \n",
+ "
\n",
+ " \n",
+ " wncVa24BUkfJ5hz0pRsL | \n",
+ " United Kingdom | \n",
+ " 2010-12-01 08:26:00 | \n",
+ "
\n",
+ " \n",
+ " w3cVa24BUkfJ5hz0pRsL | \n",
+ " United Kingdom | \n",
+ " 2010-12-01 08:26:00 | \n",
+ "
\n",
+ " \n",
+ " xHcVa24BUkfJ5hz0pRsL | \n",
+ " United Kingdom | \n",
+ " 2010-12-01 08:26:00 | \n",
+ "
\n",
+ " \n",
+ " xXcVa24BUkfJ5hz0pRsL | \n",
+ " United Kingdom | \n",
+ " 2010-12-01 08:26:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "5 rows x 2 columns
"
+ ],
+ "text/plain": [
+ " country invoice_date\n",
+ "wXcVa24BUkfJ5hz0pRsL United Kingdom 2010-12-01 08:26:00\n",
+ "wncVa24BUkfJ5hz0pRsL United Kingdom 2010-12-01 08:26:00\n",
+ "w3cVa24BUkfJ5hz0pRsL United Kingdom 2010-12-01 08:26:00\n",
+ "xHcVa24BUkfJ5hz0pRsL United Kingdom 2010-12-01 08:26:00\n",
+ "xXcVa24BUkfJ5hz0pRsL United Kingdom 2010-12-01 08:26:00\n",
+ "\n",
+ "[5 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[['country', 'invoice_date']].head(5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Boolean Indexing\n",
+ "\n",
+ "We also allow you to filter the data frame using boolean indexing. Under the hood, a boolean index maps to an elasticsearch query (a `term` query in the example below) that is then passed to elasticsearch to filter the index."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 111,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'term': {'country': 'Germany'}}\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " customer_id | \n",
+ " description | \n",
+ " invoice_date | \n",
+ " invoice_no | \n",
+ " quantity | \n",
+ " stock_code | \n",
+ " unit_price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " FncVa24BUkfJ5hz0pSBJ | \n",
+ " Germany | \n",
+ " 12662 | \n",
+ " SET OF 6 T-LIGHTS SANTA | \n",
+ " 2010-12-01 13:04:00 | \n",
+ " 536527 | \n",
+ " 6 | \n",
+ " 22809 | \n",
+ " 2.95 | \n",
+ "
\n",
+ " \n",
+ " F3cVa24BUkfJ5hz0pSBJ | \n",
+ " Germany | \n",
+ " 12662 | \n",
+ " ROTATING SILVER ANGELS T-LIGHT HLDR | \n",
+ " 2010-12-01 13:04:00 | \n",
+ " 536527 | \n",
+ " 6 | \n",
+ " 84347 | \n",
+ " 2.55 | \n",
+ "
\n",
+ " \n",
+ " GHcVa24BUkfJ5hz0pSBJ | \n",
+ " Germany | \n",
+ " 12662 | \n",
+ " MULTI COLOUR SILVER T-LIGHT HOLDER | \n",
+ " 2010-12-01 13:04:00 | \n",
+ " 536527 | \n",
+ " 12 | \n",
+ " 84945 | \n",
+ " 0.85 | \n",
+ "
\n",
+ " \n",
+ " GXcVa24BUkfJ5hz0pSBJ | \n",
+ " Germany | \n",
+ " 12662 | \n",
+ " 5 HOOK HANGER MAGIC TOADSTOOL | \n",
+ " 2010-12-01 13:04:00 | \n",
+ " 536527 | \n",
+ " 12 | \n",
+ " 22242 | \n",
+ " 1.65 | \n",
+ "
\n",
+ " \n",
+ " GncVa24BUkfJ5hz0pSBJ | \n",
+ " Germany | \n",
+ " 12662 | \n",
+ " 3 HOOK HANGER MAGIC GARDEN | \n",
+ " 2010-12-01 13:04:00 | \n",
+ " 536527 | \n",
+ " 12 | \n",
+ " 22244 | \n",
+ " 1.95 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "5 rows x 8 columns
"
+ ],
+ "text/plain": [
+ " country customer_id \\\n",
+ "FncVa24BUkfJ5hz0pSBJ Germany 12662 \n",
+ "F3cVa24BUkfJ5hz0pSBJ Germany 12662 \n",
+ "GHcVa24BUkfJ5hz0pSBJ Germany 12662 \n",
+ "GXcVa24BUkfJ5hz0pSBJ Germany 12662 \n",
+ "GncVa24BUkfJ5hz0pSBJ Germany 12662 \n",
+ "\n",
+ " description invoice_date \\\n",
+ "FncVa24BUkfJ5hz0pSBJ SET OF 6 T-LIGHTS SANTA 2010-12-01 13:04:00 \n",
+ "F3cVa24BUkfJ5hz0pSBJ ROTATING SILVER ANGELS T-LIGHT HLDR 2010-12-01 13:04:00 \n",
+ "GHcVa24BUkfJ5hz0pSBJ MULTI COLOUR SILVER T-LIGHT HOLDER 2010-12-01 13:04:00 \n",
+ "GXcVa24BUkfJ5hz0pSBJ 5 HOOK HANGER MAGIC TOADSTOOL 2010-12-01 13:04:00 \n",
+ "GncVa24BUkfJ5hz0pSBJ 3 HOOK HANGER MAGIC GARDEN 2010-12-01 13:04:00 \n",
+ "\n",
+ " invoice_no quantity stock_code unit_price \n",
+ "FncVa24BUkfJ5hz0pSBJ 536527 6 22809 2.95 \n",
+ "F3cVa24BUkfJ5hz0pSBJ 536527 6 84347 2.55 \n",
+ "GHcVa24BUkfJ5hz0pSBJ 536527 12 84945 0.85 \n",
+ "GXcVa24BUkfJ5hz0pSBJ 536527 12 22242 1.65 \n",
+ "GncVa24BUkfJ5hz0pSBJ 536527 12 22244 1.95 \n",
+ "\n",
+ "[5 rows x 8 columns]"
+ ]
+ },
+ "execution_count": 111,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# the construction of a boolean vector maps directly to an elasticsearch query\n",
+ "print(df['country']=='Germany')\n",
+ "df[(df['country']=='Germany')].head(5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**TODO:** add an `isin` example"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can also combine boolean vectors to further filter the data frame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 115,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " customer_id | \n",
+ " description | \n",
+ " invoice_date | \n",
+ " invoice_no | \n",
+ " quantity | \n",
+ " stock_code | \n",
+ " unit_price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 5XcVa24BUkfJ5hz0q3Mq | \n",
+ " Germany | \n",
+ " 12471 | \n",
+ " FUNKY DIVA PEN | \n",
+ " 2010-12-10 09:35:00 | \n",
+ " 538174 | \n",
+ " 96 | \n",
+ " 22741 | \n",
+ " 0.85 | \n",
+ "
\n",
+ " \n",
+ " 7XcVa24BUkfJ5hz0q3Mq | \n",
+ " Germany | \n",
+ " 12471 | \n",
+ " LIPSTICK PEN RED | \n",
+ " 2010-12-10 09:35:00 | \n",
+ " 538174 | \n",
+ " 100 | \n",
+ " 22419 | \n",
+ " 0.36 | \n",
+ "
\n",
+ " \n",
+ " FHcVa24BUkfJ5hz0s-K9 | \n",
+ " Germany | \n",
+ " 12500 | \n",
+ " PACK OF 6 BIRDY GIFT TAGS | \n",
+ " 2011-01-10 09:48:00 | \n",
+ " 540553 | \n",
+ " 144 | \n",
+ " 22585 | \n",
+ " 1.06 | \n",
+ "
\n",
+ " \n",
+ " XncVa24BUkfJ5hz0s-K9 | \n",
+ " Germany | \n",
+ " 12524 | \n",
+ " BOX OF 24 COCKTAIL PARASOLS | \n",
+ " 2011-01-10 10:35:00 | \n",
+ " 540562 | \n",
+ " 100 | \n",
+ " 84692 | \n",
+ " 0.42 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "4 rows x 8 columns
"
+ ],
+ "text/plain": [
+ " country customer_id description \\\n",
+ "5XcVa24BUkfJ5hz0q3Mq Germany 12471 FUNKY DIVA PEN \n",
+ "7XcVa24BUkfJ5hz0q3Mq Germany 12471 LIPSTICK PEN RED \n",
+ "FHcVa24BUkfJ5hz0s-K9 Germany 12500 PACK OF 6 BIRDY GIFT TAGS \n",
+ "XncVa24BUkfJ5hz0s-K9 Germany 12524 BOX OF 24 COCKTAIL PARASOLS \n",
+ "\n",
+ " invoice_date invoice_no quantity stock_code \\\n",
+ "5XcVa24BUkfJ5hz0q3Mq 2010-12-10 09:35:00 538174 96 22741 \n",
+ "7XcVa24BUkfJ5hz0q3Mq 2010-12-10 09:35:00 538174 100 22419 \n",
+ "FHcVa24BUkfJ5hz0s-K9 2011-01-10 09:48:00 540553 144 22585 \n",
+ "XncVa24BUkfJ5hz0s-K9 2011-01-10 10:35:00 540562 100 84692 \n",
+ "\n",
+ " unit_price \n",
+ "5XcVa24BUkfJ5hz0q3Mq 0.85 \n",
+ "7XcVa24BUkfJ5hz0q3Mq 0.36 \n",
+ "FHcVa24BUkfJ5hz0s-K9 1.06 \n",
+ "XncVa24BUkfJ5hz0s-K9 0.42 \n",
+ "\n",
+ "[4 rows x 8 columns]"
+ ]
+ },
+ "execution_count": 115,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[(df['country']=='Germany') & (df['quantity']>90)]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Using this example, let's see how eland translates this boolean filter to an elasticsearch `bool` query."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "index_pattern: online-retail\n",
+ "Index:\n",
+ "\tindex_field: _id\n",
+ "\tis_source_field: False\n",
+ "Mappings:\n",
+ "\tcapabilities: _source es_dtype pd_dtype searchable aggregatable\n",
+ "country True keyword object True True\n",
+ "customer_id True keyword object True True\n",
+ "description True keyword object True True\n",
+ "invoice_date True date datetime64[ns] True True\n",
+ "invoice_no True keyword object True True\n",
+ "quantity True integer int64 True True\n",
+ "stock_code True keyword object True True\n",
+ "unit_price True float float64 True True\n",
+ "Operations:\n",
+ "\ttasks: [('boolean_filter', {'bool': {'must': [{'term': {'country': 'Germany'}}, {'range': {'quantity': {'gt': 90}}}]}})]\n",
+ "\tsize: None\n",
+ "\tsort_params: None\n",
+ "\tcolumns: None\n",
+ "\tpost_processing: []\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(df[(df['country']=='Germany') & (df['quantity']>90)].info_es())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Aggregation and Descriptive Statistics\n",
+ "\n",
+ "Let's begin to ask some questions of our data and use eland to get the answers."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**How many different countries are there?**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "country 24\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 76,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['country'].nunique()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**What is the total sum of products ordered?**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "quantity 548076.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 80,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['quantity'].sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Show me the sum, mean, min, and max of the quantity and unit_price fields**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " quantity | \n",
+ " unit_price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " sum | \n",
+ " 548076.000000 | \n",
+ " 383761.569666 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 8.363231 | \n",
+ " 5.855916 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 74215.000000 | \n",
+ " 16888.019531 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " -74215.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " quantity unit_price\n",
+ "sum 548076.000000 383761.569666\n",
+ "mean 8.363231 5.855916\n",
+ "max 74215.000000 16888.019531\n",
+ "min -74215.000000 0.000000"
+ ]
+ },
+ "execution_count": 93,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[['quantity','unit_price']].agg(['sum', 'mean', 'max', 'min'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Give me descriptive statistics for the entire data frame**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 119,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " quantity | \n",
+ " unit_price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 65534.000000 | \n",
+ " 65534.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 8.363231 | \n",
+ " 5.855916 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 413.694481 | \n",
+ " 145.755942 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " -74215.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 1.000000 | \n",
+ " 1.250000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 2.000000 | \n",
+ " 2.510000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 8.000000 | \n",
+ " 4.234706 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 74215.000000 | \n",
+ " 16888.019531 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " quantity unit_price\n",
+ "count 65534.000000 65534.000000\n",
+ "mean 8.363231 5.855916\n",
+ "std 413.694481 145.755942\n",
+ "min -74215.000000 0.000000\n",
+ "25% 1.000000 1.250000\n",
+ "50% 2.000000 2.510000\n",
+ "75% 8.000000 4.234706\n",
+ "max 74215.000000 16888.019531"
+ ]
+ },
+ "execution_count": 119,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Show me a histogram of numeric columns**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[,\n",
+ " ]],\n",
+ " dtype=object)"
+ ]
+ },
+ "execution_count": 110,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "df[(df['quantity']>-50) & \n",
+ " (df['quantity']<50) & \n",
+ " (df['unit_price']>0) & \n",
+ " (df['unit_price']<100)].select_dtypes(include=[np.number]).hist(figsize=[12,4], bins=30)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/example/README.md b/example/README.md
new file mode 100644
index 0000000..32b5824
--- /dev/null
+++ b/example/README.md
@@ -0,0 +1,17 @@
+# Example Walkthrough for eland
+
+This example demonstrates the functionality of `eland` through a walkthrough of a simple analysis of the [Online Retail Dataset](https://archive.ics.uci.edu/ml/datasets/online+retail).
+
+To run this example, make sure that you have an elasticsearch cluster running on port 9200, and install the dependencies the example needs on top of `eland`:
+
+```
+pip install -r requirements-example.txt
+```
+
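+Once these requirements are satisfied, make sure the dataset is available at `data/online-retail.csv` (relative to the directory you run the script from) and load the data using the provided script: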
+
+```
+python load_data.py
+```
+
+This will create an index called `online-retail` with a mapping defined in `load_data.py`.
\ No newline at end of file
diff --git a/example/load_data.py b/example/load_data.py
new file mode 100644
index 0000000..4b93d91
--- /dev/null
+++ b/example/load_data.py
@@ -0,0 +1,131 @@
+import argparse
+import csv
+
+from elasticsearch import Elasticsearch, helpers
+from elasticsearch.exceptions import TransportError
+
+def create_index(es, index):
+    """
+    creates `index` with the field mapping used for the online retail data
+
+    es: client whose `indices.create` is called
+    index: name of the index to create
+
+    an index that already exists is left as it is; any other TransportError
+    coming out of the create call is re-raised
+    """
+    properties = {
+        "invoice_no": {"type": "keyword"},
+        "stock_code": {"type": "keyword"},
+        "description": {"type": "keyword"},
+        "quantity": {"type": "integer"},
+        "invoice_date": {"type": "date", "format": "MM/dd/yyyy HH:mm"},
+        "unit_price": {"type": "float"},
+        "customer_id": {"type": "keyword"},
+        "country": {"type": "keyword"}
+    }
+    body = {"mappings": {"properties": properties}}
+
+    # create an empty index
+    try:
+        es.indices.create(index=index, body=body)
+    except TransportError as err:
+        # only an already existing index is ignored
+        if err.error != "resource_already_exists_exception":
+            raise
+
+def parse_date(date):
+    """
+    we need to convert dates to conform to the mapping in the following way:
+        months: one or two digit ints -> MM
+        days: one or two digit ints -> dd
+        years: two digit ints -> yyyy (20 is prepended), four digit years are kept
+        times: {H}H:mm -> HH:mm
+
+    date: string shaped like "M/D/YY H:mm" (or "M/D/YYYY H:mm")
+    returns: the same date as a "MM/dd/yyyy HH:mm" string
+    raises: IndexError when a "/" separated part or the time of day is missing
+    """
+
+    parts = date.split("/")
+
+    month = parts[0].zfill(2)
+    day = parts[1].zfill(2)
+
+    # the last "/" separated chunk holds both the year and the time of day
+    remainder = parts[2].split()
+
+    # a year that is already four digits long must not get the century prepended
+    year = remainder[0]
+    if len(year) != 4:
+        year = "20{}".format(year)
+
+    # pad the hours only, so the minutes are never touched
+    hours, colon, minutes = remainder[1].partition(":")
+    clock = "{}{}{}".format(hours.zfill(2), colon, minutes)
+
+    return "{}/{}/{} {}".format(month, day, year, clock)
+
+def parse_line(line):
+    """
+    builds the document to be indexed from one csv row
+
+    line: sequence of column values in the order invoice_no, stock_code,
+          description, quantity, invoice_date, unit_price, customer_id, country
+    returns: dict keyed by those field names, with the invoice date passed
+             through parse_date and newlines removed from the country
+    """
+    invoice_no, stock_code, description, quantity = line[0], line[1], line[2], line[3]
+    invoice_date = parse_date(line[4])
+    unit_price, customer_id = line[5], line[6]
+    country = line[7].replace("\n", "")
+
+    return {
+        "invoice_no": invoice_no,
+        "stock_code": stock_code,
+        "description": description,
+        "quantity": quantity,
+        "invoice_date": invoice_date,
+        "unit_price": unit_price,
+        "customer_id": customer_id,
+        "country": country
+    }
+
+def load_data(es, index="online-retail", path="data/online-retail.csv"):
+    """
+    generate one document per line of the csv file
+    read file line by line to avoid loading all data into memory
+
+    es: client handed to create_index
+    index: name of the index to create (default: online-retail)
+    path: csv file to read (default: data/online-retail.csv)
+
+    this is a generator, so the index is created and the file is opened only
+    once the first document is requested
+    """
+
+    create_index(es, index)
+
+    # newline="" leaves newline handling to the csv module, as its documentation recommends
+    with open(path, "r", newline="") as f:
+        reader = csv.reader(f, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL)
+
+        # the first row holds the column names, not a document
+        next(reader, None)
+
+        for line in reader:
+            # csv.reader returns an empty list for a blank line; there is nothing to index
+            if not line:
+                continue
+
+            yield parse_line(line)
+
+
+if __name__ == "__main__":
+    # command line entry point: bulk-index the online retail csv into elasticsearch
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-H",
+        "--host",
+        action="store",
+        default="localhost:9200",
+        help="The elasticsearch host you wish to connect to. (Default: localhost:9200)"
+    )
+
+    args = parser.parse_args()
+
+    # create the elasticsearch client, pointing to the host parameter
+    es = Elasticsearch(args.host)
+    index = 'online-retail'
+
+    # load data from online retail csv in data directory
+    stream = load_data(es)
+
+    # raise_on_error=False: by default streaming_bulk raises on the first failed
+    # document, which would make the "not ok" branch below unreachable
+    for ok, item in helpers.streaming_bulk(
+        es,
+        actions=stream,
+        index=index,
+        chunk_size=1000,
+        raise_on_error=False
+    ):
+        # each item is a one-entry dict: {action: details}
+        action, result = item.popitem()
+        doc_id = "/{}/doc/{}".format(index, result['_id'])
+
+        if not ok:
+            print("Failed to {} document {} {}".format(action, doc_id, result))
+        else:
+            print(doc_id)
+
+    # make docs available for searches
+    es.indices.refresh(index=index)
+
+    # notify user of number of documents indexed
+    print(es.count(index=index)["count"], "documents in index")
diff --git a/example/requirements-example.txt b/example/requirements-example.txt
new file mode 100644
index 0000000..dc9beca
--- /dev/null
+++ b/example/requirements-example.txt
@@ -0,0 +1,80 @@
+alabaster==0.7.12
+appnope==0.1.0
+atomicwrites==1.3.0
+attrs==19.3.0
+Babel==2.7.0
+backcall==0.1.0
+bleach==3.1.0
+certifi==2019.9.11
+chardet==3.0.4
+cycler==0.10.0
+decorator==4.4.1
+defusedxml==0.6.0
+docutils==0.15.2
+eland==0.1
+elasticsearch==7.1.0
+entrypoints==0.3
+idna==2.8
+imagesize==1.1.0
+importlib-metadata==0.23
+ipykernel==5.1.3
+ipython==7.9.0
+ipython-genutils==0.2.0
+ipywidgets==7.5.1
+jedi==0.15.1
+Jinja2==2.10.3
+jsonschema==3.1.1
+jupyter==1.0.0
+jupyter-client==5.3.4
+jupyter-console==6.0.0
+jupyter-core==4.6.1
+kiwisolver==1.1.0
+MarkupSafe==1.1.1
+matplotlib==3.1.1
+mistune==0.8.4
+more-itertools==7.2.0
+nbconvert==5.6.1
+nbformat==4.4.0
+notebook==6.0.2
+numpy==1.17.4
+numpydoc==0.8.0
+packaging==19.2
+pandas==0.25.1
+pandocfilters==1.4.2
+parso==0.5.1
+pexpect==4.7.0
+pickleshare==0.7.5
+pluggy==0.13.0
+prometheus-client==0.7.1
+prompt-toolkit==2.0.10
+ptyprocess==0.6.0
+py==1.8.0
+Pygments==2.4.2
+pyparsing==2.4.5
+pyrsistent==0.15.5
+pytest==5.2.2
+python-dateutil==2.8.1
+pytz==2019.3
+pyzmq==18.1.1
+qtconsole==4.5.5
+requests==2.22.0
+Send2Trash==1.5.0
+six==1.13.0
+snowballstemmer==2.0.0
+Sphinx==2.2.1
+sphinx-rtd-theme==0.4.3
+sphinxcontrib-applehelp==1.0.1
+sphinxcontrib-devhelp==1.0.1
+sphinxcontrib-htmlhelp==1.0.2
+sphinxcontrib-jsmath==1.0.1
+sphinxcontrib-qthelp==1.0.2
+sphinxcontrib-serializinghtml==1.1.3
+terminado==0.8.3
+testpath==0.4.4
+tornado==6.0.3
+traitlets==4.3.3
+urllib3==1.25.7
+wcwidth==0.1.7
+webencodings==0.5.1
+widgetsnbextension==3.5.1
+zipp==0.6.0