diff --git a/example/Online Retail Analysis.ipynb b/example/Online Retail Analysis.ipynb new file mode 100644 index 0000000..1cdb001 --- /dev/null +++ b/example/Online Retail Analysis.ipynb @@ -0,0 +1,1105 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "import eland as ed\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exploratory Data Analysis with eland" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Getting Started\n", + "\n", + "To get started, let's explore the attributes of the `online-retail` index. First, we'll instantiate the data frame by pointing the constructor to a particular index in our local elasticsearch cluster. \n", + "\n", + "The `online-retail` index was created by running `python load_data.py` from the `example` directory." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df = ed.read_es(\"http://localhost:9200\", \"online-retail\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we see that the `\"_id\"` field was used to index our data frame. " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'_id'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.index.index_field" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we can check which fields from elasticsearch are available to our eland data frame. `columns` is available as a parameter when instantiating the data frame which allows one to choose only a subset of fields from your index to be included in the data frame. Since we didn't set this parameter, we have access to all fields."
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['country', 'customer_id', 'description', 'invoice_date', 'invoice_no',\n", + " 'quantity', 'stock_code', 'unit_price'],\n", + " dtype='object')" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's see the data types of our fields. Running `df.dtypes`, we can see that elasticsearch field types are mapped to pandas field types." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "country object\n", + "customer_id object\n", + "description object\n", + "invoice_date datetime64[ns]\n", + "invoice_no object\n", + "quantity int64\n", + "stock_code object\n", + "unit_price float64\n", + "dtype: object" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also offer a `.info_es()` data frame method that shows all info about the underlying index. It also contains information about operations being passed from data frame methods to elasticsearch. More on this later." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "index_pattern: online-retail\n", + "Index:\n", + "\tindex_field: _id\n", + "\tis_source_field: False\n", + "Mappings:\n", + "\tcapabilities: _source es_dtype pd_dtype searchable aggregatable\n", + "country True keyword object True True\n", + "customer_id True keyword object True True\n", + "description True keyword object True True\n", + "invoice_date True date datetime64[ns] True True\n", + "invoice_no True keyword object True True\n", + "quantity True integer int64 True True\n", + "stock_code True keyword object True True\n", + "unit_price True float float64 True True\n", + "Operations:\n", + "\ttasks: []\n", + "\tsize: None\n", + "\tsort_params: None\n", + "\tcolumns: None\n", + "\tpost_processing: []\n", + "\n" + ] + } + ], + "source": [ + "print(df.info_es())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Selecting and Indexing Data\n", + "\n", + "Now that we understand how to create a data frame and get access to its underlying attributes, let's see how we can select subsets of our data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### head and tail\n", + "\n", + "Much like pandas, eland data frames offer `.head(n)` and `.tail(n)` methods that return the first and last n rows, respectively." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrycustomer_iddescriptioninvoice_dateinvoice_noquantitystock_codeunit_price
wXcVa24BUkfJ5hz0pRsLUnited Kingdom17850WHITE HANGING HEART T-LIGHT HOLDER2010-12-01 08:26:00536365685123A2.55
wncVa24BUkfJ5hz0pRsLUnited Kingdom17850WHITE METAL LANTERN2010-12-01 08:26:005363656710533.39
\n", + "
\n", + "

2 rows x 8 columns

" + ], + "text/plain": [ + " country customer_id \\\n", + "wXcVa24BUkfJ5hz0pRsL United Kingdom 17850 \n", + "wncVa24BUkfJ5hz0pRsL United Kingdom 17850 \n", + "\n", + " description invoice_date \\\n", + "wXcVa24BUkfJ5hz0pRsL WHITE HANGING HEART T-LIGHT HOLDER 2010-12-01 08:26:00 \n", + "wncVa24BUkfJ5hz0pRsL WHITE METAL LANTERN 2010-12-01 08:26:00 \n", + "\n", + " invoice_no quantity stock_code unit_price \n", + "wXcVa24BUkfJ5hz0pRsL 536365 6 85123A 2.55 \n", + "wncVa24BUkfJ5hz0pRsL 536365 6 71053 3.39 \n", + "\n", + "[2 rows x 8 columns]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrycustomer_iddescriptioninvoice_dateinvoice_noquantitystock_codeunit_price
vXgVa24BUkfJ5hz0txvjUnited KingdomMULTICOLOUR HONEYCOMB FAN2011-01-20 18:08:005416961212091.63
vngVa24BUkfJ5hz0txvjUnited KingdomPACK OF 72 RETROSPOT CAKE CASES2011-01-20 18:08:005416961212121.25
\n", + "
\n", + "

2 rows x 8 columns

" + ], + "text/plain": [ + " country customer_id \\\n", + "vXgVa24BUkfJ5hz0txvj United Kingdom \n", + "vngVa24BUkfJ5hz0txvj United Kingdom \n", + "\n", + " description invoice_date \\\n", + "vXgVa24BUkfJ5hz0txvj MULTICOLOUR HONEYCOMB FAN 2011-01-20 18:08:00 \n", + "vngVa24BUkfJ5hz0txvj PACK OF 72 RETROSPOT CAKE CASES 2011-01-20 18:08:00 \n", + "\n", + " invoice_no quantity stock_code unit_price \n", + "vXgVa24BUkfJ5hz0txvj 541696 1 21209 1.63 \n", + "vngVa24BUkfJ5hz0txvj 541696 1 21212 1.25 \n", + "\n", + "[2 rows x 8 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.tail(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### selecting columns\n", + "\n", + "you can also pass a list of columns to select columns from the data frame in a specified order." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryinvoice_date
wXcVa24BUkfJ5hz0pRsLUnited Kingdom2010-12-01 08:26:00
wncVa24BUkfJ5hz0pRsLUnited Kingdom2010-12-01 08:26:00
w3cVa24BUkfJ5hz0pRsLUnited Kingdom2010-12-01 08:26:00
xHcVa24BUkfJ5hz0pRsLUnited Kingdom2010-12-01 08:26:00
xXcVa24BUkfJ5hz0pRsLUnited Kingdom2010-12-01 08:26:00
\n", + "
\n", + "

5 rows x 2 columns

" + ], + "text/plain": [ + " country invoice_date\n", + "wXcVa24BUkfJ5hz0pRsL United Kingdom 2010-12-01 08:26:00\n", + "wncVa24BUkfJ5hz0pRsL United Kingdom 2010-12-01 08:26:00\n", + "w3cVa24BUkfJ5hz0pRsL United Kingdom 2010-12-01 08:26:00\n", + "xHcVa24BUkfJ5hz0pRsL United Kingdom 2010-12-01 08:26:00\n", + "xXcVa24BUkfJ5hz0pRsL United Kingdom 2010-12-01 08:26:00\n", + "\n", + "[5 rows x 2 columns]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[['country', 'invoice_date']].head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Boolean Indexing\n", + "\n", + "We also allow you to filter the data frame using boolean indexing. Under the hood, a boolean index maps to a `term` query that is then passed to elasticsearch to filter the index." + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'term': {'country': 'Germany'}}\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrycustomer_iddescriptioninvoice_dateinvoice_noquantitystock_codeunit_price
FncVa24BUkfJ5hz0pSBJGermany12662SET OF 6 T-LIGHTS SANTA2010-12-01 13:04:005365276228092.95
F3cVa24BUkfJ5hz0pSBJGermany12662ROTATING SILVER ANGELS T-LIGHT HLDR2010-12-01 13:04:005365276843472.55
GHcVa24BUkfJ5hz0pSBJGermany12662MULTI COLOUR SILVER T-LIGHT HOLDER2010-12-01 13:04:0053652712849450.85
GXcVa24BUkfJ5hz0pSBJGermany126625 HOOK HANGER MAGIC TOADSTOOL2010-12-01 13:04:0053652712222421.65
GncVa24BUkfJ5hz0pSBJGermany126623 HOOK HANGER MAGIC GARDEN2010-12-01 13:04:0053652712222441.95
\n", + "
\n", + "

5 rows x 8 columns

" + ], + "text/plain": [ + " country customer_id \\\n", + "FncVa24BUkfJ5hz0pSBJ Germany 12662 \n", + "F3cVa24BUkfJ5hz0pSBJ Germany 12662 \n", + "GHcVa24BUkfJ5hz0pSBJ Germany 12662 \n", + "GXcVa24BUkfJ5hz0pSBJ Germany 12662 \n", + "GncVa24BUkfJ5hz0pSBJ Germany 12662 \n", + "\n", + " description invoice_date \\\n", + "FncVa24BUkfJ5hz0pSBJ SET OF 6 T-LIGHTS SANTA 2010-12-01 13:04:00 \n", + "F3cVa24BUkfJ5hz0pSBJ ROTATING SILVER ANGELS T-LIGHT HLDR 2010-12-01 13:04:00 \n", + "GHcVa24BUkfJ5hz0pSBJ MULTI COLOUR SILVER T-LIGHT HOLDER 2010-12-01 13:04:00 \n", + "GXcVa24BUkfJ5hz0pSBJ 5 HOOK HANGER MAGIC TOADSTOOL 2010-12-01 13:04:00 \n", + "GncVa24BUkfJ5hz0pSBJ 3 HOOK HANGER MAGIC GARDEN 2010-12-01 13:04:00 \n", + "\n", + " invoice_no quantity stock_code unit_price \n", + "FncVa24BUkfJ5hz0pSBJ 536527 6 22809 2.95 \n", + "F3cVa24BUkfJ5hz0pSBJ 536527 6 84347 2.55 \n", + "GHcVa24BUkfJ5hz0pSBJ 536527 12 84945 0.85 \n", + "GXcVa24BUkfJ5hz0pSBJ 536527 12 22242 1.65 \n", + "GncVa24BUkfJ5hz0pSBJ 536527 12 22244 1.95 \n", + "\n", + "[5 rows x 8 columns]" + ] + }, + "execution_count": 111, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# the construction of a boolean vector maps directly to an elasticsearch query\n", + "print(df['country']=='Germany')\n", + "df[(df['country']=='Germany')].head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# add isin example" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also combine boolean vectors to further filter the data frame." + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrycustomer_iddescriptioninvoice_dateinvoice_noquantitystock_codeunit_price
5XcVa24BUkfJ5hz0q3MqGermany12471FUNKY DIVA PEN2010-12-10 09:35:0053817496227410.85
7XcVa24BUkfJ5hz0q3MqGermany12471LIPSTICK PEN RED2010-12-10 09:35:00538174100224190.36
FHcVa24BUkfJ5hz0s-K9Germany12500PACK OF 6 BIRDY GIFT TAGS2011-01-10 09:48:00540553144225851.06
XncVa24BUkfJ5hz0s-K9Germany12524BOX OF 24 COCKTAIL PARASOLS2011-01-10 10:35:00540562100846920.42
\n", + "
\n", + "

4 rows x 8 columns

" + ], + "text/plain": [ + " country customer_id description \\\n", + "5XcVa24BUkfJ5hz0q3Mq Germany 12471 FUNKY DIVA PEN \n", + "7XcVa24BUkfJ5hz0q3Mq Germany 12471 LIPSTICK PEN RED \n", + "FHcVa24BUkfJ5hz0s-K9 Germany 12500 PACK OF 6 BIRDY GIFT TAGS \n", + "XncVa24BUkfJ5hz0s-K9 Germany 12524 BOX OF 24 COCKTAIL PARASOLS \n", + "\n", + " invoice_date invoice_no quantity stock_code \\\n", + "5XcVa24BUkfJ5hz0q3Mq 2010-12-10 09:35:00 538174 96 22741 \n", + "7XcVa24BUkfJ5hz0q3Mq 2010-12-10 09:35:00 538174 100 22419 \n", + "FHcVa24BUkfJ5hz0s-K9 2011-01-10 09:48:00 540553 144 22585 \n", + "XncVa24BUkfJ5hz0s-K9 2011-01-10 10:35:00 540562 100 84692 \n", + "\n", + " unit_price \n", + "5XcVa24BUkfJ5hz0q3Mq 0.85 \n", + "7XcVa24BUkfJ5hz0q3Mq 0.36 \n", + "FHcVa24BUkfJ5hz0s-K9 1.06 \n", + "XncVa24BUkfJ5hz0s-K9 0.42 \n", + "\n", + "[4 rows x 8 columns]" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[(df['country']=='Germany') & (df['quantity']>90)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using this example, let's see how eland translates this boolean filter to an elasticsearch `bool` query."
+ ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "index_pattern: online-retail\n", + "Index:\n", + "\tindex_field: _id\n", + "\tis_source_field: False\n", + "Mappings:\n", + "\tcapabilities: _source es_dtype pd_dtype searchable aggregatable\n", + "country True keyword object True True\n", + "customer_id True keyword object True True\n", + "description True keyword object True True\n", + "invoice_date True date datetime64[ns] True True\n", + "invoice_no True keyword object True True\n", + "quantity True integer int64 True True\n", + "stock_code True keyword object True True\n", + "unit_price True float float64 True True\n", + "Operations:\n", + "\ttasks: [('boolean_filter', {'bool': {'must': [{'term': {'country': 'Germany'}}, {'range': {'quantity': {'gt': 90}}}]}})]\n", + "\tsize: None\n", + "\tsort_params: None\n", + "\tcolumns: None\n", + "\tpost_processing: []\n", + "\n" + ] + } + ], + "source": [ + "print(df[(df['country']=='Germany') & (df['quantity']>90)].info_es())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregation and Descriptive Statistics\n", + "\n", + "Let's begin to ask some questions of our data and use eland to get the answers." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**How many different countries are there?**" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "country 24\n", + "dtype: int64" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['country'].nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**What is the total sum of products ordered?**" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "quantity 548076.0\n", + "dtype: float64" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['quantity'].sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Show me the sum, mean, min, and max of the quantity and unit_price fields**" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
quantityunit_price
sum548076.000000383761.569666
mean8.3632315.855916
max74215.00000016888.019531
min-74215.0000000.000000
\n", + "
" + ], + "text/plain": [ + " quantity unit_price\n", + "sum 548076.000000 383761.569666\n", + "mean 8.363231 5.855916\n", + "max 74215.000000 16888.019531\n", + "min -74215.000000 0.000000" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[['quantity','unit_price']].agg(['sum', 'mean', 'max', 'min'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Give me descriptive statistics for the entire data frame**" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
quantityunit_price
count65534.00000065534.000000
mean8.3632315.855916
std413.694481145.755942
min-74215.0000000.000000
25%1.0000001.250000
50%2.0000002.510000
75%8.0000004.234706
max74215.00000016888.019531
\n", + "
" + ], + "text/plain": [ + " quantity unit_price\n", + "count 65534.000000 65534.000000\n", + "mean 8.363231 5.855916\n", + "std 413.694481 145.755942\n", + "min -74215.000000 0.000000\n", + "25% 1.000000 1.250000\n", + "50% 2.000000 2.510000\n", + "75% 8.000000 4.234706\n", + "max 74215.000000 16888.019531" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Show me a histogram of numeric columns**" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[,\n", + " ]],\n", + " dtype=object)" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df[(df['quantity']>-50) & \n", + " (df['quantity']<50) & \n", + " (df['unit_price']>0) & \n", + " (df['unit_price']<100)].select_dtypes(include=[np.number]).hist(figsize=[12,4], bins=30)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/example/README.md b/example/README.md new file mode 100644 index 0000000..32b5824 --- /dev/null +++ b/example/README.md @@ -0,0 +1,17 @@ +# Example Walkthrough for eland + +This example demonstrates the functionality of `eland` through a walkthrough of a simple analysis of the [Online Retail Dataset](https://archive.ics.uci.edu/ml/datasets/online+retail). + +To run this example, make sure that you have an elasticsearch cluster running on port 9200 and install the additional dependencies alongside `eland`: + +``` +pip install -r requirements-example.txt +``` + +Once these requirements are satisfied, load the data using the provided script: + +``` +python load_data.py +``` + +This will create an index called `online-retail` with a mapping defined in `load_data.py`.
\ No newline at end of file diff --git a/example/load_data.py b/example/load_data.py new file mode 100644 index 0000000..4b93d91 --- /dev/null +++ b/example/load_data.py @@ -0,0 +1,131 @@ +import argparse +import csv + +from elasticsearch import Elasticsearch, helpers +from elasticsearch.exceptions import TransportError + +def create_index(es, index): + mapping = { + "mappings": { + "properties": { + "invoice_no": {"type": "keyword"}, + "stock_code": {"type": "keyword"}, + "description": {"type": "keyword"}, + "quantity": {"type": "integer"}, + "invoice_date": {"type": "date", "format": "MM/dd/yyyy HH:mm"}, + "unit_price": {"type": "float"}, + "customer_id": {"type": "keyword"}, + "country": {"type": "keyword"} + } + } + } + + # create an empty index carrying the explicit field mapping defined above + try: + es.indices.create(index=index, body=mapping) + except TransportError as e: + # ignore an already existing index; every other transport error is re-raised + if e.error == "resource_already_exists_exception": + pass + else: + raise + +def parse_date(date): + """ + reformat a raw 'M/D/YY H:mm' date string to the 'MM/dd/yyyy HH:mm' format declared for invoice_date in the index mapping: + months: one or two digit ints -> MM + days: one or two digit ints -> dd + years: two digit ints -> yyyy (always prefixed with 20) + times: {H}H:mm -> HH:mm + """ + + date = date.split("/") + + month = date[0] if len(date[0]) == 2 else "0{}".format(date[0]) + + day = date[1] if len(date[1]) == 2 else "0{}".format(date[1]) + + year = date[2].split(" ")[0] + year = "20{}".format(year) + + time = date[2].split(" ")[1] + time = time if len(time) == 5 else "0{}".format(time) + + date = "{}/{}/{} {}".format(month, day, year, time) + + return date + +def parse_line(line): + """ + creates the document to be indexed from one csv row: a dict keyed by the mapped field names, with the date reformatted by parse_date and newlines removed from the country value + """ + obj = { + "invoice_no": line[0], + "stock_code": line[1], + "description": line[2], + "quantity": line[3], + "invoice_date": parse_date(line[4]), + "unit_price": line[5], + "customer_id": line[6], + "country": line[7].replace("\n", "") + } + + return obj + +def load_data(es): + """ + make sure the online-retail index exists, then generate one document per data line of data/online-retail.csv (the first line is treated as the header and skipped) + 
read file line by line to avoid loading all data into memory + """ + + create_index(es, "online-retail") + + header = True + with open("data/online-retail.csv", "r") as f: + reader = csv.reader(f, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL) + for line in reader: + if header: + header=False + continue + doc = parse_line(line) + + yield doc + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-H", + "--host", + action="store", + default="localhost:9200", + help="The elasticsearch host you wish to connect to. (Default: localhost:9200)" + ) + + args = parser.parse_args() + + # create the elasticsearch client, pointing to the host parameter + es = Elasticsearch(args.host) + index='online-retail' + + # load data from online retail csv in data directory; load_data is a generator, so the index is created and the file is read only once streaming_bulk starts consuming the stream + stream = load_data(es) + for ok, result in helpers.streaming_bulk( + es, + actions=stream, + index=index, + chunk_size=1000 + ): + action, result = result.popitem() + doc_id = "/{}/doc/{}".format(index, result['_id']) + + if not ok: + print("Failed to {} document {} {}".format(action, doc_id, result)) + else: + print(doc_id) + + # make docs available for searches + es.indices.refresh(index=index) + + # notify user of number of documents indexed + print(es.count(index=index)["count"], "documents in index") diff --git a/example/requirements-example.txt b/example/requirements-example.txt new file mode 100644 index 0000000..dc9beca --- /dev/null +++ b/example/requirements-example.txt @@ -0,0 +1,80 @@ +alabaster==0.7.12 +appnope==0.1.0 +atomicwrites==1.3.0 +attrs==19.3.0 +Babel==2.7.0 +backcall==0.1.0 +bleach==3.1.0 +certifi==2019.9.11 +chardet==3.0.4 +cycler==0.10.0 +decorator==4.4.1 +defusedxml==0.6.0 +docutils==0.15.2 +eland==0.1 +elasticsearch==7.1.0 +entrypoints==0.3 +idna==2.8 +imagesize==1.1.0 +importlib-metadata==0.23 +ipykernel==5.1.3 +ipython==7.9.0 +ipython-genutils==0.2.0 +ipywidgets==7.5.1 +jedi==0.15.1 +Jinja2==2.10.3 +jsonschema==3.1.1 +jupyter==1.0.0
+jupyter-client==5.3.4 +jupyter-console==6.0.0 +jupyter-core==4.6.1 +kiwisolver==1.1.0 +MarkupSafe==1.1.1 +matplotlib==3.1.1 +mistune==0.8.4 +more-itertools==7.2.0 +nbconvert==5.6.1 +nbformat==4.4.0 +notebook==6.0.2 +numpy==1.17.4 +numpydoc==0.8.0 +packaging==19.2 +pandas==0.25.1 +pandocfilters==1.4.2 +parso==0.5.1 +pexpect==4.7.0 +pickleshare==0.7.5 +pluggy==0.13.0 +prometheus-client==0.7.1 +prompt-toolkit==2.0.10 +ptyprocess==0.6.0 +py==1.8.0 +Pygments==2.4.2 +pyparsing==2.4.5 +pyrsistent==0.15.5 +pytest==5.2.2 +python-dateutil==2.8.1 +pytz==2019.3 +pyzmq==18.1.1 +qtconsole==4.5.5 +requests==2.22.0 +Send2Trash==1.5.0 +six==1.13.0 +snowballstemmer==2.0.0 +Sphinx==2.2.1 +sphinx-rtd-theme==0.4.3 +sphinxcontrib-applehelp==1.0.1 +sphinxcontrib-devhelp==1.0.1 +sphinxcontrib-htmlhelp==1.0.2 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.2 +sphinxcontrib-serializinghtml==1.1.3 +terminado==0.8.3 +testpath==0.4.4 +tornado==6.0.3 +traitlets==4.3.3 +urllib3==1.25.7 +wcwidth==0.1.7 +webencodings==0.5.1 +widgetsnbextension==3.5.1 +zipp==0.6.0