diff --git a/eland/dataframe.py b/eland/dataframe.py index a7de746..036fcfc 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -463,7 +463,6 @@ class DataFrame(NDFrame): "quotechar": quotechar, "line_terminator": line_terminator, "chunksize": chunksize, - "tupleize_cols": tupleize_cols, "date_format": date_format, "doublequote": doublequote, "escapechar": escapechar, @@ -552,7 +551,7 @@ class DataFrame(NDFrame): # currently we only support a subset of functions that aggregate columns. # ['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique'] - if isinstance(func, compat.string_types): + if isinstance(func, str): # wrap in list func = [func] return self._query_compiler.aggs(func) diff --git a/eland/mappings.py b/eland/mappings.py index d76aa1b..c07931c 100644 --- a/eland/mappings.py +++ b/eland/mappings.py @@ -290,7 +290,7 @@ class Mappings: return es_dtype @staticmethod - def _generate_es_mappings(dataframe): + def _generate_es_mappings(dataframe, geo_points=None): """Given a pandas dataframe, generate the associated Elasticsearch mapping Parameters @@ -325,7 +325,10 @@ class Mappings: mappings = {} mappings['properties'] = {} for column_name, dtype in dataframe.dtypes.iteritems(): - es_dtype = Mappings._pd_dtype_to_es_dtype(dtype) + if geo_points is not None and column_name in geo_points: + es_dtype = 'geo_point' + else: + es_dtype = Mappings._pd_dtype_to_es_dtype(dtype) mappings['properties'][column_name] = {} mappings['properties'][column_name]['type'] = es_dtype diff --git a/eland/plotting.py b/eland/plotting.py index cf5de53..c546813 100644 --- a/eland/plotting.py +++ b/eland/plotting.py @@ -3,13 +3,14 @@ import numpy as np import pandas.core.common as com from pandas.core.dtypes.generic import ( ABCIndexClass) +from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None, xrot=None, 
ylabelsize=None, yrot=None, ax=None, sharex=False, sharey=False, figsize=None, layout=None, bins=10, **kwds): """ - Derived from pandas.plotting._core.hist_frame 0.24.2 + Derived from pandas.plotting._core.hist_frame 0.24.2 - TODO update to 0.25.1 Ideally, we'd call hist_frame directly with histogram data, but weights are applied to ALL series. For example, we can @@ -29,8 +30,6 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None, # Start with empty pandas data frame derived from ed_df_bins, ed_df_weights = ed_df._hist(num_bins=bins) - _raise_if_no_mpl() - _converter._WARN = False if by is not None: raise NotImplementedError("TODO") """ diff --git a/eland/tests/dataframe/test_iloc_pytest.py b/eland/tests/dataframe/test_iloc_pytest.py index 0b8d23c..256a104 100644 --- a/eland/tests/dataframe/test_iloc_pytest.py +++ b/eland/tests/dataframe/test_iloc_pytest.py @@ -24,31 +24,22 @@ class TestDataFrameiLoc(TestData): pd_iloc1= pd_flights.iloc[[0]] pd_iloc2= pd_flights.iloc[[0, 1]] pd_iloc3 = pd_flights.iloc[:3] - pd_iloc4 = pd_flights.iloc[[True, False, True]] pd_iloc5 = pd_flights.iloc[0, 1] pd_iloc6 = pd_flights.iloc[[0, 2], [1, 3]] pd_iloc7 = pd_flights.iloc[1:3, 0:3] - pd_iloc8 = pd_flights.iloc[:, [True, False, True, False]] - pd_iloc9 = pd_flights.iloc[[True, False, True, False]] ed_iloc0 = ed_flights.iloc[0] ed_iloc1 = ed_flights.iloc[[0]] ed_iloc2 = ed_flights.iloc[[0, 1]] ed_iloc3 = ed_flights.iloc[:3] - ed_iloc4 = ed_flights.iloc[[True, False, True]] ed_iloc5 = ed_flights.iloc[0, 1] ed_iloc6 = ed_flights.iloc[[0, 2], [1, 3]] ed_iloc7 = ed_flights.iloc[1:3, 0:3] - ed_iloc8 = ed_flights.iloc[:, [True, False, True, False]] - ed_iloc9 = ed_flights.iloc[[True, False, True, False]] #assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1) assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2) assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3) - 
assert_pandas_eland_frame_equal(pd_iloc4, ed_iloc4) #assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6) assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7) - assert_pandas_eland_frame_equal(pd_iloc8, ed_iloc8) - assert_pandas_eland_frame_equal(pd_iloc9, ed_iloc9) diff --git a/eland/tests/dataframe/test_reviews_pytest.py b/eland/tests/dataframe/test_reviews_pytest.py deleted file mode 100644 index 5c7d789..0000000 --- a/eland/tests/dataframe/test_reviews_pytest.py +++ /dev/null @@ -1,31 +0,0 @@ -# File called _pytest for PyCharm compatability - -import gzip - -import pandas as pd - -import eland as ed -from eland.tests.common import TestData - - -class TestDataFrameReviews(TestData): - - def test_explore(self): - ed_reviews = ed.DataFrame('localhost', 'anonreviews') - - print(ed_reviews.head()) - print(ed_reviews.describe()) - print(ed_reviews.info()) - print(ed_reviews.hist(column="rating", bins=5)) - # print(ed_reviews.head().info_es()) - - def test_review(self): - csv_handle = gzip.open('../anonreviews.csv.gz') - - reviews = pd.read_csv(csv_handle) - - reviews['date'] = pd.to_datetime(reviews['date']) - - g = reviews.groupby('reviewerId') - - print(g.describe()) diff --git a/eland/tests/dataframe/test_to_csv_pytest.py b/eland/tests/dataframe/test_to_csv_pytest.py index b2b0911..d916c18 100644 --- a/eland/tests/dataframe/test_to_csv_pytest.py +++ b/eland/tests/dataframe/test_to_csv_pytest.py @@ -3,20 +3,23 @@ import pandas as pd from eland.tests.common import TestData +from eland.tests.common import ROOT_DIR from pandas.util.testing import (assert_equal, assert_frame_equal) import ast + class TestDataFrameToCSV(TestData): def test_to_csv_head(self): + results_file = ROOT_DIR + '/dataframe/results/test_to_csv_head.csv' + ed_flights = self.ed_flights().head() pd_flights = self.pd_flights().head() - - ed_flights.to_csv('results/test_to_csv_head.csv') + ed_flights.to_csv(results_file) # 
Converting back from csv is messy as pd_flights is created from a json file - pd_from_csv = pd.read_csv('results/test_to_csv_head.csv', index_col=0, converters={ + pd_from_csv = pd.read_csv(results_file, index_col=0, converters={ 'DestLocation': lambda x: ast.literal_eval(x), 'OriginLocation': lambda x: ast.literal_eval(x)}) pd_from_csv.index = pd_from_csv.index.map(str) @@ -25,13 +28,15 @@ class TestDataFrameToCSV(TestData): assert_frame_equal(pd_flights, pd_from_csv) def test_to_csv_full(self): + results_file = ROOT_DIR + '/dataframe/results/test_to_csv_full.csv' + # Test is slow as it's for the full dataset, but it is useful as it goes over 10000 docs ed_flights = self.ed_flights() pd_flights = self.pd_flights() - ed_flights.to_csv('results/test_to_csv_full.csv') + ed_flights.to_csv(results_file) # Converting back from csv is messy as pd_flights is created from a json file - pd_from_csv = pd.read_csv('results/test_to_csv_full.csv', index_col=0, converters={ + pd_from_csv = pd.read_csv(results_file, index_col=0, converters={ 'DestLocation': lambda x: ast.literal_eval(x), 'OriginLocation': lambda x: ast.literal_eval(x)}) pd_from_csv.index = pd_from_csv.index.map(str) diff --git a/eland/tests/demo_day_20190815.ipynb b/eland/tests/demo_day_20190815.ipynb index 7c785bd..878d00d 100644 --- a/eland/tests/demo_day_20190815.ipynb +++ b/eland/tests/demo_day_20190815.ipynb @@ -7144,7 +7144,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/eland/tests/operators/test_operators_pytest.py b/eland/tests/operators/test_operators_pytest.py index bab87ca..81c329c 100644 --- a/eland/tests/operators/test_operators_pytest.py +++ b/eland/tests/operators/test_operators_pytest.py @@ -1,5 +1,5 @@ # -*- coding: UTF-8 -*- -from eland.operators import * +from eland.filter import * class TestOperators(): @@ -21,11 +21,6 @@ class TestOperators(): 'script': {'script': {'inline': 
'doc["num1"].value > params.param1', 'params': {'param1': 5}}}} assert IsIn('ids', [1, 2, 3]).build() == {'ids': {'values': [1, 2, 3]}} - def test_and_none(self): - exp = None - exp = exp & Less('b', 3) - print(exp.build()) - def test_and_filter1(self): exp = GreaterEqual('a', 2) & Less('b', 3) assert exp.build() == {'bool': {'must': [{'range': {'a': {'gte': 2}}}, {'range': {'b': {'lt': 3}}}]}} diff --git a/eland/tests/pivot_review_data_pandas.ipynb b/eland/tests/pivot_review_data_pandas.ipynb deleted file mode 100644 index e412c96..0000000 --- a/eland/tests/pivot_review_data_pandas.ipynb +++ /dev/null @@ -1,1382 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Pivot review data in pandas\n", - "\n", - "This notebook shows how data can be pivoted by python [pandas](https://pandas.pydata.org/) to reveal insights into the behaviour of reviewers. The use case and data is from Mark Harwood's talk on [entity-centric indexing](https://www.elastic.co/videos/entity-centric-indexing-mark-harwood).\n", - "\n", - "An alternative version of this notebook uses the [Elastic data frames](https://www.elastic.co/guide/en/elastic-stack-overview/master/ml-dataframes.html) to create the same results." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "pycharm": { - "is_executing": false - } - }, - "outputs": [], - "source": [ - "import gzip\n", - "import pandas as pd\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "from pandas.plotting import scatter_matrix" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read data to pandas DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "pycharm": { - "is_executing": false - } - }, - "outputs": [], - "source": [ - "csv_handle = gzip.open('./anonreviews.csv.gz')\n", - "\n", - "reviews = pd.read_csv(csv_handle)\n", - "\n", - "reviews['date'] = pd.to_datetime(reviews['date'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Explore data" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "pycharm": { - "is_executing": false - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
reviewerIdvendorIdratingdate
00052006-04-07 17:08:00
11152006-05-04 12:16:00
22242006-04-21 12:26:00
33352006-04-18 15:48:00
43452006-04-18 15:49:00
\n", - "
" - ], - "text/plain": [ - " reviewerId vendorId rating date\n", - "0 0 0 5 2006-04-07 17:08:00\n", - "1 1 1 5 2006-05-04 12:16:00\n", - "2 2 2 4 2006-04-21 12:26:00\n", - "3 3 3 5 2006-04-18 15:48:00\n", - "4 3 4 5 2006-04-18 15:49:00" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "reviews.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "pycharm": { - "is_executing": false - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
reviewerIdvendorIdrating
count578805.000000578805.000000578805.000000
mean174124.09843760.6452674.679671
std116951.97220954.4880530.800891
min0.0000000.0000000.000000
25%70043.00000020.0000005.000000
50%161052.00000044.0000005.000000
75%272697.00000083.0000005.000000
max400140.000000246.0000005.000000
\n", - "
" - ], - "text/plain": [ - " reviewerId vendorId rating\n", - "count 578805.000000 578805.000000 578805.000000\n", - "mean 174124.098437 60.645267 4.679671\n", - "std 116951.972209 54.488053 0.800891\n", - "min 0.000000 0.000000 0.000000\n", - "25% 70043.000000 20.000000 5.000000\n", - "50% 161052.000000 44.000000 5.000000\n", - "75% 272697.000000 83.000000 5.000000\n", - "max 400140.000000 246.000000 5.000000" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "reviews.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "pycharm": { - "is_executing": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 578805 entries, 0 to 578804\n", - "Data columns (total 4 columns):\n", - "reviewerId 578805 non-null int64\n", - "vendorId 578805 non-null int64\n", - "rating 578805 non-null int64\n", - "date 578805 non-null datetime64[ns]\n", - "dtypes: datetime64[ns](1), int64(3)\n", - "memory usage: 17.7 MB\n" - ] - } - ], - "source": [ - "reviews.info()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Distribution of reviews (high number of five star ratings)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "pycharm": { - "is_executing": true, - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYkAAAEICAYAAACqMQjAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAUQUlEQVR4nO3df6zldX3n8eerIJUdRRDwlp0hHRMnVgpbf0yADbubu9rFi5rCppJAujIamskaSGwkqbjZDeuvRP+gNhhLd7aQgZYV2aoZVlE6AW4aNyA/FBmBukypK1OoUx1ALm61Y9/7x/mMOY7nc++dO3PPmZnzfCQn9/t9n8/3+/l8OJnzOt8f55CqQpKkUX5p0gOQJB2+DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEtIYJPnjJP9l0uOQDlT8noR0aCV5N/C7VfWvJj0W6WB5JCEdoCTHTnoM0rgYEtIyJPlOkg8keQR4Mcl/TvLXSV5I8liSf9/avQ74Y+BfJllI8lyrb03y0bY8m2RXkquS7E7yTJL3DPV1cpL/leSHSR5I8tEkX53AtCVDQjoAlwJvB04Evg38a+AVwIeAP0tyWlU9DvxH4N6qellVndjZ16+0bdcClwOfTnJSe+7TwIutzab2kCbCkJCW77qqeqqq/l9V/c+qerqq/qmqPgs8AZx9APv6R+DDVfWPVXUHsAC8NskxwG8D11TVj6rqMeCmQz4TaZkMCWn5ntq3kOSyJA8nea6dUjoTOOUA9vWDqto7tP4j4GXAqcCxw33ttyyNlSEhLV8BJPlV4L8DVwInt1NK3wIy3G6F/h7YC6wbqp1+EPuTDoohIR24NQyC4O8B2kXnM4ee/x6wLslxB7rjqvop8Hngvyb5Z0l+Dbjs4IcsrYwhIR2gdp3gWuBeBoFwFvC/h5rcDTwK/F2S76+giysZXNT+O+BPgc8APz6YMUsr5ZfppMNckk8Av1JV3uWksfNIQjrMJPm1JP8iA2czuEX2C5Mel6aT3xyVDj8vZ3CK6Z8Duxmc2to20RFpanm6SZLU5ekmSVLXUXe66ZRTTqn169evaNsXX3yRNWvWHNoBHeac83Rwzke/g53vQw899P2qOnX/+lEXEuvXr+fBBx9c0bbz8/PMzs4e2gEd5pzzdHDOR7+DnW+S/zuq7ukmSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlS11H3jWtJAtjxt8/z7qu/NOlhjM3WudX5CRKPJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUtKySSfCfJjiQPJ3mw1V6ZZHuSJ9rfk1o9Sa5LsjPJI0neOLSfTa39E0k2DdXf1Pa/s22bxfqQJI3HgRxJ/Nuqen1VbWzrVwN3VdUG4K62DnABsKE9NgPXw+ANH7gGOAc4G7hm6E3/+tZ233ZzS/QhSRqDgznddCFwU1u+CbhoqH5zDdwHnJjkNOCtwPaq2lNVzwLbgbn23AlVdW9VFXDzfvsa1YckaQyW+1PhBfxFkgL+W1VtAWaq6hmAqnomyata27XAU0Pb7mq1xeq7RtRZpI+fk2QzgyMRZmZmmJ+fX+a0ft7CwsKKtz1SOefpMI1znjkerjpr76SHMTar9RovNyTOq6qn25v09iR/tUjbjKjVCurL1kJrC8DGjRtrdnb2QDb/mfn5eVa67ZHKOU+HaZzzp27ZxrU7pud/mbN1bs2qvMbLOt1UVU+3v7uBLzC4pvC9dqqI9nd3a74LOH1o83XA00vU142os0gfkqQxWDIkkqxJ8vJ9y8D5wLeA24F9dyhtAra15duBy9pdTucCz7dTRncC5yc5qV2wPh+4sz33QpJz211Nl+23r1F9SJLGYDnHYjPAF9pdqccC/6OqvpLkAeC2JJcD3wUubu3vAN4G7AR+BLwHoKr2JPkI8EBr9+Gq2tOW3wt
sBY4HvtweAB/v9CFJGoMlQ6KqngR+Y0T9B8BbRtQLuKKzrxuBG0fUHwTOXG4fkqTx8BvXkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV3LDokkxyT5RpIvtvVXJ/lakieSfDbJca3+y219Z3t+/dA+Ptjq307y1qH6XKvtTHL1UH1kH5Kk8TiQI4n3AY8PrX8C+GRVbQCeBS5v9cuBZ6vqNcAnWzuSnAFcAvw6MAf8UQueY4BPAxcAZwCXtraL9SFJGoNlhUSSdcDbgT9p6wHeDPx5a3ITcFFbvrCt055/S2t/IXBrVf24qv4G2Amc3R47q+rJqvoJcCtw4RJ9SJLG4NhltvtD4PeBl7f1k4HnqmpvW98FrG3La4GnAKpqb5LnW/u1wH1D+xze5qn96ucs0cfPSbIZ2AwwMzPD/Pz8Mqf18xYWFla87ZHKOU+HaZzzzPFw1Vl7l254lFit13jJkEjyDmB3VT2UZHZfeUTTWuK5Xn3U0cxi7X+xWLUF2AKwcePGmp2dHdVsSfPz86x02yOVc54O0zjnT92yjWt3LPdz8JFv69yaVXmNl/Nf8Dzgt5K8DXgpcAKDI4sTkxzbPumvA55u7XcBpwO7khwLvALYM1TfZ3ibUfXvL9KHJGkMlrwmUVUfrKp1VbWewYXnu6vqd4B7gHe2ZpuAbW359rZOe/7uqqpWv6Td/fRqYANwP/AAsKHdyXRc6+P2tk2vD0nSGBzM9yQ+ALw/yU4G1w9uaPUbgJNb/f3A1QBV9ShwG/AY8BXgiqr6aTtKuBK4k8HdU7e1tov1IUkagwM6YVdV88B8W36SwZ1J+7f5B+DizvYfAz42on4HcMeI+sg+JEnj4TeuJUldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUtWRIJHlpkvuTfDPJo0k+1OqvTvK1JE8k+WyS41r9l9v6zvb8+qF9fbDVv53krUP1uVbbmeTqofrIPiRJ47GcI4kfA2+uqt8AXg/MJTkX+ATwyaraADwLXN7aXw48W1WvAT7Z2pHkDOAS4NeBOeCPkhyT5Bjg08AFwBnApa0ti/QhSRqDJUOiBhba6kvao4A3A3/e6jcBF7XlC9s67fm3JEmr31pVP66qvwF2Ame3x86qerKqfgLcClzYtun1IUkag2OX06h92n8IeA2DT/1/DTxXVXtbk13A2ra8FngKoKr2JnkeOLnV7xva7fA2T+1XP6dt0+tj//FtBjYDzMzMMD8/v5xp/YKFhYUVb3ukcs7TYRrnPHM8XHXW3qUbHiVW6zVeVkhU1U+B1yc5EfgC8LpRzdrfdJ7r1UcdzSzWftT4tgBbADZu3Fizs7Ojmi1pfn6elW57pHLO02Ea5/ypW7Zx7Y5lvcUdFbbOrVmV1/iA7m6qqueAeeBc4MQk+16BdcDTbXkXcDpAe/4VwJ7h+n7b9OrfX6QPSdIYLOfuplPbEQRJjgd+E3gcuAd4Z2u2CdjWlm9v67Tn766qavVL2t1PrwY2APcDDwAb2p1MxzG4uH1726bXhyRpDJZzLHYacFO7LvFLwG1V9cUkjwG3Jvko8A3ghtb+BuBPk+xkcARxCUBVPZrkNuAxYC9wRTuNRZIrgTuBY4Abq+rRtq8PdPqQJI3BkiFRVY8AbxhRf5LBnUn71/8BuLizr48BHxtRvwO4Y7l9SJLGw29cS5K6DAlJUpchIUnqMiQkSV2GhCSpy5C
QJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpa8mQSHJ6knuSPJ7k0STva/VXJtme5In296RWT5LrkuxM8kiSNw7ta1Nr/0SSTUP1NyXZ0ba5LkkW60OSNB7LOZLYC1xVVa8DzgWuSHIGcDVwV1VtAO5q6wAXABvaYzNwPQze8IFrgHOAs4Frht70r29t92031+q9PiRJY7BkSFTVM1X19bb8AvA4sBa4ELipNbsJuKgtXwjcXAP3AScmOQ14K7C9qvZU1bPAdmCuPXdCVd1bVQXcvN++RvUhSRqDYw+kcZL1wBuArwEzVfUMDIIkyatas7XAU0Ob7Wq1xeq7RtRZpI/9x7WZwZEIMzMzzM/PH8i0fmZhYWHF2x6pnPN0mMY5zxwPV521d9LDGJvVeo2XHRJJXgZ8Dvi9qvphu2wwsumIWq2gvmxVtQXYArBx48aanZ09kM1/Zn5+npVue6RyztNhGuf8qVu2ce2OA/ocfETbOrdmVV7jZd3dlOQlDALilqr6fCt/r50qov3d3eq7gNOHNl8HPL1Efd2I+mJ9SJLGYDl3NwW4AXi8qv5g6KnbgX13KG0Ctg3VL2t3OZ0LPN9OGd0JnJ/kpHbB+nzgzvbcC0nObX1dtt++RvUhSRqD5RyLnQe8C9iR5OFW+0/Ax4HbklwOfBe4uD13B/A2YCfwI+A9AFW1J8lHgAdauw9X1Z62/F5gK3A88OX2YJE+JEljsGRIVNVXGX3dAOAtI9oXcEVnXzcCN46oPwicOaL+g1F9SJLGw29cS5K6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpa8mQSHJjkt1JvjVUe2WS7UmeaH9PavUkuS7JziSPJHnj0DabWvsnkmwaqr8pyY62zXVJslgfkqTxWc6RxFZgbr/a1cBdVbUBuKutA1wAbGiPzcD1MHjDB64BzgHOBq4ZetO/vrXdt93cEn1IksZkyZCoqr8E9uxXvhC4qS3fBFw0VL+5Bu4DTkxyGvBWYHtV7amqZ4HtwFx77oSqureqCrh5v32N6kOSNCYrvSYxU1XPALS/r2r1tcBTQ+12tdpi9V0j6ov1IUkak2MP8f4yolYrqB9Yp8lmBqesmJmZYX5+/kB3AcDCwsKKtz1SOefpMI1znjkerjpr76SHMTar9RqvNCS+l+S0qnqmnTLa3eq7gNOH2q0Dnm712f3q862+bkT7xfr4BVW1BdgCsHHjxpqdne01XdT8/Dwr3fZI5ZynwzTO+VO3bOPaHYf6c/Dha+vcmlV5jVd6uul2YN8dSpuAbUP1y9pdTucCz7dTRXcC5yc5qV2wPh+4sz33QpJz211Nl+23r1F9SJLGZMmYTfIZBkcBpyTZxeAupY8DtyW5HPgucHFrfgfwNmAn8CPgPQBVtSfJR4AHWrsPV9W+i+HvZXAH1fHAl9uDRfqQJI3JkiFRVZd2nnrLiLYFXNHZz43AjSPqDwJnjqj/YFQfkqTx8RvXkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6jp30ACStvh1/+zzvvvpLkx7GWF111qRHcHQwJKbcNL55bJ1bM+khSEcMQ0JTZxqD0U/VWqnD/ppEkrkk306yM8nVkx6PJE2Twzokkhw
DfBq4ADgDuDTJGZMdlSRNj8M6JICzgZ1V9WRV/QS4FbhwwmOSpKmRqpr0GLqSvBOYq6rfbevvAs6pqiv3a7cZ2NxWXwt8e4VdngJ8f4XbHqmc83Rwzke/g53vr1bVqfsXD/cL1xlR+4VUq6otwJaD7ix5sKo2Hux+jiTOeTo456Pfas33cD/dtAs4fWh9HfD0hMYiSVPncA+JB4ANSV6d5DjgEuD2CY9JkqbGYX26qar2JrkSuBM4Brixqh5dxS4P+pTVEcg5TwfnfPRblfke1heuJUmTdbifbpIkTZAhIUnqMiSaafv5jyQ3Jtmd5FuTHss4JDk9yT1JHk/yaJL3TXpMqy3JS5Pcn+Sbbc4fmvSYxiXJMUm+keSLkx7LOCT5TpIdSR5O8uAh3bfXJH728x//B/h3DG67fQC4tKoem+jAVlGSfwMsADdX1ZmTHs9qS3IacFpVfT3Jy4GHgIuO8tc4wJqqWkjyEuCrwPuq6r4JD23VJXk/sBE4oareMenxrLYk3wE2VtUh//KgRxIDU/fzH1X1l8CeSY9jXKrqmar6elt+AXgcWDvZUa2uGlhoqy9pj6P+U2GSdcDbgT+Z9FiOBobEwFrgqaH1XRzlbyDTLMl64A3A1yY7ktXXTrs8DOwGtlfVUT9n4A+B3wf+adIDGaMC/iLJQ+1nig4ZQ2JgWT//oSNfkpcBnwN+r6p+OOnxrLaq+mlVvZ7BrxWcneSoPrWY5B3A7qp6aNJjGbPzquqNDH4x+4p2OvmQMCQG/PmPKdDOy38OuKWqPj/p8YxTVT0HzANzEx7KajsP+K12jv5W4M1J/myyQ1p9VfV0+7sb+AKDU+iHhCEx4M9/HOXaRdwbgMer6g8mPZ5xSHJqkhPb8vHAbwJ/NdlRra6q+mBVrauq9Qz+Hd9dVf9hwsNaVUnWtJsxSLIGOB84ZHctGhIMfv4D2PfzH48Dt63yz39MXJLPAPcCr02yK8nlkx7TKjsPeBeDT5YPt8fbJj2oVXYacE+SRxh8ENpeVVNxS+iUmQG+muSbwP3Al6rqK4dq594CK0nq8khCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1/X9Z7NqbbHgXOgAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#align = {str} 'center'\n", - "#bottom = {ndarray} [0. 0. 0. 0. 0.]\n", - "#height = {ndarray} [5.30000e+02 1.18700e+04 1.03770e+04 1.49710e+04 5.41057e+05]\n", - "#kwargs = {dict} : {'log': False, 'color': '#1f77b4'}\n", - "#self = {AxesSubplot} AxesSubplot(0.125,0.125;0.775x0.755)\n", - "#width = {ndarray} [1. 1. 1. 1. 1.]\n", - "#x = {ndarray} [0.5 1.5 2.5 3.5 4.5] \n", - "hist = reviews.hist(column=\"rating\", bins = 5)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0 1 2 3 4]\n" - ] - } - ], - "source": [ - "bins = [530.0, 11870.0, 10377.0, 14971.0, 541057.0]\n", - "objects = range(5)\n", - "y_pos = np.arange(len(objects))\n", - "print(y_pos)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYkAAAD4CAYAAAAZ1BptAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAQoElEQVR4nO3dbaxdVZ3H8e9PCkpQBKEQ0pIpic1EJBGxgSYkkxlwoKCxvJAEMiONIWliMME4iZZ5Q9QhwTdiSJSkkcYy44jEh9AoWhseYkx4uiiCUJneQUZuSmyxgBCjBvzPi7Nqjpez7r0tveeU3u8nOTl7//fae60dwvndvfc6p6kqJEka5S2THoAk6fBlSEiSugwJSVKXISFJ6jIkJEldyyY9gEPt5JNPrlWrVk16GJL0pvLII488X1XLZ9ePuJBYtWoVU1NTkx6GJL2pJPm/UXVvN0mSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkrqOuG9cSxLAqk0/mPQQxuqZGz+0KMf1SkKS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVLXgkIiyTNJHk/yaJKpVntXkh1JdrX3E1s9SW5OMp3ksSTnDB1nQ2u/K8mGofoH2vGn276Zqw9J0ngcyJXEP1XV2VW1pq1vAu6uqtXA3W0d4BJgdXttBG6BwQc+cD1wHnAucP3Qh/4tre3+/dbN04ckaQzeyO2m9cDWtrwVuGyoflsNPACckOQ04GJgR1Xtq6oXgB3Aurbt+Kq6v6oKuG3WsUb1IUkag4WGRAE/TvJIko2tdmpVPQfQ3k9p9RXAs0P7zrTaXPWZEfW5+vgbSTYmmUoytXfv3gWekiRpPgv99yTOr6rdSU4BdiT51RxtM6JWB1FfsKraDGwGWLNmzQHtK0nqW9CVRFXtbu97gO8xeKbw23ariPa+pzWfAU4f2n0lsHue+soRdeboQ5I0BvOGRJLjkrxj/zJwEfBLYBuwf4bSBuDOtrwNuKrNcloLvNRuFW0HLkpyYntgfRGwvW17OcnaNqvpqlnHGtWHJGkMFnK76VTge21W6jLgv6vqR0keBu5IcjXwG+Dy1v4u4FJgGvgD8HGAqtqX5AvAw63d56tqX1v+BPB14Fjgh+0FcGOnD0nSGMwbElX1NPC+EfXfAReOqBdwTedYW4AtI+pTwFkL7UOSNB5+41qS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpa8EhkeSoJD9P8v22fkaSB5PsSvKtJMe0+lvb+nTbvmroGNe1+lNJLh6qr2u16SSbhuoj+5AkjceBXElcC+wcWv8icFNVrQZeAK5u9auBF6rq3cBNrR1JzgSuAN4LrAO+2oLnKOArwCXAmcCVre1cfUiSxmBBIZFkJfAh4GttPcAFwLdbk63AZW15fVunbb+wtV8P3F5Vf6qqXwPTwLntNV1VT1fVn4HbgfXz9CFJGoOFXkl8GfgM8Je2fhLwYlW92tZngBVteQXwLEDb/lJr/9f6rH169bn6+BtJNiaZSjK1d+/eBZ6SJGk+84ZEkg8De6rqkeHyiKY1z7ZDVX99sWpzVa2pqjXLly8f1USSdBCWLaDN+cBHklwKvA04nsGVxQlJlrW/9FcCu1v7GeB0YCbJMuCdwL6h+n7D+4yqPz9HH5KkMZj3SqKqrquqlVW1isGD53uq6l+Ae4GPtmYbgDvb8ra2Ttt+T1VVq1/RZj+dAawGHgIeBla3mUzHtD62tX16fUiSxuCNfE/is8Cnk0wzeH5wa6vfCpzU6p8GNgFU1RPAHcC
TwI+Aa6rqtXaV8ElgO4PZU3e0tnP1IUkag4XcbvqrqroPuK8tP81gZtLsNn8ELu/sfwNww4j6XcBdI+oj+5AkjYffuJYkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUte8IZHkbUkeSvKLJE8k+Vyrn5HkwSS7knwryTGt/ta2Pt22rxo61nWt/lSSi4fq61ptOsmmofrIPiRJ47GQK4k/ARdU1fuAs4F1SdYCXwRuqqrVwAvA1a391cALVfVu4KbWjiRnAlcA7wXWAV9NclSSo4CvAJcAZwJXtrbM0YckaQzmDYkaeKWtHt1eBVwAfLvVtwKXteX1bZ22/cIkafXbq+pPVfVrYBo4t72mq+rpqvozcDuwvu3T60OSNAYLeibR/uJ/FNgD7AD+F3ixql5tTWaAFW15BfAsQNv+EnDScH3WPr36SXP0MXt8G5NMJZnau3fvQk5JkrQACwqJqnqtqs4GVjL4y/89o5q193S2Har6qPFtrqo1VbVm+fLlo5pIkg7CAc1uqqoXgfuAtcAJSZa1TSuB3W15BjgdoG1/J7BvuD5rn179+Tn6kCSNwUJmNy1PckJbPhb4ILATuBf4aGu2AbizLW9r67Tt91RVtfoVbfbTGcBq4CHgYWB1m8l0DIOH29vaPr0+JEljsGz+JpwGbG2zkN4C3FFV30/yJHB7kv8Afg7c2trfCvxnkmkGVxBXAFTVE0nuAJ4EXgWuqarXAJJ8EtgOHAVsqaon2rE+2+lDkjQG84ZEVT0GvH9E/WkGzydm1/8IXN451g3ADSPqdwF3LbQPSdJ4+I1rSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1zRsSSU5Pcm+SnUmeSHJtq78ryY4ku9r7ia2eJDcnmU7yWJJzho61obXflWTDUP0DSR5v+9ycJHP1IUkaj4VcSbwK/FtVvQdYC1yT5ExgE3B3Va0G7m7rAJcAq9trI3ALDD7wgeuB84BzgeuHPvRvaW3377eu1Xt9SJLGYN6QqKrnqupnbfllYCewAlgPbG3NtgKXteX1wG018ABwQpLTgIuBHVW1r6peAHYA69q246vq/qoq4LZZxxrVhyRpDA7omUSSVcD7gQeBU6vqORgECXBKa7YCeHZot5lWm6s+M6LOHH3MHtfGJFNJpvbu3XsgpyRJmsOCQyLJ24HvAJ+qqt/P1XRErQ6ivmBVtbmq1lTVmuXLlx/IrpKkOSwoJJIczSAgvlFV323l37ZbRbT3Pa0+A5w+tPtKYPc89ZUj6nP1IUkag4XMbgpwK7Czqr40tGkbsH+G0gbgzqH6VW2W01rgpXaraDtwUZIT2wPri4DtbdvLSda2vq6adaxRfUiSxmDZAtqcD3wMeDzJo63278CNwB1JrgZ+A1zett0FXApMA38APg5QVfuSfAF4uLX7fFXta8ufAL4OHAv8sL2Yow9J0hjMGxJV9VNGPzcAuHBE+wKu6RxrC7BlRH0KOGtE/Xej+pAkjYffuJYkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUte8IZFkS5I9SX45VHtXkh1JdrX3E1s9SW5OMp3ksSTnDO2
zobXflWTDUP0DSR5v+9ycJHP1IUkan4VcSXwdWDertgm4u6pWA3e3dYBLgNXttRG4BQYf+MD1wHnAucD1Qx/6t7S2+/dbN08fkqQxmTckquonwL5Z5fXA1ra8FbhsqH5bDTwAnJDkNOBiYEdV7auqF4AdwLq27fiqur+qCrht1rFG9SFJGpODfSZxalU9B9DeT2n1FcCzQ+1mWm2u+syI+lx9SJLG5FA/uM6IWh1E/cA6TTYmmUoytXfv3gPdXZLUcbAh8dt2q4j2vqfVZ4DTh9qtBHbPU185oj5XH69TVZurak1VrVm+fPlBnpIkabaDDYltwP4ZShuAO4fqV7VZTmuBl9qtou3ARUlObA+sLwK2t20vJ1nbZjVdNetYo/qQJI3JsvkaJPkm8I/AyUlmGMxSuhG4I8nVwG+Ay1vzu4BLgWngD8DHAapqX5IvAA+3dp+vqv0Pwz/BYAbVscAP24s5+pAkjcm8IVFVV3Y2XTiibQHXdI6zBdgyoj4FnDWi/rtRfUiSxsdvXEuSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6lo26QFIWnyrNv1g0kPQm5QhscQtxQ+PZ2780KSHIL1pGBJacpZiMEoH67B/JpFkXZKnkkwn2TTp8UjSUnJYh0SSo4CvAJcAZwJXJjlzsqOSpKXjsA4J4Fxguqqerqo/A7cD6yc8JklaMg73ZxIrgGeH1meA82Y3SrIR2NhWX0ny1EH2dzLw/EHu+2blOS8NnvMRLl98w+f7d6OKh3tIZEStXleo2gxsfsOdJVNVteaNHufNxHNeGjznI99ine/hfrtpBjh9aH0lsHtCY5GkJedwD4mHgdVJzkhyDHAFsG3CY5KkJeOwvt1UVa8m+SSwHTgK2FJVTyxil2/4ltWbkOe8NHjOR75FOd9Uve4WvyRJwOF/u0mSNEGGhCSpy5BoltrPfyTZkmRPkl9OeizjkOT0JPcm2ZnkiSTXTnpMiy3J25I8lOQX7Zw/N+kxjUuSo5L8PMn3Jz2WcUjyTJLHkzyaZOqQHttnEn/9+Y//Af6ZwbTbh4Erq+rJiQ5sESX5B+AV4LaqOmvS41lsSU4DTquqnyV5B/AIcNkR/t84wHFV9UqSo4GfAtdW1QMTHtqiS/JpYA1wfFV9eNLjWWxJngHWVNUh//KgVxIDS+7nP6rqJ8C+SY9jXKrquar6WVt+GdjJ4Bv9R6waeKWtHt1eR/xfhUlWAh8CvjbpsRwJDImBUT//cUR/gCxlSVYB7wcenOxIFl+77fIosAfYUVVH/DkDXwY+A/xl0gMZowJ+nOSR9jNFh4whMbCgn//Qm1+StwPfAT5VVb+f9HgWW1W9VlVnM/i1gnOTHNG3FpN8GNhTVY9Meixjdn5VncPgF7OvabeTDwlDYsCf/1gC2n357wDfqKrvTno841RVLwL3AesmPJTFdj7wkXaP/nbggiT/NdkhLb6q2t3e9wDfY3AL/ZAwJAb8+Y8jXHuIeyuws6q+NOnxjEOS5UlOaMvHAh8EfjXZUS2uqrquqlZW1SoG/x/fU1X/OuFhLaokx7XJGCQ5DrgIOGSzFg0JBj//Aez/+Y+dwB2L/PMfE5fkm8D9wN8nmUly9aTHtMjOBz7G4C/LR9vr0kkPapGdBtyb5DEGfwjtqKolMSV0iTkV+GmSXwAPAT+oqh8dqoM7BVaS1OWVhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6vp/iRg84VQhRvYAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.bar(y_pos, bins, align='edge', width=1.0)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "#### Typically how many vendors does a reviewer review? (mainly one or two)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "IOPub data rate exceeded.\n", - "The notebook server will temporarily stop sending output\n", - "to the client in order to avoid crashing it.\n", - "To change this limit, set the config variable\n", - "`--NotebookApp.iopub_data_rate_limit`.\n", - "\n", - "Current values:\n", - "NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)\n", - "NotebookApp.rate_limit_window=3.0 (secs)\n", - "\n" - ] - } - ], - "source": [ - "g = reviews.groupby('reviewerId')['vendorId'].nunique()\n", - "\n", - "print(type(g))\n", - "\n", - "print(reviews.groupby('reviewerId').indices)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(reviews.groupby('reviewerId')['vendorId'].nunique(), '.')\n", - "plt.xlabel('reviewerId')\n", - "plt.ylabel('dc(vendorId)')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Aggregate and Pivot data\n", - "\n", - "Pivot data so we get summaries for each reviewer." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "aggregations = {\n", - " 'rating':'mean',\n", - " 'vendorId':'nunique',\n", - " 'reviewerId':'count'\n", - "}\n", - "\n", - "grouped = reviews.groupby('reviewerId').agg(aggregations)\n", - "grouped.columns=['avg_rating', 'dc_vendorId', 'count']" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
avg_ratingdc_vendorIdcount
reviewerId
05.00000011
15.00000079
24.00000055
34.57142977
45.00000011
\n", - "
" - ], - "text/plain": [ - " avg_rating dc_vendorId count\n", - "reviewerId \n", - "0 5.000000 1 1\n", - "1 5.000000 7 9\n", - "2 4.000000 5 5\n", - "3 4.571429 7 7\n", - "4 5.000000 1 1" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "grouped.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
avg_ratingdc_vendorIdcount
count400141.000000400141.000000400141.000000
mean4.6438461.3570171.446503
std0.8135860.9894131.541802
min0.0000001.0000001.000000
25%5.0000001.0000001.000000
50%5.0000001.0000001.000000
75%5.0000001.0000001.000000
max5.00000032.000000168.000000
\n", - "
" - ], - "text/plain": [ - " avg_rating dc_vendorId count\n", - "count 400141.000000 400141.000000 400141.000000\n", - "mean 4.643846 1.357017 1.446503\n", - "std 0.813586 0.989413 1.541802\n", - "min 0.000000 1.000000 1.000000\n", - "25% 5.000000 1.000000 1.000000\n", - "50% 5.000000 1.000000 1.000000\n", - "75% 5.000000 1.000000 1.000000\n", - "max 5.000000 32.000000 168.000000" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "grouped.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.rcParams[\"figure.figsize\"] = (10,10)\n", - "\n", - "scatter_matrix(grouped)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Find 'haters'\n", - "\n", - "Reviewers that give more than five zero star reviews to one vendor" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
avg_ratingdc_vendorIdcount
reviewerId
103920.0194
170330.0151
210460.0125
114790.0120
274480.0119
81850.0115
176020.0115
139840.0110
2281290.019
252670.018
534320.018
198130.017
119870.016
1355060.016
\n", - "
" - ], - "text/plain": [ - " avg_rating dc_vendorId count\n", - "reviewerId \n", - "10392 0.0 1 94\n", - "17033 0.0 1 51\n", - "21046 0.0 1 25\n", - "11479 0.0 1 20\n", - "27448 0.0 1 19\n", - "8185 0.0 1 15\n", - "17602 0.0 1 15\n", - "13984 0.0 1 10\n", - "228129 0.0 1 9\n", - "25267 0.0 1 8\n", - "53432 0.0 1 8\n", - "19813 0.0 1 7\n", - "11987 0.0 1 6\n", - "135506 0.0 1 6" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "grouped[ \n", - " (grouped['dc_vendorId'] == 1) & \n", - " (grouped['count'] > 5) & \n", - " (grouped['avg_rating'] == 0)\n", - "].sort_values('count', ascending=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For example, reviewer 10392 gives 94 zero star reviews to vendor 122" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
reviewerIdvendorIdratingdate
123401039212202006-04-05 15:38:00
123411039212202006-04-06 09:24:00
123421039212202006-04-06 20:24:00
123431039212202006-04-11 07:43:00
123441039212202006-04-11 15:53:00
\n", - "
" - ], - "text/plain": [ - " reviewerId vendorId rating date\n", - "12340 10392 122 0 2006-04-05 15:38:00\n", - "12341 10392 122 0 2006-04-06 09:24:00\n", - "12342 10392 122 0 2006-04-06 20:24:00\n", - "12343 10392 122 0 2006-04-11 07:43:00\n", - "12344 10392 122 0 2006-04-11 15:53:00" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "reviews[reviews['reviewerId'] == 10392].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Find 'fanboys'\n", - "\n", - "Reviewers that give more than five five star reviews to one vendor" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
avg_ratingdc_vendorIdcount
reviewerId
1837515.0173
2602255.0169
2058645.0135
3450805.0123
1799445.0122
3450825.0121
3450815.0120
3450685.0120
3450695.0119
3450865.0118
3450835.0118
3450705.0118
3450855.0117
3450845.0117
2646355.0113
3212065.0112
125395.0111
1590355.0110
1146615.019
396555.018
225155.017
1800825.017
584475.017
1800855.016
1681435.016
1600185.016
750105.016
350485.016
304745.016
288145.016
3932375.016
\n", - "
" - ], - "text/plain": [ - " avg_rating dc_vendorId count\n", - "reviewerId \n", - "183751 5.0 1 73\n", - "260225 5.0 1 69\n", - "205864 5.0 1 35\n", - "345080 5.0 1 23\n", - "179944 5.0 1 22\n", - "345082 5.0 1 21\n", - "345081 5.0 1 20\n", - "345068 5.0 1 20\n", - "345069 5.0 1 19\n", - "345086 5.0 1 18\n", - "345083 5.0 1 18\n", - "345070 5.0 1 18\n", - "345085 5.0 1 17\n", - "345084 5.0 1 17\n", - "264635 5.0 1 13\n", - "321206 5.0 1 12\n", - "12539 5.0 1 11\n", - "159035 5.0 1 10\n", - "114661 5.0 1 9\n", - "39655 5.0 1 8\n", - "22515 5.0 1 7\n", - "180082 5.0 1 7\n", - "58447 5.0 1 7\n", - "180085 5.0 1 6\n", - "168143 5.0 1 6\n", - "160018 5.0 1 6\n", - "75010 5.0 1 6\n", - "35048 5.0 1 6\n", - "30474 5.0 1 6\n", - "28814 5.0 1 6\n", - "393237 5.0 1 6" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "grouped[ \n", - " (grouped['dc_vendorId'] == 1) & \n", - " (grouped['count'] > 5) & \n", - " (grouped['avg_rating'] == 5) \n", - "].sort_values('count', ascending=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Reviewer 183751 gives 73 five star reviews to vendor 190" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
reviewerIdvendorIdratingdate
26146918375119052006-09-22 16:36:00
26147018375119052006-09-22 16:36:00
26147118375119052006-09-22 16:35:00
26147418375119052006-09-22 15:53:00
26147518375119052006-09-22 15:53:00
\n", - "
" - ], - "text/plain": [ - " reviewerId vendorId rating date\n", - "261469 183751 190 5 2006-09-22 16:36:00\n", - "261470 183751 190 5 2006-09-22 16:36:00\n", - "261471 183751 190 5 2006-09-22 16:35:00\n", - "261474 183751 190 5 2006-09-22 15:53:00\n", - "261475 183751 190 5 2006-09-22 15:53:00" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "reviews[reviews['reviewerId'] == 183751].head()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/eland/utils.py b/eland/utils.py index 51f50cf..3375539 100644 --- a/eland/utils.py +++ b/eland/utils.py @@ -7,7 +7,7 @@ def read_es(es_params, index_pattern): return DataFrame(client=es_params, index_pattern=index_pattern) -def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False): +def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False, geo_points=None): """ Append a pandas DataFrame to an Elasticsearch index. Mainly used in testing. @@ -30,10 +30,19 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size= If table exists, drop it, recreate it, and insert data. ``'append'`` If table exists, insert data. Create if does not exist. 
+ + dropna : bool + ``'True'`` + Remove missing values (see pandas.Series.dropna) + ``'False'`` + Include missing values - may cause bulk to fail + + geo_points : list or None + List of columns to map to geo_point data type """ client = Client(es_params) - mapping = Mappings._generate_es_mappings(df) + mapping = Mappings._generate_es_mappings(df, geo_points) # If table exists, check if_exists parameter if client.index_exists(index=destination_index): @@ -58,7 +67,11 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size= for row in df.iterrows(): # Use index as _id id = row[0] - values = row[1].to_dict() + + if dropna: + values = row[1].dropna().to_dict() + else: + values = row[1].to_dict() # Use integer as id field for repeatable results action = {'_index': destination_index, '_source': values, '_id': str(id)} diff --git a/requirements.txt b/requirements.txt index b3de7a4..70cfd76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,2 @@ -elasticsearch==7.0.2 -elasticsearch-dsl==7.0.0 -numpy==1.16.4 -pandas==0.24.2 -python-dateutil==2.8.0 -pytz==2019.1 -six==1.12.0 -urllib3==1.25.3 +elasticsearch>=7.0.5 +pandas==0.25.1 diff --git a/setup.py b/setup.py index 515fe57..28f078d 100644 --- a/setup.py +++ b/setup.py @@ -13,8 +13,7 @@ setup(name='eland', license='ELASTIC LICENSE', packages=['eland'], install_requires=[ - 'elasticsearch', - 'elasticsearch_dsl', - 'pandas' + 'elasticsearch>=7.0.5', + 'pandas==0.25.1' ], zip_safe=False)