diff --git a/docs/source/reference/api/eland-Series-hist-1.png b/docs/source/reference/api/eland-Series-hist-1.png new file mode 100644 index 0000000..d1c0465 Binary files /dev/null and b/docs/source/reference/api/eland-Series-hist-1.png differ diff --git a/docs/source/reference/api/eland.Series.hist.rst b/docs/source/reference/api/eland.Series.hist.rst new file mode 100644 index 0000000..2b1378f --- /dev/null +++ b/docs/source/reference/api/eland.Series.hist.rst @@ -0,0 +1,8 @@ +eland.Series.hist +==================== + +.. currentmodule:: eland + +.. automethod:: Series.hist +.. image:: eland-Series-hist-1.png + diff --git a/eland/operations.py b/eland/operations.py index ea31fb1..ab0b966 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -14,6 +14,7 @@ import copy from collections import OrderedDict +import warnings import pandas as pd @@ -258,12 +259,10 @@ class Operations: for field in numeric_source_fields: body.hist_aggs(field, field, min_aggs, max_aggs, num_bins) - response = query_compiler._client.search( index=query_compiler._index_pattern, size=0, body=body.to_search_body()) - # results are like # "aggregations" : { # "DistanceKilometers" : { @@ -293,6 +292,19 @@ class Operations: # weights = [10066., 263., 386., 264., 273., 390., 324., 438., 261., 252., 142.] # So sum last 2 buckets for field in numeric_source_fields: + + # in case of series let plotting.ed_hist_series thrown an exception + if not response.get('aggregations'): + continue + + # in case of dataframe, throw warning that field is excluded + if not response['aggregations'].get(field): + warnings.warn("{} has no meaningful histogram interval and will be excluded. " + "All values 0." 
+ .format(field), + UserWarning) + continue + buckets = response['aggregations'][field]['buckets'] bins[field] = [] diff --git a/eland/plotting.py b/eland/plotting.py index 253884e..3c34132 100644 --- a/eland/plotting.py +++ b/eland/plotting.py @@ -86,3 +86,48 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None, fig.subplots_adjust(wspace=0.3, hspace=0.3) return axes + +def ed_hist_series(ed_s, column=None, by=None, grid=True, xlabelsize=None, + xrot=None, ylabelsize=None, yrot=None, ax=None, + figsize=None, layout=None, bins=10, **kwds): + """ + See :pandas_api_docs:`pandas.Series.hist` for usage. + + Notes + ----- + Derived from ``pandas.plotting._core.hist_frame 0.24.2`` - TODO update to ``0.25.1`` + + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'ecommerce') + >>> hist = df['taxful_total_price'].hist(figsize=[10,10]) # doctest: +SKIP + """ + # this is mostly the same code as above, it has been split out + # to a series specific method now so we can expand series plotting + + + # Start with empty pandas data frame derived from + ed_s_bins, ed_s_weights = ed_s._hist(num_bins=bins) + + if by is not None: + raise NotImplementedError("TODO") + + # raise error rather than warning when series is not plottable + if ed_s_bins.empty: + raise ValueError("{} has no meaningful histogram interval. All values 0." 
+ .format(ed_s.name)) + + naxes = len(ed_s_bins.columns) + fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False, figsize=figsize, layout=layout) + _axes = _flatten(axes) + for i, col in enumerate(com.try_sort(ed_s_bins.columns)): + ax = _axes[i] + ax.hist(ed_s_bins[col][:-1], bins=ed_s_bins[col], weights=ed_s_weights[col], **kwds) + ax.grid(grid) + + _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, + ylabelsize=ylabelsize, yrot=yrot) + fig.subplots_adjust(wspace=0.3, hspace=0.3) + + return axes \ No newline at end of file diff --git a/eland/query.py b/eland/query.py index 19bce86..de82744 100644 --- a/eland/query.py +++ b/eland/query.py @@ -148,7 +148,10 @@ class Query: "interval": interval } } - self._aggs[name] = agg + + if not min == max == 0: + self._aggs[name] = agg + def to_search_body(self): if self._query.empty(): diff --git a/eland/series.py b/eland/series.py index 250c37b..d695eff 100644 --- a/eland/series.py +++ b/eland/series.py @@ -40,6 +40,7 @@ from pandas.io.common import _expand_user, _stringify_path from eland import NDFrame from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter from eland.filter import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter, IsIn +import eland.plotting as gfx def _get_method_name(): @@ -106,6 +107,8 @@ class Series(NDFrame): index_field=index_field, query_compiler=query_compiler) + hist = gfx.ed_hist_series + @property def empty(self): """Determines if the Series is empty. diff --git a/eland/tests/plotting/test_series_hist_pytest.py b/eland/tests/plotting/test_series_hist_pytest.py new file mode 100644 index 0000000..6fb85b7 --- /dev/null +++ b/eland/tests/plotting/test_series_hist_pytest.py @@ -0,0 +1,33 @@ +# Copyright 2019 Elasticsearch BV +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# File called _pytest for PyCharm compatibility +import pytest +from matplotlib.testing.decorators import check_figures_equal + +from eland.tests.common import TestData + + +@check_figures_equal(extensions=['png']) +def test_plot_hist(fig_test, fig_ref): + test_data = TestData() + + pd_flights = test_data.pd_flights()['FlightDelayMin'] + ed_flights = test_data.ed_flights()['FlightDelayMin'] + + pd_ax = fig_ref.subplots() + ed_ax = fig_test.subplots() + + pd_flights.hist(ax=pd_ax) + ed_flights.hist(ax=ed_ax) diff --git a/eland/tests/series/test_hist_pytest.py b/eland/tests/series/test_hist_pytest.py new file mode 100644 index 0000000..9fdbc79 --- /dev/null +++ b/eland/tests/series/test_hist_pytest.py @@ -0,0 +1,49 @@ +# Copyright 2019 Elasticsearch BV +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+ +# File called _pytest for PyCharm compatibility + +import numpy as np +import pandas as pd +from pandas.util.testing import assert_almost_equal +import pytest + +from eland.tests.common import TestData + + +class TestSeriesFrameHist(TestData): + + def test_flight_delay_min_hist(self): + pd_flights = self.pd_flights() + ed_flights = self.ed_flights() + + num_bins = 10 + + # pandas data + pd_flightdelaymin = np.histogram(pd_flights['FlightDelayMin'], num_bins) + + pd_bins = pd.DataFrame( + {'FlightDelayMin': pd_flightdelaymin[1]}) + pd_weights = pd.DataFrame( + {'FlightDelayMin': pd_flightdelaymin[0]}) + + ed_bins, ed_weights = ed_flights['FlightDelayMin']._hist(num_bins=num_bins) + + # Numbers are slightly different + assert_almost_equal(pd_bins, ed_bins) + assert_almost_equal(pd_weights, ed_weights) + + def test_invalid_hist(self): + with pytest.raises(ValueError): + assert self.ed_ecommerce()['products.tax_amount'].hist()