Add Support for Series Histograms (#95)

* add support for series plotting
* update docs for series plotting support
* add tests for series plotting
* fix typo
* adds comment to ed_hist_series
This commit is contained in:
Michael Hirsch 2019-12-11 14:51:47 -05:00 committed by GitHub
parent 40a584dcc2
commit 79fdb1727e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 156 additions and 3 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 45 KiB

View File

@ -0,0 +1,8 @@
eland.Series.hist
====================
.. currentmodule:: eland
.. automethod:: Series.hist
.. image:: eland-Series-hist-1.png

View File

@ -14,6 +14,7 @@
import copy
from collections import OrderedDict
import warnings
import pandas as pd
@ -258,12 +259,10 @@ class Operations:
for field in numeric_source_fields:
body.hist_aggs(field, field, min_aggs, max_aggs, num_bins)
response = query_compiler._client.search(
index=query_compiler._index_pattern,
size=0,
body=body.to_search_body())
# results are like
# "aggregations" : {
# "DistanceKilometers" : {
@ -293,6 +292,19 @@ class Operations:
# weights = [10066., 263., 386., 264., 273., 390., 324., 438., 261., 252., 142.]
# So sum last 2 buckets
for field in numeric_source_fields:
# in case of series let plotting.ed_hist_series throw an exception
if not response.get('aggregations'):
continue
# in case of dataframe, throw warning that field is excluded
if not response['aggregations'].get(field):
warnings.warn("{} has no meaningful histogram interval and will be excluded. "
"All values 0."
.format(field),
UserWarning)
continue
buckets = response['aggregations'][field]['buckets']
bins[field] = []

View File

@ -86,3 +86,48 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
fig.subplots_adjust(wspace=0.3, hspace=0.3)
return axes
def ed_hist_series(ed_s, column=None, by=None, grid=True, xlabelsize=None,
                   xrot=None, ylabelsize=None, yrot=None, ax=None,
                   figsize=None, layout=None, bins=10, **kwds):
    """
    See :pandas_api_docs:`pandas.Series.hist` for usage.

    Parameters
    ----------
    ed_s
        Series to plot. It must provide ``_hist(num_bins=...)`` returning a
        ``(bins, weights)`` pair and a ``name`` attribute.
    column
        Accepted for signature compatibility; not used by this function.
    by
        Grouping is not supported; any value other than ``None`` raises.
    grid
        Passed to ``Axes.grid`` for every plotted axis.
    xlabelsize, xrot, ylabelsize, yrot
        Tick label properties forwarded to ``_set_ticks_props``.
    ax, figsize, layout
        Forwarded to ``_subplots`` when creating the axes.
    bins
        Number of histogram bins requested from ``ed_s._hist``.
    **kwds
        Extra keyword arguments forwarded to ``Axes.hist``.

    Returns
    -------
    The axes created by ``_subplots``.

    Raises
    ------
    NotImplementedError
        If ``by`` is not ``None``.
    ValueError
        If the returned bins are empty, i.e. the series has no meaningful
        histogram interval.

    Notes
    -----
    Derived from ``pandas.plotting._core.hist_frame 0.24.2`` - TODO update to ``0.25.1``

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce')
    >>> hist = df['taxful_total_price'].hist(figsize=[10,10]) # doctest: +SKIP
    """
    # this is mostly the same code as ed_hist_frame, it has been split out
    # to a series specific method now so we can expand series plotting

    # Reject the unsupported grouping option first, so no histogram is
    # requested from the series for a call that cannot succeed anyway.
    if by is not None:
        raise NotImplementedError("TODO")

    ed_s_bins, ed_s_weights = ed_s._hist(num_bins=bins)

    # raise error rather than warning when series is not plottable
    if ed_s_bins.empty:
        raise ValueError("{} has no meaningful histogram interval. All values 0."
                         .format(ed_s.name))

    naxes = len(ed_s_bins.columns)
    fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False, figsize=figsize, layout=layout)
    _axes = _flatten(axes)

    for i, col in enumerate(com.try_sort(ed_s_bins.columns)):
        # Use a dedicated name so the ``ax`` argument is not shadowed.
        col_ax = _axes[i]
        # The bin edges have one more entry than the weights, so the last
        # edge is dropped from the sample positions.
        col_ax.hist(ed_s_bins[col][:-1], bins=ed_s_bins[col], weights=ed_s_weights[col], **kwds)
        col_ax.grid(grid)

    _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,
                     ylabelsize=ylabelsize, yrot=yrot)
    fig.subplots_adjust(wspace=0.3, hspace=0.3)

    return axes

View File

@ -148,8 +148,11 @@ class Query:
"interval": interval
}
}
if not min == max == 0:
self._aggs[name] = agg
def to_search_body(self):
if self._query.empty():
if self._aggs:

View File

@ -40,6 +40,7 @@ from pandas.io.common import _expand_user, _stringify_path
from eland import NDFrame
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
from eland.filter import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter, IsIn
import eland.plotting as gfx
def _get_method_name():
@ -106,6 +107,8 @@ class Series(NDFrame):
index_field=index_field,
query_compiler=query_compiler)
hist = gfx.ed_hist_series
@property
def empty(self):
"""Determines if the Series is empty.

View File

@ -0,0 +1,33 @@
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# File called _pytest for PyCharm compatibility
import pytest
from matplotlib.testing.decorators import check_figures_equal
from eland.tests.common import TestData
@check_figures_equal(extensions=['png'])
def test_plot_hist(fig_test, fig_ref):
    """Draw the ``FlightDelayMin`` histogram with pandas and with eland.

    The pandas histogram is drawn on ``fig_ref`` and the eland histogram on
    ``fig_test``; the ``check_figures_equal`` decorator supplies both figures.
    """
    column = 'FlightDelayMin'
    data = TestData()

    pd_series = data.pd_flights()[column]
    ed_series = data.ed_flights()[column]

    # Reference figure: histogram produced by pandas.
    pd_series.hist(ax=fig_ref.subplots())

    # Figure under test: the same column plotted through eland.
    ed_series.hist(ax=fig_test.subplots())

View File

@ -0,0 +1,49 @@
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# File called _pytest for PyCharm compatibility
import numpy as np
import pandas as pd
from pandas.util.testing import assert_almost_equal
import pytest
from eland.tests.common import TestData
class TestSeriesFrameHist(TestData):
    """Histogram tests for eland Series, checked against numpy/pandas."""

    def test_flight_delay_min_hist(self):
        """Bins and weights from ``Series._hist`` match ``np.histogram``."""
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

        num_bins = 10

        # pandas data: np.histogram returns (counts, bin_edges)
        pd_flightdelaymin = np.histogram(pd_flights['FlightDelayMin'], num_bins)

        pd_bins = pd.DataFrame(
            {'FlightDelayMin': pd_flightdelaymin[1]})
        pd_weights = pd.DataFrame(
            {'FlightDelayMin': pd_flightdelaymin[0]})

        ed_bins, ed_weights = ed_flights['FlightDelayMin']._hist(num_bins=num_bins)

        # Numbers are slightly different, so compare approximately
        assert_almost_equal(pd_bins, ed_bins)
        assert_almost_equal(pd_weights, ed_weights)

    def test_invalid_hist(self):
        """Plotting ``products.tax_amount`` is expected to raise ValueError."""
        # No ``assert`` on the call: pytest.raises already fails the test if
        # nothing is raised, and the return value is never inspected.
        with pytest.raises(ValueError):
            self.ed_ecommerce()['products.tax_amount'].hist()