mirror of
https://github.com/elastic/eland.git
synced 2025-07-24 00:00:39 +08:00
Add Support for Series Histograms (#95)
* add support for series plotting * update docs for series plotting support * add tests for series plotting * fix typo * adds comment to ed_hist_series
This commit is contained in:
parent
40a584dcc2
commit
79fdb1727e
BIN
docs/source/reference/api/eland-Series-hist-1.png
Normal file
BIN
docs/source/reference/api/eland-Series-hist-1.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 45 KiB |
8
docs/source/reference/api/eland.Series.hist.rst
Normal file
8
docs/source/reference/api/eland.Series.hist.rst
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
eland.Series.hist
|
||||||
|
====================
|
||||||
|
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
.. automethod:: Series.hist
|
||||||
|
.. image:: eland-Series-hist-1.png
|
||||||
|
|
@ -14,6 +14,7 @@
|
|||||||
|
|
||||||
import copy
|
import copy
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
import warnings
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
@ -258,12 +259,10 @@ class Operations:
|
|||||||
|
|
||||||
for field in numeric_source_fields:
|
for field in numeric_source_fields:
|
||||||
body.hist_aggs(field, field, min_aggs, max_aggs, num_bins)
|
body.hist_aggs(field, field, min_aggs, max_aggs, num_bins)
|
||||||
|
|
||||||
response = query_compiler._client.search(
|
response = query_compiler._client.search(
|
||||||
index=query_compiler._index_pattern,
|
index=query_compiler._index_pattern,
|
||||||
size=0,
|
size=0,
|
||||||
body=body.to_search_body())
|
body=body.to_search_body())
|
||||||
|
|
||||||
# results are like
|
# results are like
|
||||||
# "aggregations" : {
|
# "aggregations" : {
|
||||||
# "DistanceKilometers" : {
|
# "DistanceKilometers" : {
|
||||||
@ -293,6 +292,19 @@ class Operations:
|
|||||||
# weights = [10066., 263., 386., 264., 273., 390., 324., 438., 261., 252., 142.]
|
# weights = [10066., 263., 386., 264., 273., 390., 324., 438., 261., 252., 142.]
|
||||||
# So sum last 2 buckets
|
# So sum last 2 buckets
|
||||||
for field in numeric_source_fields:
|
for field in numeric_source_fields:
|
||||||
|
|
||||||
|
# in case of series let plotting.ed_hist_series throw an exception
|
||||||
|
if not response.get('aggregations'):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# in case of dataframe, throw warning that field is excluded
|
||||||
|
if not response['aggregations'].get(field):
|
||||||
|
warnings.warn("{} has no meaningful histogram interval and will be excluded. "
|
||||||
|
"All values 0."
|
||||||
|
.format(field),
|
||||||
|
UserWarning)
|
||||||
|
continue
|
||||||
|
|
||||||
buckets = response['aggregations'][field]['buckets']
|
buckets = response['aggregations'][field]['buckets']
|
||||||
|
|
||||||
bins[field] = []
|
bins[field] = []
|
||||||
|
@ -86,3 +86,48 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
|
|||||||
fig.subplots_adjust(wspace=0.3, hspace=0.3)
|
fig.subplots_adjust(wspace=0.3, hspace=0.3)
|
||||||
|
|
||||||
return axes
|
return axes
|
||||||
|
|
||||||
|
def ed_hist_series(ed_s, column=None, by=None, grid=True, xlabelsize=None,
                   xrot=None, ylabelsize=None, yrot=None, ax=None,
                   figsize=None, layout=None, bins=10, **kwds):
    """
    See :pandas_api_docs:`pandas.Series.hist` for usage.

    Parameters
    ----------
    ed_s
        Object to plot; only its ``_hist(num_bins=...)`` method and ``name``
        attribute are used here.
    column
        Accepted for signature compatibility; not used by this implementation.
    by
        Grouping is not supported; any value other than ``None`` raises.
    grid
        Passed to ``ax.grid`` for each subplot.
    xlabelsize, xrot, ylabelsize, yrot
        Forwarded to ``_set_ticks_props``.
    ax, figsize, layout
        Forwarded to ``_subplots``.
    bins
        Number of histogram bins, passed to ``ed_s._hist`` as ``num_bins``.
    **kwds
        Forwarded to ``ax.hist``.

    Returns
    -------
    The axes object returned by ``_subplots``.

    Raises
    ------
    NotImplementedError
        If ``by`` is not ``None``.
    ValueError
        If ``ed_s._hist`` returns no bins (no meaningful histogram interval).

    Notes
    -----
    Derived from ``pandas.plotting._core.hist_frame 0.24.2`` - TODO update to ``0.25.1``


    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce')
    >>> hist = df['taxful_total_price'].hist(figsize=[10,10]) # doctest: +SKIP
    """
    # this is mostly the same code as above, it has been split out
    # to a series specific method now so we can expand series plotting

    # Reject unsupported arguments first so that no histogram data is
    # requested for a call that cannot succeed anyway.
    if by is not None:
        raise NotImplementedError("TODO")

    # Bin edges and per-bin weights, one column per plotted field
    # (presumably pandas DataFrames - only ``.empty``, ``.columns`` and
    # column indexing are relied on below).
    ed_s_bins, ed_s_weights = ed_s._hist(num_bins=bins)

    # raise error rather than warning when series is not plottable
    if ed_s_bins.empty:
        raise ValueError("{} has no meaningful histogram interval. All values 0."
                         .format(ed_s.name))

    naxes = len(ed_s_bins.columns)
    fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False, figsize=figsize, layout=layout)
    _axes = _flatten(axes)

    for i, col in enumerate(com.try_sort(ed_s_bins.columns)):
        # Use a separate name so the ``ax`` argument is not rebound.
        col_ax = _axes[i]
        # The data is already aggregated: plot the left edge of every bin,
        # weighted by that bin's count, against the full list of bin edges.
        col_ax.hist(ed_s_bins[col][:-1], bins=ed_s_bins[col], weights=ed_s_weights[col], **kwds)
        col_ax.grid(grid)

    _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,
                     ylabelsize=ylabelsize, yrot=yrot)
    fig.subplots_adjust(wspace=0.3, hspace=0.3)

    return axes
|
@ -148,7 +148,10 @@ class Query:
|
|||||||
"interval": interval
|
"interval": interval
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
self._aggs[name] = agg
|
|
||||||
|
if not min == max == 0:
|
||||||
|
self._aggs[name] = agg
|
||||||
|
|
||||||
|
|
||||||
def to_search_body(self):
|
def to_search_body(self):
|
||||||
if self._query.empty():
|
if self._query.empty():
|
||||||
|
@ -40,6 +40,7 @@ from pandas.io.common import _expand_user, _stringify_path
|
|||||||
from eland import NDFrame
|
from eland import NDFrame
|
||||||
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
|
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
|
||||||
from eland.filter import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter, IsIn
|
from eland.filter import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter, IsIn
|
||||||
|
import eland.plotting as gfx
|
||||||
|
|
||||||
|
|
||||||
def _get_method_name():
|
def _get_method_name():
|
||||||
@ -106,6 +107,8 @@ class Series(NDFrame):
|
|||||||
index_field=index_field,
|
index_field=index_field,
|
||||||
query_compiler=query_compiler)
|
query_compiler=query_compiler)
|
||||||
|
|
||||||
|
hist = gfx.ed_hist_series
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def empty(self):
|
def empty(self):
|
||||||
"""Determines if the Series is empty.
|
"""Determines if the Series is empty.
|
||||||
|
33
eland/tests/plotting/test_series_hist_pytest.py
Normal file
33
eland/tests/plotting/test_series_hist_pytest.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
# Copyright 2019 Elasticsearch BV
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# File called _pytest for PyCharm compatibility
|
||||||
|
import pytest
|
||||||
|
from matplotlib.testing.decorators import check_figures_equal
|
||||||
|
|
||||||
|
from eland.tests.common import TestData
|
||||||
|
|
||||||
|
|
||||||
|
@check_figures_equal(extensions=['png'])
def test_plot_hist(fig_test, fig_ref):
    """Draw the ``FlightDelayMin`` histogram with pandas and with eland.

    The pandas plot goes onto ``fig_ref`` and the eland plot onto
    ``fig_test``; the decorator is responsible for comparing the figures.
    """
    column = 'FlightDelayMin'
    data = TestData()

    reference_series = data.pd_flights()[column]
    eland_series = data.ed_flights()[column]

    reference_ax = fig_ref.subplots()
    eland_ax = fig_test.subplots()

    reference_series.hist(ax=reference_ax)
    eland_series.hist(ax=eland_ax)
|
49
eland/tests/series/test_hist_pytest.py
Normal file
49
eland/tests/series/test_hist_pytest.py
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
# Copyright 2019 Elasticsearch BV
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# File called _pytest for PyCharm compatibility
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from pandas.util.testing import assert_almost_equal
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from eland.tests.common import TestData
|
||||||
|
|
||||||
|
|
||||||
|
class TestSeriesFrameHist(TestData):
    """Histogram tests for a single eland series."""

    def test_flight_delay_min_hist(self):
        """Compare ``_hist`` bins and weights with ``numpy.histogram`` on pandas data."""
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

        num_bins = 10

        # pandas data: np.histogram returns (counts, bin_edges)
        pd_counts, pd_edges = np.histogram(pd_flights['FlightDelayMin'], num_bins)

        pd_bins = pd.DataFrame(
            {'FlightDelayMin': pd_edges})
        pd_weights = pd.DataFrame(
            {'FlightDelayMin': pd_counts})

        ed_bins, ed_weights = ed_flights['FlightDelayMin']._hist(num_bins=num_bins)

        # Numbers are slightly different
        assert_almost_equal(pd_bins, ed_bins)
        assert_almost_equal(pd_weights, ed_weights)

    def test_invalid_hist(self):
        """A series without a usable histogram interval must raise ``ValueError``."""
        # No ``assert`` around the call: truth-testing a returned array could
        # itself raise ValueError and make this test pass for the wrong reason.
        # ``match`` pins the error to the histogram message.
        with pytest.raises(ValueError, match="no meaningful histogram interval"):
            self.ed_ecommerce()['products.tax_amount'].hist()
|
Loading…
x
Reference in New Issue
Block a user