Adding first implementation of eland.DataFrame.hist

This commit is contained in:
Stephen Dodson 2019-07-31 09:59:52 +00:00
parent 1fa4d3fbe7
commit 3435ffac1b
13 changed files with 180 additions and 105 deletions

View File

@ -13,7 +13,8 @@ from pandas.io.formats.printing import pprint_thing
from eland import NDFrame
from eland import Series
from eland import hist_frame
import eland.plotting as gfx
class DataFrame(NDFrame):
# This is effectively 2 constructors
@ -457,4 +458,4 @@ class DataFrame(NDFrame):
def keys(self):
return self.columns
hist = hist_frame
hist = gfx.ed_hist_frame

View File

@ -227,8 +227,8 @@ class NDFrame(BasePandasDataset):
raise NotImplementedError("Only sum of numeric fields is implemented")
return self._query_compiler.max()
def _hist(self, interval)
return self._query_compiler._hist(interval)
# Histogram entry point for this frame: forwards the requested number of
# bins to the query compiler's `_hist` and returns its result unchanged.
# The plotting caller in this commit unpacks that result as a
# (bins, weights) pair.
def _hist(self, num_bins):
return self._query_compiler._hist(num_bins)
def describe(self):
return self._query_compiler.describe()

View File

@ -166,8 +166,8 @@ class Operations:
results[field] = response['aggregations'][field]['value']
# Return single value if this is a series
if len(numeric_source_fields) == 1:
return np.float64(results[numeric_source_fields[0]])
#if len(numeric_source_fields) == 1:
# return np.float64(results[numeric_source_fields[0]])
s = pd.Series(data=results, index=numeric_source_fields)
@ -203,7 +203,8 @@ class Operations:
return s
def _hist_aggs(self, query_compiler, bins):
def _hist_aggs(self, query_compiler, num_bins):
# Get histogram bins and weights for numeric columns
query_params, post_processing = self._resolve_tasks()
size = self._size(query_params, post_processing)
@ -220,21 +221,59 @@ class Operations:
max_aggs = self._metric_aggs(query_compiler, 'max')
for field in numeric_source_fields:
body.hist_aggs(field, min_aggs, max_aggs, bins)
body.hist_aggs(field, field, min_aggs, max_aggs, num_bins)
response = query_compiler._client.search(
index=query_compiler._index_pattern,
size=0,
body=body.to_search_body())
results = {}
# results are like
# "aggregations" : {
# "DistanceKilometers" : {
# "buckets" : [
# {
# "key" : 0.0,
# "doc_count" : 2956
# },
# {
# "key" : 1988.1482421875,
# "doc_count" : 768
# },
# ...
bins = {}
weights = {}
# There is one more bin than weights
# len(bins) = len(weights) + 1
# bins = [ 0. 36. 72. 108. 144. 180. 216. 252. 288. 324. 360.]
# len(bins) == 11
# weights = [10066., 263., 386., 264., 273., 390., 324., 438., 261., 394.]
# len(weights) == 10
# ES returns
# weights = [10066., 263., 386., 264., 273., 390., 324., 438., 261., 252., 142.]
# So sum last 2 buckets
for field in numeric_source_fields:
results[field] = response['aggregations'][field]['value']
buckets = response['aggregations'][field]['buckets']
s = pd.Series(data=results, index=numeric_source_fields)
bins[field] = []
weights[field] = []
return s
for bucket in buckets:
bins[field].append(bucket['key'])
if bucket == buckets[-1]:
weights[field][-1] += bucket['doc_count']
else:
weights[field].append(bucket['doc_count'])
df_bins = pd.DataFrame(data=bins)
df_weights = pd.DataFrame(data=weights)
return df_bins, df_weights
def describe(self, query_compiler):
query_params, post_processing = self._resolve_tasks()

View File

@ -7,40 +7,63 @@ from pandas.plotting._core import (
_raise_if_no_mpl, _converter, grouped_hist, _subplots, _flatten, _set_ticks_props)
def hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
sharey=False, figsize=None, layout=None, bins=10, **kwds):
"""
Derived from pandas.plotting._core.hist_frame 0.24.2
Ideally, we'd call hist_frame directly with histogram data,
but weights are applied to ALL series. For example, we can
plot a histogram of pre-binned data via:
counts, bins = np.histogram(data)
plt.hist(bins[:-1], bins, weights=counts)
However,
ax.hist(data[col].dropna().values, bins=bins, **kwds)
is for [col] and weights are a single array.
We therefore cut/paste code.
"""
# Start with an empty pandas DataFrame derived from ed_df
empty_pd_df = ed_df._empty_pd_df()
ed_df_bins, ed_df_weights = ed_df._hist(num_bins=bins)
_raise_if_no_mpl()
_converter._WARN = False
if by is not None:
axes = grouped_hist(empty_pd_df, column=column, by=by, ax=ax, grid=grid,
raise NotImplementedError("TODO")
"""
axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid,
figsize=figsize, sharex=sharex, sharey=sharey,
layout=layout, bins=bins, xlabelsize=xlabelsize,
xrot=xrot, ylabelsize=ylabelsize,
yrot=yrot, **kwds)
"""
return axes
if column is not None:
if not isinstance(column, (list, np.ndarray, ABCIndexClass)):
column = [column]
empty_pd_df = empty_pd_df[column]
data = empty_pd_df._get_numeric_data()
naxes = len(empty_pd_df.columns)
ed_df_bins = ed_df_bins[column]
ed_df_weights = ed_df_weights[column]
naxes = len(ed_df_bins.columns)
fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False,
sharex=sharex, sharey=sharey, figsize=figsize,
layout=layout)
_axes = _flatten(axes)
for i, col in enumerate(com.try_sort(data.columns)):
for i, col in enumerate(com.try_sort(ed_df_bins.columns)):
ax = _axes[i]
ax.hist(empty_pd_df[col].dropna().values, bins=bins, **kwds)
# pandas code
# pandas / plotting / _core.py: 2410
# ax.hist(data[col].dropna().values, bins=bins, **kwds)
ax.hist(ed_df_bins[col][:-1], bins=ed_df_bins[col], weights=ed_df_weights[col])
ax.set_title(col)
ax.grid(grid)

View File

@ -68,7 +68,7 @@ class Query:
}
self._aggs[name] = agg
def hist_aggs(self, name, field, min_aggs, max_aggs, bins):
def hist_aggs(self, name, field, min_aggs, max_aggs, num_bins):
"""
Add histogram agg e.g.
"aggs": {
@ -80,6 +80,18 @@ class Query:
}
}
"""
min = min_aggs[field]
max = max_aggs[field]
interval = (max - min)/num_bins
agg = {
"histogram": {
"field": field,
"interval": interval
}
}
self._aggs[name] = agg
def to_search_body(self):
body = {"query": self._query, "aggs": self._aggs}

View File

@ -370,5 +370,5 @@ class ElandQueryCompiler(BaseQueryCompiler):
def describe(self):
return self._operations.describe(self)
def _hist(self, interval):
return self._operations.hist(self, interval)
# Forward the histogram request to the operations object, passing this
# query compiler as the first argument and the bin count as the second.
# NOTE(review): the Operations hunk visible in this commit shows
# `_hist_aggs(self, query_compiler, num_bins)`; confirm that
# `Operations.hist` exists with a matching signature.
def _hist(self, num_bins):
return self._operations.hist(self, num_bins)

View File

@ -4,7 +4,7 @@ from io import StringIO
from eland.tests.common import TestData
class TestDataFrameInfo(TestData):
class TestDataFrameDescribe(TestData):
def test_to_describe1(self):
pd_flights = self.pd_flights()

View File

@ -10,7 +10,7 @@ from eland.tests.common import (
import numpy as np
class TestDataFrameiLoc(TestData):
class TestDataFrameGet(TestData):
def test_get1(self):
ed_flights = self.ed_flights()

View File

@ -0,0 +1,31 @@
# File called _pytest for PyCharm compatibility
import numpy as np
import pandas as pd
from pandas.util.testing import assert_almost_equal
from eland.tests.common import TestData
# Compares the bins/weights returned by eland's `_hist` with numpy
# histograms computed on the pandas flights frame.
class TestDataFrameHist(TestData):
# Histogram of two numeric columns with 10 bins, eland vs. numpy.
def test_to_hist1(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
num_bins = 10
# pandas data
# np.histogram returns (counts, bin_edges): element [1] feeds the bins
# frame and element [0] the weights frame below.
pd_distancekilometers = np.histogram(pd_flights['DistanceKilometers'], num_bins)
pd_flightdelaymin = np.histogram(pd_flights['FlightDelayMin'], num_bins)
pd_bins = pd.DataFrame(
{'DistanceKilometers': pd_distancekilometers[1], 'FlightDelayMin': pd_flightdelaymin[1]})
pd_weights = pd.DataFrame(
{'DistanceKilometers': pd_distancekilometers[0], 'FlightDelayMin': pd_flightdelaymin[0]})
# eland data: `_hist` is unpacked as a (bins, weights) pair for the
# two selected columns
ed_bins, ed_weights = ed_flights[['DistanceKilometers', 'FlightDelayMin']]._hist(num_bins=num_bins)
# Numbers are slightly different
assert_almost_equal(pd_bins, ed_bins)
assert_almost_equal(pd_weights, ed_weights)

View File

@ -3,7 +3,7 @@
from eland.tests.common import TestData
class TestDataFrameInfo(TestData):
class TestDataFrameInfoEs(TestData):
def test_to_info1(self):
ed_flights = self.ed_flights()

View File

@ -6,7 +6,7 @@ from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal
class TestDataFrameMean(TestData):
class TestDataFrameMetrics(TestData):
def test_to_mean(self):
pd_flights = self.pd_flights()

File diff suppressed because one or more lines are too long

View File

@ -5,6 +5,7 @@ from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal
import numpy as np
import pandas as pd
class TestDataFrameHist(TestData):
@ -14,24 +15,25 @@ class TestDataFrameHist(TestData):
pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
"""
pd_flights.hist(figsize=[10, 10])
ed_flights.hist(figsize=[10, 10])
#ed_flights.hist(figsize=[10, 10])
pd_min = pd_flights['DistanceKilometers'].min()
pd_max = pd_flights['DistanceKilometers'].max()
ed_min = ed_flights['DistanceKilometers'].min()
ed_max = ed_flights['DistanceKilometers'].max()
#ed_min = ed_flights['DistanceKilometers'].min()
#ed_max = ed_flights['DistanceKilometers'].max()
num_bins = 10.0
#num_bins = 10.0
bins = np.linspace(ed_min, ed_max, num=num_bins+1)
#bins = np.linspace(ed_min, ed_max, num=num_bins+1)
print(bins)
#print(bins)
print(np.diff(bins).mean())
#print(np.diff(bins).mean())
hist = ed_flights['DistanceKilometers'].hist(np.diff(bins).mean())
#hist = ed_flights['DistanceKilometers'].hist(np.diff(bins).mean())
x = [2956., 768., 719., 2662., 2934., 1320., 641., 529., 426., 104.]
@ -43,4 +45,24 @@ class TestDataFrameHist(TestData):
a = bins[0:10]
print(np.histogram(a, weights=x, bins=bins))
#counts, bins = np.histogram(data)
#plt.hist(bins[:-1], bins, weights=counts)
"""
h1 = np.histogram(pd_flights['DistanceKilometers'], 10)
h2 = np.histogram(pd_flights['FlightDelayMin'], 10)
l1 = list(h1[0])
l2 = list(h2[0])
l1.append(0)
l2.append(0)
d = {'DistanceKilometers': h1[1],
'FlightDelayMin': h2[1]}
df = pd.DataFrame(data=d)
df.hist(weights=[l1, l2])