mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Adding first implementation of eland.DataFrame.hist
This commit is contained in:
parent
1fa4d3fbe7
commit
3435ffac1b
@ -13,7 +13,8 @@ from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
from eland import NDFrame
|
||||
from eland import Series
|
||||
from eland import hist_frame
|
||||
|
||||
import eland.plotting as gfx
|
||||
|
||||
class DataFrame(NDFrame):
|
||||
# This is effectively 2 constructors
|
||||
@ -457,4 +458,4 @@ class DataFrame(NDFrame):
|
||||
def keys(self):
|
||||
return self.columns
|
||||
|
||||
hist = hist_frame
|
||||
hist = gfx.ed_hist_frame
|
||||
|
@ -227,8 +227,8 @@ class NDFrame(BasePandasDataset):
|
||||
raise NotImplementedError("Only sum of numeric fields is implemented")
|
||||
return self._query_compiler.max()
|
||||
|
||||
def _hist(self, interval)
|
||||
return self._query_compiler._hist(interval)
|
||||
def _hist(self, num_bins):
|
||||
return self._query_compiler._hist(num_bins)
|
||||
|
||||
def describe(self):
|
||||
return self._query_compiler.describe()
|
||||
|
@ -166,8 +166,8 @@ class Operations:
|
||||
results[field] = response['aggregations'][field]['value']
|
||||
|
||||
# Return single value if this is a series
|
||||
if len(numeric_source_fields) == 1:
|
||||
return np.float64(results[numeric_source_fields[0]])
|
||||
#if len(numeric_source_fields) == 1:
|
||||
# return np.float64(results[numeric_source_fields[0]])
|
||||
|
||||
s = pd.Series(data=results, index=numeric_source_fields)
|
||||
|
||||
@ -203,7 +203,8 @@ class Operations:
|
||||
|
||||
return s
|
||||
|
||||
def _hist_aggs(self, query_compiler, bins):
|
||||
def _hist_aggs(self, query_compiler, num_bins):
|
||||
# Get histogram bins and weights for numeric columns
|
||||
query_params, post_processing = self._resolve_tasks()
|
||||
|
||||
size = self._size(query_params, post_processing)
|
||||
@ -220,21 +221,59 @@ class Operations:
|
||||
max_aggs = self._metric_aggs(query_compiler, 'max')
|
||||
|
||||
for field in numeric_source_fields:
|
||||
body.hist_aggs(field, min_aggs, max_aggs, bins)
|
||||
body.hist_aggs(field, field, min_aggs, max_aggs, num_bins)
|
||||
|
||||
response = query_compiler._client.search(
|
||||
index=query_compiler._index_pattern,
|
||||
size=0,
|
||||
body=body.to_search_body())
|
||||
|
||||
results = {}
|
||||
# results are like
|
||||
# "aggregations" : {
|
||||
# "DistanceKilometers" : {
|
||||
# "buckets" : [
|
||||
# {
|
||||
# "key" : 0.0,
|
||||
# "doc_count" : 2956
|
||||
# },
|
||||
# {
|
||||
# "key" : 1988.1482421875,
|
||||
# "doc_count" : 768
|
||||
# },
|
||||
# ...
|
||||
|
||||
bins = {}
|
||||
weights = {}
|
||||
|
||||
# There is one more bin than weights
|
||||
# len(bins) = len(weights) + 1
|
||||
|
||||
# bins = [ 0. 36. 72. 108. 144. 180. 216. 252. 288. 324. 360.]
|
||||
# len(bins) == 11
|
||||
# weights = [10066., 263., 386., 264., 273., 390., 324., 438., 261., 394.]
|
||||
# len(weights) == 10
|
||||
|
||||
# ES returns
|
||||
# weights = [10066., 263., 386., 264., 273., 390., 324., 438., 261., 252., 142.]
|
||||
# So sum last 2 buckets
|
||||
for field in numeric_source_fields:
|
||||
results[field] = response['aggregations'][field]['value']
|
||||
buckets = response['aggregations'][field]['buckets']
|
||||
|
||||
s = pd.Series(data=results, index=numeric_source_fields)
|
||||
bins[field] = []
|
||||
weights[field] = []
|
||||
|
||||
return s
|
||||
for bucket in buckets:
|
||||
bins[field].append(bucket['key'])
|
||||
|
||||
if bucket == buckets[-1]:
|
||||
weights[field][-1] += bucket['doc_count']
|
||||
else:
|
||||
weights[field].append(bucket['doc_count'])
|
||||
|
||||
df_bins = pd.DataFrame(data=bins)
|
||||
df_weights = pd.DataFrame(data=weights)
|
||||
|
||||
return df_bins, df_weights
|
||||
|
||||
def describe(self, query_compiler):
|
||||
query_params, post_processing = self._resolve_tasks()
|
||||
|
@ -7,40 +7,63 @@ from pandas.plotting._core import (
|
||||
_raise_if_no_mpl, _converter, grouped_hist, _subplots, _flatten, _set_ticks_props)
|
||||
|
||||
|
||||
def hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
|
||||
def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
|
||||
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
|
||||
sharey=False, figsize=None, layout=None, bins=10, **kwds):
|
||||
"""
|
||||
Derived from pandas.plotting._core.hist_frame 0.24.2
|
||||
|
||||
Ideally, we'd call hist_frame directly with histogram data,
|
||||
but weights are applied to ALL series. For example, we can
|
||||
plot a histogram of pre-binned data via:
|
||||
|
||||
counts, bins = np.histogram(data)
|
||||
plt.hist(bins[:-1], bins, weights=counts)
|
||||
|
||||
However,
|
||||
|
||||
ax.hist(data[col].dropna().values, bins=bins, **kwds)
|
||||
|
||||
is for [col] and weights are a single array.
|
||||
|
||||
We therefore cut/paste code.
|
||||
"""
|
||||
# Start with empty pandas data frame derived from ed_df
|
||||
empty_pd_df = ed_df._empty_pd_df()
|
||||
ed_df_bins, ed_df_weights = ed_df._hist(num_bins=bins)
|
||||
|
||||
_raise_if_no_mpl()
|
||||
_converter._WARN = False
|
||||
if by is not None:
|
||||
axes = grouped_hist(empty_pd_df, column=column, by=by, ax=ax, grid=grid,
|
||||
raise NotImplementedError("TODO")
|
||||
"""
|
||||
axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid,
|
||||
figsize=figsize, sharex=sharex, sharey=sharey,
|
||||
layout=layout, bins=bins, xlabelsize=xlabelsize,
|
||||
xrot=xrot, ylabelsize=ylabelsize,
|
||||
yrot=yrot, **kwds)
|
||||
"""
|
||||
return axes
|
||||
|
||||
if column is not None:
|
||||
if not isinstance(column, (list, np.ndarray, ABCIndexClass)):
|
||||
column = [column]
|
||||
empty_pd_df = empty_pd_df[column]
|
||||
data = empty_pd_df._get_numeric_data()
|
||||
naxes = len(empty_pd_df.columns)
|
||||
ed_df_bins = ed_df_bins[column]
|
||||
ed_df_weights = ed_df_weights[column]
|
||||
naxes = len(ed_df_bins.columns)
|
||||
|
||||
fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False,
|
||||
sharex=sharex, sharey=sharey, figsize=figsize,
|
||||
layout=layout)
|
||||
_axes = _flatten(axes)
|
||||
|
||||
for i, col in enumerate(com.try_sort(data.columns)):
|
||||
for i, col in enumerate(com.try_sort(ed_df_bins.columns)):
|
||||
ax = _axes[i]
|
||||
ax.hist(empty_pd_df[col].dropna().values, bins=bins, **kwds)
|
||||
|
||||
# pandas code
|
||||
# pandas / plotting / _core.py: 2410
|
||||
# ax.hist(data[col].dropna().values, bins=bins, **kwds)
|
||||
|
||||
ax.hist(ed_df_bins[col][:-1], bins=ed_df_bins[col], weights=ed_df_weights[col])
|
||||
ax.set_title(col)
|
||||
ax.grid(grid)
|
||||
|
||||
|
@ -68,7 +68,7 @@ class Query:
|
||||
}
|
||||
self._aggs[name] = agg
|
||||
|
||||
def hist_aggs(self, name, field, min_aggs, max_aggs, bins):
|
||||
def hist_aggs(self, name, field, min_aggs, max_aggs, num_bins):
|
||||
"""
|
||||
Add histogram agg e.g.
|
||||
"aggs": {
|
||||
@ -80,6 +80,18 @@ class Query:
|
||||
}
|
||||
}
|
||||
"""
|
||||
min = min_aggs[field]
|
||||
max = max_aggs[field]
|
||||
|
||||
interval = (max - min)/num_bins
|
||||
|
||||
agg = {
|
||||
"histogram": {
|
||||
"field": field,
|
||||
"interval": interval
|
||||
}
|
||||
}
|
||||
self._aggs[name] = agg
|
||||
|
||||
def to_search_body(self):
|
||||
body = {"query": self._query, "aggs": self._aggs}
|
||||
|
@ -370,5 +370,5 @@ class ElandQueryCompiler(BaseQueryCompiler):
|
||||
def describe(self):
|
||||
return self._operations.describe(self)
|
||||
|
||||
def _hist(self, interval):
|
||||
return self._operations.hist(self, interval)
|
||||
def _hist(self, num_bins):
|
||||
return self._operations.hist(self, num_bins)
|
||||
|
@ -4,7 +4,7 @@ from io import StringIO
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameInfo(TestData):
|
||||
class TestDataFrameDescribe(TestData):
|
||||
|
||||
def test_to_describe1(self):
|
||||
pd_flights = self.pd_flights()
|
||||
|
@ -10,7 +10,7 @@ from eland.tests.common import (
|
||||
|
||||
import numpy as np
|
||||
|
||||
class TestDataFrameiLoc(TestData):
|
||||
class TestDataFrameGet(TestData):
|
||||
|
||||
def test_get1(self):
|
||||
ed_flights = self.ed_flights()
|
||||
|
31
eland/tests/dataframe/test_hist_pytest.py
Normal file
31
eland/tests/dataframe/test_hist_pytest.py
Normal file
@ -0,0 +1,31 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.util.testing import assert_almost_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameHist(TestData):
    """Checks eland's histogram bins/weights against numpy's for the flights data."""

    def test_to_hist1(self):
        """Compare `_hist` output for two numeric columns with `np.histogram`."""
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

        columns = ['DistanceKilometers', 'FlightDelayMin']
        num_bins = 10

        # Reference values computed locally with numpy.
        # np.histogram returns a (counts, bin_edges) pair for each column.
        reference = [np.histogram(pd_flights[name], num_bins) for name in columns]

        pd_bins = pd.DataFrame(
            {name: edges for name, (_, edges) in zip(columns, reference)})
        pd_weights = pd.DataFrame(
            {name: counts for name, (counts, _) in zip(columns, reference)})

        # Values under test, computed through eland for the same columns
        ed_bins, ed_weights = ed_flights[columns]._hist(num_bins=num_bins)

        # Numbers are slightly different, so compare approximately
        assert_almost_equal(pd_bins, ed_bins)
        assert_almost_equal(pd_weights, ed_weights)
|
@ -3,7 +3,7 @@
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameInfo(TestData):
|
||||
class TestDataFrameInfoEs(TestData):
|
||||
|
||||
def test_to_info1(self):
|
||||
ed_flights = self.ed_flights()
|
||||
|
@ -6,7 +6,7 @@ from eland.tests.common import TestData
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
|
||||
class TestDataFrameMean(TestData):
|
||||
class TestDataFrameMetrics(TestData):
|
||||
|
||||
def test_to_mean(self):
|
||||
pd_flights = self.pd_flights()
|
||||
|
File diff suppressed because one or more lines are too long
@ -5,6 +5,7 @@ from eland.tests.common import TestData
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
class TestDataFrameHist(TestData):
|
||||
|
||||
@ -14,24 +15,25 @@ class TestDataFrameHist(TestData):
|
||||
pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
|
||||
ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
|
||||
|
||||
"""
|
||||
pd_flights.hist(figsize=[10, 10])
|
||||
ed_flights.hist(figsize=[10, 10])
|
||||
#ed_flights.hist(figsize=[10, 10])
|
||||
|
||||
pd_min = pd_flights['DistanceKilometers'].min()
|
||||
pd_max = pd_flights['DistanceKilometers'].max()
|
||||
|
||||
ed_min = ed_flights['DistanceKilometers'].min()
|
||||
ed_max = ed_flights['DistanceKilometers'].max()
|
||||
#ed_min = ed_flights['DistanceKilometers'].min()
|
||||
#ed_max = ed_flights['DistanceKilometers'].max()
|
||||
|
||||
num_bins = 10.0
|
||||
#num_bins = 10.0
|
||||
|
||||
bins = np.linspace(ed_min, ed_max, num=num_bins+1)
|
||||
#bins = np.linspace(ed_min, ed_max, num=num_bins+1)
|
||||
|
||||
print(bins)
|
||||
#print(bins)
|
||||
|
||||
print(np.diff(bins).mean())
|
||||
#print(np.diff(bins).mean())
|
||||
|
||||
hist = ed_flights['DistanceKilometers'].hist(np.diff(bins).mean())
|
||||
#hist = ed_flights['DistanceKilometers'].hist(np.diff(bins).mean())
|
||||
|
||||
|
||||
x = [2956., 768., 719., 2662., 2934., 1320., 641., 529., 426., 104.]
|
||||
@ -43,4 +45,24 @@ class TestDataFrameHist(TestData):
|
||||
a = bins[0:10]
|
||||
|
||||
print(np.histogram(a, weights=x, bins=bins))
|
||||
#counts, bins = np.histogram(data)
|
||||
#plt.hist(bins[:-1], bins, weights=counts)
|
||||
"""
|
||||
|
||||
h1 = np.histogram(pd_flights['DistanceKilometers'], 10)
|
||||
h2 = np.histogram(pd_flights['FlightDelayMin'], 10)
|
||||
l1 = list(h1[0])
|
||||
l2 = list(h2[0])
|
||||
l1.append(0)
|
||||
l2.append(0)
|
||||
|
||||
d = {'DistanceKilometers': h1[1],
|
||||
'FlightDelayMin': h2[1]}
|
||||
|
||||
df = pd.DataFrame(data=d)
|
||||
|
||||
df.hist(weights=[l1, l2])
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user