Feature/filtered hist (#104)

* Adding python 3.5 compatibility.

Main issue is ordering of dictionaries.

* Updating notebooks with 3.7 results.

* Removing temporary code.

* Defaulting to OrderedDict for python 3.5 + lint all code

All code reformatted by PyCharm and inspection results analysed.

* Adding support for multiple arithmetic operations.

Added new 'arithmetics' file to manage this process.
More tests to be added + cleanup.

* Significant refactor to arithmetics and mappings.

Work in progress. Tests don't pass.

* Major refactor to Mappings.

Field name mappings were stored in different places
(Mappings, QueryCompiler, Operations) and needed to
be kept in sync.

With the addition of complex arithmetic operations
this became complex and difficult to maintain. Therefore,
all field naming is now in 'FieldMappings' which
replaces 'Mappings'.

Note this commit removes the cache for some of the
mapped values and so the code is SIGNIFICANTLY
slower on large indices.

In addition, the addition of date_format to
Mappings has been removed. This again added more
unnecessary complexity.

* Adding OrderedDict for 3.5 compatibility

* Fixes to ordering issues with 3.5

* Adding simple cache for mappings in flatten

Improves performance significantly on large
datasets (>10000 rows).

* Adding updated notebooks (new info_es).

All tests (doc + nbval + pytest) pass.

* Fixing issue with non-zero offset histograms.
This commit is contained in:
stevedodson 2020-01-10 08:17:45 +00:00 committed by GitHub
parent 903fbf0341
commit a3293168a1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 102 additions and 26 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -80,8 +80,6 @@ class FieldMappings:
if (client is None) or (index_pattern is None):
raise ValueError("Can not initialise mapping without client or index_pattern {} {}", client, index_pattern)
# here we keep track of the format of any date fields
self._date_fields_format = dict()
get_mapping = client.get_mapping(index=index_pattern)
# Get all fields (including all nested) and then all field_caps

View File

@ -261,6 +261,7 @@ class Operations:
for field in numeric_source_fields:
body.hist_aggs(field, field, min_aggs, max_aggs, num_bins)
response = query_compiler._client.search(
index=query_compiler._index_pattern,
size=0,

View File

@ -136,15 +136,17 @@ class Query:
max = max_aggs[field]
interval = (max - min) / num_bins
offset = min
agg = {
"histogram": {
"field": field,
"interval": interval
"interval": interval,
"offset": offset
}
}
if not min == max == 0:
if interval != 0:
self._aggs[name] = agg

View File

@ -46,3 +46,29 @@ class TestDataFrameHist(TestData):
# Numbers are slightly different
assert_almost_equal(pd_bins, ed_bins)
assert_almost_equal(pd_weights, ed_weights)
def test_flights_filtered_hist(self):
    """Compare histograms of a filtered eland DataFrame against numpy/pandas.

    Both the pandas and the eland flights frames are restricted to rows
    with ``FlightDelayMin > 0`` before the histograms are computed, so the
    bins are derived from the filtered data rather than the full index.
    NOTE(review): presumably this guards the non-zero histogram offset
    case (filtered minimum above zero) - confirm against the query code.
    """
    pd_flights = self.pd_flights()
    ed_flights = self.ed_flights()

    # Apply the identical row filter on both sides
    pd_flights = pd_flights[pd_flights.FlightDelayMin > 0]
    ed_flights = ed_flights[ed_flights.FlightDelayMin > 0]

    num_bins = 10

    # pandas data: np.histogram returns (counts, bin_edges)
    pd_distancekilometers = np.histogram(pd_flights['DistanceKilometers'], num_bins)
    pd_flightdelaymin = np.histogram(pd_flights['FlightDelayMin'], num_bins)

    pd_bins = pd.DataFrame(
        {'DistanceKilometers': pd_distancekilometers[1], 'FlightDelayMin': pd_flightdelaymin[1]})
    pd_weights = pd.DataFrame(
        {'DistanceKilometers': pd_distancekilometers[0], 'FlightDelayMin': pd_flightdelaymin[0]})

    ed_bins, ed_weights = ed_flights[['DistanceKilometers', 'FlightDelayMin']]._hist(num_bins=num_bins)

    # Numbers are slightly different
    assert_almost_equal(pd_bins, ed_bins)
    assert_almost_equal(pd_weights, ed_weights)

View File

@ -18,7 +18,8 @@ from io import StringIO
import numpy as np
import eland as ed
from eland.tests import FLIGHTS_INDEX_NAME, ES_TEST_CLIENT, FLIGHTS_MAPPING
from eland.tests import FLIGHTS_INDEX_NAME, ES_TEST_CLIENT
from eland.tests.common import TestData

View File

@ -38,3 +38,27 @@ def test_plot_hist(fig_test, fig_ref):
with pytest.warns(UserWarning):
ed_ax = fig_test.subplots()
ed_flights.hist(ax=ed_ax)
@check_figures_equal(extensions=['png'])
def test_plot_filtered_hist(fig_test, fig_ref):
    """Check that a filtered eland DataFrame plots the same histograms as pandas.

    The pandas frame is drawn on ``fig_ref`` and the eland frame on
    ``fig_test``; the ``check_figures_equal`` decorator compares the two
    rendered figures.

    :param fig_test: figure that receives the eland histograms
    :param fig_ref: figure that receives the reference pandas histograms
    """
    test_data = TestData()

    columns = ['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']
    pd_flights = test_data.pd_flights()[columns]
    ed_flights = test_data.ed_flights()[columns]

    # Apply the identical row filter on both sides
    pd_flights = pd_flights[pd_flights.FlightDelayMin > 0]
    ed_flights = ed_flights[ed_flights.FlightDelayMin > 0]

    # This throws a userwarning
    # (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
    with pytest.warns(UserWarning):
        pd_ax = fig_ref.subplots()
        pd_flights.hist(ax=pd_ax)

    # This throws a userwarning
    # (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
    with pytest.warns(UserWarning):
        ed_ax = fig_test.subplots()
        ed_flights.hist(ax=ed_ax)

View File

@ -13,7 +13,6 @@
# limitations under the License.
# File called _pytest for PyCharm compatibility
import pytest
from matplotlib.testing.decorators import check_figures_equal
from eland.tests.common import TestData

View File

@ -40,6 +40,30 @@ class TestSeriesFrameHist(TestData):
ed_bins, ed_weights = ed_flights['FlightDelayMin']._hist(num_bins=num_bins)
# Numbers are slightly different
print(pd_bins, ed_bins)
assert_almost_equal(pd_bins, ed_bins)
assert_almost_equal(pd_weights, ed_weights)
def test_filtered_hist(self):
    """Compare the histogram of a filtered eland Series against numpy/pandas.

    Only rows where ``FlightDelay`` is true are kept, and the histogram of
    ``FlightDelayMin`` over those rows is computed on both sides.
    """
    pd_flights = self.pd_flights()
    ed_flights = self.ed_flights()

    num_bins = 10

    # pandas data: np.histogram returns (counts, bin_edges)
    pd_filteredhist = np.histogram(pd_flights[pd_flights.FlightDelay == True].FlightDelayMin, num_bins)  # noqa: E712

    pd_bins = pd.DataFrame(
        {'FlightDelayMin': pd_filteredhist[1]})
    pd_weights = pd.DataFrame(
        {'FlightDelayMin': pd_filteredhist[0]})

    # Build the filtered series once and reuse it for the histogram.
    # The explicit '== True' comparison is kept as written in the original
    # (presumably it is what produces the eland row filter - TODO confirm).
    ed_filtered = ed_flights[ed_flights.FlightDelay == True].FlightDelayMin  # noqa: E712

    ed_bins, ed_weights = ed_filtered._hist(num_bins=num_bins)

    # Numbers are slightly different
    assert_almost_equal(pd_bins, ed_bins)
    assert_almost_equal(pd_weights, ed_weights)