mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Feature/filtered hist (#104)
* Adding python 3.5 compatibility. Main issue is ordering of dictionaries. * Updating notebooks with 3.7 results. * Removing temporary code. * Defaulting to OrderedDict for python 3.5 + lint all code. All code reformatted by PyCharm and inspection results analysed. * Adding support for multiple arithmetic operations. Added new 'arithmetics' file to manage this process. More tests to be added + cleanup. * Significant refactor to arithmetics and mappings. Work in progress. Tests don't pass. * Major refactor to Mappings. Field name mappings were stored in different places (Mappings, QueryCompiler, Operations) and needed to be kept in sync. With the addition of complex arithmetic operations this became complex and difficult to maintain. Therefore, all field naming is now in 'FieldMappings' which replaces 'Mappings'. Note this commit removes the cache for some of the mapped values and so the code is SIGNIFICANTLY slower on large indices. In addition, the addition of date_format to Mappings has been removed. This again added more unnecessary complexity. * Adding OrderedDict for 3.5 compatibility * Fixes to ordering issues with 3.5 * Adding simple cache for mappings in flatten. Improves performance significantly on large datasets (>10000 rows). * Adding updated notebooks (new info_es). All tests (doc + nbval + pytest) pass. * Fixing issue with non-zero offset histograms.
This commit is contained in:
parent
903fbf0341
commit
a3293168a1
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -80,8 +80,6 @@ class FieldMappings:
|
|||||||
if (client is None) or (index_pattern is None):
|
if (client is None) or (index_pattern is None):
|
||||||
raise ValueError("Can not initialise mapping without client or index_pattern {} {}", client, index_pattern)
|
raise ValueError("Can not initialise mapping without client or index_pattern {} {}", client, index_pattern)
|
||||||
|
|
||||||
# here we keep track of the format of any date fields
|
|
||||||
self._date_fields_format = dict()
|
|
||||||
get_mapping = client.get_mapping(index=index_pattern)
|
get_mapping = client.get_mapping(index=index_pattern)
|
||||||
|
|
||||||
# Get all fields (including all nested) and then all field_caps
|
# Get all fields (including all nested) and then all field_caps
|
||||||
|
@ -261,6 +261,7 @@ class Operations:
|
|||||||
|
|
||||||
for field in numeric_source_fields:
|
for field in numeric_source_fields:
|
||||||
body.hist_aggs(field, field, min_aggs, max_aggs, num_bins)
|
body.hist_aggs(field, field, min_aggs, max_aggs, num_bins)
|
||||||
|
|
||||||
response = query_compiler._client.search(
|
response = query_compiler._client.search(
|
||||||
index=query_compiler._index_pattern,
|
index=query_compiler._index_pattern,
|
||||||
size=0,
|
size=0,
|
||||||
|
@ -136,15 +136,17 @@ class Query:
|
|||||||
max = max_aggs[field]
|
max = max_aggs[field]
|
||||||
|
|
||||||
interval = (max - min) / num_bins
|
interval = (max - min) / num_bins
|
||||||
|
offset = min
|
||||||
|
|
||||||
agg = {
|
agg = {
|
||||||
"histogram": {
|
"histogram": {
|
||||||
"field": field,
|
"field": field,
|
||||||
"interval": interval
|
"interval": interval,
|
||||||
|
"offset": offset
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if not min == max == 0:
|
if interval != 0:
|
||||||
self._aggs[name] = agg
|
self._aggs[name] = agg
|
||||||
|
|
||||||
|
|
||||||
|
@ -46,3 +46,29 @@ class TestDataFrameHist(TestData):
|
|||||||
# Numbers are slightly different
|
# Numbers are slightly different
|
||||||
assert_almost_equal(pd_bins, ed_bins)
|
assert_almost_equal(pd_bins, ed_bins)
|
||||||
assert_almost_equal(pd_weights, ed_weights)
|
assert_almost_equal(pd_weights, ed_weights)
|
||||||
|
|
||||||
|
def test_flights_filtered_hist(self):
|
||||||
|
pd_flights = self.pd_flights()
|
||||||
|
ed_flights = self.ed_flights()
|
||||||
|
|
||||||
|
pd_flights = pd_flights[pd_flights.FlightDelayMin > 0]
|
||||||
|
ed_flights = ed_flights[ed_flights.FlightDelayMin > 0]
|
||||||
|
|
||||||
|
num_bins = 10
|
||||||
|
|
||||||
|
# pandas data
|
||||||
|
pd_distancekilometers = np.histogram(pd_flights['DistanceKilometers'], num_bins)
|
||||||
|
pd_flightdelaymin = np.histogram(pd_flights['FlightDelayMin'], num_bins)
|
||||||
|
|
||||||
|
pd_bins = pd.DataFrame(
|
||||||
|
{'DistanceKilometers': pd_distancekilometers[1], 'FlightDelayMin': pd_flightdelaymin[1]})
|
||||||
|
pd_weights = pd.DataFrame(
|
||||||
|
{'DistanceKilometers': pd_distancekilometers[0], 'FlightDelayMin': pd_flightdelaymin[0]})
|
||||||
|
|
||||||
|
t = ed_flights[['DistanceKilometers', 'FlightDelayMin']]
|
||||||
|
|
||||||
|
ed_bins, ed_weights = ed_flights[['DistanceKilometers', 'FlightDelayMin']]._hist(num_bins=num_bins)
|
||||||
|
|
||||||
|
# Numbers are slightly different
|
||||||
|
assert_almost_equal(pd_bins, ed_bins)
|
||||||
|
assert_almost_equal(pd_weights, ed_weights)
|
||||||
|
@ -18,7 +18,8 @@ from io import StringIO
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
import eland as ed
|
import eland as ed
|
||||||
from eland.tests import FLIGHTS_INDEX_NAME, ES_TEST_CLIENT, FLIGHTS_MAPPING
|
from eland.tests import FLIGHTS_INDEX_NAME, ES_TEST_CLIENT
|
||||||
|
|
||||||
from eland.tests.common import TestData
|
from eland.tests.common import TestData
|
||||||
|
|
||||||
|
|
||||||
|
@ -38,3 +38,27 @@ def test_plot_hist(fig_test, fig_ref):
|
|||||||
with pytest.warns(UserWarning):
|
with pytest.warns(UserWarning):
|
||||||
ed_ax = fig_test.subplots()
|
ed_ax = fig_test.subplots()
|
||||||
ed_flights.hist(ax=ed_ax)
|
ed_flights.hist(ax=ed_ax)
|
||||||
|
|
||||||
|
@check_figures_equal(extensions=['png'])
|
||||||
|
def test_plot_filtered_hist(fig_test, fig_ref):
|
||||||
|
test_data = TestData()
|
||||||
|
|
||||||
|
pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
|
||||||
|
ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
|
||||||
|
|
||||||
|
pd_flights = pd_flights[pd_flights.FlightDelayMin > 0]
|
||||||
|
ed_flights = ed_flights[ed_flights.FlightDelayMin > 0]
|
||||||
|
|
||||||
|
print(ed_flights.head())
|
||||||
|
|
||||||
|
# This throws a userwarning
|
||||||
|
# (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
|
||||||
|
with pytest.warns(UserWarning):
|
||||||
|
pd_ax = fig_ref.subplots()
|
||||||
|
pd_flights.hist(ax=pd_ax)
|
||||||
|
|
||||||
|
# This throws a userwarning
|
||||||
|
# (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
|
||||||
|
with pytest.warns(UserWarning):
|
||||||
|
ed_ax = fig_test.subplots()
|
||||||
|
ed_flights.hist(ax=ed_ax)
|
||||||
|
@ -13,7 +13,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
# File called _pytest for PyCharm compatability
|
# File called _pytest for PyCharm compatability
|
||||||
import pytest
|
|
||||||
from matplotlib.testing.decorators import check_figures_equal
|
from matplotlib.testing.decorators import check_figures_equal
|
||||||
|
|
||||||
from eland.tests.common import TestData
|
from eland.tests.common import TestData
|
||||||
|
@ -40,6 +40,30 @@ class TestSeriesFrameHist(TestData):
|
|||||||
|
|
||||||
ed_bins, ed_weights = ed_flights['FlightDelayMin']._hist(num_bins=num_bins)
|
ed_bins, ed_weights = ed_flights['FlightDelayMin']._hist(num_bins=num_bins)
|
||||||
|
|
||||||
|
# Numbers are slightly different
|
||||||
|
print(pd_bins, ed_bins)
|
||||||
|
assert_almost_equal(pd_bins, ed_bins)
|
||||||
|
assert_almost_equal(pd_weights, ed_weights)
|
||||||
|
|
||||||
|
def test_filtered_hist(self):
|
||||||
|
pd_flights = self.pd_flights()
|
||||||
|
ed_flights = self.ed_flights()
|
||||||
|
|
||||||
|
num_bins = 10
|
||||||
|
|
||||||
|
# pandas data
|
||||||
|
pd_filteredhist = np.histogram(pd_flights[pd_flights.FlightDelay == True].FlightDelayMin, num_bins)
|
||||||
|
|
||||||
|
pd_bins = pd.DataFrame(
|
||||||
|
{'FlightDelayMin': pd_filteredhist[1]})
|
||||||
|
pd_weights = pd.DataFrame(
|
||||||
|
{'FlightDelayMin': pd_filteredhist[0]})
|
||||||
|
|
||||||
|
d = ed_flights[ed_flights.FlightDelay == True].FlightDelayMin
|
||||||
|
print(d.info_es())
|
||||||
|
|
||||||
|
ed_bins, ed_weights = ed_flights[ed_flights.FlightDelay == True].FlightDelayMin._hist(num_bins=num_bins)
|
||||||
|
|
||||||
# Numbers are slightly different
|
# Numbers are slightly different
|
||||||
assert_almost_equal(pd_bins, ed_bins)
|
assert_almost_equal(pd_bins, ed_bins)
|
||||||
assert_almost_equal(pd_weights, ed_weights)
|
assert_almost_equal(pd_weights, ed_weights)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user