Feature/filtered hist (#104)

* Adding python 3.5 compatibility. Main issue is ordering of dictionaries. * Updating notebooks with 3.7 results. * Removing temporary code. * Defaulting to OrderedDict for python 3.5 + lint all code. All code reformatted by PyCharm and inspection results analysed. * Adding support for multiple arithmetic operations. Added new 'arithmetics' file to manage this process. More tests to be added + cleanup. * Significant refactor to arithmetics and mappings. Work in progress. Tests don't pass. * Major refactor to Mappings. Field name mappings were stored in different places (Mappings, QueryCompiler, Operations) and needed to be kept in sync. With the addition of complex arithmetic operations this became complex and difficult to maintain. Therefore, all field naming is now in 'FieldMappings' which replaces 'Mappings'. Note this commit removes the cache for some of the mapped values and so the code is SIGNIFICANTLY slower on large indices. In addition, the addition of date_format to Mappings has been removed. This again added more unnecessary complexity. * Adding OrderedDict for 3.5 compatibility * Fixes to ordering issues with 3.5 * Adding simple cache for mappings in flatten. Improves performance significantly on large datasets (>10000 rows). * Adding updated notebooks (new info_es). All tests (doc + nbval + pytest) pass. * Fixing issue with non-zero offset histograms.
2025-07-11 00:02:14 +08:00 · 2020-01-10 08:17:45 +00:00 · 2020-01-10 08:17:45 +00:00 · a3293168a1
commit a3293168a1
parent 903fbf0341
10 changed files with 102 additions and 26 deletions
--- a/docs/source/examples/demo_notebook.ipynb
+++ b/docs/source/examples/demo_notebook.ipynb
--- a/docs/source/examples/online_retail_analysis.ipynb
+++ b/docs/source/examples/online_retail_analysis.ipynb
--- a/eland/field_mappings.py
+++ b/eland/field_mappings.py
@ -80,8 +80,6 @@ class FieldMappings:
        if (client is None) or (index_pattern is None):
            raise ValueError("Can not initialise mapping without client or index_pattern {} {}", client, index_pattern)

-        # here we keep track of the format of any date fields
-        self._date_fields_format = dict()
        get_mapping = client.get_mapping(index=index_pattern)

        # Get all fields (including all nested) and then all field_caps
--- a/eland/operations.py
+++ b/eland/operations.py
@ -261,6 +261,7 @@ class Operations:

        for field in numeric_source_fields:
            body.hist_aggs(field, field, min_aggs, max_aggs, num_bins)
+
        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
--- a/eland/query.py
+++ b/eland/query.py
@ -136,15 +136,17 @@ class Query:
        max = max_aggs[field]

        interval = (max - min) / num_bins
+        offset = min

        agg = {
            "histogram": {
                "field": field,
-                "interval": interval
+                "interval": interval,
+                "offset": offset
            }
        }

-        if not min == max == 0:
+        if interval != 0:
            self._aggs[name] = agg


--- a/eland/tests/dataframe/test_hist_pytest.py
+++ b/eland/tests/dataframe/test_hist_pytest.py
@ -46,3 +46,29 @@ class TestDataFrameHist(TestData):
        # Numbers are slightly different
        assert_almost_equal(pd_bins, ed_bins)
        assert_almost_equal(pd_weights, ed_weights)
+
+    def test_flights_filtered_hist(self):
+        pd_flights = self.pd_flights()
+        ed_flights = self.ed_flights()
+
+        pd_flights = pd_flights[pd_flights.FlightDelayMin > 0]
+        ed_flights = ed_flights[ed_flights.FlightDelayMin > 0]
+
+        num_bins = 10
+
+        # pandas data
+        pd_distancekilometers = np.histogram(pd_flights['DistanceKilometers'], num_bins)
+        pd_flightdelaymin = np.histogram(pd_flights['FlightDelayMin'], num_bins)
+
+        pd_bins = pd.DataFrame(
+            {'DistanceKilometers': pd_distancekilometers[1], 'FlightDelayMin': pd_flightdelaymin[1]})
+        pd_weights = pd.DataFrame(
+            {'DistanceKilometers': pd_distancekilometers[0], 'FlightDelayMin': pd_flightdelaymin[0]})
+
+        t = ed_flights[['DistanceKilometers', 'FlightDelayMin']]
+
+        ed_bins, ed_weights = ed_flights[['DistanceKilometers', 'FlightDelayMin']]._hist(num_bins=num_bins)
+
+        # Numbers are slightly different
+        assert_almost_equal(pd_bins, ed_bins)
+        assert_almost_equal(pd_weights, ed_weights)
--- a/eland/tests/field_mappings/test_scripted_fields_pytest.py
+++ b/eland/tests/field_mappings/test_scripted_fields_pytest.py
@ -18,7 +18,8 @@ from io import StringIO
 import numpy as np

 import eland as ed
-from eland.tests import FLIGHTS_INDEX_NAME, ES_TEST_CLIENT, FLIGHTS_MAPPING
+from eland.tests import FLIGHTS_INDEX_NAME, ES_TEST_CLIENT
+
 from eland.tests.common import TestData


--- a/eland/tests/plotting/test_dataframe_hist_pytest.py
+++ b/eland/tests/plotting/test_dataframe_hist_pytest.py
@ -38,3 +38,27 @@ def test_plot_hist(fig_test, fig_ref):
    with pytest.warns(UserWarning):
        ed_ax = fig_test.subplots()
        ed_flights.hist(ax=ed_ax)
+
+@check_figures_equal(extensions=['png'])
+def test_plot_filtered_hist(fig_test, fig_ref):
+    test_data = TestData()
+
+    pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
+    ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
+
+    pd_flights = pd_flights[pd_flights.FlightDelayMin > 0]
+    ed_flights = ed_flights[ed_flights.FlightDelayMin > 0]
+
+    print(ed_flights.head())
+
+    # This throws a userwarning
+    # (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
+    with pytest.warns(UserWarning):
+        pd_ax = fig_ref.subplots()
+        pd_flights.hist(ax=pd_ax)
+
+    # This throws a userwarning
+    # (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
+    with pytest.warns(UserWarning):
+        ed_ax = fig_test.subplots()
+        ed_flights.hist(ax=ed_ax)
--- a/eland/tests/plotting/test_series_hist_pytest.py
+++ b/eland/tests/plotting/test_series_hist_pytest.py
@ -13,7 +13,6 @@
 #      limitations under the License.

 # File called _pytest for PyCharm compatability
-import pytest
 from matplotlib.testing.decorators import check_figures_equal

 from eland.tests.common import TestData
--- a/eland/tests/series/test_hist_pytest.py
+++ b/eland/tests/series/test_hist_pytest.py
@ -40,6 +40,30 @@ class TestSeriesFrameHist(TestData):

        ed_bins, ed_weights = ed_flights['FlightDelayMin']._hist(num_bins=num_bins)

+        # Numbers are slightly different
+        print(pd_bins, ed_bins)
+        assert_almost_equal(pd_bins, ed_bins)
+        assert_almost_equal(pd_weights, ed_weights)
+
+    def test_filtered_hist(self):
+        pd_flights = self.pd_flights()
+        ed_flights = self.ed_flights()
+
+        num_bins = 10
+
+        # pandas data
+        pd_filteredhist = np.histogram(pd_flights[pd_flights.FlightDelay == True].FlightDelayMin, num_bins)
+
+        pd_bins = pd.DataFrame(
+            {'FlightDelayMin': pd_filteredhist[1]})
+        pd_weights = pd.DataFrame(
+            {'FlightDelayMin': pd_filteredhist[0]})
+
+        d = ed_flights[ed_flights.FlightDelay == True].FlightDelayMin
+        print(d.info_es())
+
+        ed_bins, ed_weights = ed_flights[ed_flights.FlightDelay == True].FlightDelayMin._hist(num_bins=num_bins)
+
        # Numbers are slightly different
        assert_almost_equal(pd_bins, ed_bins)
        assert_almost_equal(pd_weights, ed_weights)