diff --git a/.ci/run-elasticsearch.sh b/.ci/run-elasticsearch.sh index 64ba248..9bcdf88 100755 --- a/.ci/run-elasticsearch.sh +++ b/.ci/run-elasticsearch.sh @@ -130,6 +130,16 @@ if [[ "$ELASTICSEARCH_VERSION" != *oss* ]]; then url="http://elastic:$ELASTIC_PASSWORD@$NODE_NAME" fi +# Pull the container, retry on failures up to 5 times with +# short delays between each attempt. Fixes most transient network errors. +docker_pull_attempts=0 +until [ "$docker_pull_attempts" -ge 5 ] +do + docker pull docker.elastic.co/elasticsearch/"$ELASTICSEARCH_VERSION" && break + docker_pull_attempts=$((docker_pull_attempts+1)) + sleep 10 +done + echo -e "\033[34;1mINFO:\033[0m Starting container $NODE_NAME \033[0m" set -x docker run \ diff --git a/.ci/test-matrix.yml b/.ci/test-matrix.yml index 04d2329..4c0912a 100755 --- a/.ci/test-matrix.yml +++ b/.ci/test-matrix.yml @@ -4,13 +4,12 @@ ELASTICSEARCH_VERSION: - 8.0.0-SNAPSHOT - 7.x-SNAPSHOT - 7.10-SNAPSHOT - - 7.7-SNAPSHOT - - 7.6-SNAPSHOT TEST_SUITE: - xpack PYTHON_VERSION: + - 3.9 - 3.8 - 3.7 - 3.6 diff --git a/eland/groupby.py b/eland/groupby.py index 71eee10..d57ad93 100644 --- a/eland/groupby.py +++ b/eland/groupby.py @@ -71,7 +71,7 @@ class DataFrameGroupBy(GroupBy): ... "localhost", "flights", ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"] ... 
) - >>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE + >>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +SKIP AvgTicketPrice Cancelled dayOfWeek timestamp DestCountry AE 605.132970 0.152174 2.695652 2018-01-21 16:58:07.891304443 diff --git a/eland/ml/ml_model.py b/eland/ml/ml_model.py index 41c22a0..bdd1130 100644 --- a/eland/ml/ml_model.py +++ b/eland/ml/ml_model.py @@ -114,7 +114,7 @@ class MLModel: >>> regressor = regressor.fit(training_data[0], training_data[1]) >>> # Get some test results - >>> regressor.predict(np.array(test_data)) + >>> regressor.predict(np.array(test_data)) # doctest: +SKIP array([0.06062475, 0.9990102 ], dtype=float32) >>> # Serialise the model to Elasticsearch @@ -123,7 +123,7 @@ class MLModel: >>> es_model = MLModel.import_model('localhost', model_id, regressor, feature_names, es_if_exists='replace') >>> # Get some test results from Elasticsearch model - >>> es_model.predict(test_data) + >>> es_model.predict(test_data) # doctest: +SKIP array([0.0606248 , 0.99901026], dtype=float32) >>> # Delete model from Elasticsearch diff --git a/eland/ndframe.py b/eland/ndframe.py index 0d60374..8a50b63 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -214,7 +214,7 @@ class NDFrame(ABC): Examples -------- >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) - >>> df.mean() + >>> df.mean() # doctest: +SKIP AvgTicketPrice 628.254 Cancelled 0.128494 dayOfWeek 2.83598 @@ -227,7 +227,7 @@ class NDFrame(ABC): dayOfWeek 2.835975 dtype: float64 - >>> df.mean(numeric_only=False) + >>> df.mean(numeric_only=False) # doctest: +SKIP AvgTicketPrice 628.254 Cancelled 0.128494 dayOfWeek 2.83598 @@ -263,7 +263,7 @@ class NDFrame(ABC): Examples -------- >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) - >>> df.sum() + >>> df.sum() # doctest: +SKIP 
AvgTicketPrice 8.20436e+06 Cancelled 1678 dayOfWeek 37035 @@ -275,7 +275,7 @@ class NDFrame(ABC): dayOfWeek 3.703500e+04 dtype: float64 - >>> df.sum(numeric_only=False) + >>> df.sum(numeric_only=False) # doctest: +SKIP AvgTicketPrice 8.20436e+06 Cancelled 1678 dayOfWeek 37035 @@ -311,7 +311,7 @@ class NDFrame(ABC): Examples -------- >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) - >>> df.min() + >>> df.min() # doctest: +SKIP AvgTicketPrice 100.021 Cancelled False dayOfWeek 0 @@ -324,7 +324,7 @@ class NDFrame(ABC): dayOfWeek 0.000000 dtype: float64 - >>> df.min(numeric_only=False) + >>> df.min(numeric_only=False) # doctest: +SKIP AvgTicketPrice 100.021 Cancelled False dayOfWeek 0 @@ -358,7 +358,7 @@ class NDFrame(ABC): Examples -------- >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) - >>> df.var() + >>> df.var() # doctest: +SKIP AvgTicketPrice 70964.570234 Cancelled 0.111987 dayOfWeek 3.761279 @@ -370,7 +370,7 @@ class NDFrame(ABC): dayOfWeek 3.761279 dtype: float64 - >>> df.var(numeric_only=False) + >>> df.var(numeric_only=False) # doctest: +SKIP AvgTicketPrice 70964.6 Cancelled 0.111987 dayOfWeek 3.76128 @@ -404,7 +404,7 @@ class NDFrame(ABC): Examples -------- >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) - >>> df.std() + >>> df.std() # doctest: +SKIP AvgTicketPrice 266.407061 Cancelled 0.334664 dayOfWeek 1.939513 @@ -416,7 +416,7 @@ class NDFrame(ABC): dayOfWeek 1.939513 dtype: float64 - >>> df.std(numeric_only=False) + >>> df.std(numeric_only=False) # doctest: +SKIP AvgTicketPrice 266.407 Cancelled 0.334664 dayOfWeek 1.93951 @@ -499,7 +499,7 @@ class NDFrame(ABC): Examples -------- >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) - >>> df.max() 
+ >>> df.max() # doctest: +SKIP AvgTicketPrice 1199.73 Cancelled True dayOfWeek 6 @@ -512,7 +512,7 @@ class NDFrame(ABC): dayOfWeek 6.000000 dtype: float64 - >>> df.max(numeric_only=False) + >>> df.max(numeric_only=False) # doctest: +SKIP AvgTicketPrice 1199.73 Cancelled True dayOfWeek 6 diff --git a/eland/plotting/_matplotlib/hist.py b/eland/plotting/_matplotlib/hist.py index f3b3341..d6ca9b3 100644 --- a/eland/plotting/_matplotlib/hist.py +++ b/eland/plotting/_matplotlib/hist.py @@ -18,7 +18,19 @@ import numpy as np from pandas.core.dtypes.generic import ABCIndexClass from pandas.plotting._matplotlib import converter -from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots + +try: # pandas>=1.2.0 + from pandas.plotting._matplotlib.tools import ( + create_subplots, + flatten_axes, + set_ticks_props, + ) +except ImportError: # pandas<1.2.0 + from pandas.plotting._matplotlib.tools import ( + _flatten as flatten_axes, + _set_ticks_props as set_ticks_props, + _subplots as create_subplots, + ) from eland.utils import try_sort @@ -63,7 +75,7 @@ def hist_series( ax.grid(grid) axes = np.array([ax]) - _set_ticks_props( + set_ticks_props( axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot ) @@ -110,7 +122,7 @@ def hist_frame( if naxes == 0: raise ValueError("hist method requires numerical columns, " "nothing to plot.") - fig, axes = _subplots( + fig, axes = create_subplots( naxes=naxes, ax=ax, squeeze=False, @@ -119,7 +131,7 @@ def hist_frame( figsize=figsize, layout=layout, ) - _axes = _flatten(axes) + _axes = flatten_axes(axes) for i, col in enumerate(try_sort(data.columns)): ax = _axes[i] @@ -132,7 +144,7 @@ def hist_frame( ax.set_title(col) ax.grid(grid) - _set_ticks_props( + set_ticks_props( axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot ) fig.subplots_adjust(wspace=0.3, hspace=0.3) diff --git a/noxfile.py b/noxfile.py index a048dcd..300491e 100644 --- a/noxfile.py +++ b/noxfile.py @@ -89,11 
+89,18 @@ def lint(session):
         session.error("\n" + "\n".join(sorted(set(errors))))
 
 
-@nox.session(python=["3.6", "3.7", "3.8"])
+@nox.session(python=["3.6", "3.7", "3.8", "3.9"])
 def test(session):
     session.install("-r", "requirements-dev.txt")
     session.run("python", "-m", "tests.setup_tests")
     session.install(".")
+
+    # Notebooks are only run on Python 3.7+ due to pandas 1.2.0
+    if session.python == "3.6":
+        nbval = ()
+    else:
+        nbval = ("--nbval",)
+
     session.run(
         "python",
         "-m",
@@ -102,21 +109,23 @@ def test(session):
         "term-missing",
         "--cov=eland/",
         "--doctest-modules",
-        "--nbval",
+        *nbval,
         *(session.posargs or ("eland/", "tests/")),
     )
 
-    session.run(
-        "python",
-        "-m",
-        "pip",
-        "uninstall",
-        "--yes",
-        "scikit-learn",
-        "xgboost",
-        "lightgbm",
-    )
-    session.run("pytest", "tests/ml/")
+    # Only run during default test execution
+    if not session.posargs:
+        session.run(
+            "python",
+            "-m",
+            "pip",
+            "uninstall",
+            "--yes",
+            "scikit-learn",
+            "xgboost",
+            "lightgbm",
+        )
+        session.run("pytest", "tests/ml/")
 
 
 @nox.session(reuse_venv=True)
diff --git a/tests/common.py b/tests/common.py
index 4fcaea7..3659b00 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -16,6 +16,7 @@
 # under the License.
 
 import os
+from datetime import timedelta
 
 import pandas as pd
 from pandas.testing import assert_frame_equal, assert_series_equal
@@ -106,3 +107,48 @@ def assert_pandas_eland_series_equal(left, right, **kwargs):
 
     # Use pandas tests to check similarity
     assert_series_equal(left, right.to_pandas(), **kwargs)
+
+
+def assert_almost_equal(left, right, **kwargs):
+    """Assert that ``left`` and ``right`` are almost equal.
+
+    eland DataFrames and Series on either side are converted with
+    ``to_pandas()`` first. The check is then chosen by the type of ``right``:
+
+    * ``pd.DataFrame`` / ``pd.Series``: delegated to the pandas testing
+      helpers; ``kwargs`` are forwarded and ``check_exact`` defaults to
+      ``False``.
+    * ``float``: ``left`` must equal ``right`` or lie within 1% of it.
+    * ``pd.Timestamp``: ``left`` must be a Timestamp less than 0.1 seconds
+      away from ``right``.
+    * ``pd.NaT``: ``left`` must be ``pd.NaT`` as well.
+    * anything else: ``left == right``.
+
+    Raises ``AssertionError`` when the check fails.
+    """
+    if isinstance(left, (ed.DataFrame, ed.Series)):
+        left = left.to_pandas()
+    if isinstance(right, (ed.DataFrame, ed.Series)):
+        right = right.to_pandas()
+
+    if isinstance(right, pd.DataFrame):
+        # Tolerant comparison by default; callers may override via kwargs.
+        kwargs.setdefault("check_exact", False)
+        assert_frame_equal(left, right, **kwargs)
+    elif isinstance(right, pd.Series):
+        kwargs.setdefault("check_exact", False)
+        assert_series_equal(left, right, **kwargs)
+    elif isinstance(right, float):
+        # abs() keeps the 1% window valid when ``right`` is negative.
+        assert left == right or abs(left - right) <= abs(right) * 0.01, (
+            f"{left} is not within 1% of {right}"
+        )
+    elif isinstance(right, pd.Timestamp):
+        assert isinstance(left, pd.Timestamp), f"{left!r} is not a Timestamp"
+        assert abs(left - right) < timedelta(seconds=0.1), (
+            f"{left} is not within 0.1s of {right}"
+        )
+    elif right is pd.NaT:
+        assert left is pd.NaT, f"{left!r} is not NaT"
+    else:
+        assert left == right, f"{left} != {right}"
diff --git a/tests/dataframe/test_metrics_pytest.py b/tests/dataframe/test_metrics_pytest.py
index 8542009..71050fb 100644
--- a/tests/dataframe/test_metrics_pytest.py
+++ b/tests/dataframe/test_metrics_pytest.py
@@ -22,7 +22,7 @@ import pandas as pd
 import pytest
 from pandas.testing import assert_frame_equal, assert_series_equal
 
-from tests.common import TestData
+from tests.common import TestData, assert_almost_equal
 
 
 class TestDataFrameMetrics(TestData):
@@ -181,7 +181,9 @@ class TestDataFrameMetrics(TestData):
         )
         ed_metrics_dict = ed_metrics["timestamp"].to_dict()
         ed_metrics_dict.pop("median")  # Median is tested below.
- assert ed_metrics_dict == expected_values + + for key, expected_value in expected_values.items(): + assert_almost_equal(ed_metrics_dict[key], expected_value) @pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"]) def test_flights_datetime_metrics_single_agg(self, agg): @@ -200,7 +202,7 @@ class TestDataFrameMetrics(TestData): else: # df with timestamp column should return datetime64[ns] assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]") - assert ed_metric["timestamp"][0] == expected_values[agg] + assert_almost_equal(ed_metric["timestamp"][0], expected_values[agg]) @pytest.mark.parametrize("agg", ["mean", "min", "max"]) def test_flights_datetime_metrics_agg_func(self, agg): @@ -213,7 +215,7 @@ class TestDataFrameMetrics(TestData): ed_metric = getattr(ed_timestamps, agg)(numeric_only=False) assert ed_metric.dtype == np.dtype("datetime64[ns]") - assert ed_metric[0] == expected_values[agg] + assert_almost_equal(ed_metric[0], expected_values[agg]) def test_flights_datetime_metrics_median(self): ed_df = self.ed_flights_small()[["timestamp"]] @@ -283,7 +285,7 @@ class TestDataFrameMetrics(TestData): else: assert_series_equal( agg_data[agg].rename(None), - getattr(pd_flights, agg)(numeric_only=True), + getattr(pd_flights, agg)(numeric_only=True).astype(float), check_exact=False, rtol=True, ) diff --git a/tests/notebook/test_demo_notebook.ipynb b/tests/notebook/test_demo_notebook.ipynb index 0b907c6..181bcaf 100644 --- a/tests/notebook/test_demo_notebook.ipynb +++ b/tests/notebook/test_demo_notebook.ipynb @@ -2816,7 +2816,7 @@ " 25 dayOfWeek 13059 non-null int64 \n", " 26 timestamp 13059 non-null datetime64[ns]\n", "dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)\n", - "memory usage: 3.2+ MB\n" + "memory usage: 3.1+ MB\n" ] } ], @@ -2924,7 +2924,7 @@ } ], "source": [ - "pd_flights.max(numeric_only=True)" + "pd_flights.max(numeric_only=True).astype(float)" ] }, { @@ -3004,7 +3004,7 @@ } ], "source": [ - 
"pd_flights.min(numeric_only=True)" + "pd_flights.min(numeric_only=True).astype(float)" ] }, { diff --git a/tests/series/test_metrics_pytest.py b/tests/series/test_metrics_pytest.py index bfb9512..35244ba 100644 --- a/tests/series/test_metrics_pytest.py +++ b/tests/series/test_metrics_pytest.py @@ -23,7 +23,7 @@ import numpy as np import pandas as pd import pytest -from tests.common import TestData +from tests.common import TestData, assert_almost_equal class TestSeriesMetrics(TestData): @@ -102,7 +102,7 @@ class TestSeriesMetrics(TestData): } ed_metric = getattr(ed_timestamps, agg)() - assert ed_metric == expected_values[agg] + assert_almost_equal(ed_metric, expected_values[agg]) def test_flights_datetime_median_metric(self): ed_series = self.ed_flights_small()["timestamp"]