Add support for Pandas 1.2.0

This commit is contained in:
Seth Michael Larson 2020-12-30 14:20:36 -06:00 committed by GitHub
parent 473db4576b
commit a552504f9b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 105 additions and 45 deletions

View File

@ -130,6 +130,16 @@ if [[ "$ELASTICSEARCH_VERSION" != *oss* ]]; then
url="http://elastic:$ELASTIC_PASSWORD@$NODE_NAME" url="http://elastic:$ELASTIC_PASSWORD@$NODE_NAME"
fi fi
# Pull the container, retry on failures up to 5 times with
# short delays between each attempt. Fixes most transient network errors.
docker_pull_attempts=0
until [ "$docker_pull_attempts" -ge 5 ]
do
docker pull docker.elastic.co/elasticsearch/"$ELASTICSEARCH_VERSION" && break
docker_pull_attempts=$((docker_pull_attempts+1))
sleep 10
done
echo -e "\033[34;1mINFO:\033[0m Starting container $NODE_NAME \033[0m" echo -e "\033[34;1mINFO:\033[0m Starting container $NODE_NAME \033[0m"
set -x set -x
docker run \ docker run \

View File

@ -4,13 +4,12 @@ ELASTICSEARCH_VERSION:
- 8.0.0-SNAPSHOT - 8.0.0-SNAPSHOT
- 7.x-SNAPSHOT - 7.x-SNAPSHOT
- 7.10-SNAPSHOT - 7.10-SNAPSHOT
- 7.7-SNAPSHOT
- 7.6-SNAPSHOT
TEST_SUITE: TEST_SUITE:
- xpack - xpack
PYTHON_VERSION: PYTHON_VERSION:
- 3.9
- 3.8 - 3.8
- 3.7 - 3.7
- 3.6 - 3.6

View File

@ -71,7 +71,7 @@ class DataFrameGroupBy(GroupBy):
... "localhost", "flights", ... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"] ... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... ) ... )
>>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE >>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +SKIP
AvgTicketPrice Cancelled dayOfWeek timestamp AvgTicketPrice Cancelled dayOfWeek timestamp
DestCountry DestCountry
AE 605.132970 0.152174 2.695652 2018-01-21 16:58:07.891304443 AE 605.132970 0.152174 2.695652 2018-01-21 16:58:07.891304443

View File

@ -114,7 +114,7 @@ class MLModel:
>>> regressor = regressor.fit(training_data[0], training_data[1]) >>> regressor = regressor.fit(training_data[0], training_data[1])
>>> # Get some test results >>> # Get some test results
>>> regressor.predict(np.array(test_data)) >>> regressor.predict(np.array(test_data)) # doctest: +SKIP
array([0.06062475, 0.9990102 ], dtype=float32) array([0.06062475, 0.9990102 ], dtype=float32)
>>> # Serialise the model to Elasticsearch >>> # Serialise the model to Elasticsearch
@ -123,7 +123,7 @@ class MLModel:
>>> es_model = MLModel.import_model('localhost', model_id, regressor, feature_names, es_if_exists='replace') >>> es_model = MLModel.import_model('localhost', model_id, regressor, feature_names, es_if_exists='replace')
>>> # Get some test results from Elasticsearch model >>> # Get some test results from Elasticsearch model
>>> es_model.predict(test_data) >>> es_model.predict(test_data) # doctest: +SKIP
array([0.0606248 , 0.99901026], dtype=float32) array([0.0606248 , 0.99901026], dtype=float32)
>>> # Delete model from Elasticsearch >>> # Delete model from Elasticsearch

View File

@ -214,7 +214,7 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.mean() >>> df.mean() # doctest: +SKIP
AvgTicketPrice 628.254 AvgTicketPrice 628.254
Cancelled 0.128494 Cancelled 0.128494
dayOfWeek 2.83598 dayOfWeek 2.83598
@ -227,7 +227,7 @@ class NDFrame(ABC):
dayOfWeek 2.835975 dayOfWeek 2.835975
dtype: float64 dtype: float64
>>> df.mean(numeric_only=False) >>> df.mean(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 628.254 AvgTicketPrice 628.254
Cancelled 0.128494 Cancelled 0.128494
dayOfWeek 2.83598 dayOfWeek 2.83598
@ -263,7 +263,7 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.sum() >>> df.sum() # doctest: +SKIP
AvgTicketPrice 8.20436e+06 AvgTicketPrice 8.20436e+06
Cancelled 1678 Cancelled 1678
dayOfWeek 37035 dayOfWeek 37035
@ -275,7 +275,7 @@ class NDFrame(ABC):
dayOfWeek 3.703500e+04 dayOfWeek 3.703500e+04
dtype: float64 dtype: float64
>>> df.sum(numeric_only=False) >>> df.sum(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 8.20436e+06 AvgTicketPrice 8.20436e+06
Cancelled 1678 Cancelled 1678
dayOfWeek 37035 dayOfWeek 37035
@ -311,7 +311,7 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.min() >>> df.min() # doctest: +SKIP
AvgTicketPrice 100.021 AvgTicketPrice 100.021
Cancelled False Cancelled False
dayOfWeek 0 dayOfWeek 0
@ -324,7 +324,7 @@ class NDFrame(ABC):
dayOfWeek 0.000000 dayOfWeek 0.000000
dtype: float64 dtype: float64
>>> df.min(numeric_only=False) >>> df.min(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 100.021 AvgTicketPrice 100.021
Cancelled False Cancelled False
dayOfWeek 0 dayOfWeek 0
@ -358,7 +358,7 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.var() >>> df.var() # doctest: +SKIP
AvgTicketPrice 70964.570234 AvgTicketPrice 70964.570234
Cancelled 0.111987 Cancelled 0.111987
dayOfWeek 3.761279 dayOfWeek 3.761279
@ -370,7 +370,7 @@ class NDFrame(ABC):
dayOfWeek 3.761279 dayOfWeek 3.761279
dtype: float64 dtype: float64
>>> df.var(numeric_only=False) >>> df.var(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 70964.6 AvgTicketPrice 70964.6
Cancelled 0.111987 Cancelled 0.111987
dayOfWeek 3.76128 dayOfWeek 3.76128
@ -404,7 +404,7 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.std() >>> df.std() # doctest: +SKIP
AvgTicketPrice 266.407061 AvgTicketPrice 266.407061
Cancelled 0.334664 Cancelled 0.334664
dayOfWeek 1.939513 dayOfWeek 1.939513
@ -416,7 +416,7 @@ class NDFrame(ABC):
dayOfWeek 1.939513 dayOfWeek 1.939513
dtype: float64 dtype: float64
>>> df.std(numeric_only=False) >>> df.std(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 266.407 AvgTicketPrice 266.407
Cancelled 0.334664 Cancelled 0.334664
dayOfWeek 1.93951 dayOfWeek 1.93951
@ -499,7 +499,7 @@ class NDFrame(ABC):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]) >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.max() >>> df.max() # doctest: +SKIP
AvgTicketPrice 1199.73 AvgTicketPrice 1199.73
Cancelled True Cancelled True
dayOfWeek 6 dayOfWeek 6
@ -512,7 +512,7 @@ class NDFrame(ABC):
dayOfWeek 6.000000 dayOfWeek 6.000000
dtype: float64 dtype: float64
>>> df.max(numeric_only=False) >>> df.max(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 1199.73 AvgTicketPrice 1199.73
Cancelled True Cancelled True
dayOfWeek 6 dayOfWeek 6

View File

@ -18,7 +18,19 @@
import numpy as np import numpy as np
from pandas.core.dtypes.generic import ABCIndexClass from pandas.core.dtypes.generic import ABCIndexClass
from pandas.plotting._matplotlib import converter from pandas.plotting._matplotlib import converter
from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots
try: # pandas>=1.2.0
from pandas.plotting._matplotlib.tools import (
create_subplots,
flatten_axes,
set_ticks_props,
)
except ImportError: # pandas<1.2.0
from pandas.plotting._matplotlib.tools import (
_flatten as flatten_axes,
_set_ticks_props as set_ticks_props,
_subplots as create_subplots,
)
from eland.utils import try_sort from eland.utils import try_sort
@ -63,7 +75,7 @@ def hist_series(
ax.grid(grid) ax.grid(grid)
axes = np.array([ax]) axes = np.array([ax])
_set_ticks_props( set_ticks_props(
axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
) )
@ -110,7 +122,7 @@ def hist_frame(
if naxes == 0: if naxes == 0:
raise ValueError("hist method requires numerical columns, " "nothing to plot.") raise ValueError("hist method requires numerical columns, " "nothing to plot.")
fig, axes = _subplots( fig, axes = create_subplots(
naxes=naxes, naxes=naxes,
ax=ax, ax=ax,
squeeze=False, squeeze=False,
@ -119,7 +131,7 @@ def hist_frame(
figsize=figsize, figsize=figsize,
layout=layout, layout=layout,
) )
_axes = _flatten(axes) _axes = flatten_axes(axes)
for i, col in enumerate(try_sort(data.columns)): for i, col in enumerate(try_sort(data.columns)):
ax = _axes[i] ax = _axes[i]
@ -132,7 +144,7 @@ def hist_frame(
ax.set_title(col) ax.set_title(col)
ax.grid(grid) ax.grid(grid)
_set_ticks_props( set_ticks_props(
axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
) )
fig.subplots_adjust(wspace=0.3, hspace=0.3) fig.subplots_adjust(wspace=0.3, hspace=0.3)

View File

@ -89,11 +89,18 @@ def lint(session):
session.error("\n" + "\n".join(sorted(set(errors)))) session.error("\n" + "\n".join(sorted(set(errors))))
@nox.session(python=["3.6", "3.7", "3.8"]) @nox.session(python=["3.6", "3.7", "3.8", "3.9"])
def test(session): def test(session):
session.install("-r", "requirements-dev.txt") session.install("-r", "requirements-dev.txt")
session.run("python", "-m", "tests.setup_tests") session.run("python", "-m", "tests.setup_tests")
session.install(".") session.install(".")
# Notebooks are only run on Python 3.7+ due to pandas 1.2.0
if session.python == "3.6":
nbval = ()
else:
nbval = ("--nbval",)
session.run( session.run(
"python", "python",
"-m", "-m",
@ -102,21 +109,23 @@ def test(session):
"term-missing", "term-missing",
"--cov=eland/", "--cov=eland/",
"--doctest-modules", "--doctest-modules",
"--nbval", *nbval,
*(session.posargs or ("eland/", "tests/")), *(session.posargs or ("eland/", "tests/")),
) )
session.run( # Only run during default test execution
"python", if not session.posargs:
"-m", session.run(
"pip", "python",
"uninstall", "-m",
"--yes", "pip",
"scikit-learn", "uninstall",
"xgboost", "--yes",
"lightgbm", "scikit-learn",
) "xgboost",
session.run("pytest", "tests/ml/") "lightgbm",
)
session.run("pytest", "tests/ml/")
@nox.session(reuse_venv=True) @nox.session(reuse_venv=True)

View File

@ -16,6 +16,7 @@
# under the License. # under the License.
import os import os
from datetime import timedelta
import pandas as pd import pandas as pd
from pandas.testing import assert_frame_equal, assert_series_equal from pandas.testing import assert_frame_equal, assert_series_equal
@ -106,3 +107,30 @@ def assert_pandas_eland_series_equal(left, right, **kwargs):
# Use pandas tests to check similarity # Use pandas tests to check similarity
assert_series_equal(left, right.to_pandas(), **kwargs) assert_series_equal(left, right.to_pandas(), **kwargs)
def assert_almost_equal(left, right, **kwargs):
"""Asserts left and right are almost equal. Left and right
can be scalars, series, dataframes, etc
"""
if isinstance(left, (ed.DataFrame, ed.Series)):
left = left.to_pandas()
if isinstance(right, (ed.DataFrame, ed.Series)):
right = right.to_pandas()
if isinstance(right, pd.DataFrame):
kwargs.setdefault("check_exact", True)
assert_frame_equal(left, right)
elif isinstance(right, pd.Series):
kwargs.setdefault("check_exact", True)
assert_series_equal(left, right)
elif isinstance(right, float):
assert right * 0.99 <= left <= right * 1.01
elif isinstance(right, pd.Timestamp):
assert isinstance(left, pd.Timestamp) and right - timedelta(
seconds=0.1
) < left < right + timedelta(seconds=0.1)
elif right is pd.NaT:
assert left is pd.NaT
else:
assert left == right, f"{left} != {right}"

View File

@ -22,7 +22,7 @@ import pandas as pd
import pytest import pytest
from pandas.testing import assert_frame_equal, assert_series_equal from pandas.testing import assert_frame_equal, assert_series_equal
from tests.common import TestData from tests.common import TestData, assert_almost_equal
class TestDataFrameMetrics(TestData): class TestDataFrameMetrics(TestData):
@ -181,7 +181,9 @@ class TestDataFrameMetrics(TestData):
) )
ed_metrics_dict = ed_metrics["timestamp"].to_dict() ed_metrics_dict = ed_metrics["timestamp"].to_dict()
ed_metrics_dict.pop("median") # Median is tested below. ed_metrics_dict.pop("median") # Median is tested below.
assert ed_metrics_dict == expected_values
for key, expected_value in expected_values.items():
assert_almost_equal(ed_metrics_dict[key], expected_value)
@pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"]) @pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"])
def test_flights_datetime_metrics_single_agg(self, agg): def test_flights_datetime_metrics_single_agg(self, agg):
@ -200,7 +202,7 @@ class TestDataFrameMetrics(TestData):
else: else:
# df with timestamp column should return datetime64[ns] # df with timestamp column should return datetime64[ns]
assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]") assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]")
assert ed_metric["timestamp"][0] == expected_values[agg] assert_almost_equal(ed_metric["timestamp"][0], expected_values[agg])
@pytest.mark.parametrize("agg", ["mean", "min", "max"]) @pytest.mark.parametrize("agg", ["mean", "min", "max"])
def test_flights_datetime_metrics_agg_func(self, agg): def test_flights_datetime_metrics_agg_func(self, agg):
@ -213,7 +215,7 @@ class TestDataFrameMetrics(TestData):
ed_metric = getattr(ed_timestamps, agg)(numeric_only=False) ed_metric = getattr(ed_timestamps, agg)(numeric_only=False)
assert ed_metric.dtype == np.dtype("datetime64[ns]") assert ed_metric.dtype == np.dtype("datetime64[ns]")
assert ed_metric[0] == expected_values[agg] assert_almost_equal(ed_metric[0], expected_values[agg])
def test_flights_datetime_metrics_median(self): def test_flights_datetime_metrics_median(self):
ed_df = self.ed_flights_small()[["timestamp"]] ed_df = self.ed_flights_small()[["timestamp"]]
@ -283,7 +285,7 @@ class TestDataFrameMetrics(TestData):
else: else:
assert_series_equal( assert_series_equal(
agg_data[agg].rename(None), agg_data[agg].rename(None),
getattr(pd_flights, agg)(numeric_only=True), getattr(pd_flights, agg)(numeric_only=True).astype(float),
check_exact=False, check_exact=False,
rtol=True, rtol=True,
) )

View File

@ -2816,7 +2816,7 @@
" 25 dayOfWeek 13059 non-null int64 \n", " 25 dayOfWeek 13059 non-null int64 \n",
" 26 timestamp 13059 non-null datetime64[ns]\n", " 26 timestamp 13059 non-null datetime64[ns]\n",
"dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)\n", "dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)\n",
"memory usage: 3.2+ MB\n" "memory usage: 3.1+ MB\n"
] ]
} }
], ],
@ -2924,7 +2924,7 @@
} }
], ],
"source": [ "source": [
"pd_flights.max(numeric_only=True)" "pd_flights.max(numeric_only=True).astype(float)"
] ]
}, },
{ {
@ -3004,7 +3004,7 @@
} }
], ],
"source": [ "source": [
"pd_flights.min(numeric_only=True)" "pd_flights.min(numeric_only=True).astype(float)"
] ]
}, },
{ {

View File

@ -23,7 +23,7 @@ import numpy as np
import pandas as pd import pandas as pd
import pytest import pytest
from tests.common import TestData from tests.common import TestData, assert_almost_equal
class TestSeriesMetrics(TestData): class TestSeriesMetrics(TestData):
@ -102,7 +102,7 @@ class TestSeriesMetrics(TestData):
} }
ed_metric = getattr(ed_timestamps, agg)() ed_metric = getattr(ed_timestamps, agg)()
assert ed_metric == expected_values[agg] assert_almost_equal(ed_metric, expected_values[agg])
def test_flights_datetime_median_metric(self): def test_flights_datetime_median_metric(self):
ed_series = self.ed_flights_small()["timestamp"] ed_series = self.ed_flights_small()["timestamp"]