Add support for Pandas 1.2.0

This commit is contained in:
Seth Michael Larson 2020-12-30 14:20:36 -06:00 committed by GitHub
parent 473db4576b
commit a552504f9b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 105 additions and 45 deletions

View File

@ -130,6 +130,16 @@ if [[ "$ELASTICSEARCH_VERSION" != *oss* ]]; then
url="http://elastic:$ELASTIC_PASSWORD@$NODE_NAME"
fi
# Pull the container, retry on failures up to 5 times with
# short delays between each attempt. Fixes most transient network errors.
docker_pull_attempts=0
until [ "$docker_pull_attempts" -ge 5 ]
do
docker pull docker.elastic.co/elasticsearch/"$ELASTICSEARCH_VERSION" && break
docker_pull_attempts=$((docker_pull_attempts+1))
sleep 10
done
echo -e "\033[34;1mINFO:\033[0m Starting container $NODE_NAME \033[0m"
set -x
docker run \

View File

@ -4,13 +4,12 @@ ELASTICSEARCH_VERSION:
- 8.0.0-SNAPSHOT
- 7.x-SNAPSHOT
- 7.10-SNAPSHOT
- 7.7-SNAPSHOT
- 7.6-SNAPSHOT
TEST_SUITE:
- xpack
PYTHON_VERSION:
- 3.9
- 3.8
- 3.7
- 3.6

View File

@ -71,7 +71,7 @@ class DataFrameGroupBy(GroupBy):
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
>>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +SKIP
AvgTicketPrice Cancelled dayOfWeek timestamp
DestCountry
AE 605.132970 0.152174 2.695652 2018-01-21 16:58:07.891304443

View File

@ -114,7 +114,7 @@ class MLModel:
>>> regressor = regressor.fit(training_data[0], training_data[1])
>>> # Get some test results
>>> regressor.predict(np.array(test_data))
>>> regressor.predict(np.array(test_data)) # doctest: +SKIP
array([0.06062475, 0.9990102 ], dtype=float32)
>>> # Serialise the model to Elasticsearch
@ -123,7 +123,7 @@ class MLModel:
>>> es_model = MLModel.import_model('localhost', model_id, regressor, feature_names, es_if_exists='replace')
>>> # Get some test results from Elasticsearch model
>>> es_model.predict(test_data)
>>> es_model.predict(test_data) # doctest: +SKIP
array([0.0606248 , 0.99901026], dtype=float32)
>>> # Delete model from Elasticsearch

View File

@ -214,7 +214,7 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.mean()
>>> df.mean() # doctest: +SKIP
AvgTicketPrice 628.254
Cancelled 0.128494
dayOfWeek 2.83598
@ -227,7 +227,7 @@ class NDFrame(ABC):
dayOfWeek 2.835975
dtype: float64
>>> df.mean(numeric_only=False)
>>> df.mean(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 628.254
Cancelled 0.128494
dayOfWeek 2.83598
@ -263,7 +263,7 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.sum()
>>> df.sum() # doctest: +SKIP
AvgTicketPrice 8.20436e+06
Cancelled 1678
dayOfWeek 37035
@ -275,7 +275,7 @@ class NDFrame(ABC):
dayOfWeek 3.703500e+04
dtype: float64
>>> df.sum(numeric_only=False)
>>> df.sum(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 8.20436e+06
Cancelled 1678
dayOfWeek 37035
@ -311,7 +311,7 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.min()
>>> df.min() # doctest: +SKIP
AvgTicketPrice 100.021
Cancelled False
dayOfWeek 0
@ -324,7 +324,7 @@ class NDFrame(ABC):
dayOfWeek 0.000000
dtype: float64
>>> df.min(numeric_only=False)
>>> df.min(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 100.021
Cancelled False
dayOfWeek 0
@ -358,7 +358,7 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.var()
>>> df.var() # doctest: +SKIP
AvgTicketPrice 70964.570234
Cancelled 0.111987
dayOfWeek 3.761279
@ -370,7 +370,7 @@ class NDFrame(ABC):
dayOfWeek 3.761279
dtype: float64
>>> df.var(numeric_only=False)
>>> df.var(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 70964.6
Cancelled 0.111987
dayOfWeek 3.76128
@ -404,7 +404,7 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.std()
>>> df.std() # doctest: +SKIP
AvgTicketPrice 266.407061
Cancelled 0.334664
dayOfWeek 1.939513
@ -416,7 +416,7 @@ class NDFrame(ABC):
dayOfWeek 1.939513
dtype: float64
>>> df.std(numeric_only=False)
>>> df.std(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 266.407
Cancelled 0.334664
dayOfWeek 1.93951
@ -499,7 +499,7 @@ class NDFrame(ABC):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.max()
>>> df.max() # doctest: +SKIP
AvgTicketPrice 1199.73
Cancelled True
dayOfWeek 6
@ -512,7 +512,7 @@ class NDFrame(ABC):
dayOfWeek 6.000000
dtype: float64
>>> df.max(numeric_only=False)
>>> df.max(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 1199.73
Cancelled True
dayOfWeek 6

View File

@ -18,7 +18,19 @@
import numpy as np
from pandas.core.dtypes.generic import ABCIndexClass
from pandas.plotting._matplotlib import converter
from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots
try: # pandas>=1.2.0
from pandas.plotting._matplotlib.tools import (
create_subplots,
flatten_axes,
set_ticks_props,
)
except ImportError: # pandas<1.2.0
from pandas.plotting._matplotlib.tools import (
_flatten as flatten_axes,
_set_ticks_props as set_ticks_props,
_subplots as create_subplots,
)
from eland.utils import try_sort
@ -63,7 +75,7 @@ def hist_series(
ax.grid(grid)
axes = np.array([ax])
_set_ticks_props(
set_ticks_props(
axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
)
@ -110,7 +122,7 @@ def hist_frame(
if naxes == 0:
raise ValueError("hist method requires numerical columns, " "nothing to plot.")
fig, axes = _subplots(
fig, axes = create_subplots(
naxes=naxes,
ax=ax,
squeeze=False,
@ -119,7 +131,7 @@ def hist_frame(
figsize=figsize,
layout=layout,
)
_axes = _flatten(axes)
_axes = flatten_axes(axes)
for i, col in enumerate(try_sort(data.columns)):
ax = _axes[i]
@ -132,7 +144,7 @@ def hist_frame(
ax.set_title(col)
ax.grid(grid)
_set_ticks_props(
set_ticks_props(
axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
)
fig.subplots_adjust(wspace=0.3, hspace=0.3)

View File

@ -89,11 +89,18 @@ def lint(session):
session.error("\n" + "\n".join(sorted(set(errors))))
@nox.session(python=["3.6", "3.7", "3.8"])
@nox.session(python=["3.6", "3.7", "3.8", "3.9"])
def test(session):
session.install("-r", "requirements-dev.txt")
session.run("python", "-m", "tests.setup_tests")
session.install(".")
# Notebooks are only run on Python 3.7+ due to pandas 1.2.0
if session.python == "3.6":
nbval = ()
else:
nbval = ("--nbval",)
session.run(
"python",
"-m",
@ -102,21 +109,23 @@ def test(session):
"term-missing",
"--cov=eland/",
"--doctest-modules",
"--nbval",
*nbval,
*(session.posargs or ("eland/", "tests/")),
)
session.run(
"python",
"-m",
"pip",
"uninstall",
"--yes",
"scikit-learn",
"xgboost",
"lightgbm",
)
session.run("pytest", "tests/ml/")
# Only run during default test execution
if not session.posargs:
session.run(
"python",
"-m",
"pip",
"uninstall",
"--yes",
"scikit-learn",
"xgboost",
"lightgbm",
)
session.run("pytest", "tests/ml/")
@nox.session(reuse_venv=True)

View File

@ -16,6 +16,7 @@
# under the License.
import os
from datetime import timedelta
import pandas as pd
from pandas.testing import assert_frame_equal, assert_series_equal
@ -106,3 +107,30 @@ def assert_pandas_eland_series_equal(left, right, **kwargs):
# Use pandas tests to check similarity
assert_series_equal(left, right.to_pandas(), **kwargs)
def assert_almost_equal(left, right, **kwargs):
    """Assert that ``left`` and ``right`` are almost equal.

    ``left`` and ``right`` can be scalars, series, dataframes, etc.
    Eland DataFrames/Series are converted to their pandas equivalents
    before comparing. The comparison applied is picked from the type
    of ``right`` (the expected value):

    * ``pd.DataFrame`` / ``pd.Series``: delegated to the pandas testing
      helpers. ``check_exact`` defaults to ``True`` unless the caller
      overrides it.
    * ``float``: ``left`` must lie within 1% of ``right``.
    * ``pd.Timestamp``: ``left`` must be a ``pd.Timestamp`` strictly
      within 0.1 seconds of ``right``.
    * ``pd.NaT``: ``left`` must be ``pd.NaT`` as well.
    * anything else: plain equality.

    Parameters
    ----------
    left:
        The actual value.
    right:
        The expected value.
    **kwargs:
        Extra keyword arguments forwarded to ``assert_frame_equal`` /
        ``assert_series_equal``. Ignored for scalar comparisons.

    Raises
    ------
    AssertionError
        If the two values are not considered almost equal.
    """
    if isinstance(left, (ed.DataFrame, ed.Series)):
        left = left.to_pandas()
    if isinstance(right, (ed.DataFrame, ed.Series)):
        right = right.to_pandas()

    if isinstance(right, pd.DataFrame):
        kwargs.setdefault("check_exact", True)
        # Forward kwargs: they were previously collected but never passed
        # on, so caller options (and the default above) had no effect.
        assert_frame_equal(left, right, **kwargs)
    elif isinstance(right, pd.Series):
        kwargs.setdefault("check_exact", True)
        assert_series_equal(left, right, **kwargs)
    elif isinstance(right, float):
        # Sort the bounds so the 1% window is also valid for negative
        # expected values, where right * 0.99 > right * 1.01.
        lower, upper = sorted((right * 0.99, right * 1.01))
        assert lower <= left <= upper, f"{left} is not within 1% of {right}"
    elif isinstance(right, pd.Timestamp):
        tolerance = timedelta(seconds=0.1)
        assert (
            isinstance(left, pd.Timestamp)
            and right - tolerance < left < right + tolerance
        ), f"{left} is not within {tolerance} of {right}"
    elif right is pd.NaT:
        assert left is pd.NaT, f"{left} is not NaT"
    else:
        assert left == right, f"{left} != {right}"

View File

@ -22,7 +22,7 @@ import pandas as pd
import pytest
from pandas.testing import assert_frame_equal, assert_series_equal
from tests.common import TestData
from tests.common import TestData, assert_almost_equal
class TestDataFrameMetrics(TestData):
@ -181,7 +181,9 @@ class TestDataFrameMetrics(TestData):
)
ed_metrics_dict = ed_metrics["timestamp"].to_dict()
ed_metrics_dict.pop("median") # Median is tested below.
assert ed_metrics_dict == expected_values
for key, expected_value in expected_values.items():
assert_almost_equal(ed_metrics_dict[key], expected_value)
@pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"])
def test_flights_datetime_metrics_single_agg(self, agg):
@ -200,7 +202,7 @@ class TestDataFrameMetrics(TestData):
else:
# df with timestamp column should return datetime64[ns]
assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]")
assert ed_metric["timestamp"][0] == expected_values[agg]
assert_almost_equal(ed_metric["timestamp"][0], expected_values[agg])
@pytest.mark.parametrize("agg", ["mean", "min", "max"])
def test_flights_datetime_metrics_agg_func(self, agg):
@ -213,7 +215,7 @@ class TestDataFrameMetrics(TestData):
ed_metric = getattr(ed_timestamps, agg)(numeric_only=False)
assert ed_metric.dtype == np.dtype("datetime64[ns]")
assert ed_metric[0] == expected_values[agg]
assert_almost_equal(ed_metric[0], expected_values[agg])
def test_flights_datetime_metrics_median(self):
ed_df = self.ed_flights_small()[["timestamp"]]
@ -283,7 +285,7 @@ class TestDataFrameMetrics(TestData):
else:
assert_series_equal(
agg_data[agg].rename(None),
getattr(pd_flights, agg)(numeric_only=True),
getattr(pd_flights, agg)(numeric_only=True).astype(float),
check_exact=False,
rtol=True,
)

View File

@ -2816,7 +2816,7 @@
" 25 dayOfWeek 13059 non-null int64 \n",
" 26 timestamp 13059 non-null datetime64[ns]\n",
"dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)\n",
"memory usage: 3.2+ MB\n"
"memory usage: 3.1+ MB\n"
]
}
],
@ -2924,7 +2924,7 @@
}
],
"source": [
"pd_flights.max(numeric_only=True)"
"pd_flights.max(numeric_only=True).astype(float)"
]
},
{
@ -3004,7 +3004,7 @@
}
],
"source": [
"pd_flights.min(numeric_only=True)"
"pd_flights.min(numeric_only=True).astype(float)"
]
},
{

View File

@ -23,7 +23,7 @@ import numpy as np
import pandas as pd
import pytest
from tests.common import TestData
from tests.common import TestData, assert_almost_equal
class TestSeriesMetrics(TestData):
@ -102,7 +102,7 @@ class TestSeriesMetrics(TestData):
}
ed_metric = getattr(ed_timestamps, agg)()
assert ed_metric == expected_values[agg]
assert_almost_equal(ed_metric, expected_values[agg])
def test_flights_datetime_median_metric(self):
ed_series = self.ed_flights_small()["timestamp"]