mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Add support for Pandas 1.2.0
This commit is contained in:
parent
473db4576b
commit
a552504f9b
@ -130,6 +130,16 @@ if [[ "$ELASTICSEARCH_VERSION" != *oss* ]]; then
|
||||
url="http://elastic:$ELASTIC_PASSWORD@$NODE_NAME"
|
||||
fi
|
||||
|
||||
# Pull the container, retry on failures up to 5 times with
|
||||
# short delays between each attempt. Fixes most transient network errors.
|
||||
docker_pull_attempts=0
|
||||
until [ "$docker_pull_attempts" -ge 5 ]
|
||||
do
|
||||
docker pull docker.elastic.co/elasticsearch/"$ELASTICSEARCH_VERSION" && break
|
||||
docker_pull_attempts=$((docker_pull_attempts+1))
|
||||
sleep 10
|
||||
done
|
||||
|
||||
echo -e "\033[34;1mINFO:\033[0m Starting container $NODE_NAME \033[0m"
|
||||
set -x
|
||||
docker run \
|
||||
|
@ -4,13 +4,12 @@ ELASTICSEARCH_VERSION:
|
||||
- 8.0.0-SNAPSHOT
|
||||
- 7.x-SNAPSHOT
|
||||
- 7.10-SNAPSHOT
|
||||
- 7.7-SNAPSHOT
|
||||
- 7.6-SNAPSHOT
|
||||
|
||||
TEST_SUITE:
|
||||
- xpack
|
||||
|
||||
PYTHON_VERSION:
|
||||
- 3.9
|
||||
- 3.8
|
||||
- 3.7
|
||||
- 3.6
|
||||
|
@ -71,7 +71,7 @@ class DataFrameGroupBy(GroupBy):
|
||||
... "localhost", "flights",
|
||||
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
|
||||
... )
|
||||
>>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
|
||||
>>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +SKIP
|
||||
AvgTicketPrice Cancelled dayOfWeek timestamp
|
||||
DestCountry
|
||||
AE 605.132970 0.152174 2.695652 2018-01-21 16:58:07.891304443
|
||||
|
@ -114,7 +114,7 @@ class MLModel:
|
||||
>>> regressor = regressor.fit(training_data[0], training_data[1])
|
||||
|
||||
>>> # Get some test results
|
||||
>>> regressor.predict(np.array(test_data))
|
||||
>>> regressor.predict(np.array(test_data)) # doctest: +SKIP
|
||||
array([0.06062475, 0.9990102 ], dtype=float32)
|
||||
|
||||
>>> # Serialise the model to Elasticsearch
|
||||
@ -123,7 +123,7 @@ class MLModel:
|
||||
>>> es_model = MLModel.import_model('localhost', model_id, regressor, feature_names, es_if_exists='replace')
|
||||
|
||||
>>> # Get some test results from Elasticsearch model
|
||||
>>> es_model.predict(test_data)
|
||||
>>> es_model.predict(test_data) # doctest: +SKIP
|
||||
array([0.0606248 , 0.99901026], dtype=float32)
|
||||
|
||||
>>> # Delete model from Elasticsearch
|
||||
|
@ -214,7 +214,7 @@ class NDFrame(ABC):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.mean()
|
||||
>>> df.mean() # doctest: +SKIP
|
||||
AvgTicketPrice 628.254
|
||||
Cancelled 0.128494
|
||||
dayOfWeek 2.83598
|
||||
@ -227,7 +227,7 @@ class NDFrame(ABC):
|
||||
dayOfWeek 2.835975
|
||||
dtype: float64
|
||||
|
||||
>>> df.mean(numeric_only=False)
|
||||
>>> df.mean(numeric_only=False) # doctest: +SKIP
|
||||
AvgTicketPrice 628.254
|
||||
Cancelled 0.128494
|
||||
dayOfWeek 2.83598
|
||||
@ -263,7 +263,7 @@ class NDFrame(ABC):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.sum()
|
||||
>>> df.sum() # doctest: +SKIP
|
||||
AvgTicketPrice 8.20436e+06
|
||||
Cancelled 1678
|
||||
dayOfWeek 37035
|
||||
@ -275,7 +275,7 @@ class NDFrame(ABC):
|
||||
dayOfWeek 3.703500e+04
|
||||
dtype: float64
|
||||
|
||||
>>> df.sum(numeric_only=False)
|
||||
>>> df.sum(numeric_only=False) # doctest: +SKIP
|
||||
AvgTicketPrice 8.20436e+06
|
||||
Cancelled 1678
|
||||
dayOfWeek 37035
|
||||
@ -311,7 +311,7 @@ class NDFrame(ABC):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.min()
|
||||
>>> df.min() # doctest: +SKIP
|
||||
AvgTicketPrice 100.021
|
||||
Cancelled False
|
||||
dayOfWeek 0
|
||||
@ -324,7 +324,7 @@ class NDFrame(ABC):
|
||||
dayOfWeek 0.000000
|
||||
dtype: float64
|
||||
|
||||
>>> df.min(numeric_only=False)
|
||||
>>> df.min(numeric_only=False) # doctest: +SKIP
|
||||
AvgTicketPrice 100.021
|
||||
Cancelled False
|
||||
dayOfWeek 0
|
||||
@ -358,7 +358,7 @@ class NDFrame(ABC):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.var()
|
||||
>>> df.var() # doctest: +SKIP
|
||||
AvgTicketPrice 70964.570234
|
||||
Cancelled 0.111987
|
||||
dayOfWeek 3.761279
|
||||
@ -370,7 +370,7 @@ class NDFrame(ABC):
|
||||
dayOfWeek 3.761279
|
||||
dtype: float64
|
||||
|
||||
>>> df.var(numeric_only=False)
|
||||
>>> df.var(numeric_only=False) # doctest: +SKIP
|
||||
AvgTicketPrice 70964.6
|
||||
Cancelled 0.111987
|
||||
dayOfWeek 3.76128
|
||||
@ -404,7 +404,7 @@ class NDFrame(ABC):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.std()
|
||||
>>> df.std() # doctest: +SKIP
|
||||
AvgTicketPrice 266.407061
|
||||
Cancelled 0.334664
|
||||
dayOfWeek 1.939513
|
||||
@ -416,7 +416,7 @@ class NDFrame(ABC):
|
||||
dayOfWeek 1.939513
|
||||
dtype: float64
|
||||
|
||||
>>> df.std(numeric_only=False)
|
||||
>>> df.std(numeric_only=False) # doctest: +SKIP
|
||||
AvgTicketPrice 266.407
|
||||
Cancelled 0.334664
|
||||
dayOfWeek 1.93951
|
||||
@ -499,7 +499,7 @@ class NDFrame(ABC):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
|
||||
>>> df.max()
|
||||
>>> df.max() # doctest: +SKIP
|
||||
AvgTicketPrice 1199.73
|
||||
Cancelled True
|
||||
dayOfWeek 6
|
||||
@ -512,7 +512,7 @@ class NDFrame(ABC):
|
||||
dayOfWeek 6.000000
|
||||
dtype: float64
|
||||
|
||||
>>> df.max(numeric_only=False)
|
||||
>>> df.max(numeric_only=False) # doctest: +SKIP
|
||||
AvgTicketPrice 1199.73
|
||||
Cancelled True
|
||||
dayOfWeek 6
|
||||
|
@ -18,7 +18,19 @@
|
||||
import numpy as np
|
||||
from pandas.core.dtypes.generic import ABCIndexClass
|
||||
from pandas.plotting._matplotlib import converter
|
||||
from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots
|
||||
|
||||
try: # pandas>=1.2.0
|
||||
from pandas.plotting._matplotlib.tools import (
|
||||
create_subplots,
|
||||
flatten_axes,
|
||||
set_ticks_props,
|
||||
)
|
||||
except ImportError: # pandas<1.2.0
|
||||
from pandas.plotting._matplotlib.tools import (
|
||||
_flatten as flatten_axes,
|
||||
_set_ticks_props as set_ticks_props,
|
||||
_subplots as create_subplots,
|
||||
)
|
||||
|
||||
from eland.utils import try_sort
|
||||
|
||||
@ -63,7 +75,7 @@ def hist_series(
|
||||
ax.grid(grid)
|
||||
axes = np.array([ax])
|
||||
|
||||
_set_ticks_props(
|
||||
set_ticks_props(
|
||||
axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
|
||||
)
|
||||
|
||||
@ -110,7 +122,7 @@ def hist_frame(
|
||||
if naxes == 0:
|
||||
raise ValueError("hist method requires numerical columns, " "nothing to plot.")
|
||||
|
||||
fig, axes = _subplots(
|
||||
fig, axes = create_subplots(
|
||||
naxes=naxes,
|
||||
ax=ax,
|
||||
squeeze=False,
|
||||
@ -119,7 +131,7 @@ def hist_frame(
|
||||
figsize=figsize,
|
||||
layout=layout,
|
||||
)
|
||||
_axes = _flatten(axes)
|
||||
_axes = flatten_axes(axes)
|
||||
|
||||
for i, col in enumerate(try_sort(data.columns)):
|
||||
ax = _axes[i]
|
||||
@ -132,7 +144,7 @@ def hist_frame(
|
||||
ax.set_title(col)
|
||||
ax.grid(grid)
|
||||
|
||||
_set_ticks_props(
|
||||
set_ticks_props(
|
||||
axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
|
||||
)
|
||||
fig.subplots_adjust(wspace=0.3, hspace=0.3)
|
||||
|
35
noxfile.py
35
noxfile.py
@ -89,11 +89,18 @@ def lint(session):
|
||||
session.error("\n" + "\n".join(sorted(set(errors))))
|
||||
|
||||
|
||||
@nox.session(python=["3.6", "3.7", "3.8"])
|
||||
@nox.session(python=["3.6", "3.7", "3.8", "3.9"])
|
||||
def test(session):
|
||||
session.install("-r", "requirements-dev.txt")
|
||||
session.run("python", "-m", "tests.setup_tests")
|
||||
session.install(".")
|
||||
|
||||
# Notebooks are only run on Python 3.7+ due to pandas 1.2.0
|
||||
if session.python == "3.6":
|
||||
nbval = ()
|
||||
else:
|
||||
nbval = ("--nbval",)
|
||||
|
||||
session.run(
|
||||
"python",
|
||||
"-m",
|
||||
@ -102,21 +109,23 @@ def test(session):
|
||||
"term-missing",
|
||||
"--cov=eland/",
|
||||
"--doctest-modules",
|
||||
"--nbval",
|
||||
*nbval,
|
||||
*(session.posargs or ("eland/", "tests/")),
|
||||
)
|
||||
|
||||
session.run(
|
||||
"python",
|
||||
"-m",
|
||||
"pip",
|
||||
"uninstall",
|
||||
"--yes",
|
||||
"scikit-learn",
|
||||
"xgboost",
|
||||
"lightgbm",
|
||||
)
|
||||
session.run("pytest", "tests/ml/")
|
||||
# Only run during default test execution
|
||||
if not session.posargs:
|
||||
session.run(
|
||||
"python",
|
||||
"-m",
|
||||
"pip",
|
||||
"uninstall",
|
||||
"--yes",
|
||||
"scikit-learn",
|
||||
"xgboost",
|
||||
"lightgbm",
|
||||
)
|
||||
session.run("pytest", "tests/ml/")
|
||||
|
||||
|
||||
@nox.session(reuse_venv=True)
|
||||
|
@ -16,6 +16,7 @@
|
||||
# under the License.
|
||||
|
||||
import os
|
||||
from datetime import timedelta
|
||||
|
||||
import pandas as pd
|
||||
from pandas.testing import assert_frame_equal, assert_series_equal
|
||||
@ -106,3 +107,30 @@ def assert_pandas_eland_series_equal(left, right, **kwargs):
|
||||
|
||||
# Use pandas tests to check similarity
|
||||
assert_series_equal(left, right.to_pandas(), **kwargs)
|
||||
|
||||
|
||||
def assert_almost_equal(left, right, **kwargs):
    """Assert that ``left`` and ``right`` are almost equal.

    Both values may be scalars, pandas objects or eland objects. Eland
    DataFrames/Series are converted with ``to_pandas()`` first, and the
    comparison is then selected from the type of ``right``:

    * ``pd.DataFrame`` / ``pd.Series``: delegated to
      ``assert_frame_equal`` / ``assert_series_equal``.
    * ``float``: ``left`` must be within 1% of ``right``.
    * ``pd.Timestamp``: ``left`` must be a ``pd.Timestamp`` less than
      0.1 seconds away from ``right``.
    * ``pd.NaT``: ``left`` must be ``pd.NaT`` as well.
    * anything else: ``left == right``.

    Parameters
    ----------
    left
        The value under test.
    right
        The expected value; its type selects the comparison.
    **kwargs
        Forwarded to ``assert_frame_equal`` / ``assert_series_equal``
        (e.g. ``check_exact``, ``rtol``). Ignored for scalar comparisons.

    Raises
    ------
    AssertionError
        If the two values are not almost equal.
    """
    if isinstance(left, (ed.DataFrame, ed.Series)):
        left = left.to_pandas()
    if isinstance(right, (ed.DataFrame, ed.Series)):
        right = right.to_pandas()

    if isinstance(right, pd.DataFrame):
        # kwargs used to be populated but never passed on, so the effective
        # behaviour was pandas' default tolerance. Forwarding them keeps that
        # default while honouring whatever the caller asks for.
        assert_frame_equal(left, right, **kwargs)
    elif isinstance(right, pd.Series):
        assert_series_equal(left, right, **kwargs)
    elif isinstance(right, float):
        # Compare on the absolute difference so that the 1% window is also
        # correct for negative expectations (scaling the bounds directly
        # swaps lower/upper when right < 0). The equality short-circuit
        # keeps infinities comparing equal to themselves.
        tolerance = abs(right) * 0.01
        assert (
            left == right or abs(left - right) <= tolerance
        ), f"{left} is not within 1% of {right}"
    elif isinstance(right, pd.Timestamp):
        window = timedelta(seconds=0.1)
        assert isinstance(
            left, pd.Timestamp
        ), f"expected a Timestamp, got {type(left).__name__}: {left}"
        assert (
            right - window < left < right + window
        ), f"{left} is not within 0.1s of {right}"
    elif right is pd.NaT:
        assert left is pd.NaT, f"{left} is not NaT"
    else:
        assert left == right, f"{left} != {right}"
|
||||
|
@ -22,7 +22,7 @@ import pandas as pd
|
||||
import pytest
|
||||
from pandas.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
from tests.common import TestData
|
||||
from tests.common import TestData, assert_almost_equal
|
||||
|
||||
|
||||
class TestDataFrameMetrics(TestData):
|
||||
@ -181,7 +181,9 @@ class TestDataFrameMetrics(TestData):
|
||||
)
|
||||
ed_metrics_dict = ed_metrics["timestamp"].to_dict()
|
||||
ed_metrics_dict.pop("median") # Median is tested below.
|
||||
assert ed_metrics_dict == expected_values
|
||||
|
||||
for key, expected_value in expected_values.items():
|
||||
assert_almost_equal(ed_metrics_dict[key], expected_value)
|
||||
|
||||
@pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"])
|
||||
def test_flights_datetime_metrics_single_agg(self, agg):
|
||||
@ -200,7 +202,7 @@ class TestDataFrameMetrics(TestData):
|
||||
else:
|
||||
# df with timestamp column should return datetime64[ns]
|
||||
assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]")
|
||||
assert ed_metric["timestamp"][0] == expected_values[agg]
|
||||
assert_almost_equal(ed_metric["timestamp"][0], expected_values[agg])
|
||||
|
||||
@pytest.mark.parametrize("agg", ["mean", "min", "max"])
|
||||
def test_flights_datetime_metrics_agg_func(self, agg):
|
||||
@ -213,7 +215,7 @@ class TestDataFrameMetrics(TestData):
|
||||
ed_metric = getattr(ed_timestamps, agg)(numeric_only=False)
|
||||
|
||||
assert ed_metric.dtype == np.dtype("datetime64[ns]")
|
||||
assert ed_metric[0] == expected_values[agg]
|
||||
assert_almost_equal(ed_metric[0], expected_values[agg])
|
||||
|
||||
def test_flights_datetime_metrics_median(self):
|
||||
ed_df = self.ed_flights_small()[["timestamp"]]
|
||||
@ -283,7 +285,7 @@ class TestDataFrameMetrics(TestData):
|
||||
else:
|
||||
assert_series_equal(
|
||||
agg_data[agg].rename(None),
|
||||
getattr(pd_flights, agg)(numeric_only=True),
|
||||
getattr(pd_flights, agg)(numeric_only=True).astype(float),
|
||||
check_exact=False,
|
||||
rtol=True,
|
||||
)
|
||||
|
@ -2816,7 +2816,7 @@
|
||||
" 25 dayOfWeek 13059 non-null int64 \n",
|
||||
" 26 timestamp 13059 non-null datetime64[ns]\n",
|
||||
"dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)\n",
|
||||
"memory usage: 3.2+ MB\n"
|
||||
"memory usage: 3.1+ MB\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -2924,7 +2924,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pd_flights.max(numeric_only=True)"
|
||||
"pd_flights.max(numeric_only=True).astype(float)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -3004,7 +3004,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pd_flights.min(numeric_only=True)"
|
||||
"pd_flights.min(numeric_only=True).astype(float)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -23,7 +23,7 @@ import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from tests.common import TestData
|
||||
from tests.common import TestData, assert_almost_equal
|
||||
|
||||
|
||||
class TestSeriesMetrics(TestData):
|
||||
@ -102,7 +102,7 @@ class TestSeriesMetrics(TestData):
|
||||
}
|
||||
ed_metric = getattr(ed_timestamps, agg)()
|
||||
|
||||
assert ed_metric == expected_values[agg]
|
||||
assert_almost_equal(ed_metric, expected_values[agg])
|
||||
|
||||
def test_flights_datetime_median_metric(self):
|
||||
ed_series = self.ed_flights_small()["timestamp"]
|
||||
|
Loading…
x
Reference in New Issue
Block a user