mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
[Backport 8.x] Fixes for Pandas 2 support (#758)
* Support Pandas 2 (#742) * Fix test setup to match pandas 2.0 demands * Use the now deprecated _append method (Better solution might exist) * Deal with numeric_only being removed in metrics test * Skip mad metric for other pandas versions * Account for differences between pandas versions in describe methods * Run black * Check Pandas version first * Mirror behaviour of installed Pandas version when running value_counts * Allow passing arguments to the individual asserters * Fix for method _construct_axes_from_arguments no longer existing * Skip mad metric if it does not exist * Account for pandas 2.0 timestamp default behaviour * Deal with empty vs other inferred data types * Account for default datetime precision change * Run Black * Solution for differences in inferred_type only * Fix csv and json issues * Skip two doctests * Passing a set as indexer is no longer allowed * Don't validate output where it differs between Pandas versions in the environment * Update test matrix and packaging metadata * Update version of Python in the docs * Update Python version in demo notebook * Match noxfile * Symmetry * Fix trailing comma in JSON * Revert some changes in setup.py to fix building the documentation * Revert "Revert some changes in setup.py to fix building the documentation" This reverts commit ea9879753129d8d8390b3cbbce57155a8b4fb346. * Use PANDAS_VERSION from eland.common * Still skip the doctest, but make the output pandas 2 instead of 1 * Still skip doctest, but switch to pandas 2 output * Prepare for pandas 3 * Reference the right column * Ignore output in tests but switch to pandas 2 output * Add line comment about NBVAL_IGNORE_OUTPUT * Restore missing line and add stderr cell * Use non-private method instead * Fix indentation and parameter issues * If index is not specified, and pandas 1 is present, set it to True From pandas 2 and upwards, index is set to None by default * Run black * Newer version of black might have different opinions? 
* Add line comment * Remove unused import * Add reason for ignore statement * Add reason for skip --------- Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co> (cherry picked from commit 75c57b077532c459a9490613cbf7b37215c27fae) * Return input_field_names as list as required by Pandas 2 --------- Co-authored-by: Bart Broere <mail@bartbroere.eu> Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>
This commit is contained in:
parent
d50436b01c
commit
af20ef9063
@ -29,11 +29,16 @@ steps:
|
||||
machineType: "n2-standard-4"
|
||||
env:
|
||||
PYTHON_VERSION: "{{ matrix.python }}"
|
||||
PANDAS_VERSION: '1.5.0'
|
||||
PANDAS_VERSION: "{{ matrix.pandas }}"
|
||||
TEST_SUITE: "xpack"
|
||||
ELASTICSEARCH_VERSION: "{{ matrix.stack }}"
|
||||
matrix:
|
||||
setup:
|
||||
# Python and pandas versions need to be added to the nox configuration too
|
||||
# (in the decorators of the test method in noxfile.py)
|
||||
pandas:
|
||||
- '1.5.0'
|
||||
- '2.2.3'
|
||||
python:
|
||||
- '3.12'
|
||||
- '3.11'
|
||||
|
@ -24,7 +24,7 @@
|
||||
"\n",
|
||||
"For this example, you will need:\n",
|
||||
"\n",
|
||||
"- Python 3.8 or later\n",
|
||||
"- Python 3.9 or later\n",
|
||||
"- An Elastic deployment\n",
|
||||
" - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration))\n",
|
||||
"\n",
|
||||
|
@ -34,7 +34,7 @@ from pandas.io.formats.printing import pprint_thing # type: ignore
|
||||
from pandas.util._validators import validate_bool_kwarg # type: ignore
|
||||
|
||||
import eland.plotting as gfx
|
||||
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
|
||||
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
|
||||
from eland.filter import BooleanFilter
|
||||
from eland.groupby import DataFrameGroupBy
|
||||
from eland.ndframe import NDFrame
|
||||
@ -411,9 +411,7 @@ class DataFrame(NDFrame):
|
||||
axis = pd.DataFrame._get_axis_name(axis)
|
||||
axes = {axis: labels}
|
||||
elif index is not None or columns is not None:
|
||||
axes, _ = pd.DataFrame()._construct_axes_from_arguments(
|
||||
(index, columns), {}
|
||||
)
|
||||
axes = {"columns": columns, "index": index}
|
||||
else:
|
||||
raise ValueError(
|
||||
"Need to specify at least one of 'labels', 'index' or 'columns'"
|
||||
@ -1361,7 +1359,7 @@ class DataFrame(NDFrame):
|
||||
default_handler=None,
|
||||
lines=False,
|
||||
compression="infer",
|
||||
index=True,
|
||||
index=None,
|
||||
indent=None,
|
||||
storage_options=None,
|
||||
):
|
||||
@ -1376,6 +1374,8 @@ class DataFrame(NDFrame):
|
||||
--------
|
||||
:pandas_api_docs:`pandas.DataFrame.to_json`
|
||||
"""
|
||||
if index is None and PANDAS_VERSION[0] == 1:
|
||||
index = True # switch to the pandas 1 default
|
||||
kwargs = {
|
||||
"path_or_buf": path_or_buf,
|
||||
"orient": orient,
|
||||
|
39
eland/etl.py
39
eland/etl.py
@ -16,6 +16,7 @@
|
||||
# under the License.
|
||||
|
||||
import csv
|
||||
import warnings
|
||||
from collections import deque
|
||||
from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union
|
||||
|
||||
@ -110,15 +111,15 @@ def pandas_to_eland(
|
||||
2 3.141 1 ... 3 Long text - to be indexed as es type text
|
||||
<BLANKLINE>
|
||||
[3 rows x 8 columns]
|
||||
>>> pd_df.dtypes
|
||||
A float64
|
||||
B int64
|
||||
C object
|
||||
D datetime64[ns]
|
||||
E float64
|
||||
F bool
|
||||
G int64
|
||||
H object
|
||||
>>> pd_df.dtypes # doctest skip required for pandas < 2 # doctest: +SKIP
|
||||
A float64
|
||||
B int64
|
||||
C object
|
||||
D datetime64[s]
|
||||
E float64
|
||||
F bool
|
||||
G int64
|
||||
H object
|
||||
dtype: object
|
||||
|
||||
Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`.
|
||||
@ -307,9 +308,9 @@ def csv_to_eland( # type: ignore
|
||||
names=None,
|
||||
index_col=None,
|
||||
usecols=None,
|
||||
squeeze=False,
|
||||
squeeze=None,
|
||||
prefix=None,
|
||||
mangle_dupe_cols=True,
|
||||
mangle_dupe_cols=None,
|
||||
# General Parsing Configuration
|
||||
dtype=None,
|
||||
engine=None,
|
||||
@ -357,6 +358,7 @@ def csv_to_eland( # type: ignore
|
||||
low_memory: bool = _DEFAULT_LOW_MEMORY,
|
||||
memory_map=False,
|
||||
float_precision=None,
|
||||
**extra_kwargs,
|
||||
) -> "DataFrame":
|
||||
"""
|
||||
Read a comma-separated values (csv) file into eland.DataFrame (i.e. an Elasticsearch index).
|
||||
@ -485,7 +487,6 @@ def csv_to_eland( # type: ignore
|
||||
"usecols": usecols,
|
||||
"verbose": verbose,
|
||||
"encoding": encoding,
|
||||
"squeeze": squeeze,
|
||||
"memory_map": memory_map,
|
||||
"float_precision": float_precision,
|
||||
"na_filter": na_filter,
|
||||
@ -494,9 +495,9 @@ def csv_to_eland( # type: ignore
|
||||
"error_bad_lines": error_bad_lines,
|
||||
"on_bad_lines": on_bad_lines,
|
||||
"low_memory": low_memory,
|
||||
"mangle_dupe_cols": mangle_dupe_cols,
|
||||
"infer_datetime_format": infer_datetime_format,
|
||||
"skip_blank_lines": skip_blank_lines,
|
||||
**extra_kwargs,
|
||||
}
|
||||
|
||||
if chunksize is None:
|
||||
@ -525,6 +526,18 @@ def csv_to_eland( # type: ignore
|
||||
|
||||
kwargs.pop("on_bad_lines")
|
||||
|
||||
if "squeeze" in kwargs:
|
||||
kwargs.pop("squeeze")
|
||||
warnings.warn(
|
||||
"This argument no longer works, use .squeeze('columns') on your DataFrame instead"
|
||||
)
|
||||
|
||||
if "mangle_dupe_cols" in kwargs:
|
||||
kwargs.pop("mangle_dupe_cols")
|
||||
warnings.warn(
|
||||
"The mangle_dupe_cols argument no longer works. Furthermore, "
|
||||
"duplicate columns will automatically get a number suffix."
|
||||
)
|
||||
# read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch)
|
||||
reader = pd.read_csv(filepath_or_buffer, **kwargs)
|
||||
|
||||
|
@ -712,8 +712,11 @@ class FieldMappings:
|
||||
capabilities, orient="index", columns=FieldMappings.column_labels
|
||||
)
|
||||
|
||||
self._mappings_capabilities = self._mappings_capabilities.append(
|
||||
capability_matrix_row
|
||||
self._mappings_capabilities = pd.concat(
|
||||
[
|
||||
self._mappings_capabilities,
|
||||
capability_matrix_row,
|
||||
]
|
||||
)
|
||||
|
||||
def numeric_source_fields(self) -> List[str]:
|
||||
|
@ -187,7 +187,7 @@ class ESGradientBoostingModel(ABC):
|
||||
if field_name in feature_names and field_name not in input_field_names:
|
||||
input_field_names.add(field_name)
|
||||
|
||||
return feature_names, input_field_names
|
||||
return feature_names, list(input_field_names)
|
||||
|
||||
@property
|
||||
def preprocessors(self) -> List[Any]:
|
||||
|
@ -40,11 +40,12 @@ from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd # type: ignore
|
||||
from pandas.core.indexes.frozen import FrozenList
|
||||
from pandas.io.common import _expand_user, stringify_path # type: ignore
|
||||
|
||||
import eland.plotting
|
||||
from eland.arithmetics import ArithmeticNumber, ArithmeticSeries, ArithmeticString
|
||||
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
|
||||
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, PANDAS_VERSION, docstring_parameter
|
||||
from eland.filter import (
|
||||
BooleanFilter,
|
||||
Equal,
|
||||
@ -292,18 +293,26 @@ class Series(NDFrame):
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
|
||||
>>> df['Carrier'].value_counts()
|
||||
>>> df['Carrier'].value_counts() # doctest: +SKIP
|
||||
Carrier
|
||||
Logstash Airways 3331
|
||||
JetBeats 3274
|
||||
Kibana Airlines 3234
|
||||
ES-Air 3220
|
||||
Name: Carrier, dtype: int64
|
||||
Name: count, dtype: int64
|
||||
"""
|
||||
if not isinstance(es_size, int):
|
||||
raise TypeError("es_size must be a positive integer.")
|
||||
elif es_size <= 0:
|
||||
raise ValueError("es_size must be a positive integer.")
|
||||
return self._query_compiler.value_counts(es_size)
|
||||
value_counts = self._query_compiler.value_counts(es_size)
|
||||
# https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#value-counts-sets-the-resulting-name-to-count
|
||||
if PANDAS_VERSION[0] == 2:
|
||||
value_counts.name = "count"
|
||||
value_counts.index.names = FrozenList([self.es_field_name])
|
||||
value_counts.index.name = self.es_field_name
|
||||
|
||||
return value_counts
|
||||
|
||||
# dtype not implemented for Series as causes query to fail
|
||||
# in pandas.core.computation.ops.Term.type
|
||||
|
@ -96,7 +96,7 @@ def lint(session):
|
||||
|
||||
|
||||
@nox.session(python=["3.9", "3.10", "3.11", "3.12"])
|
||||
@nox.parametrize("pandas_version", ["1.5.0"])
|
||||
@nox.parametrize("pandas_version", ["1.5.0", "2.2.3"])
|
||||
def test(session, pandas_version: str):
|
||||
session.install("-r", "requirements-dev.txt")
|
||||
session.install(".")
|
||||
|
2
setup.py
2
setup.py
@ -87,7 +87,7 @@ setup(
|
||||
packages=find_packages(include=["eland", "eland.*"]),
|
||||
install_requires=[
|
||||
"elasticsearch>=8.3,<9",
|
||||
"pandas>=1.5,<2",
|
||||
"pandas>=1.5,<3",
|
||||
"matplotlib>=3.6",
|
||||
"numpy>=1.2.0,<2",
|
||||
"packaging",
|
||||
|
@ -24,6 +24,7 @@ import pandas as pd
|
||||
from pandas.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
import eland as ed
|
||||
from eland.common import PANDAS_VERSION
|
||||
|
||||
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
@ -45,7 +46,10 @@ with gzip.open(FLIGHTS_FILE_NAME) as f:
|
||||
_pd_flights = pd.DataFrame.from_records(flight_records).reindex(
|
||||
_ed_flights.columns, axis=1
|
||||
)
|
||||
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
|
||||
if PANDAS_VERSION[0] >= 2:
|
||||
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"], format="mixed")
|
||||
else:
|
||||
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
|
||||
# Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list
|
||||
_pd_flights["Cities"] = _pd_flights.apply(
|
||||
lambda x: list(sorted([x["OriginCityName"], x["DestCityName"]])), axis=1
|
||||
@ -62,7 +66,7 @@ _pd_ecommerce["products.created_on"] = _pd_ecommerce["products.created_on"].appl
|
||||
)
|
||||
_pd_ecommerce.insert(2, "customer_birth_date", None)
|
||||
_pd_ecommerce.index = _pd_ecommerce.index.map(str) # make index 'object' not int
|
||||
_pd_ecommerce["customer_birth_date"].astype("datetime64")
|
||||
_pd_ecommerce["customer_birth_date"].astype("datetime64[ns]")
|
||||
_ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)
|
||||
|
||||
|
||||
|
@ -77,7 +77,16 @@ class SymmetricAPIChecker:
|
||||
pd_exc = e
|
||||
|
||||
self.check_exception(ed_exc, pd_exc)
|
||||
self.check_values(ed_obj, pd_obj)
|
||||
try:
|
||||
self.check_values(ed_obj, pd_obj)
|
||||
except AssertionError as e:
|
||||
# This is an attribute we allow to differ when comparing zero-length objects
|
||||
if (
|
||||
'Attribute "inferred_type" are different' in repr(e)
|
||||
and len(ed_obj) == 0
|
||||
and len(pd_obj) == 0
|
||||
):
|
||||
self.check_values(ed_obj, pd_obj, check_index_type=False)
|
||||
|
||||
if isinstance(ed_obj, (ed.DataFrame, ed.Series)):
|
||||
return SymmetricAPIChecker(ed_obj, pd_obj)
|
||||
@ -85,16 +94,16 @@ class SymmetricAPIChecker:
|
||||
|
||||
return f
|
||||
|
||||
def check_values(self, ed_obj, pd_obj):
|
||||
def check_values(self, ed_obj, pd_obj, **kwargs):
|
||||
"""Checks that any two values coming from eland and pandas are equal"""
|
||||
if isinstance(ed_obj, ed.DataFrame):
|
||||
assert_pandas_eland_frame_equal(pd_obj, ed_obj)
|
||||
assert_pandas_eland_frame_equal(pd_obj, ed_obj, **kwargs)
|
||||
elif isinstance(ed_obj, ed.Series):
|
||||
assert_pandas_eland_series_equal(pd_obj, ed_obj)
|
||||
assert_pandas_eland_series_equal(pd_obj, ed_obj, **kwargs)
|
||||
elif isinstance(ed_obj, pd.DataFrame):
|
||||
assert_frame_equal(ed_obj, pd_obj)
|
||||
assert_frame_equal(ed_obj, pd_obj, **kwargs)
|
||||
elif isinstance(ed_obj, pd.Series):
|
||||
assert_series_equal(ed_obj, pd_obj)
|
||||
assert_series_equal(ed_obj, pd_obj, **kwargs)
|
||||
elif isinstance(ed_obj, pd.Index):
|
||||
assert ed_obj.equals(pd_obj)
|
||||
else:
|
||||
|
@ -87,6 +87,8 @@ class TestDataFrameDateTime(TestData):
|
||||
},
|
||||
index=["0", "1", "2"],
|
||||
)
|
||||
# https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
|
||||
df["D"] = df["D"].astype("datetime64[ns]")
|
||||
|
||||
expected_mappings = {
|
||||
"mappings": {
|
||||
|
@ -33,9 +33,17 @@ class TestDataFrameDescribe(TestData):
|
||||
["Cancelled", "FlightDelay"], axis="columns"
|
||||
)
|
||||
|
||||
# Pandas >= 2 calculates aggregations such as min and max for timestamps too
|
||||
# This could be implemented in eland, but as of yet this is not the case
|
||||
# We therefore remove it before the comparison
|
||||
if "timestamp" in pd_describe.columns:
|
||||
pd_describe = pd_describe.drop(["timestamp"], axis="columns")
|
||||
|
||||
# Pandas >= 2 orders the aggregations differently than Pandas < 2
|
||||
# A sort_index is applied so tests will succeed in both environments
|
||||
assert_frame_equal(
|
||||
pd_describe.drop(["25%", "50%", "75%"], axis="index"),
|
||||
ed_describe.drop(["25%", "50%", "75%"], axis="index"),
|
||||
pd_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
|
||||
ed_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
|
||||
check_exact=False,
|
||||
rtol=True,
|
||||
)
|
||||
|
@ -99,7 +99,7 @@ class TestDataFrameHeadTail(TestData):
|
||||
|
||||
ed_head_0 = ed_flights.head(0)
|
||||
pd_head_0 = pd_flights.head(0)
|
||||
assert_pandas_eland_frame_equal(pd_head_0, ed_head_0)
|
||||
assert_pandas_eland_frame_equal(pd_head_0, ed_head_0, check_index_type=False)
|
||||
|
||||
def test_doc_test_tail(self):
|
||||
df = self.ed_flights()
|
||||
|
@ -22,6 +22,7 @@ import pandas as pd
|
||||
import pytest
|
||||
from pandas.testing import assert_frame_equal, assert_series_equal
|
||||
|
||||
from eland.common import PANDAS_VERSION
|
||||
from tests.common import TestData, assert_almost_equal
|
||||
|
||||
|
||||
@ -74,6 +75,8 @@ class TestDataFrameMetrics(TestData):
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
for func in self.extended_funcs:
|
||||
if PANDAS_VERSION[0] >= 2 and func == "mad":
|
||||
continue
|
||||
pd_metric = getattr(pd_flights, func)(
|
||||
**({"numeric_only": True} if func != "mad" else {})
|
||||
)
|
||||
@ -92,6 +95,8 @@ class TestDataFrameMetrics(TestData):
|
||||
ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]
|
||||
|
||||
for func in self.extended_funcs:
|
||||
if PANDAS_VERSION[0] >= 2 and func == "mad":
|
||||
continue
|
||||
pd_metric = getattr(pd_flights_1, func)()
|
||||
ed_metric = getattr(ed_flights_1, func)(numeric_only=False)
|
||||
|
||||
@ -102,6 +107,8 @@ class TestDataFrameMetrics(TestData):
|
||||
ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
|
||||
|
||||
for func in self.extended_funcs:
|
||||
if PANDAS_VERSION[0] >= 2 and func == "mad":
|
||||
continue
|
||||
pd_metric = getattr(pd_flights_0, func)()
|
||||
ed_metric = getattr(ed_flights_0, func)(numeric_only=False)
|
||||
|
||||
@ -491,8 +498,13 @@ class TestDataFrameMetrics(TestData):
|
||||
["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
|
||||
)
|
||||
|
||||
pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
|
||||
ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
|
||||
if PANDAS_VERSION[0] == 1:
|
||||
pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
|
||||
ed_quantile = ed_flights.agg(["quantile", "min"], numeric_only=numeric_only)
|
||||
|
||||
else: # numeric_only is no longer available for pandas >= 2
|
||||
pd_quantile = pd_flights.agg(["quantile", "min"])
|
||||
ed_quantile = ed_flights.agg(["quantile", "min"])
|
||||
|
||||
assert_frame_equal(
|
||||
pd_quantile, ed_quantile, check_exact=False, rtol=4, check_dtype=False
|
||||
|
@ -69,6 +69,12 @@ class TestDataFrameUtils(TestData):
|
||||
)
|
||||
ed_df_head = ed_df.head()
|
||||
|
||||
# https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
|
||||
df["D"] = df["D"].astype("datetime64[ns]")
|
||||
df["H"] = (
|
||||
df["H"].dt.tz_localize(None).astype("datetime64[ns]").dt.tz_localize("UTC")
|
||||
)
|
||||
|
||||
assert_pandas_eland_frame_equal(df, ed_df_head)
|
||||
|
||||
ES_TEST_CLIENT.indices.delete(index=index_name)
|
||||
|
@ -1647,6 +1647,14 @@
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/code/eland/.nox/test-3-12-pandas_version-2-2-3/lib/python3.12/site-packages/eland/series.py:464: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
||||
" return self._query_compiler.dtypes[0]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
@ -1792,6 +1800,9 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# NBVAL_IGNORE_OUTPUT\n",
|
||||
"# The ignore statement above is because of output difference between Pandas 1 and 2\n",
|
||||
"# and can be removed once Pandas 1 support is dropped\n",
|
||||
"ed_flights.query('Carrier == \"Kibana Airlines\" & AvgTicketPrice > 900.0 & Cancelled == True')"
|
||||
]
|
||||
},
|
||||
@ -2377,8 +2388,8 @@
|
||||
" <th>AvgTicketPrice</th>\n",
|
||||
" <th>DistanceKilometers</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>FlightTimeMin</th>\n",
|
||||
" <th>dayOfWeek</th>\n",
|
||||
" <th>timestamp</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
@ -2388,23 +2399,15 @@
|
||||
" <td>13059.000000</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>13059.000000</td>\n",
|
||||
" <td>13059.000000</td>\n",
|
||||
" <td>13059</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>mean</th>\n",
|
||||
" <td>628.253689</td>\n",
|
||||
" <td>7092.142455</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>511.127842</td>\n",
|
||||
" <td>2.835975</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td>266.396861</td>\n",
|
||||
" <td>4578.438497</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>334.753952</td>\n",
|
||||
" <td>1.939439</td>\n",
|
||||
" <td>2018-01-21 19:20:45.564438016</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>min</th>\n",
|
||||
@ -2412,57 +2415,65 @@
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>2018-01-01 00:00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25%</th>\n",
|
||||
" <td>409.893816</td>\n",
|
||||
" <td>2459.705673</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>252.333192</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>2018-01-11 05:16:25.500000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>50%</th>\n",
|
||||
" <td>640.556668</td>\n",
|
||||
" <td>7610.330866</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>503.045170</td>\n",
|
||||
" <td>3.000000</td>\n",
|
||||
" <td>2018-01-22 00:32:11</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>75%</th>\n",
|
||||
" <td>842.185470</td>\n",
|
||||
" <td>9736.637600</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>720.416036</td>\n",
|
||||
" <td>4.000000</td>\n",
|
||||
" <td>2018-02-01 04:51:18</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>max</th>\n",
|
||||
" <td>1199.729053</td>\n",
|
||||
" <td>19881.482315</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>1902.902032</td>\n",
|
||||
" <td>6.000000</td>\n",
|
||||
" <td>2018-02-11 23:50:12</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td>266.396861</td>\n",
|
||||
" <td>4578.438497</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>1.939439</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>8 rows × 7 columns</p>\n",
|
||||
"<p>8 rows × 8 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" AvgTicketPrice DistanceKilometers ... FlightTimeMin dayOfWeek\n",
|
||||
"count 13059.000000 13059.000000 ... 13059.000000 13059.000000\n",
|
||||
"mean 628.253689 7092.142455 ... 511.127842 2.835975\n",
|
||||
"std 266.396861 4578.438497 ... 334.753952 1.939439\n",
|
||||
"min 100.020528 0.000000 ... 0.000000 0.000000\n",
|
||||
"25% 409.893816 2459.705673 ... 252.333192 1.000000\n",
|
||||
"50% 640.556668 7610.330866 ... 503.045170 3.000000\n",
|
||||
"75% 842.185470 9736.637600 ... 720.416036 4.000000\n",
|
||||
"max 1199.729053 19881.482315 ... 1902.902032 6.000000\n",
|
||||
" AvgTicketPrice DistanceKilometers ... dayOfWeek timestamp\n",
|
||||
"count 13059.000000 13059.000000 ... 13059.000000 13059\n",
|
||||
"mean 628.253689 7092.142455 ... 2.835975 2018-01-21 19:20:45.564438016\n",
|
||||
"min 100.020528 0.000000 ... 0.000000 2018-01-01 00:00:00\n",
|
||||
"25% 409.893816 2459.705673 ... 1.000000 2018-01-11 05:16:25.500000\n",
|
||||
"50% 640.556668 7610.330866 ... 3.000000 2018-01-22 00:32:11\n",
|
||||
"75% 842.185470 9736.637600 ... 4.000000 2018-02-01 04:51:18\n",
|
||||
"max 1199.729053 19881.482315 ... 6.000000 2018-02-11 23:50:12\n",
|
||||
"std 266.396861 4578.438497 ... 1.939439 NaN\n",
|
||||
"\n",
|
||||
"[8 rows x 7 columns]"
|
||||
"[8 rows x 8 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 39,
|
||||
@ -2471,6 +2482,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# NBVAL_IGNORE_OUTPUT\n",
|
||||
"# Once support for pandas <2 is dropped, this and the line above can be removed\n",
|
||||
"pd_flights.describe()"
|
||||
]
|
||||
},
|
||||
|
@ -58,7 +58,9 @@ class TestSeriesFilter(TestData):
|
||||
ed_ser = ed_flights_small.filter(items=items, axis=0)
|
||||
pd_ser = pd_flights_small.filter(items=items, axis=0)
|
||||
|
||||
assert_pandas_eland_series_equal(pd_ser, ed_ser)
|
||||
# For an empty Series, eland reports the datatype it knows from the Elasticsearch index,
|
||||
# whereas pandas reports the datatype as empty
|
||||
assert_pandas_eland_series_equal(pd_ser, ed_ser, check_index_type=False)
|
||||
|
||||
def test_flights_filter_index_like_and_regex(self):
|
||||
ed_flights_small = self.ed_flights_small()["FlightDelayType"]
|
||||
|
@ -24,6 +24,7 @@ import pandas as pd
|
||||
import pytest
|
||||
from pandas.testing import assert_series_equal
|
||||
|
||||
from eland.common import PANDAS_VERSION
|
||||
from tests.common import TestData, assert_almost_equal
|
||||
|
||||
|
||||
@ -42,6 +43,8 @@ class TestSeriesMetrics(TestData):
|
||||
ed_flights = self.ed_flights()["AvgTicketPrice"]
|
||||
|
||||
for func in self.all_funcs:
|
||||
if PANDAS_VERSION[0] >= 2 and func == "mad":
|
||||
continue
|
||||
pd_metric = getattr(pd_flights, func)()
|
||||
ed_metric = getattr(ed_flights, func)()
|
||||
|
||||
@ -87,6 +90,8 @@ class TestSeriesMetrics(TestData):
|
||||
ed_ecommerce = self.ed_ecommerce()[column]
|
||||
|
||||
for func in self.all_funcs:
|
||||
if PANDAS_VERSION[0] >= 2 and func == "mad":
|
||||
continue
|
||||
pd_metric = getattr(pd_ecommerce, func)()
|
||||
ed_metric = getattr(ed_ecommerce, func)(
|
||||
**({"numeric_only": True} if (func != "nunique") else {})
|
||||
|
Loading…
x
Reference in New Issue
Block a user