mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
* Fix test setup to match pandas 2.0 demands * Use the now deprecated _append method (Better solution might exist) * Deal with numeric_only being removed in metrics test * Skip mad metric for other pandas versions * Account for differences between pandas versions in describe methods * Run black * Check Pandas version first * Mirror behaviour of installed Pandas version when running value_counts * Allow passing arguments to the individual asserters * Fix for method _construct_axes_from_arguments no longer existing * Skip mad metric if it does not exist * Account for pandas 2.0 timestamp default behaviour * Deal with empty vs other inferred data types * Account for default datetime precision change * Run Black * Solution for differences in inferred_type only * Fix csv and json issues * Skip two doctests * Passing a set as indexer is no longer allowed * Don't validate output where it differs between Pandas versions in the environment * Update test matrix and packaging metadata * Update version of Python in the docs * Update Python version in demo notebook * Match noxfile * Symmetry * Fix trailing comma in JSON * Revert some changes in setup.py to fix building the documentation * Revert "Revert some changes in setup.py to fix building the documentation" This reverts commit ea9879753129d8d8390b3cbbce57155a8b4fb346. * Use PANDAS_VERSION from eland.common * Still skip the doctest, but make the output pandas 2 instead of 1 * Still skip doctest, but switch to pandas 2 output * Prepare for pandas 3 * Reference the right column * Ignore output in tests but switch to pandas 2 output * Add line comment about NBVAL_IGNORE_OUTPUT * Restore missing line and add stderr cell * Use non-private method instead * Fix indentation and parameter issues * If index is not specified, and pandas 1 is present, set it to True From pandas 2 and upwards, index is set to None by default * Run black * Newer version of black might have different opinions? 
* Add line comment * Remove unused import * Add reason for ignore statement * Add reason for skip --------- Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>
153 lines
5.2 KiB
Python
153 lines
5.2 KiB
Python
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
|
|
|
|
import gzip
|
|
import json
|
|
import os
|
|
from datetime import timedelta
|
|
|
|
import pandas as pd
|
|
from pandas.testing import assert_frame_equal, assert_series_equal
|
|
|
|
import eland as ed
|
|
from eland.common import PANDAS_VERSION
|
|
|
|
# Absolute path of the directory that contains this module.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
|
|
# Create pandas and eland data frames
|
|
from tests import (
|
|
ECOMMERCE_DF_FILE_NAME,
|
|
ECOMMERCE_INDEX_NAME,
|
|
ES_TEST_CLIENT,
|
|
FLIGHTS_FILE_NAME,
|
|
FLIGHTS_INDEX_NAME,
|
|
FLIGHTS_SMALL_INDEX_NAME,
|
|
)
|
|
|
|
# eland frame over the flights index; the pandas frame below is shaped to match it.
_ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_INDEX_NAME)
flight_records = []
# The flights file is gzip-compressed and holds one JSON document per line.
with gzip.open(FLIGHTS_FILE_NAME) as f:
    for json_obj in f:
        flight_records.append(json.loads(json_obj))
# Align the pandas columns (set and order) with the columns of the eland frame.
_pd_flights = pd.DataFrame.from_records(flight_records).reindex(
    _ed_flights.columns, axis=1
)
if PANDAS_VERSION[0] >= 2:
    # On pandas 2 and later, request per-element format inference explicitly.
    # NOTE(review): presumably the timestamps in the file are not all in one
    # format — confirm against the data file.
    _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"], format="mixed")
else:
    _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
# Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list
_pd_flights["Cities"] = _pd_flights.apply(
    lambda x: list(sorted([x["OriginCityName"], x["DestCityName"]])), axis=1
)
_pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int

# First 48 rows of the pandas flights frame, paired with the small flights index.
# NOTE(review): presumably the small index holds the same 48 documents — confirm.
_pd_flights_small = _pd_flights.head(48)
_ed_flights_small = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME)

# E-commerce frame loaded from a JSON file and sorted by its index.
_pd_ecommerce = pd.read_json(ECOMMERCE_DF_FILE_NAME).sort_index()
_pd_ecommerce["order_date"] = pd.to_datetime(_pd_ecommerce["order_date"])
# Convert each cell of "products.created_on" individually.
# NOTE(review): presumably each cell holds a list of date strings — confirm.
_pd_ecommerce["products.created_on"] = _pd_ecommerce["products.created_on"].apply(
    lambda x: pd.to_datetime(x)
)
# Add an all-None "customer_birth_date" column as the third column.
_pd_ecommerce.insert(2, "customer_birth_date", None)
_pd_ecommerce.index = _pd_ecommerce.index.map(str)  # make index 'object' not int
# NOTE(review): astype() returns a new Series and the result is discarded here,
# so this statement does not change the dtype of the column. Confirm whether
# assigning the result back to the column was intended.
_pd_ecommerce["customer_birth_date"].astype("datetime64[ns]")
_ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)
|
|
|
|
|
|
class TestData:
    """Accessors for the shared test client and the module-level fixture frames.

    Every accessor returns the module-level object itself, not a copy.
    """

    # Elasticsearch client shared by the tests.
    client = ES_TEST_CLIENT

    def pd_flights(self):
        """Return the pandas flights frame."""
        return _pd_flights

    def ed_flights(self):
        """Return the eland flights frame."""
        return _ed_flights

    def pd_flights_small(self):
        """Return the pandas frame holding the small flights subset."""
        return _pd_flights_small

    def ed_flights_small(self):
        """Return the eland frame over the small flights index."""
        return _ed_flights_small

    def pd_ecommerce(self):
        """Return the pandas e-commerce frame."""
        return _pd_ecommerce

    def ed_ecommerce(self):
        """Return the eland e-commerce frame."""
        return _ed_ecommerce
|
|
|
|
|
|
def assert_pandas_eland_frame_equal(left, right, **kwargs):
    """Assert that a pandas DataFrame and an eland DataFrame compare equal.

    Parameters
    ----------
    left : pd.DataFrame
        The pandas side of the comparison.
    right : ed.DataFrame
        The eland side; converted with ``to_pandas()`` before comparing.
    **kwargs
        Forwarded unchanged to ``pandas.testing.assert_frame_equal``.

    Raises
    ------
    AssertionError
        If either argument has the wrong type, or the frames differ.
    """
    left_is_pandas = isinstance(left, pd.DataFrame)
    if not left_is_pandas:
        raise AssertionError(f"Expected type pd.DataFrame, found {type(left)} instead")

    right_is_eland = isinstance(right, ed.DataFrame)
    if not right_is_eland:
        raise AssertionError(f"Expected type ed.DataFrame, found {type(right)} instead")

    # Convert the eland frame so that the pandas comparison can be reused.
    right_as_pandas = right.to_pandas()
    assert_frame_equal(left, right_as_pandas, **kwargs)
|
|
|
|
|
|
def assert_eland_frame_equal(left, right, **kwargs):
    """Assert that two eland DataFrames compare equal.

    Parameters
    ----------
    left, right : ed.DataFrame
        The frames to compare; both are converted with ``to_pandas()``.
    **kwargs
        Forwarded unchanged to ``pandas.testing.assert_frame_equal``.

    Raises
    ------
    AssertionError
        If either argument is not an eland DataFrame, or the frames differ.
    """
    left_is_eland = isinstance(left, ed.DataFrame)
    if not left_is_eland:
        raise AssertionError(f"Expected type ed.DataFrame, found {type(left)} instead")

    right_is_eland = isinstance(right, ed.DataFrame)
    if not right_is_eland:
        raise AssertionError(f"Expected type ed.DataFrame, found {type(right)} instead")

    # Convert both sides (left first) so that the pandas comparison can be reused.
    left_as_pandas = left.to_pandas()
    right_as_pandas = right.to_pandas()
    assert_frame_equal(left_as_pandas, right_as_pandas, **kwargs)
|
|
|
|
|
|
def assert_pandas_eland_series_equal(left, right, **kwargs):
    """Assert that a pandas Series and an eland Series compare equal.

    Parameters
    ----------
    left : pd.Series
        The pandas side of the comparison.
    right : ed.Series
        The eland side; converted with ``to_pandas()`` before comparing.
    **kwargs
        Forwarded unchanged to ``pandas.testing.assert_series_equal``.

    Raises
    ------
    AssertionError
        If either argument has the wrong type, or the series differ.
    """
    left_is_pandas = isinstance(left, pd.Series)
    if not left_is_pandas:
        raise AssertionError(f"Expected type pd.Series, found {type(left)} instead")

    right_is_eland = isinstance(right, ed.Series)
    if not right_is_eland:
        raise AssertionError(f"Expected type ed.Series, found {type(right)} instead")

    # Convert the eland series so that the pandas comparison can be reused.
    right_as_pandas = right.to_pandas()
    assert_series_equal(left, right_as_pandas, **kwargs)
|
|
|
|
|
|
def assert_almost_equal(left, right, **kwargs):
    """Assert that left and right are almost equal.

    Left and right can be scalars, series, dataframes, etc. eland frames and
    series are converted with ``to_pandas()`` first; the comparison used is
    then chosen from the type of ``right``:

    * ``pd.DataFrame`` / ``pd.Series``: pandas' own asserters.
    * ``float``: ``left`` must be within 1% of ``right``.
    * ``pd.Timestamp``: ``left`` must be a Timestamp less than 0.1 seconds
      away from ``right``.
    * ``pd.NaT``: ``left`` must be ``pd.NaT`` as well.
    * anything else: plain ``==``.

    Parameters
    ----------
    left, right
        The values to compare.
    **kwargs
        Forwarded to ``assert_frame_equal`` / ``assert_series_equal`` when
        ``right`` is a frame or a series; unused for scalar comparisons.

    Raises
    ------
    AssertionError
        If the values are not considered almost equal.
    """
    if isinstance(left, (ed.DataFrame, ed.Series)):
        left = left.to_pandas()
    if isinstance(right, (ed.DataFrame, ed.Series)):
        right = right.to_pandas()

    # NOTE(review): the previous implementation computed a check_exact=True
    # default but never passed kwargs on, so that default was never in effect.
    # It is not applied here, which keeps comparisons unchanged for callers
    # that pass no kwargs — confirm whether exact comparison was intended.
    if isinstance(right, pd.DataFrame):
        assert_frame_equal(left, right, **kwargs)
    elif isinstance(right, pd.Series):
        assert_series_equal(left, right, **kwargs)
    elif isinstance(right, float):
        # Use a symmetric 1% band around right. The magnitude is used so that
        # negative expected values get a valid (non-inverted) band; the
        # equality short-circuit keeps equal infinities passing.
        assert left == right or abs(left - right) <= abs(right) * 0.01, (
            f"{left} is not within 1% of {right}"
        )
    elif isinstance(right, pd.Timestamp):
        tolerance = timedelta(seconds=0.1)
        assert isinstance(
            left, pd.Timestamp
        ), f"Expected type pd.Timestamp, found {type(left)} instead"
        assert (
            right - tolerance < left < right + tolerance
        ), f"{left} is not within 0.1s of {right}"
    elif right is pd.NaT:
        assert left is pd.NaT, f"{left} is not NaT"
    else:
        assert left == right, f"{left} != {right}"
|