mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
* Support Pandas 2 (#742) * Fix test setup to match pandas 2.0 demands * Use the now deprecated _append method (Better solution might exist) * Deal with numeric_only being removed in metrics test * Skip mad metric for other pandas versions * Account for differences between pandas versions in describe methods * Run black * Check Pandas version first * Mirror behaviour of installed Pandas version when running value_counts * Allow passing arguments to the individual asserters * Fix for method _construct_axes_from_arguments no longer existing * Skip mad metric if it does not exist * Account for pandas 2.0 timestamp default behaviour * Deal with empty vs other inferred data types * Account for default datetime precision change * Run Black * Solution for differences in inferred_type only * Fix csv and json issues * Skip two doctests * Passing a set as indexer is no longer allowed * Don't validate output where it differs between Pandas versions in the environment * Update test matrix and packaging metadata * Update version of Python in the docs * Update Python version in demo notebook * Match noxfile * Symmetry * Fix trailing comma in JSON * Revert some changes in setup.py to fix building the documentation * Revert "Revert some changes in setup.py to fix building the documentation" This reverts commit ea9879753129d8d8390b3cbbce57155a8b4fb346. * Use PANDAS_VERSION from eland.common * Still skip the doctest, but make the output pandas 2 instead of 1 * Still skip doctest, but switch to pandas 2 output * Prepare for pandas 3 * Reference the right column * Ignore output in tests but switch to pandas 2 output * Add line comment about NBVAL_IGNORE_OUTPUT * Restore missing line and add stderr cell * Use non-private method instead * Fix indentation and parameter issues * If index is not specified, and pandas 1 is present, set it to True From pandas 2 and upwards, index is set to None by default * Run black * Newer version of black might have different opinions? * Add line comment * Remove unused import * Add reason for ignore statement * Add reason for skip --------- Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co> (cherry picked from commit 75c57b077532c459a9490613cbf7b37215c27fae) * Return input_field_names as list as required by Pandas 2 --------- Co-authored-by: Bart Broere <mail@bartbroere.eu> Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>
65 lines
2.7 KiB
Python
65 lines
2.7 KiB
Python
# Licensed to Elasticsearch B.V. under one or more contributor
|
|
# license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright
|
|
# ownership. Elasticsearch B.V. licenses this file to you under
|
|
# the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# File called _pytest for PyCharm compatability
|
|
|
|
from pandas.testing import assert_frame_equal
|
|
|
|
from tests.common import TestData
|
|
|
|
|
|
class TestDataFrameDescribe(TestData):
|
|
def test_flights_describe(self):
|
|
pd_flights = self.pd_flights()
|
|
ed_flights = self.ed_flights()
|
|
|
|
pd_describe = pd_flights.describe()
|
|
# We remove bool columns to match pandas output
|
|
ed_describe = ed_flights.describe().drop(
|
|
["Cancelled", "FlightDelay"], axis="columns"
|
|
)
|
|
|
|
# Pandas >= 2 calculates aggregations such as min and max for timestamps too
|
|
# This could be implemented in eland, but as of yet this is not the case
|
|
# We therefore remove it before the comparison
|
|
if "timestamp" in pd_describe.columns:
|
|
pd_describe = pd_describe.drop(["timestamp"], axis="columns")
|
|
|
|
# Pandas >= 2 orders the aggregations differently than Pandas < 2
|
|
# A sort_index is applied so tests will succeed in both environments
|
|
assert_frame_equal(
|
|
pd_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
|
|
ed_describe.drop(["25%", "50%", "75%"], axis="index").sort_index(),
|
|
check_exact=False,
|
|
rtol=True,
|
|
)
|
|
|
|
# TODO - this fails for percentile fields as ES aggregations are approximate
|
|
# if ES percentile agg uses
|
|
# "hdr": {
|
|
# "number_of_significant_value_digits": 3
|
|
# }
|
|
# this works
|
|
|
|
# pd_ecommerce_describe = self.pd_ecommerce().describe()
|
|
# ed_ecommerce_describe = self.ed_ecommerce().describe()
|
|
# We don't compare ecommerce here as the default dtypes in pandas from read_json
|
|
# don't match the mapping types. This is mainly because the products field is
|
|
# nested and so can be treated as a multi-field in ES, but not in pandas
|
|
|
|
# We can not also run 'describe' on a truncate ed dataframe
|