eland/tests/dataframe/test_describe_pytest.py

#  Licensed to Elasticsearch B.V. under one or more contributor
#  license agreements. See the NOTICE file distributed with
#  this work for additional information regarding copyright
#  ownership. Elasticsearch B.V. licenses this file to you under
#  the Apache License, Version 2.0 (the "License"); you may
#  not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
# 	http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.

# File called _pytest for PyCharm compatibility

from pandas.testing import assert_frame_equal

from tests.common import TestData


class TestDataFrameDescribe(TestData):
    """Compare ``describe()`` output of the eland and pandas flights frames."""

    def test_flights_describe(self):
        """Check that eland's ``describe()`` agrees with pandas on flights.

        Bool columns, the ``timestamp`` column and the percentile rows are
        left out of the comparison; the reason for each is given inline.
        """
        # Rows excluded from the comparison (see the TODO further down).
        percentile_rows = ["25%", "50%", "75%"]

        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

        pd_describe = pd_flights.describe()
        # We remove bool columns to match pandas output
        ed_describe = ed_flights.describe().drop(
            ["Cancelled", "FlightDelay"], axis="columns"
        )

        # Pandas >= 2 calculates aggregations such as min and max for timestamps too
        # This could be implemented in eland, but as of yet this is not the case
        # We therefore remove it before the comparison
        if "timestamp" in pd_describe.columns:
            pd_describe = pd_describe.drop(["timestamp"], axis="columns")

        # Pandas >= 2 orders the aggregations differently than Pandas < 2
        # A sort_index is applied so tests will succeed in both environments
        #
        # rtol must be a float: a bool is accepted silently, but True is read
        # as 1.0, i.e. a 100% relative tolerance that lets almost any pair of
        # values compare equal. 1e-3 keeps roughly three significant digits.
        assert_frame_equal(
            pd_describe.drop(percentile_rows, axis="index").sort_index(),
            ed_describe.drop(percentile_rows, axis="index").sort_index(),
            check_exact=False,
            rtol=1e-3,
        )

        # TODO - this fails for percentile fields as ES aggregations are approximate
        #        if ES percentile agg uses
        #        "hdr": {
        #           "number_of_significant_value_digits": 3
        #         }
        #        this works

        # pd_ecommerce_describe = self.pd_ecommerce().describe()
        # ed_ecommerce_describe = self.ed_ecommerce().describe()
        # We don't compare ecommerce here as the default dtypes in pandas from read_json
        # don't match the mapping types. This is mainly because the products field is
        # nested and so can be treated as a multi-field in ES, but not in pandas

        # We also cannot run 'describe' on a truncated ed dataframe