mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
* Fix test setup to match pandas 2.0 demands * Use the now deprecated _append method (Better solution might exist) * Deal with numeric_only being removed in metrics test * Skip mad metric for other pandas versions * Account for differences between pandas versions in describe methods * Run black * Check Pandas version first * Mirror behaviour of installed Pandas version when running value_counts * Allow passing arguments to the individual asserters * Fix for method _construct_axes_from_arguments no longer existing * Skip mad metric if it does not exist * Account for pandas 2.0 timestamp default behaviour * Deal with empty vs other inferred data types * Account for default datetime precision change * Run Black * Solution for differences in inferred_type only * Fix csv and json issues * Skip two doctests * Passing a set as indexer is no longer allowed * Don't validate output where it differs between Pandas versions in the environment * Update test matrix and packaging metadata * Update version of Python in the docs * Update Python version in demo notebook * Match noxfile * Symmetry * Fix trailing comma in JSON * Revert some changes in setup.py to fix building the documentation * Revert "Revert some changes in setup.py to fix building the documentation" This reverts commit ea9879753129d8d8390b3cbbce57155a8b4fb346. * Use PANDAS_VERSION from eland.common * Still skip the doctest, but make the output pandas 2 instead of 1 * Still skip doctest, but switch to pandas 2 output * Prepare for pandas 3 * Reference the right column * Ignore output in tests but switch to pandas 2 output * Add line comment about NBVAL_IGNORE_OUTPUT * Restore missing line and add stderr cell * Use non-private method instead * Fix indentation and parameter issues * If index is not specified, and pandas 1 is present, set it to True From pandas 2 and upwards, index is set to None by default * Run black * Newer version of black might have different opinions? 
* Add line comment * Remove unused import * Add reason for ignore statement * Add reason for skip --------- Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>
179 lines
5.9 KiB
Python
179 lines
5.9 KiB
Python
# Licensed to Elasticsearch B.V. under one or more contributor
|
|
# license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright
|
|
# ownership. Elasticsearch B.V. licenses this file to you under
|
|
# the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# File called _pytest for PyCharm compatibility
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
import eland as ed
|
|
from eland.field_mappings import FieldMappings
|
|
from tests.common import ES_TEST_CLIENT, TestData, assert_pandas_eland_frame_equal
|
|
|
|
|
|
class TestDataFrameUtils(TestData):
    """Tests for the pandas <-> eland conversion helpers.

    Every test talks to Elasticsearch through the module-level
    ``ES_TEST_CLIENT``; indices created by a test are deleted at its end.
    """

    def test_generate_es_mappings(self):
        """Check generated mappings for a mixed-dtype frame, then round-trip it."""
        df = pd.DataFrame(
            data={
                "A": np.random.rand(3),
                "B": 1,
                "C": "foo",
                "D": pd.Timestamp("20190102"),
                "E": [1.0, 2.0, 3.0],
                "F": False,
                "G": [1, 2, 3],
                "H": pd.Timestamp("20190102", tz="UTC"),
            },
            index=["0", "1", "2"],
        )

        expected_mappings = {
            "mappings": {
                "properties": {
                    "A": {"type": "double"},
                    "B": {"type": "long"},
                    "C": {"type": "keyword"},
                    "D": {"type": "date"},
                    "E": {"type": "double"},
                    "F": {"type": "boolean"},
                    "G": {"type": "long"},
                    "H": {"type": "date"},
                }
            }
        }

        mappings = FieldMappings._generate_es_mappings(df)

        assert expected_mappings == mappings

        # Now create index
        index_name = "eland_test_generate_es_mappings"

        ed_df = ed.pandas_to_eland(
            df, ES_TEST_CLIENT, index_name, es_if_exists="replace", es_refresh=True
        )
        ed_df_head = ed_df.head()

        # Cast both datetime columns to nanosecond resolution before comparing
        # with the eland frame; background on the pandas 2.0 change:
        # https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution
        df["D"] = df["D"].astype("datetime64[ns]")
        df["H"] = (
            df["H"].dt.tz_localize(None).astype("datetime64[ns]").dt.tz_localize("UTC")
        )

        assert_pandas_eland_frame_equal(df, ed_df_head)

        ES_TEST_CLIENT.indices.delete(index=index_name)

    def test_pandas_to_eland_ignore_index(self):
        """Upload without using the pandas index as ES ids and verify the result.

        Checks the index mapping (including the ``text`` / ``geo_point`` type
        overrides), that the data survives the round trip, and that the ids of
        the returned frame are not the original pandas index labels.
        """
        df = pd.DataFrame(
            data={
                "A": np.random.rand(3),
                "B": 1,
                "C": "foo",
                "D": pd.Timestamp("20190102"),
                "E": [1.0, 2.0, 3.0],
                "F": False,
                "G": [1, 2, 3],
                "H": "Long text",  # text
                "I": "52.36,4.83",  # geo point
            },
            index=["0", "1", "2"],
        )

        # Now create index
        index_name = "test_pandas_to_eland_ignore_index"

        ed_df = ed.pandas_to_eland(
            df,
            ES_TEST_CLIENT,
            index_name,
            es_if_exists="replace",
            es_refresh=True,
            use_pandas_index_for_es_ids=False,
            es_type_overrides={"H": "text", "I": "geo_point"},
        )

        # Check types
        expected_mapping = {
            "test_pandas_to_eland_ignore_index": {
                "mappings": {
                    "properties": {
                        "A": {"type": "double"},
                        "B": {"type": "long"},
                        "C": {"type": "keyword"},
                        "D": {"type": "date"},
                        "E": {"type": "double"},
                        "F": {"type": "boolean"},
                        "G": {"type": "long"},
                        "H": {
                            "type": "text",
                            "fields": {"keyword": {"type": "keyword"}},
                        },
                        "I": {"type": "geo_point"},
                    }
                }
            }
        }

        mapping = ES_TEST_CLIENT.indices.get_mapping(index=index_name)

        assert expected_mapping == mapping

        # Convert back to pandas and compare with original
        pd_df = ed.eland_to_pandas(ed_df)

        # Compare values excluding index.
        # ``a.values.all() == b.values.all()`` only compares two booleans
        # ("are all values truthy?") and passes for almost any data, so the
        # frames are compared element-wise instead. Columns are aligned to the
        # source frame and rows are ordered by the unique "G" column so the
        # check does not depend on the order in which documents come back.
        expected_rows = df.sort_values("G").reset_index(drop=True)
        actual_rows = pd_df[list(df.columns)].sort_values("G").reset_index(drop=True)
        assert np.array_equal(expected_rows.values, actual_rows.values)

        # Ensure that index is populated by ES.
        assert not (df.index == pd_df.index).any()

        ES_TEST_CLIENT.indices.delete(index=index_name)

    def tests_to_pandas_performance(self):
        """Smoke-test ``eland_to_pandas`` on the flights frame with progress output."""
        # TODO quantify this
        ed.eland_to_pandas(self.ed_flights(), show_progress=True)

        # This test calls the same method so is redundant
        # assert_pandas_eland_frame_equal(pd_df, self.ed_flights())

    def test_es_type_override_error(self):
        """Type overrides naming columns absent from the frame raise ``KeyError``."""
        df = self.pd_flights().filter(
            ["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
        )

        index_name = "test_es_type_override"

        expected_message = (
            "'DistanceKilometers', 'DistanceMiles' column(s) not in given dataframe"
        )
        with pytest.raises(KeyError) as e:
            ed.pandas_to_eland(
                df,
                ES_TEST_CLIENT,
                index_name,
                es_if_exists="replace",
                es_refresh=True,
                use_pandas_index_for_es_ids=False,
                es_type_overrides={
                    "AvgTicketPrice": "long",
                    "DistanceKilometers": "text",
                    "DistanceMiles": "text",
                },
            )

        # The check has to live after the ``with`` block: statements placed
        # after the raising call inside it never run. ``str()`` of a KeyError
        # is the repr-quoted message, so compare the raw argument instead.
        # NOTE(review): assumes the error carries the message as its only
        # argument -- confirm against the eland implementation.
        assert e.value.args[0] == expected_message

        # The conversion is expected to fail, so the index may never have
        # been created; tolerate its absence during cleanup.
        ES_TEST_CLIENT.indices.delete(index=index_name, ignore_unavailable=True)
|