diff --git a/eland/field_mappings.py b/eland/field_mappings.py index 8c73f92..7908427 100644 --- a/eland/field_mappings.py +++ b/eland/field_mappings.py @@ -403,13 +403,16 @@ class FieldMappings: return es_dtype @staticmethod - def _generate_es_mappings(dataframe, geo_points=None): + def _generate_es_mappings(dataframe, es_type_overrides=None): """Given a pandas dataframe, generate the associated Elasticsearch mapping Parameters ---------- dataframe : pandas.DataFrame pandas.DataFrame to create schema from + es_type_overrides : dict + Dictionary of Elasticsearch types to override defaults for certain fields + (e.g. { 'location': 'geo_point' }) Returns ------- @@ -437,8 +440,8 @@ class FieldMappings: mappings = {"properties": {}} for field_name_name, dtype in dataframe.dtypes.iteritems(): - if geo_points is not None and field_name_name in geo_points: - es_dtype = "geo_point" + if es_type_overrides is not None and field_name_name in es_type_overrides: + es_dtype = es_type_overrides[field_name_name] else: es_dtype = FieldMappings._pd_dtype_to_es_dtype(dtype) diff --git a/eland/tests/dataframe/test_aggs_pytest.py b/eland/tests/dataframe/test_aggs_pytest.py index bc36300..41cb925 100644 --- a/eland/tests/dataframe/test_aggs_pytest.py +++ b/eland/tests/dataframe/test_aggs_pytest.py @@ -15,7 +15,7 @@ # File called _pytest for PyCharm compatability import numpy as np -from pandas.util.testing import assert_almost_equal +from pandas.testing import assert_frame_equal from eland.tests.common import TestData @@ -31,7 +31,7 @@ class TestDataFrameAggs(TestData): # Eland returns all float values for all metric aggs, pandas can return int # TODO - investigate this more pd_sum_min = pd_sum_min.astype("float64") - assert_almost_equal(pd_sum_min, ed_sum_min) + assert_frame_equal(pd_sum_min, ed_sum_min, check_exact=False) pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg( ["sum", "min", "std"] @@ -43,7 +43,9 @@ class TestDataFrameAggs(TestData): print(pd_sum_min_std.dtypes) print(ed_sum_min_std.dtypes) - assert_almost_equal(pd_sum_min_std, ed_sum_min_std, check_less_precise=True) + assert_frame_equal( + pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True + ) def test_terms_aggs(self): pd_flights = self.pd_flights() @@ -55,7 +57,7 @@ class TestDataFrameAggs(TestData): # Eland returns all float values for all metric aggs, pandas can return int # TODO - investigate this more pd_sum_min = pd_sum_min.astype("float64") - assert_almost_equal(pd_sum_min, ed_sum_min) + assert_frame_equal(pd_sum_min, ed_sum_min, check_exact=False) pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg( ["sum", "min", "std"] @@ -67,7 +69,9 @@ class TestDataFrameAggs(TestData): print(pd_sum_min_std.dtypes) print(ed_sum_min_std.dtypes) - assert_almost_equal(pd_sum_min_std, ed_sum_min_std, check_less_precise=True) + assert_frame_equal( + pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True + ) def test_aggs_median_var(self): pd_ecommerce = self.pd_ecommerce() @@ -86,4 +90,4 @@ class TestDataFrameAggs(TestData): # Eland returns all float values for all metric aggs, pandas can return int # TODO - investigate this more pd_aggs = pd_aggs.astype("float64") - assert_almost_equal(pd_aggs, ed_aggs, check_less_precise=2) + assert_frame_equal(pd_aggs, ed_aggs, check_exact=False, check_less_precise=2) diff --git a/eland/tests/dataframe/test_count_pytest.py b/eland/tests/dataframe/test_count_pytest.py index 36fd7c4..0798827 100644 --- a/eland/tests/dataframe/test_count_pytest.py +++ b/eland/tests/dataframe/test_count_pytest.py @@ -14,7 +14,7 @@ # File called _pytest for PyCharm compatability -from pandas.util.testing import assert_series_equal +from pandas.testing import assert_series_equal from eland.tests.common import TestData diff --git a/eland/tests/dataframe/test_datetime_pytest.py b/eland/tests/dataframe/test_datetime_pytest.py index 80637e1..899c21a 100644 --- a/eland/tests/dataframe/test_datetime_pytest.py +++ b/eland/tests/dataframe/test_datetime_pytest.py @@ -17,7 +17,7 @@ from datetime import datetime import numpy as np import pandas as pd -from pandas.util.testing import assert_series_equal +from pandas.testing import assert_series_equal import eland as ed from eland.field_mappings import FieldMappings diff --git a/eland/tests/dataframe/test_describe_pytest.py b/eland/tests/dataframe/test_describe_pytest.py index 8c7c6d3..a4ba96d 100644 --- a/eland/tests/dataframe/test_describe_pytest.py +++ b/eland/tests/dataframe/test_describe_pytest.py @@ -14,7 +14,7 @@ # File called _pytest for PyCharm compatability -from pandas.util.testing import assert_almost_equal +from pandas.testing import assert_frame_equal from eland.tests.common import TestData @@ -27,9 +27,10 @@ class TestDataFrameDescribe(TestData): pd_describe = pd_flights.describe() ed_describe = ed_flights.describe() - assert_almost_equal( + assert_frame_equal( pd_describe.drop(["25%", "50%", "75%"], axis="index"), ed_describe.drop(["25%", "50%", "75%"], axis="index"), + check_exact=False, check_less_precise=True, ) diff --git a/eland/tests/dataframe/test_dtypes_pytest.py b/eland/tests/dataframe/test_dtypes_pytest.py index 54ea88c..1cbb14d 100644 --- a/eland/tests/dataframe/test_dtypes_pytest.py +++ b/eland/tests/dataframe/test_dtypes_pytest.py @@ -15,7 +15,7 @@ # File called _pytest for PyCharm compatability import numpy as np -from pandas.util.testing import assert_series_equal +from pandas.testing import assert_series_equal from eland.tests.common import TestData from eland.tests.common import assert_pandas_eland_frame_equal diff --git a/eland/tests/dataframe/test_hist_pytest.py b/eland/tests/dataframe/test_hist_pytest.py index 943d64f..0ac1682 100644 --- a/eland/tests/dataframe/test_hist_pytest.py +++ b/eland/tests/dataframe/test_hist_pytest.py @@ -16,7 +16,7 @@ import numpy as np import pandas as pd -from pandas.util.testing import assert_almost_equal +from pandas.testing import assert_frame_equal from eland.tests.common import TestData @@ -52,8 +52,8 @@ class TestDataFrameHist(TestData): ]._hist(num_bins=num_bins) # Numbers are slightly different - assert_almost_equal(pd_bins, ed_bins) - assert_almost_equal(pd_weights, ed_weights) + assert_frame_equal(pd_bins, ed_bins, check_exact=False) + assert_frame_equal(pd_weights, ed_weights, check_exact=False) def test_flights_filtered_hist(self): pd_flights = self.pd_flights() @@ -88,5 +88,5 @@ class TestDataFrameHist(TestData): ]._hist(num_bins=num_bins) # Numbers are slightly different - assert_almost_equal(pd_bins, ed_bins) - assert_almost_equal(pd_weights, ed_weights) + assert_frame_equal(pd_bins, ed_bins, check_exact=False) + assert_frame_equal(pd_weights, ed_weights, check_exact=False) diff --git a/eland/tests/dataframe/test_nunique_pytest.py b/eland/tests/dataframe/test_nunique_pytest.py index d023347..79b16c8 100644 --- a/eland/tests/dataframe/test_nunique_pytest.py +++ b/eland/tests/dataframe/test_nunique_pytest.py @@ -14,7 +14,7 @@ # File called _pytest for PyCharm compatability -from pandas.util.testing import assert_series_equal +from pandas.testing import assert_series_equal from eland.tests.common import TestData diff --git a/eland/tests/dataframe/test_to_csv_pytest.py b/eland/tests/dataframe/test_to_csv_pytest.py index aff16d2..162aa29 100644 --- a/eland/tests/dataframe/test_to_csv_pytest.py +++ b/eland/tests/dataframe/test_to_csv_pytest.py @@ -18,7 +18,7 @@ import ast import time import pandas as pd -from pandas.util.testing import assert_frame_equal +from pandas.testing import assert_frame_equal import eland as ed from eland.tests import ES_TEST_CLIENT @@ -81,7 +81,10 @@ class TestDataFrameToCSV(TestData): test_index, index_col=0, es_refresh=True, - es_geo_points=["OriginLocation", "DestLocation"], + es_type_overrides={ + "OriginLocation": "geo_point", + "DestLocation": "geo_point", + }, converters={ "DestLocation": lambda x: ast.literal_eval(x), "OriginLocation": lambda x: ast.literal_eval(x), diff --git a/eland/tests/dataframe/test_utils_pytest.py b/eland/tests/dataframe/test_utils_pytest.py index 9378552..087822c 100644 --- a/eland/tests/dataframe/test_utils_pytest.py +++ b/eland/tests/dataframe/test_utils_pytest.py @@ -78,6 +78,8 @@ class TestDataFrameUtils(TestData): "E": [1.0, 2.0, 3.0], "F": False, "G": [1, 2, 3], + "H": "Long text", # text + "I": "52.36,4.83", # geo point }, index=["0", "1", "2"], ) @@ -92,7 +94,33 @@ class TestDataFrameUtils(TestData): es_if_exists="replace", es_refresh=True, use_pandas_index_for_es_ids=False, + es_type_overrides={"H": "text", "I": "geo_point"}, ) + + # Check types + expected_mapping = { + "test_pandas_to_eland_ignore_index": { + "mappings": { + "properties": { + "A": {"type": "double"}, + "B": {"type": "long"}, + "C": {"type": "keyword"}, + "D": {"type": "date"}, + "E": {"type": "double"}, + "F": {"type": "boolean"}, + "G": {"type": "long"}, + "H": {"type": "text"}, + "I": {"type": "geo_point"}, + } + } + } + } + + mapping = ES_TEST_CLIENT.indices.get_mapping(index_name) + + assert expected_mapping == mapping + + # Convert back to pandas and compare with original pd_df = ed.eland_to_pandas(ed_df) # Compare values excluding index diff --git a/eland/tests/field_mappings/test_dtypes_pytest.py b/eland/tests/field_mappings/test_dtypes_pytest.py index ff4c947..1eee9d2 100644 --- a/eland/tests/field_mappings/test_dtypes_pytest.py +++ b/eland/tests/field_mappings/test_dtypes_pytest.py @@ -13,7 +13,7 @@ # limitations under the License. # File called _pytest for PyCharm compatability -from pandas.util.testing import assert_series_equal +from pandas.testing import assert_series_equal from eland.field_mappings import FieldMappings from eland.tests import ES_TEST_CLIENT, FLIGHTS_INDEX_NAME diff --git a/eland/tests/field_mappings/test_field_name_pd_dtype_pytest.py b/eland/tests/field_mappings/test_field_name_pd_dtype_pytest.py index 4bca07c..d7290bf 100644 --- a/eland/tests/field_mappings/test_field_name_pd_dtype_pytest.py +++ b/eland/tests/field_mappings/test_field_name_pd_dtype_pytest.py @@ -14,7 +14,7 @@ # File called _pytest for PyCharm compatability import pytest -from pandas.util.testing import assert_series_equal +from pandas.testing import assert_series_equal from eland.field_mappings import FieldMappings from eland.tests import FLIGHTS_INDEX_NAME, FLIGHTS_MAPPING diff --git a/eland/tests/field_mappings/test_get_field_names_pytest.py b/eland/tests/field_mappings/test_get_field_names_pytest.py index 169b678..4fae705 100644 --- a/eland/tests/field_mappings/test_get_field_names_pytest.py +++ b/eland/tests/field_mappings/test_get_field_names_pytest.py @@ -14,7 +14,7 @@ import numpy as np import pandas as pd -from pandas.util.testing import assert_index_equal +from pandas.testing import assert_index_equal # File called _pytest for PyCharm compatability from eland.field_mappings import FieldMappings diff --git a/eland/tests/query_compiler/test_get_field_names_pytest.py b/eland/tests/query_compiler/test_get_field_names_pytest.py index 37aac66..657bf08 100644 --- a/eland/tests/query_compiler/test_get_field_names_pytest.py +++ b/eland/tests/query_compiler/test_get_field_names_pytest.py @@ -14,7 +14,7 @@ # File called _pytest for PyCharm compatability import pandas as pd -from pandas.util.testing import assert_index_equal +from pandas.testing import assert_index_equal from eland.tests.common import TestData diff --git a/eland/tests/series/test_hist_pytest.py b/eland/tests/series/test_hist_pytest.py index 9abd8cf..d819162 100644 --- a/eland/tests/series/test_hist_pytest.py +++ b/eland/tests/series/test_hist_pytest.py @@ -17,7 +17,7 @@ import numpy as np import pandas as pd import pytest -from pandas.util.testing import assert_almost_equal +from pandas.testing import assert_frame_equal from eland.tests.common import TestData @@ -39,8 +39,8 @@ class TestSeriesFrameHist(TestData): # Numbers are slightly different print(pd_bins, ed_bins) - assert_almost_equal(pd_bins, ed_bins) - assert_almost_equal(pd_weights, ed_weights) + assert_frame_equal(pd_bins, ed_bins, check_exact=False) + assert_frame_equal(pd_weights, ed_weights, check_exact=False) def test_filtered_hist(self): pd_flights = self.pd_flights() @@ -64,8 +64,8 @@ class TestSeriesFrameHist(TestData): ].FlightDelayMin._hist(num_bins=num_bins) # Numbers are slightly different - assert_almost_equal(pd_bins, ed_bins) - assert_almost_equal(pd_weights, ed_weights) + assert_frame_equal(pd_bins, ed_bins, check_exact=False) + assert_frame_equal(pd_weights, ed_weights, check_exact=False) def test_invalid_hist(self): with pytest.raises(ValueError): diff --git a/eland/tests/series/test_metrics_pytest.py b/eland/tests/series/test_metrics_pytest.py index 44d5d34..c9cd7c9 100644 --- a/eland/tests/series/test_metrics_pytest.py +++ b/eland/tests/series/test_metrics_pytest.py @@ -14,7 +14,7 @@ # File called _pytest for PyCharm compatability -from pandas.util.testing import assert_almost_equal +import numpy as np from eland.tests.common import TestData @@ -30,7 +30,7 @@ class TestSeriesMetrics(TestData): for func in self.funcs: pd_metric = getattr(pd_flights, func)() ed_metric = getattr(ed_flights, func)() - assert_almost_equal(pd_metric, ed_metric, check_less_precise=True) + np.testing.assert_almost_equal(pd_metric, ed_metric, decimal=2) def test_flights_timestamp(self): pd_flights = self.pd_flights()["timestamp"] @@ -40,7 +40,7 @@ class TestSeriesMetrics(TestData): pd_metric = getattr(pd_flights, func)() ed_metric = getattr(ed_flights, func)() pd_metric = pd_metric.floor("S") # floor or pandas mean with have ns - assert_almost_equal(pd_metric, ed_metric, check_less_precise=True) + assert pd_metric == ed_metric def test_ecommerce_selected_non_numeric_source_fields(self): # None of these are numeric @@ -61,8 +61,8 @@ class TestSeriesMetrics(TestData): ed_ecommerce = self.ed_ecommerce()[column] for func in self.funcs: - assert_almost_equal( + np.testing.assert_almost_equal( getattr(pd_ecommerce, func)(), getattr(ed_ecommerce, func)(), - check_less_precise=True, + decimal=2, ) diff --git a/eland/tests/series/test_value_counts_pytest.py b/eland/tests/series/test_value_counts_pytest.py index e0d5762..686290b 100644 --- a/eland/tests/series/test_value_counts_pytest.py +++ b/eland/tests/series/test_value_counts_pytest.py @@ -14,7 +14,7 @@ # File called _pytest for PyCharm compatability import pytest -from pandas.util.testing import assert_series_equal +from pandas.testing import assert_series_equal from eland.tests.common import TestData diff --git a/eland/utils.py b/eland/utils.py index 23b5834..e3e9a8f 100644 --- a/eland/utils.py +++ b/eland/utils.py @@ -56,7 +56,7 @@ def pandas_to_eland( es_if_exists="fail", es_refresh=False, es_dropna=False, - es_geo_points=None, + es_type_overrides=None, chunksize=None, use_pandas_index_for_es_ids=True, ): @@ -83,8 +83,8 @@ def pandas_to_eland( es_dropna: bool, default 'False' * True: Remove missing values (see pandas.Series.dropna) * False: Include missing values - may cause bulk to fail - es_geo_points: list, default None - List of columns to map to geo_point data type + es_type_overrides: dict, default None + Dict of field_name: es_data_type that overrides default es data types chunksize: int, default None Number of pandas.DataFrame rows to read before bulk index into Elasticsearch use_pandas_index_for_es_ids: bool, default 'True' @@ -105,17 +105,18 @@ def pandas_to_eland( ... 'D': pd.Timestamp('20190102'), ... 'E': [1.0, 2.0, 3.0], ... 'F': False, - ... 'G': [1, 2, 3]}, + ... 'G': [1, 2, 3], + ... 'H': 'Long text - to be indexed as es type text'}, ... index=['0', '1', '2']) >>> type(pd_df) >>> pd_df - A B ... F G - 0 3.141 1 ... False 1 - 1 3.141 1 ... False 2 - 2 3.141 1 ... False 3 + A B ... G H + 0 3.141 1 ... 1 Long text - to be indexed as es type text + 1 3.141 1 ... 2 Long text - to be indexed as es type text + 2 3.141 1 ... 3 Long text - to be indexed as es type text - [3 rows x 7 columns] + [3 rows x 8 columns] >>> pd_df.dtypes A float64 B int64 @@ -124,6 +125,7 @@ def pandas_to_eland( E float64 F bool G int64 + H object dtype: object Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`. @@ -135,16 +137,17 @@ def pandas_to_eland( ... 'localhost', ... 'pandas_to_eland', ... es_if_exists="replace", - ... es_refresh=True) + ... es_refresh=True, + ... es_type_overrides={'H':'text'}) # index field 'H' as text not keyword >>> type(ed_df) >>> ed_df - A B ... F G - 0 3.141 1 ... False 1 - 1 3.141 1 ... False 2 - 2 3.141 1 ... False 3 + A B ... G H + 0 3.141 1 ... 1 Long text - to be indexed as es type text + 1 3.141 1 ... 2 Long text - to be indexed as es type text + 2 3.141 1 ... 3 Long text - to be indexed as es type text - [3 rows x 7 columns] + [3 rows x 8 columns] >>> ed_df.dtypes A float64 B int64 @@ -153,6 +156,7 @@ def pandas_to_eland( E float64 F bool G int64 + H object dtype: object See Also @@ -163,7 +167,7 @@ def pandas_to_eland( if chunksize is None: chunksize = DEFAULT_CHUNK_SIZE - mapping = FieldMappings._generate_es_mappings(pd_df, es_geo_points) + mapping = FieldMappings._generate_es_mappings(pd_df, es_type_overrides) es_client = ensure_es_client(es_client) # If table exists, check if_exists parameter @@ -283,7 +287,7 @@ def read_csv( es_if_exists="fail", es_refresh=False, es_dropna=False, - es_geo_points=None, + es_type_overrides=None, sep=",", delimiter=None, # Column and Index Locations and Names @@ -364,8 +368,8 @@ def read_csv( es_dropna: bool, default 'False' * True: Remove missing values (see pandas.Series.dropna) * False: Include missing values - may cause bulk to fail - es_geo_points: list, default None - List of columns to map to geo_point data type + es_type_overrides: dict, default None + Dict of columns: es_type to override default es datatype mappings chunksize number of csv rows to read before bulk index into Elasticsearch @@ -498,7 +502,7 @@ def read_csv( chunksize=chunksize, es_refresh=es_refresh, es_dropna=es_dropna, - es_geo_points=es_geo_points, + es_type_overrides=es_type_overrides, ) first_write = False else: @@ -510,7 +514,7 @@ def read_csv( chunksize=chunksize, es_refresh=es_refresh, es_dropna=es_dropna, - es_geo_points=es_geo_points, + es_type_overrides=es_type_overrides, ) # Now create an eland.DataFrame that references the new index diff --git a/requirements-dev.txt b/requirements-dev.txt index 714063a..e3b1ce7 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ -elasticsearch>=7.0.5 +elasticsearch>=7.6.0 pandas>=1 matplotlib pytest>=5.2.1 diff --git a/requirements.txt b/requirements.txt index 5a35664..f0da4b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -elasticsearch>=7.0.5 +elasticsearch>=7.6.0 pandas>=1 matplotlib