Allow user to specify es data types in read_csv and pandas_to_eland (#181)

* Allow user to specify es data types in read_csv and pandas_to_eland Also, some minor maintenance modifications: - replaced pandas.util.testing with pandas.testing (required in 1.x) - updated elasticsearch-py requirements to 7.6+ (to support ML code) * linting file
2025-07-11 00:02:14 +08:00 · 2020-04-14 17:04:12 +02:00 · 2020-04-14 17:04:12 +02:00 · 50734f8bd9
commit 50734f8bd9
parent e1cacead44
20 changed files with 103 additions and 60 deletions
--- a/eland/field_mappings.py
+++ b/eland/field_mappings.py
@ -403,13 +403,16 @@ class FieldMappings:
        return es_dtype

    @staticmethod
-    def _generate_es_mappings(dataframe, geo_points=None):
+    def _generate_es_mappings(dataframe, es_type_overrides=None):
        """Given a pandas dataframe, generate the associated Elasticsearch mapping

        Parameters
        ----------
            dataframe : pandas.DataFrame
                pandas.DataFrame to create schema from
+            es_type_overrides : dict
+                Dictionary of Elasticsearch types to override defaults  for certain fields
+                (e.g. { 'location': 'geo_point' })

        Returns
        -------
@ -437,8 +440,8 @@ class FieldMappings:

        mappings = {"properties": {}}
        for field_name_name, dtype in dataframe.dtypes.iteritems():
-            if geo_points is not None and field_name_name in geo_points:
-                es_dtype = "geo_point"
+            if es_type_overrides is not None and field_name_name in es_type_overrides:
+                es_dtype = es_type_overrides[field_name_name]
            else:
                es_dtype = FieldMappings._pd_dtype_to_es_dtype(dtype)

--- a/eland/tests/dataframe/test_aggs_pytest.py
+++ b/eland/tests/dataframe/test_aggs_pytest.py
@ -15,7 +15,7 @@
 # File called _pytest for PyCharm compatability

 import numpy as np
-from pandas.util.testing import assert_almost_equal
+from pandas.testing import assert_frame_equal

 from eland.tests.common import TestData

@ -31,7 +31,7 @@ class TestDataFrameAggs(TestData):
        # Eland returns all float values for all metric aggs, pandas can return int
        # TODO - investigate this more
        pd_sum_min = pd_sum_min.astype("float64")
-        assert_almost_equal(pd_sum_min, ed_sum_min)
+        assert_frame_equal(pd_sum_min, ed_sum_min, check_exact=False)

        pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg(
            ["sum", "min", "std"]
@ -43,7 +43,9 @@ class TestDataFrameAggs(TestData):
        print(pd_sum_min_std.dtypes)
        print(ed_sum_min_std.dtypes)

-        assert_almost_equal(pd_sum_min_std, ed_sum_min_std, check_less_precise=True)
+        assert_frame_equal(
+            pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True
+        )

    def test_terms_aggs(self):
        pd_flights = self.pd_flights()
@ -55,7 +57,7 @@ class TestDataFrameAggs(TestData):
        # Eland returns all float values for all metric aggs, pandas can return int
        # TODO - investigate this more
        pd_sum_min = pd_sum_min.astype("float64")
-        assert_almost_equal(pd_sum_min, ed_sum_min)
+        assert_frame_equal(pd_sum_min, ed_sum_min, check_exact=False)

        pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg(
            ["sum", "min", "std"]
@ -67,7 +69,9 @@ class TestDataFrameAggs(TestData):
        print(pd_sum_min_std.dtypes)
        print(ed_sum_min_std.dtypes)

-        assert_almost_equal(pd_sum_min_std, ed_sum_min_std, check_less_precise=True)
+        assert_frame_equal(
+            pd_sum_min_std, ed_sum_min_std, check_exact=False, check_less_precise=True
+        )

    def test_aggs_median_var(self):
        pd_ecommerce = self.pd_ecommerce()
@ -86,4 +90,4 @@ class TestDataFrameAggs(TestData):
        # Eland returns all float values for all metric aggs, pandas can return int
        # TODO - investigate this more
        pd_aggs = pd_aggs.astype("float64")
-        assert_almost_equal(pd_aggs, ed_aggs, check_less_precise=2)
+        assert_frame_equal(pd_aggs, ed_aggs, check_exact=False, check_less_precise=2)
--- a/eland/tests/dataframe/test_count_pytest.py
+++ b/eland/tests/dataframe/test_count_pytest.py
@ -14,7 +14,7 @@

 # File called _pytest for PyCharm compatability

-from pandas.util.testing import assert_series_equal
+from pandas.testing import assert_series_equal

 from eland.tests.common import TestData

--- a/eland/tests/dataframe/test_datetime_pytest.py
+++ b/eland/tests/dataframe/test_datetime_pytest.py
@ -17,7 +17,7 @@ from datetime import datetime

 import numpy as np
 import pandas as pd
-from pandas.util.testing import assert_series_equal
+from pandas.testing import assert_series_equal

 import eland as ed
 from eland.field_mappings import FieldMappings
--- a/eland/tests/dataframe/test_describe_pytest.py
+++ b/eland/tests/dataframe/test_describe_pytest.py
@ -14,7 +14,7 @@

 # File called _pytest for PyCharm compatability

-from pandas.util.testing import assert_almost_equal
+from pandas.testing import assert_frame_equal

 from eland.tests.common import TestData

@ -27,9 +27,10 @@ class TestDataFrameDescribe(TestData):
        pd_describe = pd_flights.describe()
        ed_describe = ed_flights.describe()

-        assert_almost_equal(
+        assert_frame_equal(
            pd_describe.drop(["25%", "50%", "75%"], axis="index"),
            ed_describe.drop(["25%", "50%", "75%"], axis="index"),
+            check_exact=False,
            check_less_precise=True,
        )

--- a/eland/tests/dataframe/test_dtypes_pytest.py
+++ b/eland/tests/dataframe/test_dtypes_pytest.py
@ -15,7 +15,7 @@
 # File called _pytest for PyCharm compatability

 import numpy as np
-from pandas.util.testing import assert_series_equal
+from pandas.testing import assert_series_equal

 from eland.tests.common import TestData
 from eland.tests.common import assert_pandas_eland_frame_equal
--- a/eland/tests/dataframe/test_hist_pytest.py
+++ b/eland/tests/dataframe/test_hist_pytest.py
@ -16,7 +16,7 @@

 import numpy as np
 import pandas as pd
-from pandas.util.testing import assert_almost_equal
+from pandas.testing import assert_frame_equal

 from eland.tests.common import TestData

@ -52,8 +52,8 @@ class TestDataFrameHist(TestData):
        ]._hist(num_bins=num_bins)

        # Numbers are slightly different
-        assert_almost_equal(pd_bins, ed_bins)
-        assert_almost_equal(pd_weights, ed_weights)
+        assert_frame_equal(pd_bins, ed_bins, check_exact=False)
+        assert_frame_equal(pd_weights, ed_weights, check_exact=False)

    def test_flights_filtered_hist(self):
        pd_flights = self.pd_flights()
@ -88,5 +88,5 @@ class TestDataFrameHist(TestData):
        ]._hist(num_bins=num_bins)

        # Numbers are slightly different
-        assert_almost_equal(pd_bins, ed_bins)
-        assert_almost_equal(pd_weights, ed_weights)
+        assert_frame_equal(pd_bins, ed_bins, check_exact=False)
+        assert_frame_equal(pd_weights, ed_weights, check_exact=False)
--- a/eland/tests/dataframe/test_nunique_pytest.py
+++ b/eland/tests/dataframe/test_nunique_pytest.py
@ -14,7 +14,7 @@

 # File called _pytest for PyCharm compatability

-from pandas.util.testing import assert_series_equal
+from pandas.testing import assert_series_equal

 from eland.tests.common import TestData

--- a/eland/tests/dataframe/test_to_csv_pytest.py
+++ b/eland/tests/dataframe/test_to_csv_pytest.py
@ -18,7 +18,7 @@ import ast
 import time

 import pandas as pd
-from pandas.util.testing import assert_frame_equal
+from pandas.testing import assert_frame_equal

 import eland as ed
 from eland.tests import ES_TEST_CLIENT
@ -81,7 +81,10 @@ class TestDataFrameToCSV(TestData):
            test_index,
            index_col=0,
            es_refresh=True,
-            es_geo_points=["OriginLocation", "DestLocation"],
+            es_type_overrides={
+                "OriginLocation": "geo_point",
+                "DestLocation": "geo_point",
+            },
            converters={
                "DestLocation": lambda x: ast.literal_eval(x),
                "OriginLocation": lambda x: ast.literal_eval(x),
--- a/eland/tests/dataframe/test_utils_pytest.py
+++ b/eland/tests/dataframe/test_utils_pytest.py
@ -78,6 +78,8 @@ class TestDataFrameUtils(TestData):
                "E": [1.0, 2.0, 3.0],
                "F": False,
                "G": [1, 2, 3],
+                "H": "Long text",  # text
+                "I": "52.36,4.83",  # geo point
            },
            index=["0", "1", "2"],
        )
@ -92,7 +94,33 @@ class TestDataFrameUtils(TestData):
            es_if_exists="replace",
            es_refresh=True,
            use_pandas_index_for_es_ids=False,
+            es_type_overrides={"H": "text", "I": "geo_point"},
        )
+
+        # Check types
+        expected_mapping = {
+            "test_pandas_to_eland_ignore_index": {
+                "mappings": {
+                    "properties": {
+                        "A": {"type": "double"},
+                        "B": {"type": "long"},
+                        "C": {"type": "keyword"},
+                        "D": {"type": "date"},
+                        "E": {"type": "double"},
+                        "F": {"type": "boolean"},
+                        "G": {"type": "long"},
+                        "H": {"type": "text"},
+                        "I": {"type": "geo_point"},
+                    }
+                }
+            }
+        }
+
+        mapping = ES_TEST_CLIENT.indices.get_mapping(index_name)
+
+        assert expected_mapping == mapping
+
+        # Convert back to pandas and compare with original
        pd_df = ed.eland_to_pandas(ed_df)

        # Compare values excluding index
--- a/eland/tests/field_mappings/test_dtypes_pytest.py
+++ b/eland/tests/field_mappings/test_dtypes_pytest.py
@ -13,7 +13,7 @@
 # limitations under the License.

 # File called _pytest for PyCharm compatability
-from pandas.util.testing import assert_series_equal
+from pandas.testing import assert_series_equal

 from eland.field_mappings import FieldMappings
 from eland.tests import ES_TEST_CLIENT, FLIGHTS_INDEX_NAME
--- a/eland/tests/field_mappings/test_field_name_pd_dtype_pytest.py
+++ b/eland/tests/field_mappings/test_field_name_pd_dtype_pytest.py
@ -14,7 +14,7 @@

 # File called _pytest for PyCharm compatability
 import pytest
-from pandas.util.testing import assert_series_equal
+from pandas.testing import assert_series_equal

 from eland.field_mappings import FieldMappings
 from eland.tests import FLIGHTS_INDEX_NAME, FLIGHTS_MAPPING
--- a/eland/tests/field_mappings/test_get_field_names_pytest.py
+++ b/eland/tests/field_mappings/test_get_field_names_pytest.py
@ -14,7 +14,7 @@

 import numpy as np
 import pandas as pd
-from pandas.util.testing import assert_index_equal
+from pandas.testing import assert_index_equal

 # File called _pytest for PyCharm compatability
 from eland.field_mappings import FieldMappings
--- a/eland/tests/query_compiler/test_get_field_names_pytest.py
+++ b/eland/tests/query_compiler/test_get_field_names_pytest.py
@ -14,7 +14,7 @@

 # File called _pytest for PyCharm compatability
 import pandas as pd
-from pandas.util.testing import assert_index_equal
+from pandas.testing import assert_index_equal

 from eland.tests.common import TestData

--- a/eland/tests/series/test_hist_pytest.py
+++ b/eland/tests/series/test_hist_pytest.py
@ -17,7 +17,7 @@
 import numpy as np
 import pandas as pd
 import pytest
-from pandas.util.testing import assert_almost_equal
+from pandas.testing import assert_frame_equal

 from eland.tests.common import TestData

@ -39,8 +39,8 @@ class TestSeriesFrameHist(TestData):

        # Numbers are slightly different
        print(pd_bins, ed_bins)
-        assert_almost_equal(pd_bins, ed_bins)
-        assert_almost_equal(pd_weights, ed_weights)
+        assert_frame_equal(pd_bins, ed_bins, check_exact=False)
+        assert_frame_equal(pd_weights, ed_weights, check_exact=False)

    def test_filtered_hist(self):
        pd_flights = self.pd_flights()
@ -64,8 +64,8 @@ class TestSeriesFrameHist(TestData):
        ].FlightDelayMin._hist(num_bins=num_bins)

        # Numbers are slightly different
-        assert_almost_equal(pd_bins, ed_bins)
-        assert_almost_equal(pd_weights, ed_weights)
+        assert_frame_equal(pd_bins, ed_bins, check_exact=False)
+        assert_frame_equal(pd_weights, ed_weights, check_exact=False)

    def test_invalid_hist(self):
        with pytest.raises(ValueError):
--- a/eland/tests/series/test_metrics_pytest.py
+++ b/eland/tests/series/test_metrics_pytest.py
@ -14,7 +14,7 @@

 # File called _pytest for PyCharm compatability

-from pandas.util.testing import assert_almost_equal
+import numpy as np

 from eland.tests.common import TestData

@ -30,7 +30,7 @@ class TestSeriesMetrics(TestData):
        for func in self.funcs:
            pd_metric = getattr(pd_flights, func)()
            ed_metric = getattr(ed_flights, func)()
-            assert_almost_equal(pd_metric, ed_metric, check_less_precise=True)
+            np.testing.assert_almost_equal(pd_metric, ed_metric, decimal=2)

    def test_flights_timestamp(self):
        pd_flights = self.pd_flights()["timestamp"]
@ -40,7 +40,7 @@ class TestSeriesMetrics(TestData):
            pd_metric = getattr(pd_flights, func)()
            ed_metric = getattr(ed_flights, func)()
            pd_metric = pd_metric.floor("S")  # floor or pandas mean with have ns
-            assert_almost_equal(pd_metric, ed_metric, check_less_precise=True)
+            assert pd_metric == ed_metric

    def test_ecommerce_selected_non_numeric_source_fields(self):
        # None of these are numeric
@ -61,8 +61,8 @@ class TestSeriesMetrics(TestData):
            ed_ecommerce = self.ed_ecommerce()[column]

            for func in self.funcs:
-                assert_almost_equal(
+                np.testing.assert_almost_equal(
                    getattr(pd_ecommerce, func)(),
                    getattr(ed_ecommerce, func)(),
-                    check_less_precise=True,
+                    decimal=2,
                )
--- a/eland/tests/series/test_value_counts_pytest.py
+++ b/eland/tests/series/test_value_counts_pytest.py
@ -14,7 +14,7 @@

 # File called _pytest for PyCharm compatability
 import pytest
-from pandas.util.testing import assert_series_equal
+from pandas.testing import assert_series_equal

 from eland.tests.common import TestData

--- a/eland/utils.py
+++ b/eland/utils.py
@ -56,7 +56,7 @@ def pandas_to_eland(
    es_if_exists="fail",
    es_refresh=False,
    es_dropna=False,
-    es_geo_points=None,
+    es_type_overrides=None,
    chunksize=None,
    use_pandas_index_for_es_ids=True,
 ):
@ -83,8 +83,8 @@ def pandas_to_eland(
    es_dropna: bool, default 'False'
        * True: Remove missing values (see pandas.Series.dropna)
        * False: Include missing values - may cause bulk to fail
-    es_geo_points: list, default None
-        List of columns to map to geo_point data type
+    es_type_overrides: dict, default None
+        Dict of field_name: es_data_type that overrides default es data types
    chunksize: int, default None
        Number of pandas.DataFrame rows to read before bulk index into Elasticsearch
    use_pandas_index_for_es_ids: bool, default 'True'
@ -105,17 +105,18 @@ def pandas_to_eland(
    ...                            'D': pd.Timestamp('20190102'),
    ...                            'E': [1.0, 2.0, 3.0],
    ...                            'F': False,
-    ...                            'G': [1, 2, 3]},
+    ...                            'G': [1, 2, 3],
+    ...                            'H': 'Long text - to be indexed as es type text'},
    ...                      index=['0', '1', '2'])
    >>> type(pd_df)
    <class 'pandas.core.frame.DataFrame'>
    >>> pd_df
-           A  B  ...      F  G
-    0  3.141  1  ...  False  1
-    1  3.141  1  ...  False  2
-    2  3.141  1  ...  False  3
+           A  B  ...  G                                          H
+    0  3.141  1  ...  1  Long text - to be indexed as es type text
+    1  3.141  1  ...  2  Long text - to be indexed as es type text
+    2  3.141  1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
-    [3 rows x 7 columns]
+    [3 rows x 8 columns]
    >>> pd_df.dtypes
    A           float64
    B             int64
@ -124,6 +125,7 @@ def pandas_to_eland(
    E           float64
    F              bool
    G             int64
+    H            object
    dtype: object

    Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`.
@ -135,16 +137,17 @@ def pandas_to_eland(
    ...                            'localhost',
    ...                            'pandas_to_eland',
    ...                            es_if_exists="replace",
-    ...                            es_refresh=True)
+    ...                            es_refresh=True,
+    ...                            es_type_overrides={'H':'text'}) # index field 'H' as text not keyword
    >>> type(ed_df)
    <class 'eland.dataframe.DataFrame'>
    >>> ed_df
-           A  B  ...      F  G
-    0  3.141  1  ...  False  1
-    1  3.141  1  ...  False  2
-    2  3.141  1  ...  False  3
+           A  B  ...  G                                          H
+    0  3.141  1  ...  1  Long text - to be indexed as es type text
+    1  3.141  1  ...  2  Long text - to be indexed as es type text
+    2  3.141  1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
-    [3 rows x 7 columns]
+    [3 rows x 8 columns]
    >>> ed_df.dtypes
    A           float64
    B             int64
@ -153,6 +156,7 @@ def pandas_to_eland(
    E           float64
    F              bool
    G             int64
+    H            object
    dtype: object

    See Also
@ -163,7 +167,7 @@ def pandas_to_eland(
    if chunksize is None:
        chunksize = DEFAULT_CHUNK_SIZE

-    mapping = FieldMappings._generate_es_mappings(pd_df, es_geo_points)
+    mapping = FieldMappings._generate_es_mappings(pd_df, es_type_overrides)
    es_client = ensure_es_client(es_client)

    # If table exists, check if_exists parameter
@ -283,7 +287,7 @@ def read_csv(
    es_if_exists="fail",
    es_refresh=False,
    es_dropna=False,
-    es_geo_points=None,
+    es_type_overrides=None,
    sep=",",
    delimiter=None,
    # Column and Index Locations and Names
@ -364,8 +368,8 @@ def read_csv(
    es_dropna: bool, default 'False'
        * True: Remove missing values (see pandas.Series.dropna)
        * False: Include missing values - may cause bulk to fail
-    es_geo_points: list, default None
-        List of columns to map to geo_point data type
+    es_type_overrides: dict, default None
+        Dict of columns: es_type to override default es datatype mappings
    chunksize
        number of csv rows to read before bulk index into Elasticsearch

@ -498,7 +502,7 @@ def read_csv(
                chunksize=chunksize,
                es_refresh=es_refresh,
                es_dropna=es_dropna,
-                es_geo_points=es_geo_points,
+                es_type_overrides=es_type_overrides,
            )
            first_write = False
        else:
@ -510,7 +514,7 @@ def read_csv(
                chunksize=chunksize,
                es_refresh=es_refresh,
                es_dropna=es_dropna,
-                es_geo_points=es_geo_points,
+                es_type_overrides=es_type_overrides,
            )

    # Now create an eland.DataFrame that references the new index
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -1,4 +1,4 @@
-elasticsearch>=7.0.5
+elasticsearch>=7.6.0
 pandas>=1
 matplotlib
 pytest>=5.2.1
--- a/requirements.txt
+++ b/requirements.txt
@ -1,3 +1,3 @@
-elasticsearch>=7.0.5
+elasticsearch>=7.6.0
 pandas>=1
 matplotlib