From adafeed667a83261521affeea20299844a049fc0 Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Tue, 13 Oct 2020 12:14:09 -0500 Subject: [PATCH] Add es_dtypes property to DataFrame and Series --- .../api/eland.DataFrame.es_dtypes.rst | 6 +++ .../reference/api/eland.Series.es_dtype.rst | 6 +++ .../reference/api/eland.Series.es_dtypes.rst | 6 +++ .../source/reference/api/eland.ml.MLModel.rst | 15 +++++++ docs/source/reference/dataframe.rst | 13 +++--- docs/source/reference/ml.rst | 4 +- docs/source/reference/series.rst | 18 ++++---- eland/field_mappings.py | 14 +++++++ eland/ndframe.py | 22 ++++++++++ eland/query_compiler.py | 4 ++ eland/series.py | 7 ++++ eland/tests/conftest.py | 6 +++ eland/tests/dataframe/test_dtypes_pytest.py | 41 ++++++++++++++++++- eland/tests/series/test_dtype_pytest.py | 8 ++++ 14 files changed, 152 insertions(+), 18 deletions(-) create mode 100644 docs/source/reference/api/eland.DataFrame.es_dtypes.rst create mode 100644 docs/source/reference/api/eland.Series.es_dtype.rst create mode 100644 docs/source/reference/api/eland.Series.es_dtypes.rst create mode 100644 docs/source/reference/api/eland.ml.MLModel.rst diff --git a/docs/source/reference/api/eland.DataFrame.es_dtypes.rst b/docs/source/reference/api/eland.DataFrame.es_dtypes.rst new file mode 100644 index 0000000..760189d --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.es_dtypes.rst @@ -0,0 +1,6 @@ +eland.DataFrame.es_dtypes +========================= + +.. currentmodule:: eland + +.. autoattribute:: DataFrame.es_dtypes diff --git a/docs/source/reference/api/eland.Series.es_dtype.rst b/docs/source/reference/api/eland.Series.es_dtype.rst new file mode 100644 index 0000000..ca82760 --- /dev/null +++ b/docs/source/reference/api/eland.Series.es_dtype.rst @@ -0,0 +1,6 @@ +eland.Series.es_dtype +===================== + +.. currentmodule:: eland + +.. 
autoattribute:: Series.es_dtype diff --git a/docs/source/reference/api/eland.Series.es_dtypes.rst b/docs/source/reference/api/eland.Series.es_dtypes.rst new file mode 100644 index 0000000..34358e7 --- /dev/null +++ b/docs/source/reference/api/eland.Series.es_dtypes.rst @@ -0,0 +1,6 @@ +eland.Series.es_dtypes +====================== + +.. currentmodule:: eland + +.. autoattribute:: Series.es_dtypes diff --git a/docs/source/reference/api/eland.ml.MLModel.rst b/docs/source/reference/api/eland.ml.MLModel.rst new file mode 100644 index 0000000..c936c84 --- /dev/null +++ b/docs/source/reference/api/eland.ml.MLModel.rst @@ -0,0 +1,15 @@ +eland.ml.MLModel +================ + +.. currentmodule:: eland.ml + +.. autoclass:: MLModel + + +.. + HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + + DataFrame.abs + DataFrame.add diff --git a/docs/source/reference/dataframe.rst b/docs/source/reference/dataframe.rst index 180b7f7..ffd9f32 100644 --- a/docs/source/reference/dataframe.rst +++ b/docs/source/reference/dataframe.rst @@ -12,7 +12,7 @@ Constructor DataFrame -Attributes and underlying data +Attributes and Underlying Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -27,7 +27,7 @@ Attributes and underlying data DataFrame.ndim DataFrame.size -Indexing, iteration +Indexing, Iteration ~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -39,7 +39,7 @@ Indexing, iteration DataFrame.query DataFrame.sample -Function application, GroupBy & window +Function Application, GroupBy & Window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -49,7 +49,7 @@ Function application, GroupBy & window .. _api.dataframe.stats: -Computations / descriptive stats +Computations / Descriptive Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
autosummary:: :toctree: api/ @@ -67,7 +67,7 @@ Computations / descriptive stats DataFrame.sum DataFrame.nunique -Reindexing / selection / label manipulation +Reindexing / Selection / Label Manipulation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -89,8 +89,9 @@ Elasticsearch Functions DataFrame.es_info DataFrame.es_query + DataFrame.es_dtypes -Serialization / IO / conversion +Serialization / IO / Conversion ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ diff --git a/docs/source/reference/ml.rst b/docs/source/reference/ml.rst index e1b0cee..ac16e98 100644 --- a/docs/source/reference/ml.rst +++ b/docs/source/reference/ml.rst @@ -18,8 +18,8 @@ The fastest way to get started with machine learning features is to See `Elasticsearch Machine Learning documentation `_ more details. -ImportedMLModel -~~~~~~~~~~~~~~~ +MLModel +~~~~~~~ .. currentmodule:: eland.ml Constructor diff --git a/docs/source/reference/series.rst b/docs/source/reference/series.rst index b030ada..ac809dd 100644 --- a/docs/source/reference/series.rst +++ b/docs/source/reference/series.rst @@ -1,8 +1,8 @@ .. _api.series: -========= +====== Series -========= +====== .. currentmodule:: eland Constructor @@ -12,7 +12,7 @@ Constructor Series -Attributes and underlying data +Attributes and Underlying Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -26,7 +26,7 @@ Attributes and underlying data Series.ndim Series.size -Indexing, iteration +Indexing, Iteration ~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -35,7 +35,7 @@ Indexing, iteration Series.tail Series.sample -Binary operator functions +Binary Operator Functions ~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -63,7 +63,7 @@ Binary operator functions Series.rmod Series.rpow -Computations / descriptive stats +Computations / Descriptive Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
def es_dtypes(self):
    """
    Return the Elasticsearch field type of each mapped field.

    Returns
    -------
    dtypes: pd.Series
        Index: Display name
        Values: es_dtype as a string
    """
    # Work on a copy: clearing the name (and anything the caller later does
    # to the result) must not leak back into self._mappings_capabilities.
    es_dtypes = self._mappings_capabilities["es_dtype"].copy()

    # Set name of the returned series as None
    es_dtypes.name = None
    return es_dtypes
@property
def es_dtype(self) -> str:
    """
    Return the Elasticsearch type of the underlying data.

    Returns
    -------
    str
        The first entry of the query compiler's ``es_dtypes``
        (presumably the only entry for a Series -- TODO confirm).
    """
    # es_dtypes is indexed by field name, so select by position explicitly:
    # integer keys on a label-indexed pandas Series are deprecated and will
    # eventually be treated as labels.
    return self._query_compiler.es_dtypes.iloc[0]
def test_series_es_dtypes(testdata):
    """Check both Elasticsearch type accessors on the AvgTicketPrice Series."""
    ticket_price = testdata.ed_flights_small().AvgTicketPrice

    # Plural accessor: compared against a pandas Series keyed by field name.
    assert_series_equal(
        ticket_price.es_dtypes,
        pd.Series(data={"AvgTicketPrice": "float"}),
    )

    # Singular accessor: the bare type string.
    assert ticket_price.es_dtype == "float"