Add es_dtypes property to DataFrame and Series

This commit is contained in:
Seth Michael Larson 2020-10-13 12:14:09 -05:00 committed by GitHub
parent b7c6c26606
commit adafeed667
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 152 additions and 18 deletions

View File

@ -0,0 +1,6 @@
eland.DataFrame.es_dtypes
=========================
.. currentmodule:: eland
.. autoattribute:: DataFrame.es_dtypes

View File

@ -0,0 +1,6 @@
eland.Series.es_dtype
=====================
.. currentmodule:: eland
.. autoattribute:: Series.es_dtype

View File

@ -0,0 +1,6 @@
eland.Series.es_dtypes
======================
.. currentmodule:: eland
.. autoattribute:: Series.es_dtypes

View File

@ -0,0 +1,15 @@
eland.ml.MLModel
================
.. currentmodule:: eland.ml
.. autoclass:: MLModel
..
HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.
.. autosummary::
:toctree:
DataFrame.abs
DataFrame.add

View File

@ -12,7 +12,7 @@ Constructor
DataFrame
Attributes and underlying data
Attributes and Underlying Data
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@ -27,7 +27,7 @@ Attributes and underlying data
DataFrame.ndim
DataFrame.size
Indexing, iteration
Indexing, Iteration
~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@ -39,7 +39,7 @@ Indexing, iteration
DataFrame.query
DataFrame.sample
Function application, GroupBy & window
Function Application, GroupBy & Window
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@ -49,7 +49,7 @@ Function application, GroupBy & window
.. _api.dataframe.stats:
Computations / descriptive stats
Computations / Descriptive Stats
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@ -67,7 +67,7 @@ Computations / descriptive stats
DataFrame.sum
DataFrame.nunique
Reindexing / selection / label manipulation
Reindexing / Selection / Label Manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@ -89,8 +89,9 @@ Elasticsearch Functions
DataFrame.es_info
DataFrame.es_query
DataFrame.es_dtypes
Serialization / IO / conversion
Serialization / IO / Conversion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/

View File

@ -18,8 +18,8 @@ The fastest way to get started with machine learning features is to
See `Elasticsearch Machine Learning documentation <https://www.elastic.co/guide/en/machine-learning/current/setup.html>`_ for more details.
ImportedMLModel
~~~~~~~~~~~~~~~
MLModel
~~~~~~~
.. currentmodule:: eland.ml
Constructor

View File

@ -1,8 +1,8 @@
.. _api.series:
=========
======
Series
=========
======
.. currentmodule:: eland
Constructor
@ -12,7 +12,7 @@ Constructor
Series
Attributes and underlying data
Attributes and Underlying Data
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@ -26,7 +26,7 @@ Attributes and underlying data
Series.ndim
Series.size
Indexing, iteration
Indexing, Iteration
~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@ -35,7 +35,7 @@ Indexing, iteration
Series.tail
Series.sample
Binary operator functions
Binary Operator Functions
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@ -63,7 +63,7 @@ Binary operator functions
Series.rmod
Series.rpow
Computations / descriptive stats
Computations / Descriptive Stats
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@ -80,7 +80,7 @@ Computations / descriptive stats
Series.nunique
Series.value_counts
Reindexing / selection / label manipulation
Reindexing / Selection / Label Manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@ -100,7 +100,7 @@ Plotting
Series.hist
Serialization / IO / conversion
Serialization / IO / Conversion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
@ -115,3 +115,5 @@ Elasticsearch Functions
:toctree: api/
Series.es_info
Series.es_dtype
Series.es_dtypes

View File

@ -780,6 +780,20 @@ class FieldMappings:
# Convert return from 'str' to 'np.dtype'
return pd_dtypes.apply(lambda x: np.dtype(x))
def es_dtypes(self):
    """
    Return the Elasticsearch field type recorded for each mapped field.

    Returns
    -------
    dtypes: pd.Series
        Index: Display name
        Values: es_dtype as a string
    """
    # Work on a copy: clearing the name on the object handed back by
    # ``self._mappings_capabilities["es_dtype"]`` would modify a Series that
    # pandas may keep for the frame, and returning that object directly would
    # let callers write through to the internal mappings.
    es_dtypes = self._mappings_capabilities["es_dtype"].copy()
    # Set name of the returned series as None
    es_dtypes.name = None
    return es_dtypes
def es_info(self, buf):
buf.write("Mappings:\n")
buf.write(f" capabilities:\n{self._mappings_capabilities.to_string()}\n")

View File

@ -132,6 +132,28 @@ class NDFrame(ABC):
"""
return self._query_compiler.dtypes
@property
def es_dtypes(self):
    """
    Elasticsearch field types of the columns, as held by the query compiler.

    Returns
    -------
    pandas.Series
        The data type of each column.

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'AvgTicketPrice', 'timestamp', 'dayOfWeek'])
    >>> df.es_dtypes
    Origin            keyword
    AvgTicketPrice      float
    timestamp            date
    dayOfWeek            byte
    dtype: object
    """
    compiler = self._query_compiler
    return compiler.es_dtypes
def _build_repr(self, num_rows) -> pd.DataFrame:
# self could be Series or DataFrame
if len(self.index) <= num_rows:

View File

@ -123,6 +123,10 @@ class QueryCompiler:
def dtypes(self):
return self._mappings.dtypes()
@property
def es_dtypes(self):
    """Elasticsearch field types, obtained from the field mappings object."""
    mappings = self._mappings
    return mappings.es_dtypes()
# END Index, columns, and dtypes objects
def _es_results_to_pandas(self, results, batch_size=None, show_progress=False):

View File

@ -435,6 +435,13 @@ class Series(NDFrame):
"""
return self._query_compiler.dtypes[0]
@property
def es_dtype(self) -> str:
    """
    Return the Elasticsearch type of the underlying data.

    Returns
    -------
    str
        The first entry of the query compiler's ``es_dtypes``.
    """
    # ``es_dtypes`` is labelled by field name, so a bare ``[0]`` depends on
    # pandas' deprecated positional fallback for ``Series.__getitem__``.
    # ``.iloc`` requests the first entry by position explicitly.
    return self._query_compiler.es_dtypes.iloc[0]
def __gt__(self, other: Union[int, float, "Series"]) -> BooleanFilter:
if isinstance(other, Series):
# Need to use scripted query to compare to values

View File

@ -29,6 +29,7 @@ from .common import (
_pd_ecommerce,
_ed_flights_small,
_pd_flights_small,
TestData,
)
import eland as ed
@ -146,3 +147,8 @@ def df():
return SymmetricAPIChecker(
ed_obj=_ed_flights_small, pd_obj=_pd_flights_small.copy()
)
@pytest.fixture(scope="session")
def testdata():
    """Session-scoped fixture that hands tests a ``TestData`` instance."""
    data = TestData()
    return data

View File

@ -18,12 +18,12 @@
# File called _pytest for PyCharm compatibility
import numpy as np
import pandas as pd
from eland.tests.common import assert_series_equal
class TestDataFrameDtypes:
def test_dtypes(self, df):
print(df.dtypes)
for i in range(0, len(df.dtypes) - 1):
assert isinstance(df.dtypes[i], type(df.dtypes[i]))
@ -32,3 +32,40 @@ class TestDataFrameDtypes:
df.select_dtypes(exclude=np.number)
df.select_dtypes(include=np.float64)
df.select_dtypes(exclude=np.float64)
def test_es_dtypes(self, testdata):
    """Check ``DataFrame.es_dtypes`` against the expected type of each flights field."""
    actual = testdata.ed_flights_small().es_dtypes

    # Expected Elasticsearch field type per column, in the order the frame
    # is expected to report them.
    expected_types = {
        "AvgTicketPrice": "float",
        "Cancelled": "boolean",
        "Carrier": "keyword",
        "Dest": "keyword",
        "DestAirportID": "keyword",
        "DestCityName": "keyword",
        "DestCountry": "keyword",
        "DestLocation": "geo_point",
        "DestRegion": "keyword",
        "DestWeather": "keyword",
        "DistanceKilometers": "float",
        "DistanceMiles": "float",
        "FlightDelay": "boolean",
        "FlightDelayMin": "integer",
        "FlightDelayType": "keyword",
        "FlightNum": "keyword",
        "FlightTimeHour": "float",
        "FlightTimeMin": "float",
        "Origin": "keyword",
        "OriginAirportID": "keyword",
        "OriginCityName": "keyword",
        "OriginCountry": "keyword",
        "OriginLocation": "geo_point",
        "OriginRegion": "keyword",
        "OriginWeather": "keyword",
        "dayOfWeek": "byte",
        "timestamp": "date",
    }
    expected = pd.Series(expected_types)

    assert_series_equal(actual, expected)

View File

@ -16,8 +16,10 @@
# under the License.
import numpy as np
import pandas as pd
import warnings
from eland.common import build_pd_series, EMPTY_SERIES_DTYPE
from eland.tests.common import assert_series_equal
def test_empty_series_dtypes():
@ -33,3 +35,9 @@ def test_empty_series_dtypes():
assert np.int32 != EMPTY_SERIES_DTYPE
assert s.dtype == np.int32
assert w == []
def test_series_es_dtypes(testdata):
    """Check that a single-column Series exposes both ``es_dtypes`` and ``es_dtype``."""
    ticket_price = testdata.ed_flights_small().AvgTicketPrice
    expected = pd.Series(data={"AvgTicketPrice": "float"})

    assert_series_equal(ticket_price.es_dtypes, expected)
    assert ticket_price.es_dtype == "float"