eland/eland/ndframe.py
2020-12-30 14:20:36 -06:00

690 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import sys
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
import pandas as pd # type: ignore
from eland.query_compiler import QueryCompiler
if TYPE_CHECKING:
from elasticsearch import Elasticsearch
from eland.index import Index
"""
NDFrame
---------
Abstract base class for eland.DataFrame and eland.Series.
The underlying data resides in Elasticsearch and the API aligns as much as
possible with pandas APIs.
This allows the eland.DataFrame to access large datasets stored in Elasticsearch,
without storing the dataset in local memory.
Implementation Details
----------------------
Elasticsearch indexes can be configured in many different ways, and these indexes
use data structures that differ from those of pandas.
eland.DataFrame operations that return individual rows (e.g. df.head()) return
_source data. If _source is not enabled, this data is not accessible.
Similarly, only Elasticsearch searchable fields can be searched or filtered, and
only Elasticsearch aggregatable fields can be aggregated or grouped.
"""
class NDFrame(ABC):
    """
    Abstract base shared by eland.DataFrame and eland.Series.

    Every operation is delegated to a ``QueryCompiler`` held in
    ``self._query_compiler``; this class only adapts its results to a
    pandas-like API.
    """

    def __init__(
        self,
        es_client: Optional[
            Union[str, List[str], Tuple[str, ...], "Elasticsearch"]
        ] = None,
        es_index_pattern: Optional[str] = None,
        columns: Optional[List[str]] = None,
        es_index_field: Optional[str] = None,
        _query_compiler: Optional["QueryCompiler"] = None,
    ) -> None:
        """
        pandas.DataFrame/Series like API that proxies into Elasticsearch index(es).

        Parameters
        ----------
        es_client : str, list of str, tuple of str or elasticsearch.Elasticsearch, optional
            Passed to ``QueryCompiler`` as ``client``.
        es_index_pattern : str, optional
            Passed to ``QueryCompiler`` as ``index_pattern``.
        columns : list of str, optional
            Passed to ``QueryCompiler`` as ``display_names``.
        es_index_field : str, optional
            Passed to ``QueryCompiler`` as ``index_field``.
        _query_compiler : QueryCompiler, optional
            An existing query compiler to wrap. When supplied, all other
            arguments are ignored and no new ``QueryCompiler`` is built.
        """
        if _query_compiler is None:
            _query_compiler = QueryCompiler(
                client=es_client,
                index_pattern=es_index_pattern,
                display_names=columns,
                index_field=es_index_field,
            )
        self._query_compiler = _query_compiler

    @property
    def index(self) -> "Index":
        """
        Return eland index referencing Elasticsearch field to index a DataFrame/Series

        Returns
        -------
        eland.Index:
            Note eland.Index has a very limited API compared to pandas.Index

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.index`
        :pandas_api_docs:`pandas.Series.index`

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights')
        >>> assert isinstance(df.index, ed.Index)
        >>> df.index.es_index_field
        '_id'
        >>> s = df['Carrier']
        >>> assert isinstance(s.index, ed.Index)
        >>> s.index.es_index_field
        '_id'
        """
        return self._query_compiler.index

    @property
    def dtypes(self) -> pd.Series:
        """
        Return the pandas dtypes in the DataFrame. Elasticsearch types are mapped
        to pandas dtypes via Mappings._es_dtype_to_pd_dtype.__doc__

        Returns
        -------
        pandas.Series
            The data type of each column.

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.dtypes`

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'AvgTicketPrice', 'timestamp', 'dayOfWeek'])
        >>> df.dtypes
        Origin                    object
        AvgTicketPrice           float64
        timestamp         datetime64[ns]
        dayOfWeek                  int64
        dtype: object
        """
        return self._query_compiler.dtypes

    @property
    def es_dtypes(self) -> pd.Series:
        """
        Return the Elasticsearch dtypes in the index

        Returns
        -------
        pandas.Series
            The data type of each column.

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'AvgTicketPrice', 'timestamp', 'dayOfWeek'])
        >>> df.es_dtypes
        Origin            keyword
        AvgTicketPrice      float
        timestamp            date
        dayOfWeek            byte
        dtype: object
        """
        return self._query_compiler.es_dtypes

    def _build_repr(self, num_rows: int) -> pd.DataFrame:
        """
        Build the pandas object that backs the textual representation.

        If the object has at most ``num_rows`` rows it is materialised in
        full. Otherwise only the first and last rows are fetched (the head
        gets the extra row when ``num_rows`` is odd) and joined together.

        Parameters
        ----------
        num_rows : int
            Maximum number of rows to materialise.

        Returns
        -------
        pandas.DataFrame or pandas.Series
            Whatever ``to_pandas()`` of the concrete subclass returns.
        """
        # self could be Series or DataFrame
        if len(self.index) <= num_rows:
            return self.to_pandas()

        head_rows = int(num_rows / 2) + num_rows % 2
        tail_rows = num_rows - head_rows

        head = self.head(head_rows).to_pandas()
        tail = self.tail(tail_rows).to_pandas()

        # DataFrame.append/Series.append are deprecated (and removed in
        # pandas 2.0); pd.concat is the equivalent, supported spelling.
        return pd.concat([head, tail])

    def __sizeof__(self) -> int:
        """Return the (shallow) size of the wrapped query compiler, not of the data."""
        # Don't default to pandas, just return approximation TODO - make this more accurate
        return sys.getsizeof(self._query_compiler)

    def __len__(self) -> int:
        """Gets the length of the DataFrame.

        Returns:
            Returns an integer length of the DataFrame object.
        """
        return len(self.index)

    def _es_info(self, buf) -> None:
        """Forward ``buf`` to ``QueryCompiler.es_info``; nothing is returned."""
        self._query_compiler.es_info(buf)

    def mean(self, numeric_only: Optional[bool] = None) -> pd.Series:
        """
        Return mean value for each numeric column

        TODO - implement remainder of pandas arguments, currently non-numerics are not supported

        Parameters
        ----------
        numeric_only: {True, False, None} Default is None
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.Series
            mean value for each numeric column

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.mean`

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
        >>> df.mean()  # doctest: +SKIP
        AvgTicketPrice                          628.254
        Cancelled                              0.128494
        dayOfWeek                               2.83598
        timestamp         2018-01-21 19:20:45.564438232
        dtype: object

        >>> df.mean(numeric_only=True)
        AvgTicketPrice    628.253689
        Cancelled           0.128494
        dayOfWeek           2.835975
        dtype: float64

        >>> df.mean(numeric_only=False)  # doctest: +SKIP
        AvgTicketPrice                          628.254
        Cancelled                              0.128494
        dayOfWeek                               2.83598
        timestamp         2018-01-21 19:20:45.564438232
        DestCountry                                 NaN
        dtype: object
        """
        return self._query_compiler.mean(numeric_only=numeric_only)

    def sum(self, numeric_only: Optional[bool] = None) -> pd.Series:
        """
        Return sum for each numeric column

        TODO - implement remainder of pandas arguments, currently non-numerics are not supported

        Parameters
        ----------
        numeric_only: {True, False, None} Default is None
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.Series
            sum for each numeric column

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.sum`

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
        >>> df.sum()  # doctest: +SKIP
        AvgTicketPrice    8.20436e+06
        Cancelled                1678
        dayOfWeek               37035
        dtype: object

        >>> df.sum(numeric_only=True)
        AvgTicketPrice    8.204365e+06
        Cancelled         1.678000e+03
        dayOfWeek         3.703500e+04
        dtype: float64

        >>> df.sum(numeric_only=False)  # doctest: +SKIP
        AvgTicketPrice    8.20436e+06
        Cancelled                1678
        dayOfWeek               37035
        timestamp                 NaT
        DestCountry               NaN
        dtype: object
        """
        return self._query_compiler.sum(numeric_only=numeric_only)

    def min(self, numeric_only: Optional[bool] = None) -> pd.Series:
        """
        Return the minimum value for each numeric column

        TODO - implement remainder of pandas arguments, currently non-numerics are not supported

        Parameters
        ----------
        numeric_only: {True, False, None} Default is None
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.Series
            min value for each numeric column

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.min`

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
        >>> df.min()  # doctest: +SKIP
        AvgTicketPrice                100.021
        Cancelled                       False
        dayOfWeek                           0
        timestamp         2018-01-01 00:00:00
        dtype: object

        >>> df.min(numeric_only=True)
        AvgTicketPrice    100.020531
        Cancelled           0.000000
        dayOfWeek           0.000000
        dtype: float64

        >>> df.min(numeric_only=False)  # doctest: +SKIP
        AvgTicketPrice                100.021
        Cancelled                       False
        dayOfWeek                           0
        timestamp         2018-01-01 00:00:00
        DestCountry                       NaN
        dtype: object
        """
        return self._query_compiler.min(numeric_only=numeric_only)

    def var(self, numeric_only: Optional[bool] = None) -> pd.Series:
        """
        Return variance for each numeric column

        Parameters
        ----------
        numeric_only: {True, False, None} Default is None
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.Series
            The value of the variance for each numeric column

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.var`

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
        >>> df.var()  # doctest: +SKIP
        AvgTicketPrice    70964.570234
        Cancelled             0.111987
        dayOfWeek             3.761279
        dtype: float64

        >>> df.var(numeric_only=True)
        AvgTicketPrice    70964.570234
        Cancelled             0.111987
        dayOfWeek             3.761279
        dtype: float64

        >>> df.var(numeric_only=False)  # doctest: +SKIP
        AvgTicketPrice     70964.6
        Cancelled         0.111987
        dayOfWeek          3.76128
        timestamp              NaT
        DestCountry            NaN
        dtype: object
        """
        return self._query_compiler.var(numeric_only=numeric_only)

    def std(self, numeric_only: Optional[bool] = None) -> pd.Series:
        """
        Return standard deviation for each numeric column

        Parameters
        ----------
        numeric_only: {True, False, None} Default is None
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.Series
            The value of the standard deviation for each numeric column

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.std`

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
        >>> df.std()  # doctest: +SKIP
        AvgTicketPrice    266.407061
        Cancelled           0.334664
        dayOfWeek           1.939513
        dtype: float64

        >>> df.std(numeric_only=True)
        AvgTicketPrice    266.407061
        Cancelled           0.334664
        dayOfWeek           1.939513
        dtype: float64

        >>> df.std(numeric_only=False)  # doctest: +SKIP
        AvgTicketPrice     266.407
        Cancelled         0.334664
        dayOfWeek          1.93951
        timestamp              NaT
        DestCountry            NaN
        dtype: object
        """
        return self._query_compiler.std(numeric_only=numeric_only)

    def median(self, numeric_only: Optional[bool] = None) -> pd.Series:
        """
        Return the median value for each numeric column

        Parameters
        ----------
        numeric_only: {True, False, None} Default is None
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.Series
            median value for each numeric column

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.median`

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
        >>> df.median()  # doctest: +SKIP
        AvgTicketPrice                          640.363
        Cancelled                                 False
        dayOfWeek                                     3
        timestamp         2018-01-21 23:54:06.624776611
        dtype: object

        >>> df.median(numeric_only=True)  # doctest: +SKIP
        AvgTicketPrice    640.362667
        Cancelled           0.000000
        dayOfWeek           3.000000
        dtype: float64

        >>> df.median(numeric_only=False)  # doctest: +SKIP
        AvgTicketPrice                          640.387
        Cancelled                                 False
        dayOfWeek                                     3
        timestamp         2018-01-21 23:54:06.624776611
        DestCountry                                 NaN
        dtype: object
        """
        return self._query_compiler.median(numeric_only=numeric_only)

    def max(self, numeric_only: Optional[bool] = None) -> pd.Series:
        """
        Return the maximum value for each numeric column

        TODO - implement remainder of pandas arguments, currently non-numerics are not supported

        Parameters
        ----------
        numeric_only: {True, False, None} Default is None
            Which datatype to be returned
            - True: Returns all values as float64, NaN/NaT values are removed
            - None: Returns all values as the same dtype where possible, NaN/NaT are removed
            - False: Returns all values as the same dtype where possible, NaN/NaT are preserved

        Returns
        -------
        pandas.Series
            max value for each numeric column

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.max`

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
        >>> df.max()  # doctest: +SKIP
        AvgTicketPrice                1199.73
        Cancelled                        True
        dayOfWeek                           6
        timestamp         2018-02-11 23:50:12
        dtype: object

        >>> df.max(numeric_only=True)
        AvgTicketPrice    1199.729004
        Cancelled            1.000000
        dayOfWeek            6.000000
        dtype: float64

        >>> df.max(numeric_only=False)  # doctest: +SKIP
        AvgTicketPrice                1199.73
        Cancelled                        True
        dayOfWeek                           6
        timestamp         2018-02-11 23:50:12
        DestCountry                       NaN
        dtype: object
        """
        return self._query_compiler.max(numeric_only=numeric_only)

    def nunique(self) -> pd.Series:
        """
        Return cardinality of each field.

        **Note we can only do this for aggregatable Elasticsearch fields - (in general) numeric and keyword
        rather than text fields**

        This method will try and find aggregatable fields if possible if mapping has::

            "customer_first_name" : {
              "type" : "text",
              "fields" : {
                "keyword" : {
                  "type" : "keyword",
                  "ignore_above" : 256
                }
              }
            }

        we will aggregate ``customer_first_name`` columns using ``customer_first_name.keyword``.

        TODO - implement remainder of pandas arguments

        Returns
        -------
        pandas.Series
            cardinality of each column

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.nunique`

        Examples
        --------
        >>> columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user']
        >>> df = ed.DataFrame('localhost', 'ecommerce', columns=columns)
        >>> df.nunique()
        category                6
        currency                1
        customer_birth_date     0
        customer_first_name    46
        user                   46
        dtype: int64
        """
        return self._query_compiler.nunique()

    def mad(self, numeric_only: bool = True) -> pd.Series:
        """
        Return the absolute deviation (MAD) for each numeric column

        NOTE(review): the example values below (e.g. exactly 2 for ``dayOfWeek``)
        look like a *median* absolute deviation rather than pandas' mean absolute
        deviation - confirm against ``QueryCompiler.mad``.

        Parameters
        ----------
        numeric_only: {True, False} Default is True
            Forwarded unchanged to the query compiler. In the examples below,
            True yields a float64 Series of the numeric columns only, while
            False also lists the other columns as NaN/NaT.

        Returns
        -------
        pandas.Series
            The value of the absolute deviation for each numeric column

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.mad`

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
        >>> df.mad()  # doctest: +SKIP
        AvgTicketPrice    213.35497
        dayOfWeek           2.00000
        dtype: float64

        >>> df.mad(numeric_only=True)  # doctest: +SKIP
        AvgTicketPrice    213.473011
        dayOfWeek           2.000000
        dtype: float64

        >>> df.mad(numeric_only=False)  # doctest: +SKIP
        AvgTicketPrice    213.484
        Cancelled             NaN
        dayOfWeek               2
        timestamp             NaT
        DestCountry           NaN
        dtype: object
        """
        return self._query_compiler.mad(numeric_only=numeric_only)

    def _hist(self, num_bins):
        """Return whatever ``QueryCompiler._hist`` produces for ``num_bins`` bins."""
        return self._query_compiler._hist(num_bins)

    def describe(self) -> pd.DataFrame:
        """
        Generate descriptive statistics that summarize the central tendency, dispersion and shape of a
        dataset's distribution, excluding NaN values.

        Analyzes both numeric and object series, as well as DataFrame column sets of mixed data types.
        The output will vary depending on what is provided.

        TODO - add additional arguments (currently only numeric values supported)

        Returns
        -------
        pandas.DataFrame:
            Summary information

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.describe`

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin'])
        >>> df.describe() # ignoring percentiles as they don't generate consistent results
               AvgTicketPrice  FlightDelayMin
        count    13059.000000    13059.000000
        mean       628.253689       47.335171
        std        266.386661       96.743006
        min        100.020531        0.000000
        ...
        ...
        ...
        max       1199.729004      360.000000
        """
        return self._query_compiler.describe()

    @abstractmethod
    def to_pandas(self, show_progress: bool = False) -> pd.DataFrame:
        """Materialise the object as a pandas object; must be provided by subclasses."""
        raise NotImplementedError

    @abstractmethod
    def head(self, n: int = 5) -> "NDFrame":
        """Return the first ``n`` rows; must be provided by subclasses."""
        raise NotImplementedError

    @abstractmethod
    def tail(self, n: int = 5) -> "NDFrame":
        """Return the last ``n`` rows; must be provided by subclasses."""
        raise NotImplementedError

    @abstractmethod
    def sample(
        self,
        n: Optional[int] = None,
        frac: Optional[float] = None,
        random_state: Optional[int] = None,
    ) -> "NDFrame":
        """Return a sample of rows; must be provided by subclasses."""
        raise NotImplementedError

    @property
    def shape(self) -> Tuple[int, ...]:
        """
        Dimensions of the object; not implemented on the base class.

        NOTE(review): unlike ``head``/``tail``/``sample`` this is not marked
        ``@abstractmethod``, so a subclass that does not override it only
        fails when ``shape`` (or ``size``) is accessed.
        """
        raise NotImplementedError

    @property
    def size(self) -> int:
        """
        Return an int representing the number of elements in this object.

        Return the number of rows if Series. Otherwise return the number of rows times number of columns if DataFrame.

        Returns
        -------
        int:
            Number of elements in the object (0 if any dimension is 0, or
            if ``shape`` is empty)

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.size`
        """
        shape = self.shape
        if not shape:
            return 0

        # Plain running product: a zero-length dimension must keep the
        # result at 0 instead of being reset by later dimensions.
        product = 1
        for dim in shape:
            product *= dim
        return product