Add agg compatibility logic to Field class

Seth Michael Larson, 2020-04-27 15:16:48 -05:00, committed by GitHub
parent 7946eb4daa
commit 15a1977dcf
17 changed files with 490 additions and 348 deletions

View File

@@ -35,4 +35,4 @@ docker run \
   --name eland-test-runner \
   --rm \
   elastic/eland \
-  nox -s test
+  nox -s test-${PYTHON_VERSION}

View File

@ -1,4 +1,4 @@
elasticsearch>=7.0.5 elasticsearch==7.7.0a2
pandas>=1 pandas>=1
matplotlib matplotlib
pytest>=5.2.1 pytest>=5.2.1

View File

@@ -5,8 +5,9 @@
 import re
 import warnings
 from enum import Enum
-from typing import Union, List, Tuple, cast, Callable, Any
+from typing import Union, List, Tuple, cast, Callable, Any, Optional, Dict

+import numpy as np  # type: ignore
 import pandas as pd  # type: ignore
 from elasticsearch import Elasticsearch  # type: ignore

@@ -19,6 +20,23 @@ DEFAULT_PROGRESS_REPORTING_NUM_ROWS = 10000
 DEFAULT_ES_MAX_RESULT_WINDOW = 10000  # index.max_result_window

+with warnings.catch_warnings():
+    warnings.simplefilter("ignore")
+    EMPTY_SERIES_DTYPE = pd.Series().dtype
+
+
+def build_pd_series(
+    data: Dict[str, Any], dtype: Optional[np.dtype] = None, **kwargs: Any
+) -> pd.Series:
+    """Builds a pd.Series while squelching the warning
+    for unspecified dtype on empty series
+    """
+    dtype = dtype or (EMPTY_SERIES_DTYPE if not data else dtype)
+    if dtype is not None:
+        kwargs["dtype"] = dtype
+    return pd.Series(data, **kwargs)
+
+
 def docstring_parameter(*sub: Any) -> Callable[[Any], Any]:
     def dec(obj: Any) -> Any:
         obj.__doc__ = obj.__doc__.format(*sub)
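
Note: a quick sketch of how this helper behaves (standalone, assuming eland is installed):

    import numpy as np
    from eland.common import build_pd_series, EMPTY_SERIES_DTYPE

    # Empty data pins the current pandas default dtype, avoiding the
    # "default dtype will change" DeprecationWarning from pd.Series().
    assert build_pd_series({}).dtype == EMPTY_SERIES_DTYPE

    # An explicit dtype always wins, even for empty data.
    assert build_pd_series({}, dtype=np.int32).dtype == np.int32

    # Non-empty data is passed straight through to pd.Series.
    assert build_pd_series({"a": 1.0}).dtype == np.float64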

View File

@@ -1280,11 +1280,11 @@ class DataFrame(NDFrame):
         Examples
         --------
         >>> df = ed.DataFrame('localhost', 'flights')
-        >>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
-             DistanceKilometers  AvgTicketPrice
-        sum        9.261629e+07    8.204365e+06
-        min        0.000000e+00    1.000205e+02
-        std        4.578263e+03    2.663867e+02
+        >>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']).astype(int)
+             DistanceKilometers  AvgTicketPrice
+        sum            92616288         8204364
+        min                   0             100
+        std                4578             266
         """
         axis = pd.DataFrame._get_axis_number(axis)

View File

@@ -14,6 +14,48 @@ from pandas.core.dtypes.common import (
     is_string_dtype,
 )
 from pandas.core.dtypes.inference import is_list_like
+from typing import NamedTuple, Optional
+
+
+class Field(NamedTuple):
+    """Holds all information on a particular field in the mapping"""
+
+    index: str
+    es_field_name: str
+    is_source: bool
+    es_dtype: str
+    es_date_format: Optional[str]
+    pd_dtype: type
+    is_searchable: bool
+    is_aggregatable: bool
+    is_scripted: bool
+    aggregatable_es_field_name: str
+
+    @property
+    def is_numeric(self) -> bool:
+        return is_integer_dtype(self.pd_dtype) or is_float_dtype(self.pd_dtype)
+
+    @property
+    def is_timestamp(self) -> bool:
+        return is_datetime_or_timedelta_dtype(self.pd_dtype)
+
+    @property
+    def is_bool(self) -> bool:
+        return is_bool_dtype(self.pd_dtype)
+
+    @property
+    def np_dtype(self):
+        return np.dtype(self.pd_dtype)
+
+    def is_es_agg_compatible(self, es_agg):
+        # Cardinality works for all types
+        # Numerics and bools work for all aggs
+        if es_agg == "cardinality" or self.is_numeric or self.is_bool:
+            return True
+        # Timestamps also work for 'min', 'max' and 'avg'
+        if es_agg in {"min", "max", "avg"} and self.is_timestamp:
+            return True
+        return False
+
+
 class FieldMappings:
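
Note: a small illustration of the compatibility rules above. The field values here are hypothetical and the import path is assumed from this file:

    from eland.field_mappings import Field  # module path assumed

    # pd_dtype is given as a dtype string; pandas' is_*_dtype helpers accept these.
    price = Field(
        index="AvgTicketPrice", es_field_name="AvgTicketPrice", is_source=True,
        es_dtype="float", es_date_format=None, pd_dtype="float64",
        is_searchable=True, is_aggregatable=True, is_scripted=False,
        aggregatable_es_field_name="AvgTicketPrice",
    )
    timestamp = price._replace(
        index="timestamp", es_field_name="timestamp", es_dtype="date",
        pd_dtype="datetime64[ns]", aggregatable_es_field_name="timestamp",
    )

    assert price.is_es_agg_compatible("sum")               # numerics: any agg
    assert timestamp.is_es_agg_compatible("max")           # timestamps: min/max/avg
    assert not timestamp.is_es_agg_compatible("sum")       # ...but not sum
    assert timestamp.is_es_agg_compatible("cardinality")   # cardinality: any type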
@@ -40,6 +82,23 @@ class FieldMappings:
         or es_field_name.keyword (if exists) or None
     """

+    ES_DTYPE_TO_PD_DTYPE = {
+        "text": "object",
+        "keyword": "object",
+        "long": "int64",
+        "integer": "int64",
+        "short": "int64",
+        "byte": "int64",
+        "binary": "int64",
+        "double": "float64",
+        "float": "float64",
+        "half_float": "float64",
+        "scaled_float": "float64",
+        "date": "datetime64[ns]",
+        "date_nanos": "datetime64[ns]",
+        "boolean": "bool",
+    }
+
     # the labels for each column (display_name is index)
     column_labels = [
         "es_field_name",
@@ -316,8 +375,8 @@ class FieldMappings:
         # return just source fields (as these are the only ones we display)
         return capability_matrix_df[capability_matrix_df.is_source].sort_index()

-    @staticmethod
-    def _es_dtype_to_pd_dtype(es_dtype):
+    @classmethod
+    def _es_dtype_to_pd_dtype(cls, es_dtype):
         """
         Mapping Elasticsearch types to pandas dtypes
         --------------------------------------------

@@ -332,28 +391,7 @@ class FieldMappings:
         boolean | bool

         TODO - add additional mapping types
         """
-        es_dtype_to_pd_dtype = {
-            "text": "object",
-            "keyword": "object",
-            "long": "int64",
-            "integer": "int64",
-            "short": "int64",
-            "byte": "int64",
-            "binary": "int64",
-            "double": "float64",
-            "float": "float64",
-            "half_float": "float64",
-            "scaled_float": "float64",
-            "date": "datetime64[ns]",
-            "date_nanos": "datetime64[ns]",
-            "boolean": "bool",
-        }
-
-        if es_dtype in es_dtype_to_pd_dtype:
-            return es_dtype_to_pd_dtype[es_dtype]
-
-        # Return 'object' for all unsupported TODO - investigate how different types could be supported
-        return "object"
+        return cls.ES_DTYPE_TO_PD_DTYPE.get(es_dtype, "object")

     @staticmethod
     def _pd_dtype_to_es_dtype(pd_dtype):
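
Note: with the mapping hoisted to a class attribute, the lookup reduces to one `.get()` with an 'object' fallback, e.g.:

    FieldMappings._es_dtype_to_pd_dtype("half_float")  # -> "float64"
    FieldMappings._es_dtype_to_pd_dtype("keyword")     # -> "object"
    FieldMappings._es_dtype_to_pd_dtype("geo_point")   # unsupported -> "object"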
@@ -591,6 +629,14 @@ class FieldMappings:
         pd_dtypes, es_field_names, es_date_formats = self.metric_source_fields()
         return es_field_names

+    def all_source_fields(self):
+        source_fields = []
+        for index, row in self._mappings_capabilities.iterrows():
+            row = row.to_dict()
+            row["index"] = index
+            source_fields.append(Field(**row))
+        return source_fields
+
     def metric_source_fields(self, include_bool=False, include_timestamp=False):
         """
         Returns

View File

@@ -409,6 +409,36 @@ class NDFrame(ABC):
         """
         return self._query_compiler.nunique()

+    def mad(self, numeric_only=True):
+        """
+        Return median absolute deviation for each numeric column
+
+        Returns
+        -------
+        pandas.Series
+            The value of the median absolute deviation for each numeric column
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.DataFrame.mad`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df.mad()  # doctest: +SKIP
+        AvgTicketPrice         213.368709
+        Cancelled                0.000000
+        DistanceKilometers    2946.168236
+        DistanceMiles         1830.987236
+        FlightDelay              0.000000
+        FlightDelayMin           0.000000
+        FlightTimeHour           3.819435
+        FlightTimeMin          229.142297
+        dayOfWeek                2.000000
+        dtype: float64
+        """
+        return self._query_compiler.mad(numeric_only=numeric_only)
+
     def _hist(self, num_bins):
         return self._query_compiler._hist(num_bins)
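
Note: under the hood `mad` maps to Elasticsearch's `median_absolute_deviation` metric aggregation (see `_map_pd_aggs_to_es_aggs` in eland.operations below), so for each numeric column Eland sends, with `size=0`, roughly this search body (a sketch; the field name is illustrative and the exact body is built by `Query.metric_aggs`):

    {
        "aggs": {
            "median_absolute_deviation_AvgTicketPrice": {
                "median_absolute_deviation": {"field": "AvgTicketPrice"}
            }
        }
    }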

View File

@@ -3,13 +3,12 @@
 # See the LICENSE file in the project root for more information

 import copy
+import typing
 import warnings
-from typing import Optional

 import numpy as np
 import pandas as pd
-from pandas.core.dtypes.common import is_datetime_or_timedelta_dtype

 from elasticsearch.helpers import scan

 from eland import Index

@@ -18,6 +17,7 @@ from eland.common import (
     DEFAULT_CSV_BATCH_OUTPUT_SIZE,
     DEFAULT_ES_MAX_RESULT_WINDOW,
     elasticsearch_date_to_pandas_date,
+    build_pd_series,
 )
 from eland.query import Query
 from eland.actions import SortFieldAction

@@ -31,15 +31,8 @@ from eland.tasks import (
     SizeTask,
 )

-with warnings.catch_warnings():
-    warnings.simplefilter("ignore")
-    EMPTY_SERIES_DTYPE = pd.Series().dtype
-
-
-def build_series(data, dtype=None, **kwargs):
-    out_dtype = EMPTY_SERIES_DTYPE if not data else dtype
-    s = pd.Series(data=data, index=data.keys(), dtype=out_dtype, **kwargs)
-    return s
+if typing.TYPE_CHECKING:
+    from eland.query_compiler import QueryCompiler


 class Operations:
@@ -122,45 +115,45 @@ class Operations:
             )["count"]
             counts[field] = field_exists_count

-        return pd.Series(data=counts, index=fields)
+        return build_pd_series(data=counts, index=fields)

     def mean(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(query_compiler, "avg", numeric_only=numeric_only)
+        results = self._metric_aggs(query_compiler, ["mean"], numeric_only=numeric_only)
+        return build_pd_series(results, index=results.keys())

     def var(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(
-            query_compiler, ("extended_stats", "variance"), numeric_only=numeric_only
-        )
+        results = self._metric_aggs(query_compiler, ["var"], numeric_only=numeric_only)
+        return build_pd_series(results, index=results.keys())

     def std(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(
-            query_compiler,
-            ("extended_stats", "std_deviation"),
-            numeric_only=numeric_only,
-        )
+        results = self._metric_aggs(query_compiler, ["std"], numeric_only=numeric_only)
+        return build_pd_series(results, index=results.keys())

     def median(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(
-            query_compiler, ("percentiles", "50.0"), numeric_only=numeric_only
-        )
+        results = self._metric_aggs(
+            query_compiler, ["median"], numeric_only=numeric_only
+        )
+        return build_pd_series(results, index=results.keys())

     def sum(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(query_compiler, "sum", numeric_only=numeric_only)
+        results = self._metric_aggs(query_compiler, ["sum"], numeric_only=numeric_only)
+        return build_pd_series(results, index=results.keys())

     def max(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(
-            query_compiler, "max", numeric_only=numeric_only, keep_original_dtype=True
-        )
+        results = self._metric_aggs(query_compiler, ["max"], numeric_only=numeric_only)
+        return build_pd_series(results, index=results.keys())

     def min(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(
-            query_compiler, "min", numeric_only=numeric_only, keep_original_dtype=True
-        )
+        results = self._metric_aggs(query_compiler, ["min"], numeric_only=numeric_only)
+        return build_pd_series(results, index=results.keys())

     def nunique(self, query_compiler):
-        return self._metric_aggs(
-            query_compiler, "cardinality", field_types="aggregatable"
-        )
+        results = self._metric_aggs(query_compiler, ["nunique"], numeric_only=False)
+        return build_pd_series(results, index=results.keys())
+
+    def mad(self, query_compiler, numeric_only=True):
+        results = self._metric_aggs(query_compiler, ["mad"], numeric_only=numeric_only)
+        return build_pd_series(results, index=results.keys())

     def value_counts(self, query_compiler, es_size):
         return self._terms_aggs(query_compiler, "terms", es_size)
@@ -168,28 +161,7 @@ class Operations:
     def hist(self, query_compiler, bins):
         return self._hist_aggs(query_compiler, bins)

-    def _metric_aggs(
-        self,
-        query_compiler,
-        func,
-        field_types=None,
-        numeric_only=None,
-        keep_original_dtype=False,
-    ):
-        """
-        Parameters
-        ----------
-        field_types: str, default None
-            if `aggregatable` use only field_names whose fields in elasticsearch are aggregatable.
-            If `None`, use only numeric fields.
-        keep_original_dtype : bool, default False
-            if `True` the output values should keep the same domain as the input values, i.e. booleans should be booleans
-
-        Returns
-        -------
-        pandas.Series
-            Series containing results of `func` applied to the field_name(s)
-        """
+    def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=True):
         query_params, post_processing = self._resolve_tasks(query_compiler)

         size = self._size(query_params, post_processing)
@@ -198,152 +170,113 @@ class Operations:
                 f"Can not count field matches if size is set {size}"
             )

+        results = {}
+        fields = query_compiler._mappings.all_source_fields()
+        if numeric_only:
+            fields = [field for field in fields if (field.is_numeric or field.is_bool)]
+
         body = Query(query_params["query"])

-        results = {}
-
-        # some metrics aggs (including cardinality) work on all aggregatable fields
-        # therefore we include an optional all parameter on operations
-        # that call _metric_aggs
-        if field_types == "aggregatable":
-            aggregatable_field_names = (
-                query_compiler._mappings.aggregatable_field_names()
-            )
-            for field in aggregatable_field_names.keys():
-                body.metric_aggs(field, func, field)
+        # Convert pandas aggs to ES equivalent
+        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
+
+        for field in fields:
+            for es_agg in es_aggs:
+                if not field.is_es_agg_compatible(es_agg):
+                    continue
+
+                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
+                if isinstance(es_agg, tuple):
+                    body.metric_aggs(
+                        f"{es_agg[0]}_{field.es_field_name}",
+                        es_agg[0],
+                        field.aggregatable_es_field_name,
+                    )
+                else:
+                    body.metric_aggs(
+                        f"{es_agg}_{field.es_field_name}",
+                        es_agg,
+                        field.aggregatable_es_field_name,
+                    )

         response = query_compiler._client.search(
             index=query_compiler._index_pattern, size=0, body=body.to_search_body()
         )

-        # Results are of the form
-        # "aggregations" : {
-        #   "customer_full_name.keyword" : {
-        #     "value" : 10
-        #   }
-        # }
-
-        # map aggregatable (e.g. x.keyword) to field_name
-        for key, value in aggregatable_field_names.items():
-            results[value] = response["aggregations"][key]["value"]
-        else:
-            if numeric_only:
-                (
-                    pd_dtypes,
-                    source_fields,
-                    date_formats,
-                ) = query_compiler._mappings.metric_source_fields(include_bool=True)
-            else:
-                # The only non-numerics we support are bool and timestamps currently
-                # strings are not supported by metric aggs in ES
-                # TODO - sum isn't supported for Timestamp in pandas - although ES does attempt to do it
-                (
-                    pd_dtypes,
-                    source_fields,
-                    date_formats,
-                ) = query_compiler._mappings.metric_source_fields(
-                    include_bool=True, include_timestamp=True
-                )
-
-            for field in source_fields:
-                if isinstance(func, tuple):
-                    body.metric_aggs(func[0] + "_" + field, func[0], field)
-                else:
-                    body.metric_aggs(field, func, field)
-
-            response = query_compiler._client.search(
-                index=query_compiler._index_pattern, size=0, body=body.to_search_body()
-            )
-
-            # Results are of the form
-            # "aggregations" : {
-            #   "AvgTicketPrice" : {
-            #     "value" : 628.2536888148849
-            #   },
-            #   "timestamp": {
-            #     "value": 1.5165624455644382E12,
-            #     "value_as_string": "2018-01-21T19:20:45.564Z"
-            #   }
-            # }
-            for pd_dtype, field, date_format in zip(
-                pd_dtypes, source_fields, date_formats
-            ):
-                if is_datetime_or_timedelta_dtype(pd_dtype):
-                    results[field] = elasticsearch_date_to_pandas_date(
-                        response["aggregations"][field]["value_as_string"], date_format
-                    )
-                elif keep_original_dtype:
-                    if isinstance(func, tuple):
-                        results = pd_dtype.type(
-                            response["aggregations"][func[0] + "_" + field][func[1]]
-                        )
-                    else:
-                        results[field] = pd_dtype.type(
-                            response["aggregations"][field]["value"]
-                        )
-                else:
-                    if isinstance(func, tuple):
-                        if func[0] == "percentiles":
-                            results[field] = response["aggregations"][
-                                "percentiles_" + field
-                            ]["values"]["50.0"]
-                            # If 0-length dataframe we get None here
-                            if results[field] is None:
-                                results[field] = np.float64(np.NaN)
-                        elif func[1] == "variance":
-                            # pandas computes the sample variance
-                            # Elasticsearch computes the population variance
-                            count = response["aggregations"][func[0] + "_" + field][
-                                "count"
-                            ]
-                            results[field] = response["aggregations"][
-                                func[0] + "_" + field
-                            ][func[1]]
-                            # transform population variance into sample variance
-                            if count <= 1:
-                                results[field] = np.float64(np.NaN)
-                            else:
-                                results[field] = count / (count - 1.0) * results[field]
-                        elif func[1] == "std_deviation":
-                            # pandas computes the sample std
-                            # Elasticsearch computes the population std
-                            count = response["aggregations"][func[0] + "_" + field][
-                                "count"
-                            ]
-                            results[field] = response["aggregations"][
-                                func[0] + "_" + field
-                            ][func[1]]
-                            # transform population std into sample std
-                            # sample_std=\sqrt{\frac{1}{N-1}\sum_{i=1}^N(x_i-\bar{x})^2}
-                            # population_std=\sqrt{\frac{1}{N}\sum_{i=1}^N(x_i-\bar{x})^2}
-                            # sample_std=\sqrt{\frac{N}{N-1}}*population_std
-                            if count <= 1:
-                                results[field] = np.float64(np.NaN)
-                            else:
-                                results[field] = np.sqrt(
-                                    (count / (count - 1.0))
-                                    * results[field]
-                                    * results[field]
-                                )
-                        else:
-                            results[field] = response["aggregations"][
-                                func[0] + "_" + field
-                            ][func[1]]
-                    else:
-                        results[field] = response["aggregations"][field]["value"]
-
-        # Return single value if this is a series
-        # if len(numeric_source_fields) == 1:
-        #    return np.float64(results[numeric_source_fields[0]])
-
-        s = build_series(results)
-
-        return s
+        """
+        Results are like (for 'sum', 'min')
+
+             AvgTicketPrice  DistanceKilometers  DistanceMiles  FlightDelayMin
+        sum    8.204365e+06        9.261629e+07   5.754909e+07          618150
+        min    1.000205e+02        0.000000e+00   0.000000e+00               0
+        """
+        for field in fields:
+            values = []
+            for es_agg, pd_agg in zip(es_aggs, pd_aggs):
+
+                # If the field and agg aren't compatible we add a NaN
+                if not field.is_es_agg_compatible(es_agg):
+                    values.append(np.float64(np.NaN))
+                    continue
+
+                if isinstance(es_agg, tuple):
+                    agg_value = response["aggregations"][
+                        f"{es_agg[0]}_{field.es_field_name}"
+                    ]
+
+                    # Pull multiple values from 'percentiles' result.
+                    if es_agg[0] == "percentiles":
+                        agg_value = agg_value["values"]
+
+                    agg_value = agg_value[es_agg[1]]
+
+                    # Need to convert 'Population' stddev and variance
+                    # from Elasticsearch into 'Sample' stddev and variance
+                    # which is what pandas uses.
+                    if es_agg[1] in ("std_deviation", "variance"):
+                        # Neither transformation works with count <= 1
+                        count = response["aggregations"][
+                            f"{es_agg[0]}_{field.es_field_name}"
+                        ]["count"]
+
+                        # All of the below calculations result in NaN if count <= 1
+                        if count <= 1:
+                            agg_value = np.float64(np.NaN)
+
+                        elif es_agg[1] == "std_deviation":
+                            agg_value *= count / (count - 1.0)
+
+                        else:  # es_agg[1] == "variance"
+                            # sample_std=\sqrt{\frac{1}{N-1}\sum_{i=1}^N(x_i-\bar{x})^2}
+                            # population_std=\sqrt{\frac{1}{N}\sum_{i=1}^N(x_i-\bar{x})^2}
+                            # sample_std=\sqrt{\frac{N}{N-1}}*population_std
+                            agg_value = np.sqrt(
+                                (count / (count - 1.0)) * agg_value * agg_value
+                            )
+                else:
+                    agg_value = response["aggregations"][
+                        f"{es_agg}_{field.es_field_name}"
+                    ]
+                    if "value_as_string" in agg_value and field.is_timestamp:
+                        agg_value = elasticsearch_date_to_pandas_date(
+                            agg_value["value_as_string"], field.es_date_format
+                        )
+                    else:
+                        agg_value = agg_value["value"]
+
+                # These aggregations maintain the column datatype
+                if pd_agg in ("max", "min"):
+                    agg_value = field.np_dtype.type(agg_value)
+
+                # Null usually means there were no results.
+                if agg_value is None:
+                    agg_value = np.float64(np.NaN)
+
+                values.append(agg_value)
+
+            results[field.index] = values if len(values) > 1 else values[0]
+
+        return results

     def _terms_aggs(self, query_compiler, func, es_size=None):
         """
@@ -391,9 +324,7 @@ class Operations:
         except IndexError:
             name = None

-        s = build_series(results, name=name)
-
-        return s
+        return build_pd_series(results, name=name)

     def _hist_aggs(self, query_compiler, num_bins):
         # Get histogram bins and weights for numeric field_names

@@ -409,8 +340,12 @@ class Operations:
         body = Query(query_params["query"])

-        min_aggs = self._metric_aggs(query_compiler, "min", numeric_only=True)
-        max_aggs = self._metric_aggs(query_compiler, "max", numeric_only=True)
+        results = self._metric_aggs(query_compiler, ["min", "max"], numeric_only=True)
+        min_aggs = {}
+        max_aggs = {}
+        for field, (min_agg, max_agg) in results.items():
+            min_aggs[field] = min_agg
+            max_aggs[field] = max_agg

         for field in numeric_source_fields:
             body.hist_aggs(field, field, min_aggs, max_aggs, num_bins)

@@ -476,7 +411,6 @@ class Operations:
         df_bins = pd.DataFrame(data=bins)
         df_weights = pd.DataFrame(data=weights)
-
         return df_bins, df_weights

     @staticmethod
@@ -511,20 +445,42 @@ class Operations:
         var
         nunique
         """
-        ed_aggs = []
+        # pd aggs that will be mapped to es aggs
+        # that can use 'extended_stats'.
+        extended_stats_pd_aggs = {"mean", "min", "max", "count", "sum", "var", "std"}
+        extended_stats_es_aggs = {"avg", "min", "max", "count", "sum"}
+        extended_stats_calls = 0
+
+        es_aggs = []
         for pd_agg in pd_aggs:
+            if pd_agg in extended_stats_pd_aggs:
+                extended_stats_calls += 1
+
+            # Aggs that are 'extended_stats' compatible
             if pd_agg == "count":
-                ed_aggs.append("count")
-            elif pd_agg == "mad":
-                ed_aggs.append("median_absolute_deviation")
+                es_aggs.append("count")
             elif pd_agg == "max":
-                ed_aggs.append("max")
-            elif pd_agg == "mean":
-                ed_aggs.append("avg")
-            elif pd_agg == "median":
-                ed_aggs.append(("percentiles", "50.0"))
+                es_aggs.append("max")
             elif pd_agg == "min":
-                ed_aggs.append("min")
+                es_aggs.append("min")
+            elif pd_agg == "mean":
+                es_aggs.append("avg")
+            elif pd_agg == "sum":
+                es_aggs.append("sum")
+            elif pd_agg == "std":
+                es_aggs.append(("extended_stats", "std_deviation"))
+            elif pd_agg == "var":
+                es_aggs.append(("extended_stats", "variance"))
+
+            # Aggs that aren't 'extended_stats' compatible
+            elif pd_agg == "nunique":
+                es_aggs.append("cardinality")
+            elif pd_agg == "mad":
+                es_aggs.append("median_absolute_deviation")
+            elif pd_agg == "median":
+                es_aggs.append(("percentiles", "50.0"))
+
+            # Not implemented
             elif pd_agg == "mode":
                 # We could do this via top term
                 raise NotImplementedError(pd_agg, " not currently implemented")

@@ -537,77 +493,24 @@ class Operations:
             elif pd_agg == "sem":
                 # TODO
                 raise NotImplementedError(pd_agg, " not currently implemented")
-            elif pd_agg == "sum":
-                ed_aggs.append("sum")
-            elif pd_agg == "std":
-                ed_aggs.append(("extended_stats", "std_deviation"))
-            elif pd_agg == "var":
-                ed_aggs.append(("extended_stats", "variance"))
             else:
                 raise NotImplementedError(pd_agg, " not currently implemented")

-        # TODO - we can optimise extended_stats here as if we have 'count' and 'std' extended_stats would
-        # return both in one call
+        # If two or more aggs compatible with 'extended_stats' are called we can
+        # piggy-back on that single aggregation.
+        if extended_stats_calls >= 2:
+            es_aggs = [
+                ("extended_stats", es_agg)
+                if es_agg in extended_stats_es_aggs
+                else es_agg
+                for es_agg in es_aggs
+            ]

-        return ed_aggs
+        return es_aggs
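
Note: the optimization only kicks in when two or more extended_stats-compatible aggs appear together, e.g. (per the mapping above):

    from eland.operations import Operations

    Operations._map_pd_aggs_to_es_aggs(["min"])
    # -> ['min']  (single call, no piggy-backing)

    Operations._map_pd_aggs_to_es_aggs(["min", "std"])
    # -> [('extended_stats', 'min'), ('extended_stats', 'std_deviation')]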
     def aggs(self, query_compiler, pd_aggs):
-        query_params, post_processing = self._resolve_tasks(query_compiler)
-
-        size = self._size(query_params, post_processing)
-        if size is not None:
-            raise NotImplementedError(
-                f"Can not count field matches if size is set {size}"
-            )
-
-        field_names = query_compiler.get_field_names(include_scripted_fields=False)
-
-        body = Query(query_params["query"])
-
-        # convert pandas aggs to ES equivalent
-        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
-
-        for field in field_names:
-            for es_agg in es_aggs:
-                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
-                if isinstance(es_agg, tuple):
-                    body.metric_aggs(es_agg[0] + "_" + field, es_agg[0], field)
-                else:
-                    body.metric_aggs(es_agg + "_" + field, es_agg, field)
-
-        response = query_compiler._client.search(
-            index=query_compiler._index_pattern, size=0, body=body.to_search_body()
-        )
-
-        """
-        Results are like (for 'sum', 'min')
-
-             AvgTicketPrice  DistanceKilometers  DistanceMiles  FlightDelayMin
-        sum    8.204365e+06        9.261629e+07   5.754909e+07          618150
-        min    1.000205e+02        0.000000e+00   0.000000e+00               0
-        """
-        results = {}
-        for field in field_names:
-            values = list()
-            for es_agg in es_aggs:
-                if isinstance(es_agg, tuple):
-                    agg_value = response["aggregations"][es_agg[0] + "_" + field]
-
-                    # Pull multiple values from 'percentiles' result.
-                    if es_agg[0] == "percentiles":
-                        agg_value = agg_value["values"]
-
-                    values.append(agg_value[es_agg[1]])
-                else:
-                    values.append(
-                        response["aggregations"][es_agg + "_" + field]["value"]
-                    )
-
-            results[field] = values
-
-        df = pd.DataFrame(data=results, index=pd_aggs)
-
-        return df
+        results = self._metric_aggs(query_compiler, pd_aggs, numeric_only=False)
+        return pd.DataFrame(results, index=pd_aggs)

     def describe(self, query_compiler):
         query_params, post_processing = self._resolve_tasks(query_compiler)

View File

@@ -66,13 +66,13 @@ class QueryCompiler:
             self._index_pattern = to_copy._index_pattern
             self._index = Index(self, to_copy._index.index_field)
             self._operations = copy.deepcopy(to_copy._operations)
-            self._mappings = copy.deepcopy(to_copy._mappings)
+            self._mappings: FieldMappings = copy.deepcopy(to_copy._mappings)
         else:
             self._client = ensure_es_client(client)
             self._index_pattern = index_pattern

             # Get and persist mappings, this allows us to correctly
             # map returned types from Elasticsearch to pandas datatypes
-            self._mappings = FieldMappings(
+            self._mappings: FieldMappings = FieldMappings(
                 client=self._client,
                 index_pattern=self._index_pattern,
                 display_names=display_names,

@@ -464,6 +464,9 @@ class QueryCompiler:
     def std(self, numeric_only=None):
         return self._operations.std(self, numeric_only=numeric_only)

+    def mad(self, numeric_only=None):
+        return self._operations.mad(self, numeric_only=numeric_only)
+
     def median(self, numeric_only=None):
         return self._operations.median(self, numeric_only=numeric_only)

View File

@@ -1105,7 +1105,7 @@ class Series(NDFrame):
         Examples
         --------
-        >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
+        >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
         >>> int(s.max())
         1199
         """

@@ -1129,7 +1129,7 @@
         Examples
         --------
-        >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
+        >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
        >>> int(s.mean())
         628
         """

@@ -1153,7 +1153,7 @@
         Examples
         --------
-        >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
+        >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
         >>> int(s.min())
         100
         """

@@ -1177,7 +1177,7 @@
         Examples
         --------
-        >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
+        >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
         >>> int(s.sum())
         8204364
         """

@@ -1186,26 +1186,92 @@
     def nunique(self):
         """
-        Return the sum of the Series values
+        Return the number of unique values in a Series

         Returns
         -------
-        float
-            max value
+        int
+            Number of unique values

         See Also
         --------
-        :pandas_api_docs:`pandas.Series.sum`
+        :pandas_api_docs:`pandas.Series.nunique`

         Examples
         --------
-        >>> s = ed.Series('localhost', 'flights', name='Carrier')
+        >>> s = ed.DataFrame('localhost', 'flights')['Carrier']
         >>> s.nunique()
         4
         """
         results = super().nunique()
         return results.squeeze()

+    def var(self, numeric_only=None):
+        """
+        Return variance for a Series
+
+        Returns
+        -------
+        float
+            var value
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.Series.var`
+
+        Examples
+        --------
+        >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
+        >>> int(s.var())
+        70964
+        """
+        results = super().var(numeric_only=numeric_only)
+        return results.squeeze()
+
+    def std(self, numeric_only=None):
+        """
+        Return standard deviation for a Series
+
+        Returns
+        -------
+        float
+            std value
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.Series.std`
+
+        Examples
+        --------
+        >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
+        >>> int(s.std())
+        266
+        """
+        results = super().std(numeric_only=numeric_only)
+        return results.squeeze()
+
+    def mad(self, numeric_only=None):
+        """
+        Return median absolute deviation for a Series
+
+        Returns
+        -------
+        float
+            mad value
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.Series.mad`
+
+        Examples
+        --------
+        >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
+        >>> int(s.mad())
+        213
+        """
+        results = super().mad(numeric_only=numeric_only)
+        return results.squeeze()
+
     # def values TODO - not implemented as causes current implementation of query to fail

     def to_numpy(self):

View File

@@ -4,11 +4,9 @@
 # File called _pytest for PyCharm compatibility

-import warnings
 import numpy as np
 from pandas.testing import assert_series_equal

-from eland.operations import build_series, EMPTY_SERIES_DTYPE
 from eland.tests.common import TestData
 from eland.tests.common import assert_pandas_eland_frame_equal

@@ -34,9 +32,3 @@ class TestDataFrameDtypes(TestData):
             pd_flights.select_dtypes(include=np.number),
             ed_flights.select_dtypes(include=np.number),
         )
-
-    def test_emtpy_series_dtypes(self):
-        with warnings.catch_warnings(record=True) as w:
-            s = build_series({})
-        assert s.dtype == EMPTY_SERIES_DTYPE
-        assert w == []

View File

@@ -11,7 +11,7 @@ from eland.tests.common import TestData

 class TestDataFrameMetrics(TestData):
     funcs = ["max", "min", "mean", "sum"]
-    extended_funcs = ["var", "std", "median"]
+    extended_funcs = ["median", "mad", "var", "std"]

     def test_flights_metrics(self):
         pd_flights = self.pd_flights()

@@ -29,40 +29,48 @@ class TestDataFrameMetrics(TestData):
         # Test on reduced set of data for more consistent
         # median behaviour + better var, std test for sample vs population
-        pd_flights = pd_flights[pd_flights.DestAirportID == "AMS"]
-        ed_flights = ed_flights[ed_flights.DestAirportID == "AMS"]
+        pd_flights = pd_flights[["AvgTicketPrice"]]
+        ed_flights = ed_flights[["AvgTicketPrice"]]
+
+        import logging
+
+        logger = logging.getLogger("elasticsearch")
+        logger.addHandler(logging.StreamHandler())
+        logger.setLevel(logging.DEBUG)

         for func in self.extended_funcs:
-            pd_metric = getattr(pd_flights, func)(numeric_only=True)
+            pd_metric = getattr(pd_flights, func)(
+                **({"numeric_only": True} if func != "mad" else {})
+            )
             ed_metric = getattr(ed_flights, func)(numeric_only=True)

-            assert_series_equal(
-                pd_metric, ed_metric, check_exact=False, check_less_precise=True
-            )
+            pd_value = pd_metric["AvgTicketPrice"]
+            ed_value = ed_metric["AvgTicketPrice"]
+            assert (ed_value * 0.9) <= pd_value <= (ed_value * 1.1)  # +/-10%

     def test_flights_extended_metrics_nan(self):
         pd_flights = self.pd_flights()
         ed_flights = self.ed_flights()

         # Test on single row to test NaN behaviour of sample std/variance
-        pd_flights_1 = pd_flights[pd_flights.FlightNum == "9HY9SWR"]
-        ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"]
+        pd_flights_1 = pd_flights[pd_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]
+        ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]

         for func in self.extended_funcs:
-            pd_metric = getattr(pd_flights_1, func)(numeric_only=True)
-            ed_metric = getattr(ed_flights_1, func)(numeric_only=True)
+            pd_metric = getattr(pd_flights_1, func)()
+            ed_metric = getattr(ed_flights_1, func)()

             assert_series_equal(
                 pd_metric, ed_metric, check_exact=False, check_less_precise=True
             )

         # Test on zero rows to test NaN behaviour of sample std/variance
-        pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"]
-        ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"]
+        pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
+        ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]

         for func in self.extended_funcs:
-            pd_metric = getattr(pd_flights_0, func)(numeric_only=True)
-            ed_metric = getattr(ed_flights_0, func)(numeric_only=True)
+            pd_metric = getattr(pd_flights_0, func)()
+            ed_metric = getattr(ed_flights_0, func)()

             assert_series_equal(
                 pd_metric, ed_metric, check_exact=False, check_less_precise=True
             )
View File

@@ -0,0 +1,39 @@
+# Licensed to Elasticsearch B.V under one or more agreements.
+# Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+# See the LICENSE file in the project root for more information
+
+from eland.operations import Operations
+
+
+def test_all_aggs():
+    es_aggs = Operations._map_pd_aggs_to_es_aggs(
+        ["min", "max", "mean", "std", "var", "mad", "count", "nunique", "median"]
+    )
+
+    assert es_aggs == [
+        ("extended_stats", "min"),
+        ("extended_stats", "max"),
+        ("extended_stats", "avg"),
+        ("extended_stats", "std_deviation"),
+        ("extended_stats", "variance"),
+        "median_absolute_deviation",
+        ("extended_stats", "count"),
+        "cardinality",
+        ("percentiles", "50.0"),
+    ]
+
+
+def test_extended_stats_optimization():
+    # Tests that when '<agg>' and an 'extended_stats' agg are used together
+    # that ('extended_stats', '<agg>') is used instead of '<agg>'.
+    es_aggs = Operations._map_pd_aggs_to_es_aggs(["count", "nunique"])
+    assert es_aggs == ["count", "cardinality"]
+
+    for pd_agg in ["var", "std"]:
+        extended_es_agg = Operations._map_pd_aggs_to_es_aggs([pd_agg])[0]
+
+        es_aggs = Operations._map_pd_aggs_to_es_aggs([pd_agg, "nunique"])
+        assert es_aggs == [extended_es_agg, "cardinality"]
+
+        es_aggs = Operations._map_pd_aggs_to_es_aggs(["count", pd_agg, "nunique"])
+        assert es_aggs == [("extended_stats", "count"), extended_es_agg, "cardinality"]

View File

@@ -0,0 +1,22 @@
+# Licensed to Elasticsearch B.V under one or more agreements.
+# Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+# See the LICENSE file in the project root for more information
+
+import numpy as np
+import warnings
+
+from eland.common import build_pd_series, EMPTY_SERIES_DTYPE
+
+
+def test_empty_series_dtypes():
+    with warnings.catch_warnings(record=True) as w:
+        s = build_pd_series({})
+    assert s.dtype == EMPTY_SERIES_DTYPE
+    assert w == []
+
+    # Ensure that a passed-in dtype isn't ignored
+    # even if the result is empty.
+    with warnings.catch_warnings(record=True) as w:
+        s = build_pd_series({}, dtype=np.int32)
+    assert np.int32 != EMPTY_SERIES_DTYPE
+    assert s.dtype == np.int32
+    assert w == []

View File

@@ -10,17 +10,24 @@ from eland.tests.common import TestData

 class TestSeriesMetrics(TestData):
-    funcs = ["max", "min", "mean", "sum"]
-    timestamp_funcs = ["max", "min", "mean"]
+    all_funcs = ["max", "min", "mean", "sum", "nunique", "var", "std", "mad"]
+    timestamp_funcs = ["max", "min", "mean", "nunique"]
+
+    def assert_almost_equal_for_agg(self, func, pd_metric, ed_metric):
+        if func in ("nunique", "var", "mad"):
+            np.testing.assert_almost_equal(pd_metric, ed_metric, decimal=-3)
+        else:
+            np.testing.assert_almost_equal(pd_metric, ed_metric, decimal=2)

     def test_flights_metrics(self):
         pd_flights = self.pd_flights()["AvgTicketPrice"]
         ed_flights = self.ed_flights()["AvgTicketPrice"]

-        for func in self.funcs:
+        for func in self.all_funcs:
             pd_metric = getattr(pd_flights, func)()
             ed_metric = getattr(ed_flights, func)()
-            np.testing.assert_almost_equal(pd_metric, ed_metric, decimal=2)
+
+            self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)

     def test_flights_timestamp(self):
         pd_flights = self.pd_flights()["timestamp"]

@@ -29,18 +36,28 @@ class TestSeriesMetrics(TestData):
         for func in self.timestamp_funcs:
             pd_metric = getattr(pd_flights, func)()
             ed_metric = getattr(ed_flights, func)()
-            pd_metric = pd_metric.floor("S")  # floor because pandas mean will have ns
-            assert pd_metric == ed_metric
+
+            if hasattr(pd_metric, "floor"):
+                pd_metric = pd_metric.floor("S")  # floor because pandas mean will have ns
+
+            if func == "nunique":
+                self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)
+            else:
+                assert pd_metric == ed_metric

     def test_ecommerce_selected_non_numeric_source_fields(self):
-        # None of these are numeric
+        # None of these are numeric, will result in NaNs
         column = "category"

         ed_ecommerce = self.ed_ecommerce()[column]

-        for func in self.funcs:
+        for func in self.all_funcs:
+            if func == "nunique":  # nunique never returns 'NaN'
+                continue
+
             ed_metric = getattr(ed_ecommerce, func)()
-            assert ed_metric.empty
+
+            print(func, ed_metric)
+            assert np.isnan(ed_metric)

     def test_ecommerce_selected_all_numeric_source_fields(self):
         # All of these are numeric

@@ -50,9 +67,7 @@ class TestSeriesMetrics(TestData):
         pd_ecommerce = self.pd_ecommerce()[column]
         ed_ecommerce = self.ed_ecommerce()[column]

-        for func in self.funcs:
-            np.testing.assert_almost_equal(
-                getattr(pd_ecommerce, func)(),
-                getattr(ed_ecommerce, func)(),
-                decimal=2,
-            )
+        for func in self.all_funcs:
+            pd_metric = getattr(pd_ecommerce, func)()
+            ed_metric = getattr(ed_ecommerce, func)()
+
+            self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)

View File

@@ -1,4 +1,4 @@
-elasticsearch>=7.6.0
+elasticsearch==7.7.0a2
 pandas>=1
 matplotlib
 pytest>=5.2.1

View File

@@ -1,3 +1,3 @@
-elasticsearch>=7.6.0
+elasticsearch==7.7.0a2
 pandas>=1
 matplotlib

View File

@@ -175,6 +175,6 @@ setup(
     classifiers=CLASSIFIERS,
     keywords="elastic eland pandas python",
     packages=find_packages(include=["eland", "eland.*"]),
-    install_requires=["elasticsearch>=7.6, <8", "pandas>=1", "matplotlib"],
+    install_requires=["elasticsearch==7.7.0a2", "pandas>=1", "matplotlib", "numpy"],
     python_requires=">=3.6",
 )