Add agg compatibility logic to Field class

This commit is contained in:
Seth Michael Larson 2020-04-27 15:16:48 -05:00 committed by GitHub
parent 7946eb4daa
commit 15a1977dcf
17 changed files with 490 additions and 348 deletions

View File

@ -35,4 +35,4 @@ docker run \
--name eland-test-runner \
--rm \
elastic/eland \
nox -s test
nox -s test-${PYTHON_VERSION}

View File

@ -1,4 +1,4 @@
elasticsearch>=7.0.5
elasticsearch==7.7.0a2
pandas>=1
matplotlib
pytest>=5.2.1

View File

@ -5,8 +5,9 @@
import re
import warnings
from enum import Enum
from typing import Union, List, Tuple, cast, Callable, Any
from typing import Union, List, Tuple, cast, Callable, Any, Optional, Dict
import numpy as np # type: ignore
import pandas as pd # type: ignore
from elasticsearch import Elasticsearch # type: ignore
@ -19,6 +20,23 @@ DEFAULT_PROGRESS_REPORTING_NUM_ROWS = 10000
DEFAULT_ES_MAX_RESULT_WINDOW = 10000 # index.max_result_window
with warnings.catch_warnings():
warnings.simplefilter("ignore")
EMPTY_SERIES_DTYPE = pd.Series().dtype
def build_pd_series(
data: Dict[str, Any], dtype: Optional[np.dtype] = None, **kwargs: Any
) -> pd.Series:
"""Builds a pd.Series while squelching the warning
for unspecified dtype on empty series
"""
dtype = dtype or (EMPTY_SERIES_DTYPE if not data else dtype)
if dtype is not None:
kwargs["dtype"] = dtype
return pd.Series(data, **kwargs)
def docstring_parameter(*sub: Any) -> Callable[[Any], Any]:
def dec(obj: Any) -> Any:
obj.__doc__ = obj.__doc__.format(*sub)
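To make the new helper's behaviour concrete, here is a minimal usage sketch (an editorial addition, not part of the commit); it assumes this version of eland and mirrors the test added further down in this changeset: an empty mapping falls back to the pre-pandas-1.0 default dtype without emitting the deprecation warning, and an explicit dtype is honoured even for empty data.

import numpy as np
from eland.common import build_pd_series, EMPTY_SERIES_DTYPE

empty = build_pd_series({})                    # no "empty Series dtype" warning
assert empty.dtype == EMPTY_SERIES_DTYPE

typed = build_pd_series({}, dtype=np.int32)    # explicit dtype wins, even when empty
assert typed.dtype == np.int32

filled = build_pd_series({"AvgTicketPrice": 628.25, "dayOfWeek": 2.94})
assert filled.dtype == np.float64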

View File

@ -1280,11 +1280,11 @@ class DataFrame(NDFrame):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
>>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']).astype(int)
DistanceKilometers AvgTicketPrice
sum 9.261629e+07 8.204365e+06
min 0.000000e+00 1.000205e+02
std 4.578263e+03 2.663867e+02
sum 92616288 8204364
min 0 100
std 4578 266
"""
axis = pd.DataFrame._get_axis_number(axis)

View File

@ -14,6 +14,48 @@ from pandas.core.dtypes.common import (
is_string_dtype,
)
from pandas.core.dtypes.inference import is_list_like
from typing import NamedTuple, Optional
class Field(NamedTuple):
"""Holds all information on a particular field in the mapping"""
index: str
es_field_name: str
is_source: bool
es_dtype: str
es_date_format: Optional[str]
pd_dtype: type
is_searchable: bool
is_aggregatable: bool
is_scripted: bool
aggregatable_es_field_name: str
@property
def is_numeric(self) -> bool:
return is_integer_dtype(self.pd_dtype) or is_float_dtype(self.pd_dtype)
@property
def is_timestamp(self) -> bool:
return is_datetime_or_timedelta_dtype(self.pd_dtype)
@property
def is_bool(self) -> bool:
return is_bool_dtype(self.pd_dtype)
@property
def np_dtype(self):
return np.dtype(self.pd_dtype)
def is_es_agg_compatible(self, es_agg):
# Cardinality works for all types
# Numerics and bools work for all aggs
if es_agg == "cardinality" or self.is_numeric or self.is_bool:
return True
# Timestamps also work for 'min', 'max' and 'avg'
if es_agg in {"min", "max", "avg"} and self.is_timestamp:
return True
return False
class FieldMappings:
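To illustrate the compatibility rules encoded in Field.is_es_agg_compatible (an editorial sketch, not part of the commit; the field values below are invented): a text field only supports cardinality, while a date field additionally supports min, max and avg.

from eland.field_mappings import Field

carrier = Field(
    index="Carrier", es_field_name="Carrier", is_source=True,
    es_dtype="text", es_date_format=None, pd_dtype="object",
    is_searchable=True, is_aggregatable=False, is_scripted=False,
    aggregatable_es_field_name="Carrier.keyword",
)
timestamp = carrier._replace(
    index="timestamp", es_field_name="timestamp", es_dtype="date",
    es_date_format="strict_date_hour_minute_second", pd_dtype="datetime64[ns]",
    is_aggregatable=True, aggregatable_es_field_name="timestamp",
)

assert carrier.is_es_agg_compatible("cardinality")      # cardinality works for any type
assert not carrier.is_es_agg_compatible("avg")          # strings can't be averaged
assert timestamp.is_es_agg_compatible("max")            # dates support min/max/avg
assert not timestamp.is_es_agg_compatible("sum")        # ...but not sum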
@ -40,6 +82,23 @@ class FieldMappings:
or es_field_name.keyword (if exists) or None
"""
ES_DTYPE_TO_PD_DTYPE = {
"text": "object",
"keyword": "object",
"long": "int64",
"integer": "int64",
"short": "int64",
"byte": "int64",
"binary": "int64",
"double": "float64",
"float": "float64",
"half_float": "float64",
"scaled_float": "float64",
"date": "datetime64[ns]",
"date_nanos": "datetime64[ns]",
"boolean": "bool",
}
# the labels for each column (display_name is index)
column_labels = [
"es_field_name",
@ -316,8 +375,8 @@ class FieldMappings:
# return just source fields (as these are the only ones we display)
return capability_matrix_df[capability_matrix_df.is_source].sort_index()
@staticmethod
def _es_dtype_to_pd_dtype(es_dtype):
@classmethod
def _es_dtype_to_pd_dtype(cls, es_dtype):
"""
Mapping Elasticsearch types to pandas dtypes
--------------------------------------------
@ -332,28 +391,7 @@ class FieldMappings:
boolean | bool
TODO - add additional mapping types
"""
es_dtype_to_pd_dtype = {
"text": "object",
"keyword": "object",
"long": "int64",
"integer": "int64",
"short": "int64",
"byte": "int64",
"binary": "int64",
"double": "float64",
"float": "float64",
"half_float": "float64",
"scaled_float": "float64",
"date": "datetime64[ns]",
"date_nanos": "datetime64[ns]",
"boolean": "bool",
}
if es_dtype in es_dtype_to_pd_dtype:
return es_dtype_to_pd_dtype[es_dtype]
# Return 'object' for all unsupported TODO - investigate how different types could be supported
return "object"
return cls.ES_DTYPE_TO_PD_DTYPE.get(es_dtype, "object")
@staticmethod
def _pd_dtype_to_es_dtype(pd_dtype):
@ -591,6 +629,14 @@ class FieldMappings:
pd_dtypes, es_field_names, es_date_formats = self.metric_source_fields()
return es_field_names
def all_source_fields(self):
source_fields = []
for index, row in self._mappings_capabilities.iterrows():
row = row.to_dict()
row["index"] = index
source_fields.append(Field(**row))
return source_fields
def metric_source_fields(self, include_bool=False, include_timestamp=False):
"""
Returns

View File

@ -409,6 +409,36 @@ class NDFrame(ABC):
"""
return self._query_compiler.nunique()
def mad(self, numeric_only=True):
"""
Return median absolute deviation for each numeric column
Returns
-------
pandas.Series
The value of the median absolute deviation for each numeric column
See Also
--------
:pandas_api_docs:`pandas.DataFrame.mad`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.mad() # doctest: +SKIP
AvgTicketPrice 213.368709
Cancelled 0.000000
DistanceKilometers 2946.168236
DistanceMiles 1830.987236
FlightDelay 0.000000
FlightDelayMin 0.000000
FlightTimeHour 3.819435
FlightTimeMin 229.142297
dayOfWeek 2.000000
dtype: float64
"""
return self._query_compiler.mad(numeric_only=numeric_only)
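For context, `mad` maps to Elasticsearch's median_absolute_deviation metric aggregation (see `_map_pd_aggs_to_es_aggs` below). A hypothetical aggregation body for the flights example above might look like the following (editorial sketch, not part of the commit); eland sends it with size=0 so only aggregation results come back:

aggs = {
    "median_absolute_deviation_AvgTicketPrice": {
        "median_absolute_deviation": {"field": "AvgTicketPrice"}
    },
    "median_absolute_deviation_FlightDelayMin": {
        "median_absolute_deviation": {"field": "FlightDelayMin"}
    },
    # ...one aggregation per compatible numeric column, named "<es_agg>_<field>"
}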
def _hist(self, num_bins):
return self._query_compiler._hist(num_bins)

View File

@ -3,13 +3,12 @@
# See the LICENSE file in the project root for more information
import copy
import typing
import warnings
from typing import Optional
import numpy as np
import pandas as pd
from pandas.core.dtypes.common import is_datetime_or_timedelta_dtype
from elasticsearch.helpers import scan
from eland import Index
@ -18,6 +17,7 @@ from eland.common import (
DEFAULT_CSV_BATCH_OUTPUT_SIZE,
DEFAULT_ES_MAX_RESULT_WINDOW,
elasticsearch_date_to_pandas_date,
build_pd_series,
)
from eland.query import Query
from eland.actions import SortFieldAction
@ -31,15 +31,8 @@ from eland.tasks import (
SizeTask,
)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
EMPTY_SERIES_DTYPE = pd.Series().dtype
def build_series(data, dtype=None, **kwargs):
out_dtype = EMPTY_SERIES_DTYPE if not data else dtype
s = pd.Series(data=data, index=data.keys(), dtype=out_dtype, **kwargs)
return s
if typing.TYPE_CHECKING:
from eland.query_compiler import QueryCompiler
class Operations:
@ -122,45 +115,45 @@ class Operations:
)["count"]
counts[field] = field_exists_count
return pd.Series(data=counts, index=fields)
return build_pd_series(data=counts, index=fields)
def mean(self, query_compiler, numeric_only=True):
return self._metric_aggs(query_compiler, "avg", numeric_only=numeric_only)
results = self._metric_aggs(query_compiler, ["mean"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def var(self, query_compiler, numeric_only=True):
return self._metric_aggs(
query_compiler, ("extended_stats", "variance"), numeric_only=numeric_only
)
results = self._metric_aggs(query_compiler, ["var"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def std(self, query_compiler, numeric_only=True):
return self._metric_aggs(
query_compiler,
("extended_stats", "std_deviation"),
numeric_only=numeric_only,
)
results = self._metric_aggs(query_compiler, ["std"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def median(self, query_compiler, numeric_only=True):
return self._metric_aggs(
query_compiler, ("percentiles", "50.0"), numeric_only=numeric_only
results = self._metric_aggs(
query_compiler, ["median"], numeric_only=numeric_only
)
return build_pd_series(results, index=results.keys())
def sum(self, query_compiler, numeric_only=True):
return self._metric_aggs(query_compiler, "sum", numeric_only=numeric_only)
results = self._metric_aggs(query_compiler, ["sum"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def max(self, query_compiler, numeric_only=True):
return self._metric_aggs(
query_compiler, "max", numeric_only=numeric_only, keep_original_dtype=True
)
results = self._metric_aggs(query_compiler, ["max"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def min(self, query_compiler, numeric_only=True):
return self._metric_aggs(
query_compiler, "min", numeric_only=numeric_only, keep_original_dtype=True
)
results = self._metric_aggs(query_compiler, ["min"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def nunique(self, query_compiler):
return self._metric_aggs(
query_compiler, "cardinality", field_types="aggregatable"
)
results = self._metric_aggs(query_compiler, ["nunique"], numeric_only=False)
return build_pd_series(results, index=results.keys())
def mad(self, query_compiler, numeric_only=True):
results = self._metric_aggs(query_compiler, ["mad"], numeric_only=numeric_only)
return build_pd_series(results, index=results.keys())
def value_counts(self, query_compiler, es_size):
return self._terms_aggs(query_compiler, "terms", es_size)
@ -168,28 +161,7 @@ class Operations:
def hist(self, query_compiler, bins):
return self._hist_aggs(query_compiler, bins)
def _metric_aggs(
self,
query_compiler,
func,
field_types=None,
numeric_only=None,
keep_original_dtype=False,
):
"""
Parameters
----------
field_types: str, default None
if `aggregatable` use only field_names whose fields in elasticseach are aggregatable.
If `None`, use only numeric fields.
keep_original_dtype : bool, default False
if `True` the output values should keep the same domain as the input values, i.e. booleans should be booleans
Returns
-------
pandas.Series
Series containing results of `func` applied to the field_name(s)
"""
def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=True):
query_params, post_processing = self._resolve_tasks(query_compiler)
size = self._size(query_params, post_processing)
@ -198,152 +170,113 @@ class Operations:
f"Can not count field matches if size is set {size}"
)
results = {}
fields = query_compiler._mappings.all_source_fields()
if numeric_only:
fields = [field for field in fields if (field.is_numeric or field.is_bool)]
body = Query(query_params["query"])
results = {}
# Convert pandas aggs to ES equivalent
es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
# some metrics aggs (including cardinality) work on all aggregatable fields
# therefore we include an optional all parameter on operations
# that call _metric_aggs
if field_types == "aggregatable":
aggregatable_field_names = (
query_compiler._mappings.aggregatable_field_names()
)
for field in fields:
for es_agg in es_aggs:
if not field.is_es_agg_compatible(es_agg):
continue
for field in aggregatable_field_names.keys():
body.metric_aggs(field, func, field)
response = query_compiler._client.search(
index=query_compiler._index_pattern, size=0, body=body.to_search_body()
)
# Results are of the form
# "aggregations" : {
# "customer_full_name.keyword" : {
# "value" : 10
# }
# }
# map aggregatable (e.g. x.keyword) to field_name
for key, value in aggregatable_field_names.items():
results[value] = response["aggregations"][key]["value"]
else:
if numeric_only:
(
pd_dtypes,
source_fields,
date_formats,
) = query_compiler._mappings.metric_source_fields(include_bool=True)
else:
# The only non-numerics we support are bool and timestamps currently
# strings are not supported by metric aggs in ES
# TODO - sum isn't supported for Timestamp in pandas - although ES does attempt to do it
(
pd_dtypes,
source_fields,
date_formats,
) = query_compiler._mappings.metric_source_fields(
include_bool=True, include_timestamp=True
)
for field in source_fields:
if isinstance(func, tuple):
body.metric_aggs(func[0] + "_" + field, func[0], field)
else:
body.metric_aggs(field, func, field)
response = query_compiler._client.search(
index=query_compiler._index_pattern, size=0, body=body.to_search_body()
)
# Results are of the form
# "aggregations" : {
# "AvgTicketPrice" : {
# "value" : 628.2536888148849
# },
# "timestamp": {
# "value": 1.5165624455644382E12,
# "value_as_string": "2018-01-21T19:20:45.564Z"
# }
# }
for pd_dtype, field, date_format in zip(
pd_dtypes, source_fields, date_formats
):
if is_datetime_or_timedelta_dtype(pd_dtype):
results[field] = elasticsearch_date_to_pandas_date(
response["aggregations"][field]["value_as_string"], date_format
# If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
if isinstance(es_agg, tuple):
body.metric_aggs(
f"{es_agg[0]}_{field.es_field_name}",
es_agg[0],
field.aggregatable_es_field_name,
)
elif keep_original_dtype:
if isinstance(func, tuple):
results = pd_dtype.type(
response["aggregations"][func[0] + "_" + field][func[1]]
)
else:
results[field] = pd_dtype.type(
response["aggregations"][field]["value"]
)
else:
if isinstance(func, tuple):
if func[0] == "percentiles":
results[field] = response["aggregations"][
"percentiles_" + field
]["values"]["50.0"]
body.metric_aggs(
f"{es_agg}_{field.es_field_name}",
es_agg,
field.aggregatable_es_field_name,
)
# If 0-length dataframe we get None here
if results[field] is None:
results[field] = np.float64(np.NaN)
elif func[1] == "variance":
# pandas computes the sample variance
# Elasticsearch computes the population variance
count = response["aggregations"][func[0] + "_" + field][
"count"
]
response = query_compiler._client.search(
index=query_compiler._index_pattern, size=0, body=body.to_search_body()
)
results[field] = response["aggregations"][
func[0] + "_" + field
][func[1]]
"""
Results are like (for 'sum', 'min')
# transform population variance into sample variance
if count <= 1:
results[field] = np.float64(np.NaN)
else:
results[field] = count / (count - 1.0) * results[field]
elif func[1] == "std_deviation":
# pandas computes the sample std
# Elasticsearch computes the population std
count = response["aggregations"][func[0] + "_" + field][
"count"
]
AvgTicketPrice DistanceKilometers DistanceMiles FlightDelayMin
sum 8.204365e+06 9.261629e+07 5.754909e+07 618150
min 1.000205e+02 0.000000e+00 0.000000e+00 0
"""
for field in fields:
values = []
for es_agg, pd_agg in zip(es_aggs, pd_aggs):
results[field] = response["aggregations"][
func[0] + "_" + field
][func[1]]
# If the field and agg aren't compatible we add a NaN
if not field.is_es_agg_compatible(es_agg):
values.append(np.float64(np.NaN))
continue
# transform population std into sample std
if isinstance(es_agg, tuple):
agg_value = response["aggregations"][
f"{es_agg[0]}_{field.es_field_name}"
]
# Pull multiple values from 'percentiles' result.
if es_agg[0] == "percentiles":
agg_value = agg_value["values"]
agg_value = agg_value[es_agg[1]]
# Need to convert 'Population' stddev and variance
# from Elasticsearch into 'Sample' stddev and variance
# which is what pandas uses.
if es_agg[1] in ("std_deviation", "variance"):
# Neither transformation works with count <=1
count = response["aggregations"][
f"{es_agg[0]}_{field.es_field_name}"
]["count"]
# All of the below calculations result in NaN if count<=1
if count <= 1:
agg_value = np.float64(np.NaN)
elif es_agg[1] == "variance":
# sample_variance = \frac{N}{N-1} * population_variance
agg_value *= count / (count - 1.0)
else:  # es_agg[1] == "std_deviation"
# sample_std=\sqrt{\frac{1}{N-1}\sum_{i=1}^N(x_i-\bar{x})^2}
# population_std=\sqrt{\frac{1}{N}\sum_{i=1}^N(x_i-\bar{x})^2}
# sample_std=\sqrt{\frac{N}{N-1}}*population_std
if count <= 1:
results[field] = np.float64(np.NaN)
else:
results[field] = np.sqrt(
(count / (count - 1.0))
* results[field]
* results[field]
)
else:
results[field] = response["aggregations"][
func[0] + "_" + field
][func[1]]
agg_value = np.sqrt(
(count / (count - 1.0)) * agg_value * agg_value
)
else:
agg_value = response["aggregations"][
f"{es_agg}_{field.es_field_name}"
]
if "value_as_string" in agg_value and field.is_timestamp:
agg_value = elasticsearch_date_to_pandas_date(
agg_value["value_as_string"], field.es_date_format
)
else:
results[field] = response["aggregations"][field]["value"]
agg_value = agg_value["value"]
# Return single value if this is a series
# if len(numeric_source_fields) == 1:
# return np.float64(results[numeric_source_fields[0]])
s = build_series(results)
# These aggregations maintain the column datatype
if pd_agg in ("max", "min"):
agg_value = field.np_dtype.type(agg_value)
return s
# Null usually means there were no results.
if agg_value is None:
agg_value = np.float64(np.NaN)
values.append(agg_value)
results[field.index] = values if len(values) > 1 else values[0]
return results
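A worked example of the population-to-sample correction performed above (editorial addition, not part of the commit): Elasticsearch's extended_stats reports population variance and standard deviation, while pandas reports sample statistics, so the values are rescaled by N/(N-1), and by its square root for the standard deviation.

import numpy as np

x = np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])
n = len(x)

pop_var = x.var()    # what extended_stats reports as "variance"
pop_std = x.std()    # what extended_stats reports as "std_deviation"

sample_var = pop_var * n / (n - 1)                  # pandas' ddof=1 variance
sample_std = np.sqrt((n / (n - 1)) * pop_std ** 2)  # same rescaling as above

assert np.isclose(sample_var, x.var(ddof=1))
assert np.isclose(sample_std, x.std(ddof=1))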
def _terms_aggs(self, query_compiler, func, es_size=None):
"""
@ -391,9 +324,7 @@ class Operations:
except IndexError:
name = None
s = build_series(results, name=name)
return s
return build_pd_series(results, name=name)
def _hist_aggs(self, query_compiler, num_bins):
# Get histogram bins and weights for numeric field_names
@ -409,8 +340,12 @@ class Operations:
body = Query(query_params["query"])
min_aggs = self._metric_aggs(query_compiler, "min", numeric_only=True)
max_aggs = self._metric_aggs(query_compiler, "max", numeric_only=True)
results = self._metric_aggs(query_compiler, ["min", "max"], numeric_only=True)
min_aggs = {}
max_aggs = {}
for field, (min_agg, max_agg) in results.items():
min_aggs[field] = min_agg
max_aggs[field] = max_agg
for field in numeric_source_fields:
body.hist_aggs(field, field, min_aggs, max_aggs, num_bins)
@ -476,7 +411,6 @@ class Operations:
df_bins = pd.DataFrame(data=bins)
df_weights = pd.DataFrame(data=weights)
return df_bins, df_weights
@staticmethod
@ -511,20 +445,42 @@ class Operations:
var
nunique
"""
ed_aggs = []
# pd aggs that will be mapped to es aggs
# that can use 'extended_stats'.
extended_stats_pd_aggs = {"mean", "min", "max", "count", "sum", "var", "std"}
extended_stats_es_aggs = {"avg", "min", "max", "count", "sum"}
extended_stats_calls = 0
es_aggs = []
for pd_agg in pd_aggs:
if pd_agg in extended_stats_pd_aggs:
extended_stats_calls += 1
# Aggs that are 'extended_stats' compatible
if pd_agg == "count":
ed_aggs.append("count")
elif pd_agg == "mad":
ed_aggs.append("median_absolute_deviation")
es_aggs.append("count")
elif pd_agg == "max":
ed_aggs.append("max")
elif pd_agg == "mean":
ed_aggs.append("avg")
elif pd_agg == "median":
ed_aggs.append(("percentiles", "50.0"))
es_aggs.append("max")
elif pd_agg == "min":
ed_aggs.append("min")
es_aggs.append("min")
elif pd_agg == "mean":
es_aggs.append("avg")
elif pd_agg == "sum":
es_aggs.append("sum")
elif pd_agg == "std":
es_aggs.append(("extended_stats", "std_deviation"))
elif pd_agg == "var":
es_aggs.append(("extended_stats", "variance"))
# Aggs that aren't 'extended_stats' compatible
elif pd_agg == "nunique":
es_aggs.append("cardinality")
elif pd_agg == "mad":
es_aggs.append("median_absolute_deviation")
elif pd_agg == "median":
es_aggs.append(("percentiles", "50.0"))
# Not implemented
elif pd_agg == "mode":
# We could do this via top term
raise NotImplementedError(pd_agg, " not currently implemented")
@ -537,77 +493,24 @@ class Operations:
elif pd_agg == "sem":
# TODO
raise NotImplementedError(pd_agg, " not currently implemented")
elif pd_agg == "sum":
ed_aggs.append("sum")
elif pd_agg == "std":
ed_aggs.append(("extended_stats", "std_deviation"))
elif pd_agg == "var":
ed_aggs.append(("extended_stats", "variance"))
else:
raise NotImplementedError(pd_agg, " not currently implemented")
# TODO - we can optimise extended_stats here as if we have 'count' and 'std' extended_stats would
# return both in one call
# If two or more aggs compatible with 'extended_stats' are requested
# we can piggy-back on a single 'extended_stats' aggregation.
if extended_stats_calls >= 2:
es_aggs = [
("extended_stats", es_agg)
if es_agg in extended_stats_es_aggs
else es_agg
for es_agg in es_aggs
]
return ed_aggs
return es_aggs
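The piggy-backing above works because a single extended_stats aggregation already returns count, min, max, avg, sum, variance and std_deviation together, so once std or var is requested the other compatible aggs are read from the same bucket. A hypothetical response fragment (editorial sketch; the numbers are only indicative of the flights data used in the docstrings above):

response_aggregations = {
    "extended_stats_AvgTicketPrice": {
        "count": 13059,
        "min": 100.02,
        "max": 1199.73,
        "avg": 628.25,
        "sum": 8204364.92,
        "variance": 70964.57,       # population variance
        "std_deviation": 266.39,    # population standard deviation
    }
}
# ("extended_stats", "min"), ("extended_stats", "avg"), ... are all read from this one bucket.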
def aggs(self, query_compiler, pd_aggs):
query_params, post_processing = self._resolve_tasks(query_compiler)
size = self._size(query_params, post_processing)
if size is not None:
raise NotImplementedError(
f"Can not count field matches if size is set {size}"
)
field_names = query_compiler.get_field_names(include_scripted_fields=False)
body = Query(query_params["query"])
# convert pandas aggs to ES equivalent
es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
for field in field_names:
for es_agg in es_aggs:
# If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
if isinstance(es_agg, tuple):
body.metric_aggs(es_agg[0] + "_" + field, es_agg[0], field)
else:
body.metric_aggs(es_agg + "_" + field, es_agg, field)
response = query_compiler._client.search(
index=query_compiler._index_pattern, size=0, body=body.to_search_body()
)
"""
Results are like (for 'sum', 'min')
AvgTicketPrice DistanceKilometers DistanceMiles FlightDelayMin
sum 8.204365e+06 9.261629e+07 5.754909e+07 618150
min 1.000205e+02 0.000000e+00 0.000000e+00 0
"""
results = {}
for field in field_names:
values = list()
for es_agg in es_aggs:
if isinstance(es_agg, tuple):
agg_value = response["aggregations"][es_agg[0] + "_" + field]
# Pull multiple values from 'percentiles' result.
if es_agg[0] == "percentiles":
agg_value = agg_value["values"]
values.append(agg_value[es_agg[1]])
else:
values.append(
response["aggregations"][es_agg + "_" + field]["value"]
)
results[field] = values
df = pd.DataFrame(data=results, index=pd_aggs)
return df
results = self._metric_aggs(query_compiler, pd_aggs, numeric_only=False)
return pd.DataFrame(results, index=pd_aggs)
def describe(self, query_compiler):
query_params, post_processing = self._resolve_tasks(query_compiler)

View File

@ -66,13 +66,13 @@ class QueryCompiler:
self._index_pattern = to_copy._index_pattern
self._index = Index(self, to_copy._index.index_field)
self._operations = copy.deepcopy(to_copy._operations)
self._mappings = copy.deepcopy(to_copy._mappings)
self._mappings: FieldMappings = copy.deepcopy(to_copy._mappings)
else:
self._client = ensure_es_client(client)
self._index_pattern = index_pattern
# Get and persist mappings, this allows us to correctly
# map returned types from Elasticsearch to pandas datatypes
self._mappings = FieldMappings(
self._mappings: FieldMappings = FieldMappings(
client=self._client,
index_pattern=self._index_pattern,
display_names=display_names,
@ -464,6 +464,9 @@ class QueryCompiler:
def std(self, numeric_only=None):
return self._operations.std(self, numeric_only=numeric_only)
def mad(self, numeric_only=None):
return self._operations.mad(self, numeric_only=numeric_only)
def median(self, numeric_only=None):
return self._operations.median(self, numeric_only=numeric_only)

View File

@ -1105,7 +1105,7 @@ class Series(NDFrame):
Examples
--------
>>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
>>> int(s.max())
1199
"""
@ -1129,7 +1129,7 @@ class Series(NDFrame):
Examples
--------
>>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
>>> int(s.mean())
628
"""
@ -1153,7 +1153,7 @@ class Series(NDFrame):
Examples
--------
>>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
>>> int(s.min())
100
"""
@ -1177,7 +1177,7 @@ class Series(NDFrame):
Examples
--------
>>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
>>> int(s.sum())
8204364
"""
@ -1186,26 +1186,92 @@ class Series(NDFrame):
def nunique(self):
"""
Return the sum of the Series values
Return the number of unique values in a Series
Returns
-------
float
max value
int
Number of unique values
See Also
--------
:pandas_api_docs:`pandas.Series.sum`
:pandas_api_docs:`pandas.Series.nunique`
Examples
--------
>>> s = ed.Series('localhost', 'flights', name='Carrier')
>>> s = ed.DataFrame('localhost', 'flights')['Carrier']
>>> s.nunique()
4
"""
results = super().nunique()
return results.squeeze()
def var(self, numeric_only=None):
"""
Return variance for a Series
Returns
-------
float
var value
See Also
--------
:pandas_api_docs:`pandas.Series.var`
Examples
--------
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
>>> int(s.var())
70964
"""
results = super().var(numeric_only=numeric_only)
return results.squeeze()
def std(self, numeric_only=None):
"""
Return standard deviation for a Series
Returns
-------
float
std value
See Also
--------
:pandas_api_docs:`pandas.Series.std`
Examples
--------
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
>>> int(s.std())
266
"""
results = super().std(numeric_only=numeric_only)
return results.squeeze()
def mad(self, numeric_only=None):
"""
Return median absolute deviation for a Series
Returns
-------
float
mad value
See Also
--------
:pandas_api_docs:`pandas.Series.mad`
Examples
--------
>>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
>>> int(s.mad())
213
"""
results = super().mad(numeric_only=numeric_only)
return results.squeeze()
# def values TODO - not implemented as causes current implementation of query to fail
def to_numpy(self):

View File

@ -4,11 +4,9 @@
# File called _pytest for PyCharm compatability
import warnings
import numpy as np
from pandas.testing import assert_series_equal
from eland.operations import build_series, EMPTY_SERIES_DTYPE
from eland.tests.common import TestData
from eland.tests.common import assert_pandas_eland_frame_equal
@ -34,9 +32,3 @@ class TestDataFrameDtypes(TestData):
pd_flights.select_dtypes(include=np.number),
ed_flights.select_dtypes(include=np.number),
)
def test_emtpy_series_dtypes(self):
with warnings.catch_warnings(record=True) as w:
s = build_series({})
assert s.dtype == EMPTY_SERIES_DTYPE
assert w == []

View File

@ -11,7 +11,7 @@ from eland.tests.common import TestData
class TestDataFrameMetrics(TestData):
funcs = ["max", "min", "mean", "sum"]
extended_funcs = ["var", "std", "median"]
extended_funcs = ["median", "mad", "var", "std"]
def test_flights_metrics(self):
pd_flights = self.pd_flights()
@ -29,40 +29,48 @@ class TestDataFrameMetrics(TestData):
# Test on reduced set of data for more consistent
# median behaviour + better var, std test for sample vs population
pd_flights = pd_flights[pd_flights.DestAirportID == "AMS"]
ed_flights = ed_flights[ed_flights.DestAirportID == "AMS"]
pd_flights = pd_flights[["AvgTicketPrice"]]
ed_flights = ed_flights[["AvgTicketPrice"]]
import logging
logger = logging.getLogger("elasticsearch")
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)
for func in self.extended_funcs:
pd_metric = getattr(pd_flights, func)(numeric_only=True)
pd_metric = getattr(pd_flights, func)(
**({"numeric_only": True} if func != "mad" else {})
)
ed_metric = getattr(ed_flights, func)(numeric_only=True)
assert_series_equal(
pd_metric, ed_metric, check_exact=False, check_less_precise=True
)
pd_value = pd_metric["AvgTicketPrice"]
ed_value = ed_metric["AvgTicketPrice"]
assert (ed_value * 0.9) <= pd_value <= (ed_value * 1.1) # +/-10%
def test_flights_extended_metrics_nan(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
# Test on single row to test NaN behaviour of sample std/variance
pd_flights_1 = pd_flights[pd_flights.FlightNum == "9HY9SWR"]
ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"]
pd_flights_1 = pd_flights[pd_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]
ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]
for func in self.extended_funcs:
pd_metric = getattr(pd_flights_1, func)(numeric_only=True)
ed_metric = getattr(ed_flights_1, func)(numeric_only=True)
pd_metric = getattr(pd_flights_1, func)()
ed_metric = getattr(ed_flights_1, func)()
assert_series_equal(
pd_metric, ed_metric, check_exact=False, check_less_precise=True
)
# Test on zero rows to test NaN behaviour of sample std/variance
pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"]
ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"]
pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
for func in self.extended_funcs:
pd_metric = getattr(pd_flights_0, func)(numeric_only=True)
ed_metric = getattr(ed_flights_0, func)(numeric_only=True)
pd_metric = getattr(pd_flights_0, func)()
ed_metric = getattr(ed_flights_0, func)()
assert_series_equal(
pd_metric, ed_metric, check_exact=False, check_less_precise=True

View File

@ -0,0 +1,39 @@
# Licensed to Elasticsearch B.V under one or more agreements.
# Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
# See the LICENSE file in the project root for more information
from eland.operations import Operations
def test_all_aggs():
es_aggs = Operations._map_pd_aggs_to_es_aggs(
["min", "max", "mean", "std", "var", "mad", "count", "nunique", "median"]
)
assert es_aggs == [
("extended_stats", "min"),
("extended_stats", "max"),
("extended_stats", "avg"),
("extended_stats", "std_deviation"),
("extended_stats", "variance"),
"median_absolute_deviation",
("extended_stats", "count"),
"cardinality",
("percentiles", "50.0"),
]
def test_extended_stats_optimization():
# Tests that when '<agg>' and an 'extended_stats' agg are used together
# that ('extended_stats', '<agg>') is used instead of '<agg>'.
es_aggs = Operations._map_pd_aggs_to_es_aggs(["count", "nunique"])
assert es_aggs == ["count", "cardinality"]
for pd_agg in ["var", "std"]:
extended_es_agg = Operations._map_pd_aggs_to_es_aggs([pd_agg])[0]
es_aggs = Operations._map_pd_aggs_to_es_aggs([pd_agg, "nunique"])
assert es_aggs == [extended_es_agg, "cardinality"]
es_aggs = Operations._map_pd_aggs_to_es_aggs(["count", pd_agg, "nunique"])
assert es_aggs == [("extended_stats", "count"), extended_es_agg, "cardinality"]

View File

@ -0,0 +1,22 @@
# Licensed to Elasticsearch B.V under one or more agreements.
# Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
# See the LICENSE file in the project root for more information
import numpy as np
import warnings
from eland.common import build_pd_series, EMPTY_SERIES_DTYPE
def test_empty_series_dtypes():
with warnings.catch_warnings(record=True) as w:
s = build_pd_series({})
assert s.dtype == EMPTY_SERIES_DTYPE
assert w == []
# Ensure that a passed-in dtype isn't ignored,
# even if the result is empty.
with warnings.catch_warnings(record=True) as w:
s = build_pd_series({}, dtype=np.int32)
assert np.int32 != EMPTY_SERIES_DTYPE
assert s.dtype == np.int32
assert w == []

View File

@ -10,17 +10,24 @@ from eland.tests.common import TestData
class TestSeriesMetrics(TestData):
funcs = ["max", "min", "mean", "sum"]
timestamp_funcs = ["max", "min", "mean"]
all_funcs = ["max", "min", "mean", "sum", "nunique", "var", "std", "mad"]
timestamp_funcs = ["max", "min", "mean", "nunique"]
def assert_almost_equal_for_agg(self, func, pd_metric, ed_metric):
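# nunique (cardinality) and mad are estimated by Elasticsearch, and var is large
# in magnitude, so use a loose absolute tolerance (decimal=-3 allows ~1.5e3);
# the exact aggs are still checked to two decimal places.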
if func in ("nunique", "var", "mad"):
np.testing.assert_almost_equal(pd_metric, ed_metric, decimal=-3)
else:
np.testing.assert_almost_equal(pd_metric, ed_metric, decimal=2)
def test_flights_metrics(self):
pd_flights = self.pd_flights()["AvgTicketPrice"]
ed_flights = self.ed_flights()["AvgTicketPrice"]
for func in self.funcs:
for func in self.all_funcs:
pd_metric = getattr(pd_flights, func)()
ed_metric = getattr(ed_flights, func)()
np.testing.assert_almost_equal(pd_metric, ed_metric, decimal=2)
self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)
def test_flights_timestamp(self):
pd_flights = self.pd_flights()["timestamp"]
@ -29,18 +36,28 @@ class TestSeriesMetrics(TestData):
for func in self.timestamp_funcs:
pd_metric = getattr(pd_flights, func)()
ed_metric = getattr(ed_flights, func)()
pd_metric = pd_metric.floor("S") # floor or pandas mean with have ns
assert pd_metric == ed_metric
if hasattr(pd_metric, "floor"):
pd_metric = pd_metric.floor("S")  # floor because pandas' mean will have ns precision
if func == "nunique":
self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)
else:
assert pd_metric == ed_metric
def test_ecommerce_selected_non_numeric_source_fields(self):
# None of these are numeric
# None of these are numeric, will result in NaNs
column = "category"
ed_ecommerce = self.ed_ecommerce()[column]
for func in self.funcs:
for func in self.all_funcs:
if func == "nunique": # nunique never returns 'NaN'
continue
ed_metric = getattr(ed_ecommerce, func)()
assert ed_metric.empty
print(func, ed_metric)
assert np.isnan(ed_metric)
def test_ecommerce_selected_all_numeric_source_fields(self):
# All of these are numeric
@ -50,9 +67,7 @@ class TestSeriesMetrics(TestData):
pd_ecommerce = self.pd_ecommerce()[column]
ed_ecommerce = self.ed_ecommerce()[column]
for func in self.funcs:
np.testing.assert_almost_equal(
getattr(pd_ecommerce, func)(),
getattr(ed_ecommerce, func)(),
decimal=2,
)
for func in self.all_funcs:
pd_metric = getattr(pd_ecommerce, func)()
ed_metric = getattr(ed_ecommerce, func)()
self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)

View File

@ -1,4 +1,4 @@
elasticsearch>=7.6.0
elasticsearch==7.7.0a2
pandas>=1
matplotlib
pytest>=5.2.1

View File

@ -1,3 +1,3 @@
elasticsearch>=7.6.0
elasticsearch==7.7.0a2
pandas>=1
matplotlib

View File

@ -175,6 +175,6 @@ setup(
classifiers=CLASSIFIERS,
keywords="elastic eland pandas python",
packages=find_packages(include=["eland", "eland.*"]),
install_requires=["elasticsearch>=7.6, <8", "pandas>=1", "matplotlib"],
install_requires=["elasticsearch==7.7.0a2", "pandas>=1", "matplotlib", "numpy"],
python_requires=">=3.6",
)