Mirror of https://github.com/elastic/eland.git (synced 2025-07-11 00:02:14 +08:00)
Add agg compatibility logic to Field class
This commit is contained in:
parent 7946eb4daa
commit 15a1977dcf
@@ -35,4 +35,4 @@ docker run \
   --name eland-test-runner \
   --rm \
   elastic/eland \
-  nox -s test
+  nox -s test-${PYTHON_VERSION}
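Note: the nox test session is now parameterized by Python version, so the invoking environment must supply PYTHON_VERSION (for example, PYTHON_VERSION=3.8 would select a test-3.8 session; the exact session names are defined in the project's noxfile).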
@@ -1,4 +1,4 @@
-elasticsearch>=7.0.5
+elasticsearch==7.7.0a2
 pandas>=1
 matplotlib
 pytest>=5.2.1
@@ -5,8 +5,9 @@
 import re
 import warnings
 from enum import Enum
-from typing import Union, List, Tuple, cast, Callable, Any
+from typing import Union, List, Tuple, cast, Callable, Any, Optional, Dict

+import numpy as np  # type: ignore
 import pandas as pd  # type: ignore
 from elasticsearch import Elasticsearch  # type: ignore

@@ -19,6 +20,23 @@ DEFAULT_PROGRESS_REPORTING_NUM_ROWS = 10000
 DEFAULT_ES_MAX_RESULT_WINDOW = 10000  # index.max_result_window


+with warnings.catch_warnings():
+    warnings.simplefilter("ignore")
+    EMPTY_SERIES_DTYPE = pd.Series().dtype
+
+
+def build_pd_series(
+    data: Dict[str, Any], dtype: Optional[np.dtype] = None, **kwargs: Any
+) -> pd.Series:
+    """Builds a pd.Series while squelching the warning
+    for unspecified dtype on empty series
+    """
+    dtype = dtype or (EMPTY_SERIES_DTYPE if not data else dtype)
+    if dtype is not None:
+        kwargs["dtype"] = dtype
+    return pd.Series(data, **kwargs)
+
+
 def docstring_parameter(*sub: Any) -> Callable[[Any], Any]:
     def dec(obj: Any) -> Any:
         obj.__doc__ = obj.__doc__.format(*sub)
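A minimal sketch of how the new build_pd_series helper behaves (it mirrors the assertions in the new test_dtype_pytest.py further down; the recorded-warnings check is illustrative):

```python
import warnings

import numpy as np

from eland.common import build_pd_series, EMPTY_SERIES_DTYPE

# pd.Series({}) would warn about the unspecified dtype of an empty Series;
# build_pd_series pins an explicit dtype, so no warning is emitted.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    s = build_pd_series({})
assert s.dtype == EMPTY_SERIES_DTYPE
assert caught == []

# A caller-supplied dtype wins, even for empty data.
assert build_pd_series({}, dtype=np.int32).dtype == np.int32
```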
@@ -1280,11 +1280,11 @@ class DataFrame(NDFrame):
         Examples
         --------
         >>> df = ed.DataFrame('localhost', 'flights')
-        >>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
-             DistanceKilometers  AvgTicketPrice
-        sum        9.261629e+07    8.204365e+06
-        min        0.000000e+00    1.000205e+02
-        std        4.578263e+03    2.663867e+02
+        >>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']).astype(int)
+             DistanceKilometers  AvgTicketPrice
+        sum            92616288         8204364
+        min                   0             100
+        std                4578             266
         """
         axis = pd.DataFrame._get_axis_number(axis)
@@ -14,6 +14,48 @@ from pandas.core.dtypes.common import (
     is_string_dtype,
 )
 from pandas.core.dtypes.inference import is_list_like
+from typing import NamedTuple, Optional
+
+
+class Field(NamedTuple):
+    """Holds all information on a particular field in the mapping"""
+
+    index: str
+    es_field_name: str
+    is_source: bool
+    es_dtype: str
+    es_date_format: Optional[str]
+    pd_dtype: type
+    is_searchable: bool
+    is_aggregatable: bool
+    is_scripted: bool
+    aggregatable_es_field_name: str
+
+    @property
+    def is_numeric(self) -> bool:
+        return is_integer_dtype(self.pd_dtype) or is_float_dtype(self.pd_dtype)
+
+    @property
+    def is_timestamp(self) -> bool:
+        return is_datetime_or_timedelta_dtype(self.pd_dtype)
+
+    @property
+    def is_bool(self) -> bool:
+        return is_bool_dtype(self.pd_dtype)
+
+    @property
+    def np_dtype(self):
+        return np.dtype(self.pd_dtype)
+
+    def is_es_agg_compatible(self, es_agg):
+        # Cardinality works for all types
+        # Numerics and bools work for all aggs
+        if es_agg == "cardinality" or self.is_numeric or self.is_bool:
+            return True
+        # Timestamps also work for 'min', 'max' and 'avg'
+        if es_agg in {"min", "max", "avg"} and self.is_timestamp:
+            return True
+        return False
+
+
 class FieldMappings:
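To make the compatibility rules concrete, here is a small sketch of the new Field API (the constructor values are illustrative, and the import path assumes Field lives next to FieldMappings in eland's field-mappings module):

```python
import numpy as np

from eland.field_mappings import Field  # assumed module path

price = Field(
    index="AvgTicketPrice",
    es_field_name="AvgTicketPrice",
    is_source=True,
    es_dtype="float",
    es_date_format=None,
    pd_dtype=np.dtype("float64"),
    is_searchable=True,
    is_aggregatable=True,
    is_scripted=False,
    aggregatable_es_field_name="AvgTicketPrice",
)
# NamedTuple._replace gives us a timestamp variant of the same field.
stamp = price._replace(
    index="timestamp", es_dtype="date", pd_dtype=np.dtype("datetime64[ns]")
)

assert price.is_es_agg_compatible("sum")          # numerics work for all aggs
assert stamp.is_es_agg_compatible("max")          # timestamps: min/max/avg only
assert not stamp.is_es_agg_compatible("sum")
assert stamp.is_es_agg_compatible("cardinality")  # cardinality works everywhere
```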
@@ -40,6 +82,23 @@ class FieldMappings:
        or es_field_name.keyword (if exists) or None
     """

+    ES_DTYPE_TO_PD_DTYPE = {
+        "text": "object",
+        "keyword": "object",
+        "long": "int64",
+        "integer": "int64",
+        "short": "int64",
+        "byte": "int64",
+        "binary": "int64",
+        "double": "float64",
+        "float": "float64",
+        "half_float": "float64",
+        "scaled_float": "float64",
+        "date": "datetime64[ns]",
+        "date_nanos": "datetime64[ns]",
+        "boolean": "bool",
+    }
+
     # the labels for each column (display_name is index)
     column_labels = [
         "es_field_name",
@@ -316,8 +375,8 @@ class FieldMappings:
         # return just source fields (as these are the only ones we display)
         return capability_matrix_df[capability_matrix_df.is_source].sort_index()

-    @staticmethod
-    def _es_dtype_to_pd_dtype(es_dtype):
+    @classmethod
+    def _es_dtype_to_pd_dtype(cls, es_dtype):
         """
         Mapping Elasticsearch types to pandas dtypes
         --------------------------------------------
@@ -332,28 +391,7 @@ class FieldMappings:
         boolean | bool
         TODO - add additional mapping types
         """
-        es_dtype_to_pd_dtype = {
-            "text": "object",
-            "keyword": "object",
-            "long": "int64",
-            "integer": "int64",
-            "short": "int64",
-            "byte": "int64",
-            "binary": "int64",
-            "double": "float64",
-            "float": "float64",
-            "half_float": "float64",
-            "scaled_float": "float64",
-            "date": "datetime64[ns]",
-            "date_nanos": "datetime64[ns]",
-            "boolean": "bool",
-        }
-
-        if es_dtype in es_dtype_to_pd_dtype:
-            return es_dtype_to_pd_dtype[es_dtype]
-
-        # Return 'object' for all unsupported TODO - investigate how different types could be supported
-        return "object"
+        return cls.ES_DTYPE_TO_PD_DTYPE.get(es_dtype, "object")

     @staticmethod
     def _pd_dtype_to_es_dtype(pd_dtype):
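The refactor makes the lookup a one-liner over the shared class-level table; behaviour is unchanged. A quick sketch (the module path is an assumption):

```python
from eland.field_mappings import FieldMappings  # assumed module path

assert FieldMappings._es_dtype_to_pd_dtype("long") == "int64"
assert FieldMappings._es_dtype_to_pd_dtype("date") == "datetime64[ns]"
# Anything not in ES_DTYPE_TO_PD_DTYPE still falls back to 'object'.
assert FieldMappings._es_dtype_to_pd_dtype("ip") == "object"
```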
@@ -591,6 +629,14 @@ class FieldMappings:
         pd_dtypes, es_field_names, es_date_formats = self.metric_source_fields()
         return es_field_names

+    def all_source_fields(self):
+        source_fields = []
+        for index, row in self._mappings_capabilities.iterrows():
+            row = row.to_dict()
+            row["index"] = index
+            source_fields.append(Field(**row))
+        return source_fields
+
     def metric_source_fields(self, include_bool=False, include_timestamp=False):
         """
         Returns
@@ -409,6 +409,36 @@ class NDFrame(ABC):
         """
         return self._query_compiler.nunique()

+    def mad(self, numeric_only=True):
+        """
+        Return median absolute deviation for each numeric column
+
+        Returns
+        -------
+        pandas.Series
+            The value of the median absolute deviation for each numeric column
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.DataFrame.mad`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df.mad()  # doctest: +SKIP
+        AvgTicketPrice         213.368709
+        Cancelled                0.000000
+        DistanceKilometers    2946.168236
+        DistanceMiles         1830.987236
+        FlightDelay              0.000000
+        FlightDelayMin           0.000000
+        FlightTimeHour           3.819435
+        FlightTimeMin          229.142297
+        dayOfWeek                2.000000
+        dtype: float64
+        """
+        return self._query_compiler.mad(numeric_only=numeric_only)
+
     def _hist(self, num_bins):
         return self._query_compiler._hist(num_bins)
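Under the hood, DataFrame.mad() maps to Elasticsearch's median_absolute_deviation aggregation. A sketch of the kind of search body this produces for one numeric field (the agg naming follows the "{es_agg}_{es_field_name}" convention used in _metric_aggs; the field name is illustrative):

```python
# Aggregation-only search (size=0); no documents are fetched.
body = {
    "size": 0,
    "aggs": {
        "median_absolute_deviation_AvgTicketPrice": {
            "median_absolute_deviation": {"field": "AvgTicketPrice"}
        }
    },
}
```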
@@ -3,13 +3,12 @@
 # See the LICENSE file in the project root for more information

 import copy
+import typing
 import warnings
 from typing import Optional

 import numpy as np

 import pandas as pd
-from pandas.core.dtypes.common import is_datetime_or_timedelta_dtype
 from elasticsearch.helpers import scan

 from eland import Index
@@ -18,6 +17,7 @@ from eland.common import (
     DEFAULT_CSV_BATCH_OUTPUT_SIZE,
     DEFAULT_ES_MAX_RESULT_WINDOW,
     elasticsearch_date_to_pandas_date,
+    build_pd_series,
 )
 from eland.query import Query
 from eland.actions import SortFieldAction
@@ -31,15 +31,8 @@ from eland.tasks import (
     SizeTask,
 )

-with warnings.catch_warnings():
-    warnings.simplefilter("ignore")
-    EMPTY_SERIES_DTYPE = pd.Series().dtype
-
-
-def build_series(data, dtype=None, **kwargs):
-    out_dtype = EMPTY_SERIES_DTYPE if not data else dtype
-    s = pd.Series(data=data, index=data.keys(), dtype=out_dtype, **kwargs)
-    return s
+if typing.TYPE_CHECKING:
+    from eland.query_compiler import QueryCompiler


 class Operations:
@@ -122,45 +115,45 @@ class Operations:
             )["count"]
             counts[field] = field_exists_count

-        return pd.Series(data=counts, index=fields)
+        return build_pd_series(data=counts, index=fields)

     def mean(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(query_compiler, "avg", numeric_only=numeric_only)
+        results = self._metric_aggs(query_compiler, ["mean"], numeric_only=numeric_only)
+        return build_pd_series(results, index=results.keys())

     def var(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(
-            query_compiler, ("extended_stats", "variance"), numeric_only=numeric_only
-        )
+        results = self._metric_aggs(query_compiler, ["var"], numeric_only=numeric_only)
+        return build_pd_series(results, index=results.keys())

     def std(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(
-            query_compiler,
-            ("extended_stats", "std_deviation"),
-            numeric_only=numeric_only,
-        )
+        results = self._metric_aggs(query_compiler, ["std"], numeric_only=numeric_only)
+        return build_pd_series(results, index=results.keys())

     def median(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(
-            query_compiler, ("percentiles", "50.0"), numeric_only=numeric_only
-        )
+        results = self._metric_aggs(
+            query_compiler, ["median"], numeric_only=numeric_only
+        )
+        return build_pd_series(results, index=results.keys())

     def sum(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(query_compiler, "sum", numeric_only=numeric_only)
+        results = self._metric_aggs(query_compiler, ["sum"], numeric_only=numeric_only)
+        return build_pd_series(results, index=results.keys())

     def max(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(
-            query_compiler, "max", numeric_only=numeric_only, keep_original_dtype=True
-        )
+        results = self._metric_aggs(query_compiler, ["max"], numeric_only=numeric_only)
+        return build_pd_series(results, index=results.keys())

     def min(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(
-            query_compiler, "min", numeric_only=numeric_only, keep_original_dtype=True
-        )
+        results = self._metric_aggs(query_compiler, ["min"], numeric_only=numeric_only)
+        return build_pd_series(results, index=results.keys())

     def nunique(self, query_compiler):
-        return self._metric_aggs(
-            query_compiler, "cardinality", field_types="aggregatable"
-        )
+        results = self._metric_aggs(query_compiler, ["nunique"], numeric_only=False)
+        return build_pd_series(results, index=results.keys())
+
+    def mad(self, query_compiler, numeric_only=True):
+        results = self._metric_aggs(query_compiler, ["mad"], numeric_only=numeric_only)
+        return build_pd_series(results, index=results.keys())

     def value_counts(self, query_compiler, es_size):
         return self._terms_aggs(query_compiler, "terms", es_size)
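All of these reductions now share one code path: _metric_aggs returns a plain dict keyed by display name, and build_pd_series wraps it into a Series. A sketch with illustrative values:

```python
from eland.common import build_pd_series

# What _metric_aggs(query_compiler, ["mean"]) might return for two fields:
results = {"AvgTicketPrice": 628.25, "DistanceKilometers": 7092.14}

s = build_pd_series(results, index=results.keys())
print(s["AvgTicketPrice"])  # 628.25
```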
@@ -168,28 +161,7 @@ class Operations:
     def hist(self, query_compiler, bins):
         return self._hist_aggs(query_compiler, bins)

-    def _metric_aggs(
-        self,
-        query_compiler,
-        func,
-        field_types=None,
-        numeric_only=None,
-        keep_original_dtype=False,
-    ):
-        """
-        Parameters
-        ----------
-        field_types: str, default None
-            if `aggregatable` use only field_names whose fields in elasticsearch are aggregatable.
-            If `None`, use only numeric fields.
-        keep_original_dtype : bool, default False
-            if `True` the output values should keep the same domain as the input values, i.e. booleans should be booleans
-
-        Returns
-        -------
-        pandas.Series
-            Series containing results of `func` applied to the field_name(s)
-        """
+    def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=True):
         query_params, post_processing = self._resolve_tasks(query_compiler)

         size = self._size(query_params, post_processing)
@@ -198,152 +170,113 @@ class Operations:
                 f"Can not count field matches if size is set {size}"
             )

+        results = {}
+        fields = query_compiler._mappings.all_source_fields()
+        if numeric_only:
+            fields = [field for field in fields if (field.is_numeric or field.is_bool)]
+
         body = Query(query_params["query"])

-        results = {}
-
-        # some metrics aggs (including cardinality) work on all aggregatable fields
-        # therefore we include an optional all parameter on operations
-        # that call _metric_aggs
-        if field_types == "aggregatable":
-            aggregatable_field_names = (
-                query_compiler._mappings.aggregatable_field_names()
-            )
-
-            for field in aggregatable_field_names.keys():
-                body.metric_aggs(field, func, field)
-
-            response = query_compiler._client.search(
-                index=query_compiler._index_pattern, size=0, body=body.to_search_body()
-            )
-
-            # Results are of the form
-            # "aggregations" : {
-            #   "customer_full_name.keyword" : {
-            #     "value" : 10
-            #   }
-            # }
-
-            # map aggregatable (e.g. x.keyword) to field_name
-            for key, value in aggregatable_field_names.items():
-                results[value] = response["aggregations"][key]["value"]
-        else:
-            if numeric_only:
-                (
-                    pd_dtypes,
-                    source_fields,
-                    date_formats,
-                ) = query_compiler._mappings.metric_source_fields(include_bool=True)
-            else:
-                # The only non-numerics we support are bool and timestamps currently
-                # strings are not supported by metric aggs in ES
-                # TODO - sum isn't supported for Timestamp in pandas - although ES does attempt to do it
-                (
-                    pd_dtypes,
-                    source_fields,
-                    date_formats,
-                ) = query_compiler._mappings.metric_source_fields(
-                    include_bool=True, include_timestamp=True
-                )
-
-            for field in source_fields:
-                if isinstance(func, tuple):
-                    body.metric_aggs(func[0] + "_" + field, func[0], field)
-                else:
-                    body.metric_aggs(field, func, field)
-
-            response = query_compiler._client.search(
-                index=query_compiler._index_pattern, size=0, body=body.to_search_body()
-            )
-
-            # Results are of the form
-            # "aggregations" : {
-            #   "AvgTicketPrice" : {
-            #     "value" : 628.2536888148849
-            #   },
-            #   "timestamp": {
-            #     "value": 1.5165624455644382E12,
-            #     "value_as_string": "2018-01-21T19:20:45.564Z"
-            #   }
-            # }
-            for pd_dtype, field, date_format in zip(
-                pd_dtypes, source_fields, date_formats
-            ):
-                if is_datetime_or_timedelta_dtype(pd_dtype):
-                    results[field] = elasticsearch_date_to_pandas_date(
-                        response["aggregations"][field]["value_as_string"], date_format
-                    )
-                elif keep_original_dtype:
-                    if isinstance(func, tuple):
-                        results = pd_dtype.type(
-                            response["aggregations"][func[0] + "_" + field][func[1]]
-                        )
-                    else:
-                        results[field] = pd_dtype.type(
-                            response["aggregations"][field]["value"]
-                        )
-                else:
-                    if isinstance(func, tuple):
-                        if func[0] == "percentiles":
-                            results[field] = response["aggregations"][
-                                "percentiles_" + field
-                            ]["values"]["50.0"]
-
-                            # If 0-length dataframe we get None here
-                            if results[field] is None:
-                                results[field] = np.float64(np.NaN)
-                        elif func[1] == "variance":
-                            # pandas computes the sample variance
-                            # Elasticsearch computes the population variance
-                            count = response["aggregations"][func[0] + "_" + field][
-                                "count"
-                            ]
-                            results[field] = response["aggregations"][
-                                func[0] + "_" + field
-                            ][func[1]]
-                            # transform population variance into sample variance
-                            if count <= 1:
-                                results[field] = np.float64(np.NaN)
-                            else:
-                                results[field] = count / (count - 1.0) * results[field]
-                        elif func[1] == "std_deviation":
-                            # pandas computes the sample std
-                            # Elasticsearch computes the population std
-                            count = response["aggregations"][func[0] + "_" + field][
-                                "count"
-                            ]
-                            results[field] = response["aggregations"][
-                                func[0] + "_" + field
-                            ][func[1]]
-                            # transform population std into sample std
-                            # sample_std = \sqrt{\frac{1}{N-1}\sum_{i=1}^N(x_i-\bar{x})^2}
-                            # population_std = \sqrt{\frac{1}{N}\sum_{i=1}^N(x_i-\bar{x})^2}
-                            # sample_std = \sqrt{\frac{N}{N-1}} \cdot population_std
-                            if count <= 1:
-                                results[field] = np.float64(np.NaN)
-                            else:
-                                results[field] = np.sqrt(
-                                    (count / (count - 1.0))
-                                    * results[field]
-                                    * results[field]
-                                )
-                        else:
-                            results[field] = response["aggregations"][
-                                func[0] + "_" + field
-                            ][func[1]]
-                    else:
-                        results[field] = response["aggregations"][field]["value"]
-
-        # Return single value if this is a series
-        # if len(numeric_source_fields) == 1:
-        #     return np.float64(results[numeric_source_fields[0]])
-        s = build_series(results)
-
-        return s
+        # Convert pandas aggs to ES equivalent
+        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
+
+        for field in fields:
+            for es_agg in es_aggs:
+                if not field.is_es_agg_compatible(es_agg):
+                    continue
+
+                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
+                if isinstance(es_agg, tuple):
+                    body.metric_aggs(
+                        f"{es_agg[0]}_{field.es_field_name}",
+                        es_agg[0],
+                        field.aggregatable_es_field_name,
+                    )
+                else:
+                    body.metric_aggs(
+                        f"{es_agg}_{field.es_field_name}",
+                        es_agg,
+                        field.aggregatable_es_field_name,
+                    )
+
+        response = query_compiler._client.search(
+            index=query_compiler._index_pattern, size=0, body=body.to_search_body()
+        )
+
+        """
+        Results are like (for 'sum', 'min')
+
+             AvgTicketPrice  DistanceKilometers  DistanceMiles  FlightDelayMin
+        sum    8.204365e+06        9.261629e+07   5.754909e+07          618150
+        min    1.000205e+02        0.000000e+00   0.000000e+00               0
+        """
+        for field in fields:
+            values = []
+            for es_agg, pd_agg in zip(es_aggs, pd_aggs):
+
+                # If the field and agg aren't compatible we add a NaN
+                if not field.is_es_agg_compatible(es_agg):
+                    values.append(np.float64(np.NaN))
+                    continue
+
+                if isinstance(es_agg, tuple):
+                    agg_value = response["aggregations"][
+                        f"{es_agg[0]}_{field.es_field_name}"
+                    ]
+
+                    # Pull multiple values from 'percentiles' result.
+                    if es_agg[0] == "percentiles":
+                        agg_value = agg_value["values"]
+
+                    agg_value = agg_value[es_agg[1]]
+
+                    # Need to convert 'Population' stddev and variance
+                    # from Elasticsearch into 'Sample' stddev and variance
+                    # which is what pandas uses.
+                    if es_agg[1] in ("std_deviation", "variance"):
+                        # Neither transformation works with count <= 1
+                        count = response["aggregations"][
+                            f"{es_agg[0]}_{field.es_field_name}"
+                        ]["count"]
+
+                        # All of the below calculations result in NaN if count <= 1
+                        if count <= 1:
+                            agg_value = np.float64(np.NaN)
+
+                        elif es_agg[1] == "std_deviation":
+                            agg_value *= count / (count - 1.0)
+
+                        else:  # es_agg[1] == "variance"
+                            # sample_std = \sqrt{\frac{1}{N-1}\sum_{i=1}^N(x_i-\bar{x})^2}
+                            # population_std = \sqrt{\frac{1}{N}\sum_{i=1}^N(x_i-\bar{x})^2}
+                            # sample_std = \sqrt{\frac{N}{N-1}} \cdot population_std
+                            agg_value = np.sqrt(
+                                (count / (count - 1.0)) * agg_value * agg_value
+                            )
+                else:
+                    agg_value = response["aggregations"][
+                        f"{es_agg}_{field.es_field_name}"
+                    ]
+                    if "value_as_string" in agg_value and field.is_timestamp:
+                        agg_value = elasticsearch_date_to_pandas_date(
+                            agg_value["value_as_string"], field.es_date_format
+                        )
+                    else:
+                        agg_value = agg_value["value"]
+
+                # These aggregations maintain the column datatype
+                if pd_agg in ("max", "min"):
+                    agg_value = field.np_dtype.type(agg_value)
+
+                # Null usually means there were no results.
+                if agg_value is None:
+                    agg_value = np.float64(np.NaN)
+
+                values.append(agg_value)
+
+            results[field.index] = values if len(values) > 1 else values[0]
+
+        return results

     def _terms_aggs(self, query_compiler, func, es_size=None):
         """
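The population-to-sample corrections above can be sanity-checked against numpy's ddof parameter; a small worked example:

```python
import numpy as np

values = np.array([2.0, 4.0, 6.0])
count = len(values)

pop_var = values.var()                        # ddof=0: what extended_stats reports
sample_var = count / (count - 1.0) * pop_var  # the variance transform applied above
assert np.isclose(sample_var, values.var(ddof=1))

pop_std = values.std()
sample_std = np.sqrt((count / (count - 1.0)) * pop_std * pop_std)
assert np.isclose(sample_std, values.std(ddof=1))
```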
@@ -391,9 +324,7 @@ class Operations:
         except IndexError:
             name = None

-        s = build_series(results, name=name)
-
-        return s
+        return build_pd_series(results, name=name)

     def _hist_aggs(self, query_compiler, num_bins):
         # Get histogram bins and weights for numeric field_names
@@ -409,8 +340,12 @@ class Operations:

         body = Query(query_params["query"])

-        min_aggs = self._metric_aggs(query_compiler, "min", numeric_only=True)
-        max_aggs = self._metric_aggs(query_compiler, "max", numeric_only=True)
+        results = self._metric_aggs(query_compiler, ["min", "max"], numeric_only=True)
+        min_aggs = {}
+        max_aggs = {}
+        for field, (min_agg, max_agg) in results.items():
+            min_aggs[field] = min_agg
+            max_aggs[field] = max_agg

         for field in numeric_source_fields:
             body.hist_aggs(field, field, min_aggs, max_aggs, num_bins)
@@ -476,7 +411,6 @@ class Operations:

         df_bins = pd.DataFrame(data=bins)
         df_weights = pd.DataFrame(data=weights)
-
         return df_bins, df_weights

     @staticmethod
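Requesting ["min", "max"] in one call yields a per-field [min, max] pair, which the loop above splits into the two dicts hist_aggs expects. A sketch with illustrative values:

```python
# Shape returned by _metric_aggs(query_compiler, ["min", "max"]):
results = {"AvgTicketPrice": [100.02, 1199.73], "dayOfWeek": [0, 6]}

min_aggs, max_aggs = {}, {}
for field, (min_agg, max_agg) in results.items():
    min_aggs[field] = min_agg
    max_aggs[field] = max_agg

assert min_aggs == {"AvgTicketPrice": 100.02, "dayOfWeek": 0}
assert max_aggs == {"AvgTicketPrice": 1199.73, "dayOfWeek": 6}
```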
@@ -511,20 +445,42 @@ class Operations:
         var
         nunique
         """
-        ed_aggs = []
+        # pd aggs that will be mapped to es aggs
+        # that can use 'extended_stats'.
+        extended_stats_pd_aggs = {"mean", "min", "max", "count", "sum", "var", "std"}
+        extended_stats_es_aggs = {"avg", "min", "max", "count", "sum"}
+        extended_stats_calls = 0
+
+        es_aggs = []
         for pd_agg in pd_aggs:
+            if pd_agg in extended_stats_pd_aggs:
+                extended_stats_calls += 1
+
+            # Aggs that are 'extended_stats' compatible
             if pd_agg == "count":
-                ed_aggs.append("count")
-            elif pd_agg == "mad":
-                ed_aggs.append("median_absolute_deviation")
+                es_aggs.append("count")
             elif pd_agg == "max":
-                ed_aggs.append("max")
-            elif pd_agg == "mean":
-                ed_aggs.append("avg")
-            elif pd_agg == "median":
-                ed_aggs.append(("percentiles", "50.0"))
+                es_aggs.append("max")
             elif pd_agg == "min":
-                ed_aggs.append("min")
+                es_aggs.append("min")
+            elif pd_agg == "mean":
+                es_aggs.append("avg")
+            elif pd_agg == "sum":
+                es_aggs.append("sum")
+            elif pd_agg == "std":
+                es_aggs.append(("extended_stats", "std_deviation"))
+            elif pd_agg == "var":
+                es_aggs.append(("extended_stats", "variance"))
+
+            # Aggs that aren't 'extended_stats' compatible
+            elif pd_agg == "nunique":
+                es_aggs.append("cardinality")
+            elif pd_agg == "mad":
+                es_aggs.append("median_absolute_deviation")
+            elif pd_agg == "median":
+                es_aggs.append(("percentiles", "50.0"))
+
+            # Not implemented
             elif pd_agg == "mode":
                 # We could do this via top term
                 raise NotImplementedError(pd_agg, " not currently implemented")
@@ -537,77 +493,24 @@ class Operations:
             elif pd_agg == "sem":
                 # TODO
                 raise NotImplementedError(pd_agg, " not currently implemented")
-            elif pd_agg == "sum":
-                ed_aggs.append("sum")
-            elif pd_agg == "std":
-                ed_aggs.append(("extended_stats", "std_deviation"))
-            elif pd_agg == "var":
-                ed_aggs.append(("extended_stats", "variance"))
             else:
                 raise NotImplementedError(pd_agg, " not currently implemented")

-        # TODO - we can optimise extended_stats here as if we have 'count' and 'std' extended_stats would
-        # return both in one call
-        return ed_aggs
+        # If two aggs compatible with 'extended_stats' are called we can
+        # piggy-back on that single aggregation.
+        if extended_stats_calls >= 2:
+            es_aggs = [
+                ("extended_stats", es_agg)
+                if es_agg in extended_stats_es_aggs
+                else es_agg
+                for es_agg in es_aggs
+            ]
+
+        return es_aggs

     def aggs(self, query_compiler, pd_aggs):
-        query_params, post_processing = self._resolve_tasks(query_compiler)
-
-        size = self._size(query_params, post_processing)
-        if size is not None:
-            raise NotImplementedError(
-                f"Can not count field matches if size is set {size}"
-            )
-
-        field_names = query_compiler.get_field_names(include_scripted_fields=False)
-
-        body = Query(query_params["query"])
-        # convert pandas aggs to ES equivalent
-        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
-
-        for field in field_names:
-            for es_agg in es_aggs:
-                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
-                if isinstance(es_agg, tuple):
-                    body.metric_aggs(es_agg[0] + "_" + field, es_agg[0], field)
-                else:
-                    body.metric_aggs(es_agg + "_" + field, es_agg, field)
-
-        response = query_compiler._client.search(
-            index=query_compiler._index_pattern, size=0, body=body.to_search_body()
-        )
-
-        """
-        Results are like (for 'sum', 'min')
-
-             AvgTicketPrice  DistanceKilometers  DistanceMiles  FlightDelayMin
-        sum    8.204365e+06        9.261629e+07   5.754909e+07          618150
-        min    1.000205e+02        0.000000e+00   0.000000e+00               0
-        """
-        results = {}
-
-        for field in field_names:
-            values = list()
-            for es_agg in es_aggs:
-                if isinstance(es_agg, tuple):
-                    agg_value = response["aggregations"][es_agg[0] + "_" + field]
-
-                    # Pull multiple values from 'percentiles' result.
-                    if es_agg[0] == "percentiles":
-                        agg_value = agg_value["values"]
-
-                    values.append(agg_value[es_agg[1]])
-                else:
-                    values.append(
-                        response["aggregations"][es_agg + "_" + field]["value"]
-                    )
-
-            results[field] = values
-
-        df = pd.DataFrame(data=results, index=pd_aggs)
-
-        return df
+        results = self._metric_aggs(query_compiler, pd_aggs, numeric_only=False)
+        return pd.DataFrame(results, index=pd_aggs)

     def describe(self, query_compiler):
         query_params, post_processing = self._resolve_tasks(query_compiler)
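The effect of the piggy-backing: a single compatible agg keeps its cheap plain form, but two or more are folded into one extended_stats call (consistent with the new test file further down):

```python
from eland.operations import Operations

# One extended_stats-compatible agg keeps its plain form...
assert Operations._map_pd_aggs_to_es_aggs(["sum"]) == ["sum"]

# ...but asking for two lets both ride on a single 'extended_stats' agg.
assert Operations._map_pd_aggs_to_es_aggs(["sum", "std"]) == [
    ("extended_stats", "sum"),
    ("extended_stats", "std_deviation"),
]
```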
@@ -66,13 +66,13 @@ class QueryCompiler:
             self._index_pattern = to_copy._index_pattern
             self._index = Index(self, to_copy._index.index_field)
             self._operations = copy.deepcopy(to_copy._operations)
-            self._mappings = copy.deepcopy(to_copy._mappings)
+            self._mappings: FieldMappings = copy.deepcopy(to_copy._mappings)
         else:
             self._client = ensure_es_client(client)
             self._index_pattern = index_pattern
             # Get and persist mappings, this allows us to correctly
             # map returned types from Elasticsearch to pandas datatypes
-            self._mappings = FieldMappings(
+            self._mappings: FieldMappings = FieldMappings(
                 client=self._client,
                 index_pattern=self._index_pattern,
                 display_names=display_names,
|
|||||||
def std(self, numeric_only=None):
|
def std(self, numeric_only=None):
|
||||||
return self._operations.std(self, numeric_only=numeric_only)
|
return self._operations.std(self, numeric_only=numeric_only)
|
||||||
|
|
||||||
|
def mad(self, numeric_only=None):
|
||||||
|
return self._operations.mad(self, numeric_only=numeric_only)
|
||||||
|
|
||||||
def median(self, numeric_only=None):
|
def median(self, numeric_only=None):
|
||||||
return self._operations.median(self, numeric_only=numeric_only)
|
return self._operations.median(self, numeric_only=numeric_only)
|
||||||
|
|
||||||
|
@@ -1105,7 +1105,7 @@ class Series(NDFrame):

         Examples
         --------
-        >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
+        >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
         >>> int(s.max())
         1199
         """
@@ -1129,7 +1129,7 @@ class Series(NDFrame):

         Examples
         --------
-        >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
+        >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
         >>> int(s.mean())
         628
         """
@@ -1153,7 +1153,7 @@ class Series(NDFrame):

         Examples
         --------
-        >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
+        >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
         >>> int(s.min())
         100
         """
@@ -1177,7 +1177,7 @@ class Series(NDFrame):

         Examples
         --------
-        >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
+        >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
         >>> int(s.sum())
         8204364
         """
@@ -1186,26 +1186,92 @@ class Series(NDFrame):

     def nunique(self):
         """
-        Return the sum of the Series values
+        Return the number of unique values in a Series

         Returns
         -------
-        float
-            max value
+        int
+            Number of unique values

         See Also
         --------
-        :pandas_api_docs:`pandas.Series.sum`
+        :pandas_api_docs:`pandas.Series.nunique`

         Examples
         --------
-        >>> s = ed.Series('localhost', 'flights', name='Carrier')
+        >>> s = ed.DataFrame('localhost', 'flights')['Carrier']
         >>> s.nunique()
         4
         """
         results = super().nunique()
         return results.squeeze()

+    def var(self, numeric_only=None):
+        """
+        Return variance for a Series
+
+        Returns
+        -------
+        float
+            var value
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.Series.var`
+
+        Examples
+        --------
+        >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
+        >>> int(s.var())
+        70964
+        """
+        results = super().var(numeric_only=numeric_only)
+        return results.squeeze()
+
+    def std(self, numeric_only=None):
+        """
+        Return standard deviation for a Series
+
+        Returns
+        -------
+        float
+            std value
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.Series.std`
+
+        Examples
+        --------
+        >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
+        >>> int(s.std())
+        266
+        """
+        results = super().std(numeric_only=numeric_only)
+        return results.squeeze()
+
+    def mad(self, numeric_only=None):
+        """
+        Return median absolute deviation for a Series
+
+        Returns
+        -------
+        float
+            mad value
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.Series.mad`
+
+        Examples
+        --------
+        >>> s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']
+        >>> int(s.mad())
+        213
+        """
+        results = super().mad(numeric_only=numeric_only)
+        return results.squeeze()
+
     # def values TODO - not implemented as causes current implementation of query to fail

     def to_numpy(self):
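Taken together, the new Series reductions line up with their pandas counterparts; a short usage sketch (values are the ones quoted in the docstrings for the demo flights index):

```python
import eland as ed

s = ed.DataFrame('localhost', 'flights')['AvgTicketPrice']

print(int(s.var()))  # 70964
print(int(s.std()))  # 266
print(int(s.mad()))  # 213
```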
@@ -4,11 +4,9 @@

 # File called _pytest for PyCharm compatability

-import warnings
 import numpy as np
 from pandas.testing import assert_series_equal

-from eland.operations import build_series, EMPTY_SERIES_DTYPE
 from eland.tests.common import TestData
 from eland.tests.common import assert_pandas_eland_frame_equal

@@ -34,9 +32,3 @@ class TestDataFrameDtypes(TestData):
             pd_flights.select_dtypes(include=np.number),
             ed_flights.select_dtypes(include=np.number),
         )
-
-    def test_emtpy_series_dtypes(self):
-        with warnings.catch_warnings(record=True) as w:
-            s = build_series({})
-        assert s.dtype == EMPTY_SERIES_DTYPE
-        assert w == []
@@ -11,7 +11,7 @@ from eland.tests.common import TestData

 class TestDataFrameMetrics(TestData):
     funcs = ["max", "min", "mean", "sum"]
-    extended_funcs = ["var", "std", "median"]
+    extended_funcs = ["median", "mad", "var", "std"]

     def test_flights_metrics(self):
         pd_flights = self.pd_flights()
@@ -29,40 +29,48 @@ class TestDataFrameMetrics(TestData):

         # Test on reduced set of data for more consistent
         # median behaviour + better var, std test for sample vs population
-        pd_flights = pd_flights[pd_flights.DestAirportID == "AMS"]
-        ed_flights = ed_flights[ed_flights.DestAirportID == "AMS"]
+        pd_flights = pd_flights[["AvgTicketPrice"]]
+        ed_flights = ed_flights[["AvgTicketPrice"]]
+
+        import logging
+
+        logger = logging.getLogger("elasticsearch")
+        logger.addHandler(logging.StreamHandler())
+        logger.setLevel(logging.DEBUG)

         for func in self.extended_funcs:
-            pd_metric = getattr(pd_flights, func)(numeric_only=True)
+            pd_metric = getattr(pd_flights, func)(
+                **({"numeric_only": True} if func != "mad" else {})
+            )
             ed_metric = getattr(ed_flights, func)(numeric_only=True)

-            assert_series_equal(
-                pd_metric, ed_metric, check_exact=False, check_less_precise=True
-            )
+            pd_value = pd_metric["AvgTicketPrice"]
+            ed_value = ed_metric["AvgTicketPrice"]
+            assert (ed_value * 0.9) <= pd_value <= (ed_value * 1.1)  # +/- 10%

     def test_flights_extended_metrics_nan(self):
         pd_flights = self.pd_flights()
         ed_flights = self.ed_flights()

         # Test on single row to test NaN behaviour of sample std/variance
-        pd_flights_1 = pd_flights[pd_flights.FlightNum == "9HY9SWR"]
-        ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"]
+        pd_flights_1 = pd_flights[pd_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]
+        ed_flights_1 = ed_flights[ed_flights.FlightNum == "9HY9SWR"][["AvgTicketPrice"]]

         for func in self.extended_funcs:
-            pd_metric = getattr(pd_flights_1, func)(numeric_only=True)
-            ed_metric = getattr(ed_flights_1, func)(numeric_only=True)
+            pd_metric = getattr(pd_flights_1, func)()
+            ed_metric = getattr(ed_flights_1, func)()

             assert_series_equal(
                 pd_metric, ed_metric, check_exact=False, check_less_precise=True
             )

         # Test on zero rows to test NaN behaviour of sample std/variance
-        pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"]
-        ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"]
+        pd_flights_0 = pd_flights[pd_flights.FlightNum == "XXX"][["AvgTicketPrice"]]
+        ed_flights_0 = ed_flights[ed_flights.FlightNum == "XXX"][["AvgTicketPrice"]]

         for func in self.extended_funcs:
-            pd_metric = getattr(pd_flights_0, func)(numeric_only=True)
-            ed_metric = getattr(ed_flights_0, func)(numeric_only=True)
+            pd_metric = getattr(pd_flights_0, func)()
+            ed_metric = getattr(ed_flights_0, func)()

             assert_series_equal(
                 pd_metric, ed_metric, check_exact=False, check_less_precise=True
             )
 eland/tests/operations/test_map_pd_aggs_to_es_aggs_pytest.py  (new file, 39 lines)
@@ -0,0 +1,39 @@
+# Licensed to Elasticsearch B.V under one or more agreements.
+# Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+# See the LICENSE file in the project root for more information
+
+from eland.operations import Operations
+
+
+def test_all_aggs():
+    es_aggs = Operations._map_pd_aggs_to_es_aggs(
+        ["min", "max", "mean", "std", "var", "mad", "count", "nunique", "median"]
+    )
+
+    assert es_aggs == [
+        ("extended_stats", "min"),
+        ("extended_stats", "max"),
+        ("extended_stats", "avg"),
+        ("extended_stats", "std_deviation"),
+        ("extended_stats", "variance"),
+        "median_absolute_deviation",
+        ("extended_stats", "count"),
+        "cardinality",
+        ("percentiles", "50.0"),
+    ]
+
+
+def test_extended_stats_optimization():
+    # Tests that when '<agg>' and an 'extended_stats' agg are used together,
+    # ('extended_stats', '<agg>') is used instead of '<agg>'.
+    es_aggs = Operations._map_pd_aggs_to_es_aggs(["count", "nunique"])
+    assert es_aggs == ["count", "cardinality"]
+
+    for pd_agg in ["var", "std"]:
+        extended_es_agg = Operations._map_pd_aggs_to_es_aggs([pd_agg])[0]
+
+        es_aggs = Operations._map_pd_aggs_to_es_aggs([pd_agg, "nunique"])
+        assert es_aggs == [extended_es_agg, "cardinality"]
+
+        es_aggs = Operations._map_pd_aggs_to_es_aggs(["count", pd_agg, "nunique"])
+        assert es_aggs == [("extended_stats", "count"), extended_es_agg, "cardinality"]
 eland/tests/series/test_dtype_pytest.py  (new file, 22 lines)
@@ -0,0 +1,22 @@
+# Licensed to Elasticsearch B.V under one or more agreements.
+# Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+# See the LICENSE file in the project root for more information
+
+import numpy as np
+import warnings
+from eland.common import build_pd_series, EMPTY_SERIES_DTYPE
+
+
+def test_empty_series_dtypes():
+    with warnings.catch_warnings(record=True) as w:
+        s = build_pd_series({})
+    assert s.dtype == EMPTY_SERIES_DTYPE
+    assert w == []
+
+    # Ensure that a passed-in dtype isn't ignored
+    # even if the result is empty.
+    with warnings.catch_warnings(record=True) as w:
+        s = build_pd_series({}, dtype=np.int32)
+    assert np.int32 != EMPTY_SERIES_DTYPE
+    assert s.dtype == np.int32
+    assert w == []
@@ -10,17 +10,24 @@ from eland.tests.common import TestData


 class TestSeriesMetrics(TestData):
-    funcs = ["max", "min", "mean", "sum"]
-    timestamp_funcs = ["max", "min", "mean"]
+    all_funcs = ["max", "min", "mean", "sum", "nunique", "var", "std", "mad"]
+    timestamp_funcs = ["max", "min", "mean", "nunique"]
+
+    def assert_almost_equal_for_agg(self, func, pd_metric, ed_metric):
+        if func in ("nunique", "var", "mad"):
+            np.testing.assert_almost_equal(pd_metric, ed_metric, decimal=-3)
+        else:
+            np.testing.assert_almost_equal(pd_metric, ed_metric, decimal=2)

     def test_flights_metrics(self):
         pd_flights = self.pd_flights()["AvgTicketPrice"]
         ed_flights = self.ed_flights()["AvgTicketPrice"]

-        for func in self.funcs:
+        for func in self.all_funcs:
             pd_metric = getattr(pd_flights, func)()
             ed_metric = getattr(ed_flights, func)()
-            np.testing.assert_almost_equal(pd_metric, ed_metric, decimal=2)
+
+            self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)

     def test_flights_timestamp(self):
         pd_flights = self.pd_flights()["timestamp"]
@@ -29,18 +36,28 @@ class TestSeriesMetrics(TestData):
         for func in self.timestamp_funcs:
             pd_metric = getattr(pd_flights, func)()
             ed_metric = getattr(ed_flights, func)()

-            pd_metric = pd_metric.floor("S")  # floor or pandas mean with have ns
-            assert pd_metric == ed_metric
+            if hasattr(pd_metric, "floor"):
+                pd_metric = pd_metric.floor("S")  # floor because pandas mean will have ns
+
+            if func == "nunique":
+                self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)
+            else:
+                assert pd_metric == ed_metric

     def test_ecommerce_selected_non_numeric_source_fields(self):
-        # None of these are numeric
+        # None of these are numeric, will result in NaNs
         column = "category"

         ed_ecommerce = self.ed_ecommerce()[column]

-        for func in self.funcs:
+        for func in self.all_funcs:
+            if func == "nunique":  # nunique never returns 'NaN'
+                continue
+
             ed_metric = getattr(ed_ecommerce, func)()
-            assert ed_metric.empty
+            print(func, ed_metric)
+            assert np.isnan(ed_metric)

     def test_ecommerce_selected_all_numeric_source_fields(self):
         # All of these are numeric
@@ -50,9 +67,7 @@ class TestSeriesMetrics(TestData):
         pd_ecommerce = self.pd_ecommerce()[column]
         ed_ecommerce = self.ed_ecommerce()[column]

-        for func in self.funcs:
-            np.testing.assert_almost_equal(
-                getattr(pd_ecommerce, func)(),
-                getattr(ed_ecommerce, func)(),
-                decimal=2,
-            )
+        for func in self.all_funcs:
+            pd_metric = getattr(pd_ecommerce, func)()
+            ed_metric = getattr(ed_ecommerce, func)()
+            self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)
@@ -1,4 +1,4 @@
-elasticsearch>=7.6.0
+elasticsearch==7.7.0a2
 pandas>=1
 matplotlib
 pytest>=5.2.1
@@ -1,3 +1,3 @@
-elasticsearch>=7.6.0
+elasticsearch==7.7.0a2
 pandas>=1
 matplotlib
 setup.py  (2 changed lines)
@@ -175,6 +175,6 @@ setup(
     classifiers=CLASSIFIERS,
     keywords="elastic eland pandas python",
     packages=find_packages(include=["eland", "eland.*"]),
-    install_requires=["elasticsearch>=7.6, <8", "pandas>=1", "matplotlib"],
+    install_requires=["elasticsearch==7.7.0a2", "pandas>=1", "matplotlib", "numpy"],
     python_requires=">=3.6",
 )