Mirror of https://github.com/elastic/eland.git
Add 'nunique' and 'mean' aggs for datetime, improve precision of datetime aggs
parent d238bc5d42
commit a709ed589d
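As a rough, illustrative sketch of the behaviour this commit enables (not part of the diff): assuming a local Elasticsearch cluster with the flights sample data loaded into an index named "flights", datetime columns gain working 'min'/'max'/'mean' and 'nunique' aggregations. The host, index name, and printed values below are assumptions.

import eland as ed

# Hypothetical setup -- adjust the host and index name to your environment.
df = ed.DataFrame("localhost", "flights")

# min/max/mean on a datetime column now come back as pd.Timestamp values,
# and nunique (backed by an Elasticsearch cardinality agg) as an int.
print(df["timestamp"].min())      # e.g. Timestamp('2018-01-01 00:00:00')
print(df["timestamp"].mean())     # e.g. Timestamp('2018-01-21 19:20:45.564438232')
print(df["timestamp"].nunique())  # e.g. 12236

# Aggregations that don't apply to datetimes (e.g. 'sum', 'std') return NaT.
print(df[["timestamp"]].agg(["min", "mean", "sum"]))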
@@ -110,10 +110,11 @@ def elasticsearch_date_to_pandas_date(
         or epoch_millis.
     """

-    if date_format is None:
+    if date_format is None or isinstance(value, (int, float)):
         try:
-            value = int(value)
-            return pd.to_datetime(value, unit="ms")
+            return pd.to_datetime(
+                value, unit="s" if date_format == "epoch_second" else "ms"
+            )
         except ValueError:
             return pd.to_datetime(value)
     elif date_format == "epoch_millis":

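For context, a minimal standalone sketch (not from the repository) of the conversion the updated helper performs; the epoch values are made up and chosen to land on a round timestamp.

import pandas as pd

epoch_second_value = 1_518_393_012       # hypothetical "epoch_second" value
epoch_millis_value = 1_518_393_012_000   # hypothetical "epoch_millis" value

# The unit is now chosen from the ES date format instead of always being "ms",
# and numeric values are no longer truncated through int(), so fractional
# milliseconds survive the conversion.
assert pd.to_datetime(epoch_second_value, unit="s") == pd.Timestamp("2018-02-11 23:50:12")
assert pd.to_datetime(epoch_millis_value, unit="ms") == pd.Timestamp("2018-02-11 23:50:12")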
@@ -27,7 +27,16 @@ from pandas.core.dtypes.common import (
     is_string_dtype,
 )
 from pandas.core.dtypes.inference import is_list_like
-from typing import NamedTuple, Optional, Mapping, Dict, Any, TYPE_CHECKING, List, Set
+from typing import (
+    NamedTuple,
+    Optional,
+    Mapping,
+    Dict,
+    Any,
+    TYPE_CHECKING,
+    List,
+    Set,
+)

 if TYPE_CHECKING:
     from elasticsearch import Elasticsearch

@@ -82,6 +91,9 @@ class Field(NamedTuple):
         return np.dtype(self.pd_dtype)

     def is_es_agg_compatible(self, es_agg) -> bool:
+        # Unpack the actual aggregation if this is 'extended_stats'
+        if isinstance(es_agg, tuple):
+            es_agg = es_agg[1]
         # Cardinality works for all types
         # Numerics and bools work for all aggs
         if es_agg == "cardinality" or self.is_numeric or self.is_bool:

@@ -91,6 +103,13 @@ class Field(NamedTuple):
             return True
         return False

+    @property
+    def nan_value(self) -> Any:
+        """Returns NaN for any field except datetimes which use NaT"""
+        if self.is_timestamp:
+            return pd.NaT
+        return np.float64(np.NaN)
+

 class FieldMappings:
     """

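A standalone sketch of the idea behind the new nan_value property (the helper below is illustrative, not the library's API): missing or incompatible aggregation results become NaT for datetime fields and NaN for everything else.

import numpy as np
import pandas as pd

def missing_agg_value(is_timestamp: bool):
    # Hypothetical mirror of Field.nan_value: NaT for datetimes, NaN otherwise.
    return pd.NaT if is_timestamp else np.float64(np.nan)

assert missing_agg_value(True) is pd.NaT
assert np.isnan(missing_agg_value(False))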
@@ -242,9 +242,9 @@ class Operations:
             values = []
             for es_agg, pd_agg in zip(es_aggs, pd_aggs):

-                # If the field and agg aren't compatible we add a NaN
+                # If the field and agg aren't compatible we add a NaN/NaT
                 if not field.is_es_agg_compatible(es_agg):
-                    values.append(np.float64(np.NaN))
+                    values.append(field.nan_value)
                     continue

                 if isinstance(es_agg, tuple):

@@ -284,21 +284,25 @@ class Operations:
                 else:
                     agg_value = response["aggregations"][
                         f"{es_agg}_{field.es_field_name}"
-                    ]
-                    if "value_as_string" in agg_value and field.is_timestamp:
-                        agg_value = elasticsearch_date_to_pandas_date(
-                            agg_value["value_as_string"], field.es_date_format
-                        )
-                    else:
-                        agg_value = agg_value["value"]
-
-                # These aggregations maintain the column datatype
-                if pd_agg in ("max", "min"):
-                    agg_value = field.np_dtype.type(agg_value)
+                    ]["value"]

                 # Null usually means there were no results.
                 if agg_value is None:
-                    agg_value = np.float64(np.NaN)
+                    agg_value = field.nan_value
+
+                # Cardinality is always either NaN or integer.
+                elif pd_agg == "nunique":
+                    agg_value = int(agg_value)
+
+                # If this is a non-null timestamp field convert to a pd.Timestamp()
+                elif field.is_timestamp:
+                    agg_value = elasticsearch_date_to_pandas_date(
+                        agg_value, field.es_date_format
+                    )
+
+                # These aggregations maintain the column datatype
+                elif pd_agg in ("max", "min"):
+                    agg_value = field.np_dtype.type(agg_value)

                 values.append(agg_value)

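A rough sketch of the post-processing order introduced above, with made-up inputs. postprocess and its arguments are illustrative; the real code routes timestamps through elasticsearch_date_to_pandas_date with the field's date format rather than assuming epoch milliseconds.

import numpy as np
import pandas as pd

def postprocess(agg_value, pd_agg, is_timestamp, np_dtype=np.dtype("float64")):
    if agg_value is None:
        # Null usually means there were no results.
        return pd.NaT if is_timestamp else np.float64(np.nan)
    elif pd_agg == "nunique":
        # Cardinality is always either NaN or an integer.
        return int(agg_value)
    elif is_timestamp:
        # Non-null timestamps become pd.Timestamp (epoch millis assumed here).
        return pd.to_datetime(agg_value, unit="ms")
    elif pd_agg in ("max", "min"):
        # These aggregations keep the column's dtype.
        return np_dtype.type(agg_value)
    return agg_value

assert postprocess(None, "max", is_timestamp=True) is pd.NaT
assert postprocess(12236.0, "nunique", is_timestamp=False) == 12236
assert postprocess(1_518_393_012_000, "max", is_timestamp=True) == pd.Timestamp("2018-02-11 23:50:12")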
@@ -17,6 +17,9 @@

 # File called _pytest for PyCharm compatibility

+import pytest
+import numpy as np
+import pandas as pd
 from pandas.testing import assert_series_equal

 from eland.tests.common import TestData

@@ -26,13 +29,25 @@ class TestDataFrameMetrics(TestData):
     funcs = ["max", "min", "mean", "sum"]
     extended_funcs = ["median", "mad", "var", "std"]

-    def test_flights_metrics(self):
+    @pytest.mark.parametrize("numeric_only", [False, None])
+    def test_flights_metrics(self, numeric_only):
         pd_flights = self.pd_flights()
         ed_flights = self.ed_flights()

         for func in self.funcs:
-            pd_metric = getattr(pd_flights, func)(numeric_only=True)
-            ed_metric = getattr(ed_flights, func)(numeric_only=True)
+            # Pandas v1.0 doesn't support mean() on datetime
+            # Pandas and Eland don't support sum() on datetime
+            if not numeric_only:
+                dtype_include = (
+                    [np.number, np.datetime64]
+                    if func not in ("mean", "sum")
+                    else [np.number]
+                )
+                pd_flights = pd_flights.select_dtypes(include=dtype_include)
+                ed_flights = ed_flights.select_dtypes(include=dtype_include)
+
+            pd_metric = getattr(pd_flights, func)(numeric_only=numeric_only)
+            ed_metric = getattr(ed_flights, func)(numeric_only=numeric_only)

             assert_series_equal(pd_metric, ed_metric)

@@ -144,3 +159,49 @@ class TestDataFrameMetrics(TestData):
                 getattr(ed_ecommerce, func)(numeric_only=True),
                 check_less_precise=True,
             )
+
+    def test_flights_datetime_metrics_agg(self):
+        ed_timestamps = self.ed_flights()[["timestamp"]]
+        expected_values = {
+            "timestamp": {
+                "min": pd.Timestamp("2018-01-01 00:00:00"),
+                "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
+                "max": pd.Timestamp("2018-02-11 23:50:12"),
+                "mad": pd.NaT,
+                "median": pd.NaT,
+                "std": pd.NaT,
+                "sum": pd.NaT,
+                "var": pd.NaT,
+                "nunique": 12236,
+            }
+        }
+
+        ed_metrics = ed_timestamps.agg(self.funcs + self.extended_funcs + ["nunique"])
+        assert ed_metrics.to_dict() == expected_values
+
+    @pytest.mark.parametrize("agg", ["mean", "min", "max"])
+    def test_flights_datetime_metrics_single_agg(self, agg):
+        ed_timestamps = self.ed_flights()[["timestamp"]]
+        expected_values = {
+            "min": pd.Timestamp("2018-01-01 00:00:00"),
+            "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
+            "max": pd.Timestamp("2018-02-11 23:50:12"),
+            "nunique": 12236,
+        }
+        ed_metric = ed_timestamps.agg([agg])
+
+        assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]")
+        assert ed_metric["timestamp"][0] == expected_values[agg]
+
+    @pytest.mark.parametrize("agg", ["mean", "min", "max"])
+    def test_flights_datetime_metrics_agg_func(self, agg):
+        ed_timestamps = self.ed_flights()[["timestamp"]]
+        expected_values = {
+            "min": pd.Timestamp("2018-01-01 00:00:00"),
+            "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
+            "max": pd.Timestamp("2018-02-11 23:50:12"),
+        }
+        ed_metric = getattr(ed_timestamps, agg)(numeric_only=False)
+
+        assert ed_metric.dtype == np.dtype("datetime64[ns]")
+        assert ed_metric[0] == expected_values[agg]

@@ -17,7 +17,10 @@

 # File called _pytest for PyCharm compatability

+import pytest
+import pandas as pd
 import numpy as np
+from datetime import timedelta

 from eland.tests.common import TestData

@@ -50,11 +53,12 @@ class TestSeriesMetrics(TestData):
             pd_metric = getattr(pd_flights, func)()
             ed_metric = getattr(ed_flights, func)()

-            if hasattr(pd_metric, "floor"):
-                pd_metric = pd_metric.floor("S")  # floor or pandas mean with have ns
-
             if func == "nunique":
+                print(pd_metric, ed_metric)
                 self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)
+            elif func == "mean":
+                offset = timedelta(seconds=0.001)
+                assert (ed_metric - offset) < pd_metric < (ed_metric + offset)
             else:
                 assert pd_metric == ed_metric

@@ -84,3 +88,15 @@ class TestSeriesMetrics(TestData):
             pd_metric = getattr(pd_ecommerce, func)()
             ed_metric = getattr(ed_ecommerce, func)()
             self.assert_almost_equal_for_agg(func, pd_metric, ed_metric)
+
+    @pytest.mark.parametrize("agg", ["mean", "min", "max"])
+    def test_flights_datetime_metrics_agg(self, agg):
+        ed_timestamps = self.ed_flights()["timestamp"]
+        expected_values = {
+            "min": pd.Timestamp("2018-01-01 00:00:00"),
+            "mean": pd.Timestamp("2018-01-21 19:20:45.564438232"),
+            "max": pd.Timestamp("2018-02-11 23:50:12"),
+        }
+        ed_metric = getattr(ed_timestamps, agg)()
+
+        assert ed_metric == expected_values[agg]