mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
297 lines
13 KiB
Python
297 lines
13 KiB
Python
# Licensed to Elasticsearch B.V under one or more agreements.
|
|
# Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
|
|
# See the LICENSE file in the project root for more information
|
|
|
|
import re
|
|
import warnings
|
|
from enum import Enum
|
|
from typing import Union, List, Tuple, cast, Callable, Any, Optional, Dict
|
|
|
|
import numpy as np # type: ignore
|
|
import pandas as pd # type: ignore
|
|
from elasticsearch import Elasticsearch # type: ignore
|
|
|
|
# Default number of rows displayed (different to pandas where ALL could be displayed)
|
|
DEFAULT_NUM_ROWS_DISPLAYED = 60
|
|
|
|
DEFAULT_CHUNK_SIZE = 10000
|
|
DEFAULT_CSV_BATCH_OUTPUT_SIZE = 10000
|
|
DEFAULT_PROGRESS_REPORTING_NUM_ROWS = 10000
|
|
DEFAULT_ES_MAX_RESULT_WINDOW = 10000 # index.max_result_window
|
|
|
|
|
|
with warnings.catch_warnings():
|
|
warnings.simplefilter("ignore")
|
|
EMPTY_SERIES_DTYPE = pd.Series().dtype
|
|
|
|
|
|
def build_pd_series(
|
|
data: Dict[str, Any], dtype: Optional[np.dtype] = None, **kwargs: Any
|
|
) -> pd.Series:
|
|
"""Builds a pd.Series while squelching the warning
|
|
for unspecified dtype on empty series
|
|
"""
|
|
dtype = dtype or (EMPTY_SERIES_DTYPE if not data else dtype)
|
|
if dtype is not None:
|
|
kwargs["dtype"] = dtype
|
|
return pd.Series(data, **kwargs)
|
|
|
|
|
|
def docstring_parameter(*sub: Any) -> Callable[[Any], Any]:
|
|
def dec(obj: Any) -> Any:
|
|
obj.__doc__ = obj.__doc__.format(*sub)
|
|
return obj
|
|
|
|
return dec
|
|
|
|
|
|
class SortOrder(Enum):
|
|
ASC = 0
|
|
DESC = 1
|
|
|
|
@staticmethod
|
|
def reverse(order: "SortOrder") -> "SortOrder":
|
|
if order == SortOrder.ASC:
|
|
return SortOrder.DESC
|
|
|
|
return SortOrder.ASC
|
|
|
|
@staticmethod
|
|
def to_string(order: "SortOrder") -> str:
|
|
if order == SortOrder.ASC:
|
|
return "asc"
|
|
|
|
return "desc"
|
|
|
|
@staticmethod
|
|
def from_string(order: str) -> "SortOrder":
|
|
if order == "asc":
|
|
return SortOrder.ASC
|
|
|
|
return SortOrder.DESC
|
|
|
|
|
|
def elasticsearch_date_to_pandas_date(
|
|
value: Union[int, str], date_format: str
|
|
) -> pd.Timestamp:
|
|
"""
|
|
Given a specific Elasticsearch format for a date datatype, returns the
|
|
'partial' `to_datetime` function to parse a given value in that format
|
|
|
|
**Date Formats: https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html#built-in-date-formats
|
|
|
|
Parameters
|
|
----------
|
|
value: Union[int, str]
|
|
The date value.
|
|
date_format: str
|
|
The Elasticsearch date format (ex. 'epoch_millis', 'epoch_second', etc.)
|
|
|
|
Returns
|
|
-------
|
|
datetime: pd.Timestamp
|
|
From https://www.elastic.co/guide/en/elasticsearch/reference/current/date.html
|
|
Date formats can be customised, but if no format is specified then it uses the default:
|
|
"strict_date_optional_time||epoch_millis"
|
|
Therefore if no format is specified we assume either strict_date_optional_time
|
|
or epoch_millis.
|
|
"""
|
|
|
|
if date_format is None:
|
|
try:
|
|
value = int(value)
|
|
return pd.to_datetime(value, unit="ms")
|
|
except ValueError:
|
|
return pd.to_datetime(value)
|
|
elif date_format == "epoch_millis":
|
|
return pd.to_datetime(value, unit="ms")
|
|
elif date_format == "epoch_second":
|
|
return pd.to_datetime(value, unit="s")
|
|
elif date_format == "strict_date_optional_time":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%S.%f%z", exact=False)
|
|
elif date_format == "basic_date":
|
|
return pd.to_datetime(value, format="%Y%m%d")
|
|
elif date_format == "basic_date_time":
|
|
return pd.to_datetime(value, format="%Y%m%dT%H%M%S.%f", exact=False)
|
|
elif date_format == "basic_date_time_no_millis":
|
|
return pd.to_datetime(value, format="%Y%m%dT%H%M%S%z")
|
|
elif date_format == "basic_ordinal_date":
|
|
return pd.to_datetime(value, format="%Y%j")
|
|
elif date_format == "basic_ordinal_date_time":
|
|
return pd.to_datetime(value, format="%Y%jT%H%M%S.%f%z", exact=False)
|
|
elif date_format == "basic_ordinal_date_time_no_millis":
|
|
return pd.to_datetime(value, format="%Y%jT%H%M%S%z")
|
|
elif date_format == "basic_time":
|
|
return pd.to_datetime(value, format="%H%M%S.%f%z", exact=False)
|
|
elif date_format == "basic_time_no_millis":
|
|
return pd.to_datetime(value, format="%H%M%S%z")
|
|
elif date_format == "basic_t_time":
|
|
return pd.to_datetime(value, format="T%H%M%S.%f%z", exact=False)
|
|
elif date_format == "basic_t_time_no_millis":
|
|
return pd.to_datetime(value, format="T%H%M%S%z")
|
|
elif date_format == "basic_week_date":
|
|
return pd.to_datetime(value, format="%GW%V%u")
|
|
elif date_format == "basic_week_date_time":
|
|
return pd.to_datetime(value, format="%GW%V%uT%H%M%S.%f%z", exact=False)
|
|
elif date_format == "basic_week_date_time_no_millis":
|
|
return pd.to_datetime(value, format="%GW%V%uT%H%M%S%z")
|
|
elif date_format == "strict_date":
|
|
return pd.to_datetime(value, format="%Y-%m-%d")
|
|
elif date_format == "date":
|
|
return pd.to_datetime(value, format="%Y-%m-%d")
|
|
elif date_format == "strict_date_hour":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H")
|
|
elif date_format == "date_hour":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H")
|
|
elif date_format == "strict_date_hour_minute":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H:%M")
|
|
elif date_format == "date_hour_minute":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H:%M")
|
|
elif date_format == "strict_date_hour_minute_second":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%S")
|
|
elif date_format == "date_hour_minute_second":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%S")
|
|
elif date_format == "strict_date_hour_minute_second_fraction":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%S.%f", exact=False)
|
|
elif date_format == "date_hour_minute_second_fraction":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%S.%f", exact=False)
|
|
elif date_format == "strict_date_hour_minute_second_millis":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%S.%f", exact=False)
|
|
elif date_format == "date_hour_minute_second_millis":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%S.%f", exact=False)
|
|
elif date_format == "strict_date_time":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%S.%f%z", exact=False)
|
|
elif date_format == "date_time":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%S.%f%z", exact=False)
|
|
elif date_format == "strict_date_time_no_millis":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%S%z")
|
|
elif date_format == "date_time_no_millis":
|
|
return pd.to_datetime(value, format="%Y-%m-%dT%H:%M:%S%z")
|
|
elif date_format == "strict_hour":
|
|
return pd.to_datetime(value, format="%H")
|
|
elif date_format == "hour":
|
|
return pd.to_datetime(value, format="%H")
|
|
elif date_format == "strict_hour_minute":
|
|
return pd.to_datetime(value, format="%H:%M")
|
|
elif date_format == "hour_minute":
|
|
return pd.to_datetime(value, format="%H:%M")
|
|
elif date_format == "strict_hour_minute_second":
|
|
return pd.to_datetime(value, format="%H:%M:%S")
|
|
elif date_format == "hour_minute_second":
|
|
return pd.to_datetime(value, format="%H:%M:%S")
|
|
elif date_format == "strict_hour_minute_second_fraction":
|
|
return pd.to_datetime(value, format="%H:%M:%S.%f", exact=False)
|
|
elif date_format == "hour_minute_second_fraction":
|
|
return pd.to_datetime(value, format="%H:%M:%S.%f", exact=False)
|
|
elif date_format == "strict_hour_minute_second_millis":
|
|
return pd.to_datetime(value, format="%H:%M:%S.%f", exact=False)
|
|
elif date_format == "hour_minute_second_millis":
|
|
return pd.to_datetime(value, format="%H:%M:%S.%f", exact=False)
|
|
elif date_format == "strict_ordinal_date":
|
|
return pd.to_datetime(value, format="%Y-%j")
|
|
elif date_format == "ordinal_date":
|
|
return pd.to_datetime(value, format="%Y-%j")
|
|
elif date_format == "strict_ordinal_date_time":
|
|
return pd.to_datetime(value, format="%Y-%jT%H:%M:%S.%f%z", exact=False)
|
|
elif date_format == "ordinal_date_time":
|
|
return pd.to_datetime(value, format="%Y-%jT%H:%M:%S.%f%z", exact=False)
|
|
elif date_format == "strict_ordinal_date_time_no_millis":
|
|
return pd.to_datetime(value, format="%Y-%jT%H:%M:%S%z")
|
|
elif date_format == "ordinal_date_time_no_millis":
|
|
return pd.to_datetime(value, format="%Y-%jT%H:%M:%S%z")
|
|
elif date_format == "strict_time":
|
|
return pd.to_datetime(value, format="%H:%M:%S.%f%z", exact=False)
|
|
elif date_format == "time":
|
|
return pd.to_datetime(value, format="%H:%M:%S.%f%z", exact=False)
|
|
elif date_format == "strict_time_no_millis":
|
|
return pd.to_datetime(value, format="%H:%M:%S%z")
|
|
elif date_format == "time_no_millis":
|
|
return pd.to_datetime(value, format="%H:%M:%S%z")
|
|
elif date_format == "strict_t_time":
|
|
return pd.to_datetime(value, format="T%H:%M:%S.%f%z", exact=False)
|
|
elif date_format == "t_time":
|
|
return pd.to_datetime(value, format="T%H:%M:%S.%f%z", exact=False)
|
|
elif date_format == "strict_t_time_no_millis":
|
|
return pd.to_datetime(value, format="T%H:%M:%S%z")
|
|
elif date_format == "t_time_no_millis":
|
|
return pd.to_datetime(value, format="T%H:%M:%S%z")
|
|
elif date_format == "strict_week_date":
|
|
return pd.to_datetime(value, format="%G-W%V-%u")
|
|
elif date_format == "week_date":
|
|
return pd.to_datetime(value, format="%G-W%V-%u")
|
|
elif date_format == "strict_week_date_time":
|
|
return pd.to_datetime(value, format="%G-W%V-%uT%H:%M:%S.%f%z", exact=False)
|
|
elif date_format == "week_date_time":
|
|
return pd.to_datetime(value, format="%G-W%V-%uT%H:%M:%S.%f%z", exact=False)
|
|
elif date_format == "strict_week_date_time_no_millis":
|
|
return pd.to_datetime(value, format="%G-W%V-%uT%H:%M:%S%z")
|
|
elif date_format == "week_date_time_no_millis":
|
|
return pd.to_datetime(value, format="%G-W%V-%uT%H:%M:%S%z")
|
|
elif date_format == "strict_weekyear" or date_format == "weekyear":
|
|
# TODO investigate if there is a way of converting this
|
|
raise NotImplementedError(
|
|
"strict_weekyear is not implemented due to support in pandas"
|
|
)
|
|
return pd.to_datetime(value, format="%G")
|
|
# Not supported in pandas
|
|
# ValueError: ISO year directive '%G' must be used with the ISO week directive '%V'
|
|
# and a weekday directive '%A', '%a', '%w', or '%u'.
|
|
elif date_format == "strict_weekyear_week" or date_format == "weekyear_week":
|
|
# TODO investigate if there is a way of converting this
|
|
raise NotImplementedError(
|
|
"strict_weekyear_week is not implemented due to support in pandas"
|
|
)
|
|
return pd.to_datetime(value, format="%G-W%V")
|
|
# Not supported in pandas
|
|
# ValueError: ISO year directive '%G' must be used with the ISO week directive '%V'
|
|
# and a weekday directive '%A', '%a', '%w', or '%u'.
|
|
elif date_format == "strict_weekyear_week_day":
|
|
return pd.to_datetime(value, format="%G-W%V-%u")
|
|
elif date_format == "weekyear_week_day":
|
|
return pd.to_datetime(value, format="%G-W%V-%u")
|
|
elif date_format == "strict_year":
|
|
return pd.to_datetime(value, format="%Y")
|
|
elif date_format == "year":
|
|
return pd.to_datetime(value, format="%Y")
|
|
elif date_format == "strict_year_month":
|
|
return pd.to_datetime(value, format="%Y-%m")
|
|
elif date_format == "year_month":
|
|
return pd.to_datetime(value, format="%Y-%m")
|
|
elif date_format == "strict_year_month_day":
|
|
return pd.to_datetime(value, format="%Y-%m-%d")
|
|
elif date_format == "year_month_day":
|
|
return pd.to_datetime(value, format="%Y-%m-%d")
|
|
else:
|
|
warnings.warn(
|
|
f"The '{date_format}' format is not explicitly supported."
|
|
f"Using pandas.to_datetime(value) to parse value",
|
|
Warning,
|
|
)
|
|
# TODO investigate how we could generate this just once for a bulk read.
|
|
return pd.to_datetime(value)
|
|
|
|
|
|
def ensure_es_client(
|
|
es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch]
|
|
) -> Elasticsearch:
|
|
if not isinstance(es_client, Elasticsearch):
|
|
es_client = Elasticsearch(es_client)
|
|
return es_client
|
|
|
|
|
|
def es_version(es_client: Elasticsearch) -> Tuple[int, int, int]:
|
|
"""Tags the current ES client with a cached '_eland_es_version'
|
|
property if one doesn't exist yet for the current Elasticsearch version.
|
|
"""
|
|
if not hasattr(es_client, "_eland_es_version"):
|
|
version_info = es_client.info()["version"]["number"]
|
|
match = re.match(r"^(\d+)\.(\d+)\.(\d+)", version_info)
|
|
if match is None:
|
|
raise ValueError(
|
|
f"Unable to determine Elasticsearch version. "
|
|
f"Received: {version_info}"
|
|
)
|
|
major, minor, patch = [int(x) for x in match.groups()]
|
|
es_client._eland_es_version = (major, minor, patch)
|
|
return cast(Tuple[int, int, int], es_client._eland_es_version)
|