eland/eland/tests/field_mappings/test_datetime_pytest.py
stevedodson 903fbf0341
Feature/mapping cache (#103)
* Adding python 3.5 compatibility.

Main issue is ordering of dictionaries.

* Updating notebooks with 3.7 results.

* Removing temporary code.

* Defaulting to OrderedDict for python 3.5 + lint all code

All code reformatted by PyCharm and inspection results analysed.

* Adding support for multiple arithmetic operations.

Added new 'arithmetics' file to manage this process.
More tests to be added + cleanup.

* Significant refactor to arithmetics and mappings.

Work in progress. Tests don't pass.

* Major refactor to Mappings.

Field name mappings were stored in different places
(Mappings, QueryCompiler, Operations) and needed to
be kept in sync.

With the addition of complex arithmetic operations
this became complex and difficult to maintain. Therefore,
all field naming is now in 'FieldMappings' which
replaces 'Mappings'.

Note this commit removes the cache for some of the
mapped values and so the code is SIGNIFICANTLY
slower on large indices.

In addition, the date_format that had been added to
Mappings has been removed, as it too added
unnecessary complexity.

* Adding OrderedDict for 3.5 compatibility

* Fixes to ordering issues with 3.5

* Adding simple cache for mappings in flatten

Improves performance significantly on large
datasets (>10000 rows).

* Adding updated notebooks (new info_es).

All tests (doc + nbval + pytest) pass.
2020-01-10 08:12:03 +00:00

242 lines
12 KiB
Python

# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# File called _pytest for PyCharm compatibility
from datetime import datetime
import eland as ed
from eland.tests.common import ES_TEST_CLIENT
from eland.tests.common import TestData
class TestDateTime(TestData):
    """Check that FieldMappings reports the Elasticsearch date format of date fields.

    ``setup_class`` builds a dedicated index in which every field is a ``date``
    field named after the Elasticsearch date format it is mapped with, so the
    expected format of a field is simply the field's own name.
    """

    # Sample instants (with UTC offset) that are rendered in every date format.
    times = ["2019-11-26T19:58:15.246+0000",
             "1970-01-01T00:00:03.000+0000"]

    # Name of the scratch index created in setup_class and dropped in teardown_class.
    time_index_name = 'test_time_formats'

    @classmethod
    def setup_class(cls):
        """Create and populate the test index.

        One document is indexed per entry in ``cls.times``; each document holds
        the same instant rendered in every format produced by
        ``get_time_values_from_datetime``.
        """
        es = ES_TEST_CLIENT

        dts = [datetime.strptime(time_str, "%Y-%m-%dT%H:%M:%S.%f%z")
               for time_str in cls.times]
        time_formats_docs = [cls.get_time_values_from_datetime(dt)
                             for dt in dts]

        # Every field is a 'date' whose format is the field's own name.
        mappings = {
            'properties': {
                field_name: {'type': 'date', 'format': field_name}
                for field_name in time_formats_docs[0]
            }
        }
        body = {"mappings": mappings}

        # Start from a clean slate; a single tolerant delete covers both the
        # "index exists" and "index missing" cases.
        es.indices.delete(index=cls.time_index_name, ignore=[400, 404])
        es.indices.create(index=cls.time_index_name, body=body)

        for i, time_formats in enumerate(time_formats_docs):
            es.index(index=cls.time_index_name, body=time_formats, id=i)
        # Refresh the index once all documents have been indexed.
        es.indices.refresh(index=cls.time_index_name)

    @classmethod
    def teardown_class(cls):
        """Drop the index that was created by ``setup_class``."""
        es = ES_TEST_CLIENT
        es.indices.delete(index=cls.time_index_name)

    def test_all_formats(self):
        """Each field listed in ``time_formats`` must report its own name as its date format."""
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=self.time_index_name
        )

        # do a rename so display_name for a field is different to es_field_name
        ed_field_mappings.rename({'strict_year_month': 'renamed_strict_year_month'})

        # Debugging tip: ed_field_mappings.info_es(buf) with buf = StringIO()
        # dumps the mappings for inspection.

        for format_name in self.time_formats:
            es_date_format = ed_field_mappings.date_field_format(format_name)
            assert format_name == es_date_format

    @staticmethod
    def get_time_values_from_datetime(dt: datetime) -> dict:
        """Render ``dt`` once per Elasticsearch date format.

        Parameters
        ----------
        dt: datetime
            The instant to render.

        Returns
        -------
        dict
            Maps a date format name to ``dt`` rendered for that format: integers
            for the two ``epoch_*`` entries, strings for all others.
        """
        # UTC offset suffix, e.g. '+0000' (empty string for a naive datetime).
        tz = dt.strftime("%z")

        def millis(fmt):
            # %f renders 6 microsecond digits; drop the last 3 to keep milliseconds.
            return dt.strftime(fmt + ".%f")[:-3]

        return {
            "epoch_millis": int(dt.timestamp() * 1000),
            "epoch_second": int(dt.timestamp()),
            "strict_date_optional_time": millis("%Y-%m-%dT%H:%M:%S") + tz,
            "basic_date": dt.strftime("%Y%m%d"),
            "basic_date_time": millis("%Y%m%dT%H%M%S") + tz,
            "basic_date_time_no_millis": dt.strftime("%Y%m%dT%H%M%S%z"),
            "basic_ordinal_date": dt.strftime("%Y%j"),
            "basic_ordinal_date_time": millis("%Y%jT%H%M%S") + tz,
            "basic_ordinal_date_time_no_millis": dt.strftime("%Y%jT%H%M%S%z"),
            "basic_time": millis("%H%M%S") + tz,
            "basic_time_no_millis": dt.strftime("%H%M%S%z"),
            "basic_t_time": millis("T%H%M%S") + tz,
            "basic_t_time_no_millis": dt.strftime("T%H%M%S%z"),
            "basic_week_date": dt.strftime("%GW%V%u"),
            "basic_week_date_time": millis("%GW%V%uT%H%M%S") + tz,
            "basic_week_date_time_no_millis": dt.strftime("%GW%V%uT%H%M%S%z"),
            "strict_date": dt.strftime("%Y-%m-%d"),
            "date": dt.strftime("%Y-%m-%d"),
            "strict_date_hour": dt.strftime("%Y-%m-%dT%H"),
            "date_hour": dt.strftime("%Y-%m-%dT%H"),
            "strict_date_hour_minute": dt.strftime("%Y-%m-%dT%H:%M"),
            "date_hour_minute": dt.strftime("%Y-%m-%dT%H:%M"),
            "strict_date_hour_minute_second": dt.strftime("%Y-%m-%dT%H:%M:%S"),
            "date_hour_minute_second": dt.strftime("%Y-%m-%dT%H:%M:%S"),
            "strict_date_hour_minute_second_fraction": millis("%Y-%m-%dT%H:%M:%S"),
            "date_hour_minute_second_fraction": millis("%Y-%m-%dT%H:%M:%S"),
            "strict_date_hour_minute_second_millis": millis("%Y-%m-%dT%H:%M:%S"),
            "date_hour_minute_second_millis": millis("%Y-%m-%dT%H:%M:%S"),
            "strict_date_time": millis("%Y-%m-%dT%H:%M:%S") + tz,
            "date_time": millis("%Y-%m-%dT%H:%M:%S") + tz,
            "strict_date_time_no_millis": dt.strftime("%Y-%m-%dT%H:%M:%S%z"),
            "date_time_no_millis": dt.strftime("%Y-%m-%dT%H:%M:%S%z"),
            "strict_hour": dt.strftime("%H"),
            "hour": dt.strftime("%H"),
            "strict_hour_minute": dt.strftime("%H:%M"),
            "hour_minute": dt.strftime("%H:%M"),
            "strict_hour_minute_second": dt.strftime("%H:%M:%S"),
            "hour_minute_second": dt.strftime("%H:%M:%S"),
            "strict_hour_minute_second_fraction": millis("%H:%M:%S"),
            "hour_minute_second_fraction": millis("%H:%M:%S"),
            "strict_hour_minute_second_millis": millis("%H:%M:%S"),
            "hour_minute_second_millis": millis("%H:%M:%S"),
            "strict_ordinal_date": dt.strftime("%Y-%j"),
            "ordinal_date": dt.strftime("%Y-%j"),
            "strict_ordinal_date_time": millis("%Y-%jT%H:%M:%S") + tz,
            "ordinal_date_time": millis("%Y-%jT%H:%M:%S") + tz,
            "strict_ordinal_date_time_no_millis": dt.strftime("%Y-%jT%H:%M:%S%z"),
            "ordinal_date_time_no_millis": dt.strftime("%Y-%jT%H:%M:%S%z"),
            "strict_time": millis("%H:%M:%S") + tz,
            "time": millis("%H:%M:%S") + tz,
            "strict_time_no_millis": dt.strftime("%H:%M:%S%z"),
            "time_no_millis": dt.strftime("%H:%M:%S%z"),
            "strict_t_time": millis("T%H:%M:%S") + tz,
            "t_time": millis("T%H:%M:%S") + tz,
            "strict_t_time_no_millis": dt.strftime("T%H:%M:%S%z"),
            "t_time_no_millis": dt.strftime("T%H:%M:%S%z"),
            "strict_week_date": dt.strftime("%G-W%V-%u"),
            "week_date": dt.strftime("%G-W%V-%u"),
            "strict_week_date_time": millis("%G-W%V-%uT%H:%M:%S") + tz,
            "week_date_time": millis("%G-W%V-%uT%H:%M:%S") + tz,
            "strict_week_date_time_no_millis": dt.strftime("%G-W%V-%uT%H:%M:%S%z"),
            "week_date_time_no_millis": dt.strftime("%G-W%V-%uT%H:%M:%S%z"),
            "strict_weekyear": dt.strftime("%G"),
            "weekyear": dt.strftime("%G"),
            "strict_weekyear_week": dt.strftime("%G-W%V"),
            "weekyear_week": dt.strftime("%G-W%V"),
            "strict_weekyear_week_day": dt.strftime("%G-W%V-%u"),
            "weekyear_week_day": dt.strftime("%G-W%V-%u"),
            "strict_year": dt.strftime("%Y"),
            "year": dt.strftime("%Y"),
            "strict_year_month": dt.strftime("%Y-%m"),
            "year_month": dt.strftime("%Y-%m"),
            "strict_year_month_day": dt.strftime("%Y-%m-%d"),
            "year_month_day": dt.strftime("%Y-%m-%d"),
        }

    # Elasticsearch date format name -> strptime/strftime style pattern.
    # test_all_formats only iterates the keys; the values are not read in this class.
    # NOTE(review): "basic_date_time" has no %z here although the documents
    # generated above carry a UTC offset for that format - confirm intent.
    time_formats = {
        "epoch_millis": "%Y-%m-%dT%H:%M:%S.%f",
        "epoch_second": "%Y-%m-%dT%H:%M:%S",
        "strict_date_optional_time": "%Y-%m-%dT%H:%M:%S.%f%z",
        "basic_date": "%Y%m%d",
        "basic_date_time": "%Y%m%dT%H%M%S.%f",
        "basic_date_time_no_millis": "%Y%m%dT%H%M%S%z",
        "basic_ordinal_date": "%Y%j",
        "basic_ordinal_date_time": "%Y%jT%H%M%S.%f%z",
        "basic_ordinal_date_time_no_millis": "%Y%jT%H%M%S%z",
        "basic_time": "%H%M%S.%f%z",
        "basic_time_no_millis": "%H%M%S%z",
        "basic_t_time": "T%H%M%S.%f%z",
        "basic_t_time_no_millis": "T%H%M%S%z",
        "basic_week_date": "%GW%V%u",
        "basic_week_date_time": "%GW%V%uT%H%M%S.%f%z",
        "basic_week_date_time_no_millis": "%GW%V%uT%H%M%S%z",
        "date": "%Y-%m-%d",
        "strict_date": "%Y-%m-%d",
        "strict_date_hour": "%Y-%m-%dT%H",
        "date_hour": "%Y-%m-%dT%H",
        "strict_date_hour_minute": "%Y-%m-%dT%H:%M",
        "date_hour_minute": "%Y-%m-%dT%H:%M",
        "strict_date_hour_minute_second": "%Y-%m-%dT%H:%M:%S",
        "date_hour_minute_second": "%Y-%m-%dT%H:%M:%S",
        "strict_date_hour_minute_second_fraction": "%Y-%m-%dT%H:%M:%S.%f",
        "date_hour_minute_second_fraction": "%Y-%m-%dT%H:%M:%S.%f",
        "strict_date_hour_minute_second_millis": "%Y-%m-%dT%H:%M:%S.%f",
        "date_hour_minute_second_millis": "%Y-%m-%dT%H:%M:%S.%f",
        "strict_date_time": "%Y-%m-%dT%H:%M:%S.%f%z",
        "date_time": "%Y-%m-%dT%H:%M:%S.%f%z",
        "strict_date_time_no_millis": "%Y-%m-%dT%H:%M:%S%z",
        "date_time_no_millis": "%Y-%m-%dT%H:%M:%S%z",
        "strict_hour": "%H",
        "hour": "%H",
        "strict_hour_minute": "%H:%M",
        "hour_minute": "%H:%M",
        "strict_hour_minute_second": "%H:%M:%S",
        "hour_minute_second": "%H:%M:%S",
        "strict_hour_minute_second_fraction": "%H:%M:%S.%f",
        "hour_minute_second_fraction": "%H:%M:%S.%f",
        "strict_hour_minute_second_millis": "%H:%M:%S.%f",
        "hour_minute_second_millis": "%H:%M:%S.%f",
        "strict_ordinal_date": "%Y-%j",
        "ordinal_date": "%Y-%j",
        "strict_ordinal_date_time": "%Y-%jT%H:%M:%S.%f%z",
        "ordinal_date_time": "%Y-%jT%H:%M:%S.%f%z",
        "strict_ordinal_date_time_no_millis": "%Y-%jT%H:%M:%S%z",
        "ordinal_date_time_no_millis": "%Y-%jT%H:%M:%S%z",
        "strict_time": "%H:%M:%S.%f%z",
        "time": "%H:%M:%S.%f%z",
        "strict_time_no_millis": "%H:%M:%S%z",
        "time_no_millis": "%H:%M:%S%z",
        "strict_t_time": "T%H:%M:%S.%f%z",
        "t_time": "T%H:%M:%S.%f%z",
        "strict_t_time_no_millis": "T%H:%M:%S%z",
        "t_time_no_millis": "T%H:%M:%S%z",
        "strict_week_date": "%G-W%V-%u",
        "week_date": "%G-W%V-%u",
        "strict_week_date_time": "%G-W%V-%uT%H:%M:%S.%f%z",
        "week_date_time": "%G-W%V-%uT%H:%M:%S.%f%z",
        "strict_week_date_time_no_millis": "%G-W%V-%uT%H:%M:%S%z",
        "week_date_time_no_millis": "%G-W%V-%uT%H:%M:%S%z",
        "strict_weekyear_week_day": "%G-W%V-%u",
        "weekyear_week_day": "%G-W%V-%u",
        "strict_year": "%Y",
        "year": "%Y",
        "strict_year_month": "%Y-%m",
        "year_month": "%Y-%m",
        "strict_year_month_day": "%Y-%m-%d",
        "year_month_day": "%Y-%m-%d"
    }

    # The weekyear and weekyear_week formats (strict and non-strict) are indexed
    # by setup_class but excluded from time_formats, as pandas throws a ValueError:
    #   "strict_weekyear": ("%G", None) - not supported in pandas
    #   "strict_weekyear_week": ("%G-W%V", None),
    #   E ValueError: ISO year directive '%G' must be used with the ISO week
    #   directive '%V' and a weekday directive '%A', '%a', '%w', or '%u'.