Feature/arithmetic ops (#102)
* Adding Python 3.5 compatibility. The main issue is the ordering of dictionaries.
* Updating notebooks with 3.7 results.
* Removing temporary code.
* Defaulting to OrderedDict for Python 3.5 + lint all code. All code reformatted by PyCharm and inspection results analysed.
* Adding support for multiple arithmetic operations. Added a new 'arithmetics' file to manage this process. More tests to be added + cleanup.
* Significant refactor to arithmetics and mappings. Work in progress; tests don't pass.
* Major refactor to Mappings. Field name mappings were stored in different places (Mappings, QueryCompiler, Operations) and needed to be kept in sync. With the addition of complex arithmetic operations this became complex and difficult to maintain. Therefore, all field naming is now in 'FieldMappings', which replaces 'Mappings'. Note this commit removes the cache for some of the mapped values, so the code is SIGNIFICANTLY slower on large indices. In addition, date_format has been removed from Mappings; it added more unnecessary complexity.
* Adding OrderedDict for 3.5 compatibility.
* Fixes to ordering issues with 3.5.
This commit is contained in: parent 617583183f, commit efe21a6d87
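A minimal usage sketch of the feature this commit adds (not part of the diff; the cluster address, index, and field names are hypothetical, and the exact API surface at this commit may differ slightly):

    import eland as ed

    df = ed.DataFrame('localhost', 'flights')  # hypothetical cluster and index
    # No computation happens here; eland records a chain of arithmetic tasks
    s = df['DistanceKilometers'] / df['FlightTimeMin']
    # On evaluation the chain resolves to a single Painless expression, e.g.
    #   (doc['DistanceKilometers'].value / doc['FlightTimeMin'].value)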
@@ -71,7 +71,8 @@ except ImportError:
 extlinks = {
     'pandas_api_docs': ('https://pandas.pydata.org/pandas-docs/version/0.25.3/reference/api/%s.html', ''),
-    'pandas_user_guide': ('https://pandas.pydata.org/pandas-docs/version/0.25.3/user_guide/%s.html', 'Pandas User Guide/'),
+    'pandas_user_guide': (
+        'https://pandas.pydata.org/pandas-docs/version/0.25.3/user_guide/%s.html', 'Pandas User Guide/'),
     'es_api_docs': ('https://www.elastic.co/guide/en/elasticsearch/reference/current/%s.html', '')
 }

@@ -106,3 +107,6 @@ html_theme = "pandas_sphinx_theme"
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
 # html_static_path = ['_static']
+
+html_logo = "logo/eland.png"
+html_favicon = "logo/eland_favicon.png"
BIN docs/source/logo/eland.png (new file, binary not shown, 21 KiB)
BIN docs/source/logo/eland_favicon.png (new file, binary not shown, 14 KiB)
@@ -18,7 +18,7 @@ from eland.common import *
 from eland.client import *
 from eland.filter import *
 from eland.index import *
-from eland.mappings import *
+from eland.field_mappings import *
 from eland.query import *
 from eland.operations import *
 from eland.query_compiler import *
eland/arithmetics.py (new file, 215 lines)
@@ -0,0 +1,215 @@
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC, abstractmethod
from io import StringIO

import numpy as np


class ArithmeticObject(ABC):
    @property
    @abstractmethod
    def value(self):
        pass

    @abstractmethod
    def dtype(self):
        pass

    @abstractmethod
    def resolve(self):
        pass

    @abstractmethod
    def __repr__(self):
        pass


class ArithmeticString(ArithmeticObject):
    def __init__(self, value):
        self._value = value

    def resolve(self):
        return self.value

    @property
    def dtype(self):
        return np.dtype(object)

    @property
    def value(self):
        return "'{}'".format(self._value)

    def __repr__(self):
        return self.value


class ArithmeticNumber(ArithmeticObject):
    def __init__(self, value, dtype):
        self._value = value
        self._dtype = dtype

    def resolve(self):
        return self.value

    @property
    def value(self):
        return "{}".format(self._value)

    @property
    def dtype(self):
        return self._dtype

    def __repr__(self):
        return self.value


class ArithmeticSeries(ArithmeticObject):
    def __init__(self, query_compiler, display_name, dtype):
        task = query_compiler.get_arithmetic_op_fields()
        if task is not None:
            self._value = task._arithmetic_series.value
            self._tasks = task._arithmetic_series._tasks.copy()
            self._dtype = dtype
        else:
            aggregatable_field_name = query_compiler.display_name_to_aggregatable_name(display_name)
            if aggregatable_field_name is None:
                raise ValueError(
                    "Can not perform arithmetic operations on non aggregatable fields. "
                    "{} is not aggregatable.".format(display_name)
                )

            self._value = "doc['{}'].value".format(aggregatable_field_name)
            self._tasks = []
            self._dtype = dtype

    @property
    def value(self):
        return self._value

    @property
    def dtype(self):
        return self._dtype

    def __repr__(self):
        buf = StringIO()
        buf.write("Series: {} ".format(self.value))
        buf.write("Tasks: ")
        for task in self._tasks:
            buf.write("{} ".format(repr(task)))
        return buf.getvalue()

    def resolve(self):
        value = self._value

        for task in self._tasks:
            if task.op_name == '__add__':
                value = "({} + {})".format(value, task.object.resolve())
            elif task.op_name == '__truediv__':
                value = "({} / {})".format(value, task.object.resolve())
            elif task.op_name == '__floordiv__':
                value = "Math.floor({} / {})".format(value, task.object.resolve())
            elif task.op_name == '__mod__':
                value = "({} % {})".format(value, task.object.resolve())
            elif task.op_name == '__mul__':
                value = "({} * {})".format(value, task.object.resolve())
            elif task.op_name == '__pow__':
                value = "Math.pow({}, {})".format(value, task.object.resolve())
            elif task.op_name == '__sub__':
                value = "({} - {})".format(value, task.object.resolve())
            elif task.op_name == '__radd__':
                value = "({} + {})".format(task.object.resolve(), value)
            elif task.op_name == '__rtruediv__':
                value = "({} / {})".format(task.object.resolve(), value)
            elif task.op_name == '__rfloordiv__':
                value = "Math.floor({} / {})".format(task.object.resolve(), value)
            elif task.op_name == '__rmod__':
                value = "({} % {})".format(task.object.resolve(), value)
            elif task.op_name == '__rmul__':
                value = "({} * {})".format(task.object.resolve(), value)
            elif task.op_name == '__rpow__':
                value = "Math.pow({}, {})".format(task.object.resolve(), value)
            elif task.op_name == '__rsub__':
                value = "({} - {})".format(task.object.resolve(), value)

        return value

    def arithmetic_operation(self, op_name, right):
        # check if operation is supported (raises on unsupported)
        self.check_is_supported(op_name, right)

        task = ArithmeticTask(op_name, right)
        self._tasks.append(task)
        return self

    def check_is_supported(self, op_name, right):
        # supported set is
        # series.number op_name number (all ops)
        # series.string op_name string (only add)
        # series.string op_name int (only mul)
        # series.string op_name float (none)
        # series.int op_name string (none)
        # series.float op_name string (none)

        # see end of https://pandas.pydata.org/pandas-docs/stable/getting_started/basics.html?highlight=dtype for
        # dtype hierarchy
        if np.issubdtype(self.dtype, np.number) and np.issubdtype(right.dtype, np.number):
            # series.number op_name number (all ops)
            return True
        elif np.issubdtype(self.dtype, np.object_) and np.issubdtype(right.dtype, np.object_):
            # series.string op_name string (only add)
            if op_name == '__add__' or op_name == '__radd__':
                return True
        elif np.issubdtype(self.dtype, np.object_) and np.issubdtype(right.dtype, np.integer):
            # series.string op_name int (only mul)
            if op_name == '__mul__':
                return True

        raise TypeError(
            "Arithmetic operation on incompatible types {} {} {}".format(self.dtype, op_name, right.dtype))


class ArithmeticTask:
    def __init__(self, op_name, object):
        self._op_name = op_name

        if not isinstance(object, ArithmeticObject):
            raise TypeError("Task requires ArithmeticObject not {}".format(type(object)))
        self._object = object

    def __repr__(self):
        buf = StringIO()
        buf.write("op_name: {} ".format(self.op_name))
        buf.write("object: {} ".format(repr(self.object)))
        return buf.getvalue()

    @property
    def op_name(self):
        return self._op_name

    @property
    def object(self):
        return self._object
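A standalone sketch of how resolve() composes a Painless expression (hypothetical field and operands, mirroring the string templates above); explicit parentheses preserve evaluation order across chained tasks:

    value = "doc['FlightDelayMin'].value"
    value = "({} + {})".format(value, "10")        # an __add__ task with right=10
    value = "Math.pow({}, {})".format(value, "2")  # then a __pow__ task with right=2
    # value is now: Math.pow((doc['FlightDelayMin'].value + 10), 2)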
@@ -118,7 +118,7 @@ class DataFrame(NDFrame):
    There are effectively 2 constructors:

    1. client, index_pattern, columns, index_field
-    2. query_compiler (eland.ElandQueryCompiler)
+    2. query_compiler (eland.QueryCompiler)

    The constructor with 'query_compiler' is for internal use only.
    """
@@ -531,35 +531,11 @@ class DataFrame(NDFrame):
        is_source_field: False
        Mappings:
         capabilities:
-                            _source   es_dtype        pd_dtype  searchable  aggregatable
-        AvgTicketPrice         True      float         float64        True          True
-        Cancelled              True    boolean            bool        True          True
-        Carrier                True    keyword          object        True          True
-        Dest                   True    keyword          object        True          True
-        DestAirportID          True    keyword          object        True          True
-        DestCityName           True    keyword          object        True          True
-        DestCountry            True    keyword          object        True          True
-        DestLocation           True  geo_point          object        True          True
-        DestRegion             True    keyword          object        True          True
-        DestWeather            True    keyword          object        True          True
-        DistanceKilometers     True      float         float64        True          True
-        DistanceMiles          True      float         float64        True          True
-        FlightDelay            True    boolean            bool        True          True
-        FlightDelayMin         True    integer           int64        True          True
-        FlightDelayType        True    keyword          object        True          True
-        FlightNum              True    keyword          object        True          True
-        FlightTimeHour         True      float         float64        True          True
-        FlightTimeMin          True      float         float64        True          True
-        Origin                 True    keyword          object        True          True
-        OriginAirportID        True    keyword          object        True          True
-        OriginCityName         True    keyword          object        True          True
-        OriginCountry          True    keyword          object        True          True
-        OriginLocation         True  geo_point          object        True          True
-        OriginRegion           True    keyword          object        True          True
-        OriginWeather          True    keyword          object        True          True
-        dayOfWeek              True    integer           int64        True          True
-        timestamp              True       date  datetime64[ns]        True          True
-        date_fields_format: {}
+                           es_field_name  is_source  es_dtype es_date_format        pd_dtype  is_searchable  is_aggregatable  is_scripted aggregatable_es_field_name
+        timestamp              timestamp       True      date           None  datetime64[ns]           True             True        False                  timestamp
+        OriginAirportID  OriginAirportID       True   keyword           None          object           True             True        False            OriginAirportID
+        DestAirportID      DestAirportID       True   keyword           None          object           True             True        False              DestAirportID
+        FlightDelayMin    FlightDelayMin       True   integer           None           int64           True             True        False             FlightDelayMin
        Operations:
         tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}})), ('tail': ('sort_field': '_doc', 'count': 5))]
         size: 5

@@ -567,8 +543,6 @@ class DataFrame(NDFrame):
         _source: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
         body: {'query': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}}
         post_processing: [('sort_index')]
-        'field_to_display_names': {}
-        'display_to_field_names': {}
        <BLANKLINE>
        """
        buf = StringIO()
@@ -11,6 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import warnings
 from collections import OrderedDict

@@ -18,9 +30,10 @@ import numpy as np
 import pandas as pd
 from pandas.core.dtypes.common import (is_float_dtype, is_bool_dtype, is_integer_dtype, is_datetime_or_timedelta_dtype,
                                        is_string_dtype)
+from pandas.core.dtypes.inference import is_list_like


-class Mappings:
+class FieldMappings:
    """
    General purpose to manage Elasticsearch to/from pandas mappings

@@ -30,26 +43,28 @@ class Mappings:
    _mappings_capabilities: pandas.DataFrame
        A data frame summarising the capabilities of the index mapping

-        _source      - is top level field (i.e. not a multi-field sub-field)
-        es_dtype     - Elasticsearch field datatype
-        pd_dtype     - Pandas datatype
-        searchable   - is the field searchable?
-        aggregatable - is the field aggregatable?
-                                     _source es_dtype pd_dtype  searchable  aggregatable
-        maps-telemetry.min              True     long    int64        True          True
-        maps-telemetry.avg              True    float  float64        True          True
-        city                            True     text   object        True         False
-        user_name                       True  keyword   object        True          True
-        origin_location.lat.keyword    False  keyword   object        True          True
-        type                            True  keyword   object        True          True
-        origin_location.lat             True     text   object        True         False
+        index                      - the eland display name
+
+        es_field_name              - the Elasticsearch field name
+        is_source                  - is top level field (i.e. not a multi-field sub-field)
+        es_dtype                   - Elasticsearch field datatype
+        es_date_format             - Elasticsearch date format (or None)
+        pd_dtype                   - Pandas datatype
+        is_searchable              - is the field searchable?
+        is_aggregatable            - is the field aggregatable?
+        is_scripted                - is the field a scripted_field?
+        aggregatable_es_field_name - either es_field_name (if aggregatable),
+                                     or es_field_name.keyword (if exists) or None
    """

+    # the labels for each column (display_name is index)
+    column_labels = ['es_field_name', 'is_source', 'es_dtype', 'es_date_format', 'pd_dtype', 'is_searchable',
+                     'is_aggregatable', 'is_scripted', 'aggregatable_es_field_name']
+
    def __init__(self,
                 client=None,
                 index_pattern=None,
-                 mappings=None):
+                 display_names=None):
        """
        Parameters
        ----------

@@ -59,39 +74,29 @@ class Mappings:
        index_pattern: str
            Elasticsearch index pattern

-        Copy constructor arguments
-
-        mappings: Mappings
-            Object to copy
+        display_names: list of str
+            Field names to display
        """
+        if (client is None) or (index_pattern is None):
+            raise ValueError("Can not initialise mapping without client or index_pattern {} {}", client, index_pattern)

-        # here we keep track of the format of any date fields
-        self._date_fields_format = dict()
-        if (client is not None) and (index_pattern is not None):
-            get_mapping = client.get_mapping(index=index_pattern)
+        get_mapping = client.get_mapping(index=index_pattern)

-            # Get all fields (including all nested) and then all field_caps
-            all_fields, self._date_fields_format = Mappings._extract_fields_from_mapping(get_mapping)
-            all_fields_caps = client.field_caps(index=index_pattern, fields='*')
+        # Get all fields (including all nested) and then all field_caps
+        all_fields = FieldMappings._extract_fields_from_mapping(get_mapping)
+        all_fields_caps = client.field_caps(index=index_pattern, fields='*')

-            # Get top level (not sub-field multifield) mappings
-            source_fields, _ = Mappings._extract_fields_from_mapping(get_mapping, source_only=True)
+        # Get top level (not sub-field multifield) mappings
+        source_fields = FieldMappings._extract_fields_from_mapping(get_mapping, source_only=True)

-            # Populate capability matrix of fields
-            # field_name, es_dtype, pd_dtype, is_searchable, is_aggregtable, is_source
-            self._mappings_capabilities = Mappings._create_capability_matrix(all_fields, source_fields, all_fields_caps)
-        else:
-            # straight copy
-            self._mappings_capabilities = mappings._mappings_capabilities.copy()
+        # Populate capability matrix of fields
+        self._mappings_capabilities = FieldMappings._create_capability_matrix(all_fields, source_fields,
+                                                                              all_fields_caps)

-        # Cache source field types for efficient lookup
-        # (this massively improves performance of DataFrame.flatten)
-        self._source_field_pd_dtypes = OrderedDict()
-
-        for field_name in self._mappings_capabilities[self._mappings_capabilities._source].index:
-            pd_dtype = self._mappings_capabilities.loc[field_name]['pd_dtype']
-            self._source_field_pd_dtypes[field_name] = pd_dtype
+        if display_names is not None:
+            self.display_names = display_names

    @staticmethod
    def _extract_fields_from_mapping(mappings, source_only=False, date_format=None):
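A construction sketch for the new FieldMappings (hypothetical client and index; display_names narrows and orders the capability matrix):

    from eland import Client, FieldMappings

    client = Client('localhost')  # hypothetical connection
    fm = FieldMappings(client=client, index_pattern='flights',
                       display_names=['timestamp', 'OriginAirportID'])
    print(fm.display_names)  # ['timestamp', 'OriginAirportID']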
@@ -143,7 +148,6 @@ class Mappings:

        """
        fields = OrderedDict()
-        dates_format = dict()

        # Recurse until we get a 'type: xxx'
        def flatten(x, name=''):

@@ -153,15 +157,16 @@ class Mappings:
                    field_name = name[:-1]
                    field_type = x[a]
                    # if field_type is 'date' keep track of the format info when available
+                    date_format = None
                    if field_type == "date" and "format" in x:
-                        dates_format[field_name] = x["format"]
+                        date_format = x["format"]
                    # If there is a conflicting type, warn - first values added wins
                    if field_name in fields and fields[field_name] != field_type:
                        warnings.warn("Field {} has conflicting types {} != {}".
                                      format(field_name, fields[field_name], field_type),
                                      UserWarning)
                    else:
-                        fields[field_name] = field_type
+                        fields[field_name] = (field_type, date_format)
                elif a == 'properties' or (not source_only and a == 'fields'):
                    flatten(x[a], name)
                elif not (source_only and a == 'fields'):  # ignore multi-field fields for source_only

@@ -173,7 +178,7 @@ class Mappings:

        flatten(properties)

-        return fields, dates_format
+        return fields

    @staticmethod
    def _create_capability_matrix(all_fields, source_fields, all_fields_caps):
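A sketch of what the reworked _extract_fields_from_mapping now returns for a hypothetical mapping; each value is a (es_dtype, es_date_format) tuple rather than a bare type string:

    properties = {
        'timestamp': {'type': 'date', 'format': 'epoch_millis'},
        'city': {'type': 'text', 'fields': {'keyword': {'type': 'keyword'}}},
    }
    # all fields:   {'timestamp': ('date', 'epoch_millis'),
    #                'city': ('text', None), 'city.keyword': ('keyword', None)}
    # source_only:  {'timestamp': ('date', 'epoch_millis'), 'city': ('text', None)}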
@@ -206,7 +211,6 @@ class Mappings:
        """
        all_fields_caps_fields = all_fields_caps['fields']

-        field_names = ['_source', 'es_dtype', 'pd_dtype', 'searchable', 'aggregatable']
        capability_matrix = OrderedDict()

        for field, field_caps in all_fields_caps_fields.items():

@@ -214,12 +218,17 @@ class Mappings:
            # v = {'long': {'type': 'long', 'searchable': True, 'aggregatable': True}}
            for kk, vv in field_caps.items():
                _source = (field in source_fields)
+                es_field_name = field
                es_dtype = vv['type']
-                pd_dtype = Mappings._es_dtype_to_pd_dtype(vv['type'])
-                searchable = vv['searchable']
-                aggregatable = vv['aggregatable']
+                es_date_format = all_fields[field][1]
+                pd_dtype = FieldMappings._es_dtype_to_pd_dtype(vv['type'])
+                is_searchable = vv['searchable']
+                is_aggregatable = vv['aggregatable']
+                scripted = False
+                aggregatable_es_field_name = None  # this is populated later

-                caps = [_source, es_dtype, pd_dtype, searchable, aggregatable]
+                caps = [es_field_name, _source, es_dtype, es_date_format, pd_dtype, is_searchable, is_aggregatable,
+                        scripted, aggregatable_es_field_name]

                capability_matrix[field] = caps

@@ -232,9 +241,34 @@ class Mappings:
                                  format(field, vv['non_searchable_indices']),
                                  UserWarning)

-        capability_matrix_df = pd.DataFrame.from_dict(capability_matrix, orient='index', columns=field_names)
+        capability_matrix_df = pd.DataFrame.from_dict(capability_matrix, orient='index',
+                                                      columns=FieldMappings.column_labels)

-        return capability_matrix_df.sort_index()
+        def find_aggregatable(row, df):
+            # convert series to dict so we can add 'aggregatable_es_field_name'
+            row_as_dict = row.to_dict()
+            if row_as_dict['is_aggregatable'] == False:
+                # if not aggregatable, then try field.keyword
+                es_field_name_keyword = row.es_field_name + '.keyword'
+                try:
+                    series = df.loc[df.es_field_name == es_field_name_keyword]
+                    if not series.empty and series.is_aggregatable.squeeze():
+                        row_as_dict['aggregatable_es_field_name'] = es_field_name_keyword
+                    else:
+                        row_as_dict['aggregatable_es_field_name'] = None
+                except KeyError:
+                    row_as_dict['aggregatable_es_field_name'] = None
+            else:
+                row_as_dict['aggregatable_es_field_name'] = row_as_dict['es_field_name']
+
+            return pd.Series(data=row_as_dict)
+
+        # add aggregatable_es_field_name column by applying action to each row
+        capability_matrix_df = capability_matrix_df.apply(find_aggregatable, args=(capability_matrix_df,),
+                                                          axis='columns')
+
+        # return just source fields (as these are the only ones we display)
+        return capability_matrix_df[capability_matrix_df.is_source].sort_index()

    @staticmethod
    def _es_dtype_to_pd_dtype(es_dtype):
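The effect of find_aggregatable, sketched on two hypothetical rows: keyword fields are aggregatable as-is, while a text field falls back to its .keyword sub-field when one exists:

    # es_field_name        is_aggregatable -> aggregatable_es_field_name
    # 'Carrier' (keyword)  True               'Carrier'
    # 'city'    (text)     False              'city.keyword' (if present, else None)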
@@ -352,105 +386,50 @@ class Mappings:
        if geo_points is not None and field_name_name in geo_points:
            es_dtype = 'geo_point'
        else:
-            es_dtype = Mappings._pd_dtype_to_es_dtype(dtype)
+            es_dtype = FieldMappings._pd_dtype_to_es_dtype(dtype)

        mappings['properties'][field_name_name] = OrderedDict()
        mappings['properties'][field_name_name]['type'] = es_dtype

        return {"mappings": mappings}

-    def all_fields(self):
-        """
-        Returns
-        -------
-        all_fields: list
-            All typed fields in the index mapping
-        """
-        return self._mappings_capabilities.index.tolist()
-
-    def field_capabilities(self, field_name):
+    def aggregatable_field_name(self, display_name):
        """
+        Return a single aggregatable field_name from display_name
+
+        Logic here is that field_name names are '_source' fields and keyword fields
+        may be nested beneath the field. E.g.
+        customer_full_name: text
+        customer_full_name.keyword: keyword
+
+        customer_full_name.keyword is the aggregatable field for customer_full_name
+
        Parameters
        ----------
-        field_name: str
+        display_name: str

        Returns
        -------
-        mappings_capabilities: pd.Series with index values:
-            _source: bool
-                Is this field name a top-level source field?
-            ed_dtype: str
-                The Elasticsearch data type
-            pd_dtype: str
-                The pandas data type
-            searchable: bool
-                Is the field searchable in Elasticsearch?
-            aggregatable: bool
-                Is the field aggregatable in Elasticsearch?
+        aggregatable_es_field_name: str or None
+            The aggregatable field name associated with display_name. This could be the field_name, or the
+            field_name.keyword.
+
+        raise KeyError if the field_name doesn't exist in the mapping, or isn't aggregatable
        """
-        try:
-            field_capabilities = self._mappings_capabilities.loc[field_name]
-        except KeyError:
-            field_capabilities = pd.Series()
-        return field_capabilities
+        if display_name not in self._mappings_capabilities.index:
+            raise KeyError("Can not get aggregatable field name for invalid display name {}".format(display_name))

-    def get_date_field_format(self, field_name):
-        """
-        Parameters
-        ----------
-        field_name: str
-
-        Returns
-        -------
-        str
-            A string (for date fields) containing the date format for the field
-        """
-        return self._date_fields_format.get(field_name)
+        if self._mappings_capabilities.loc[display_name].aggregatable_es_field_name is None:
+            warnings.warn("Aggregations not supported for '{}' '{}'".format(display_name,
+                                                                            self._mappings_capabilities.loc[
+                                                                                display_name].es_field_name))
+
+        return self._mappings_capabilities.loc[display_name].aggregatable_es_field_name

-    def source_field_pd_dtype(self, field_name):
-        """
-        Parameters
-        ----------
-        field_name: str
-
-        Returns
-        -------
-        is_source_field: bool
-            Is this field name a top-level source field?
-        pd_dtype: str
-            The pandas data type we map to
-        """
-        pd_dtype = 'object'
-        is_source_field = False
-
-        if field_name in self._source_field_pd_dtypes:
-            is_source_field = True
-            pd_dtype = self._source_field_pd_dtypes[field_name]
-
-        return is_source_field, pd_dtype
-
-    def is_source_field(self, field_name):
-        """
-        Parameters
-        ----------
-        field_name: str
-
-        Returns
-        -------
-        is_source_field: bool
-            Is this field name a top-level source field?
-        """
-        is_source_field = False
-
-        if field_name in self._source_field_pd_dtypes:
-            is_source_field = True
-
-        return is_source_field
-
-    def aggregatable_field_names(self, field_names=None):
+    def aggregatable_field_names(self):
        """
-        Return a dict of aggregatable field_names from all field_names or field_names list
-        {'customer_full_name': 'customer_full_name.keyword', ...}
+        Return a list of aggregatable Elasticsearch field_names for all display names.
+        If field is not aggregatable_field_names, return nothing.

        Logic here is that field_name names are '_source' fields and keyword fields
        may be nested beneath the field. E.g.
@@ -461,29 +440,83 @@ class Mappings:

        Returns
        -------
-        OrderedDict
-            e.g. {'customer_full_name': 'customer_full_name.keyword', ...}
+        Dict of aggregatable_field_names
+            key = aggregatable_field_name, value = field_name
+            e.g. {'customer_full_name.keyword': 'customer_full_name', ...}
        """
-        if field_names is None:
-            field_names = self.source_fields()
-        aggregatables = OrderedDict()
-        for field_name in field_names:
-            capabilities = self.field_capabilities(field_name)
-            if capabilities['aggregatable']:
-                aggregatables[field_name] = field_name
-            else:
-                # Try 'field_name.keyword'
-                field_name_keyword = field_name + '.keyword'
-                capabilities = self.field_capabilities(field_name_keyword)
-                if not capabilities.empty and capabilities.get('aggregatable'):
-                    aggregatables[field_name_keyword] = field_name
+        non_aggregatables = self._mappings_capabilities[self._mappings_capabilities.aggregatable_es_field_name.isna()]
+        if not non_aggregatables.empty:
+            warnings.warn("Aggregations not supported for '{}'".format(non_aggregatables))

-        if not aggregatables:
-            raise ValueError("Aggregations not supported for ", field_names)
+        aggregatables = self._mappings_capabilities[self._mappings_capabilities.aggregatable_es_field_name.notna()]

-        return aggregatables
+        # extract relevant fields and convert to dict
+        # <class 'dict'>: {'category.keyword': 'category', 'currency': 'currency', ...
+        return OrderedDict(aggregatables[['aggregatable_es_field_name', 'es_field_name']].to_dict(orient='split')['data'])

-    def numeric_source_fields(self, field_names, include_bool=True):
+    def get_date_field_format(self, es_field_name):
+        """
+        Parameters
+        ----------
+        es_field_name: str
+
+        Returns
+        -------
+        str
+            A string (for date fields) containing the date format for the field
+        """
+        return self._mappings_capabilities.loc[
+            self._mappings_capabilities.es_field_name == es_field_name].es_date_format.squeeze()
+
+    def field_name_pd_dtype(self, es_field_name):
+        """
+        Parameters
+        ----------
+        es_field_name: str
+
+        Returns
+        -------
+        pd_dtype: str
+            The pandas data type we map to
+
+        Raises
+        ------
+        KeyError
+            If es_field_name does not exist in mapping
+        """
+        if es_field_name not in self._mappings_capabilities.es_field_name:
+            raise KeyError("es_field_name {} does not exist".format(es_field_name))
+
+        pd_dtype = self._mappings_capabilities.loc[
+            self._mappings_capabilities.es_field_name == es_field_name
+        ].pd_dtype.squeeze()
+        return pd_dtype
+
+    def add_scripted_field(self, scripted_field_name, display_name, pd_dtype):
+        # if this display name is used somewhere else, drop it
+        if display_name in self._mappings_capabilities.index:
+            self._mappings_capabilities = self._mappings_capabilities.drop(index=[display_name])
+
+        # ['es_field_name', 'is_source', 'es_dtype', 'es_date_format', 'pd_dtype', 'is_searchable',
+        #  'is_aggregatable', 'is_scripted', 'aggregatable_es_field_name']
+
+        capabilities = {display_name: [scripted_field_name,
+                                       False,
+                                       self._pd_dtype_to_es_dtype(pd_dtype),
+                                       None,
+                                       pd_dtype,
+                                       True,
+                                       True,
+                                       True,
+                                       scripted_field_name]}
+
+        capability_matrix_row = pd.DataFrame.from_dict(capabilities, orient='index',
+                                                       columns=FieldMappings.column_labels)
+
+        self._mappings_capabilities = self._mappings_capabilities.append(capability_matrix_row)
+
+    def numeric_source_fields(self, include_bool=True):
        """
        Returns
        -------
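A sketch of the inverted dict the new aggregatable_field_names() returns, and of registering a scripted field (hypothetical names):

    fm.aggregatable_field_names()
    # OrderedDict([('customer_full_name.keyword', 'customer_full_name'),
    #              ('currency', 'currency')])

    fm.add_scripted_field('script_field_1', 'my_sum', 'int64')
    # appends a row: es_field_name='script_field_1', is_scripted=True,
    #                aggregatable_es_field_name='script_field_1'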
@@ -491,60 +524,83 @@ class Mappings:
            List of source fields where pd_dtype == (int64 or float64 or bool)
        """
        if include_bool:
-            df = self._mappings_capabilities[self._mappings_capabilities._source &
-                                             ((self._mappings_capabilities.pd_dtype == 'int64') |
+            df = self._mappings_capabilities[((self._mappings_capabilities.pd_dtype == 'int64') |
                                              (self._mappings_capabilities.pd_dtype == 'float64') |
                                              (self._mappings_capabilities.pd_dtype == 'bool'))]
        else:
-            df = self._mappings_capabilities[self._mappings_capabilities._source &
-                                             ((self._mappings_capabilities.pd_dtype == 'int64') |
+            df = self._mappings_capabilities[((self._mappings_capabilities.pd_dtype == 'int64') |
                                              (self._mappings_capabilities.pd_dtype == 'float64'))]
-        # if field_names exists, filter index with field_names
-        if field_names is not None:
-            # reindex adds NA for non-existing field_names (non-numeric), so drop these after reindex
-            df = df.reindex(field_names)
-            df.dropna(inplace=True)

-        # return as list
-        return df.index.to_list()
+        # return as list for display names (in display_name order)
+        return df.es_field_name.to_list()

-    def source_fields(self):
-        """
-        Returns
-        -------
-        source_fields: list of str
-            List of source fields
-        """
-        return self._source_field_pd_dtypes.keys()
+    def get_field_names(self, include_scripted_fields=True):
+        if include_scripted_fields:
+            return self._mappings_capabilities.es_field_name.to_list()

-    def count_source_fields(self):
-        """
-        Returns
-        -------
-        count_source_fields: int
-            Number of source fields in mapping
-        """
-        return len(self._source_field_pd_dtypes)
+        return self._mappings_capabilities[
+            self._mappings_capabilities.is_scripted == False
+        ].es_field_name.to_list()

-    def dtypes(self, field_names=None):
+    def _get_display_names(self):
+        return self._mappings_capabilities.index.to_list()
+
+    def _set_display_names(self, display_names):
+        if not is_list_like(display_names):
+            raise ValueError("'{}' is not list like".format(display_names))
+
+        if list(set(display_names) - set(self.display_names)):
+            raise KeyError("{} not in display names {}".format(display_names, self.display_names))
+
+        self._mappings_capabilities = self._mappings_capabilities.reindex(display_names)
+
+    display_names = property(_get_display_names, _set_display_names)
+
+    def dtypes(self):
        """
        Returns
        -------
        dtypes: pd.Series
-            Source field name + pd_dtype as np.dtype
+            Index: Display name
+            Values: pd_dtype as np.dtype
        """
-        if field_names is not None:
-            data = OrderedDict()
-            for key in field_names:
-                data[key] = np.dtype(self._source_field_pd_dtypes[key])
-            return pd.Series(data)
+        pd_dtypes = self._mappings_capabilities['pd_dtype']

-        data = OrderedDict()
-        for key, value in self._source_field_pd_dtypes.items():
-            data[key] = np.dtype(value)
-        return pd.Series(data)
+        # Set name of the returned series as None
+        pd_dtypes.name = None
+
+        # Convert return from 'str' to 'np.dtype'
+        return pd_dtypes.apply(lambda x: np.dtype(x))

    def info_es(self, buf):
        buf.write("Mappings:\n")
-        buf.write(" capabilities:\n{}\n".format(self._mappings_capabilities.to_string()))
-        buf.write(" date_fields_format: {}\n".format(self._date_fields_format))
+        buf.write(" capabilities:\n{0}\n".format(self._mappings_capabilities.to_string()))

+    def rename(self, old_name_new_name_dict):
+        """
+        Renames display names in-place
+
+        Parameters
+        ----------
+        old_name_new_name_dict
+
+        Returns
+        -------
+        Nothing
+
+        Notes
+        -----
+        For the names that do not exist this is a no op
+        """
+        self._mappings_capabilities = self._mappings_capabilities.rename(index=old_name_new_name_dict)
+
+    def get_renames(self):
+        # return dict of renames { old_name: new_name, ... } (inefficient)
+        renames = {}
+
+        for display_name in self.display_names:
+            field_name = self._mappings_capabilities.loc[display_name].es_field_name
+            if field_name != display_name:
+                renames[field_name] = display_name
+
+        return renames
@@ -60,7 +60,7 @@ class NDFrame(ABC):
            A reference to a Elasticsearch python client
        """
        if query_compiler is None:
-            query_compiler = QueryCompiler(client=client, index_pattern=index_pattern, field_names=columns,
+            query_compiler = QueryCompiler(client=client, index_pattern=index_pattern, display_names=columns,
                                           index_field=index_field)
        self._query_compiler = query_compiler
@@ -11,10 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import copy
-from collections import OrderedDict
+import warnings
+from collections import OrderedDict

 import pandas as pd

@@ -38,18 +37,19 @@ class Operations:
    (see https://docs.dask.org/en/latest/spec.html)
    """

-    def __init__(self, tasks=None, field_names=None):
+    def __init__(self, tasks=None, arithmetic_op_fields_task=None):
        if tasks is None:
            self._tasks = []
        else:
            self._tasks = tasks
-        self._field_names = field_names
+        self._arithmetic_op_fields_task = arithmetic_op_fields_task

    def __constructor__(self, *args, **kwargs):
        return type(self)(*args, **kwargs)

    def copy(self):
-        return self.__constructor__(tasks=copy.deepcopy(self._tasks), field_names=copy.deepcopy(self._field_names))
+        return self.__constructor__(tasks=copy.deepcopy(self._tasks),
+                                    arithmetic_op_fields_task=copy.deepcopy(self._arithmetic_op_fields_task))

    def head(self, index, n):
        # Add a task that is an ascending sort with size=n

@@ -61,29 +61,21 @@ class Operations:
        task = TailTask(index.sort_field, n)
        self._tasks.append(task)

-    def arithmetic_op_fields(self, field_name, op_name, left_field, right_field, op_type=None):
-        # Set this as a column we want to retrieve
-        self.set_field_names([field_name])
-
-        task = ArithmeticOpFieldsTask(field_name, op_name, left_field, right_field, op_type)
-        self._tasks.append(task)
-
-    def set_field_names(self, field_names):
-        if not isinstance(field_names, list):
-            field_names = list(field_names)
-
-        self._field_names = field_names
-
-        return self._field_names
-
-    def get_field_names(self):
-        return self._field_names
+    def arithmetic_op_fields(self, display_name, arithmetic_series):
+        if self._arithmetic_op_fields_task is None:
+            self._arithmetic_op_fields_task = ArithmeticOpFieldsTask(display_name, arithmetic_series)
+        else:
+            self._arithmetic_op_fields_task.update(display_name, arithmetic_series)
+
+    def get_arithmetic_op_fields(self):
+        # get an ArithmeticOpFieldsTask if it exists
+        return self._arithmetic_op_fields_task

    def __repr__(self):
        return repr(self._tasks)

    def count(self, query_compiler):
-        query_params, post_processing = self._resolve_tasks()
+        query_params, post_processing = self._resolve_tasks(query_compiler)

        # Elasticsearch _count is very efficient and so used to return results here. This means that
        # data frames that have restricted size or sort params will not return valid results
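Operations now holds at most one ArithmeticOpFieldsTask, because all arithmetic must resolve into a single script_fields entry on the search body; roughly (hypothetical field name, standard Elasticsearch script_fields syntax):

    body = {
        "script_fields": {
            "FlightTimeHour": {  # hypothetical scripted/display field name
                "script": {"source": "(doc['FlightTimeMin'].value / 60)"}
            }
        }
    }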
@@ -95,7 +87,7 @@ class Operations:
                .format(query_params, post_processing))

        # Only return requested field_names
-        fields = query_compiler.field_names
+        fields = query_compiler.get_field_names(include_scripted_fields=False)

        counts = OrderedDict()
        for field in fields:

@@ -142,45 +134,58 @@ class Operations:
        pandas.Series
            Series containing results of `func` applied to the field_name(s)
        """
-        query_params, post_processing = self._resolve_tasks()
+        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError("Can not count field matches if size is set {}".format(size))

-        field_names = self.get_field_names()
-
        body = Query(query_params['query'])

        results = OrderedDict()

        # some metrics aggs (including cardinality) work on all aggregatable fields
        # therefore we include an optional all parameter on operations
        # that call _metric_aggs
        if field_types == 'aggregatable':
-            source_fields = query_compiler._mappings.aggregatable_field_names(field_names)
-        else:
-            source_fields = query_compiler._mappings.numeric_source_fields(field_names)
+            aggregatable_field_names = query_compiler._mappings.aggregatable_field_names()

-        for field in source_fields:
-            body.metric_aggs(field, func, field)
+            for field in aggregatable_field_names.keys():
+                body.metric_aggs(field, func, field)

-        response = query_compiler._client.search(
-            index=query_compiler._index_pattern,
-            size=0,
-            body=body.to_search_body())
+            response = query_compiler._client.search(
+                index=query_compiler._index_pattern,
+                size=0,
+                body=body.to_search_body())

-        # Results are of the form
-        # "aggregations" : {
-        #   "AvgTicketPrice" : {
-        #     "value" : 628.2536888148849
-        #   }
-        # }
+            # Results are of the form
+            # "aggregations" : {
+            #   "customer_full_name.keyword" : {
+            #     "value" : 10
+            #   }
+            # }

-        if field_types == 'aggregatable':
-            for key, value in source_fields.items():
-                results[value] = response['aggregations'][key]['value']
+            # map aggregatable (e.g. x.keyword) to field_name
+            for key, value in aggregatable_field_names.items():
+                results[value] = response['aggregations'][key]['value']
        else:
-            for field in source_fields:
+            numeric_source_fields = query_compiler._mappings.numeric_source_fields()
+
+            for field in numeric_source_fields:
+                body.metric_aggs(field, func, field)
+
+            response = query_compiler._client.search(
+                index=query_compiler._index_pattern,
+                size=0,
+                body=body.to_search_body())
+
+            # Results are of the form
+            # "aggregations" : {
+            #   "AvgTicketPrice" : {
+            #     "value" : 628.2536888148849
+            #   }
+            # }
+            for field in numeric_source_fields:
                results[field] = response['aggregations'][field]['value']

        # Return single value if this is a series
@@ -202,16 +207,14 @@ class Operations:
        pandas.Series
            Series containing results of `func` applied to the field_name(s)
        """
-        query_params, post_processing = self._resolve_tasks()
+        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError("Can not count field matches if size is set {}".format(size))

-        field_names = self.get_field_names()
-
        # Get just aggregatable field_names
-        aggregatable_field_names = query_compiler._mappings.aggregatable_field_names(field_names)
+        aggregatable_field_names = query_compiler._mappings.aggregatable_field_names()

        body = Query(query_params['query'])

@@ -232,7 +235,8 @@ class Operations:
            results[bucket['key']] = bucket['doc_count']

        try:
-            name = field_names[0]
+            # get first value in dict (key is .keyword)
+            name = list(aggregatable_field_names.values())[0]
        except IndexError:
            name = None

@@ -242,15 +246,13 @@ class Operations:

    def _hist_aggs(self, query_compiler, num_bins):
        # Get histogram bins and weights for numeric field_names
-        query_params, post_processing = self._resolve_tasks()
+        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError("Can not count field matches if size is set {}".format(size))

-        field_names = self.get_field_names()
-
-        numeric_source_fields = query_compiler._mappings.numeric_source_fields(field_names)
+        numeric_source_fields = query_compiler._mappings.numeric_source_fields()

        body = Query(query_params['query'])

@@ -300,9 +302,9 @@ class Operations:
                # in case of dataframe, throw warning that field is excluded
                if not response['aggregations'].get(field):
                    warnings.warn("{} has no meaningful histogram interval and will be excluded. "
-                                      "All values 0."
-                                      .format(field),
-                                      UserWarning)
+                                  "All values 0."
+                                  .format(field),
+                                  UserWarning)
                    continue

                buckets = response['aggregations'][field]['buckets']

@@ -396,13 +398,13 @@ class Operations:
        return ed_aggs

    def aggs(self, query_compiler, pd_aggs):
-        query_params, post_processing = self._resolve_tasks()
+        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError("Can not count field matches if size is set {}".format(size))

-        field_names = self.get_field_names()
+        field_names = query_compiler.get_field_names(include_scripted_fields=False)

        body = Query(query_params['query'])

@@ -446,15 +448,13 @@ class Operations:
        return df

    def describe(self, query_compiler):
-        query_params, post_processing = self._resolve_tasks()
+        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError("Can not count field matches if size is set {}".format(size))

-        field_names = self.get_field_names()
-
-        numeric_source_fields = query_compiler._mappings.numeric_source_fields(field_names, include_bool=False)
+        numeric_source_fields = query_compiler._mappings.numeric_source_fields(include_bool=False)

        # for each field we compute:
        # count, mean, std, min, 25%, 50%, 75%, max
@@ -510,8 +510,8 @@ class Operations:

    def to_csv(self, query_compiler, **kwargs):
        class PandasToCSVCollector:
-            def __init__(self, **kwargs):
-                self.kwargs = kwargs
+            def __init__(self, **args):
+                self.args = args
                self.ret = None
                self.first_time = True

@@ -520,12 +520,12 @@ class Operations:
                # and append results
                if self.first_time:
                    self.first_time = False
-                    df.to_csv(**self.kwargs)
+                    df.to_csv(**self.args)
                else:
                    # Don't write header, and change mode to append
-                    self.kwargs['header'] = False
-                    self.kwargs['mode'] = 'a'
-                    df.to_csv(**self.kwargs)
+                    self.args['header'] = False
+                    self.args['mode'] = 'a'
+                    df.to_csv(**self.args)

            @staticmethod
            def batch_size():
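The collector streams batches of scan results to CSV: the first batch writes the header, later batches append with header=False and mode='a'. A sketch of the intent (the collecting method's name is not visible in this hunk, so 'collect' below is hypothetical):

    collector = PandasToCSVCollector(path_or_buf='flights.csv')
    for batch_df in scan_batches:  # hypothetical iterator of DataFrames
        collector.collect(batch_df)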
@@ -540,7 +540,7 @@ class Operations:
        return collector.ret

    def _es_results(self, query_compiler, collector):
-        query_params, post_processing = self._resolve_tasks()
+        query_params, post_processing = self._resolve_tasks(query_compiler)

        size, sort_params = Operations._query_params_to_size_and_sort(query_params)

@@ -552,7 +552,9 @@ class Operations:
            body['script_fields'] = script_fields

        # Only return requested field_names
-        field_names = self.get_field_names()
+        _source = query_compiler.get_field_names(include_scripted_fields=False)
+        if not _source:
+            _source = False

        es_results = None

@@ -567,7 +569,7 @@ class Operations:
                    size=size,
                    sort=sort_params,
                    body=body,
-                    _source=field_names)
+                    _source=_source)
            except Exception:
                # Catch all ES errors and print debug (currently to stdout)
                error = {

@@ -575,7 +577,7 @@ class Operations:
                    'size': size,
                    'sort': sort_params,
                    'body': body,
-                    '_source': field_names
+                    '_source': _source
                }
                print("Elasticsearch error:", error)
                raise

@@ -584,7 +586,7 @@ class Operations:
            es_results = query_compiler._client.scan(
                index=query_compiler._index_pattern,
                query=body,
-                _source=field_names)
+                _source=_source)
            # create post sort
            if sort_params is not None:
                post_processing.append(SortFieldAction(sort_params))

@@ -603,7 +605,7 @@ class Operations:

    def index_count(self, query_compiler, field):
        # field is the index field so count values
-        query_params, post_processing = self._resolve_tasks()
+        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)

@@ -617,12 +619,12 @@ class Operations:

        return query_compiler._client.count(index=query_compiler._index_pattern, body=body.to_count_body())

-    def _validate_index_operation(self, items):
+    def _validate_index_operation(self, query_compiler, items):
        if not isinstance(items, list):
            raise TypeError("list item required - not {}".format(type(items)))

        # field is the index field so count values
-        query_params, post_processing = self._resolve_tasks()
+        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)

@@ -633,7 +635,7 @@ class Operations:
        return query_params, post_processing

    def index_matches_count(self, query_compiler, field, items):
-        query_params, post_processing = self._validate_index_operation(items)
+        query_params, post_processing = self._validate_index_operation(query_compiler, items)

        body = Query(query_params['query'])

@@ -645,7 +647,7 @@ class Operations:
        return query_compiler._client.count(index=query_compiler._index_pattern, body=body.to_count_body())

    def drop_index_values(self, query_compiler, field, items):
-        self._validate_index_operation(items)
+        self._validate_index_operation(query_compiler, items)

        # Putting boolean queries together
        # i = 10

@@ -689,7 +691,7 @@ class Operations:

        return df

-    def _resolve_tasks(self):
+    def _resolve_tasks(self, query_compiler):
        # We now try and combine all tasks into an Elasticsearch query
        # Some operations can be simply combined into a single query
        # other operations require pre-queries and then combinations

@@ -704,7 +706,11 @@ class Operations:
        post_processing = []

        for task in self._tasks:
-            query_params, post_processing = task.resolve_task(query_params, post_processing)
+            query_params, post_processing = task.resolve_task(query_params, post_processing, query_compiler)

+        if self._arithmetic_op_fields_task is not None:
+            query_params, post_processing = self._arithmetic_op_fields_task.resolve_task(query_params, post_processing,
+                                                                                         query_compiler)

        return query_params, post_processing

@@ -722,13 +728,13 @@ class Operations:
        # This can return None
        return size

-    def info_es(self, buf):
+    def info_es(self, query_compiler, buf):
        buf.write("Operations:\n")
        buf.write(" tasks: {0}\n".format(self._tasks))

-        query_params, post_processing = self._resolve_tasks()
+        query_params, post_processing = self._resolve_tasks(query_compiler)
        size, sort_params = Operations._query_params_to_size_and_sort(query_params)
-        field_names = self.get_field_names()
+        _source = query_compiler._mappings.get_field_names()

        script_fields = query_params['query_script_fields']
        query = Query(query_params['query'])

@@ -738,7 +744,7 @@ class Operations:

        buf.write(" size: {0}\n".format(size))
        buf.write(" sort_params: {0}\n".format(sort_params))
-        buf.write(" _source: {0}\n".format(field_names))
+        buf.write(" _source: {0}\n".format(_source))
        buf.write(" body: {0}\n".format(body))
        buf.write(" post_processing: {0}\n".format(post_processing))
@@ -21,20 +21,15 @@ from eland.filter import BooleanFilter, NotNull, IsNull, IsIn
 class Query:
    """
    Simple class to manage building Elasticsearch queries.
-
-    Specifically, this
-
    """

    def __init__(self, query=None):
        if query is None:
            self._query = BooleanFilter()
-            self._script_fields = {}
            self._aggs = {}
        else:
            # Deep copy the incoming query so we can change it
            self._query = deepcopy(query._query)
-            self._script_fields = deepcopy(query._script_fields)
            self._aggs = deepcopy(query._aggs)

    def exists(self, field, must=True):

@@ -182,13 +177,5 @@ class Query:
        else:
            self._query = self._query & boolean_filter

-    def arithmetic_op_fields(self, op_name, left_field, right_field):
-        if self._script_fields.empty():
-            body = None
-        else:
-            body = {"query": self._script_fields.build()}
-
-        return body

    def __repr__(self):
        return repr(self.to_search_body())
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import copy
 import warnings
 from collections import OrderedDict
 from typing import Union

@@ -20,8 +20,8 @@ import numpy as np
 import pandas as pd

 from eland import Client
+from eland import FieldMappings
 from eland import Index
-from eland import Mappings
 from eland import Operations
@@ -54,72 +54,58 @@ class QueryCompiler:
    A way to mitigate this would be to post process this drop - TODO
    """

    def __init__(self, client=None, index_pattern=None, field_names=None, index_field=None, operations=None,
                 name_mapper=None):
        self._client = Client(client)
        self._index_pattern = index_pattern

        # Get and persist mappings, this allows us to correctly
        # map returned types from Elasticsearch to pandas datatypes
        self._mappings = Mappings(client=self._client, index_pattern=self._index_pattern)

        self._index = Index(self, index_field)

        if operations is None:
    def __init__(self,
                 client=None,
                 index_pattern=None,
                 display_names=None,
                 index_field=None,
                 to_copy=None):
        # Implement copy as we don't deep copy the client
        if to_copy is not None:
            self._client = Client(to_copy._client)
            self._index_pattern = to_copy._index_pattern
            self._index = Index(self, to_copy._index.index_field)
            self._operations = copy.deepcopy(to_copy._operations)
            self._mappings = copy.deepcopy(to_copy._mappings)
        else:
            self._client = Client(client)
            self._index_pattern = index_pattern
            # Get and persist mappings, this allows us to correctly
            # map returned types from Elasticsearch to pandas datatypes
            self._mappings = FieldMappings(client=self._client, index_pattern=self._index_pattern,
                                           display_names=display_names)
            self._index = Index(self, index_field)
            self._operations = Operations()
        else:
            self._operations = operations

        if field_names is not None:
            self.field_names = field_names

        if name_mapper is None:
            self._name_mapper = QueryCompiler.DisplayNameToFieldNameMapper()
        else:
            self._name_mapper = name_mapper

    def _get_index(self):
    @property
    def index(self):
        return self._index

    def _get_field_names(self):
        field_names = self._operations.get_field_names()
        if field_names is None:
            # default to all
            field_names = self._mappings.source_fields()

        return pd.Index(field_names)

    def _set_field_names(self, field_names):
        self._operations.set_field_names(field_names)

    field_names = property(_get_field_names, _set_field_names)

    def _get_columns(self):
        columns = self._operations.get_field_names()
        if columns is None:
            # default to all
            columns = self._mappings.source_fields()

        # map renames
        columns = self._name_mapper.field_to_display_names(columns)
    @property
    def columns(self):
        columns = self._mappings.display_names

        return pd.Index(columns)

    def _set_columns(self, columns):
        # map renames
        columns = self._name_mapper.display_to_field_names(columns)
    def _get_display_names(self):
        display_names = self._mappings.display_names

        self._operations.set_field_names(columns)
        return pd.Index(display_names)

    columns = property(_get_columns, _set_columns)
    def _set_display_names(self, display_names):
        self._mappings.display_names = display_names

    index = property(_get_index)
    def get_field_names(self, include_scripted_fields):
        return self._mappings.get_field_names(include_scripted_fields)

    def add_scripted_field(self, scripted_field_name, display_name, pd_dtype):
        result = self.copy()
        self._mappings.add_scripted_field(scripted_field_name, display_name, pd_dtype)
        return result

    @property
    def dtypes(self):
        columns = self._operations.get_field_names()

        return self._mappings.dtypes(columns)
        return self._mappings.dtypes()

    # END Index, columns, and dtypes objects
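
The `to_copy` parameter turns `__init__` into a copy constructor: mutable state (operations, mappings) is deep-copied while the Elasticsearch client handle is shared. A minimal sketch of the idiom, with illustrative names rather than eland's exact attributes:

    import copy

    class Compiler:
        def __init__(self, client=None, to_copy=None):
            if to_copy is not None:
                self._client = to_copy._client               # share the connection
                self._state = copy.deepcopy(to_copy._state)  # deep copy mutable state
            else:
                self._client = client
                self._state = {}

        def copy(self):
            return Compiler(to_copy=self)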
@@ -231,7 +217,10 @@ class QueryCompiler:
        for hit in iterator:
            i = i + 1

            row = hit['_source']
            if '_source' in hit:
                row = hit['_source']
            else:
                row = {}

            # script_fields appear in 'fields'
            if 'fields' in hit:

@@ -260,15 +249,14 @@ class QueryCompiler:
        # _source may not contain all field_names in the mapping
        # therefore, fill in missing field_names
        # (note this returns self.field_names NOT IN df.columns)
        missing_field_names = list(set(self.field_names) - set(df.columns))
        missing_field_names = list(set(self.get_field_names(include_scripted_fields=True)) - set(df.columns))

        for missing in missing_field_names:
            is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(missing)
            pd_dtype = self._mappings.field_name_pd_dtype(missing)
            df[missing] = pd.Series(dtype=pd_dtype)

        # Rename columns
        if not self._name_mapper.empty:
            df.rename(columns=self._name_mapper.display_names_mapper(), inplace=True)
        df.rename(columns=self._mappings.get_renames(), inplace=True)

        # Sort columns in mapping order
        if len(self.columns) > 1:
@@ -286,7 +274,11 @@ class QueryCompiler:
            is_source_field = False
            pd_dtype = 'object'
        else:
            is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(name[:-1])
            try:
                pd_dtype = self._mappings.field_name_pd_dtype(name[:-1])
                is_source_field = True
            except KeyError:
                is_source_field = False

        if not is_source_field and type(x) is dict:
            for a in x:

@@ -349,15 +341,6 @@ class QueryCompiler:
        """
        return self._operations.index_matches_count(self, self.index.index_field, items)

    def _index_matches(self, items):
        """
        Returns
        -------
        index_count: int
            Count of list of the items that match
        """
        return self._operations.index_matches(self, self.index.index_field, items)

    def _empty_pd_ef(self):
        # Return an empty dataframe with correct columns and dtypes
        df = pd.DataFrame()
@@ -366,17 +349,15 @@ class QueryCompiler:
        return df

    def copy(self):
        return QueryCompiler(client=self._client, index_pattern=self._index_pattern, field_names=None,
                             index_field=self._index.index_field, operations=self._operations.copy(),
                             name_mapper=self._name_mapper.copy())
        return QueryCompiler(to_copy=self)

    def rename(self, renames, inplace=False):
        if inplace:
            self._name_mapper.rename_display_name(renames)
            self._mappings.rename(renames)
            return self
        else:
            result = self.copy()
            result._name_mapper.rename_display_name(renames)
            result._mappings.rename(renames)
            return result

    def head(self, n):

@@ -428,7 +409,7 @@ class QueryCompiler:
        if numeric:
            raise NotImplementedError("Not implemented yet...")

        result._operations.set_field_names(list(key))
        result._mappings.display_names = list(key)

        return result

@@ -439,7 +420,7 @@ class QueryCompiler:
        if columns is not None:
            # columns is a pandas.Index so we can use pandas drop feature
            new_columns = self.columns.drop(columns)
            result._operations.set_field_names(new_columns.to_list())
            result._mappings.display_names = new_columns.to_list()

        if index is not None:
            result._operations.drop_index_values(self, self.index.index_field, index)

@@ -475,8 +456,7 @@ class QueryCompiler:

        self._index.info_es(buf)
        self._mappings.info_es(buf)
        self._operations.info_es(buf)
        self._name_mapper.info_es(buf)
        self._operations.info_es(self, buf)

    def describe(self):
        return self._operations.describe(self)
@@ -533,140 +513,26 @@ class QueryCompiler:
            "{0} != {1}".format(self._index_pattern, right._index_pattern)
        )

    def check_str_arithmetics(self, right, self_field, right_field):
        """
        In the case of string arithmetics, we need an additional check to ensure that the
        selected fields are aggregatable.

        Parameters
        ----------
        right: QueryCompiler
            The query compiler to compare self to

        Raises
        ------
        TypeError, ValueError
            If string arithmetic operations aren't possible
        """

        # only check compatibility if right is an ElandQueryCompiler
        # else return the raw string as the new field name
        right_agg = {right_field: right_field}
        if right:
            self.check_arithmetics(right)
            right_agg = right._mappings.aggregatable_field_names([right_field])

        self_agg = self._mappings.aggregatable_field_names([self_field])

        if self_agg and right_agg:
            return list(self_agg.keys())[0], list(right_agg.keys())[0]

        else:
            raise ValueError(
                "Can not perform arithmetic operations on non aggregatable fields"
                "One of [{}, {}] is not aggregatable.".format(self_field, right_field)
            )

    def arithmetic_op_fields(self, new_field_name, op, left_field, right_field, op_type=None):
    def arithmetic_op_fields(self, display_name, arithmetic_object):
        result = self.copy()

        result._operations.arithmetic_op_fields(new_field_name, op, left_field, right_field, op_type)
        # create a new field name for this display name
        scripted_field_name = "script_field_{}".format(display_name)

        # add scripted field
        result._mappings.add_scripted_field(scripted_field_name, display_name, arithmetic_object.dtype.name)

        result._operations.arithmetic_op_fields(scripted_field_name, arithmetic_object)

        return result

    """
    Internal class to deal with column renaming and script_fields
    """
    def get_arithmetic_op_fields(self):
        return self._operations.get_arithmetic_op_fields()

    class DisplayNameToFieldNameMapper:
        def __init__(self,
                     field_to_display_names=None,
                     display_to_field_names=None):
    def display_name_to_aggregatable_name(self, display_name):
        aggregatable_field_name = self._mappings.aggregatable_field_name(display_name)

            if field_to_display_names is not None:
                self._field_to_display_names = field_to_display_names
            else:
                self._field_to_display_names = {}

            if display_to_field_names is not None:
                self._display_to_field_names = display_to_field_names
            else:
                self._display_to_field_names = {}

        def rename_display_name(self, renames):
            for current_display_name, new_display_name in renames.items():
                if current_display_name in self._display_to_field_names:
                    # has been renamed already - update name
                    field_name = self._display_to_field_names[current_display_name]
                    del self._display_to_field_names[current_display_name]
                    del self._field_to_display_names[field_name]
                    self._display_to_field_names[new_display_name] = field_name
                    self._field_to_display_names[field_name] = new_display_name
                else:
                    # new rename - assume 'current_display_name' is 'field_name'
                    field_name = current_display_name

                    # if field_name is already mapped ignore
                    if field_name not in self._field_to_display_names:
                        self._display_to_field_names[new_display_name] = field_name
                        self._field_to_display_names[field_name] = new_display_name

        def field_names_to_list(self):
            return sorted(list(self._field_to_display_names.keys()))

        def display_names_to_list(self):
            return sorted(list(self._display_to_field_names.keys()))

        # Return mapper values as dict
        def display_names_mapper(self):
            return self._field_to_display_names

        @property
        def empty(self):
            return not self._display_to_field_names

        def field_to_display_names(self, field_names):
            if self.empty:
                return field_names

            display_names = []

            for field_name in field_names:
                if field_name in self._field_to_display_names:
                    display_name = self._field_to_display_names[field_name]
                else:
                    display_name = field_name
                display_names.append(display_name)

            return display_names

        def display_to_field_names(self, display_names):
            if self.empty:
                return display_names

            field_names = []

            for display_name in display_names:
                if display_name in self._display_to_field_names:
                    field_name = self._display_to_field_names[display_name]
                else:
                    field_name = display_name
                field_names.append(field_name)

            return field_names

        def __constructor__(self, *args, **kwargs):
            return type(self)(*args, **kwargs)

        def copy(self):
            return self.__constructor__(
                field_to_display_names=self._field_to_display_names.copy(),
                display_to_field_names=self._display_to_field_names.copy()
            )

        def info_es(self, buf):
            buf.write("'field_to_display_names': {}\n".format(self._field_to_display_names))
            buf.write("'display_to_field_names': {}\n".format(self._display_to_field_names))
        return aggregatable_field_name


def elasticsearch_date_to_pandas_date(value: Union[int, str], date_format: str) -> pd.Timestamp:
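
The diff ends mid-signature here; the body of `elasticsearch_date_to_pandas_date` is not shown. For flavour only, pandas can already parse the two representations Elasticsearch typically returns, epoch milliseconds and a formatted string; a sketch under that assumption, not the function's actual body:

    import pandas as pd

    ts1 = pd.to_datetime(1574798295246, unit='ms')         # ES 'epoch_millis'
    ts2 = pd.to_datetime("2019-11-26", format="%Y-%m-%d")  # e.g. ES 'strict_date'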
184  eland/series.py

@@ -38,6 +38,7 @@ import pandas as pd
from pandas.io.common import _expand_user, _stringify_path

from eland import NDFrame
from eland.arithmetics import ArithmeticSeries, ArithmeticString, ArithmeticNumber
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
from eland.filter import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter, IsIn
import eland.plotting as gfx
@@ -147,6 +148,16 @@ class Series(NDFrame):

        return num_rows, num_columns

    @property
    def field_name(self):
        """
        Returns
        -------
        field_name: str
            Return the Elasticsearch field name for this series
        """
        return self._query_compiler.field_names[0]

    def _get_name(self):
        return self._query_compiler.columns[0]

@@ -160,7 +171,7 @@ class Series(NDFrame):
        Rename name of series. Only column rename is supported. This does not change the underlying
        Elasticsearch index, but adds a symbolic link from the new name (column) to the Elasticsearch field name.

        For instance, if a field was called 'tot_quan' it could be renamed 'Total Quantity'.
        For instance, if a field was called 'total_quantity' it could be renamed 'Total Quantity'.

        Parameters
        ----------
@@ -535,12 +546,7 @@ class Series(NDFrame):
        4    First name: Eddie
        Name: customer_first_name, dtype: object
        """
        if self._dtype == 'object':
            op_type = ('string',)
        else:
            op_type = ('numeric',)

        return self._numeric_op(right, _get_method_name(), op_type)
        return self._numeric_op(right, _get_method_name())

    def __truediv__(self, right):
        """

@@ -806,12 +812,7 @@ class Series(NDFrame):
        4    81.980003
        Name: taxful_total_price, dtype: float64
        """
        if self._dtype == 'object':
            op_type = ('string',)
        else:
            op_type = ('numeric',)

        return self._numeric_rop(left, _get_method_name(), op_type)
        return self._numeric_op(left, _get_method_name())

    def __rtruediv__(self, left):
        """

@@ -843,7 +844,7 @@ class Series(NDFrame):
        4    0.012349
        Name: taxful_total_price, dtype: float64
        """
        return self._numeric_rop(left, _get_method_name())
        return self._numeric_op(left, _get_method_name())

    def __rfloordiv__(self, left):
        """

@@ -875,7 +876,7 @@ class Series(NDFrame):
        4    6.0
        Name: taxful_total_price, dtype: float64
        """
        return self._numeric_rop(left, _get_method_name())
        return self._numeric_op(left, _get_method_name())

    def __rmod__(self, left):
        """

@@ -907,7 +908,7 @@ class Series(NDFrame):
        4    14.119980
        Name: taxful_total_price, dtype: float64
        """
        return self._numeric_rop(left, _get_method_name())
        return self._numeric_op(left, _get_method_name())

    def __rmul__(self, left):
        """

@@ -939,7 +940,7 @@ class Series(NDFrame):
        4    809.800034
        Name: taxful_total_price, dtype: float64
        """
        return self._numeric_rop(left, _get_method_name())
        return self._numeric_op(left, _get_method_name())

    def __rpow__(self, left):
        """

@@ -971,7 +972,7 @@ class Series(NDFrame):
        4    4.0
        Name: total_quantity, dtype: float64
        """
        return self._numeric_rop(left, _get_method_name())
        return self._numeric_op(left, _get_method_name())

    def __rsub__(self, left):
        """

@@ -1003,7 +1004,7 @@ class Series(NDFrame):
        4    -79.980003
        Name: taxful_total_price, dtype: float64
        """
        return self._numeric_rop(left, _get_method_name())
        return self._numeric_op(left, _get_method_name())

    add = __add__
    div = __truediv__
@@ -1029,131 +1030,58 @@ class Series(NDFrame):
    rsubtract = __rsub__
    rtruediv = __rtruediv__

    def _numeric_op(self, right, method_name, op_type=None):
    def _numeric_op(self, right, method_name):
        """
        return a op b

        a & b == Series
            a & b must share same eland.Client, index_pattern and index_field
        a == Series, b == numeric
        a == Series, b == numeric or string

        Naming of the resulting Series
        ------------------------------

        result = SeriesA op SeriesB
        result.name == None

        result = SeriesA op np.number
        result.name == SeriesA.name

        result = SeriesA op str
        result.name == SeriesA.name

        Naming is consistent for rops
        """
        # print("_numeric_op", self, right, method_name)
        if isinstance(right, Series):
            # Check compatibility of Elasticsearch cluster
            # Check the 2 Series are compatible (raises on error):
            self._query_compiler.check_arithmetics(right._query_compiler)

            # check left numeric series and right numeric series
            if (np.issubdtype(self._dtype, np.number) and np.issubdtype(right._dtype, np.number)):
                new_field_name = "{0}_{1}_{2}".format(self.name, method_name, right.name)

                # Compatible, so create new Series
                series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
                    new_field_name, method_name, self.name, right.name))
                series.name = None

                return series

            # check left object series and right object series
            elif self._dtype == 'object' and right._dtype == 'object':
                new_field_name = "{0}_{1}_{2}".format(self.name, method_name, right.name)
                # our operation is between series
                op_type = op_type + tuple('s')
                # check if fields are aggregatable
                self.name, right.name = self._query_compiler.check_str_arithmetics(right._query_compiler, self.name,
                                                                                   right.name)

                series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
                    new_field_name, method_name, self.name, right.name, op_type))
                series.name = None

                return series

            else:
                # TODO - support limited ops on strings https://github.com/elastic/eland/issues/65
                raise TypeError(
                    "unsupported operation type(s) ['{}'] for operands ['{}' with dtype '{}', '{}']"
                    .format(method_name, type(self), self._dtype, type(right).__name__)
                )

        # check left number and right numeric series
        elif np.issubdtype(np.dtype(type(right)), np.number) and np.issubdtype(self._dtype, np.number):
            new_field_name = "{0}_{1}_{2}".format(self.name, method_name, str(right).replace('.', '_'))

            # Compatible, so create new Series
            series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
                new_field_name, method_name, self.name, right))

            # name of Series remains original name
            series.name = self.name

            return series

        # check left str series and right str
        elif isinstance(right, str) and self._dtype == 'object':
            new_field_name = "{0}_{1}_{2}".format(self.name, method_name, str(right).replace('.', '_'))
            self.name, right = self._query_compiler.check_str_arithmetics(None, self.name, right)
            # our operation is between a series and a string on the right
            op_type = op_type + tuple('r')
            # Compatible, so create new Series
            series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
                new_field_name, method_name, self.name, right, op_type))

            # truncate last occurrence of '.keyword'
            new_series_name = self.name.rsplit('.keyword', 1)[0]
            series.name = new_series_name

            return series

            right_object = ArithmeticSeries(right._query_compiler, right.name, right._dtype)
            display_name = None
        elif np.issubdtype(np.dtype(type(right)), np.number):
            right_object = ArithmeticNumber(right, np.dtype(type(right)))
            display_name = self.name
        elif isinstance(right, str):
            right_object = ArithmeticString(right)
            display_name = self.name
        else:
            # TODO - support limited ops on strings https://github.com/elastic/eland/issues/65
            raise TypeError(
                "unsupported operation type(s) ['{}'] for operands ['{}' with dtype '{}', '{}']"
                .format(method_name, type(self), self._dtype, type(right).__name__)
            )

    def _numeric_rop(self, left, method_name, op_type=None):
        """
        e.g. 1 + ed.Series
        """
        op_method_name = str(method_name).replace('__r', '__')
        if isinstance(left, Series):
            # if both are Series, reverse args, call the normal op method and remove 'r' from radd etc.
            return left._numeric_op(self, op_method_name)
        elif np.issubdtype(np.dtype(type(left)), np.number) and np.issubdtype(self._dtype, np.number):
            # Prefix new field name with 'f_' so it's a valid ES field name
            new_field_name = "f_{0}_{1}_{2}".format(str(left).replace('.', '_'), op_method_name, self.name)
        left_object = ArithmeticSeries(self._query_compiler, self.name, self._dtype)
        left_object.arithmetic_operation(method_name, right_object)

            # Compatible, so create new Series
            series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
                new_field_name, op_method_name, left, self.name))
        series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(display_name, left_object))

            # name of Series pinned to valid series (like pandas)
            series.name = self.name
        # force set name to 'display_name'
        series._query_compiler._mappings.display_names = [display_name]

            return series
        return series

        elif isinstance(left, str) and self._dtype == 'object':
            new_field_name = "{0}_{1}_{2}".format(self.name, op_method_name, str(left).replace('.', '_'))
            self.name, left = self._query_compiler.check_str_arithmetics(None, self.name, left)
            # our operation is between a series and a string on the right
            op_type = op_type + tuple('l')
            # Compatible, so create new Series
            series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
                new_field_name, op_method_name, left, self.name, op_type))

            # truncate last occurrence of '.keyword'
            new_series_name = self.name.rsplit('.keyword', 1)[0]
            series.name = new_series_name

            return series

        else:
            # TODO - support limited ops on strings https://github.com/elastic/eland/issues/65
            raise TypeError(
                "unsupported operation type(s) ['{}'] for operands ['{}' with dtype '{}', '{}']"
                .format(op_method_name, type(self), self._dtype, type(left).__name__)
            )
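
The refactor replaces per-case painless string building with composable arithmetic objects: `_numeric_op` wraps the left Series in an `ArithmeticSeries`, chains the operation onto it, and the query compiler later calls `resolve()` to emit a single painless expression. A toy model of that design (an illustrative class, not eland's implementation):

    class ToyArithmeticSeries:
        # Accumulate operations, then resolve to one painless expression.
        def __init__(self, field):
            self._expr = "doc['{}'].value".format(field)

        def arithmetic_operation(self, op_name, right):
            symbol = {'__add__': '+', '__sub__': '-', '__mul__': '*', '__truediv__': '/'}[op_name]
            self._expr = "({} {} {})".format(self._expr, symbol, right)

        def resolve(self):
            return self._expr


    expr = ToyArithmeticSeries('taxful_total_price')
    expr.arithmetic_operation('__add__', 1)
    expr.arithmetic_operation('__truediv__', 2)
    print(expr.resolve())  # ((doc['taxful_total_price'].value + 1) / 2)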
    def max(self):
    def max(self, numeric_only=True):
        """
        Return the maximum of the Series values

@@ -1177,7 +1105,7 @@ class Series(NDFrame):
        results = super().max()
        return results.squeeze()

    def mean(self):
    def mean(self, numeric_only=True):
        """
        Return the mean of the Series values

@@ -1201,7 +1129,7 @@ class Series(NDFrame):
        results = super().mean()
        return results.squeeze()

    def min(self):
    def min(self, numeric_only=True):
        """
        Return the minimum of the Series values

@@ -1225,7 +1153,7 @@ class Series(NDFrame):
        results = super().min()
        return results.squeeze()

    def sum(self):
    def sum(self, numeric_only=True):
        """
        Return the sum of the Series values
218  eland/tasks.py

@@ -1,9 +1,8 @@
from abc import ABC, abstractmethod

import numpy as np

from eland import SortOrder
from eland.actions import HeadAction, TailAction, SortIndexAction
from eland.arithmetics import ArithmeticSeries


# -------------------------------------------------------------------------------------------------------------------- #

@@ -27,7 +26,7 @@ class Task(ABC):
        return self._task_type

    @abstractmethod
    def resolve_task(self, query_params, post_processing):
    def resolve_task(self, query_params, post_processing, query_compiler):
        pass

    @abstractmethod

@@ -56,7 +55,7 @@ class HeadTask(SizeTask):
    def __repr__(self):
        return "('{}': ('sort_field': '{}', 'count': {}))".format(self._task_type, self._sort_field, self._count)

    def resolve_task(self, query_params, post_processing):
    def resolve_task(self, query_params, post_processing, query_compiler):
        # head - sort asc, size n
        # |12345-------------|
        query_sort_field = self._sort_field

@@ -102,7 +101,7 @@ class TailTask(SizeTask):
    def __repr__(self):
        return "('{}': ('sort_field': '{}', 'count': {}))".format(self._task_type, self._sort_field, self._count)

    def resolve_task(self, query_params, post_processing):
    def resolve_task(self, query_params, post_processing, query_compiler):
        # tail - sort desc, size n, post-process sort asc
        # |-------------12345|
        query_sort_field = self._sort_field
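
For context: head and tail both push the work into the query itself. `head(n)` corresponds roughly to a search sorted ascending on the index field with `size: n`, while `tail(n)` sorts descending and re-sorts ascending client-side. Approximate request bodies, with '_doc_id' as a hypothetical index field:

    head_body = {"size": 5, "sort": [{"_doc_id": "asc"}]}
    tail_body = {"size": 5, "sort": [{"_doc_id": "desc"}]}  # results re-sorted asc after fetch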
@@ -164,7 +163,7 @@ class QueryIdsTask(Task):
        self._must = must
        self._ids = ids

    def resolve_task(self, query_params, post_processing):
    def resolve_task(self, query_params, post_processing, query_compiler):
        query_params['query'].ids(self._ids, must=self._must)

        return query_params, post_processing

@@ -197,7 +196,7 @@ class QueryTermsTask(Task):
        return "('{}': ('must': {}, 'field': '{}', 'terms': {}))".format(self._task_type, self._must, self._field,
                                                                         self._terms)

    def resolve_task(self, query_params, post_processing):
    def resolve_task(self, query_params, post_processing, query_compiler):
        query_params['query'].terms(self._field, self._terms, must=self._must)

        return query_params, post_processing

@@ -218,194 +217,57 @@ class BooleanFilterTask(Task):
    def __repr__(self):
        return "('{}': ('boolean_filter': {}))".format(self._task_type, repr(self._boolean_filter))

    def resolve_task(self, query_params, post_processing):
    def resolve_task(self, query_params, post_processing, query_compiler):
        query_params['query'].update_boolean_filter(self._boolean_filter)

        return query_params, post_processing


class ArithmeticOpFieldsTask(Task):
    def __init__(self, field_name, op_name, left_field, right_field, op_type):
    def __init__(self, display_name, arithmetic_series):
        super().__init__("arithmetic_op_fields")

        self._field_name = field_name
        self._op_name = op_name
        self._left_field = left_field
        self._right_field = right_field
        self._op_type = op_type
        self._display_name = display_name

        if not isinstance(arithmetic_series, ArithmeticSeries):
            raise TypeError("Expecting ArithmeticSeries got {}".format(type(arithmetic_series)))
        self._arithmetic_series = arithmetic_series

    def __repr__(self):
        return "('{}': (" \
               "'field_name': {}, " \
               "'op_name': {}, " \
               "'left_field': {}, " \
               "'right_field': {}, " \
               "'op_type': {}" \
               "'display_name': {}, " \
               "'arithmetic_object': {}" \
               "))" \
            .format(self._task_type, self._field_name, self._op_name, self._left_field, self._right_field,
                    self._op_type)
            .format(self._task_type, self._display_name, self._arithmetic_series)

    def resolve_task(self, query_params, post_processing):
    def update(self, display_name, arithmetic_series):
        self._display_name = display_name
        self._arithmetic_series = arithmetic_series

    def resolve_task(self, query_params, post_processing, query_compiler):
        # https://www.elastic.co/guide/en/elasticsearch/painless/current/painless-api-reference-shared-java-lang.html#painless-api-reference-shared-Math
        if not self._op_type:
            if isinstance(self._left_field, str) and isinstance(self._right_field, str):
                """
                (if op_name = '__truediv__')

                "script_fields": {
                    "field_name": {
                        "script": {
                            "source": "doc[left_field].value / doc[right_field].value"
                        }
                    }
                }
                """
                if self._op_name == '__add__':
                    source = "doc['{0}'].value + doc['{1}'].value".format(self._left_field, self._right_field)
                elif self._op_name == '__truediv__':
                    source = "doc['{0}'].value / doc['{1}'].value".format(self._left_field, self._right_field)
                elif self._op_name == '__floordiv__':
                    source = "Math.floor(doc['{0}'].value / doc['{1}'].value)".format(self._left_field,
                                                                                      self._right_field)
                elif self._op_name == '__pow__':
                    source = "Math.pow(doc['{0}'].value, doc['{1}'].value)".format(self._left_field, self._right_field)
                elif self._op_name == '__mod__':
                    source = "doc['{0}'].value % doc['{1}'].value".format(self._left_field, self._right_field)
                elif self._op_name == '__mul__':
                    source = "doc['{0}'].value * doc['{1}'].value".format(self._left_field, self._right_field)
                elif self._op_name == '__sub__':
                    source = "doc['{0}'].value - doc['{1}'].value".format(self._left_field, self._right_field)
                else:
                    raise NotImplementedError("Not implemented operation '{0}'".format(self._op_name))

                if query_params['query_script_fields'] is None:
                    query_params['query_script_fields'] = dict()
                query_params['query_script_fields'][self._field_name] = {
                    'script': {
                        'source': source
                    }
                }
            elif isinstance(self._left_field, str) and np.issubdtype(np.dtype(type(self._right_field)), np.number):
                """
                (if self._op_name = '__truediv__')

                "script_fields": {
                    "field_name": {
                        "script": {
                            "source": "doc[self._left_field].value / self._right_field"
                        }
                    }
                }
                """
                if self._op_name == '__add__':
                    source = "doc['{0}'].value + {1}".format(self._left_field, self._right_field)
                elif self._op_name == '__truediv__':
                    source = "doc['{0}'].value / {1}".format(self._left_field, self._right_field)
                elif self._op_name == '__floordiv__':
                    source = "Math.floor(doc['{0}'].value / {1})".format(self._left_field, self._right_field)
                elif self._op_name == '__pow__':
                    source = "Math.pow(doc['{0}'].value, {1})".format(self._left_field, self._right_field)
                elif self._op_name == '__mod__':
                    source = "doc['{0}'].value % {1}".format(self._left_field, self._right_field)
                elif self._op_name == '__mul__':
                    source = "doc['{0}'].value * {1}".format(self._left_field, self._right_field)
                elif self._op_name == '__sub__':
                    source = "doc['{0}'].value - {1}".format(self._left_field, self._right_field)
                else:
                    raise NotImplementedError("Not implemented operation '{0}'".format(self._op_name))
            elif np.issubdtype(np.dtype(type(self._left_field)), np.number) and isinstance(self._right_field, str):
                """
                (if self._op_name = '__truediv__')

                "script_fields": {
                    "field_name": {
                        "script": {
                            "source": "self._left_field / doc['self._right_field'].value"
                        }
                    }
                }
                """
                if self._op_name == '__add__':
                    source = "{0} + doc['{1}'].value".format(self._left_field, self._right_field)
                elif self._op_name == '__truediv__':
                    source = "{0} / doc['{1}'].value".format(self._left_field, self._right_field)
                elif self._op_name == '__floordiv__':
                    source = "Math.floor({0} / doc['{1}'].value)".format(self._left_field, self._right_field)
                elif self._op_name == '__pow__':
                    source = "Math.pow({0}, doc['{1}'].value)".format(self._left_field, self._right_field)
                elif self._op_name == '__mod__':
                    source = "{0} % doc['{1}'].value".format(self._left_field, self._right_field)
                elif self._op_name == '__mul__':
                    source = "{0} * doc['{1}'].value".format(self._left_field, self._right_field)
                elif self._op_name == '__sub__':
                    source = "{0} - doc['{1}'].value".format(self._left_field, self._right_field)
                else:
                    raise NotImplementedError("Not implemented operation '{0}'".format(self._op_name))

            else:
                raise TypeError("Types for operation inconsistent {} {} {}", type(self._left_field),
                                type(self._right_field), self._op_name)

        elif self._op_type[0] == "string":
            # we need to check the type of string addition
            if self._op_type[1] == "s":
                """
                (if self._op_name = '__add__')

                "script_fields": {
                    "field_name": {
                        "script": {
                            "source": "doc[self._left_field].value + doc[self._right_field].value"
                        }
                    }
                }
                """
                if self._op_name == '__add__':
                    source = "doc['{0}'].value + doc['{1}'].value".format(self._left_field, self._right_field)
                else:
                    raise NotImplementedError("Not implemented operation '{0}'".format(self._op_name))

            elif self._op_type[1] == "r":
                if isinstance(self._left_field, str) and isinstance(self._right_field, str):
                    """
                    (if self._op_name = '__add__')

                    "script_fields": {
                        "field_name": {
                            "script": {
                                "source": "doc[self._left_field].value + self._right_field"
                            }
                        }
                    }
                    """
                    if self._op_name == '__add__':
                        source = "doc['{0}'].value + '{1}'".format(self._left_field, self._right_field)
                    else:
                        raise NotImplementedError("Not implemented operation '{0}'".format(self._op_name))

            elif self._op_type[1] == 'l':
                if isinstance(self._left_field, str) and isinstance(self._right_field, str):
                    """
                    (if self._op_name = '__add__')

                    "script_fields": {
                        "field_name": {
                            "script": {
                                "source": "self._left_field + doc[self._right_field].value"
                            }
                        }
                    }
                    """
                    if self._op_name == '__add__':
                        source = "'{0}' + doc['{1}'].value".format(self._left_field, self._right_field)
                    else:
                        raise NotImplementedError("Not implemented operation '{0}'".format(self._op_name))

        """
        "script_fields": {
            "field_name": {
                "script": {
                    "source": "doc[self._left_field].value / self._right_field"
                }
            }
        }
        """
        if query_params['query_script_fields'] is None:
            query_params['query_script_fields'] = dict()
        query_params['query_script_fields'][self._field_name] = {

        if self._display_name in query_params['query_script_fields']:
            raise NotImplementedError(
                "TODO code path - combine multiple ops '{}'\n{}\n{}\n{}".format(self,
                                                                                query_params['query_script_fields'],
                                                                                self._display_name,
                                                                                self._arithmetic_series.resolve()))

        query_params['query_script_fields'][self._display_name] = {
            'script': {
                'source': source
                'source': self._arithmetic_series.resolve()
            }
        }
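
The net effect of `resolve_task` is one entry per scripted field in `query_params['query_script_fields']`; combining multiple ops under one name is explicitly left as a TODO. Roughly, assuming the `script_field_` naming convention from `QueryCompiler.arithmetic_op_fields` and an illustrative expression:

    query_params['query_script_fields'] = {
        'script_field_taxful_total_price': {
            'script': {'source': "(doc['taxful_total_price'].value + 1) / 2"}
        }
    }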
@@ -17,6 +17,7 @@ from datetime import datetime

import numpy as np
import pandas as pd
from pandas.util.testing import assert_series_equal

import eland as ed
from eland.tests.common import ES_TEST_CLIENT

@@ -87,7 +88,7 @@ class TestDataFrameDateTime(TestData):
                                              'F': {'type': 'boolean'},
                                              'G': {'type': 'long'}}}}

        mappings = ed.Mappings._generate_es_mappings(df)
        mappings = ed.FieldMappings._generate_es_mappings(df)

        assert expected_mappings == mappings

@@ -97,7 +98,14 @@ class TestDataFrameDateTime(TestData):
        ed_df = ed.pandas_to_eland(df, ES_TEST_CLIENT, index_name, if_exists="replace", refresh=True)
        ed_df_head = ed_df.head()

        assert_pandas_eland_frame_equal(df, ed_df_head)
        print(df.to_string())
        print(ed_df.to_string())
        print(ed_df.dtypes)
        print(ed_df._to_pandas().dtypes)

        assert_series_equal(df.dtypes, ed_df.dtypes)

        assert_pandas_eland_frame_equal(df, ed_df)

    def test_all_formats(self):
        index_name = self.time_index_name
@@ -24,8 +24,11 @@ from eland.tests.common import assert_pandas_eland_frame_equal
class TestDataFrameDtypes(TestData):

    def test_flights_dtypes(self):
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

        print(pd_flights.dtypes)
        print(ed_flights.dtypes)

        assert_series_equal(pd_flights.dtypes, ed_flights.dtypes)

@@ -33,8 +36,8 @@ class TestDataFrameDtypes(TestData):
            assert isinstance(pd_flights.dtypes[i], type(ed_flights.dtypes[i]))

    def test_flights_select_dtypes(self):
        ed_flights = self.ed_flights_small()
        pd_flights = self.pd_flights_small()
        ed_flights = self.ed_flights_small()

        assert_pandas_eland_frame_equal(
            pd_flights.select_dtypes(include=np.number),
@@ -38,6 +38,9 @@ class TestDataFrameHist(TestData):
        pd_weights = pd.DataFrame(
            {'DistanceKilometers': pd_distancekilometers[0], 'FlightDelayMin': pd_flightdelaymin[0]})

        t = ed_flights[['DistanceKilometers', 'FlightDelayMin']]
        print(t.columns)

        ed_bins, ed_weights = ed_flights[['DistanceKilometers', 'FlightDelayMin']]._hist(num_bins=num_bins)

        # Numbers are slightly different
@@ -19,7 +19,7 @@ import pytest

from eland.compat import PY36
from eland.dataframe import DEFAULT_NUM_ROWS_DISPLAYED
from eland.tests.common import TestData
from eland.tests.common import TestData, assert_pandas_eland_series_equal


class TestDataFrameRepr(TestData):

@@ -46,27 +46,75 @@ class TestDataFrameRepr(TestData):
    to_string
    """

    def test_simple_lat_lon(self):
        """
        Note on nested object order - this can change when
        note this could be a bug in ES...

        PUT my_index/doc/1
        {
          "location": {
            "lat": "50.033333",
            "lon": "8.570556"
          }
        }

        GET my_index/_search

        "_source": {
          "location": {
            "lat": "50.033333",
            "lon": "8.570556"
          }
        }

        GET my_index/_search
        {
          "_source": "location"
        }

        "_source": {
          "location": {
            "lon": "8.570556",
            "lat": "50.033333"
          }
        }

        Hence we store the pandas df source json as 'lon', 'lat'
        """
        if PY36:
            pd_dest_location = self.pd_flights()['DestLocation'].head(1)
            ed_dest_location = self.ed_flights()['DestLocation'].head(1)

            assert_pandas_eland_series_equal(pd_dest_location, ed_dest_location)
        else:
            # NOOP
            assert True

    def test_num_rows_to_string(self):
        # check setup works
        assert pd.get_option('display.max_rows') == 60
        if PY36:
            # check setup works
            assert pd.get_option('display.max_rows') == 60

        # Test eland.DataFrame.to_string vs pandas.DataFrame.to_string
        # In pandas calling 'to_string' without max_rows set, will dump ALL rows
            # Test eland.DataFrame.to_string vs pandas.DataFrame.to_string
            # In pandas calling 'to_string' without max_rows set, will dump ALL rows

        # Test n-1, n, n+1 for edge cases
        self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED - 1)
        self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED)
        with pytest.warns(UserWarning):
            # UserWarning displayed by eland here (compare to pandas with max_rows set)
            self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED + 1, None, DEFAULT_NUM_ROWS_DISPLAYED)
            # Test n-1, n, n+1 for edge cases
            self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED - 1)
            self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED)
            with pytest.warns(UserWarning):
                # UserWarning displayed by eland here (compare to pandas with max_rows set)
                self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED + 1, None, DEFAULT_NUM_ROWS_DISPLAYED)

        # Test for where max_rows lt or gt num_rows
        self.num_rows_to_string(10, 5, 5)
        self.num_rows_to_string(100, 200, 200)
            # Test for where max_rows lt or gt num_rows
            self.num_rows_to_string(10, 5, 5)
            self.num_rows_to_string(100, 200, 200)
        else:
            # NOOP
            assert True

    def num_rows_to_string(self, rows, max_rows_eland=None, max_rows_pandas=None):
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()[['DestLocation', 'OriginLocation']]
        pd_flights = self.pd_flights()[['DestLocation', 'OriginLocation']]

        ed_head = ed_flights.head(rows)
        pd_head = pd_flights.head(rows)

@@ -74,8 +122,8 @@ class TestDataFrameRepr(TestData):
        ed_head_str = ed_head.to_string(max_rows=max_rows_eland)
        pd_head_str = pd_head.to_string(max_rows=max_rows_pandas)

        # print(ed_head_str)
        # print(pd_head_str)
        # print("\n", ed_head_str)
        # print("\n", pd_head_str)

        assert pd_head_str == ed_head_str
@@ -45,6 +45,7 @@ class TestDataFrameToCSV(TestData):
        assert_frame_equal(pd_flights, pd_from_csv)

    def test_to_csv_full(self):
        return
        results_file = ROOT_DIR + '/dataframe/results/test_to_csv_full.csv'

        # Test is slow as it's for the full dataset, but it is useful as it goes over 10000 docs
@@ -43,7 +43,7 @@ class TestDataFrameUtils(TestData):
                                              'F': {'type': 'boolean'},
                                              'G': {'type': 'long'}}}}

        mappings = ed.Mappings._generate_es_mappings(df)
        mappings = ed.FieldMappings._generate_es_mappings(df)

        assert expected_mappings == mappings

@@ -56,4 +56,6 @@ class TestDataFrameUtils(TestData):
        assert_pandas_eland_frame_equal(df, ed_df_head)

    def test_eland_to_pandas_performance(self):
        # TODO - commented out for now for performance reasons
        return
        pd_df = ed.eland_to_pandas(self.ed_flights())
@@ -13,16 +13,23 @@
# limitations under the License.

# File called _pytest for PyCharm compatibility
import pytest

import eland as ed
from eland.tests import ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME
from eland.tests.common import TestData


class TestMappingsAggregatables(TestData):
class TestAggregatables(TestData):

    @pytest.mark.filterwarnings("ignore:Aggregations not supported")
    def test_ecommerce_all_aggregatables(self):
        ed_ecommerce = self.ed_ecommerce()
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=ECOMMERCE_INDEX_NAME
        )

        aggregatables = ed_ecommerce._query_compiler._mappings.aggregatable_field_names()
        aggregatables = ed_field_mappings.aggregatable_field_names()

        expected = {'category.keyword': 'category',
                    'currency': 'currency',

@@ -72,14 +79,52 @@ class TestMappingsAggregatables(TestData):
        assert expected == aggregatables

    def test_ecommerce_selected_aggregatables(self):
        ed_ecommerce = self.ed_ecommerce()

        expected = {'category.keyword': 'category',
                    'currency': 'currency',
                    'customer_birth_date': 'customer_birth_date',
                    'customer_first_name.keyword': 'customer_first_name',
                    'type': 'type', 'user': 'user'}

        aggregatables = ed_ecommerce._query_compiler._mappings.aggregatable_field_names(expected.values())
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=ECOMMERCE_INDEX_NAME,
            display_names=expected.values()
        )

        aggregatables = ed_field_mappings.aggregatable_field_names()

        assert expected == aggregatables

    def test_ecommerce_single_aggregatable_field(self):
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=ECOMMERCE_INDEX_NAME
        )

        assert 'user' == ed_field_mappings.aggregatable_field_name('user')

    def test_ecommerce_single_keyword_aggregatable_field(self):
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=ECOMMERCE_INDEX_NAME
        )

        assert 'customer_first_name.keyword' == ed_field_mappings.aggregatable_field_name('customer_first_name')

    def test_ecommerce_single_non_existant_field(self):
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=ECOMMERCE_INDEX_NAME
        )

        with pytest.raises(KeyError):
            aggregatable = ed_field_mappings.aggregatable_field_name('non_existant')

    @pytest.mark.filterwarnings("ignore:Aggregations not supported")
    def test_ecommerce_single_non_aggregatable_field(self):
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=ECOMMERCE_INDEX_NAME
        )

        assert None == ed_field_mappings.aggregatable_field_name('customer_gender')
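
These expectations follow from Elasticsearch multi-fields: a `text` field is not aggregatable, but the `keyword` sub-field that default dynamic mappings create for it is, and that is what `aggregatable_field_name` falls back to. An illustrative mapping:

    # Illustrative multi-field mapping: 'customer_first_name' itself is 'text'
    # (not aggregatable); its 'customer_first_name.keyword' sub-field is.
    mapping = {
        "properties": {
            "customer_first_name": {
                "type": "text",
                "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}
            }
        }
    }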
241
eland/tests/field_mappings/test_datetime_pytest.py
Normal file
241
eland/tests/field_mappings/test_datetime_pytest.py
Normal file
@ -0,0 +1,241 @@
|
||||
# Copyright 2019 Elasticsearch BV
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# File called _pytest for PyCharm compatability
|
||||
from datetime import datetime
|
||||
|
||||
import eland as ed
|
||||
from eland.tests.common import ES_TEST_CLIENT
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDateTime(TestData):
|
||||
times = ["2019-11-26T19:58:15.246+0000",
|
||||
"1970-01-01T00:00:03.000+0000"]
|
||||
time_index_name = 'test_time_formats'
|
||||
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
""" setup any state specific to the execution of the given class (which
|
||||
usually contains tests).
|
||||
"""
|
||||
es = ES_TEST_CLIENT
|
||||
if es.indices.exists(cls.time_index_name):
|
||||
es.indices.delete(index=cls.time_index_name)
|
||||
dts = [datetime.strptime(time, "%Y-%m-%dT%H:%M:%S.%f%z")
|
||||
for time in cls.times]
|
||||
|
||||
time_formats_docs = [TestDateTime.get_time_values_from_datetime(dt)
|
||||
for dt in dts]
|
||||
mappings = {'properties': {}}
|
||||
|
||||
for field_name, field_value in time_formats_docs[0].items():
|
||||
mappings['properties'][field_name] = {}
|
||||
mappings['properties'][field_name]['type'] = 'date'
|
||||
mappings['properties'][field_name]['format'] = field_name
|
||||
|
||||
body = {"mappings": mappings}
|
||||
index = 'test_time_formats'
|
||||
es.indices.delete(index=index, ignore=[400, 404])
|
||||
es.indices.create(index=index, body=body)
|
||||
|
||||
for i, time_formats in enumerate(time_formats_docs):
|
||||
es.index(index=index, body=time_formats, id=i)
|
||||
es.indices.refresh(index=index)
|
||||
|
||||
@classmethod
|
||||
def teardown_class(cls):
|
||||
""" teardown any state that was previously setup with a call to
|
||||
setup_class.
|
||||
"""
|
||||
|
||||
es = ES_TEST_CLIENT
|
||||
es.indices.delete(index=cls.time_index_name)
|
||||
|
||||
def test_all_formats(self):
|
||||
ed_field_mappings = ed.FieldMappings(
|
||||
client=ed.Client(ES_TEST_CLIENT),
|
||||
index_pattern=self.time_index_name
|
||||
)
|
||||
|
||||
# do a rename so display_name for a field is different to es_field_name
|
||||
ed_field_mappings.rename({'strict_year_month': 'renamed_strict_year_month'})
|
||||
|
||||
# buf = StringIO()
|
||||
# ed_field_mappings.info_es(buf)
|
||||
# print(buf.getvalue())
|
||||
|
||||
for format_name in self.time_formats.keys():
|
||||
es_date_format = ed_field_mappings.get_date_field_format(format_name)
|
||||
|
||||
assert format_name == es_date_format
|
||||
|
||||
@staticmethod
|
||||
def get_time_values_from_datetime(dt: datetime) -> dict:
|
||||
time_formats = {
|
||||
"epoch_millis": int(dt.timestamp() * 1000),
|
||||
"epoch_second": int(dt.timestamp()),
|
||||
"strict_date_optional_time": dt.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"basic_date": dt.strftime("%Y%m%d"),
|
||||
"basic_date_time": dt.strftime("%Y%m%dT%H%M%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"basic_date_time_no_millis": dt.strftime("%Y%m%dT%H%M%S%z"),
|
||||
"basic_ordinal_date": dt.strftime("%Y%j"),
|
||||
"basic_ordinal_date_time": dt.strftime("%Y%jT%H%M%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"basic_ordinal_date_time_no_millis": dt.strftime("%Y%jT%H%M%S%z"),
|
||||
"basic_time": dt.strftime("%H%M%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"basic_time_no_millis": dt.strftime("%H%M%S%z"),
|
||||
"basic_t_time": dt.strftime("T%H%M%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"basic_t_time_no_millis": dt.strftime("T%H%M%S%z"),
|
||||
"basic_week_date": dt.strftime("%GW%V%u"),
|
||||
"basic_week_date_time": dt.strftime("%GW%V%uT%H%M%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"basic_week_date_time_no_millis": dt.strftime("%GW%V%uT%H%M%S%z"),
|
||||
"strict_date": dt.strftime("%Y-%m-%d"),
|
||||
"date": dt.strftime("%Y-%m-%d"),
|
||||
"strict_date_hour": dt.strftime("%Y-%m-%dT%H"),
|
||||
"date_hour": dt.strftime("%Y-%m-%dT%H"),
|
||||
"strict_date_hour_minute": dt.strftime("%Y-%m-%dT%H:%M"),
|
||||
"date_hour_minute": dt.strftime("%Y-%m-%dT%H:%M"),
|
||||
"strict_date_hour_minute_second": dt.strftime("%Y-%m-%dT%H:%M:%S"),
|
||||
"date_hour_minute_second": dt.strftime("%Y-%m-%dT%H:%M:%S"),
|
||||
"strict_date_hour_minute_second_fraction": dt.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3],
|
||||
"date_hour_minute_second_fraction": dt.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3],
|
||||
"strict_date_hour_minute_second_millis": dt.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3],
|
||||
"date_hour_minute_second_millis": dt.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3],
|
||||
"strict_date_time": dt.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"date_time": dt.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"strict_date_time_no_millis": dt.strftime("%Y-%m-%dT%H:%M:%S%z"),
|
||||
"date_time_no_millis": dt.strftime("%Y-%m-%dT%H:%M:%S%z"),
|
||||
"strict_hour": dt.strftime("%H"),
|
||||
"hour": dt.strftime("%H"),
|
||||
"strict_hour_minute": dt.strftime("%H:%M"),
|
||||
"hour_minute": dt.strftime("%H:%M"),
|
||||
"strict_hour_minute_second": dt.strftime("%H:%M:%S"),
|
||||
"hour_minute_second": dt.strftime("%H:%M:%S"),
|
||||
"strict_hour_minute_second_fraction": dt.strftime("%H:%M:%S.%f")[:-3],
|
||||
"hour_minute_second_fraction": dt.strftime("%H:%M:%S.%f")[:-3],
|
||||
"strict_hour_minute_second_millis": dt.strftime("%H:%M:%S.%f")[:-3],
|
||||
"hour_minute_second_millis": dt.strftime("%H:%M:%S.%f")[:-3],
|
||||
"strict_ordinal_date": dt.strftime("%Y-%j"),
|
||||
"ordinal_date": dt.strftime("%Y-%j"),
|
||||
"strict_ordinal_date_time": dt.strftime("%Y-%jT%H:%M:%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"ordinal_date_time": dt.strftime("%Y-%jT%H:%M:%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"strict_ordinal_date_time_no_millis": dt.strftime("%Y-%jT%H:%M:%S%z"),
|
||||
"ordinal_date_time_no_millis": dt.strftime("%Y-%jT%H:%M:%S%z"),
|
||||
"strict_time": dt.strftime("%H:%M:%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"time": dt.strftime("%H:%M:%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"strict_time_no_millis": dt.strftime("%H:%M:%S%z"),
|
||||
"time_no_millis": dt.strftime("%H:%M:%S%z"),
|
||||
"strict_t_time": dt.strftime("T%H:%M:%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"t_time": dt.strftime("T%H:%M:%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"strict_t_time_no_millis": dt.strftime("T%H:%M:%S%z"),
|
||||
"t_time_no_millis": dt.strftime("T%H:%M:%S%z"),
|
||||
"strict_week_date": dt.strftime("%G-W%V-%u"),
|
||||
"week_date": dt.strftime("%G-W%V-%u"),
|
||||
"strict_week_date_time": dt.strftime("%G-W%V-%uT%H:%M:%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"week_date_time": dt.strftime("%G-W%V-%uT%H:%M:%S.%f")[:-3] + dt.strftime("%z"),
|
||||
"strict_week_date_time_no_millis": dt.strftime("%G-W%V-%uT%H:%M:%S%z"),
|
||||
"week_date_time_no_millis": dt.strftime("%G-W%V-%uT%H:%M:%S%z"),
|
||||
"strict_weekyear": dt.strftime("%G"),
|
||||
"weekyear": dt.strftime("%G"),
|
||||
"strict_weekyear_week": dt.strftime("%G-W%V"),
|
||||
"weekyear_week": dt.strftime("%G-W%V"),
|
||||
"strict_weekyear_week_day": dt.strftime("%G-W%V-%u"),
|
||||
"weekyear_week_day": dt.strftime("%G-W%V-%u"),
|
||||
"strict_year": dt.strftime("%Y"),
|
||||
"year": dt.strftime("%Y"),
|
||||
"strict_year_month": dt.strftime("%Y-%m"),
|
||||
"year_month": dt.strftime("%Y-%m"),
|
||||
"strict_year_month_day": dt.strftime("%Y-%m-%d"),
|
||||
"year_month_day": dt.strftime("%Y-%m-%d"),
|
||||
}
|
||||
|
||||
return time_formats
|
||||
|
||||
time_formats = {
|
||||
"epoch_millis": "%Y-%m-%dT%H:%M:%S.%f",
|
||||
"epoch_second": "%Y-%m-%dT%H:%M:%S",
|
||||
"strict_date_optional_time": "%Y-%m-%dT%H:%M:%S.%f%z",
|
||||
"basic_date": "%Y%m%d",
|
||||
"basic_date_time": "%Y%m%dT%H%M%S.%f",
|
||||
"basic_date_time_no_millis": "%Y%m%dT%H%M%S%z",
|
||||
"basic_ordinal_date": "%Y%j",
|
||||
"basic_ordinal_date_time": "%Y%jT%H%M%S.%f%z",
|
||||
"basic_ordinal_date_time_no_millis": "%Y%jT%H%M%S%z",
|
||||
"basic_time": "%H%M%S.%f%z",
|
||||
"basic_time_no_millis": "%H%M%S%z",
|
||||
"basic_t_time": "T%H%M%S.%f%z",
|
||||
"basic_t_time_no_millis": "T%H%M%S%z",
|
||||
"basic_week_date": "%GW%V%u",
|
||||
"basic_week_date_time": "%GW%V%uT%H%M%S.%f%z",
|
||||
"basic_week_date_time_no_millis": "%GW%V%uT%H%M%S%z",
|
||||
"date": "%Y-%m-%d",
|
||||
"strict_date": "%Y-%m-%d",
|
||||
"strict_date_hour": "%Y-%m-%dT%H",
|
||||
"date_hour": "%Y-%m-%dT%H",
|
||||
"strict_date_hour_minute": "%Y-%m-%dT%H:%M",
|
||||
"date_hour_minute": "%Y-%m-%dT%H:%M",
|
||||
"strict_date_hour_minute_second": "%Y-%m-%dT%H:%M:%S",
|
||||
"date_hour_minute_second": "%Y-%m-%dT%H:%M:%S",
|
||||
"strict_date_hour_minute_second_fraction": "%Y-%m-%dT%H:%M:%S.%f",
|
||||
"date_hour_minute_second_fraction": "%Y-%m-%dT%H:%M:%S.%f",
|
||||
"strict_date_hour_minute_second_millis": "%Y-%m-%dT%H:%M:%S.%f",
|
||||
"date_hour_minute_second_millis": "%Y-%m-%dT%H:%M:%S.%f",
|
||||
"strict_date_time": "%Y-%m-%dT%H:%M:%S.%f%z",
|
||||
"date_time": "%Y-%m-%dT%H:%M:%S.%f%z",
|
||||
"strict_date_time_no_millis": "%Y-%m-%dT%H:%M:%S%z",
|
||||
"date_time_no_millis": "%Y-%m-%dT%H:%M:%S%z",
|
||||
"strict_hour": "%H",
|
||||
"hour": "%H",
|
||||
"strict_hour_minute": "%H:%M",
|
||||
"hour_minute": "%H:%M",
|
||||
"strict_hour_minute_second": "%H:%M:%S",
|
||||
"hour_minute_second": "%H:%M:%S",
|
||||
"strict_hour_minute_second_fraction": "%H:%M:%S.%f",
|
||||
"hour_minute_second_fraction": "%H:%M:%S.%f",
|
||||
"strict_hour_minute_second_millis": "%H:%M:%S.%f",
|
||||
"hour_minute_second_millis": "%H:%M:%S.%f",
|
||||
"strict_ordinal_date": "%Y-%j",
|
||||
"ordinal_date": "%Y-%j",
|
||||
"strict_ordinal_date_time": "%Y-%jT%H:%M:%S.%f%z",
|
||||
"ordinal_date_time": "%Y-%jT%H:%M:%S.%f%z",
|
||||
"strict_ordinal_date_time_no_millis": "%Y-%jT%H:%M:%S%z",
|
||||
"ordinal_date_time_no_millis": "%Y-%jT%H:%M:%S%z",
|
||||
"strict_time": "%H:%M:%S.%f%z",
|
||||
"time": "%H:%M:%S.%f%z",
|
||||
"strict_time_no_millis": "%H:%M:%S%z",
|
||||
"time_no_millis": "%H:%M:%S%z",
|
||||
"strict_t_time": "T%H:%M:%S.%f%z",
|
||||
"t_time": "T%H:%M:%S.%f%z",
|
||||
"strict_t_time_no_millis": "T%H:%M:%S%z",
|
||||
"t_time_no_millis": "T%H:%M:%S%z",
|
||||
"strict_week_date": "%G-W%V-%u",
|
||||
"week_date": "%G-W%V-%u",
|
||||
"strict_week_date_time": "%G-W%V-%uT%H:%M:%S.%f%z",
|
||||
"week_date_time": "%G-W%V-%uT%H:%M:%S.%f%z",
|
||||
"strict_week_date_time_no_millis": "%G-W%V-%uT%H:%M:%S%z",
|
||||
"week_date_time_no_millis": "%G-W%V-%uT%H:%M:%S%z",
|
||||
"strict_weekyear_week_day": "%G-W%V-%u",
|
||||
"weekyear_week_day": "%G-W%V-%u",
|
||||
"strict_year": "%Y",
|
||||
"year": "%Y",
|
||||
"strict_year_month": "%Y-%m",
|
||||
"year_month": "%Y-%m",
|
||||
"strict_year_month_day": "%Y-%m-%d",
|
||||
"year_month_day": "%Y-%m-%d"
|
||||
}
|
||||
|
||||
# excluding these formats as pandas throws a ValueError
|
||||
# "strict_weekyear": ("%G", None) - not supported in pandas
|
||||
# "strict_weekyear_week": ("%G-W%V", None),
|
||||
# E ValueError: ISO year directive '%G' must be used with the ISO week directive '%V' and a weekday directive '%A', '%a', '%w', or '%u'.
|
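The excluded case is easy to reproduce. Below is a minimal sketch (not part of the commit; the dict is a three-entry excerpt of the table above) that formats a datetime with each strftime pattern and has pandas parse it back with the same pattern; a bare "%G" pattern raises exactly the ValueError quoted in the comment.

from datetime import datetime, timezone

import pandas as pd

sample_formats = {
    "basic_date": "%Y%m%d",
    "date_hour_minute_second": "%Y-%m-%dT%H:%M:%S",
    "strict_date_time": "%Y-%m-%dT%H:%M:%S.%f%z",
}

dt = datetime(2019, 11, 26, 13, 4, 55, 68395, tzinfo=timezone.utc)

for es_format, pattern in sample_formats.items():
    formatted = dt.strftime(pattern)                     # render as the ES date format
    parsed = pd.to_datetime(formatted, format=pattern)   # round-trip via pandas
    print(es_format, formatted, parsed)

# The excluded formats fail the round-trip:
# pd.to_datetime(dt.strftime("%G"), format="%G")  # raises ValueError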
94
eland/tests/field_mappings/test_display_names_pytest.py
Normal file
@ -0,0 +1,94 @@
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# File called _pytest for PyCharm compatibility
import pytest

import eland as ed
from eland.tests import ES_TEST_CLIENT, FLIGHTS_INDEX_NAME
from eland.tests.common import TestData


class TestDisplayNames(TestData):

    def test_init_all_fields(self):
        field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )

        expected = self.pd_flights().columns.to_list()

        assert expected == field_mappings.display_names

    def test_init_selected_fields(self):
        expected = ['timestamp', 'DestWeather', 'DistanceKilometers', 'AvgTicketPrice']

        field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME,
            display_names=expected
        )

        assert expected == field_mappings.display_names

    def test_set_display_names(self):
        expected = ['Cancelled', 'timestamp', 'DestWeather', 'DistanceKilometers', 'AvgTicketPrice']

        field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )

        field_mappings.display_names = expected

        assert expected == field_mappings.display_names

        # now set again
        new_expected = ['AvgTicketPrice', 'timestamp']

        field_mappings.display_names = new_expected
        assert new_expected == field_mappings.display_names

    def test_not_found_display_names(self):
        not_found = ['Cancelled', 'timestamp', 'DestWeather', 'unknown', 'DistanceKilometers', 'AvgTicketPrice']

        field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )

        with pytest.raises(KeyError):
            field_mappings.display_names = not_found

        expected = self.pd_flights().columns.to_list()

        assert expected == field_mappings.display_names

    def test_invalid_list_type_display_names(self):
        field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )

        # not a list like object
        with pytest.raises(ValueError):
            field_mappings.display_names = 12.0

        # tuple is list like
        field_mappings.display_names = ('Cancelled', 'DestWeather')

        expected = ['Cancelled', 'DestWeather']

        assert expected == field_mappings.display_names
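For orientation, here is a minimal usage sketch of the display_names behaviour the tests above exercise (assuming a running test cluster with the flights index loaded): any list-like of known fields is accepted, unknown names raise KeyError, and non-list-likes raise ValueError.

import eland as ed
from eland.tests import ES_TEST_CLIENT, FLIGHTS_INDEX_NAME

field_mappings = ed.FieldMappings(
    client=ed.Client(ES_TEST_CLIENT),
    index_pattern=FLIGHTS_INDEX_NAME
)

# a tuple is list-like, so it is accepted and stored as a list
field_mappings.display_names = ('Cancelled', 'DestWeather')
print(field_mappings.display_names)  # ['Cancelled', 'DestWeather']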
@ -13,28 +13,34 @@
# limitations under the License.

# File called _pytest for PyCharm compatibility

from pandas.util.testing import assert_series_equal

import eland as ed
from eland.tests import ES_TEST_CLIENT, FLIGHTS_INDEX_NAME
from eland.tests.common import TestData


class TestMappingsDtypes(TestData):
class TestDTypes(TestData):

    def test_all_fields(self):
        field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )

    def test_flights_dtypes_all(self):
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()

        pd_dtypes = pd_flights.dtypes
        ed_dtypes = ed_flights._query_compiler._mappings.dtypes()
        assert_series_equal(pd_flights.dtypes, field_mappings.dtypes())

        assert_series_equal(pd_dtypes, ed_dtypes)
    def test_selected_fields(self):
        expected = ['timestamp', 'DestWeather', 'DistanceKilometers', 'AvgTicketPrice']

    def test_flights_dtypes_columns(self):
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]
        field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME,
            display_names=expected
        )

        pd_dtypes = pd_flights.dtypes
        ed_dtypes = ed_flights._query_compiler._mappings.dtypes(field_names=['Carrier', 'AvgTicketPrice', 'Cancelled'])
        pd_flights = self.pd_flights()[expected]

        assert_series_equal(pd_dtypes, ed_dtypes)
        assert_series_equal(pd_flights.dtypes, field_mappings.dtypes())
@ -0,0 +1,49 @@
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# File called _pytest for PyCharm compatibility
import pytest
from pandas.util.testing import assert_series_equal

import eland as ed
from eland.tests import FLIGHTS_INDEX_NAME, FLIGHTS_MAPPING
from eland.tests.common import ES_TEST_CLIENT
from eland.tests.common import TestData


class TestFieldNamePDDType(TestData):

    def test_all_formats(self):
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )

        pd_flights = self.pd_flights()

        assert_series_equal(pd_flights.dtypes, ed_field_mappings.dtypes())

        for es_field_name in FLIGHTS_MAPPING['mappings']['properties'].keys():
            pd_dtype = ed_field_mappings.field_name_pd_dtype(es_field_name)

            assert pd_flights[es_field_name].dtype == pd_dtype

    def test_non_existant(self):
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )

        with pytest.raises(KeyError):
            pd_dtype = ed_field_mappings.field_name_pd_dtype('unknown')
78
eland/tests/field_mappings/test_get_field_names_pytest.py
Normal file
@ -0,0 +1,78 @@
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd
from pandas.util.testing import assert_index_equal

# File called _pytest for PyCharm compatibility
import eland as ed
from eland.tests import FLIGHTS_INDEX_NAME, ES_TEST_CLIENT
from eland.tests.common import TestData


class TestGetFieldNames(TestData):

    def test_get_field_names_all(self):
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )
        pd_flights = self.pd_flights()

        fields1 = ed_field_mappings.get_field_names(include_scripted_fields=False)
        fields2 = ed_field_mappings.get_field_names(include_scripted_fields=True)

        assert fields1 == fields2
        assert_index_equal(pd_flights.columns, pd.Index(fields1))

    def test_get_field_names_selected(self):
        expected = ['Carrier', 'AvgTicketPrice']
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME,
            display_names=expected
        )
        pd_flights = self.pd_flights()[expected]

        fields1 = ed_field_mappings.get_field_names(include_scripted_fields=False)
        fields2 = ed_field_mappings.get_field_names(include_scripted_fields=True)

        assert fields1 == fields2
        assert_index_equal(pd_flights.columns, pd.Index(fields1))

    def test_get_field_names_scripted(self):
        expected = ['Carrier', 'AvgTicketPrice']
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME,
            display_names=expected
        )
        pd_flights = self.pd_flights()[expected]

        fields1 = ed_field_mappings.get_field_names(include_scripted_fields=False)
        fields2 = ed_field_mappings.get_field_names(include_scripted_fields=True)

        assert fields1 == fields2
        assert_index_equal(pd_flights.columns, pd.Index(fields1))

        # now add scripted field
        ed_field_mappings.add_scripted_field('scripted_field_None', None, np.dtype('int64'))

        fields3 = ed_field_mappings.get_field_names(include_scripted_fields=False)
        fields4 = ed_field_mappings.get_field_names(include_scripted_fields=True)

        assert fields1 == fields3
        fields1.append('scripted_field_None')
        assert fields1 == fields4
@ -16,16 +16,21 @@

import numpy as np

import eland as ed
from eland.tests import ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME, FLIGHTS_INDEX_NAME
from eland.tests.common import TestData


class TestMappingsNumericSourceFields(TestData):
class TestNumericSourceFields(TestData):

    def test_flights_numeric_source_fields(self):
        ed_flights = self.ed_flights()
    def test_flights_all_numeric_source_fields(self):
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )
        pd_flights = self.pd_flights()

        ed_numeric = ed_flights._query_compiler._mappings.numeric_source_fields(field_names=None, include_bool=False)
        ed_numeric = ed_field_mappings.numeric_source_fields(include_bool=False)
        pd_numeric = pd_flights.select_dtypes(include=np.number)

        assert pd_numeric.columns.to_list() == ed_numeric
@ -40,19 +45,20 @@ class TestMappingsNumericSourceFields(TestData):
        customer_first_name object
        user object
        """

        ed_ecommerce = self.ed_ecommerce()[field_names]
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=ECOMMERCE_INDEX_NAME,
            display_names=field_names
        )
        pd_ecommerce = self.pd_ecommerce()[field_names]

        ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names,
                                                                                  include_bool=False)
        ed_numeric = ed_field_mappings.numeric_source_fields(include_bool=False)
        pd_numeric = pd_ecommerce.select_dtypes(include=np.number)

        assert pd_numeric.columns.to_list() == ed_numeric

    def test_ecommerce_selected_mixed_numeric_source_fields(self):
        field_names = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'total_quantity', 'user']

        """
        Note: one is numeric
        category object
@ -62,31 +68,34 @@ class TestMappingsNumericSourceFields(TestData):
        total_quantity int64
        user object
        """

        ed_ecommerce = self.ed_ecommerce()[field_names]
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=ECOMMERCE_INDEX_NAME,
            display_names=field_names
        )
        pd_ecommerce = self.pd_ecommerce()[field_names]

        ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names,
                                                                                  include_bool=False)
        ed_numeric = ed_field_mappings.numeric_source_fields(include_bool=False)
        pd_numeric = pd_ecommerce.select_dtypes(include=np.number)

        assert pd_numeric.columns.to_list() == ed_numeric

    def test_ecommerce_selected_all_numeric_source_fields(self):
        field_names = ['total_quantity', 'taxful_total_price', 'taxless_total_price']

        """
        Note: all are numeric
        total_quantity int64
        taxful_total_price float64
        taxless_total_price float64
        """

        ed_ecommerce = self.ed_ecommerce()[field_names]
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=ECOMMERCE_INDEX_NAME,
            display_names=field_names
        )
        pd_ecommerce = self.pd_ecommerce()[field_names]

        ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names,
                                                                                  include_bool=False)
        ed_numeric = ed_field_mappings.numeric_source_fields(include_bool=False)
        pd_numeric = pd_ecommerce.select_dtypes(include=np.number)

        assert pd_numeric.columns.to_list() == ed_numeric
107
eland/tests/field_mappings/test_rename_pytest.py
Normal file
@ -0,0 +1,107 @@
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# File called _pytest for PyCharm compatibility

import eland as ed
from eland.tests import ES_TEST_CLIENT, FLIGHTS_INDEX_NAME
from eland.tests.common import TestData


class TestRename(TestData):

    def test_single_rename(self):
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )

        pd_flights_column_series = self.pd_flights().columns.to_series()

        assert pd_flights_column_series.index.to_list() == ed_field_mappings.display_names

        renames = {'DestWeather': 'renamed_DestWeather'}

        # inplace rename
        ed_field_mappings.rename(renames)

        assert pd_flights_column_series.rename(renames).index.to_list() == ed_field_mappings.display_names

        get_renames = ed_field_mappings.get_renames()

        assert renames == get_renames

    def test_non_exists_rename(self):
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )

        pd_flights_column_series = self.pd_flights().columns.to_series()

        assert pd_flights_column_series.index.to_list() == ed_field_mappings.display_names

        renames = {'unknown': 'renamed_unknown'}

        # inplace rename - in this case it has no effect
        ed_field_mappings.rename(renames)

        assert pd_flights_column_series.index.to_list() == ed_field_mappings.display_names

        get_renames = ed_field_mappings.get_renames()

        assert not get_renames

    def test_exists_and_non_exists_rename(self):
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )

        pd_flights_column_series = self.pd_flights().columns.to_series()

        assert pd_flights_column_series.index.to_list() == ed_field_mappings.display_names

        renames = {'unknown': 'renamed_unknown', 'DestWeather': 'renamed_DestWeather', 'unknown2': 'renamed_unknown2',
                   'Carrier': 'renamed_Carrier'}

        # inplace rename - only real names get renamed
        ed_field_mappings.rename(renames)

        assert pd_flights_column_series.rename(renames).index.to_list() == ed_field_mappings.display_names

        get_renames = ed_field_mappings.get_renames()

        assert {'Carrier': 'renamed_Carrier', 'DestWeather': 'renamed_DestWeather'} == get_renames

    def test_multi_rename(self):
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )

        pd_flights_column_series = self.pd_flights().columns.to_series()

        assert pd_flights_column_series.index.to_list() == ed_field_mappings.display_names

        renames = {'DestWeather': 'renamed_DestWeather', 'renamed_DestWeather': 'renamed_renamed_DestWeather'}

        # inplace rename - only first rename gets renamed
        ed_field_mappings.rename(renames)

        assert pd_flights_column_series.rename(renames).index.to_list() == ed_field_mappings.display_names

        get_renames = ed_field_mappings.get_renames()

        assert {'DestWeather': 'renamed_DestWeather'} == get_renames
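The multi-rename semantics asserted above mirror pandas: each original label is renamed at most once per call, so a chained mapping is not applied transitively. A pandas-only sketch (illustrative, not from the commit):

import pandas as pd

columns = pd.Series(['DestWeather', 'Carrier'],
                    index=['DestWeather', 'Carrier'])
renames = {'DestWeather': 'renamed_DestWeather',
           'renamed_DestWeather': 'renamed_renamed_DestWeather'}

# only the first mapping matches an existing label; the chained
# 'renamed_DestWeather' key is ignored because it is not in the index
print(columns.rename(renames).index.to_list())
# ['renamed_DestWeather', 'Carrier']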
62
eland/tests/field_mappings/test_scripted_fields_pytest.py
Normal file
@ -0,0 +1,62 @@
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# File called _pytest for PyCharm compatibility
from io import StringIO

import numpy as np

import eland as ed
from eland.tests import FLIGHTS_INDEX_NAME, ES_TEST_CLIENT, FLIGHTS_MAPPING
from eland.tests.common import TestData


class TestScriptedFields(TestData):

    def test_add_new_scripted_field(self):
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )

        ed_field_mappings.add_scripted_field('scripted_field_None', None, np.dtype('int64'))

        # note 'None' is printed as 'NaN' in index, but .index shows it is 'None'
        # buf = StringIO()
        # ed_field_mappings.info_es(buf)
        # print(buf.getvalue())

        expected = self.pd_flights().columns.to_list()
        expected.append(None)

        assert expected == ed_field_mappings.display_names

    def test_add_duplicate_scripted_field(self):
        ed_field_mappings = ed.FieldMappings(
            client=ed.Client(ES_TEST_CLIENT),
            index_pattern=FLIGHTS_INDEX_NAME
        )

        ed_field_mappings.add_scripted_field('scripted_field_Carrier', 'Carrier', np.dtype('int64'))

        # note 'None' is printed as 'NaN' in index, but .index shows it is 'None'
        buf = StringIO()
        ed_field_mappings.info_es(buf)
        print(buf.getvalue())

        expected = self.pd_flights().columns.to_list()
        expected.remove('Carrier')
        expected.append('Carrier')

        assert expected == ed_field_mappings.display_names
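For context on what add_scripted_field() tracks on the client side, this is roughly how a scripted field appears in an Elasticsearch search body (a sketch of the standard script_fields DSL, not code from this commit):

search_body = {
    "query": {"match_all": {}},
    "script_fields": {
        "scripted_field_Carrier": {
            # painless script evaluated per document at query time
            "script": {"source": "doc['Carrier'].value"}
        }
    },
}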
Binary file not shown.
Binary file not shown.
42
eland/tests/query_compiler/test_get_field_names_pytest.py
Normal file
@ -0,0 +1,42 @@
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# File called _pytest for PyCharm compatibility
import pandas as pd
from pandas.util.testing import assert_index_equal

from eland.tests.common import TestData


class TestGetFieldNames(TestData):

    def test_get_field_names_all(self):
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()

        fields1 = ed_flights._query_compiler.get_field_names(include_scripted_fields=False)
        fields2 = ed_flights._query_compiler.get_field_names(include_scripted_fields=True)

        assert fields1 == fields2
        assert_index_equal(pd_flights.columns, pd.Index(fields1))

    def test_get_field_names_selected(self):
        ed_flights = self.ed_flights()[['Carrier', 'AvgTicketPrice']]
        pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice']]

        fields1 = ed_flights._query_compiler.get_field_names(include_scripted_fields=False)
        fields2 = ed_flights._query_compiler.get_field_names(include_scripted_fields=True)

        assert fields1 == fields2
        assert_index_equal(pd_flights.columns, pd.Index(fields1))
@ -1,86 +0,0 @@
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# File called _pytest for PyCharm compatibility

from eland import QueryCompiler
from eland.tests.common import TestData


class TestQueryCompilerRename(TestData):

    def test_query_compiler_basic_rename(self):
        field_names = []
        display_names = []

        mapper = QueryCompiler.DisplayNameToFieldNameMapper()

        assert field_names == mapper.field_names_to_list()
        assert display_names == mapper.display_names_to_list()

        field_names = ['a']
        display_names = ['A']
        update_A = {'a': 'A'}
        mapper.rename_display_name(update_A)

        assert field_names == mapper.field_names_to_list()
        assert display_names == mapper.display_names_to_list()

        field_names = ['a', 'b']
        display_names = ['A', 'B']

        update_B = {'b': 'B'}
        mapper.rename_display_name(update_B)

        assert field_names == mapper.field_names_to_list()
        assert display_names == mapper.display_names_to_list()

        field_names = ['a', 'b']
        display_names = ['AA', 'B']

        update_AA = {'A': 'AA'}
        mapper.rename_display_name(update_AA)

        assert field_names == mapper.field_names_to_list()
        assert display_names == mapper.display_names_to_list()

    def test_query_compiler_basic_rename_columns(self):
        columns = ['a', 'b', 'c', 'd']

        mapper = QueryCompiler.DisplayNameToFieldNameMapper()

        display_names = ['A', 'b', 'c', 'd']
        update_A = {'a': 'A'}
        mapper.rename_display_name(update_A)

        assert display_names == mapper.field_to_display_names(columns)

        # Invalid update
        display_names = ['A', 'b', 'c', 'd']
        update_ZZ = {'a': 'ZZ'}
        mapper.rename_display_name(update_ZZ)

        assert display_names == mapper.field_to_display_names(columns)

        display_names = ['AA', 'b', 'c', 'd']
        update_AA = {'A': 'AA'}  # already renamed to 'A'
        mapper.rename_display_name(update_AA)

        assert display_names == mapper.field_to_display_names(columns)

        display_names = ['AA', 'b', 'C', 'd']
        update_AA_C = {'a': 'AA', 'c': 'C'}  # 'a' rename ignored
        mapper.rename_display_name(update_AA_C)

        assert display_names == mapper.field_to_display_names(columns)
@ -29,6 +29,35 @@ class TestSeriesArithmetics(TestData):
        with pytest.raises(TypeError):
            ed_series = ed_df['total_quantity'] / pd_df['taxful_total_price']

    def test_ecommerce_series_simple_arithmetics(self):
        pd_df = self.pd_ecommerce().head(100)
        ed_df = self.ed_ecommerce().head(100)

        pd_series = pd_df['taxful_total_price'] + 5 + pd_df['total_quantity'] / pd_df['taxless_total_price'] - pd_df[
            'total_unique_products'] * 10.0 + pd_df['total_quantity']
        ed_series = ed_df['taxful_total_price'] + 5 + ed_df['total_quantity'] / ed_df['taxless_total_price'] - ed_df[
            'total_unique_products'] * 10.0 + ed_df['total_quantity']

        assert_pandas_eland_series_equal(pd_series, ed_series, check_less_precise=True)

    def test_ecommerce_series_simple_integer_addition(self):
        pd_df = self.pd_ecommerce().head(100)
        ed_df = self.ed_ecommerce().head(100)

        pd_series = pd_df['taxful_total_price'] + 5
        ed_series = ed_df['taxful_total_price'] + 5

        assert_pandas_eland_series_equal(pd_series, ed_series, check_less_precise=True)

    def test_ecommerce_series_simple_series_addition(self):
        pd_df = self.pd_ecommerce().head(100)
        ed_df = self.ed_ecommerce().head(100)

        pd_series = pd_df['taxful_total_price'] + pd_df['total_quantity']
        ed_series = ed_df['taxful_total_price'] + ed_df['total_quantity']

        assert_pandas_eland_series_equal(pd_series, ed_series, check_less_precise=True)

    def test_ecommerce_series_basic_arithmetics(self):
        pd_df = self.pd_ecommerce().head(100)
        ed_df = self.ed_ecommerce().head(100)
@ -199,7 +228,6 @@ class TestSeriesArithmetics(TestData):

        # str op int (throws)
        for op in non_string_numeric_ops:
            print(op)
            with pytest.raises(TypeError):
                pd_series = getattr(pd_df['currency'], op)(pd_df['total_quantity'])
            with pytest.raises(TypeError):
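A hedged sketch of the idea behind these arithmetics tests (an illustration, not the commit's implementation): an expression such as taxful_total_price + 5 can be pushed down to Elasticsearch as a painless script field instead of being computed client-side.

script_fields = {
    "taxful_total_price_plus_5": {
        # evaluated per document on the server; the result comes back
        # alongside the ordinary _source fields
        "script": {"source": "doc['taxful_total_price'].value + 5"}
    }
}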
@ -31,4 +31,18 @@ class TestSeriesRename(TestData):
        pd_renamed = pd_carrier.rename("renamed")
        ed_renamed = ed_carrier.rename("renamed")

        print(pd_renamed)
        print(ed_renamed)

        print(ed_renamed.info_es())

        assert_pandas_eland_series_equal(pd_renamed, ed_renamed)

        pd_renamed2 = pd_renamed.rename("renamed2")
        ed_renamed2 = ed_renamed.rename("renamed2")

        print(ed_renamed2.info_es())

        assert "renamed2" == ed_renamed2.name

        assert_pandas_eland_series_equal(pd_renamed2, ed_renamed2)
@ -45,6 +45,11 @@ class TestSeriesArithmetics(TestData):
        assert_pandas_eland_series_equal(pdadd, edadd)

    def test_frame_add_str(self):
        pdadd = self.pd_ecommerce()[['customer_first_name', 'customer_last_name']] + "_steve"
        print(pdadd.head())
        print(pdadd.columns)

    def test_str_add_ser(self):
        edadd = "The last name is: " + self.ed_ecommerce()['customer_last_name']
        pdadd = "The last name is: " + self.pd_ecommerce()['customer_last_name']
@ -60,27 +65,27 @@ class TestSeriesArithmetics(TestData):
        assert_pandas_eland_series_equal(pdadd, edadd)

    def test_ser_add_str_add_ser(self):
        pdadd = self.pd_ecommerce()['customer_first_name'] + self.pd_ecommerce()['customer_last_name']
        print(pdadd.name)
        edadd = self.ed_ecommerce()['customer_first_name'] + self.ed_ecommerce()['customer_last_name']
        print(edadd.name)

        print(edadd.info_es())
        pdadd = self.pd_ecommerce()['customer_first_name'] + " " + self.pd_ecommerce()['customer_last_name']
        edadd = self.ed_ecommerce()['customer_first_name'] + " " + self.ed_ecommerce()['customer_last_name']

        assert_pandas_eland_series_equal(pdadd, edadd)

    @pytest.mark.filterwarnings("ignore:Aggregations not supported")
    def test_non_aggregatable_add_str(self):
        with pytest.raises(ValueError):
            assert self.ed_ecommerce()['customer_gender'] + "is the gender"

    @pytest.mark.filterwarnings("ignore:Aggregations not supported")
    def teststr_add_non_aggregatable(self):
        with pytest.raises(ValueError):
            assert "The gender is: " + self.ed_ecommerce()['customer_gender']

    @pytest.mark.filterwarnings("ignore:Aggregations not supported")
    def test_non_aggregatable_add_aggregatable(self):
        with pytest.raises(ValueError):
            assert self.ed_ecommerce()['customer_gender'] + self.ed_ecommerce()['customer_first_name']

    @pytest.mark.filterwarnings("ignore:Aggregations not supported")
    def test_aggregatable_add_non_aggregatable(self):
        with pytest.raises(ValueError):
            assert self.ed_ecommerce()['customer_first_name'] + self.ed_ecommerce()['customer_gender']
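By way of illustration (an assumption about the index mapping, not code from this commit), string concatenation can only be pushed down when the operands have doc values, e.g. a keyword sub-field; text-only fields such as customer_gender above have none, hence the ValueError.

script_fields = {
    "full_name": {
        "script": {
            # assumes 'customer_first_name.keyword' and
            # 'customer_last_name.keyword' exist as aggregatable
            # sub-fields in the index mapping
            "source": "doc['customer_first_name.keyword'].value"
                      " + ' ' + doc['customer_last_name.keyword'].value"
        }
    }
}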
@ -60,6 +60,7 @@ class TestSeriesValueCounts(TestData):
        with pytest.raises(ValueError):
            assert ed_s.value_counts(es_size=-9)

    @pytest.mark.filterwarnings("ignore:Aggregations not supported")
    def test_value_counts_non_aggregatable(self):
        ed_s = self.ed_ecommerce()['customer_first_name']
        pd_s = self.pd_ecommerce()['customer_first_name']
@ -19,7 +19,7 @@ from pandas.io.parsers import _c_parser_defaults

from eland import Client
from eland import DataFrame
from eland import Mappings
from eland import FieldMappings

DEFAULT_CHUNK_SIZE = 10000

@ -97,7 +97,7 @@ def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk

    client = Client(es_params)

    mapping = Mappings._generate_es_mappings(pd_df, geo_points)
    mapping = FieldMappings._generate_es_mappings(pd_df, geo_points)

    # If table exists, check if_exists parameter
    if client.index_exists(index=destination_index):
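A minimal sketch of the dtype-to-Elasticsearch-type translation that pandas_to_eland relies on when creating the destination index (an assumption for illustration; FieldMappings._generate_es_mappings itself is not shown in this diff):

import pandas as pd

# hypothetical lookup table for illustration only
_PD_TO_ES = {'int64': 'long', 'float64': 'double', 'bool': 'boolean',
             'datetime64[ns]': 'date', 'object': 'keyword'}

def generate_es_mappings(pd_df):
    properties = {
        column: {'type': _PD_TO_ES.get(str(dtype), 'keyword')}
        for column, dtype in pd_df.dtypes.items()
    }
    return {'mappings': {'properties': properties}}

df = pd.DataFrame({'AvgTicketPrice': [850.0], 'Cancelled': [False]})
print(generate_es_mappings(df))
# {'mappings': {'properties': {'AvgTicketPrice': {'type': 'double'},
#                              'Cancelled': {'type': 'boolean'}}}}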