# Mirror of https://github.com/elastic/eland.git (synced 2025-07-11)

import warnings

import pandas as pd
from pandas.core.dtypes.common import (is_float_dtype, is_bool_dtype, is_integer_dtype,
                                       is_datetime_or_timedelta_dtype, is_string_dtype)


class Mappings:
    """
    General purpose class to manage Elasticsearch to/from pandas mappings

    Attributes
    ----------

    mappings_capabilities: pandas.DataFrame
        A data frame summarising the capabilities of the index mapping

        _source      - is the field a top-level field (i.e. not a multi-field sub-field)?
        es_dtype     - Elasticsearch field datatype
        pd_dtype     - pandas datatype
        searchable   - is the field searchable?
        aggregatable - is the field aggregatable?

                                     _source es_dtype pd_dtype searchable aggregatable
        maps-telemetry.min              True     long    int64       True         True
        maps-telemetry.avg              True    float  float64       True         True
        city                            True     text   object       True        False
        user_name                       True  keyword   object       True         True
        origin_location.lat.keyword    False  keyword   object       True         True
        type                            True  keyword   object       True         True
        origin_location.lat             True     text   object       True        False
    """

    def __init__(self,
                 client=None,
                 index_pattern=None,
                 mappings=None):
        """
        Parameters
        ----------
        client: eland.Client
            Elasticsearch client

        index_pattern: str
            Elasticsearch index pattern

        Copy constructor arguments:

        mappings: Mappings
            Object to copy
        """
        if (client is not None) and (index_pattern is not None):
            get_mapping = client.get_mapping(index=index_pattern)

            # Get all fields (including all nested) and then all field_caps
            all_fields = Mappings._extract_fields_from_mapping(get_mapping)
            all_fields_caps = client.field_caps(index=index_pattern, fields='*')

            # Get top level (not sub-field multi-field) mappings
            source_fields = Mappings._extract_fields_from_mapping(get_mapping, source_only=True)

            # Populate the capability matrix of fields:
            # field_name, es_dtype, pd_dtype, is_searchable, is_aggregatable, is_source
            self._mappings_capabilities = Mappings._create_capability_matrix(all_fields, source_fields,
                                                                             all_fields_caps)
        else:
            # Straight copy
            self._mappings_capabilities = mappings._mappings_capabilities.copy()

        # Cache source field types for efficient lookup
        # (this massively improves performance of DataFrame.flatten)
        self._source_field_pd_dtypes = {}

        for field_name in self._mappings_capabilities[self._mappings_capabilities._source == True].index:
            pd_dtype = self._mappings_capabilities.loc[field_name, 'pd_dtype']
            self._source_field_pd_dtypes[field_name] = pd_dtype

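    # A minimal usage sketch (hedged: assumes a reachable cluster, an existing
    # 'flights' index, and an eland.Client wrapping the low-level Elasticsearch
    # client -- none of these are created here):
    #
    #   client = eland.Client(Elasticsearch('http://localhost:9200'))
    #   mappings = Mappings(client=client, index_pattern='flights')
    #   mappings.source_fields()
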
    @staticmethod
    def _extract_fields_from_mapping(mappings, source_only=False):
        """
        Extract all field names and types from a mapping.

        ```
        {
            "my_index": {
                "mappings": {
                    "properties": {
                        "city": {
                            "type": "text",
                            "fields": {
                                "keyword": {
                                    "type": "keyword"
                                }
                            }
                        }
                    }
                }
            }
        }
        ```

        if source_only == False:
            return {'city': 'text', 'city.keyword': 'keyword'}
        else:
            return {'city': 'text'}

        Note: the first field type wins. E.g.

        ```
        PUT my_index1 {"mappings":{"properties":{"city":{"type":"text"}}}}
        PUT my_index2 {"mappings":{"properties":{"city":{"type":"long"}}}}

        Returns {'city': 'text'}
        ```

        Parameters
        ----------
        mappings: dict
            Return value of get_mapping

        Returns
        -------
        fields: dict
            Dict of field names and types
        """
        fields = {}

        # Recurse until we hit a 'type: xxx' entry
        def flatten(x, name=''):
            if type(x) is dict:
                for a in x:
                    if a == 'type' and type(x[a]) is str:  # 'type' can also be the name of a field
                        field_name = name[:-1]
                        field_type = x[a]

                        # If there is a conflicting type, warn - the first value added wins
                        if field_name in fields and fields[field_name] != field_type:
                            warnings.warn("Field {} has conflicting types {} != {}".
                                          format(field_name, fields[field_name], field_type),
                                          UserWarning)
                        else:
                            fields[field_name] = field_type
                    elif a == 'properties' or (not source_only and a == 'fields'):
                        flatten(x[a], name)
                    elif not (source_only and a == 'fields'):  # ignore multi-field sub-fields for source_only
                        flatten(x[a], name + a + '.')

        for index in mappings:
            if 'properties' in mappings[index]['mappings']:
                properties = mappings[index]['mappings']['properties']

                flatten(properties)

        return fields

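    # A minimal sketch of the extraction above, using the docstring's example
    # shape (a plain dict standing in for a get_mapping response):
    #
    #   mapping = {'my_index': {'mappings': {'properties': {
    #       'city': {'type': 'text',
    #                'fields': {'keyword': {'type': 'keyword'}}}}}}}
    #   Mappings._extract_fields_from_mapping(mapping)
    #   # -> {'city': 'text', 'city.keyword': 'keyword'}
    #   Mappings._extract_fields_from_mapping(mapping, source_only=True)
    #   # -> {'city': 'text'}
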
    @staticmethod
    def _create_capability_matrix(all_fields, source_fields, all_fields_caps):
        """
        Create the capability matrix from a field_caps response of the form:

        {
            "fields": {
                "rating": {
                    "long": {
                        "searchable": true,
                        "aggregatable": false,
                        "indices": ["index1", "index2"],
                        "non_aggregatable_indices": ["index1"]
                    },
                    "keyword": {
                        "searchable": false,
                        "aggregatable": true,
                        "indices": ["index3", "index4"],
                        "non_searchable_indices": ["index4"]
                    }
                },
                "title": {
                    "text": {
                        "searchable": true,
                        "aggregatable": false
                    }
                }
            }
        }
        """
        all_fields_caps_fields = all_fields_caps['fields']

        columns = ['_source', 'es_dtype', 'pd_dtype', 'searchable', 'aggregatable']
        capability_matrix = {}

        for field, field_caps in all_fields_caps_fields.items():
            if field in all_fields:
                # field_caps = {'long': {'type': 'long', 'searchable': True, 'aggregatable': True}}
                for kk, vv in field_caps.items():
                    _source = (field in source_fields)
                    es_dtype = vv['type']
                    pd_dtype = Mappings._es_dtype_to_pd_dtype(vv['type'])
                    searchable = vv['searchable']
                    aggregatable = vv['aggregatable']

                    caps = [_source, es_dtype, pd_dtype, searchable, aggregatable]

                    capability_matrix[field] = caps

                    if 'non_aggregatable_indices' in vv:
                        warnings.warn("Field {} has conflicting aggregatable fields across indexes {}".
                                      format(field, vv['non_aggregatable_indices']),
                                      UserWarning)
                    if 'non_searchable_indices' in vv:
                        warnings.warn("Field {} has conflicting searchable fields across indexes {}".
                                      format(field, vv['non_searchable_indices']),
                                      UserWarning)

        capability_matrix_df = pd.DataFrame.from_dict(capability_matrix, orient='index', columns=columns)

        return capability_matrix_df.sort_index()

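    # A hedged sketch of the matrix built above, with synthetic field_caps-style
    # input (no cluster involved):
    #
    #   all_fields = {'city': 'text', 'city.keyword': 'keyword'}
    #   source_fields = {'city': 'text'}
    #   caps = {'fields': {
    #       'city': {'text': {'type': 'text', 'searchable': True, 'aggregatable': False}},
    #       'city.keyword': {'keyword': {'type': 'keyword', 'searchable': True, 'aggregatable': True}}}}
    #   Mappings._create_capability_matrix(all_fields, source_fields, caps)
    #   #               _source es_dtype pd_dtype  searchable  aggregatable
    #   # city             True     text   object        True         False
    #   # city.keyword    False  keyword   object        True          True
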
    @staticmethod
    def _es_dtype_to_pd_dtype(es_dtype):
        """
        Mapping Elasticsearch types to pandas dtypes
        --------------------------------------------

        Elasticsearch field datatype             | pandas dtype
        -----------------------------------------|---------------
        text                                     | object
        keyword                                  | object
        long, integer, short, byte, binary       | int64
        double, float, half_float, scaled_float  | float64
        date, date_nanos                         | datetime64[ns]
        boolean                                  | bool

        TODO - add additional mapping types
        """
        es_dtype_to_pd_dtype = {
            'text': 'object',
            'keyword': 'object',

            'long': 'int64',
            'integer': 'int64',
            'short': 'int64',
            'byte': 'int64',
            'binary': 'int64',

            'double': 'float64',
            'float': 'float64',
            'half_float': 'float64',
            'scaled_float': 'float64',

            'date': 'datetime64[ns]',
            'date_nanos': 'datetime64[ns]',

            'boolean': 'bool'
        }

        if es_dtype in es_dtype_to_pd_dtype:
            return es_dtype_to_pd_dtype[es_dtype]

        # Return 'object' for all unsupported types
        # TODO - investigate how different types could be supported
        return 'object'

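    # This is a pure function, so the mapping is easy to sanity-check in
    # isolation:
    #
    #   Mappings._es_dtype_to_pd_dtype('long')       # -> 'int64'
    #   Mappings._es_dtype_to_pd_dtype('date')       # -> 'datetime64[ns]'
    #   Mappings._es_dtype_to_pd_dtype('geo_point')  # -> 'object' (unmapped fallback)
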
    @staticmethod
    def _pd_dtype_to_es_dtype(pd_dtype):
        """
        Mapping pandas dtypes to Elasticsearch dtypes
        ---------------------------------------------

        ```
        pandas dtype   Python type  NumPy type                                                       Usage
        object         str          string_, unicode_                                                Text
        int64          int          int_, int8, int16, int32, int64, uint8, uint16, uint32, uint64   Integer numbers
        float64        float        float_, float16, float32, float64                                Floating point numbers
        bool           bool         bool_                                                            True/False values
        datetime64     NA           datetime64[ns]                                                   Date and time values
        timedelta[ns]  NA           NA                                                               Differences between two datetimes
        category       NA           NA                                                               Finite list of text values
        ```
        """
        es_dtype = None

        # Map all to 64-bit - TODO map to specifics: int32 -> integer etc.
        if is_float_dtype(pd_dtype):
            es_dtype = 'double'
        elif is_integer_dtype(pd_dtype):
            es_dtype = 'long'
        elif is_bool_dtype(pd_dtype):
            es_dtype = 'boolean'
        elif is_string_dtype(pd_dtype):
            es_dtype = 'keyword'
        elif is_datetime_or_timedelta_dtype(pd_dtype):
            es_dtype = 'date'
        else:
            warnings.warn('No mapping for pd_dtype: [{0}], using default mapping'.format(pd_dtype))

        return es_dtype

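    # Quick sanity check of the branches above (assumes numpy is importable,
    # as it is wherever pandas is installed):
    #
    #   import numpy as np
    #   Mappings._pd_dtype_to_es_dtype(np.dtype('int64'))    # -> 'long'
    #   Mappings._pd_dtype_to_es_dtype(np.dtype('float64'))  # -> 'double'
    #   Mappings._pd_dtype_to_es_dtype(np.dtype('object'))   # -> 'keyword'
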
    @staticmethod
    def _generate_es_mappings(dataframe, geo_points=None):
        """Given a pandas dataframe, generate the associated Elasticsearch mapping

        Parameters
        ----------
        dataframe : pandas.DataFrame
            pandas.DataFrame to create schema from

        Returns
        -------
        mapping : dict
            e.g.

            "mappings" : {
                "properties" : {
                    "AvgTicketPrice" : {
                        "type" : "float"
                    },
                    "Cancelled" : {
                        "type" : "boolean"
                    },
                    "Carrier" : {
                        "type" : "keyword"
                    },
                    "Dest" : {
                        "type" : "keyword"
                    }
                }
            }
        """
        mappings = {'properties': {}}

        for column_name, dtype in dataframe.dtypes.items():  # iteritems() was removed in pandas 2.0
            if geo_points is not None and column_name in geo_points:
                es_dtype = 'geo_point'
            else:
                es_dtype = Mappings._pd_dtype_to_es_dtype(dtype)

            mappings['properties'][column_name] = {'type': es_dtype}

        return {"mappings": mappings}

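    # A hedged sketch of the generated mapping (synthetic two-column frame):
    #
    #   df = pd.DataFrame({'Carrier': ['ES'], 'AvgTicketPrice': [640.0]})
    #   Mappings._generate_es_mappings(df)
    #   # -> {'mappings': {'properties': {'Carrier': {'type': 'keyword'},
    #   #                                 'AvgTicketPrice': {'type': 'double'}}}}
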
    def all_fields(self):
        """
        Returns
        -------
        all_fields: list
            All typed fields in the index mapping
        """
        return self._mappings_capabilities.index.tolist()

    def field_capabilities(self, field_name):
        """
        Parameters
        ----------
        field_name: str

        Returns
        -------
        mappings_capabilities: pd.Series with index values:
            _source: bool
                Is this field name a top-level source field?
            es_dtype: str
                The Elasticsearch data type
            pd_dtype: str
                The pandas data type
            searchable: bool
                Is the field searchable in Elasticsearch?
            aggregatable: bool
                Is the field aggregatable in Elasticsearch?
        """
        return self._mappings_capabilities.loc[field_name]

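    # E.g., with the capability matrix from the class docstring:
    #
    #   mappings.field_capabilities('city')
    #   # _source          True
    #   # es_dtype         text
    #   # pd_dtype       object
    #   # searchable       True
    #   # aggregatable    False
    #   # Name: city, dtype: object
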
    def source_field_pd_dtype(self, field_name):
        """
        Parameters
        ----------
        field_name: str

        Returns
        -------
        is_source_field: bool
            Is this field name a top-level source field?
        pd_dtype: str
            The pandas data type we map to
        """
        pd_dtype = 'object'
        is_source_field = False

        if field_name in self._source_field_pd_dtypes:
            is_source_field = True
            pd_dtype = self._source_field_pd_dtypes[field_name]

        return is_source_field, pd_dtype

    def is_source_field(self, field_name):
        """
        Parameters
        ----------
        field_name: str

        Returns
        -------
        is_source_field: bool
            Is this field name a top-level source field?
        """
        return field_name in self._source_field_pd_dtypes

    def aggregatable_columns(self, columns=None):
        """
        Return a dict of aggregatable columns from all columns or the given columns list

        {'customer_full_name.keyword': 'customer_full_name', ...}

        Logic here is that column names are '_source' fields and a keyword field
        may be nested beneath the field. E.g.

        customer_full_name: text
        customer_full_name.keyword: keyword

        customer_full_name.keyword is the aggregatable field for customer_full_name

        Returns
        -------
        dict
            e.g. {'customer_full_name.keyword': 'customer_full_name', ...}
        """
        if columns is None:
            columns = self.source_fields()

        aggregatables = {}

        for column in columns:
            capabilities = self.field_capabilities(column)
            if capabilities['aggregatable']:
                aggregatables[column] = column
            else:
                # Try 'column.keyword'
                column_keyword = column + '.keyword'
                capabilities = self.field_capabilities(column_keyword)
                if capabilities['aggregatable']:
                    aggregatables[column_keyword] = column
                else:
                    # Aggregations not supported for this field
                    raise ValueError("Aggregations not supported for column '{}'".format(column))

        return aggregatables

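    # Sketch of the keyword fallback above, assuming a text field 'city' with a
    # 'city.keyword' multi-field plus an aggregatable keyword field 'user_name':
    #
    #   mappings.aggregatable_columns(['city', 'user_name'])
    #   # -> {'city.keyword': 'city', 'user_name': 'user_name'}
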
    def numeric_source_fields(self, columns, include_bool=True):
        """
        Returns
        -------
        numeric_source_fields: list of str
            List of source fields where pd_dtype == (int64 or float64 or bool)
        """
        if include_bool:
            df = self._mappings_capabilities[(self._mappings_capabilities._source == True) &
                                             ((self._mappings_capabilities.pd_dtype == 'int64') |
                                              (self._mappings_capabilities.pd_dtype == 'float64') |
                                              (self._mappings_capabilities.pd_dtype == 'bool'))]
        else:
            df = self._mappings_capabilities[(self._mappings_capabilities._source == True) &
                                             ((self._mappings_capabilities.pd_dtype == 'int64') |
                                              (self._mappings_capabilities.pd_dtype == 'float64'))]
        # If columns is given, filter the index with columns
        if columns is not None:
            # reindex adds NA rows for non-existing (non-numeric) columns, so drop these after reindex
            df = df.reindex(columns)
            df.dropna(inplace=True)

        # Return as list
        return df.index.to_list()

    def source_fields(self):
        """
        Returns
        -------
        source_fields: list of str
            List of source fields
        """
        return list(self._source_field_pd_dtypes.keys())

    def count_source_fields(self):
        """
        Returns
        -------
        count_source_fields: int
            Number of source fields in mapping
        """
        return len(self.source_fields())

    def dtypes(self, columns=None):
        """
        Returns
        -------
        dtypes: pd.Series
            Source field name + pd_dtype
        """
        if columns is not None:
            return pd.Series(
                {key: self._source_field_pd_dtypes[key] for key in columns})

        return pd.Series(self._source_field_pd_dtypes)

    def info_es(self, buf):
        buf.write("Mappings:\n")
        buf.write(" capabilities: {0}\n".format(self._mappings_capabilities))