Major refactor

Adds a new mappings module to manage mappings between pandas and Elasticsearch. More tests.

parent 674ac129e6
commit 0fa3f81bbb
@@ -1,3 +1,4 @@
 from .utils import *
 from .frame import *
 from .client import *
+from .mappings import *
@@ -1,8 +1,9 @@
 from elasticsearch import Elasticsearch
 
-# eland client - implement as facade to control access to Elasticsearch methods
-class Client():
+class Client(object):
+    """
+    eland client - implemented as facade to control access to Elasticsearch methods
+    """
     def __init__(self, es=None):
         if isinstance(es, Elasticsearch):
            self.es = es
@@ -20,4 +21,3 @@ class Client(object):
 
     def field_caps(self, **kwargs):
         return self.es.field_caps(**kwargs)
-
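As a quick orientation (not part of this commit), a minimal sketch of how the facade is meant to be used; the host string and index name here are just examples:

```python
from elasticsearch import Elasticsearch

import eland as ed

# Client wraps an elasticsearch-py connection and forwards selected
# calls (e.g. field_caps) to it, keeping cluster access behind one interface.
es = Elasticsearch('localhost')  # assumes a local cluster
client = ed.Client(es)
caps = client.field_caps(index='flights', fields='*')
print(list(caps['fields'].keys()))
```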
eland/frame.py (235 changed lines)
@@ -23,7 +23,7 @@ Similarly, only Elasticsearch searchable fields can be searched or filtered, and
 only Elasticsearch aggregatable fields can be aggregated or grouped.
 
 """
-import eland
+import eland as ed
 
 from elasticsearch import Elasticsearch
 from elasticsearch_dsl import Search
@@ -66,94 +66,179 @@ class DataFrame():
     If the Elasticsearch index is deleted or index mappings are changed after this
     object is created, the object is not rebuilt and so inconsistencies can occur.
 
-    Mapping Elasticsearch types to pandas dtypes
-    --------------------------------------------
-
-    Elasticsearch field datatype              | Pandas dtype
-    --
-    text                                      | object
-    keyword                                   | object
-    long, integer, short, byte, binary        | int64
-    double, float, half_float, scaled_float   | float64
-    date, date_nanos                          | datetime64[ns]
-    boolean                                   | bool
-
-    TODO - add additional mapping types
     """
     def __init__(self, client, index_pattern):
-        self.client = eland.Client(client)
+        self.client = ed.Client(client)
         self.index_pattern = index_pattern
 
-        # Get and persist mappings, this allows use to correctly
+        # Get and persist mappings, this allows us to correctly
         # map returned types from Elasticsearch to pandas datatypes
-        mapping = self.client.indices().get_mapping(index=self.index_pattern)
-        #field_caps = self.client.field_caps(index=self.index_pattern, fields='*')
-
-        #self.fields, self.aggregatable_fields, self.searchable_fields = \
-        #    DataFrame._es_mappings_to_pandas(mapping, field_caps)
-        self.fields = DataFrame._es_mappings_to_pandas(mapping)
-
-    @staticmethod
-    def _flatten_results(prefix, results, result):
-        # TODO
-        return prefix
+        self.mappings = ed.Mappings(self.client, self.index_pattern)
 
     def _es_results_to_pandas(self, results):
-        # TODO - resolve nested fields
+        """
+        Parameters
+        ----------
+        results: dict
+            Elasticsearch results from self.client.search
+
+        Returns
+        -------
+        df: pandas.DataFrame
+            _source values extracted from results and mapped to pandas DataFrame
+            dtypes are mapped via Mapping object
+
+        Notes
+        -----
+        Fields containing lists in Elasticsearch don't map easily to pandas.DataFrame.
+        For example, an index with mapping:
+        ```
+        "mappings" : {
+          "properties" : {
+            "group" : {
+              "type" : "keyword"
+            },
+            "user" : {
+              "type" : "nested",
+              "properties" : {
+                "first" : {
+                  "type" : "keyword"
+                },
+                "last" : {
+                  "type" : "keyword"
+                }
+              }
+            }
+          }
+        }
+        ```
+        Adding a document:
+        ```
+        "_source" : {
+          "group" : "amsterdam",
+          "user" : [
+            {
+              "first" : "John",
+              "last" : "Smith"
+            },
+            {
+              "first" : "Alice",
+              "last" : "White"
+            }
+          ]
+        }
+        ```
+        (https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html)
+        this would be transformed internally (in Elasticsearch) into a document that looks more like this:
+        ```
+        {
+          "group" : "amsterdam",
+          "user.first" : [ "alice", "john" ],
+          "user.last" : [ "smith", "white" ]
+        }
+        ```
+        When mapping this to a pandas data frame we mimic this transformation.
+
+        Similarly, if a list is added to Elasticsearch:
+        ```
+        PUT my_index/_doc/1
+        {
+          "list" : [
+            0, 1, 2
+          ]
+        }
+        ```
+        The mapping is:
+        ```
+        "mappings" : {
+          "properties" : {
+            "user" : {
+              "type" : "long"
+            }
+          }
+        }
+        ```
+        TODO - explain how lists are handled (https://www.elastic.co/guide/en/elasticsearch/reference/current/array.html)
+        TODO - an option here is to use Elasticsearch's multi-field matching instead of pandas treatment of lists (which isn't great)
+        NOTE - using lists is generally not a good way to use this API
+        """
+        def flatten_dict(y):
+            out = {}
+
+            def flatten(x, name=''):
+                # We flatten into source fields e.g. if type=geo_point
+                # location: {lat=52.38, lon=4.90}
+                if name == '':
+                    is_source_field = False
+                    pd_dtype = 'object'
+                else:
+                    is_source_field, pd_dtype = self.mappings.is_source_field(name[:-1])
+
+                if not is_source_field and type(x) is dict:
+                    for a in x:
+                        flatten(x[a], name + a + '.')
+                elif not is_source_field and type(x) is list:
+                    for a in x:
+                        flatten(a, name)
+                else:
+                    field_name = name[:-1]
+
+                    # Coerce type
+                    if pd_dtype == 'datetime64':
+                        x = pd.to_datetime(x)
+                    print(field_name, pd_dtype, x, type(x))
+
+                    # Elasticsearch can have multiple values for a field. These are represented as lists, so
+                    # create lists for this pivot (see notes above)
+                    if field_name in out:
+                        if type(out[field_name]) is not list:
+                            out[field_name] = [out[field_name]]
+                        out[field_name].append(x)
+                    else:
+                        out[field_name] = x
+
+            flatten(y)
+
+            return out
+
         rows = []
+        i = 0
         for hit in results['hits']['hits']:
             row = hit['_source']
-            rows.append(row)
+
+            # flatten row to map correctly to 2D DataFrame
+            rows.append(flatten_dict(row))
+
+            i = i + 1
+            if i % 100 == 0:
+                print(i)
 
+        # Create pandas DataFrame
         df = pd.DataFrame(data=rows)
+
+        """
+        # Coerce types
+        pd_dtypes = self.mappings.source_fields_pd_dtypes()
+
+        # This returns types such as:
+        # {
+        #   'bool': Index(['Cancelled', 'FlightDelay'], dtype='object'),
+        #   'datetime64[ns]': Index(['timestamp'], dtype='object'),
+        #   'float64': Index(['AvgTicketPrice', 'DistanceKilometers', 'DistanceMiles',...
+        # }
+
+        for pd_dtype, value in pd_dtypes.items():
+            # Types generally convert well e.g. 1,2,3 -> int64, 1.1,2.2,3.3 -> float64
+            # so to minimise work we only convert special types.
+            # TODO - add option to force all conversion
+            if pd_dtype == 'datetime64':
+                print(df.loc[:, value.tolist()])
+                df.loc[:, value.tolist()] = df.loc[:, value.tolist()].astype('datetime64')
+        """
+
         return df
 
-    @staticmethod
-    def _extract_types_from_mapping(y):
-        """
-        Extract data types from mapping for DataFrame columns.
-
-        Elasticsearch _source data is transformed into pandas DataFrames. This strategy is not compatible
-        with all Elasticsearch configurations. Notes:
-
-        - This strategy is not compatible with all Elasticsearch configurations. If _source is disabled
-          (https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-source-field.html#disable-source-field)
-          no data values will be populated
-        - Sub-fields (e.g. english.text in {"mappings":{"properties":{"text":{"type":"text","fields":{"english":{"type":"text","analyzer":"english"}}}}}})
-          are not used
-        """
-        out = {}
-
-        # Recurse until we get a 'type: xxx' - ignore sub-fields
-        def flatten(x, name=''):
-            if type(x) is dict:
-                for a in x:
-                    if a == 'type' and type(x[a]) is str:  # 'type' can be a name of a field
-                        out[name[:-1]] = x[a]
-                    if a == 'properties' or a == 'fields':
-                        flatten(x[a], name)
-                    else:
-                        flatten(x[a], name + a + '.')
-
-        flatten(y)
-
-        return out
-
-    @staticmethod
-    def _es_mappings_to_pandas(mappings):
-        fields = {}
-        for index in mappings:
-            if 'properties' in mappings[index]['mappings']:
-                properties = mappings[index]['mappings']['properties']
-
-                datatypes = DataFrame._extract_types_from_mapping(properties)
-
-                # Note there could be conflicts here - e.g. the same field name with different semantics in
-                # different indexes - currently the last one wins TODO: review this
-                fields.update(datatypes)
-
-        return pd.DataFrame.from_dict(data=fields, orient='index', columns=['datatype'])
-
     def head(self, n=5):
         results = self.client.search(index=self.index_pattern, size=n)
 
@@ -161,8 +246,6 @@ class DataFrame():
 
     def describe(self):
         # First get all types
-        #mapping = self.client.indices().get_mapping(index=self.index_pattern)
-        mapping = self.client.indices().get_mapping(index=self.index_pattern)
 
         fields = DataFrame._es_mappings_to_pandas(mapping)
 
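To make the docstring's flattening concrete, here is a standalone sketch (not in the commit) that mirrors flatten_dict without the Mappings-aware is_source_field() check or dtype coercion; multi-valued fields collapse to lists exactly as described above:

```python
def flatten(x, name='', out=None):
    # Simplified flatten_dict: dotted names for nested dicts,
    # lists when a field occurs more than once.
    if out is None:
        out = {}
    if isinstance(x, dict):
        for key in x:
            flatten(x[key], name + key + '.', out)
    elif isinstance(x, list):
        for item in x:
            flatten(item, name, out)
    else:
        field_name = name[:-1]
        if field_name in out:
            if not isinstance(out[field_name], list):
                out[field_name] = [out[field_name]]
            out[field_name].append(x)
        else:
            out[field_name] = x
    return out


doc = {"group": "amsterdam",
       "user": [{"first": "John", "last": "Smith"},
                {"first": "Alice", "last": "White"}]}

print(flatten(doc))
# {'group': 'amsterdam', 'user.first': ['John', 'Alice'], 'user.last': ['Smith', 'White']}
```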
eland/mappings.py (new file, 286 lines)
@@ -0,0 +1,286 @@
+import warnings
+
+import pandas as pd
+
+
+class Mappings():
+    """
+    General purpose class to manage Elasticsearch to/from pandas mappings
+
+    Attributes
+    ----------
+
+    mappings_capabilities: pandas.DataFrame
+        A data frame summarising the capabilities of the index mapping
+
+        _source      - is top level field (i.e. not a multi-field sub-field)
+        es_dtype     - Elasticsearch field datatype
+        pd_dtype     - pandas datatype
+        searchable   - is the field searchable?
+        aggregatable - is the field aggregatable?
+
+                                     _source es_dtype pd_dtype searchable aggregatable
+        maps-telemetry.min              True     long    int64       True         True
+        maps-telemetry.avg              True    float  float64       True         True
+        city                            True     text   object       True        False
+        user_name                       True  keyword   object       True         True
+        origin_location.lat.keyword    False  keyword   object       True         True
+        type                            True  keyword   object       True         True
+        origin_location.lat             True     text   object       True        False
+    """
+
+    def __init__(self, client, index_pattern):
+        """
+        Parameters
+        ----------
+        client: eland.Client
+            Elasticsearch client
+
+        index_pattern: str
+            Elasticsearch index pattern
+        """
+        # persist index_pattern for debugging
+        self.index_pattern = index_pattern
+
+        mappings = client.indices().get_mapping(index=index_pattern)
+
+        # Get all fields (including all nested) and then field_caps
+        # for these names (fields=* doesn't appear to work effectively...)
+        all_fields = Mappings._extract_fields_from_mapping(mappings)
+        all_fields_caps = client.field_caps(index=index_pattern, fields=list(all_fields.keys()))
+
+        # Get top level (not sub-field multifield) mappings
+        source_fields = Mappings._extract_fields_from_mapping(mappings, source_only=True)
+
+        # Populate capability matrix of fields
+        # field_name, es_dtype, pd_dtype, is_searchable, is_aggregatable, is_source
+        self.mappings_capabilities = Mappings._create_capability_matrix(all_fields, source_fields, all_fields_caps)
+
+        # Cache source field types for efficient lookup
+        # (this massively improves performance of DataFrame.flatten)
+        self.source_field_pd_dtypes = {}
+
+        for field_name in source_fields:
+            pd_dtype = self.mappings_capabilities.loc[field_name]['pd_dtype']
+            self.source_field_pd_dtypes[field_name] = pd_dtype
+
+    def _extract_fields_from_mapping(mappings, source_only=False):
+        """
+        Extract all field names and types from a mapping.
+        ```
+        {
+          "my_index": {
+            "mappings": {
+              "properties": {
+                "city": {
+                  "type": "text",
+                  "fields": {
+                    "keyword": {
+                      "type": "keyword"
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+        ```
+        if source_only == False:
+            return {'city': 'text', 'city.keyword': 'keyword'}
+        else:
+            return {'city': 'text'}
+
+        Note: first field name type wins. E.g.
+        ```
+        PUT my_index1 {"mappings":{"properties":{"city":{"type":"text"}}}}
+        PUT my_index2 {"mappings":{"properties":{"city":{"type":"long"}}}}
+
+        Returns {'city': 'text'}
+        ```
+
+        Parameters
+        ----------
+        mappings: dict
+            Return from get_mapping
+
+        Returns
+        -------
+        fields: dict
+            Dict of field names and types
+        """
+        fields = {}
+
+        # Recurse until we get a 'type: xxx'
+        def flatten(x, name=''):
+            if type(x) is dict:
+                for a in x:
+                    if a == 'type' and type(x[a]) is str:  # 'type' can be a name of a field
+                        field_name = name[:-1]
+                        field_type = x[a]
+
+                        # If there is a conflicting type, warn - first value added wins
+                        if field_name in fields and fields[field_name] != field_type:
+                            warnings.warn("Field {} has conflicting types {} != {}".
+                                          format(field_name, fields[field_name], field_type),
+                                          UserWarning)
+                        else:
+                            fields[field_name] = field_type
+                    elif a == 'properties' or (not source_only and a == 'fields'):
+                        flatten(x[a], name)
+                    elif not (source_only and a == 'fields'):  # ignore multi-field fields for source_only
+                        flatten(x[a], name + a + '.')
+
+        for index in mappings:
+            if 'properties' in mappings[index]['mappings']:
+                properties = mappings[index]['mappings']['properties']
+
+                flatten(properties)
+
+        return fields
+
+    def _create_capability_matrix(all_fields, source_fields, all_fields_caps):
+        """
+        {
+          "fields": {
+            "rating": {
+              "long": {
+                "searchable": true,
+                "aggregatable": false,
+                "indices": ["index1", "index2"],
+                "non_aggregatable_indices": ["index1"]
+              },
+              "keyword": {
+                "searchable": false,
+                "aggregatable": true,
+                "indices": ["index3", "index4"],
+                "non_searchable_indices": ["index4"]
+              }
+            },
+            "title": {
+              "text": {
+                "searchable": true,
+                "aggregatable": false
+              }
+            }
+          }
+        }
+        """
+        all_fields_caps_fields = all_fields_caps['fields']
+
+        columns = ['_source', 'es_dtype', 'pd_dtype', 'searchable', 'aggregatable']
+        capability_matrix = {}
+
+        for field, field_caps in all_fields_caps_fields.items():
+            if field in all_fields:
+                # v = {'long': {'type': 'long', 'searchable': True, 'aggregatable': True}}
+                for kk, vv in field_caps.items():
+                    _source = (field in source_fields)
+                    es_dtype = vv['type']
+                    pd_dtype = Mappings._es_dtype_to_pd_dtype(vv['type'])
+                    searchable = vv['searchable']
+                    aggregatable = vv['aggregatable']
+
+                    caps = [_source, es_dtype, pd_dtype, searchable, aggregatable]
+
+                    capability_matrix[field] = caps
+
+                    if 'non_aggregatable_indices' in vv:
+                        warnings.warn("Field {} has conflicting aggregatable fields across indexes {}".
+                                      format(field, vv['non_aggregatable_indices']),
+                                      UserWarning)
+                    if 'non_searchable_indices' in vv:
+                        warnings.warn("Field {} has conflicting searchable fields across indexes {}".
+                                      format(field, vv['non_searchable_indices']),
+                                      UserWarning)
+
+        capability_matrix_df = pd.DataFrame.from_dict(capability_matrix, orient='index', columns=columns)
+
+        return capability_matrix_df.sort_index()
+
+    def _es_dtype_to_pd_dtype(es_dtype):
+        """
+        Mapping Elasticsearch types to pandas dtypes
+        --------------------------------------------
+
+        Elasticsearch field datatype              | pandas dtype
+        ------------------------------------------|-------------
+        text                                      | object
+        keyword                                   | object
+        long, integer, short, byte, binary        | int64
+        double, float, half_float, scaled_float   | float64
+        date, date_nanos                          | datetime64
+        boolean                                   | bool
+
+        TODO - add additional mapping types
+        """
+        es_dtype_to_pd_dtype = {
+            'text': 'object',
+            'keyword': 'object',
+
+            'long': 'int64',
+            'integer': 'int64',
+            'short': 'int64',
+            'byte': 'int64',
+            'binary': 'int64',
+
+            'double': 'float64',
+            'float': 'float64',
+            'half_float': 'float64',
+            'scaled_float': 'float64',
+
+            'date': 'datetime64',
+            'date_nanos': 'datetime64',
+
+            'boolean': 'bool'
+        }
+
+        if es_dtype in es_dtype_to_pd_dtype:
+            return es_dtype_to_pd_dtype[es_dtype]
+
+        # Return 'object' for all unsupported TODO - investigate how different types could be supported
+        return 'object'
+
+    def all_fields(self):
+        """
+        Returns
+        -------
+        all_fields: list
+            All typed fields in the index mapping
+        """
+        return self.mappings_capabilities.index.tolist()
+
+    def source_fields_pd_dtypes(self):
+        """
+        Returns
+        -------
+        groups: dict
+            Calls pandas.core.groupby.GroupBy.groups for _source fields
+            E.g.
+            {
+              'bool': Index(['Cancelled', 'FlightDelay'], dtype='object'),
+              'datetime64[ns]': Index(['timestamp'], dtype='object'),
+              'float64': Index(['AvgTicketPrice', 'DistanceKilometers', 'DistanceMiles',...
+            }
+        """
+        return self.mappings_capabilities[self.mappings_capabilities._source == True].groupby('pd_dtype').groups
+
+    def is_source_field(self, field_name):
+        """
+        Parameters
+        ----------
+        field_name: str
+
+        Returns
+        -------
+        is_source_field: bool
+            Is this field name a top-level source field?
+        pd_dtype: str
+            The pandas data type we map to
+        """
+        pd_dtype = 'object'
+        is_source_field = False
+
+        if field_name in self.source_field_pd_dtypes:
+            is_source_field = True
+            pd_dtype = self.source_field_pd_dtypes[field_name]
+
+        return is_source_field, pd_dtype
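A short usage sketch of the new Mappings class (not part of the commit; assumes a cluster on localhost seeded with the flights test index from this change set):

```python
import eland as ed

# Build the capability matrix once, then query it cheaply; this is
# how DataFrame._es_results_to_pandas consumes it.
mappings = ed.Mappings(ed.Client('localhost'), 'flights')

print(mappings.all_fields())                  # every typed field in the mapping
print(mappings.source_fields_pd_dtypes())     # e.g. {'float64': Index([...]), ...}
print(mappings.is_source_field('timestamp'))  # (True, 'datetime64') for a date field
```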
@@ -7,10 +7,294 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 ELASTICSEARCH_HOST = 'localhost'  # TODO externalise this
 
 FLIGHTS_INDEX_NAME = 'flights'
+FLIGHTS_MAPPING = { "mappings" : {
+    "properties" : {
+        "AvgTicketPrice" : {
+            "type" : "float"
+        },
+        "Cancelled" : {
+            "type" : "boolean"
+        },
+        "Carrier" : {
+            "type" : "keyword"
+        },
+        "Dest" : {
+            "type" : "keyword"
+        },
+        "DestAirportID" : {
+            "type" : "keyword"
+        },
+        "DestCityName" : {
+            "type" : "keyword"
+        },
+        "DestCountry" : {
+            "type" : "keyword"
+        },
+        "DestLocation" : {
+            "type" : "geo_point"
+        },
+        "DestRegion" : {
+            "type" : "keyword"
+        },
+        "DestWeather" : {
+            "type" : "keyword"
+        },
+        "DistanceKilometers" : {
+            "type" : "float"
+        },
+        "DistanceMiles" : {
+            "type" : "float"
+        },
+        "FlightDelay" : {
+            "type" : "boolean"
+        },
+        "FlightDelayMin" : {
+            "type" : "integer"
+        },
+        "FlightDelayType" : {
+            "type" : "keyword"
+        },
+        "FlightNum" : {
+            "type" : "keyword"
+        },
+        "FlightTimeHour" : {
+            "type" : "keyword"
+        },
+        "FlightTimeMin" : {
+            "type" : "float"
+        },
+        "Origin" : {
+            "type" : "keyword"
+        },
+        "OriginAirportID" : {
+            "type" : "keyword"
+        },
+        "OriginCityName" : {
+            "type" : "keyword"
+        },
+        "OriginCountry" : {
+            "type" : "keyword"
+        },
+        "OriginLocation" : {
+            "type" : "geo_point"
+        },
+        "OriginRegion" : {
+            "type" : "keyword"
+        },
+        "OriginWeather" : {
+            "type" : "keyword"
+        },
+        "dayOfWeek" : {
+            "type" : "integer"
+        },
+        "timestamp" : {
+            "type" : "date"
+        }
+    }
+} }
 FLIGHTS_FILE_NAME = ROOT_DIR + '/flights.json.gz'
 
 ECOMMERCE_INDEX_NAME = 'ecommerce'
+ECOMMERCE_MAPPING = { "mappings" : {
+    "properties" : {
+        "category" : {
+            "type" : "text",
+            "fields" : {
+                "keyword" : {
+                    "type" : "keyword"
+                }
+            }
+        },
+        "currency" : {
+            "type" : "keyword"
+        },
+        "customer_birth_date" : {
+            "type" : "date"
+        },
+        "customer_first_name" : {
+            "type" : "text",
+            "fields" : {
+                "keyword" : {
+                    "type" : "keyword",
+                    "ignore_above" : 256
+                }
+            }
+        },
+        "customer_full_name" : {
+            "type" : "text",
+            "fields" : {
+                "keyword" : {
+                    "type" : "keyword",
+                    "ignore_above" : 256
+                }
+            }
+        },
+        "customer_gender" : {
+            "type" : "keyword"
+        },
+        "customer_id" : {
+            "type" : "keyword"
+        },
+        "customer_last_name" : {
+            "type" : "text",
+            "fields" : {
+                "keyword" : {
+                    "type" : "keyword",
+                    "ignore_above" : 256
+                }
+            }
+        },
+        "customer_phone" : {
+            "type" : "keyword"
+        },
+        "day_of_week" : {
+            "type" : "keyword"
+        },
+        "day_of_week_i" : {
+            "type" : "integer"
+        },
+        "email" : {
+            "type" : "keyword"
+        },
+        "geoip" : {
+            "properties" : {
+                "city_name" : {
+                    "type" : "keyword"
+                },
+                "continent_name" : {
+                    "type" : "keyword"
+                },
+                "country_iso_code" : {
+                    "type" : "keyword"
+                },
+                "location" : {
+                    "type" : "geo_point"
+                },
+                "region_name" : {
+                    "type" : "keyword"
+                }
+            }
+        },
+        "manufacturer" : {
+            "type" : "text",
+            "fields" : {
+                "keyword" : {
+                    "type" : "keyword"
+                }
+            }
+        },
+        "order_date" : {
+            "type" : "date"
+        },
+        "order_id" : {
+            "type" : "keyword"
+        },
+        "products" : {
+            "properties" : {
+                "_id" : {
+                    "type" : "text",
+                    "fields" : {
+                        "keyword" : {
+                            "type" : "keyword",
+                            "ignore_above" : 256
+                        }
+                    }
+                },
+                "base_price" : {
+                    "type" : "half_float"
+                },
+                "base_unit_price" : {
+                    "type" : "half_float"
+                },
+                "category" : {
+                    "type" : "text",
+                    "fields" : {
+                        "keyword" : {
+                            "type" : "keyword"
+                        }
+                    }
+                },
+                "created_on" : {
+                    "type" : "date"
+                },
+                "discount_amount" : {
+                    "type" : "half_float"
+                },
+                "discount_percentage" : {
+                    "type" : "half_float"
+                },
+                "manufacturer" : {
+                    "type" : "text",
+                    "fields" : {
+                        "keyword" : {
+                            "type" : "keyword"
+                        }
+                    }
+                },
+                "min_price" : {
+                    "type" : "half_float"
+                },
+                "price" : {
+                    "type" : "half_float"
+                },
+                "product_id" : {
+                    "type" : "long"
+                },
+                "product_name" : {
+                    "type" : "text",
+                    "fields" : {
+                        "keyword" : {
+                            "type" : "keyword"
+                        }
+                    },
+                    "analyzer" : "english"
+                },
+                "quantity" : {
+                    "type" : "integer"
+                },
+                "sku" : {
+                    "type" : "keyword"
+                },
+                "tax_amount" : {
+                    "type" : "half_float"
+                },
+                "taxful_price" : {
+                    "type" : "half_float"
+                },
+                "taxless_price" : {
+                    "type" : "half_float"
+                },
+                "unit_discount_amount" : {
+                    "type" : "half_float"
+                }
+            }
+        },
+        "sku" : {
+            "type" : "keyword"
+        },
+        "taxful_total_price" : {
+            "type" : "half_float"
+        },
+        "taxless_total_price" : {
+            "type" : "half_float"
+        },
+        "total_quantity" : {
+            "type" : "integer"
+        },
+        "total_unique_products" : {
+            "type" : "integer"
+        },
+        "type" : {
+            "type" : "keyword"
+        },
+        "user" : {
+            "type" : "keyword"
+        }
+    }
+} }
 ECOMMERCE_FILE_NAME = ROOT_DIR + '/ecommerce.json.gz'
+ECOMMERCE_DF_FILE_NAME = ROOT_DIR + '/ecommerce_df.json.gz'
+ECOMMERCE_DATETIME_FIELD = 'order_date'
 
 TEST_MAPPING1 = {
     'mappings': {
@@ -115,9 +399,6 @@ TEST_MAPPING1 = {
         'dest_location': {
             'type': 'geo_point'
         },
-        'user': {
-            'type': 'nested'
-        },
         'my_join_field': {
             'type': 'join',
             'relations': {
@@ -153,8 +434,48 @@ TEST_MAPPING1_EXPECTED = {
     'text.english': 'text',
     'tweeted_at': 'date',
     'type': 'keyword',
-    'user': 'nested',
     'user_name': 'keyword'
 }
 
-TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['datatype'])
+TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['es_dtype'])
+
+TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group'
+TEST_NESTED_USER_GROUP_MAPPING = {
+    'mappings': {
+        'properties': {
+            'group': {
+                'type': 'keyword'
+            },
+            'user': {
+                'properties': {
+                    'first': {
+                        'type': 'keyword'
+                    },
+                    'last': {
+                        'type': 'keyword'
+                    },
+                    'address': {
+                        'type': 'keyword'
+                    }
+                }
+            }
+        }
+    }
+}
+
+TEST_NESTED_USER_GROUP_DOCS = [
+    {'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
+     '_source':
+         {'group': 'amsterdam', 'user': [
+             {'first': 'Manke', 'last': 'Nelis', 'address': ['Elandsgracht', 'Amsterdam']},
+             {'first': 'Johnny', 'last': 'Jordaan', 'address': ['Elandsstraat', 'Amsterdam']}]}},
+    {'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
+     '_source':
+         {'group': 'london', 'user': [
+             {'first': 'Alice', 'last': 'Monkton'},
+             {'first': 'Jimmy', 'last': 'White', 'address': ['London']}]}},
+    {'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
+     '_source': {'group': 'new york', 'user': [
+         {'first': 'Bill', 'last': 'Jones'}]}}
+]
eland/tests/client/__init__.py (new file, empty)

eland/tests/client/test_mappings_pytest.py (new file, 20 lines)
@@ -0,0 +1,20 @@
+# File called _pytest for PyCharm compatibility
+import pytest
+
+from eland.tests import *
+
+from pandas.util.testing import (
+    assert_almost_equal, assert_frame_equal, assert_series_equal)
+
+import eland as ed
+
+class TestMapping():
+
+    # Requires 'setup_tests.py' to be run prior to this
+    def test_mapping(self):
+        mapping = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
+
+        assert mapping.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()
+
+        assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mapping.mappings_capabilities['es_dtype']))
eland/tests/ecommerce_df.json.gz (new binary file, not shown)
@@ -10,12 +10,15 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 
 # Create pandas and eland data frames
 from eland.tests import ELASTICSEARCH_HOST
-from eland.tests import FLIGHTS_FILE_NAME, FLIGHTS_INDEX_NAME, ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME
+from eland.tests import FLIGHTS_FILE_NAME, FLIGHTS_INDEX_NAME, ECOMMERCE_DF_FILE_NAME, ECOMMERCE_INDEX_NAME, \
+    ECOMMERCE_DATETIME_FIELD
 
 _pd_flights = pd.read_json(FLIGHTS_FILE_NAME, lines=True)
 _ed_flights = ed.read_es(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME)
 
-_pd_ecommerce = pd.read_json(ECOMMERCE_FILE_NAME, lines=True)
+_pd_ecommerce = pd.read_json(ECOMMERCE_DF_FILE_NAME).sort_index()
+_pd_ecommerce[ECOMMERCE_DATETIME_FIELD] = \
+    pd.to_datetime(_pd_ecommerce[ECOMMERCE_DATETIME_FIELD])
 _ed_ecommerce = ed.read_es(ELASTICSEARCH_HOST, ECOMMERCE_INDEX_NAME)
 
 class TestData:
@@ -1,6 +1,5 @@
 # File called _pytest for PyCharm compatibility
 from eland.tests.frame.common import TestData
-
 from eland.tests import *
 
 import eland as ed
@@ -11,18 +10,42 @@ from pandas.util.testing import (
 
 class TestDataFrameIndexing(TestData):
 
+    def test_results(self):
+        test = ed.read_es(ELASTICSEARCH_HOST, TEST_NESTED_USER_GROUP_INDEX_NAME)
+
+        print(test.mappings.mappings_capabilities)
+
+        pd.set_option('display.max_rows', 500)
+        pd.set_option('display.max_columns', 500)
+        pd.set_option('display.width', 1000)
+
+        print(test.head())
+
     def test_head(self):
+        pd.set_option('display.max_rows', 500)
+        pd.set_option('display.max_columns', 500)
+        pd.set_option('display.width', 1000)
+
+        """
         pd_flights_head = self.pd_flights().head()
         ed_flights_head = self.ed_flights().head()
 
         assert_frame_equal(pd_flights_head, ed_flights_head)
+        """
 
         pd_ecommerce_head = self.pd_ecommerce().head()
         ed_ecommerce_head = self.ed_ecommerce().head()
 
+        print(self.ed_ecommerce().mappings.source_fields_pd_dtypes())
+
+        print(ed_ecommerce_head.dtypes)
+        print(pd_ecommerce_head.dtypes)
+
+        #print(ed_ecommerce_head)
+
         assert_frame_equal(pd_ecommerce_head, ed_ecommerce_head)
 
     def test_mappings(self):
         test_mapping1 = ed.read_es(ELASTICSEARCH_HOST, TEST_MAPPING1_INDEX_NAME)
         assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, test_mapping1.fields)
 
@@ -5,8 +5,8 @@ from elasticsearch import helpers
 from eland.tests import *
 
 DATA_LIST = [
-    (FLIGHTS_FILE_NAME, FLIGHTS_INDEX_NAME),
-    (ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME)
+    (FLIGHTS_FILE_NAME, FLIGHTS_INDEX_NAME, FLIGHTS_MAPPING),
+    (ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING)
 ]
 
 def _setup_data(es):
@@ -14,10 +14,13 @@ def _setup_data(es):
     for data in DATA_LIST:
         json_file_name = data[0]
         index_name = data[1]
+        mapping = data[2]
 
         # Delete index
         print("Deleting index:", index_name)
         es.indices.delete(index=index_name, ignore=[400, 404])
+        print("Creating index:", index_name)
+        es.indices.create(index=index_name, body=mapping)
 
         df = pd.read_json(json_file_name, lines=True)
 
@@ -50,9 +53,16 @@ def _setup_test_mappings(es):
     es.indices.delete(index=TEST_MAPPING1_INDEX_NAME, ignore=[400, 404])
     es.indices.create(index=TEST_MAPPING1_INDEX_NAME, body=TEST_MAPPING1)
 
+def _setup_test_nested(es):
+    es.indices.delete(index=TEST_NESTED_USER_GROUP_INDEX_NAME, ignore=[400, 404])
+    es.indices.create(index=TEST_NESTED_USER_GROUP_INDEX_NAME, body=TEST_NESTED_USER_GROUP_MAPPING)
+
+    helpers.bulk(es, TEST_NESTED_USER_GROUP_DOCS)
+
 if __name__ == '__main__':
     # Create connection to Elasticsearch - use defaults
     es = Elasticsearch(ELASTICSEARCH_HOST)
 
     _setup_data(es)
     _setup_test_mappings(es)
+    _setup_test_nested(es)
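Putting the pieces together (not part of the commit), the intended workflow under these changes is: run setup_tests.py against a local cluster to seed the indices, then read them back through the new mappings-aware DataFrame; the names here match the test constants above:

```python
import eland as ed

# Assumes setup_tests.py has already been run against localhost.
ed_flights = ed.read_es('localhost', 'flights')

# The DataFrame now carries a Mappings instance built at construction time.
print(ed_flights.mappings.mappings_capabilities)
print(ed_flights.head())
```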