Merge pull request #13 from stevedodson/master

Adding eland.Index feature
This commit is contained in:
stevedodson 2019-06-28 16:47:59 +02:00 committed by GitHub
commit 99279724f6
12 changed files with 5186 additions and 461 deletions

58
NOTES.md Normal file
View File

@ -0,0 +1,58 @@
# Implementation Notes
The goal of an `eland.DataFrame` is to enable users who are familiar with `pandas.DataFrame`
to access, explore and manipulate data that resides in Elasticsearch.
Ideally, all data should reside in Elasticsearch rather than in memory.
This restricts the API, but allows access to huge data sets that do not fit into memory, and allows
use of powerful Elasticsearch features such as aggregations.
## Implementation Details
### 3rd Party System Access
Generally, integrations with [3rd party storage systems](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html)
(SQL, Google Big Query etc.) involve accessing these systems and reading all external data into an
in-core pandas data structure. This also applies to [Apache Arrow](https://arrow.apache.org/docs/python/pandas.html)
structures.
Whilst this provides access to data in these systems, for large datasets this can require significant
in-core memory, and for systems such as Elasticsearch, bulk export of data can be an inefficient way
of exploring the data.
An alternative option is to create an API that proxies `pandas.DataFrame`-like calls to Elasticsearch
queries and operations. This could allow the Elasticsearch cluster to perform operations such as
aggregations rather than exporting all the data and performing this operation in-core.
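As a concrete illustration, a `describe()`-style call can be answered entirely by the cluster with a
single aggregation request, rather than a bulk export followed by in-core computation. The sketch below
is a minimal example using `elasticsearch-dsl`; the index name `flights` and the field `AvgTicketPrice`
are placeholders from the Kibana sample data, not requirements of the approach.

```python
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

es = Elasticsearch()  # assumes a locally running cluster

# size=0: return no documents, only the aggregation results
search = Search(using=es, index="flights").extra(size=0)
search.aggs.metric("stats_AvgTicketPrice", "extended_stats", field="AvgTicketPrice")

response = search.execute()
stats = response.aggregations["stats_AvgTicketPrice"]
print(stats["count"], stats["avg"], stats["std_deviation"])
```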
### Implementation Options
One option would be to replace the in-core memory structures backing `pandas.DataFrame` with Elasticsearch
accessors. This would allow full access to the `pandas.DataFrame` APIs. However, this has issues:
* If a `pandas.DataFrame` instance maps to an index, typical manipulation of a `pandas.DataFrame`
may involve creating many derived `pandas.DataFrame` instances. Constructing an index per
`pandas.DataFrame` may result in many Elasticsearch indices and a significant load on Elasticsearch.
For example, `df_a = df['a']` should not require Elasticsearch indices `df` and `df_a`
* Not all `pandas.DataFrame` APIs map to things we may want to do in Elasticsearch. In particular,
API calls that involve exporting all data from Elasticsearch into memory e.g. `df.to_dict()`.
* The backend `pandas.DataFrame` structures are not easily abstractable and are deeply embedded in
the implementation.
Another option is to create an `eland.DataFrame` API that mimics appropriate aspects of
the `pandas.DataFrame` API. This resolves some of the issues above:
* `df_a = df['a']` could be implemented as a change to the Elasticsearch query used, rather
than a new index
* Instead of supporting the entire `pandas.DataFrame` API we can support a subset appropriate for
Elasticsearch. If additional calls are required, we could create an `eland.DataFrame.to_pandas()`
method which would explicitly export all data to a `pandas.DataFrame` (see the sketch after this list)
* Creating a new `eland.DataFrame` API gives us full flexibility in terms of implementation. However,
it does create a large amount of work which may duplicate a lot of the `pandas` code - for example,
printing objects - and this creates a maintenance burden
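A hypothetical `eland.DataFrame.to_pandas()` could be built on the Elasticsearch scroll helper so that a
full export is always explicit and opt-in. This is only a sketch of the idea - the function name, the
match-all query and the index pattern are assumptions, not part of the current implementation - and it is
only sensible for result sets that actually fit into memory.

```python
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

def to_pandas(es: Elasticsearch, index_pattern: str) -> pd.DataFrame:
    """Explicitly export all documents of an index into an in-core pandas.DataFrame."""
    rows = [hit["_source"] for hit in
            scan(es, index=index_pattern, query={"query": {"match_all": {}}})]
    return pd.DataFrame(rows)

# pd_flights = to_pandas(Elasticsearch(), "flights")  # hypothetical usage
```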

View File

@ -1,4 +1,5 @@
from .utils import * from .utils import *
from .frame import * from .dataframe import *
from .client import * from .client import *
from .mappings import * from .mappings import *
from .index import *

View File

@ -23,21 +23,19 @@ Similarly, only Elasticsearch searchable fields can be searched or filtered, and
only Elasticsearch aggregatable fields can be aggregated or grouped. only Elasticsearch aggregatable fields can be aggregated or grouped.
""" """
import eland as ed import sys
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import pandas as pd import pandas as pd
from elasticsearch_dsl import Search
from pandas.core.arrays.sparse import BlockIndex from pandas.compat import StringIO
from pandas.core import common as com
from pandas.io.common import _expand_user, _stringify_path
from pandas.io.formats import format as fmt from pandas.io.formats import format as fmt
from pandas.io.formats.printing import pprint_thing from pandas.io.formats.printing import pprint_thing
from pandas.io.formats import console
from io import StringIO import eland as ed
import sys
class DataFrame(): class DataFrame():
""" """
@ -79,26 +77,24 @@ class DataFrame():
object is created, the object is not rebuilt and so inconsistencies can occur. object is created, the object is not rebuilt and so inconsistencies can occur.
""" """
def __init__(self, def __init__(self,
client, client,
index_pattern, index_pattern,
mappings=None, mappings=None,
operations=None): index_field=None):
self.client = ed.Client(client)
self.index_pattern = index_pattern self._client = ed.Client(client)
self._index_pattern = index_pattern
# Get and persist mappings, this allows us to correctly # Get and persist mappings, this allows us to correctly
# map returned types from Elasticsearch to pandas datatypes # map returned types from Elasticsearch to pandas datatypes
if mappings is None: if mappings is None:
self.mappings = ed.Mappings(self.client, self.index_pattern) self._mappings = ed.Mappings(self._client, self._index_pattern)
else: else:
self.mappings = mappings self._mappings = mappings
# Initialise a list of 'operations' self._index = ed.Index(index_field)
# these are filters
self.operations = []
if operations is not None:
self.operations.extend(operations)
def _es_results_to_pandas(self, results): def _es_results_to_pandas(self, results):
""" """
@ -187,6 +183,7 @@ class DataFrame():
TODO - an option here is to use Elasticsearch's multi-field matching instead of pandas treatment of lists (which isn't great) TODO - an option here is to use Elasticsearch's multi-field matching instead of pandas treatment of lists (which isn't great)
NOTE - using this lists is generally not a good way to use this API NOTE - using this lists is generally not a good way to use this API
""" """
def flatten_dict(y): def flatten_dict(y):
out = {} out = {}
@ -197,7 +194,7 @@ class DataFrame():
is_source_field = False is_source_field = False
pd_dtype = 'object' pd_dtype = 'object'
else: else:
is_source_field, pd_dtype = self.mappings.source_field_pd_dtype(name[:-1]) is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(name[:-1])
if not is_source_field and type(x) is dict: if not is_source_field and type(x) is dict:
for a in x: for a in x:
@ -205,7 +202,7 @@ class DataFrame():
elif not is_source_field and type(x) is list: elif not is_source_field and type(x) is list:
for a in x: for a in x:
flatten(a, name) flatten(a, name)
elif is_source_field == True: # only print source fields from mappings (TODO - not so efficient for large number of fields and filtered mapping) elif is_source_field == True: # only print source fields from mappings (TODO - not so efficient for large number of fields and filtered mapping)
field_name = name[:-1] field_name = name[:-1]
# Coerce types - for now just datetime # Coerce types - for now just datetime
@ -227,14 +224,22 @@ class DataFrame():
return out return out
rows = [] rows = []
index = []
for hit in results['hits']['hits']: for hit in results['hits']['hits']:
row = hit['_source'] row = hit['_source']
# get index value - can be _id or can be field value in source
if self._index.is_source_field:
index_field = row[self._index.index_field]
else:
index_field = hit[self._index.index_field]
index.append(index_field)
# flatten row to map correctly to 2D DataFrame # flatten row to map correctly to 2D DataFrame
rows.append(flatten_dict(row)) rows.append(flatten_dict(row))
# Create pandas DataFrame # Create pandas DataFrame
df = pd.DataFrame(data=rows) df = pd.DataFrame(data=rows, index=index)
# _source may not contain all columns in the mapping # _source may not contain all columns in the mapping
# therefore, fill in missing columns # therefore, fill in missing columns
@ -242,7 +247,7 @@ class DataFrame():
missing_columns = list(set(self.columns) - set(df.columns)) missing_columns = list(set(self.columns) - set(df.columns))
for missing in missing_columns: for missing in missing_columns:
is_source_field, pd_dtype = self.mappings.source_field_pd_dtype(missing) is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(missing)
df[missing] = None df[missing] = None
df[missing].astype(pd_dtype) df[missing].astype(pd_dtype)
@ -252,20 +257,32 @@ class DataFrame():
return df return df
def head(self, n=5): def head(self, n=5):
results = self.client.search(index=self.index_pattern, size=n) sort_params = self._index.sort_field + ":asc"
results = self._client.search(index=self._index_pattern, size=n, sort=sort_params)
return self._es_results_to_pandas(results) return self._es_results_to_pandas(results)
def tail(self, n=5):
sort_params = self._index.sort_field + ":desc"
results = self._client.search(index=self._index_pattern, size=n, sort=sort_params)
df = self._es_results_to_pandas(results)
# reverse order (index ascending)
return df.sort_index()
def describe(self): def describe(self):
numeric_source_fields = self.mappings.numeric_source_fields() numeric_source_fields = self._mappings.numeric_source_fields()
# for each field we compute: # for each field we compute:
# count, mean, std, min, 25%, 50%, 75%, max # count, mean, std, min, 25%, 50%, 75%, max
search = Search(using=self.client, index=self.index_pattern).extra(size=0) search = Search(using=self._client, index=self._index_pattern).extra(size=0)
for field in numeric_source_fields: for field in numeric_source_fields:
search.aggs.metric('extended_stats_'+field, 'extended_stats', field=field) search.aggs.metric('extended_stats_' + field, 'extended_stats', field=field)
search.aggs.metric('percentiles_'+field, 'percentiles', field=field) search.aggs.metric('percentiles_' + field, 'percentiles', field=field)
response = search.execute() response = search.execute()
@ -273,21 +290,21 @@ class DataFrame():
for field in numeric_source_fields: for field in numeric_source_fields:
values = [] values = []
values.append(response.aggregations['extended_stats_'+field]['count']) values.append(response.aggregations['extended_stats_' + field]['count'])
values.append(response.aggregations['extended_stats_'+field]['avg']) values.append(response.aggregations['extended_stats_' + field]['avg'])
values.append(response.aggregations['extended_stats_'+field]['std_deviation']) values.append(response.aggregations['extended_stats_' + field]['std_deviation'])
values.append(response.aggregations['extended_stats_'+field]['min']) values.append(response.aggregations['extended_stats_' + field]['min'])
values.append(response.aggregations['percentiles_'+field]['values']['25.0']) values.append(response.aggregations['percentiles_' + field]['values']['25.0'])
values.append(response.aggregations['percentiles_'+field]['values']['50.0']) values.append(response.aggregations['percentiles_' + field]['values']['50.0'])
values.append(response.aggregations['percentiles_'+field]['values']['75.0']) values.append(response.aggregations['percentiles_' + field]['values']['75.0'])
values.append(response.aggregations['extended_stats_'+field]['max']) values.append(response.aggregations['extended_stats_' + field]['max'])
# if not None # if not None
if (values.count(None) < len(values)): if (values.count(None) < len(values)):
results[field] = values results[field] = values
df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']) df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
return df return df
def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
@ -305,12 +322,10 @@ class DataFrame():
if buf is None: # pragma: no cover if buf is None: # pragma: no cover
buf = sys.stdout buf = sys.stdout
fake_df = self.__fake_dataframe__()
lines = [] lines = []
lines.append(str(type(self))) lines.append(str(type(self)))
lines.append(fake_df.index._summary()) lines.append(self.index_summary())
if len(self.columns) == 0: if len(self.columns) == 0:
lines.append('Empty {name}'.format(name=type(self).__name__)) lines.append('Empty {name}'.format(name=type(self).__name__))
@ -322,7 +337,7 @@ class DataFrame():
# hack # hack
if max_cols is None: if max_cols is None:
max_cols = pd.get_option('display.max_info_columns', max_cols = pd.get_option('display.max_info_columns',
len(self.columns) + 1) len(self.columns) + 1)
max_rows = pd.get_option('display.max_info_rows', len(self) + 1) max_rows = pd.get_option('display.max_info_rows', len(self) + 1)
@ -404,7 +419,6 @@ class DataFrame():
fmt.buffer_put_lines(buf, lines) fmt.buffer_put_lines(buf, lines)
@property @property
def shape(self): def shape(self):
""" """
@ -423,14 +437,32 @@ class DataFrame():
@property @property
def columns(self): def columns(self):
return pd.Index(self.mappings.source_fields()) return pd.Index(self._mappings.source_fields())
@property
def index(self):
return self._index
def set_index(self, index_field):
copy = self.copy()
copy._index = ed.Index(index_field)
return copy
def index_summary(self):
head = self.head(1).index[0]
tail = self.tail(1).index[0]
index_summary = ', %s to %s' % (pprint_thing(head),
pprint_thing(tail))
name = "Index"
return '%s: %s entries%s' % (name, len(self), index_summary)
@property @property
def dtypes(self): def dtypes(self):
return self.mappings.dtypes() return self._mappings.dtypes()
def get_dtype_counts(self): def get_dtype_counts(self):
return self.mappings.get_dtype_counts() return self._mappings.get_dtype_counts()
def count(self): def count(self):
""" """
@ -446,63 +478,155 @@ class DataFrame():
for a single document. for a single document.
""" """
counts = {} counts = {}
for field in self.mappings.source_fields(): for field in self._mappings.source_fields():
exists_query = {"query":{"exists":{"field":field}}} exists_query = {"query": {"exists": {"field": field}}}
field_exists_count = self.client.count(index=self.index_pattern, body=exists_query) field_exists_count = self._client.count(index=self._index_pattern, body=exists_query)
counts[field] = field_exists_count counts[field] = field_exists_count
count = pd.Series(data=counts, index=self.mappings.source_fields()) count = pd.Series(data=counts, index=self._mappings.source_fields())
return count return count
def index_count(self):
"""
Returns
-------
index_count: int
Count of docs where index_field exists
"""
exists_query = {"query": {"exists": {"field": self._index.index_field}}}
def __getitem__(self, item): index_count = self._client.count(index=self._index_pattern, body=exists_query)
# df['a'] -> item == str
# df['a', 'b'] -> item == (str, str) tuple return index_count
def _filter_by_columns(self, columns):
# Return new eland.DataFrame with modified mappings
mappings = ed.Mappings(mappings=self._mappings, columns=columns)
return DataFrame(self._client, self._index_pattern, mappings=mappings)
def __getitem__(self, key):
# NOTE: there is a difference between pandas here.
# e.g. df['a'] returns pd.Series, df[['a','b']] return pd.DataFrame
# we always return DataFrame - TODO maybe create eland.Series at some point...
# Implementation mainly copied from pandas v0.24.2
# (https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html)
key = com.apply_if_callable(key, self)
# TODO - add slice capabilities - need to add index features first
# e.g. set index etc.
# Do we have a slicer (on rows)?
"""
indexer = convert_to_index_sliceable(self, key)
if indexer is not None:
return self._slice(indexer, axis=0)
# Do we have a (boolean) DataFrame?
if isinstance(key, DataFrame):
return self._getitem_frame(key)
"""
# Do we have a (boolean) 1d indexer?
"""
if com.is_bool_indexer(key):
return self._getitem_bool_array(key)
"""
# We are left with two options: a single key, and a collection of keys,
columns = [] columns = []
if isinstance(item, str): if isinstance(key, str):
if not self.mappings.is_source_field(item): if not self._mappings.is_source_field(key):
raise TypeError('Column does not exist: [{0}]'.format(item)) raise TypeError('Column does not exist: [{0}]'.format(key))
columns.append(item) columns.append(key)
elif isinstance(item, tuple): elif isinstance(key, list):
columns.extend(list(item)) columns.extend(key)
elif isinstance(item, list):
columns.extend(item)
if len(columns) > 0:
# Return new eland.DataFrame with modified mappings
mappings = ed.Mappings(mappings=self.mappings, columns=columns)
return DataFrame(self.client, self.index_pattern, mappings=mappings)
"""
elif isinstance(item, BooleanFilter):
self._filter = item.build()
return self
else: else:
raise TypeError('Unsupported expr: [{0}]'.format(item)) raise TypeError('__getitem__ arguments invalid: [{0}]'.format(key))
"""
return self._filter_by_columns(columns)
def __len__(self): def __len__(self):
""" """
Returns length of info axis, but here we use the index. Returns length of info axis, but here we use the index.
""" """
return self.client.count(index=self.index_pattern) return self._client.count(index=self._index_pattern)
def copy(self):
# TODO - test and validate...may need deep copying
return ed.DataFrame(self._client,
self._index_pattern,
self._mappings,
self._index)
# ---------------------------------------------------------------------- # ----------------------------------------------------------------------
# Rendering Methods # Rendering Methods
def __repr__(self): def __repr__(self):
""" """
Return a string representation for a particular DataFrame. From pandas
""" """
return self.to_string() buf = StringIO()
max_rows = pd.get_option("display.max_rows")
max_cols = pd.get_option("display.max_columns")
show_dimensions = pd.get_option("display.show_dimensions")
if pd.get_option("display.expand_frame_repr"):
width, _ = console.get_console_size()
else:
width = None
self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
line_width=width, show_dimensions=show_dimensions)
return buf.getvalue()
def to_string(self, buf=None, columns=None, col_space=None, header=True,
index=True, na_rep='NaN', formatters=None, float_format=None,
sparsify=None, index_names=True, justify=None,
max_rows=None, max_cols=None, show_dimensions=True,
decimal='.', line_width=None):
"""
From pandas
"""
if max_rows == None:
max_rows = pd.get_option('display.max_rows')
sdf = self.__fake_dataframe__(max_rows=max_rows+1)
_show_dimensions = show_dimensions
if buf is not None:
_buf = _expand_user(_stringify_path(buf))
else:
_buf = StringIO()
sdf.to_string(buf=_buf, columns=columns,
col_space=col_space, na_rep=na_rep,
formatters=formatters,
float_format=float_format,
sparsify=sparsify, justify=justify,
index_names=index_names,
header=header, index=index,
max_rows=max_rows,
max_cols=max_cols,
show_dimensions=False, # print this outside of this call
decimal=decimal,
line_width=line_width)
if _show_dimensions:
_buf.write("\n\n[{nrows} rows x {ncols} columns]"
.format(nrows=self.index_count(), ncols=len(self.columns)))
if buf is None:
result = _buf.getvalue()
return result
def __fake_dataframe__(self, max_rows=1): def __fake_dataframe__(self, max_rows=1):
head_rows = max_rows / 2 + 1 head_rows = int(max_rows / 2) + max_rows % 2
tail_rows = max_rows - head_rows tail_rows = max_rows - head_rows
head = self.head(max_rows) head = self.head(head_rows)
tail = self.tail(tail_rows)
num_rows = len(self) num_rows = len(self)
@ -514,8 +638,9 @@ class DataFrame():
# to use the pandas IO methods. # to use the pandas IO methods.
# TODO - if data is indexed by time series, return top/bottom of # TODO - if data is indexed by time series, return top/bottom of
# time series, rather than first max_rows items # time series, rather than first max_rows items
"""
if tail_rows > 0: if tail_rows > 0:
locations = [0, num_rows-tail_rows] locations = [0, num_rows - tail_rows]
lengths = [head_rows, tail_rows] lengths = [head_rows, tail_rows]
else: else:
locations = [0] locations = [0]
@ -526,21 +651,13 @@ class DataFrame():
BlockIndex( BlockIndex(
num_rows, locations, lengths)) num_rows, locations, lengths))
for item in self.columns}) for item in self.columns})
"""
return sdf return pd.concat([head, tail])
return head
def to_string(self): return pd.concat([head, tail])
# TODO - this doesn't return 'notebook' friendly results yet..
# TODO - don't hard code max_rows - use pandas default/ES default
max_rows = 60
df = self.__fake_dataframe__(max_rows=max_rows)
return df.to_string(max_rows=max_rows, show_dimensions=True)
# From pandas.DataFrame
def _put_str(s, space): def _put_str(s, space):
return '{s}'.format(s=s)[:space].ljust(space) return '{s}'.format(s=s)[:space].ljust(space)

46
eland/index.py Normal file
View File

@ -0,0 +1,46 @@
"""
class Index
The index for an eland.DataFrame.
Currently, the index is a field that exists in every document in an Elasticsearch index.
For slicing and sorting operations it must be a docvalues field. By default _id is used,
which can't be used for range queries and is inefficient for sorting:
https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-id-field.html
(The value of the _id field is also accessible in aggregations or for sorting,
but doing so is discouraged as it requires to load a lot of data in memory.
In case sorting or aggregating on the _id field is required, it is advised to duplicate
the content of the _id field in another field that has doc_values enabled.)
"""
class Index:
ID_INDEX_FIELD = '_id'
ID_SORT_FIELD = '_doc' # if index field is _id, sort by _doc
def __init__(self, index_field=None):
# Calls setter
self.index_field = index_field
@property
def sort_field(self):
if self._index_field == self.ID_INDEX_FIELD:
return self.ID_SORT_FIELD
return self._index_field
@property
def is_source_field(self):
return self._is_source_field
@property
def index_field(self):
return self._index_field
@index_field.setter
def index_field(self, index_field):
if index_field == None:
self._index_field = Index.ID_INDEX_FIELD
self._is_source_field = False
else:
self._index_field = index_field
self._is_source_field = True
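A short usage sketch of the new index support follows; a local cluster loaded with the Kibana `flights`
sample data is assumed, and any other doc_values-enabled field would work just as well as `timestamp`.

```python
import eland as ed

# By default the DataFrame is indexed by _id, and head()/tail() sort on _doc
ed_flights = ed.read_es('localhost', 'flights')
print(ed_flights.head())

# Re-index on a doc_values field so head()/tail() sort on that field instead
ed_flights_ts = ed_flights.set_index('timestamp')
print(ed_flights_ts.tail())
```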

View File

@ -64,18 +64,22 @@ class Mappings():
# Populate capability matrix of fields # Populate capability matrix of fields
# field_name, es_dtype, pd_dtype, is_searchable, is_aggregtable, is_source # field_name, es_dtype, pd_dtype, is_searchable, is_aggregtable, is_source
self.mappings_capabilities = Mappings._create_capability_matrix(all_fields, source_fields, all_fields_caps) self._mappings_capabilities = Mappings._create_capability_matrix(all_fields, source_fields, all_fields_caps)
else: else:
# Reference object and restrict mapping columns if columns is not None:
self.mappings_capabilities = mappings.mappings_capabilities.loc[columns] # Reference object and restrict mapping columns
self._mappings_capabilities = mappings._mappings_capabilities.loc[columns]
else:
# straight copy
self._mappings_capabilities = mappings._mappings_capabilities.copy()
# Cache source field types for efficient lookup # Cache source field types for efficient lookup
# (this massively improves performance of DataFrame.flatten) # (this massively improves performance of DataFrame.flatten)
self.source_field_pd_dtypes = {} self._source_field_pd_dtypes = {}
for field_name in self.mappings_capabilities[self.mappings_capabilities._source == True].index: for field_name in self._mappings_capabilities[self._mappings_capabilities._source == True].index:
pd_dtype = self.mappings_capabilities.loc[field_name]['pd_dtype'] pd_dtype = self._mappings_capabilities.loc[field_name]['pd_dtype']
self.source_field_pd_dtypes[field_name] = pd_dtype self._source_field_pd_dtypes[field_name] = pd_dtype
def _extract_fields_from_mapping(mappings, source_only=False): def _extract_fields_from_mapping(mappings, source_only=False):
""" """
@ -262,24 +266,29 @@ class Mappings():
all_fields: list all_fields: list
All typed fields in the index mapping All typed fields in the index mapping
""" """
return self.mappings_capabilities.index.tolist() return self._mappings_capabilities.index.tolist()
def field_capabilities(self, field_name):
"""
Parameters
----------
field_name: str
"""
def pd_dtypes_groupby_source_fields(self):
Returns Returns
------- -------
groups: dict mappings_capabilities: pd.Series with index values:
Calls pandas.core.groupby.GroupBy.groups for _source fields _source: bool
E.g. Is this field name a top-level source field?
{ ed_dtype: str
'bool': Index(['Cancelled', 'FlightDelay'], dtype='object'), The Elasticsearch data type
'datetime64[ns]': Index(['timestamp'], dtype='object'), pd_dtype: str
'float64': Index(['AvgTicketPrice', 'DistanceKilometers', 'DistanceMiles',... The pandas data type
} searchable: bool
return self.mappings_capabilities[self.mappings_capabilities._source == True].groupby('pd_dtype').groups Is the field searchable in Elasticsearch?
aggregatable: bool
def pd_dtype Is the field aggregatable in Elasticsearch?
""" """
return self._mappings_capabilities.loc[field_name]
def source_field_pd_dtype(self, field_name): def source_field_pd_dtype(self, field_name):
""" """
@ -297,9 +306,9 @@ class Mappings():
pd_dtype = 'object' pd_dtype = 'object'
is_source_field = False is_source_field = False
if field_name in self.source_field_pd_dtypes: if field_name in self._source_field_pd_dtypes:
is_source_field = True is_source_field = True
pd_dtype = self.source_field_pd_dtypes[field_name] pd_dtype = self._source_field_pd_dtypes[field_name]
return is_source_field, pd_dtype return is_source_field, pd_dtype
@ -316,7 +325,7 @@ class Mappings():
""" """
is_source_field = False is_source_field = False
if field_name in self.source_field_pd_dtypes: if field_name in self._source_field_pd_dtypes:
is_source_field = True is_source_field = True
return is_source_field return is_source_field
@ -328,9 +337,9 @@ class Mappings():
numeric_source_fields: list of str numeric_source_fields: list of str
List of source fields where pd_dtype == (int64 or float64) List of source fields where pd_dtype == (int64 or float64)
""" """
return self.mappings_capabilities[(self.mappings_capabilities._source == True) & return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self.mappings_capabilities.pd_dtype == 'int64') | ((self._mappings_capabilities.pd_dtype == 'int64') |
(self.mappings_capabilities.pd_dtype == 'float64'))].index.tolist() (self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
def source_fields(self): def source_fields(self):
""" """
@ -339,7 +348,7 @@ class Mappings():
source_fields: list of str source_fields: list of str
List of source fields List of source fields
""" """
return self.source_field_pd_dtypes.keys() return self._source_field_pd_dtypes.keys()
def count_source_fields(self): def count_source_fields(self):
""" """
@ -357,7 +366,7 @@ class Mappings():
dtypes: pd.Series dtypes: pd.Series
Source field name + pd_dtype Source field name + pd_dtype
""" """
return pd.Series(self.source_field_pd_dtypes) return pd.Series(self._source_field_pd_dtypes)
def get_dtype_counts(self): def get_dtype_counts(self):
""" """
@ -368,5 +377,5 @@ class Mappings():
get_dtype_counts : Series get_dtype_counts : Series
Series with the count of columns with each dtype. Series with the count of columns with each dtype.
""" """
return pd.Series(self.mappings_capabilities[self.mappings_capabilities._source == True].groupby('pd_dtype')[ return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True].groupby('pd_dtype')[
'_source'].count().to_dict()) '_source'].count().to_dict())

View File

@ -69,3 +69,22 @@ class TestMapping(TestData):
expected_get_dtype_counts = pd.Series({'datetime64[ns]': 1, 'float64': 1, 'int64': 5, 'object': 11}) expected_get_dtype_counts = pd.Series({'datetime64[ns]': 1, 'float64': 1, 'int64': 5, 'object': 11})
assert_series_equal(expected_get_dtype_counts, mappings.get_dtype_counts()) assert_series_equal(expected_get_dtype_counts, mappings.get_dtype_counts())
def test_mapping_capabilities(self):
mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
field_capabilities = mappings.field_capabilities('city')
assert True == field_capabilities['_source']
assert 'text' == field_capabilities['es_dtype']
assert 'object' == field_capabilities['pd_dtype']
assert True == field_capabilities['searchable']
assert False == field_capabilities['aggregatable']
field_capabilities = mappings.field_capabilities('city.raw')
assert False == field_capabilities['_source']
assert 'keyword' == field_capabilities['es_dtype']
assert 'object' == field_capabilities['pd_dtype']
assert True == field_capabilities['searchable']
assert True == field_capabilities['aggregatable']

View File

@ -16,6 +16,7 @@ from eland.tests import FLIGHTS_DF_FILE_NAME, FLIGHTS_INDEX_NAME,\
_pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index() _pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index()
_pd_flights['timestamp'] = \ _pd_flights['timestamp'] = \
pd.to_datetime(_pd_flights['timestamp']) pd.to_datetime(_pd_flights['timestamp'])
_pd_flights.index = _pd_flights.index.map(str) # make index 'object' not int
_ed_flights = ed.read_es(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME) _ed_flights = ed.read_es(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME)
_pd_ecommerce = pd.read_json(ECOMMERCE_DF_FILE_NAME).sort_index() _pd_ecommerce = pd.read_json(ECOMMERCE_DF_FILE_NAME).sort_index()
@ -24,6 +25,7 @@ _pd_ecommerce['order_date'] = \
_pd_ecommerce['products.created_on'] = \ _pd_ecommerce['products.created_on'] = \
_pd_ecommerce['products.created_on'].apply(lambda x: pd.to_datetime(x)) _pd_ecommerce['products.created_on'].apply(lambda x: pd.to_datetime(x))
_pd_ecommerce.insert(2, 'customer_birth_date', None) _pd_ecommerce.insert(2, 'customer_birth_date', None)
_pd_ecommerce.index = _pd_ecommerce.index.map(str) # make index 'object' not int
_pd_ecommerce['customer_birth_date'].astype('datetime64') _pd_ecommerce['customer_birth_date'].astype('datetime64')
_ed_ecommerce = ed.read_es(ELASTICSEARCH_HOST, ECOMMERCE_INDEX_NAME) _ed_ecommerce = ed.read_es(ELASTICSEARCH_HOST, ECOMMERCE_INDEX_NAME)

View File

@ -0,0 +1,63 @@
# File called _pytest for PyCharm compatibility
from eland.tests.common import TestData
import pandas as pd
import io
from pandas.util.testing import (
assert_series_equal, assert_frame_equal)
class TestDataFrameGetItem(TestData):
def test_getitem_basic(self):
# Test 1 attribute
pd_carrier = self.pd_flights()['Carrier']
ed_carrier = self.ed_flights()['Carrier']
# pandas returns a Series here
assert_frame_equal(pd.DataFrame(pd_carrier.head(100)), ed_carrier.head(100))
pd_3_items = self.pd_flights()[['Dest','Carrier','FlightDelay']]
ed_3_items = self.ed_flights()[['Dest','Carrier','FlightDelay']]
assert_frame_equal(pd_3_items.head(100), ed_3_items.head(100))
# Test numerics
numerics = ['DistanceMiles', 'AvgTicketPrice', 'FlightTimeMin']
ed_numerics = self.ed_flights()[numerics]
pd_numerics = self.pd_flights()[numerics]
assert_frame_equal(pd_numerics.head(100), ed_numerics.head(100))
# just test headers
ed_numerics_describe = ed_numerics.describe()
assert ed_numerics_describe.columns.tolist() == numerics
def test_getattr_basic(self):
# Test 1 attribute
pd_carrier = self.pd_flights().Carrier
#ed_carrier = self.ed_flights().Carrier
print(type(pd_carrier))
print(pd_carrier)
def test_boolean(self):
# Test 1 attribute
pd_carrier = self.pd_flights()['Carrier == "Kibana Airlines"']
#ed_carrier = self.ed_flights().Carrier
print(type(pd_carrier))
print(pd_carrier)
def test_loc(self):
pd = self.pd_flights().loc[10:15, ['Dest', 'Carrier']]
print(type(pd))
print(pd)
pd = self.pd_flights().loc[10]
print(type(pd))
print(pd)

View File

@ -10,8 +10,8 @@ from pandas.util.testing import (
class TestDataFrameIndexing(TestData): class TestDataFrameIndexing(TestData):
def test_mapping(self): def test_mapping(self):
ed_flights_mappings = pd.DataFrame(self.ed_flights().mappings.mappings_capabilities ed_flights_mappings = pd.DataFrame(self.ed_flights()._mappings._mappings_capabilities
[self.ed_flights().mappings.mappings_capabilities._source==True] [self.ed_flights()._mappings._mappings_capabilities._source==True]
['pd_dtype']) ['pd_dtype'])
pd_flights_mappings = pd.DataFrame(self.pd_flights().dtypes, columns = ['pd_dtype']) pd_flights_mappings = pd.DataFrame(self.pd_flights().dtypes, columns = ['pd_dtype'])
@ -25,6 +25,8 @@ class TestDataFrameIndexing(TestData):
pd_flights_head = self.pd_flights().head() pd_flights_head = self.pd_flights().head()
ed_flights_head = self.ed_flights().head() ed_flights_head = self.ed_flights().head()
print(ed_flights_head)
assert_frame_equal(pd_flights_head, ed_flights_head) assert_frame_equal(pd_flights_head, ed_flights_head)
pd_ecommerce_head = self.pd_ecommerce().head() pd_ecommerce_head = self.pd_ecommerce().head()
@ -32,10 +34,25 @@ class TestDataFrameIndexing(TestData):
assert_frame_equal(pd_ecommerce_head, ed_ecommerce_head) assert_frame_equal(pd_ecommerce_head, ed_ecommerce_head)
def test_tail(self):
pd_flights_tail = self.pd_flights().tail()
ed_flights_tail = self.ed_flights().tail()
print(ed_flights_tail)
assert_frame_equal(pd_flights_tail, ed_flights_tail)
pd_ecommerce_tail = self.pd_ecommerce().tail()
ed_ecommerce_tail = self.ed_ecommerce().tail()
assert_frame_equal(pd_ecommerce_tail, ed_ecommerce_tail)
def test_describe(self): def test_describe(self):
pd_flights_describe = self.pd_flights().describe() pd_flights_describe = self.pd_flights().describe()
ed_flights_describe = self.ed_flights().describe() ed_flights_describe = self.ed_flights().describe()
print(ed_flights_describe)
# TODO - this fails now as ES aggregations are approximate # TODO - this fails now as ES aggregations are approximate
# if ES percentile agg uses # if ES percentile agg uses
# "hdr": { # "hdr": {
@ -47,6 +64,8 @@ class TestDataFrameIndexing(TestData):
pd_ecommerce_describe = self.pd_ecommerce().describe() pd_ecommerce_describe = self.pd_ecommerce().describe()
ed_ecommerce_describe = self.ed_ecommerce().describe() ed_ecommerce_describe = self.ed_ecommerce().describe()
print(ed_ecommerce_describe)
# We don't compare ecommerce here as the default dtypes in pandas from read_json # We don't compare ecommerce here as the default dtypes in pandas from read_json
# don't match the mapping types. This is mainly because the products field is # don't match the mapping types. This is mainly because the products field is
# nested and so can be treated as a multi-field in ES, but not in pandas # nested and so can be treated as a multi-field in ES, but not in pandas
@ -57,52 +76,7 @@ class TestDataFrameIndexing(TestData):
def test_to_string(self): def test_to_string(self):
print(self.ed_flights()) print(self.ed_flights())
print(self.ed_flights().to_string())
def test_getitem(self):
# Test 1 attribute
ed_carrier = self.ed_flights()['Carrier']
carrier_head = ed_carrier.head(5)
carrier_head_expected = pd.DataFrame(
{'Carrier':[
'Kibana Airlines',
'Logstash Airways',
'Logstash Airways',
'Kibana Airlines',
'Kibana Airlines'
]})
assert_frame_equal(carrier_head_expected, carrier_head)
#carrier_to_string = ed_carrier.to_string()
#print(carrier_to_string)
# Test multiple attributes (out of order)
ed_3_items = self.ed_flights()['Dest','Carrier','FlightDelay']
ed_3_items_head = ed_3_items.head(5)
ed_3_items_expected = pd.DataFrame(dict(
Dest={0: 'Sydney Kingsford Smith International Airport', 1: 'Venice Marco Polo Airport',
2: 'Venice Marco Polo Airport', 3: "Treviso-Sant'Angelo Airport",
4: "Xi'an Xianyang International Airport"},
Carrier={0: 'Kibana Airlines', 1: 'Logstash Airways', 2: 'Logstash Airways', 3: 'Kibana Airlines',
4: 'Kibana Airlines'},
FlightDelay={0: False, 1: False, 2: False, 3: True, 4: False}))
assert_frame_equal(ed_3_items_expected, ed_3_items_head)
#ed_3_items_to_string = ed_3_items.to_string()
#print(ed_3_items_to_string)
# Test numerics
numerics = ['DistanceMiles', 'AvgTicketPrice', 'FlightTimeMin']
ed_numerics = self.ed_flights()[numerics]
# just test headers
ed_numerics_describe = ed_numerics.describe()
assert ed_numerics_describe.columns.tolist() == numerics
def test_info(self): def test_info(self):
ed_flights_info_buf = io.StringIO() ed_flights_info_buf = io.StringIO()
@ -111,6 +85,8 @@ class TestDataFrameIndexing(TestData):
self.ed_flights().info(buf=ed_flights_info_buf) self.ed_flights().info(buf=ed_flights_info_buf)
self.pd_flights().info(buf=pd_flights_info_buf) self.pd_flights().info(buf=pd_flights_info_buf)
print(ed_flights_info_buf.getvalue())
ed_flights_info = (ed_flights_info_buf.getvalue().splitlines()) ed_flights_info = (ed_flights_info_buf.getvalue().splitlines())
pd_flights_info = (pd_flights_info_buf.getvalue().splitlines()) pd_flights_info = (pd_flights_info_buf.getvalue().splitlines())
@ -148,7 +124,7 @@ class TestDataFrameIndexing(TestData):
assert_series_equal(pd_flights_get_dtype_counts, ed_flights_get_dtype_counts) assert_series_equal(pd_flights_get_dtype_counts, ed_flights_get_dtype_counts)
def test_properties(self): def test_get_properties(self):
pd_flights_shape = self.pd_flights().shape pd_flights_shape = self.pd_flights().shape
ed_flights_shape = self.ed_flights().shape ed_flights_shape = self.ed_flights().shape
@ -164,3 +140,16 @@ class TestDataFrameIndexing(TestData):
assert_series_equal(pd_flights_dtypes, ed_flights_dtypes) assert_series_equal(pd_flights_dtypes, ed_flights_dtypes)
def test_index(self):
pd_flights = self.pd_flights()
pd_flights_timestamp = pd_flights.set_index('timestamp')
pd_flights.info()
pd_flights_timestamp.info()
pd_flights.info()
ed_flights = self.ed_flights()
ed_flights_timestamp = ed_flights.set_index('timestamp')
ed_flights.info()
ed_flights_timestamp.info()
ed_flights.info()

View File

@ -33,7 +33,8 @@ def _setup_data(es):
# make timestamp datetime 2018-01-01T12:09:35 # make timestamp datetime 2018-01-01T12:09:35
#values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S') #values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S')
action = {'_index': index_name, '_source': values} # Use integer as id field for repeatable results
action = {'_index': index_name, '_source': values, '_id': str(n)}
actions.append(action) actions.append(action)

File diff suppressed because it is too large