Mirror of https://github.com/elastic/eland.git (synced 2025-07-11 00:02:14 +08:00)

Merge pull request #13 from stevedodson/master

Adding eland.Index feature

Commit 99279724f6
NOTES.md (new file, 58 lines)
@@ -0,0 +1,58 @@
# Implementation Notes

The goal of an `eland.DataFrame` is to enable users who are familiar with `pandas.DataFrame`
to access, explore and manipulate data that resides in Elasticsearch.

Ideally, all data should reside in Elasticsearch and not in memory.
This restricts the API, but allows access to huge data sets that do not fit into memory, and allows
use of powerful Elasticsearch features such as aggregations.

## Implementation Details

### 3rd Party System Access

Generally, integrations with [3rd party storage systems](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html)
(SQL, Google Big Query etc.) involve accessing these systems and reading all external data into an
in-core pandas data structure. This also applies to [Apache Arrow](https://arrow.apache.org/docs/python/pandas.html)
structures.

Whilst this provides access to data in these systems, for large datasets this can require significant
in-core memory, and for systems such as Elasticsearch, bulk export of data can be an inefficient way
of exploring the data.

An alternative option is to create an API that proxies `pandas.DataFrame`-like calls to Elasticsearch
queries and operations. This could allow the Elasticsearch cluster to perform operations such as
aggregations rather than exporting all the data and performing this operation in-core.

### Implementation Options

One option would be to replace the `pandas.DataFrame` backend in-core memory structures with Elasticsearch
accessors. This would allow full access to the `pandas.DataFrame` APIs. However, this has issues:

* If a `pandas.DataFrame` instance maps to an index, typical manipulation of a `pandas.DataFrame`
may involve creating many derived `pandas.DataFrame` instances. Constructing an index per
`pandas.DataFrame` may result in many Elasticsearch indices and a significant load on Elasticsearch.
For example, `df_a = df['a']` should not require Elasticsearch indices `df` and `df_a`.

* Not all `pandas.DataFrame` APIs map to things we may want to do in Elasticsearch. In particular,
API calls that involve exporting all data from Elasticsearch into memory (e.g. `df.to_dict()`).

* The backend `pandas.DataFrame` structures are not easily abstractable and are deeply embedded in
the implementation.

Another option is to create an `eland.DataFrame` API that mimics appropriate aspects of
the `pandas.DataFrame` API. This resolves some of the issues above:

* `df_a = df['a']` can be implemented as a change to the Elasticsearch query used, rather
than as a new index.

* Instead of supporting the entire `pandas.DataFrame` API, we can support a subset appropriate for
Elasticsearch. If additional calls are required, we could create an `eland.DataFrame.to_pandas()`
method which would explicitly export all data to a `pandas.DataFrame`.

* Creating a new `eland.DataFrame` API gives us full flexibility in terms of implementation. However,
it does create a large amount of work which may duplicate a lot of the `pandas` code (for example,
printing objects), and this creates maintenance issues.
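To make the proxy model concrete, here is a minimal usage sketch based on the API exercised in the tests below (`ed.read_es` takes an Elasticsearch host and an index pattern; the host and index names here are placeholders):

```python
import eland as ed

# Connect a DataFrame-like proxy to an existing Elasticsearch index;
# no documents are fetched at construction time.
df = ed.read_es('localhost', 'flights')

# Column selection only restricts the mappings used by later queries;
# it does not create a new Elasticsearch index.
df_subset = df[['Carrier', 'AvgTicketPrice']]

# These calls are translated into Elasticsearch requests: head/tail
# become sorted searches, describe becomes a set of aggregations.
print(df_subset.head(5))
print(df_subset.describe())
```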
eland/__init__.py

@@ -1,4 +1,5 @@
from .utils import *
from .frame import *
from .dataframe import *
from .client import *
from .mappings import *
from .index import *
eland/dataframe.py

@@ -23,21 +23,19 @@ Similarly, only Elasticsearch searchable fields can be searched or filtered, and
only Elasticsearch aggregatable fields can be aggregated or grouped.

"""
import eland as ed

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import sys

import pandas as pd

from pandas.core.arrays.sparse import BlockIndex

from elasticsearch_dsl import Search
from pandas.compat import StringIO
from pandas.core import common as com
from pandas.io.common import _expand_user, _stringify_path
from pandas.io.formats import format as fmt
from pandas.io.formats.printing import pprint_thing
from pandas.io.formats import console

from io import StringIO
import eland as ed

import sys

class DataFrame():
    """
@@ -79,26 +77,24 @@ class DataFrame():
    object is created, the object is not rebuilt and so inconsistencies can occur.

    """

    def __init__(self,
                 client,
                 index_pattern,
                 mappings=None,
                 operations=None):
        self.client = ed.Client(client)
        self.index_pattern = index_pattern
                 index_field=None):

        self._client = ed.Client(client)
        self._index_pattern = index_pattern

        # Get and persist mappings; this allows us to correctly
        # map returned types from Elasticsearch to pandas datatypes
        if mappings is None:
            self.mappings = ed.Mappings(self.client, self.index_pattern)
            self._mappings = ed.Mappings(self._client, self._index_pattern)
        else:
            self.mappings = mappings
            self._mappings = mappings

        # Initialise a list of 'operations'
        # these are filters
        self.operations = []
        if operations is not None:
            self.operations.extend(operations)
        self._index = ed.Index(index_field)

    def _es_results_to_pandas(self, results):
        """
@@ -187,6 +183,7 @@ class DataFrame():
        TODO - an option here is to use Elasticsearch's multi-field matching instead of pandas treatment of lists (which isn't great)
        NOTE - using lists like this is generally not a good way to use this API
        """

        def flatten_dict(y):
            out = {}

@@ -197,7 +194,7 @@ class DataFrame():
                    is_source_field = False
                    pd_dtype = 'object'
                else:
                    is_source_field, pd_dtype = self.mappings.source_field_pd_dtype(name[:-1])
                    is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(name[:-1])

                if not is_source_field and type(x) is dict:
                    for a in x:
@@ -205,7 +202,7 @@ class DataFrame():
                elif not is_source_field and type(x) is list:
                    for a in x:
                        flatten(a, name)
                elif is_source_field == True:  # only print source fields from mappings (TODO - not so efficient for large number of fields and filtered mapping)
                elif is_source_field == True:  # only print source fields from mappings (TODO - not so efficient for large number of fields and filtered mapping)
                    field_name = name[:-1]

                    # Coerce types - for now just datetime
@@ -227,14 +224,22 @@ class DataFrame():
            return out

        rows = []
        index = []
        for hit in results['hits']['hits']:
            row = hit['_source']

            # get index value - can be _id or can be field value in source
            if self._index.is_source_field:
                index_field = row[self._index.index_field]
            else:
                index_field = hit[self._index.index_field]
            index.append(index_field)

            # flatten row to map correctly to 2D DataFrame
            rows.append(flatten_dict(row))

        # Create pandas DataFrame
        df = pd.DataFrame(data=rows)
        df = pd.DataFrame(data=rows, index=index)

        # _source may not contain all columns in the mapping
        # therefore, fill in missing columns
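For reference, `_es_results_to_pandas` walks the standard Elasticsearch hits structure. A small sketch of the index-extraction branch above, using a hypothetical hit:

```python
# A typical search hit (illustrative values).
hit = {
    '_index': 'flights',
    '_id': '0',
    '_source': {'Carrier': 'Kibana Airlines', 'timestamp': '2018-01-01T00:00:00'},
}

# With the default index field ('_id') the value comes from the hit itself...
index_value = hit['_id']  # -> '0'

# ...whereas after set_index('timestamp') it comes from the document source.
index_value = hit['_source']['timestamp']  # -> '2018-01-01T00:00:00'
```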
@@ -242,7 +247,7 @@ class DataFrame():
        missing_columns = list(set(self.columns) - set(df.columns))

        for missing in missing_columns:
            is_source_field, pd_dtype = self.mappings.source_field_pd_dtype(missing)
            is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(missing)
            df[missing] = None
            df[missing].astype(pd_dtype)
@@ -252,20 +257,32 @@ class DataFrame():
        return df

    def head(self, n=5):
        results = self.client.search(index=self.index_pattern, size=n)
        sort_params = self._index.sort_field + ":asc"

        results = self._client.search(index=self._index_pattern, size=n, sort=sort_params)

        return self._es_results_to_pandas(results)

    def tail(self, n=5):
        sort_params = self._index.sort_field + ":desc"

        results = self._client.search(index=self._index_pattern, size=n, sort=sort_params)

        df = self._es_results_to_pandas(results)

        # reverse order (index ascending)
        return df.sort_index()

    def describe(self):
        numeric_source_fields = self.mappings.numeric_source_fields()
        numeric_source_fields = self._mappings.numeric_source_fields()

        # for each field we compute:
        # count, mean, std, min, 25%, 50%, 75%, max
        search = Search(using=self.client, index=self.index_pattern).extra(size=0)
        search = Search(using=self._client, index=self._index_pattern).extra(size=0)

        for field in numeric_source_fields:
            search.aggs.metric('extended_stats_'+field, 'extended_stats', field=field)
            search.aggs.metric('percentiles_'+field, 'percentiles', field=field)
            search.aggs.metric('extended_stats_' + field, 'extended_stats', field=field)
            search.aggs.metric('percentiles_' + field, 'percentiles', field=field)

        response = search.execute()

@@ -273,21 +290,21 @@ class DataFrame():

        for field in numeric_source_fields:
            values = []
            values.append(response.aggregations['extended_stats_'+field]['count'])
            values.append(response.aggregations['extended_stats_'+field]['avg'])
            values.append(response.aggregations['extended_stats_'+field]['std_deviation'])
            values.append(response.aggregations['extended_stats_'+field]['min'])
            values.append(response.aggregations['percentiles_'+field]['values']['25.0'])
            values.append(response.aggregations['percentiles_'+field]['values']['50.0'])
            values.append(response.aggregations['percentiles_'+field]['values']['75.0'])
            values.append(response.aggregations['extended_stats_'+field]['max'])

            values.append(response.aggregations['extended_stats_' + field]['count'])
            values.append(response.aggregations['extended_stats_' + field]['avg'])
            values.append(response.aggregations['extended_stats_' + field]['std_deviation'])
            values.append(response.aggregations['extended_stats_' + field]['min'])
            values.append(response.aggregations['percentiles_' + field]['values']['25.0'])
            values.append(response.aggregations['percentiles_' + field]['values']['50.0'])
            values.append(response.aggregations['percentiles_' + field]['values']['75.0'])
            values.append(response.aggregations['extended_stats_' + field]['max'])

            # if not None
            if (values.count(None) < len(values)):
                results[field] = values

        df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])

        return df

    def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
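The `describe()` implementation above issues a single search with `size=0`. For orientation, the request body those `elasticsearch_dsl` calls build looks roughly like this for one field (`AvgTicketPrice` is illustrative; the real body repeats the pair for every numeric source field):

```python
body = {
    "size": 0,  # aggregations only, no hits
    "aggs": {
        # count, mean, std, min and max come from extended_stats...
        "extended_stats_AvgTicketPrice": {
            "extended_stats": {"field": "AvgTicketPrice"}
        },
        # ...and the quartiles from percentiles, whose default percents
        # include 25, 50 and 75.
        "percentiles_AvgTicketPrice": {
            "percentiles": {"field": "AvgTicketPrice"}
        },
    },
}
```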
@@ -305,12 +322,10 @@ class DataFrame():
        if buf is None:  # pragma: no cover
            buf = sys.stdout

        fake_df = self.__fake_dataframe__()

        lines = []

        lines.append(str(type(self)))
        lines.append(fake_df.index._summary())
        lines.append(self.index_summary())

        if len(self.columns) == 0:
            lines.append('Empty {name}'.format(name=type(self).__name__))
@@ -322,7 +337,7 @@ class DataFrame():
        # hack
        if max_cols is None:
            max_cols = pd.get_option('display.max_info_columns',
                len(self.columns) + 1)
                                     len(self.columns) + 1)

        max_rows = pd.get_option('display.max_info_rows', len(self) + 1)

@@ -404,7 +419,6 @@ class DataFrame():

        fmt.buffer_put_lines(buf, lines)

    @property
    def shape(self):
        """
@@ -423,14 +437,32 @@ class DataFrame():

    @property
    def columns(self):
        return pd.Index(self.mappings.source_fields())
        return pd.Index(self._mappings.source_fields())

    @property
    def index(self):
        return self._index

    def set_index(self, index_field):
        copy = self.copy()
        copy._index = ed.Index(index_field)
        return copy

    def index_summary(self):
        head = self.head(1).index[0]
        tail = self.tail(1).index[0]
        index_summary = ', %s to %s' % (pprint_thing(head),
                                        pprint_thing(tail))

        name = "Index"
        return '%s: %s entries%s' % (name, len(self), index_summary)

    @property
    def dtypes(self):
        return self.mappings.dtypes()
        return self._mappings.dtypes()

    def get_dtype_counts(self):
        return self.mappings.get_dtype_counts()
        return self._mappings.get_dtype_counts()

    def count(self):
        """
@@ -446,63 +478,155 @@ class DataFrame():
        for a single document.
        """
        counts = {}
        for field in self.mappings.source_fields():
            exists_query = {"query":{"exists":{"field":field}}}
            field_exists_count = self.client.count(index=self.index_pattern, body=exists_query)
        for field in self._mappings.source_fields():
            exists_query = {"query": {"exists": {"field": field}}}
            field_exists_count = self._client.count(index=self._index_pattern, body=exists_query)
            counts[field] = field_exists_count

        count = pd.Series(data=counts, index=self.mappings.source_fields())
        count = pd.Series(data=counts, index=self._mappings.source_fields())

        return count

    def index_count(self):
        """
        Returns
        -------
        index_count: int
            Count of docs where index_field exists
        """
        exists_query = {"query": {"exists": {"field": self._index.index_field}}}

        index_count = self._client.count(index=self._index_pattern, body=exists_query)

        return index_count

    def _filter_by_columns(self, columns):
        # Return new eland.DataFrame with modified mappings
        mappings = ed.Mappings(mappings=self._mappings, columns=columns)

        return DataFrame(self._client, self._index_pattern, mappings=mappings)

    def __getitem__(self, item):
        # df['a'] -> item == str
        # df['a', 'b'] -> item == (str, str) tuple
        columns = []
        if isinstance(item, str):
            if not self.mappings.is_source_field(item):
                raise TypeError('Column does not exist: [{0}]'.format(item))
            columns.append(item)
        elif isinstance(item, tuple):
            columns.extend(list(item))
        elif isinstance(item, list):
            columns.extend(item)

        if len(columns) > 0:
            # Return new eland.DataFrame with modified mappings
            mappings = ed.Mappings(mappings=self.mappings, columns=columns)

            return DataFrame(self.client, self.index_pattern, mappings=mappings)
        """
        elif isinstance(item, BooleanFilter):
            self._filter = item.build()
            return self
        else:
            raise TypeError('Unsupported expr: [{0}]'.format(item))
        """

    def __getitem__(self, key):
        # NOTE: there is a difference from pandas here.
        # e.g. df['a'] returns pd.Series, df[['a','b']] returns pd.DataFrame
        # we always return DataFrame - TODO maybe create eland.Series at some point...

        # Implementation mainly copied from pandas v0.24.2
        # (https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html)
        key = com.apply_if_callable(key, self)

        # TODO - add slice capabilities - need to add index features first
        # e.g. set index etc.
        # Do we have a slicer (on rows)?
        """
        indexer = convert_to_index_sliceable(self, key)
        if indexer is not None:
            return self._slice(indexer, axis=0)
        # Do we have a (boolean) DataFrame?
        if isinstance(key, DataFrame):
            return self._getitem_frame(key)
        """

        # Do we have a (boolean) 1d indexer?
        """
        if com.is_bool_indexer(key):
            return self._getitem_bool_array(key)
        """

        # We are left with two options: a single key, and a collection of keys,
        columns = []
        if isinstance(key, str):
            if not self._mappings.is_source_field(key):
                raise TypeError('Column does not exist: [{0}]'.format(key))
            columns.append(key)
        elif isinstance(key, list):
            columns.extend(key)
        else:
            raise TypeError('__getitem__ arguments invalid: [{0}]'.format(key))

        return self._filter_by_columns(columns)

    def __len__(self):
        """
        Returns length of info axis, but here we use the index.
        """
        return self.client.count(index=self.index_pattern)
        return self._client.count(index=self._index_pattern)

    def copy(self):
        # TODO - test and validate...may need deep copying
        return ed.DataFrame(self._client,
                            self._index_pattern,
                            self._mappings,
                            self._index)

    # ----------------------------------------------------------------------
    # Rendering Methods

    def __repr__(self):
        """
        Return a string representation for a particular DataFrame.
        From pandas
        """
        return self.to_string()
        buf = StringIO()

        max_rows = pd.get_option("display.max_rows")
        max_cols = pd.get_option("display.max_columns")
        show_dimensions = pd.get_option("display.show_dimensions")
        if pd.get_option("display.expand_frame_repr"):
            width, _ = console.get_console_size()
        else:
            width = None
        self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
                       line_width=width, show_dimensions=show_dimensions)

        return buf.getvalue()

    def to_string(self, buf=None, columns=None, col_space=None, header=True,
                  index=True, na_rep='NaN', formatters=None, float_format=None,
                  sparsify=None, index_names=True, justify=None,
                  max_rows=None, max_cols=None, show_dimensions=True,
                  decimal='.', line_width=None):
        """
        From pandas
        """
        if max_rows == None:
            max_rows = pd.get_option('display.max_rows')

        sdf = self.__fake_dataframe__(max_rows=max_rows+1)

        _show_dimensions = show_dimensions

        if buf is not None:
            _buf = _expand_user(_stringify_path(buf))
        else:
            _buf = StringIO()

        sdf.to_string(buf=_buf, columns=columns,
                      col_space=col_space, na_rep=na_rep,
                      formatters=formatters,
                      float_format=float_format,
                      sparsify=sparsify, justify=justify,
                      index_names=index_names,
                      header=header, index=index,
                      max_rows=max_rows,
                      max_cols=max_cols,
                      show_dimensions=False,  # print this outside of this call
                      decimal=decimal,
                      line_width=line_width)

        if _show_dimensions:
            _buf.write("\n\n[{nrows} rows x {ncols} columns]"
                       .format(nrows=self.index_count(), ncols=len(self.columns)))

        if buf is None:
            result = _buf.getvalue()
            return result

    def __fake_dataframe__(self, max_rows=1):
        head_rows = max_rows / 2 + 1
        head_rows = int(max_rows / 2) + max_rows % 2
        tail_rows = max_rows - head_rows

        head = self.head(max_rows)
        head = self.head(head_rows)
        tail = self.tail(tail_rows)

        num_rows = len(self)

@@ -514,8 +638,9 @@ class DataFrame():
        # to use the pandas IO methods.
        # TODO - if data is indexed by time series, return top/bottom of
        # time series, rather than first max_rows items
        """
        if tail_rows > 0:
            locations = [0, num_rows-tail_rows]
            locations = [0, num_rows - tail_rows]
            lengths = [head_rows, tail_rows]
        else:
            locations = [0]
@@ -526,21 +651,13 @@ class DataFrame():
                       BlockIndex(
                           num_rows, locations, lengths))
             for item in self.columns})

        return sdf

        return head
        """
        return pd.concat([head, tail])


    def to_string(self):
        # TODO - this doesn't return 'notebook' friendly results yet..
        # TODO - don't hard code max_rows - use pandas default/ES default
        max_rows = 60

        df = self.__fake_dataframe__(max_rows=max_rows)

        return df.to_string(max_rows=max_rows, show_dimensions=True)
        return pd.concat([head, tail])


# From pandas.DataFrame
def _put_str(s, space):
    return '{s}'.format(s=s)[:space].ljust(space)
    return '{s}'.format(s=s)[:space].ljust(space)
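To illustrate the column-selection path above: `__getitem__` now accepts a single column name or a list of names, routes both through `_filter_by_columns`, and returns a new `eland.DataFrame` with restricted `Mappings`; no data is exported and no Elasticsearch index is created. A sketch (host and index names are placeholders):

```python
df = ed.read_es('localhost', 'flights')

# Both forms return a new DataFrame proxy with narrowed mappings.
carrier = df['Carrier']           # single column (still an eland.DataFrame)
subset = df[['Dest', 'Carrier']]  # list of columns

# Anything else, e.g. the tuple form df['Dest', 'Carrier'] that the old
# code accepted, now raises:
# TypeError: __getitem__ arguments invalid: [('Dest', 'Carrier')]
```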
eland/index.py (new file, 46 lines)
@@ -0,0 +1,46 @@
"""
class Index

The index for an eland.DataFrame.

Currently, the index is a field that exists in every document in an Elasticsearch index.
For slicing and sorting operations it must be a docvalues field. By default _id is used,
which can't be used for range queries and is inefficient for sorting:

https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-id-field.html
(The value of the _id field is also accessible in aggregations or for sorting,
but doing so is discouraged as it requires loading a lot of data in memory.
In case sorting or aggregating on the _id field is required, it is advised to duplicate
the content of the _id field in another field that has doc_values enabled.)

"""
class Index:
    ID_INDEX_FIELD = '_id'
    ID_SORT_FIELD = '_doc'  # if index field is _id, sort by _doc

    def __init__(self, index_field=None):
        # Calls setter
        self.index_field = index_field

    @property
    def sort_field(self):
        if self._index_field == self.ID_INDEX_FIELD:
            return self.ID_SORT_FIELD
        return self._index_field

    @property
    def is_source_field(self):
        return self._is_source_field

    @property
    def index_field(self):
        return self._index_field

    @index_field.setter
    def index_field(self, index_field):
        if index_field == None:
            self._index_field = Index.ID_INDEX_FIELD
            self._is_source_field = False
        else:
            self._index_field = index_field
            self._is_source_field = True
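A quick illustration of the class above. The `_doc` fallback keeps default sorting cheap (sorting on `_id` itself is what the quoted documentation discourages), and `DataFrame.set_index` simply swaps a new `Index` instance into a copy:

```python
# Default: index by the Elasticsearch _id field, sort by _doc.
idx = Index()
assert idx.index_field == '_id'
assert idx.sort_field == '_doc'
assert idx.is_source_field is False

# Index by a document field: sorting then uses that field's doc_values.
idx = Index('timestamp')
assert idx.sort_field == 'timestamp'
assert idx.is_source_field is True
```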
eland/mappings.py

@@ -64,18 +64,22 @@ class Mappings():

            # Populate capability matrix of fields
            # field_name, es_dtype, pd_dtype, is_searchable, is_aggregatable, is_source
            self.mappings_capabilities = Mappings._create_capability_matrix(all_fields, source_fields, all_fields_caps)
            self._mappings_capabilities = Mappings._create_capability_matrix(all_fields, source_fields, all_fields_caps)
        else:
            # Reference object and restrict mapping columns
            self.mappings_capabilities = mappings.mappings_capabilities.loc[columns]
            if columns is not None:
                # Reference object and restrict mapping columns
                self._mappings_capabilities = mappings._mappings_capabilities.loc[columns]
            else:
                # straight copy
                self._mappings_capabilities = mappings._mappings_capabilities.copy()

        # Cache source field types for efficient lookup
        # (this massively improves performance of DataFrame.flatten)
        self.source_field_pd_dtypes = {}
        self._source_field_pd_dtypes = {}

        for field_name in self.mappings_capabilities[self.mappings_capabilities._source == True].index:
            pd_dtype = self.mappings_capabilities.loc[field_name]['pd_dtype']
            self.source_field_pd_dtypes[field_name] = pd_dtype
        for field_name in self._mappings_capabilities[self._mappings_capabilities._source == True].index:
            pd_dtype = self._mappings_capabilities.loc[field_name]['pd_dtype']
            self._source_field_pd_dtypes[field_name] = pd_dtype

    def _extract_fields_from_mapping(mappings, source_only=False):
        """
@@ -262,24 +266,29 @@ class Mappings():
        all_fields: list
            All typed fields in the index mapping
        """
        return self.mappings_capabilities.index.tolist()
        return self._mappings_capabilities.index.tolist()

    def field_capabilities(self, field_name):
        """
        Parameters
        ----------
        field_name: str

        """

    def pd_dtypes_groupby_source_fields(self):
        """
        Returns
        -------
        groups: dict
            Calls pandas.core.groupby.GroupBy.groups for _source fields
            E.g.
            {
            'bool': Index(['Cancelled', 'FlightDelay'], dtype='object'),
            'datetime64[ns]': Index(['timestamp'], dtype='object'),
            'float64': Index(['AvgTicketPrice', 'DistanceKilometers', 'DistanceMiles',...
            }
        """
        return self.mappings_capabilities[self.mappings_capabilities._source == True].groupby('pd_dtype').groups

    def pd_dtype
        """
        mappings_capabilities: pd.Series with index values:
            _source: bool
                Is this field name a top-level source field?
            es_dtype: str
                The Elasticsearch data type
            pd_dtype: str
                The pandas data type
            searchable: bool
                Is the field searchable in Elasticsearch?
            aggregatable: bool
                Is the field aggregatable in Elasticsearch?
        """
        return self._mappings_capabilities.loc[field_name]

    def source_field_pd_dtype(self, field_name):
        """
@@ -297,9 +306,9 @@ class Mappings():
        pd_dtype = 'object'
        is_source_field = False

        if field_name in self.source_field_pd_dtypes:
        if field_name in self._source_field_pd_dtypes:
            is_source_field = True
            pd_dtype = self.source_field_pd_dtypes[field_name]
            pd_dtype = self._source_field_pd_dtypes[field_name]

        return is_source_field, pd_dtype

@@ -316,7 +325,7 @@ class Mappings():
        """
        is_source_field = False

        if field_name in self.source_field_pd_dtypes:
        if field_name in self._source_field_pd_dtypes:
            is_source_field = True

        return is_source_field
@@ -328,9 +337,9 @@ class Mappings():
        numeric_source_fields: list of str
            List of source fields where pd_dtype == (int64 or float64)
        """
        return self.mappings_capabilities[(self.mappings_capabilities._source == True) &
                                          ((self.mappings_capabilities.pd_dtype == 'int64') |
                                           (self.mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
        return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
                                           ((self._mappings_capabilities.pd_dtype == 'int64') |
                                            (self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist()

    def source_fields(self):
        """
@@ -339,7 +348,7 @@ class Mappings():
        source_fields: list of str
            List of source fields
        """
        return self.source_field_pd_dtypes.keys()
        return self._source_field_pd_dtypes.keys()

    def count_source_fields(self):
        """
@@ -357,7 +366,7 @@ class Mappings():
        dtypes: pd.Series
            Source field name + pd_dtype
        """
        return pd.Series(self.source_field_pd_dtypes)
        return pd.Series(self._source_field_pd_dtypes)

    def get_dtype_counts(self):
        """
@@ -368,5 +377,5 @@ class Mappings():
        get_dtype_counts : Series
            Series with the count of columns with each dtype.
        """
        return pd.Series(self.mappings_capabilities[self.mappings_capabilities._source == True].groupby('pd_dtype')[
        return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True].groupby('pd_dtype')[
            '_source'].count().to_dict())
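For orientation, the capability matrix cached above is a pandas DataFrame with one row per mapped field; `field_capabilities` returns a single row as a Series. An illustrative excerpt, with values matching the `city`/`city.raw` assertions in the test below:

```python
# _mappings_capabilities (excerpt):
#
#            _source  es_dtype  pd_dtype  searchable  aggregatable
# city          True      text    object        True         False
# city.raw     False   keyword    object        True          True
caps = mappings.field_capabilities('city')
assert caps['es_dtype'] == 'text' and not caps['aggregatable']
```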
@@ -69,3 +69,22 @@ class TestMapping(TestData):
        expected_get_dtype_counts = pd.Series({'datetime64[ns]': 1, 'float64': 1, 'int64': 5, 'object': 11})

        assert_series_equal(expected_get_dtype_counts, mappings.get_dtype_counts())

    def test_mapping_capabilities(self):
        mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)

        field_capabilities = mappings.field_capabilities('city')

        assert True == field_capabilities['_source']
        assert 'text' == field_capabilities['es_dtype']
        assert 'object' == field_capabilities['pd_dtype']
        assert True == field_capabilities['searchable']
        assert False == field_capabilities['aggregatable']

        field_capabilities = mappings.field_capabilities('city.raw')

        assert False == field_capabilities['_source']
        assert 'keyword' == field_capabilities['es_dtype']
        assert 'object' == field_capabilities['pd_dtype']
        assert True == field_capabilities['searchable']
        assert True == field_capabilities['aggregatable']
eland/tests/common.py

@@ -16,6 +16,7 @@ from eland.tests import FLIGHTS_DF_FILE_NAME, FLIGHTS_INDEX_NAME,\
_pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index()
_pd_flights['timestamp'] = \
    pd.to_datetime(_pd_flights['timestamp'])
_pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int
_ed_flights = ed.read_es(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME)

_pd_ecommerce = pd.read_json(ECOMMERCE_DF_FILE_NAME).sort_index()
@@ -24,6 +25,7 @@ _pd_ecommerce['order_date'] = \
_pd_ecommerce['products.created_on'] = \
    _pd_ecommerce['products.created_on'].apply(lambda x: pd.to_datetime(x))
_pd_ecommerce.insert(2, 'customer_birth_date', None)
_pd_ecommerce.index = _pd_ecommerce.index.map(str)  # make index 'object' not int
_pd_ecommerce['customer_birth_date'].astype('datetime64')
_ed_ecommerce = ed.read_es(ELASTICSEARCH_HOST, ECOMMERCE_INDEX_NAME)
eland/tests/dataframe/test_getitem_pytest.py (new file, 63 lines)

@@ -0,0 +1,63 @@
# File called _pytest for PyCharm compatibility
from eland.tests.common import TestData

import pandas as pd
import io

from pandas.util.testing import (
    assert_series_equal, assert_frame_equal)

class TestDataFrameGetItem(TestData):

    def test_getitem_basic(self):
        # Test 1 attribute
        pd_carrier = self.pd_flights()['Carrier']
        ed_carrier = self.ed_flights()['Carrier']

        # pandas returns a Series here
        assert_frame_equal(pd.DataFrame(pd_carrier.head(100)), ed_carrier.head(100))

        pd_3_items = self.pd_flights()[['Dest', 'Carrier', 'FlightDelay']]
        ed_3_items = self.ed_flights()[['Dest', 'Carrier', 'FlightDelay']]

        assert_frame_equal(pd_3_items.head(100), ed_3_items.head(100))

        # Test numerics
        numerics = ['DistanceMiles', 'AvgTicketPrice', 'FlightTimeMin']
        ed_numerics = self.ed_flights()[numerics]
        pd_numerics = self.pd_flights()[numerics]

        assert_frame_equal(pd_numerics.head(100), ed_numerics.head(100))

        # just test headers
        ed_numerics_describe = ed_numerics.describe()
        assert ed_numerics_describe.columns.tolist() == numerics

    def test_getattr_basic(self):
        # Test 1 attribute
        pd_carrier = self.pd_flights().Carrier
        # ed_carrier = self.ed_flights().Carrier

        print(type(pd_carrier))
        print(pd_carrier)

    def test_boolean(self):
        # Test 1 attribute
        pd_carrier = self.pd_flights()['Carrier == "Kibana Airlines"']
        # ed_carrier = self.ed_flights().Carrier

        print(type(pd_carrier))
        print(pd_carrier)

    def test_loc(self):
        pd = self.pd_flights().loc[10:15, ['Dest', 'Carrier']]

        print(type(pd))
        print(pd)

        pd = self.pd_flights().loc[10]

        print(type(pd))
        print(pd)
@@ -10,8 +10,8 @@ from pandas.util.testing import (
class TestDataFrameIndexing(TestData):

    def test_mapping(self):
        ed_flights_mappings = pd.DataFrame(self.ed_flights().mappings.mappings_capabilities
                                           [self.ed_flights().mappings.mappings_capabilities._source==True]
        ed_flights_mappings = pd.DataFrame(self.ed_flights()._mappings._mappings_capabilities
                                           [self.ed_flights()._mappings._mappings_capabilities._source==True]
                                           ['pd_dtype'])
        pd_flights_mappings = pd.DataFrame(self.pd_flights().dtypes, columns=['pd_dtype'])

@@ -25,6 +25,8 @@ class TestDataFrameIndexing(TestData):
        pd_flights_head = self.pd_flights().head()
        ed_flights_head = self.ed_flights().head()

        print(ed_flights_head)

        assert_frame_equal(pd_flights_head, ed_flights_head)

        pd_ecommerce_head = self.pd_ecommerce().head()
@@ -32,10 +34,25 @@ class TestDataFrameIndexing(TestData):

        assert_frame_equal(pd_ecommerce_head, ed_ecommerce_head)

    def test_tail(self):
        pd_flights_tail = self.pd_flights().tail()
        ed_flights_tail = self.ed_flights().tail()

        print(ed_flights_tail)

        assert_frame_equal(pd_flights_tail, ed_flights_tail)

        pd_ecommerce_tail = self.pd_ecommerce().tail()
        ed_ecommerce_tail = self.ed_ecommerce().tail()

        assert_frame_equal(pd_ecommerce_tail, ed_ecommerce_tail)

    def test_describe(self):
        pd_flights_describe = self.pd_flights().describe()
        ed_flights_describe = self.ed_flights().describe()

        print(ed_flights_describe)

        # TODO - this fails now as ES aggregations are approximate
        # if ES percentile agg uses
        # "hdr": {
@@ -47,6 +64,8 @@ class TestDataFrameIndexing(TestData):
        pd_ecommerce_describe = self.pd_ecommerce().describe()
        ed_ecommerce_describe = self.ed_ecommerce().describe()

        print(ed_ecommerce_describe)

        # We don't compare ecommerce here as the default dtypes in pandas from read_json
        # don't match the mapping types. This is mainly because the products field is
        # nested and so can be treated as a multi-field in ES, but not in pandas
@@ -57,52 +76,7 @@ class TestDataFrameIndexing(TestData):

    def test_to_string(self):
        print(self.ed_flights())

    def test_getitem(self):
        # Test 1 attribute
        ed_carrier = self.ed_flights()['Carrier']

        carrier_head = ed_carrier.head(5)

        carrier_head_expected = pd.DataFrame(
            {'Carrier': [
                'Kibana Airlines',
                'Logstash Airways',
                'Logstash Airways',
                'Kibana Airlines',
                'Kibana Airlines'
            ]})

        assert_frame_equal(carrier_head_expected, carrier_head)

        # carrier_to_string = ed_carrier.to_string()
        # print(carrier_to_string)

        # Test multiple attributes (out of order)
        ed_3_items = self.ed_flights()['Dest', 'Carrier', 'FlightDelay']

        ed_3_items_head = ed_3_items.head(5)

        ed_3_items_expected = pd.DataFrame(dict(
            Dest={0: 'Sydney Kingsford Smith International Airport', 1: 'Venice Marco Polo Airport',
                  2: 'Venice Marco Polo Airport', 3: "Treviso-Sant'Angelo Airport",
                  4: "Xi'an Xianyang International Airport"},
            Carrier={0: 'Kibana Airlines', 1: 'Logstash Airways', 2: 'Logstash Airways', 3: 'Kibana Airlines',
                     4: 'Kibana Airlines'},
            FlightDelay={0: False, 1: False, 2: False, 3: True, 4: False}))

        assert_frame_equal(ed_3_items_expected, ed_3_items_head)

        # ed_3_items_to_string = ed_3_items.to_string()
        # print(ed_3_items_to_string)

        # Test numerics
        numerics = ['DistanceMiles', 'AvgTicketPrice', 'FlightTimeMin']
        ed_numerics = self.ed_flights()[numerics]

        # just test headers
        ed_numerics_describe = ed_numerics.describe()
        assert ed_numerics_describe.columns.tolist() == numerics
        print(self.ed_flights().to_string())

    def test_info(self):
        ed_flights_info_buf = io.StringIO()
@@ -111,6 +85,8 @@ class TestDataFrameIndexing(TestData):
        self.ed_flights().info(buf=ed_flights_info_buf)
        self.pd_flights().info(buf=pd_flights_info_buf)

        print(ed_flights_info_buf.getvalue())

        ed_flights_info = (ed_flights_info_buf.getvalue().splitlines())
        pd_flights_info = (pd_flights_info_buf.getvalue().splitlines())

@@ -148,7 +124,7 @@ class TestDataFrameIndexing(TestData):

        assert_series_equal(pd_flights_get_dtype_counts, ed_flights_get_dtype_counts)

    def test_properties(self):
    def test_get_properties(self):
        pd_flights_shape = self.pd_flights().shape
        ed_flights_shape = self.ed_flights().shape

@@ -164,3 +140,16 @@ class TestDataFrameIndexing(TestData):

        assert_series_equal(pd_flights_dtypes, ed_flights_dtypes)

    def test_index(self):
        pd_flights = self.pd_flights()
        pd_flights_timestamp = pd_flights.set_index('timestamp')
        pd_flights.info()
        pd_flights_timestamp.info()
        pd_flights.info()

        ed_flights = self.ed_flights()
        ed_flights_timestamp = ed_flights.set_index('timestamp')
        ed_flights.info()
        ed_flights_timestamp.info()
        ed_flights.info()
@@ -33,7 +33,8 @@ def _setup_data(es):
        # make timestamp datetime 2018-01-01T12:09:35
        # values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S')

        action = {'_index': index_name, '_source': values}
        # Use integer as id field for repeatable results
        action = {'_index': index_name, '_source': values, '_id': str(n)}

        actions.append(action)
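For context, the actions built in `_setup_data` follow the standard `elasticsearch.helpers.bulk` action format; supplying a deterministic `_id` means re-running the setup overwrites documents instead of duplicating them. A minimal sketch (host and index name are placeholders):

```python
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch('localhost')

# Explicit _id values make the load repeatable: the same documents are
# overwritten on every run.
actions = [
    {'_index': 'test_index', '_source': {'city': 'Amsterdam'}, '_id': '0'},
    {'_index': 'test_index', '_source': {'city': 'London'}, '_id': '1'},
]
bulk(es, actions)
```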
File diff suppressed because it is too large.