Merge pull request #37 from stevedodson/master

Resolve DataFrame.query issues + more docs
2025-07-11 00:02:14 +08:00 · 2019-11-14 21:07:11 +01:00 · 2019-11-14 21:07:11 +01:00 · dc2b1acbc2
commit dc2b1acbc2
parent be3e8e2746 4719afa57f
24 changed files with 580 additions and 71 deletions
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -57,7 +57,10 @@ except ImportError:
    pd = None
 '''
-extlinks = {'pandas_docs': ('https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/%s.html', '')}
+extlinks = {
    'pandas_api_docs': ('https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/%s.html', ''),
    'pandas_user_guide': ('https://pandas.pydata.org/pandas-docs/stable/user_guide/%s.html', 'Pandas User Guide/')
 }
 numpydoc_attributes_as_param_list = False
 numpydoc_show_class_members = False
@ -85,7 +88,8 @@ exclude_patterns = []
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'sphinx_rtd_theme'
+#html_theme = 'sphinx_rtd_theme'
 html_theme = "pandas_sphinx_theme"
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
--- a/docs/source/reference/api/eland.DataFrame.info_es.rst
+++ b/docs/source/reference/api/eland.DataFrame.info_es.rst
@ -0,0 +1,6 @@
 eland.DataFrame.info_es
 =======================
 .. currentmodule:: eland
 .. automethod:: DataFrame.info_es
--- a/docs/source/reference/api/eland.DataFrame.keys.rst
+++ b/docs/source/reference/api/eland.DataFrame.keys.rst
@ -0,0 +1,6 @@
 eland.DataFrame.keys
 ====================
 .. currentmodule:: eland
 .. automethod:: DataFrame.keys
--- a/docs/source/reference/api/eland.DataFrame.max.rst
+++ b/docs/source/reference/api/eland.DataFrame.max.rst
@ -0,0 +1,6 @@
 eland.DataFrame.max
 ===================
 .. currentmodule:: eland
 .. automethod:: DataFrame.max
--- a/docs/source/reference/api/eland.DataFrame.mean.rst
+++ b/docs/source/reference/api/eland.DataFrame.mean.rst
@ -0,0 +1,6 @@
 eland.DataFrame.mean
 ====================
 .. currentmodule:: eland
 .. automethod:: DataFrame.mean
--- a/docs/source/reference/api/eland.DataFrame.min.rst
+++ b/docs/source/reference/api/eland.DataFrame.min.rst
@ -0,0 +1,6 @@
 eland.DataFrame.min
 ===================
 .. currentmodule:: eland
 .. automethod:: DataFrame.min
--- a/docs/source/reference/api/eland.DataFrame.nunique.rst
+++ b/docs/source/reference/api/eland.DataFrame.nunique.rst
@ -0,0 +1,6 @@
 eland.DataFrame.nunique
 =======================
 .. currentmodule:: eland
 .. automethod:: DataFrame.nunique
--- a/docs/source/reference/api/eland.DataFrame.query.rst
+++ b/docs/source/reference/api/eland.DataFrame.query.rst
@ -0,0 +1,6 @@
 eland.DataFrame.query
 =====================
 .. currentmodule:: eland
 .. automethod:: DataFrame.query
--- a/docs/source/reference/api/eland.DataFrame.sum.rst
+++ b/docs/source/reference/api/eland.DataFrame.sum.rst
@ -0,0 +1,6 @@
 eland.DataFrame.sum
 ===================
 .. currentmodule:: eland
 .. automethod:: DataFrame.sum
--- a/docs/source/reference/dataframe.rst
+++ b/docs/source/reference/dataframe.rst
@ -31,8 +31,10 @@ Indexing, iteration
   :toctree: api/
   DataFrame.head
   DataFrame.keys
   DataFrame.tail
   DataFrame.get
   DataFrame.query
 Function application, GroupBy & window
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -52,6 +54,11 @@ Computations / descriptive stats
   DataFrame.count
   DataFrame.describe
   DataFrame.info
   DataFrame.max
   DataFrame.mean
   DataFrame.min
   DataFrame.sum
   DataFrame.nunique
 Reindexing / selection / label manipulation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -74,3 +81,11 @@ Serialization / IO / conversion
   DataFrame.info
 Elasticsearch utilities
 ~~~~~~~~~~~~~~~~~~~~~~~
 .. autosummary::
   :toctree: api/
   DataFrame.info_es
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@ -5,6 +5,7 @@ from io import StringIO
 import numpy as np
 import pandas as pd
 import six
 from pandas.core.computation.eval import eval
 from pandas.core.common import apply_if_callable, is_bool_indexer
 from pandas.core.dtypes.common import is_list_like
 from pandas.core.indexing import check_bool_indexer
@ -41,7 +42,7 @@ class DataFrame(NDFrame):
    See Also
    --------
-    :pandas_docs:`pandas.DataFrame`
+    :pandas_api_docs:`pandas.DataFrame`
    Examples
    --------
@ -119,11 +120,12 @@ class DataFrame(NDFrame):
        Returns
        -------
        pandas.Index
            Elasticsearch field names as pandas.Index
        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.columns`
+        :pandas_api_docs:`pandas.DataFrame.columns`
        Examples
        --------
@ -153,7 +155,7 @@ class DataFrame(NDFrame):
        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.empty`
+        :pandas_api_docs:`pandas.DataFrame.empty`
        Examples
        --------
@ -183,7 +185,7 @@ class DataFrame(NDFrame):
        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.head`
+        :pandas_api_docs:`pandas.DataFrame.head`
        Examples
        --------
@ -218,7 +220,7 @@ class DataFrame(NDFrame):
        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.tail`
+        :pandas_api_docs:`pandas.DataFrame.tail`
        Examples
        --------
@ -304,7 +306,7 @@ class DataFrame(NDFrame):
        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.count`
+        :pandas_api_docs:`pandas.DataFrame.count`
        Examples
        --------
@ -318,11 +320,57 @@ class DataFrame(NDFrame):
    def info_es(self):
        """
        A debug summary of an eland DataFrame internals.
        This includes the Elasticsearch search queries and query compiler task list.
        Returns
        -------
-        None
+        str
-            This method prints a debug summary of the task list Elasticsearch
+            A debug summary of an eland DataFrame internals.
        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights')
        >>> df = df[(df.OriginAirportID == 'AMS') & (df.FlightDelayMin > 60)]
        >>> df = df[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']]
        >>> df = df.tail()
        >>> df
                        timestamp OriginAirportID DestAirportID  FlightDelayMin
        12608 2018-02-10 01:20:52             AMS          CYEG             120
        12720 2018-02-10 14:09:40             AMS           BHM             255
        12725 2018-02-10 00:53:01             AMS           ATL             360
        12823 2018-02-10 15:41:20             AMS           NGO             120
        12907 2018-02-11 20:08:25             AMS           LIM             225
        <BLANKLINE>
        [5 rows x 4 columns]
        >>> print(df.info_es())
        index_pattern: flights
        Index:
         index_field: _id
         is_source_field: False
        Mappings:
         capabilities:                 _source   es_dtype        pd_dtype  searchable  aggregatable
        AvgTicketPrice     True      float         float64        True          True
        Cancelled          True    boolean            bool        True          True
        Carrier            True    keyword          object        True          True
        Dest               True    keyword          object        True          True
        DestAirportID      True    keyword          object        True          True
        ...                 ...        ...             ...         ...           ...
        OriginLocation     True  geo_point          object        True          True
        OriginRegion       True    keyword          object        True          True
        OriginWeather      True    keyword          object        True          True
        dayOfWeek          True    integer           int64        True          True
        timestamp          True       date  datetime64[ns]        True          True
        <BLANKLINE>
        [27 rows x 5 columns]
        Operations:
         tasks: [('boolean_filter', {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}), ('columns', ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']), ('tail', ('_doc', 5))]
         size: 5
         sort_params: _doc:desc
         columns: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
         post_processing: ['sort_index']
        <BLANKLINE>
        """
        buf = StringIO()
@ -350,7 +398,7 @@ class DataFrame(NDFrame):
        This method prints information about a DataFrame including
        the index dtype and column dtypes, non-null values and memory usage.
-        See :pandas_docs:`pandas.DataFrame.info` for full details.
+        See :pandas_api_docs:`pandas.DataFrame.info` for full details.
        Notes
        -----
@ -368,7 +416,7 @@ class DataFrame(NDFrame):
        customer_first_name    4675 non-null object
        geoip.city_name        4094 non-null object
        dtypes: object(2)
-        memory usage: 96.0 bytes
+        memory usage: ...
        """
        if buf is None:  # pragma: no cover
            buf = sys.stdout
@ -559,6 +607,26 @@ class DataFrame(NDFrame):
            result = _buf.getvalue()
            return result
    def __getattr__(self, key):
        """After regular attribute access, looks up the name in the columns
        Parameters
        ----------
            key: str
                Attribute name.
        Returns
        -------
            The value of the attribute.
        """
        try:
            return object.__getattribute__(self, key)
        except AttributeError as e:
            if key in self.columns:
                return self[key]
            raise e
    def _getitem(self, key):
        """Get the column specified by key for this DataFrame.
@ -695,7 +763,7 @@ class DataFrame(NDFrame):
        """
        Return a subset of the DataFrame's columns based on the column dtypes.
-        Compatible with :pandas_docs:`pandas.DataFrame.select_dtypes`
+        Compatible with :pandas_api_docs:`pandas.DataFrame.select_dtypes`
        """
        empty_df = self._empty_pd_df()
@ -720,6 +788,16 @@ class DataFrame(NDFrame):
        return num_rows, num_columns
    def keys(self):
        """
        Return columns
        See :pandas_api_docs:`pandas.DataFrame.keys`
        Returns
        -------
        pandas.Index
            Elasticsearch field names as pandas.Index
        """
        return self.columns
    def aggregate(self, func, axis=0, *args, **kwargs):
@ -758,7 +836,7 @@ class DataFrame(NDFrame):
        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.aggregate`
+        :pandas_api_docs:`pandas.DataFrame.aggregate`
        Examples
        --------
@ -788,19 +866,49 @@ class DataFrame(NDFrame):
    hist = gfx.ed_hist_frame
-    def query(self, expr, inplace=False, **kwargs):
+    def query(self, expr):
-        """Queries the Dataframe with a boolean expression
+        """
        Query the columns of a DataFrame with a boolean expression.
-        Returns:
+        TODO - add additional pandas arguments
-            A new DataFrame if inplace=False
+
        Parameters
        ----------
        expr: str
            A boolean expression
        Returns
        -------
        eland.DataFrame:
            DataFrame populated by results of the query
        TODO - add link to eland user guide
        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.query`
        :pandas_user_guide:`indexing`
        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights')
        >>> df = df.query('FlightDelayMin > 60')
        >>> df.info()
        """
        if isinstance(expr, BooleanFilter):
            return DataFrame(
                query_compiler=self._query_compiler._update_query(BooleanFilter(expr))
            )
        elif isinstance(expr, six.string_types):
            column_resolver = {}
            for key in self.keys():
                column_resolver[key] = self.get(key)
            # Create fake resolvers - index resolver is empty
            resolvers = column_resolver, {}
            # Use pandas eval to parse query - TODO validate this further
            filter = eval(expr, target=self, resolvers=tuple(tuple(resolvers)))
            return DataFrame(
-                query_compiler=self._query_compiler._update_query(ScriptFilter(expr))
+                query_compiler=self._query_compiler._update_query(filter)
            )
        else:
            raise NotImplementedError(expr, type(expr))
@ -820,7 +928,7 @@ class DataFrame(NDFrame):
        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.get`
+        :pandas_api_docs:`pandas.DataFrame.get`
        Examples
        --------
--- a/eland/index.py
+++ b/eland/index.py
@ -58,5 +58,5 @@ class Index:
    def info_es(self, buf):
        buf.write("Index:\n")
-        buf.write("\tindex_field: {0}\n".format(self.index_field))
+        buf.write(" index_field: {0}\n".format(self.index_field))
-        buf.write("\tis_source_field: {0}\n".format(self.is_source_field))
+        buf.write(" is_source_field: {0}\n".format(self.is_source_field))
--- a/eland/mappings.py
+++ b/eland/mappings.py
@ -408,6 +408,44 @@ class Mappings:
        return is_source_field
    def aggregatable_columns(self, columns=None):
        """
        Return a dict of aggregatable columns from all columns or columns list
        {'customer_full_name': 'customer_full_name.keyword', ...}
        Logic here is that column names are '_source' fields and keyword fields
        may be nested beneath the field. E.g.
        customer_full_name: text
        customer_full_name.keyword: keyword
        customer_full_name.keyword is the aggregatable field for customer_full_name
        Returns
        -------
        dict
            e.g. {'customer_full_name': 'customer_full_name.keyword', ...}
        """
        if columns is None:
            columns = self.source_fields()
        aggregatables = {}
        for column in columns:
            capabilities = self.field_capabilities(column)
            if capabilities['aggregatable']:
                aggregatables[column] = column
            else:
                # Try 'column.keyword'
                column_keyword = column + '.keyword'
                capabilities = self.field_capabilities(column_keyword)
                if capabilities['aggregatable']:
                    aggregatables[column_keyword] = column
                else:
                    # Aggregations not supported for this field
                    raise ValueError("Aggregations not supported for ", column)
        return aggregatables
    def numeric_source_fields(self, columns, include_bool=True):
        """
        Returns
@ -471,4 +509,4 @@ class Mappings:
    def info_es(self, buf):
        buf.write("Mappings:\n")
-        buf.write("\tcapabilities: {0}\n".format(self._mappings_capabilities))
+        buf.write(" capabilities: {0}\n".format(self._mappings_capabilities))
--- a/eland/ndframe.py
+++ b/eland/ndframe.py
@ -66,7 +66,7 @@ class NDFrame:
        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.index`
+        :pandas_api_docs:`pandas.DataFrame.index`
        Examples
        --------
@ -92,7 +92,7 @@ class NDFrame:
        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.dtypes`
+        :pandas_api_docs:`pandas.DataFrame.dtypes`
        Examples
        --------
@ -125,22 +125,6 @@ class NDFrame:
    def __getitem__(self, key):
        return self._getitem(key)
    def __getattr__(self, key):
        """After regular attribute access, looks up the name in the columns
        Args:
            key (str): Attribute name.
        Returns:
            The value of the attribute.
        """
        try:
            return object.__getattribute__(self, key)
        except AttributeError as e:
            if key in self.columns:
                return self[key]
            raise e
    def __sizeof__(self):
        # Don't default to pandas, just return approximation TODO - make this more accurate
        return sys.getsizeof(self._query_compiler)
@ -190,7 +174,7 @@ class NDFrame:
        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.drop`
+        :pandas_api_docs:`pandas.DataFrame.drop`
        Examples
        --------
@ -299,26 +283,185 @@ class NDFrame:
        )
        return self._create_or_update_from_compiler(new_query_compiler, inplace)
-    # TODO implement arguments
+    def mean(self, numeric_only=True):
-    def mean(self):
+        """
        Return mean value for each numeric column
        TODO - implement remainder of pandas arguments
        Returns
        -------
        pandas.Series
            mean value for each numeric column
        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.mean`
        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights')
        >>> df.mean()
        AvgTicketPrice         628.253689
        Cancelled                0.128494
        DistanceKilometers    7092.142457
        DistanceMiles         4406.853010
        FlightDelay              0.251168
        FlightDelayMin          47.335171
        FlightTimeHour           8.518797
        FlightTimeMin          511.127842
        dayOfWeek                2.835975
        dtype: float64
        """
        if numeric_only == False:
            raise NotImplementedError("Only mean of numeric fields is implemented")
        return self._query_compiler.mean()
    def sum(self, numeric_only=True):
        """
        Return sum for each numeric column
        TODO - implement remainder of pandas arguments
        Returns
        -------
        pandas.Series
            sum for each numeric column
        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.sum`
        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights')
        >>> df.sum()
        AvgTicketPrice        8.204365e+06
        Cancelled             1.678000e+03
        DistanceKilometers    9.261629e+07
        DistanceMiles         5.754909e+07
        FlightDelay           3.280000e+03
        FlightDelayMin        6.181500e+05
        FlightTimeHour        1.112470e+05
        FlightTimeMin         6.674818e+06
        dayOfWeek             3.703500e+04
        dtype: float64
        """
        if numeric_only == False:
            raise NotImplementedError("Only sum of numeric fields is implemented")
        return self._query_compiler.sum()
    def min(self, numeric_only=True):
        """
        Return the minimum value for each numeric column
        TODO - implement remainder of pandas arguments
        Returns
        -------
        pandas.Series
            min value for each numeric column
        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.min`
        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights')
        >>> df.min()
        AvgTicketPrice        100.020531
        Cancelled               0.000000
        DistanceKilometers      0.000000
        DistanceMiles           0.000000
        FlightDelay             0.000000
        FlightDelayMin          0.000000
        FlightTimeHour          0.000000
        FlightTimeMin           0.000000
        dayOfWeek               0.000000
        dtype: float64
        """
        if numeric_only == False:
-            raise NotImplementedError("Only sum of numeric fields is implemented")
+            raise NotImplementedError("Only min of numeric fields is implemented")
        return self._query_compiler.min()
    def max(self, numeric_only=True):
        """
        Return the maximum value for each numeric column
        TODO - implement remainder of pandas arguments
        Returns
        -------
        pandas.Series
            max value for each numeric column
        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.max`
        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights')
        >>> df.max()
        AvgTicketPrice         1199.729004
        Cancelled                 1.000000
        DistanceKilometers    19881.482422
        DistanceMiles         12353.780273
        FlightDelay               1.000000
        FlightDelayMin          360.000000
        FlightTimeHour           31.715034
        FlightTimeMin          1902.901978
        dayOfWeek                 6.000000
        dtype: float64
        """
        if numeric_only == False:
-            raise NotImplementedError("Only sum of numeric fields is implemented")
+            raise NotImplementedError("Only max of numeric fields is implemented")
        return self._query_compiler.max()
    def nunique(self):
        """
        Return cardinality of each field.
        **Note we can only do this for aggregatable Elasticsearch fields - (in general) numeric and keyword rather than text fields**
        This method will try and field aggregatable fields if possible if mapping has::
            "customer_first_name" : {
              "type" : "text",
              "fields" : {
                "keyword" : {
                  "type" : "keyword",
                  "ignore_above" : 256
                }
              }
            }
        we will aggregate ``customer_first_name`` columns using ``customer_first_name.keyword``.
        TODO - implement remainder of pandas arguments
        Returns
        -------
        pandas.Series
            cardinality of each column
        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.nunique`
        Examples
        --------
        >>> columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user']
        >>> df = ed.DataFrame('localhost', 'ecommerce', columns=columns)
        >>> df.nunique()
        category                6
        currency                1
        customer_birth_date     0
        customer_first_name    46
        user                   46
        dtype: int64
        """
        return self._query_compiler.nunique()
    def _hist(self, num_bins):
@ -341,7 +484,7 @@ class NDFrame:
        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.describe`
+        :pandas_api_docs:`pandas.DataFrame.describe`
        Examples
        --------
--- a/eland/operations.py
+++ b/eland/operations.py
@ -183,12 +183,13 @@ class Operations:
            raise NotImplementedError("Can not count field matches if size is set {}".format(size))
        columns = self.get_columns()
-        if columns is None:
+
-            columns = query_compiler._mappings.source_fields()
+        # Get just aggregatable columns
        aggregatable_columns = query_compiler._mappings.aggregatable_columns(columns)
        body = Query(query_params['query'])
-        for field in columns:
+        for field in aggregatable_columns.keys():
            body.metric_aggs(field, func, field)
        response = query_compiler._client.search(
@ -198,10 +199,10 @@ class Operations:
        results = {}
-        for field in columns:
+        for key, value in aggregatable_columns.items():
-            results[field] = response['aggregations'][field]['value']
+            results[value] = response['aggregations'][key]['value']
-        s = pd.Series(data=results, index=columns)
+        s = pd.Series(data=results, index=results.keys())
        return s
@ -845,16 +846,16 @@ class Operations:
    def info_es(self, buf):
        buf.write("Operations:\n")
-        buf.write("\ttasks: {0}\n".format(self._tasks))
+        buf.write(" tasks: {0}\n".format(self._tasks))
        query_params, post_processing = self._resolve_tasks()
        size, sort_params = Operations._query_params_to_size_and_sort(query_params)
        columns = self.get_columns()
-        buf.write("\tsize: {0}\n".format(size))
+        buf.write(" size: {0}\n".format(size))
-        buf.write("\tsort_params: {0}\n".format(sort_params))
+        buf.write(" sort_params: {0}\n".format(sort_params))
-        buf.write("\tcolumns: {0}\n".format(columns))
+        buf.write(" columns: {0}\n".format(columns))
-        buf.write("\tpost_processing: {0}\n".format(post_processing))
+        buf.write(" post_processing: {0}\n".format(post_processing))
    def update_query(self, boolean_filter):
        task = ('boolean_filter', boolean_filter)
--- a/eland/plotting.py
+++ b/eland/plotting.py
@ -10,7 +10,7 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
                  xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
                  sharey=False, figsize=None, layout=None, bins=10, **kwds):
    """
-    See :pandas_docs:`pandas.DataFrame.hist` for usage.
+    See :pandas_api_docs:`pandas.DataFrame.hist` for usage.
    Notes
    -----
--- a/eland/series.py
+++ b/eland/series.py
@ -215,3 +215,16 @@ class Series(NDFrame):
            return NotFilter(Equal(field=self.name, value=other))
        else:
            raise NotImplementedError(other, type(other))
    @property
    def ndim(self):
        """
        Returns 1 by definition of a Series1
        Returns
        -------
        int
            By definition 1
        """
        return 1
--- a/eland/tests/dataframe/test_keys_pytest.py
+++ b/eland/tests/dataframe/test_keys_pytest.py
@ -0,0 +1,26 @@
 # File called _pytest for PyCharm compatability
 from eland.tests.common import TestData
 from pandas.testing import assert_index_equal
 class TestDataFrameKeys(TestData):
    def test_ecommerce_keys(self):
        pd_ecommerce = self.pd_ecommerce()
        ed_ecommerce = self.ed_ecommerce()
        pd_keys = pd_ecommerce.keys()
        ed_keys = ed_ecommerce.keys()
        assert_index_equal(pd_keys, ed_keys)
    def test_flights_keys(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()
        pd_keys = pd_flights.keys()
        ed_keys = ed_flights.keys()
        assert_index_equal(pd_keys, ed_keys)
--- a/eland/tests/dataframe/test_metrics_pytest.py
+++ b/eland/tests/dataframe/test_metrics_pytest.py
@ -7,16 +7,16 @@ from eland.tests.common import TestData
 class TestDataFrameMetrics(TestData):
-    def test_to_mean(self):
+    def test_mean(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()
-        pd_mean = pd_flights.mean()
+        pd_mean = pd_flights.mean(numeric_only=True)
-        ed_mean = ed_flights.mean()
+        ed_mean = ed_flights.mean(numeric_only=True)
        assert_series_equal(pd_mean, ed_mean)
-    def test_to_sum(self):
+    def test_sum(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()
@ -25,7 +25,7 @@ class TestDataFrameMetrics(TestData):
        assert_series_equal(pd_sum, ed_sum)
-    def test_to_min(self):
+    def test_min(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()
@ -34,7 +34,7 @@ class TestDataFrameMetrics(TestData):
        assert_series_equal(pd_min, ed_min)
-    def test_to_max(self):
+    def test_max(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()
--- a/eland/tests/dataframe/test_nunique_pytest.py
+++ b/eland/tests/dataframe/test_nunique_pytest.py
@ -0,0 +1,33 @@
 # File called _pytest for PyCharm compatability
 import pandas as pd
 from pandas.util.testing import assert_series_equal
 from eland.tests.common import TestData
 class TestDataFrameNUnique(TestData):
    def test_flights_nunique(self):
        # Note pandas.nunique fails for dict columns (e.g. DestLocation)
        columns = ['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName']
        pd_flights = self.pd_flights()[columns]
        ed_flights = self.ed_flights()[columns]
        pd_nunique = pd_flights.nunique()
        ed_nunique = ed_flights.nunique()
        # TODO - ES is approximate counts so these aren't equal...
        #E[left]: [13059, 2, 4, 156, 156, 143]
        #E[right]: [13132, 2, 4, 156, 156, 143]
        #assert_series_equal(pd_nunique, ed_nunique)
    def test_ecommerce_nunique(self):
        columns = ['customer_first_name', 'customer_gender', 'day_of_week_i']
        pd_ecommerce = self.pd_ecommerce()[columns]
        ed_ecommerce = self.ed_ecommerce()[columns]
        pd_nunique = pd_ecommerce.nunique()
        ed_nunique = ed_ecommerce.nunique()
        assert_series_equal(pd_nunique, ed_nunique)
--- a/eland/tests/dataframe/test_query_pytest.py
+++ b/eland/tests/dataframe/test_query_pytest.py
@ -10,14 +10,14 @@ from eland.tests.common import assert_pandas_eland_frame_equal
 class TestDataFrameQuery(TestData):
-    def test_query(self):
+    def test_getitem_query(self):
        # Examples from:
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
        pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)},
                             index=['0', '1', '2', '3', '4'])
        # Now create index
-        index_name = 'eland_test_query1'
+        index_name = 'eland_test_query'
        ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
@ -42,3 +42,12 @@ class TestDataFrameQuery(TestData):
        ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)]
        assert_pandas_eland_frame_equal(pd_q4, ed_q4)
    def test_query(self):
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        #print(ed_flights.query('FlightDelayMin > 60').info_es())
        print(pd_flights.query('FlightDelayMin > 60').shape)
        print(ed_flights.query('FlightDelayMin > 60').shape)
--- a/eland/tests/dataframe/test_shape_pytest.py
+++ b/eland/tests/dataframe/test_shape_pytest.py
@ -5,7 +5,7 @@ from eland.tests.common import TestData
 class TestDataFrameShape(TestData):
-    def test_to_shape1(self):
+    def test_ecommerce_shape(self):
        pd_ecommerce = self.pd_ecommerce()
        ed_ecommerce = self.ed_ecommerce()
@ -14,7 +14,7 @@ class TestDataFrameShape(TestData):
        assert pd_shape == ed_shape
-    def test_to_shape2(self):
+    def test_flights_shape(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()
--- a/eland/tests/mappings/test_aggregatables_pytest.py
+++ b/eland/tests/mappings/test_aggregatables_pytest.py
@ -0,0 +1,72 @@
 # File called _pytest for PyCharm compatability
 from eland.tests.common import TestData
 class TestMappingsAggregatables(TestData):
    def test_ecommerce_all_aggregatables(self):
        ed_ecommerce = self.ed_ecommerce()
        aggregatables = ed_ecommerce._query_compiler._mappings.aggregatable_columns()
        expected = {'category.keyword': 'category',
         'currency': 'currency',
         'customer_birth_date': 'customer_birth_date',
         'customer_first_name.keyword': 'customer_first_name',
         'customer_full_name.keyword': 'customer_full_name',
         'customer_gender': 'customer_gender',
         'customer_id': 'customer_id',
         'customer_last_name.keyword': 'customer_last_name',
         'customer_phone': 'customer_phone',
         'day_of_week': 'day_of_week',
         'day_of_week_i': 'day_of_week_i',
         'email': 'email',
         'geoip.city_name': 'geoip.city_name',
         'geoip.continent_name': 'geoip.continent_name',
         'geoip.country_iso_code': 'geoip.country_iso_code',
         'geoip.location': 'geoip.location',
         'geoip.region_name': 'geoip.region_name',
         'manufacturer.keyword': 'manufacturer',
         'order_date': 'order_date',
         'order_id': 'order_id',
         'products._id.keyword': 'products._id',
         'products.base_price': 'products.base_price',
         'products.base_unit_price': 'products.base_unit_price',
         'products.category.keyword': 'products.category',
         'products.created_on': 'products.created_on',
         'products.discount_amount': 'products.discount_amount',
         'products.discount_percentage': 'products.discount_percentage',
         'products.manufacturer.keyword': 'products.manufacturer',
         'products.min_price': 'products.min_price',
         'products.price': 'products.price',
         'products.product_id': 'products.product_id',
         'products.product_name.keyword': 'products.product_name',
         'products.quantity': 'products.quantity',
         'products.sku': 'products.sku',
         'products.tax_amount': 'products.tax_amount',
         'products.taxful_price': 'products.taxful_price',
         'products.taxless_price': 'products.taxless_price',
         'products.unit_discount_amount': 'products.unit_discount_amount',
         'sku': 'sku',
         'taxful_total_price': 'taxful_total_price',
         'taxless_total_price': 'taxless_total_price',
         'total_quantity': 'total_quantity',
         'total_unique_products': 'total_unique_products',
         'type': 'type',
         'user': 'user'}
        assert expected == aggregatables
    def test_ecommerce_selected_aggregatables(self):
        ed_ecommerce = self.ed_ecommerce()
        expected = {'category.keyword': 'category',
                    'currency': 'currency',
                    'customer_birth_date': 'customer_birth_date',
                    'customer_first_name.keyword': 'customer_first_name',
                    'type': 'type', 'user': 'user'}
        aggregatables = ed_ecommerce._query_compiler._mappings.aggregatable_columns(expected.values())
        assert expected == aggregatables
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -2,5 +2,4 @@ elasticsearch>=7.0.5
 pandas==0.25.1
 matplotlib
 pytest>=5.2.1
 sphinx_rtd_theme
 numpydoc==0.8