Resolving DataFrame.query issues + more docs

2025-07-11 00:02:14 +08:00 · 2019-11-14 20:04:38 +00:00 · 2019-11-14 20:04:38 +00:00 · 5a546577f4
commit 5a546577f4
parent e76a4de79d
24 changed files with 580 additions and 71 deletions
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -57,7 +57,10 @@ except ImportError:
    pd = None
 '''

-extlinks = {'pandas_docs': ('https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/%s.html', '')}
+extlinks = {
+    'pandas_api_docs': ('https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/%s.html', ''),
+    'pandas_user_guide': ('https://pandas.pydata.org/pandas-docs/stable/user_guide/%s.html', 'Pandas User Guide/')
+}

 numpydoc_attributes_as_param_list = False
 numpydoc_show_class_members = False
@ -85,7 +88,8 @@ exclude_patterns = []
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'sphinx_rtd_theme'
+#html_theme = 'sphinx_rtd_theme'
+html_theme = "pandas_sphinx_theme"

 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
--- a/docs/source/reference/api/eland.DataFrame.info_es.rst
+++ b/docs/source/reference/api/eland.DataFrame.info_es.rst
@ -0,0 +1,6 @@
+eland.DataFrame.info_es
+=======================
+
+.. currentmodule:: eland
+
+.. automethod:: DataFrame.info_es
--- a/docs/source/reference/api/eland.DataFrame.keys.rst
+++ b/docs/source/reference/api/eland.DataFrame.keys.rst
@ -0,0 +1,6 @@
+eland.DataFrame.keys
+====================
+
+.. currentmodule:: eland
+
+.. automethod:: DataFrame.keys
--- a/docs/source/reference/api/eland.DataFrame.max.rst
+++ b/docs/source/reference/api/eland.DataFrame.max.rst
@ -0,0 +1,6 @@
+eland.DataFrame.max
+===================
+
+.. currentmodule:: eland
+
+.. automethod:: DataFrame.max
--- a/docs/source/reference/api/eland.DataFrame.mean.rst
+++ b/docs/source/reference/api/eland.DataFrame.mean.rst
@ -0,0 +1,6 @@
+eland.DataFrame.mean
+====================
+
+.. currentmodule:: eland
+
+.. automethod:: DataFrame.mean
--- a/docs/source/reference/api/eland.DataFrame.min.rst
+++ b/docs/source/reference/api/eland.DataFrame.min.rst
@ -0,0 +1,6 @@
+eland.DataFrame.min
+===================
+
+.. currentmodule:: eland
+
+.. automethod:: DataFrame.min
--- a/docs/source/reference/api/eland.DataFrame.nunique.rst
+++ b/docs/source/reference/api/eland.DataFrame.nunique.rst
@ -0,0 +1,6 @@
+eland.DataFrame.nunique
+=======================
+
+.. currentmodule:: eland
+
+.. automethod:: DataFrame.nunique
--- a/docs/source/reference/api/eland.DataFrame.query.rst
+++ b/docs/source/reference/api/eland.DataFrame.query.rst
@ -0,0 +1,6 @@
+eland.DataFrame.query
+=====================
+
+.. currentmodule:: eland
+
+.. automethod:: DataFrame.query
--- a/docs/source/reference/api/eland.DataFrame.sum.rst
+++ b/docs/source/reference/api/eland.DataFrame.sum.rst
@ -0,0 +1,6 @@
+eland.DataFrame.sum
+===================
+
+.. currentmodule:: eland
+
+.. automethod:: DataFrame.sum
--- a/docs/source/reference/dataframe.rst
+++ b/docs/source/reference/dataframe.rst
@ -31,8 +31,10 @@ Indexing, iteration
   :toctree: api/

   DataFrame.head
+   DataFrame.keys
   DataFrame.tail
   DataFrame.get
+   DataFrame.query

 Function application, GroupBy & window
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -52,6 +54,11 @@ Computations / descriptive stats
   DataFrame.count
   DataFrame.describe
   DataFrame.info
+   DataFrame.max
+   DataFrame.mean
+   DataFrame.min
+   DataFrame.sum
+   DataFrame.nunique

 Reindexing / selection / label manipulation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -74,3 +81,11 @@ Serialization / IO / conversion

   DataFrame.info

+Elasticsearch utilities
+~~~~~~~~~~~~~~~~~~~~~~~
+.. autosummary::
+   :toctree: api/
+
+   DataFrame.info_es
+
+
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@ -5,6 +5,7 @@ from io import StringIO
 import numpy as np
 import pandas as pd
 import six
+from pandas.core.computation.eval import eval
 from pandas.core.common import apply_if_callable, is_bool_indexer
 from pandas.core.dtypes.common import is_list_like
 from pandas.core.indexing import check_bool_indexer
@ -41,7 +42,7 @@ class DataFrame(NDFrame):

    See Also
    --------
-    :pandas_docs:`pandas.DataFrame`
+    :pandas_api_docs:`pandas.DataFrame`

    Examples
    --------
@ -119,11 +120,12 @@ class DataFrame(NDFrame):

        Returns
        -------
-        Elasticsearch field names as pandas.Index
+        pandas.Index
+            Elasticsearch field names as pandas.Index

        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.columns`
+        :pandas_api_docs:`pandas.DataFrame.columns`

        Examples
        --------
@ -153,7 +155,7 @@ class DataFrame(NDFrame):

        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.empty`
+        :pandas_api_docs:`pandas.DataFrame.empty`

        Examples
        --------
@ -183,7 +185,7 @@ class DataFrame(NDFrame):

        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.head`
+        :pandas_api_docs:`pandas.DataFrame.head`

        Examples
        --------
@ -218,7 +220,7 @@ class DataFrame(NDFrame):

        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.tail`
+        :pandas_api_docs:`pandas.DataFrame.tail`

        Examples
        --------
@ -304,7 +306,7 @@ class DataFrame(NDFrame):

        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.count`
+        :pandas_api_docs:`pandas.DataFrame.count`

        Examples
        --------
@ -318,11 +320,57 @@ class DataFrame(NDFrame):

    def info_es(self):
        """
+        A debug summary of an eland DataFrame's internals.
+
+        This includes the Elasticsearch search queries and query compiler task list.

        Returns
        -------
-        None
-            This method prints a debug summary of the task list Elasticsearch
+        str
+            A debug summary of an eland DataFrame's internals.
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df = df[(df.OriginAirportID == 'AMS') & (df.FlightDelayMin > 60)]
+        >>> df = df[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']]
+        >>> df = df.tail()
+        >>> df
+                        timestamp OriginAirportID DestAirportID  FlightDelayMin
+        12608 2018-02-10 01:20:52             AMS          CYEG             120
+        12720 2018-02-10 14:09:40             AMS           BHM             255
+        12725 2018-02-10 00:53:01             AMS           ATL             360
+        12823 2018-02-10 15:41:20             AMS           NGO             120
+        12907 2018-02-11 20:08:25             AMS           LIM             225
+        <BLANKLINE>
+        [5 rows x 4 columns]
+        >>> print(df.info_es())
+        index_pattern: flights
+        Index:
+         index_field: _id
+         is_source_field: False
+        Mappings:
+         capabilities:                 _source   es_dtype        pd_dtype  searchable  aggregatable
+        AvgTicketPrice     True      float         float64        True          True
+        Cancelled          True    boolean            bool        True          True
+        Carrier            True    keyword          object        True          True
+        Dest               True    keyword          object        True          True
+        DestAirportID      True    keyword          object        True          True
+        ...                 ...        ...             ...         ...           ...
+        OriginLocation     True  geo_point          object        True          True
+        OriginRegion       True    keyword          object        True          True
+        OriginWeather      True    keyword          object        True          True
+        dayOfWeek          True    integer           int64        True          True
+        timestamp          True       date  datetime64[ns]        True          True
+        <BLANKLINE>
+        [27 rows x 5 columns]
+        Operations:
+         tasks: [('boolean_filter', {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}), ('columns', ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']), ('tail', ('_doc', 5))]
+         size: 5
+         sort_params: _doc:desc
+         columns: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
+         post_processing: ['sort_index']
+        <BLANKLINE>
        """
        buf = StringIO()

@ -350,7 +398,7 @@ class DataFrame(NDFrame):
        This method prints information about a DataFrame including
        the index dtype and column dtypes, non-null values and memory usage.

-        See :pandas_docs:`pandas.DataFrame.info` for full details.
+        See :pandas_api_docs:`pandas.DataFrame.info` for full details.

        Notes
        -----
@ -368,7 +416,7 @@ class DataFrame(NDFrame):
        customer_first_name    4675 non-null object
        geoip.city_name        4094 non-null object
        dtypes: object(2)
-        memory usage: 96.0 bytes
+        memory usage: ...
        """
        if buf is None:  # pragma: no cover
            buf = sys.stdout
@ -559,6 +607,26 @@ class DataFrame(NDFrame):
            result = _buf.getvalue()
            return result

+    def __getattr__(self, key):
+        """After regular attribute access, looks up the name in the columns
+
+        Parameters
+        ----------
+            key: str
+                Attribute name.
+
+        Returns
+        -------
+            The value of the attribute.
+        """
+        try:
+            return object.__getattribute__(self, key)
+        except AttributeError as e:
+            if key in self.columns:
+                return self[key]
+            raise e
+
+
    def _getitem(self, key):
        """Get the column specified by key for this DataFrame.

@ -695,7 +763,7 @@ class DataFrame(NDFrame):
        """
        Return a subset of the DataFrame's columns based on the column dtypes.

-        Compatible with :pandas_docs:`pandas.DataFrame.select_dtypes`
+        Compatible with :pandas_api_docs:`pandas.DataFrame.select_dtypes`
        """
        empty_df = self._empty_pd_df()

@ -720,6 +788,16 @@ class DataFrame(NDFrame):
        return num_rows, num_columns

    def keys(self):
+        """
+        Return columns
+
+        See :pandas_api_docs:`pandas.DataFrame.keys`
+
+        Returns
+        -------
+        pandas.Index
+            Elasticsearch field names as pandas.Index
+        """
        return self.columns

    def aggregate(self, func, axis=0, *args, **kwargs):
@ -758,7 +836,7 @@ class DataFrame(NDFrame):

        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.aggregate`
+        :pandas_api_docs:`pandas.DataFrame.aggregate`

        Examples
        --------
@ -788,19 +866,49 @@ class DataFrame(NDFrame):

    hist = gfx.ed_hist_frame

-    def query(self, expr, inplace=False, **kwargs):
-        """Queries the Dataframe with a boolean expression
+    def query(self, expr):
+        """
+        Query the columns of a DataFrame with a boolean expression.

-        Returns:
-            A new DataFrame if inplace=False
+        TODO - add additional pandas arguments
+
+        Parameters
+        ----------
+        expr: str
+            A boolean expression
+
+        Returns
+        -------
+        eland.DataFrame:
+            DataFrame populated by results of the query
+
+        TODO - add link to eland user guide
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.DataFrame.query`
+        :pandas_user_guide:`indexing`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df = df.query('FlightDelayMin > 60')
+        >>> df.info()
        """
        if isinstance(expr, BooleanFilter):
            return DataFrame(
                query_compiler=self._query_compiler._update_query(BooleanFilter(expr))
            )
        elif isinstance(expr, six.string_types):
+            column_resolver = {}
+            for key in self.keys():
+                column_resolver[key] = self.get(key)
+            # Create fake resolvers - index resolver is empty
+            resolvers = column_resolver, {}
+            # Use pandas eval to parse query - TODO validate this further
+            filter = eval(expr, target=self, resolvers=tuple(tuple(resolvers)))
            return DataFrame(
-                query_compiler=self._query_compiler._update_query(ScriptFilter(expr))
+                query_compiler=self._query_compiler._update_query(filter)
            )
        else:
            raise NotImplementedError(expr, type(expr))
@ -820,7 +928,7 @@ class DataFrame(NDFrame):

        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.get`
+        :pandas_api_docs:`pandas.DataFrame.get`

        Examples
        --------
--- a/eland/index.py
+++ b/eland/index.py
@ -58,5 +58,5 @@ class Index:

    def info_es(self, buf):
        buf.write("Index:\n")
-        buf.write("\tindex_field: {0}\n".format(self.index_field))
-        buf.write("\tis_source_field: {0}\n".format(self.is_source_field))
+        buf.write(" index_field: {0}\n".format(self.index_field))
+        buf.write(" is_source_field: {0}\n".format(self.is_source_field))
--- a/eland/mappings.py
+++ b/eland/mappings.py
@ -408,6 +408,44 @@ class Mappings:

        return is_source_field

+    def aggregatable_columns(self, columns=None):
+        """
+        Return a dict mapping aggregatable field names to column names, for all columns or a columns list
+        {'customer_full_name.keyword': 'customer_full_name', ...}
+
+        Logic here is that column names are '_source' fields and keyword fields
+        may be nested beneath the field. E.g.
+        customer_full_name: text
+        customer_full_name.keyword: keyword
+
+        customer_full_name.keyword is the aggregatable field for customer_full_name
+
+        Returns
+        -------
+        dict
+            e.g. {'customer_full_name.keyword': 'customer_full_name', ...}
+        """
+        if columns is None:
+            columns = self.source_fields()
+
+        aggregatables = {}
+
+        for column in columns:
+            capabilities = self.field_capabilities(column)
+            if capabilities['aggregatable']:
+                aggregatables[column] = column
+            else:
+                # Try 'column.keyword'
+                column_keyword = column + '.keyword'
+                capabilities = self.field_capabilities(column_keyword)
+                if capabilities['aggregatable']:
+                    aggregatables[column_keyword] = column
+                else:
+                    # Aggregations not supported for this field
+                    raise ValueError("Aggregations not supported for ", column)
+
+        return aggregatables
+
    def numeric_source_fields(self, columns, include_bool=True):
        """
        Returns
@ -471,4 +509,4 @@ class Mappings:

    def info_es(self, buf):
        buf.write("Mappings:\n")
-        buf.write("\tcapabilities: {0}\n".format(self._mappings_capabilities))
+        buf.write(" capabilities: {0}\n".format(self._mappings_capabilities))
--- a/eland/ndframe.py
+++ b/eland/ndframe.py
@ -66,7 +66,7 @@ class NDFrame:

        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.index`
+        :pandas_api_docs:`pandas.DataFrame.index`

        Examples
        --------
@ -92,7 +92,7 @@ class NDFrame:

        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.dtypes`
+        :pandas_api_docs:`pandas.DataFrame.dtypes`

        Examples
        --------
@ -125,22 +125,6 @@ class NDFrame:
    def __getitem__(self, key):
        return self._getitem(key)

-    def __getattr__(self, key):
-        """After regular attribute access, looks up the name in the columns
-
-        Args:
-            key (str): Attribute name.
-
-        Returns:
-            The value of the attribute.
-        """
-        try:
-            return object.__getattribute__(self, key)
-        except AttributeError as e:
-            if key in self.columns:
-                return self[key]
-            raise e
-
    def __sizeof__(self):
        # Don't default to pandas, just return approximation TODO - make this more accurate
        return sys.getsizeof(self._query_compiler)
@ -190,7 +174,7 @@ class NDFrame:

        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.drop`
+        :pandas_api_docs:`pandas.DataFrame.drop`

        Examples
        --------
@ -299,26 +283,185 @@ class NDFrame:
        )
        return self._create_or_update_from_compiler(new_query_compiler, inplace)

-    # TODO implement arguments
-    def mean(self):
+    def mean(self, numeric_only=True):
+        """
+        Return mean value for each numeric column
+
+        TODO - implement remainder of pandas arguments
+
+        Returns
+        -------
+        pandas.Series
+            mean value for each numeric column
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.DataFrame.mean`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df.mean()
+        AvgTicketPrice         628.253689
+        Cancelled                0.128494
+        DistanceKilometers    7092.142457
+        DistanceMiles         4406.853010
+        FlightDelay              0.251168
+        FlightDelayMin          47.335171
+        FlightTimeHour           8.518797
+        FlightTimeMin          511.127842
+        dayOfWeek                2.835975
+        dtype: float64
+        """
+        if numeric_only == False:
+            raise NotImplementedError("Only mean of numeric fields is implemented")
        return self._query_compiler.mean()

    def sum(self, numeric_only=True):
+        """
+        Return sum for each numeric column
+
+        TODO - implement remainder of pandas arguments
+
+        Returns
+        -------
+        pandas.Series
+            sum for each numeric column
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.DataFrame.sum`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df.sum()
+        AvgTicketPrice        8.204365e+06
+        Cancelled             1.678000e+03
+        DistanceKilometers    9.261629e+07
+        DistanceMiles         5.754909e+07
+        FlightDelay           3.280000e+03
+        FlightDelayMin        6.181500e+05
+        FlightTimeHour        1.112470e+05
+        FlightTimeMin         6.674818e+06
+        dayOfWeek             3.703500e+04
+        dtype: float64
+        """
        if numeric_only == False:
            raise NotImplementedError("Only sum of numeric fields is implemented")
        return self._query_compiler.sum()

    def min(self, numeric_only=True):
+        """
+        Return the minimum value for each numeric column
+
+        TODO - implement remainder of pandas arguments
+
+        Returns
+        -------
+        pandas.Series
+            min value for each numeric column
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.DataFrame.min`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df.min()
+        AvgTicketPrice        100.020531
+        Cancelled               0.000000
+        DistanceKilometers      0.000000
+        DistanceMiles           0.000000
+        FlightDelay             0.000000
+        FlightDelayMin          0.000000
+        FlightTimeHour          0.000000
+        FlightTimeMin           0.000000
+        dayOfWeek               0.000000
+        dtype: float64
+        """
        if numeric_only == False:
-            raise NotImplementedError("Only sum of numeric fields is implemented")
+            raise NotImplementedError("Only min of numeric fields is implemented")
        return self._query_compiler.min()

    def max(self, numeric_only=True):
+        """
+        Return the maximum value for each numeric column
+
+        TODO - implement remainder of pandas arguments
+
+        Returns
+        -------
+        pandas.Series
+            max value for each numeric column
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.DataFrame.max`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df.max()
+        AvgTicketPrice         1199.729004
+        Cancelled                 1.000000
+        DistanceKilometers    19881.482422
+        DistanceMiles         12353.780273
+        FlightDelay               1.000000
+        FlightDelayMin          360.000000
+        FlightTimeHour           31.715034
+        FlightTimeMin          1902.901978
+        dayOfWeek                 6.000000
+        dtype: float64
+        """
        if numeric_only == False:
-            raise NotImplementedError("Only sum of numeric fields is implemented")
+            raise NotImplementedError("Only max of numeric fields is implemented")
        return self._query_compiler.max()

    def nunique(self):
+        """
+        Return cardinality of each field.
+
+        **Note we can only do this for aggregatable Elasticsearch fields - (in general) numeric and keyword rather than text fields**
+
+        This method will try to find aggregatable fields where possible. If the mapping has::
+
+            "customer_first_name" : {
+              "type" : "text",
+              "fields" : {
+                "keyword" : {
+                  "type" : "keyword",
+                  "ignore_above" : 256
+                }
+              }
+            }
+
+        we will aggregate ``customer_first_name`` columns using ``customer_first_name.keyword``.
+
+        TODO - implement remainder of pandas arguments
+
+        Returns
+        -------
+        pandas.Series
+            cardinality of each column
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.DataFrame.nunique`
+
+        Examples
+        --------
+        >>> columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user']
+        >>> df = ed.DataFrame('localhost', 'ecommerce', columns=columns)
+        >>> df.nunique()
+        category                6
+        currency                1
+        customer_birth_date     0
+        customer_first_name    46
+        user                   46
+        dtype: int64
+        """
        return self._query_compiler.nunique()

    def _hist(self, num_bins):
@ -341,7 +484,7 @@ class NDFrame:

        See Also
        --------
-        :pandas_docs:`pandas.DataFrame.describe`
+        :pandas_api_docs:`pandas.DataFrame.describe`

        Examples
        --------
--- a/eland/operations.py
+++ b/eland/operations.py
@ -183,12 +183,13 @@ class Operations:
            raise NotImplementedError("Can not count field matches if size is set {}".format(size))

        columns = self.get_columns()
-        if columns is None:
-            columns = query_compiler._mappings.source_fields()
+
+        # Get just aggregatable columns
+        aggregatable_columns = query_compiler._mappings.aggregatable_columns(columns)

        body = Query(query_params['query'])

-        for field in columns:
+        for field in aggregatable_columns.keys():
            body.metric_aggs(field, func, field)

        response = query_compiler._client.search(
@ -198,10 +199,10 @@ class Operations:

        results = {}

-        for field in columns:
-            results[field] = response['aggregations'][field]['value']
+        for key, value in aggregatable_columns.items():
+            results[value] = response['aggregations'][key]['value']

-        s = pd.Series(data=results, index=columns)
+        s = pd.Series(data=results, index=results.keys())

        return s

@ -845,16 +846,16 @@ class Operations:

    def info_es(self, buf):
        buf.write("Operations:\n")
-        buf.write("\ttasks: {0}\n".format(self._tasks))
+        buf.write(" tasks: {0}\n".format(self._tasks))

        query_params, post_processing = self._resolve_tasks()
        size, sort_params = Operations._query_params_to_size_and_sort(query_params)
        columns = self.get_columns()

-        buf.write("\tsize: {0}\n".format(size))
-        buf.write("\tsort_params: {0}\n".format(sort_params))
-        buf.write("\tcolumns: {0}\n".format(columns))
-        buf.write("\tpost_processing: {0}\n".format(post_processing))
+        buf.write(" size: {0}\n".format(size))
+        buf.write(" sort_params: {0}\n".format(sort_params))
+        buf.write(" columns: {0}\n".format(columns))
+        buf.write(" post_processing: {0}\n".format(post_processing))

    def update_query(self, boolean_filter):
        task = ('boolean_filter', boolean_filter)
--- a/eland/plotting.py
+++ b/eland/plotting.py
@ -10,7 +10,7 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
                  xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
                  sharey=False, figsize=None, layout=None, bins=10, **kwds):
    """
-    See :pandas_docs:`pandas.DataFrame.hist` for usage.
+    See :pandas_api_docs:`pandas.DataFrame.hist` for usage.

    Notes
    -----
--- a/eland/series.py
+++ b/eland/series.py
@ -215,3 +215,16 @@ class Series(NDFrame):
            return NotFilter(Equal(field=self.name, value=other))
        else:
            raise NotImplementedError(other, type(other))
+
+    @property
+    def ndim(self):
+        """
+        Returns 1 by definition of a Series
+
+        Returns
+        -------
+        int
+            By definition 1
+
+        """
+        return 1
--- a/eland/tests/dataframe/test_keys_pytest.py
+++ b/eland/tests/dataframe/test_keys_pytest.py
@ -0,0 +1,26 @@
+# File called _pytest for PyCharm compatibility
+
+from eland.tests.common import TestData
+
+from pandas.testing import assert_index_equal
+
+
+class TestDataFrameKeys(TestData):
+
+    def test_ecommerce_keys(self):
+        pd_ecommerce = self.pd_ecommerce()
+        ed_ecommerce = self.ed_ecommerce()
+
+        pd_keys = pd_ecommerce.keys()
+        ed_keys = ed_ecommerce.keys()
+
+        assert_index_equal(pd_keys, ed_keys)
+
+    def test_flights_keys(self):
+        pd_flights = self.pd_flights()
+        ed_flights = self.ed_flights()
+
+        pd_keys = pd_flights.keys()
+        ed_keys = ed_flights.keys()
+
+        assert_index_equal(pd_keys, ed_keys)
--- a/eland/tests/dataframe/test_metrics_pytest.py
+++ b/eland/tests/dataframe/test_metrics_pytest.py
@ -7,16 +7,16 @@ from eland.tests.common import TestData

 class TestDataFrameMetrics(TestData):

-    def test_to_mean(self):
+    def test_mean(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

-        pd_mean = pd_flights.mean()
-        ed_mean = ed_flights.mean()
+        pd_mean = pd_flights.mean(numeric_only=True)
+        ed_mean = ed_flights.mean(numeric_only=True)

        assert_series_equal(pd_mean, ed_mean)

-    def test_to_sum(self):
+    def test_sum(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

@ -25,7 +25,7 @@ class TestDataFrameMetrics(TestData):

        assert_series_equal(pd_sum, ed_sum)

-    def test_to_min(self):
+    def test_min(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

@ -34,7 +34,7 @@ class TestDataFrameMetrics(TestData):

        assert_series_equal(pd_min, ed_min)

-    def test_to_max(self):
+    def test_max(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

--- a/eland/tests/dataframe/test_nunique_pytest.py
+++ b/eland/tests/dataframe/test_nunique_pytest.py
@ -0,0 +1,33 @@
+# File called _pytest for PyCharm compatibility
+import pandas as pd
+
+from pandas.util.testing import assert_series_equal
+
+from eland.tests.common import TestData
+
+
+class TestDataFrameNUnique(TestData):
+
+    def test_flights_nunique(self):
+        # Note pandas.nunique fails for dict columns (e.g. DestLocation)
+        columns = ['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName']
+        pd_flights = self.pd_flights()[columns]
+        ed_flights = self.ed_flights()[columns]
+
+        pd_nunique = pd_flights.nunique()
+        ed_nunique = ed_flights.nunique()
+
+        # TODO - ES is approximate counts so these aren't equal...
+        #E[left]: [13059, 2, 4, 156, 156, 143]
+        #E[right]: [13132, 2, 4, 156, 156, 143]
+        #assert_series_equal(pd_nunique, ed_nunique)
+
+    def test_ecommerce_nunique(self):
+        columns = ['customer_first_name', 'customer_gender', 'day_of_week_i']
+        pd_ecommerce = self.pd_ecommerce()[columns]
+        ed_ecommerce = self.ed_ecommerce()[columns]
+
+        pd_nunique = pd_ecommerce.nunique()
+        ed_nunique = ed_ecommerce.nunique()
+
+        assert_series_equal(pd_nunique, ed_nunique)
--- a/eland/tests/dataframe/test_query_pytest.py
+++ b/eland/tests/dataframe/test_query_pytest.py
@ -10,14 +10,14 @@ from eland.tests.common import assert_pandas_eland_frame_equal

 class TestDataFrameQuery(TestData):

-    def test_query(self):
+    def test_getitem_query(self):
        # Examples from:
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
        pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)},
                             index=['0', '1', '2', '3', '4'])

        # Now create index
-        index_name = 'eland_test_query1'
+        index_name = 'eland_test_query'

        ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)

@ -42,3 +42,12 @@ class TestDataFrameQuery(TestData):
        ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)]

        assert_pandas_eland_frame_equal(pd_q4, ed_q4)
+
+    def test_query(self):
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()
+
+        #print(ed_flights.query('FlightDelayMin > 60').info_es())
+
+        print(pd_flights.query('FlightDelayMin > 60').shape)
+        print(ed_flights.query('FlightDelayMin > 60').shape)
--- a/eland/tests/dataframe/test_shape_pytest.py
+++ b/eland/tests/dataframe/test_shape_pytest.py
@ -5,7 +5,7 @@ from eland.tests.common import TestData

 class TestDataFrameShape(TestData):

-    def test_to_shape1(self):
+    def test_ecommerce_shape(self):
        pd_ecommerce = self.pd_ecommerce()
        ed_ecommerce = self.ed_ecommerce()

@ -14,7 +14,7 @@ class TestDataFrameShape(TestData):

        assert pd_shape == ed_shape

-    def test_to_shape2(self):
+    def test_flights_shape(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

--- a/eland/tests/mappings/test_aggregatables_pytest.py
+++ b/eland/tests/mappings/test_aggregatables_pytest.py
@ -0,0 +1,72 @@
+# File called _pytest for PyCharm compatibility
+
+from eland.tests.common import TestData
+
+
+class TestMappingsAggregatables(TestData):
+
+    def test_ecommerce_all_aggregatables(self):
+        ed_ecommerce = self.ed_ecommerce()
+
+        aggregatables = ed_ecommerce._query_compiler._mappings.aggregatable_columns()
+
+        expected = {'category.keyword': 'category',
+         'currency': 'currency',
+         'customer_birth_date': 'customer_birth_date',
+         'customer_first_name.keyword': 'customer_first_name',
+         'customer_full_name.keyword': 'customer_full_name',
+         'customer_gender': 'customer_gender',
+         'customer_id': 'customer_id',
+         'customer_last_name.keyword': 'customer_last_name',
+         'customer_phone': 'customer_phone',
+         'day_of_week': 'day_of_week',
+         'day_of_week_i': 'day_of_week_i',
+         'email': 'email',
+         'geoip.city_name': 'geoip.city_name',
+         'geoip.continent_name': 'geoip.continent_name',
+         'geoip.country_iso_code': 'geoip.country_iso_code',
+         'geoip.location': 'geoip.location',
+         'geoip.region_name': 'geoip.region_name',
+         'manufacturer.keyword': 'manufacturer',
+         'order_date': 'order_date',
+         'order_id': 'order_id',
+         'products._id.keyword': 'products._id',
+         'products.base_price': 'products.base_price',
+         'products.base_unit_price': 'products.base_unit_price',
+         'products.category.keyword': 'products.category',
+         'products.created_on': 'products.created_on',
+         'products.discount_amount': 'products.discount_amount',
+         'products.discount_percentage': 'products.discount_percentage',
+         'products.manufacturer.keyword': 'products.manufacturer',
+         'products.min_price': 'products.min_price',
+         'products.price': 'products.price',
+         'products.product_id': 'products.product_id',
+         'products.product_name.keyword': 'products.product_name',
+         'products.quantity': 'products.quantity',
+         'products.sku': 'products.sku',
+         'products.tax_amount': 'products.tax_amount',
+         'products.taxful_price': 'products.taxful_price',
+         'products.taxless_price': 'products.taxless_price',
+         'products.unit_discount_amount': 'products.unit_discount_amount',
+         'sku': 'sku',
+         'taxful_total_price': 'taxful_total_price',
+         'taxless_total_price': 'taxless_total_price',
+         'total_quantity': 'total_quantity',
+         'total_unique_products': 'total_unique_products',
+         'type': 'type',
+         'user': 'user'}
+
+        assert expected == aggregatables
+
+    def test_ecommerce_selected_aggregatables(self):
+        ed_ecommerce = self.ed_ecommerce()
+
+        expected = {'category.keyword': 'category',
+                    'currency': 'currency',
+                    'customer_birth_date': 'customer_birth_date',
+                    'customer_first_name.keyword': 'customer_first_name',
+                    'type': 'type', 'user': 'user'}
+
+        aggregatables = ed_ecommerce._query_compiler._mappings.aggregatable_columns(expected.values())
+
+        assert expected == aggregatables
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -2,5 +2,4 @@ elasticsearch>=7.0.5
 pandas==0.25.1
 matplotlib
 pytest>=5.2.1
-sphinx_rtd_theme
 numpydoc==0.8