Feature/refactor tasks (#83)

* Significant refactor of task list in operations.py

Classes based on composite pattern replace tuples for
tasks.

* Addressing review comments for eland/operations.py

* Minor update to review fixes

* Minor fix for some better handling of non-aggregatable fields: https://github.com/elastic/eland/issues/71

* Test for non-aggregatable value_counts

* Refactoring tasks/actions

* Removing debug and fixing doctest
This commit is contained in:
stevedodson 2019-12-06 08:46:43 +00:00 committed by GitHub
parent f263e21b8a
commit f06219f0ec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 625 additions and 445 deletions

View File

@ -1 +1 @@
include LICENSE include LICENSE.txt

View File

@ -34,14 +34,15 @@ max 400140.000000 246.000000 5.000000
## Connecting to Elasticsearch Cloud ## Connecting to Elasticsearch Cloud
```python ```
>>> import eland as ed
>>> from elasticsearch import Elasticsearch
>>> es = Elasticsearch(cloud_id="<cloud_id>", http_auth=('<user>','<password>')) >>> es = Elasticsearch(cloud_id="<cloud_id>", http_auth=('<user>','<password>'))
>>> es.info() >>> es.info()
{'name': 'instance-0000000000', 'cluster_name': 'bf900cfce5684a81bca0be0cce5913bc', 'cluster_uuid': 'xLPvrV3jQNeadA7oM4l1jA', 'version': {'number': '7.4.2', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '2f90bbf7b93631e52bafb59b3b049cb44ec25e96', 'build_date': '2019-10-28T20:40:44.881551Z', 'build_snapshot': False, 'lucene_version': '8.2.0', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'} {'name': 'instance-0000000000', 'cluster_name': 'bf900cfce5684a81bca0be0cce5913bc', 'cluster_uuid': 'xLPvrV3jQNeadA7oM4l1jA', 'version': {'number': '7.4.2', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '2f90bbf7b93631e52bafb59b3b049cb44ec25e96', 'build_date': '2019-10-28T20:40:44.881551Z', 'build_snapshot': False, 'lucene_version': '8.2.0', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}
>>> import eland as ed
>>> df = ed.read_es(es, 'reviews') >>> df = ed.read_es(es, 'reviews')
``` ```

87
eland/actions.py Normal file
View File

@ -0,0 +1,87 @@
from abc import ABC, abstractmethod
# -------------------------------------------------------------------------------------------------------------------- #
# PostProcessingActions #
# -------------------------------------------------------------------------------------------------------------------- #
class PostProcessingAction(ABC):
    """
    Base class for client-side actions applied to a pandas DataFrame
    after Elasticsearch query results have been collected.

    Parameters
    ----------
    action_type: str
        The action type (e.g. sort_index, head etc.)
    """

    def __init__(self, action_type):
        self._action_type = action_type

    @property
    def type(self):
        # The action type string supplied by the concrete subclass.
        return self._action_type

    @abstractmethod
    def resolve_action(self, df):
        """Apply this action to ``df`` and return the resulting DataFrame."""
        raise NotImplementedError

    @abstractmethod
    def __repr__(self):
        raise NotImplementedError
class SortIndexAction(PostProcessingAction):
    """Post-processing action that sorts the DataFrame by its index."""

    def __init__(self):
        super().__init__("sort_index")

    def __repr__(self):
        return "('{}')".format(self.type)

    def resolve_action(self, df):
        # Delegate straight to pandas; no parameters to apply.
        return df.sort_index()
class HeadAction(PostProcessingAction):
    """Post-processing action that keeps only the first ``count`` rows."""

    def __init__(self, count):
        super().__init__("head")
        self._count = count

    def __repr__(self):
        return "('{}': ('count': {}))".format(self.type, self._count)

    def resolve_action(self, df):
        # pandas head() tolerates count larger than len(df).
        return df.head(self._count)
class TailAction(PostProcessingAction):
    """Post-processing action that keeps only the last ``count`` rows."""

    def __init__(self, count):
        super().__init__("tail")
        self._count = count

    def __repr__(self):
        return "('{}': ('count': {}))".format(self.type, self._count)

    def resolve_action(self, df):
        # pandas tail() tolerates count larger than len(df).
        return df.tail(self._count)
class SortFieldAction(PostProcessingAction):
    """
    Post-processing action that sorts the DataFrame by a single field.

    Parameters
    ----------
    sort_params_string: str
        Elasticsearch-style sort clause "<field>:<order>", e.g. "_doc:desc".

    Raises
    ------
    ValueError
        If ``sort_params_string`` is None or not of the form "<field>:<order>".
    """

    def __init__(self, sort_params_string):
        super().__init__("sort_field")

        if sort_params_string is None:
            raise ValueError("Expected valid string")

        # Split string
        sort_params = sort_params_string.split(":")
        if len(sort_params) != 2:
            raise ValueError("Expected ES sort params string (e.g. _doc:desc). Got '{}'".format(sort_params_string))

        # BUG FIX: the original referenced Operations.SortOrder, but
        # 'Operations' is never imported in this module and SortOrder now
        # lives in eland.tasks. Import locally (not at module level) because
        # eland.tasks imports from eland.actions — a top-level import would
        # be circular.
        from eland.tasks import SortOrder

        self._sort_field = sort_params[0]
        self._sort_order = SortOrder.from_string(sort_params[1])

    def resolve_action(self, df):
        from eland.tasks import SortOrder  # local import; see __init__

        # BUG FIX: 'ascending' must be passed by keyword — the second
        # positional argument of DataFrame.sort_values is 'axis'.
        if self._sort_order == SortOrder.ASC:
            return df.sort_values(self._sort_field, ascending=True)
        return df.sort_values(self._sort_field, ascending=False)

    def __repr__(self):
        return "('{}': ('sort_field': '{}', 'sort_order': {}))".format(self.type, self._sort_field, self._sort_order)

View File

@ -385,12 +385,14 @@ class DataFrame(NDFrame):
<BLANKLINE> <BLANKLINE>
[27 rows x 5 columns] [27 rows x 5 columns]
Operations: Operations:
tasks: [('boolean_filter', {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}), ('field_names', ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']), ('tail', ('_doc', 5))] tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}})), ('tail': ('sort_field': '_doc', 'count': 5))]
size: 5 size: 5
sort_params: _doc:desc sort_params: _doc:desc
_source: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin'] _source: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
body: {'query': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}, 'aggs': {}} body: {'query': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}, 'aggs': {}}
post_processing: ['sort_index'] post_processing: [('sort_index')]
'field_to_display_names': {}
'display_to_field_names': {}
<BLANKLINE> <BLANKLINE>
""" """
buf = StringIO() buf = StringIO()

View File

@ -1,7 +1,7 @@
""" """
NDFrame NDFrame
--------- ---------
Base class for eland.DataFrame and eland.Series. Abstract base class for eland.DataFrame and eland.Series.
The underlying data resides in Elasticsearch and the API aligns as much as The underlying data resides in Elasticsearch and the API aligns as much as
possible with pandas APIs. possible with pandas APIs.
@ -24,6 +24,7 @@ only Elasticsearch aggregatable fields can be aggregated or grouped.
""" """
import sys import sys
from abc import ABC
import pandas as pd import pandas as pd
from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.common import is_list_like
@ -32,7 +33,7 @@ from pandas.util._validators import validate_bool_kwarg
from eland import ElandQueryCompiler from eland import ElandQueryCompiler
class NDFrame: class NDFrame(ABC):
def __init__(self, def __init__(self,
client=None, client=None,

View File

@ -1,11 +1,12 @@
import copy import copy
from enum import Enum
import numpy as np
import pandas as pd import pandas as pd
from eland import Index from eland import Index
from eland import Query from eland import Query
from eland.actions import SortFieldAction
from eland.tasks import HeadTask, TailTask, BooleanFilterTask, ArithmeticOpFieldsTask, QueryTermsTask, \
QueryIdsTask, SortOrder, SizeTask
class Operations: class Operations:
@ -20,87 +21,46 @@ class Operations:
This is maintained as a 'task graph' (inspired by dask) This is maintained as a 'task graph' (inspired by dask)
(see https://docs.dask.org/en/latest/spec.html) (see https://docs.dask.org/en/latest/spec.html)
""" """
def __init__(self, tasks=None, field_names=None):
class SortOrder(Enum):
ASC = 0
DESC = 1
@staticmethod
def reverse(order):
if order == Operations.SortOrder.ASC:
return Operations.SortOrder.DESC
return Operations.SortOrder.ASC
@staticmethod
def to_string(order):
if order == Operations.SortOrder.ASC:
return "asc"
return "desc"
@staticmethod
def from_string(order):
if order == "asc":
return Operations.SortOrder.ASC
return Operations.SortOrder.DESC
def __init__(self, tasks=None):
if tasks is None: if tasks is None:
self._tasks = [] self._tasks = []
else: else:
self._tasks = tasks self._tasks = tasks
self._field_names = field_names
def __constructor__(self, *args, **kwargs): def __constructor__(self, *args, **kwargs):
return type(self)(*args, **kwargs) return type(self)(*args, **kwargs)
def copy(self): def copy(self):
return self.__constructor__(tasks=copy.deepcopy(self._tasks)) return self.__constructor__(tasks=copy.deepcopy(self._tasks), field_names=copy.deepcopy(self._field_names))
def head(self, index, n): def head(self, index, n):
# Add a task that is an ascending sort with size=n # Add a task that is an ascending sort with size=n
task = ('head', (index.sort_field, n)) task = HeadTask(index.sort_field, n)
self._tasks.append(task) self._tasks.append(task)
def tail(self, index, n): def tail(self, index, n):
# Add a task that is descending sort with size=n # Add a task that is descending sort with size=n
task = ('tail', (index.sort_field, n)) task = TailTask(index.sort_field, n)
self._tasks.append(task) self._tasks.append(task)
def arithmetic_op_fields(self, field_name, op_name, left_field, right_field, op_type=None): def arithmetic_op_fields(self, field_name, op_name, left_field, right_field, op_type=None):
if op_type:
task = ('arithmetic_op_fields', (field_name, (op_name, (left_field, right_field))), op_type)
else:
task = ('arithmetic_op_fields', (field_name, (op_name, (left_field, right_field))))
# Set this as a column we want to retrieve # Set this as a column we want to retrieve
self.set_field_names([field_name]) self.set_field_names([field_name])
task = ArithmeticOpFieldsTask(field_name, op_name, left_field, right_field, op_type)
self._tasks.append(task) self._tasks.append(task)
def set_field_names(self, field_names): def set_field_names(self, field_names):
# Setting field_names at different phases of the task list may result in different
# operations. So instead of setting field_names once, set when it happens in call chain
if not isinstance(field_names, list): if not isinstance(field_names, list):
field_names = list(field_names) field_names = list(field_names)
# TODO - field_name renaming self._field_names = field_names
# TODO - validate we are setting field_names to a subset of last field_names?
task = ('field_names', field_names) return self._field_names
self._tasks.append(task)
# Iterate backwards through task list looking for last 'field_names' task
for task in reversed(self._tasks):
if task[0] == 'field_names':
return task[1]
return None
def get_field_names(self): def get_field_names(self):
# Iterate backwards through task list looking for last 'field_names' task return self._field_names
for task in reversed(self._tasks):
if task[0] == 'field_names':
return task[1]
return None
def __repr__(self): def __repr__(self):
return repr(self._tasks) return repr(self._tasks)
@ -248,7 +208,9 @@ class Operations:
results = {} results = {}
for key, value in aggregatable_field_names.items(): for key in aggregatable_field_names.keys():
# key is aggregatable field, value is label
# e.g. key=category.keyword, value=category
for bucket in response['aggregations'][key]['buckets']: for bucket in response['aggregations'][key]['buckets']:
results[bucket['key']] = bucket['doc_count'] results[bucket['key']] = bucket['doc_count']
@ -597,7 +559,7 @@ class Operations:
_source=field_names) _source=field_names)
# create post sort # create post sort
if sort_params is not None: if sort_params is not None:
post_processing.append(self._sort_params_to_postprocessing(sort_params)) post_processing.append(SortFieldAction(sort_params))
if is_scan: if is_scan:
while True: while True:
@ -611,11 +573,6 @@ class Operations:
df = self._apply_df_post_processing(df, post_processing) df = self._apply_df_post_processing(df, post_processing)
collector.collect(df) collector.collect(df)
def iloc(self, index, field_names):
# index and field_names are indexers
task = ('iloc', (index, field_names))
self._tasks.append(task)
def index_count(self, query_compiler, field): def index_count(self, query_compiler, field):
# field is the index field so count values # field is the index field so count values
query_params, post_processing = self._resolve_tasks() query_params, post_processing = self._resolve_tasks()
@ -671,28 +628,16 @@ class Operations:
# b not in ['a','b','c'] # b not in ['a','b','c']
# For now use term queries # For now use term queries
if field == Index.ID_INDEX_FIELD: if field == Index.ID_INDEX_FIELD:
task = ('query_ids', ('must_not', items)) task = QueryIdsTask(False, items)
else: else:
task = ('query_terms', ('must_not', (field, items))) task = QueryTermsTask(False, field, items)
self._tasks.append(task) self._tasks.append(task)
@staticmethod
def _sort_params_to_postprocessing(input):
# Split string
sort_params = input.split(":")
query_sort_field = sort_params[0]
query_sort_order = Operations.SortOrder.from_string(sort_params[1])
task = ('sort_field', (query_sort_field, query_sort_order))
return task
@staticmethod @staticmethod
def _query_params_to_size_and_sort(query_params): def _query_params_to_size_and_sort(query_params):
sort_params = None sort_params = None
if query_params['query_sort_field'] and query_params['query_sort_order']: if query_params['query_sort_field'] and query_params['query_sort_order']:
sort_params = query_params['query_sort_field'] + ":" + Operations.SortOrder.to_string( sort_params = query_params['query_sort_field'] + ":" + SortOrder.to_string(
query_params['query_sort_order']) query_params['query_sort_order'])
size = query_params['query_size'] size = query_params['query_size']
@ -703,37 +648,16 @@ class Operations:
def _count_post_processing(post_processing): def _count_post_processing(post_processing):
size = None size = None
for action in post_processing: for action in post_processing:
if action[0] == 'head' or action[0] == 'tail': if isinstance(action, SizeTask):
if size is None or action[1][1] < size: if size is None or action.size() < size:
size = action[1][1] size = action.size()
return size return size
@staticmethod @staticmethod
def _apply_df_post_processing(df, post_processing): def _apply_df_post_processing(df, post_processing):
for action in post_processing: for action in post_processing:
if action == 'sort_index': df = action.resolve_action(df)
df = df.sort_index()
elif action[0] == 'head':
df = df.head(action[1][1])
elif action[0] == 'tail':
df = df.tail(action[1][1])
elif action[0] == 'sort_field':
sort_field = action[1][0]
sort_order = action[1][1]
if sort_order == Operations.SortOrder.ASC:
df = df.sort_values(sort_field, True)
else:
df = df.sort_values(sort_field, False)
elif action[0] == 'iloc':
index_indexer = action[1][0]
field_name_indexer = action[1][1]
if index_indexer is None:
index_indexer = slice(None)
if field_name_indexer is None:
field_name_indexer = slice(None)
df = df.iloc[index_indexer, field_name_indexer]
# field_names could be in here (and we ignore it)
return df return df
@ -752,337 +676,7 @@ class Operations:
post_processing = [] post_processing = []
for task in self._tasks: for task in self._tasks:
if task[0] == 'head': query_params, post_processing = task.resolve_task(query_params, post_processing)
query_params, post_processing = self._resolve_head(task, query_params, post_processing)
elif task[0] == 'tail':
query_params, post_processing = self._resolve_tail(task, query_params, post_processing)
elif task[0] == 'iloc':
query_params, post_processing = self._resolve_iloc(task, query_params, post_processing)
elif task[0] == 'query_ids':
query_params, post_processing = self._resolve_query_ids(task, query_params, post_processing)
elif task[0] == 'query_terms':
query_params, post_processing = self._resolve_query_terms(task, query_params, post_processing)
elif task[0] == 'boolean_filter':
query_params, post_processing = self._resolve_boolean_filter(task, query_params, post_processing)
elif task[0] == 'arithmetic_op_fields':
query_params, post_processing = self._resolve_arithmetic_op_fields(task, query_params, post_processing)
else: # a lot of operations simply post-process the dataframe - put these straight through
query_params, post_processing = self._resolve_post_processing_task(task, query_params, post_processing)
return query_params, post_processing
@staticmethod
def _resolve_head(item, query_params, post_processing):
# head - sort asc, size n
# |12345-------------|
query_sort_field = item[1][0]
query_sort_order = Operations.SortOrder.ASC
query_size = item[1][1]
# If we are already postprocessing the query results, we just get 'head' of these
# (note, currently we just append another head, we don't optimise by
# overwriting previous head)
if len(post_processing) > 0:
post_processing.append(item)
return query_params, post_processing
if query_params['query_sort_field'] is None:
query_params['query_sort_field'] = query_sort_field
# if it is already sorted we use existing field
if query_params['query_sort_order'] is None:
query_params['query_sort_order'] = query_sort_order
# if it is already sorted we get head of existing order
if query_params['query_size'] is None:
query_params['query_size'] = query_size
else:
# truncate if head is smaller
if query_size < query_params['query_size']:
query_params['query_size'] = query_size
return query_params, post_processing
@staticmethod
def _resolve_tail(item, query_params, post_processing):
# tail - sort desc, size n, post-process sort asc
# |-------------12345|
query_sort_field = item[1][0]
query_sort_order = Operations.SortOrder.DESC
query_size = item[1][1]
# If this is a tail of a tail adjust settings and return
if query_params['query_size'] is not None and \
query_params['query_sort_order'] == query_sort_order and \
post_processing == ['sort_index']:
if query_size < query_params['query_size']:
query_params['query_size'] = query_size
return query_params, post_processing
# If we are already postprocessing the query results, just get 'tail' of these
# (note, currently we just append another tail, we don't optimise by
# overwriting previous tail)
if len(post_processing) > 0:
post_processing.append(item)
return query_params, post_processing
# If results are already constrained, just get 'tail' of these
# (note, currently we just append another tail, we don't optimise by
# overwriting previous tail)
if query_params['query_size'] is not None:
post_processing.append(item)
return query_params, post_processing
else:
query_params['query_size'] = query_size
if query_params['query_sort_field'] is None:
query_params['query_sort_field'] = query_sort_field
if query_params['query_sort_order'] is None:
query_params['query_sort_order'] = query_sort_order
else:
# reverse sort order
query_params['query_sort_order'] = Operations.SortOrder.reverse(query_sort_order)
post_processing.append('sort_index')
return query_params, post_processing
@staticmethod
def _resolve_iloc(item, query_params, post_processing):
# tail - sort desc, size n, post-process sort asc
# |---4--7-9---------|
# This is a list of items we return via an integer index
int_index = item[1][0]
if int_index is not None:
last_item = int_index.max()
# If we have a query_size we do this post processing
if query_params['query_size'] is not None:
post_processing.append(item)
return query_params, post_processing
# size should be > last item
query_params['query_size'] = last_item + 1
post_processing.append(item)
return query_params, post_processing
@staticmethod
def _resolve_query_ids(item, query_params, post_processing):
# task = ('query_ids', ('must_not', items))
must_clause = item[1][0]
ids = item[1][1]
if must_clause == 'must':
query_params['query'].ids(ids, must=True)
else:
query_params['query'].ids(ids, must=False)
return query_params, post_processing
@staticmethod
def _resolve_query_terms(item, query_params, post_processing):
# task = ('query_terms', ('must_not', (field, terms)))
must_clause = item[1][0]
field = item[1][1][0]
terms = item[1][1][1]
if must_clause == 'must':
query_params['query'].terms(field, terms, must=True)
else:
query_params['query'].terms(field, terms, must=False)
return query_params, post_processing
@staticmethod
def _resolve_boolean_filter(item, query_params, post_processing):
# task = ('boolean_filter', object)
boolean_filter = item[1]
query_params['query'].update_boolean_filter(boolean_filter)
return query_params, post_processing
def _resolve_arithmetic_op_fields(self, item, query_params, post_processing):
# task = ('arithmetic_op_fields', (field_name, (op_name, (left_field, right_field))))
field_name = item[1][0]
op_name = item[1][1][0]
left_field = item[1][1][1][0]
right_field = item[1][1][1][1]
try:
op_type = item[2]
except IndexError:
op_type = None
# https://www.elastic.co/guide/en/elasticsearch/painless/current/painless-api-reference-shared-java-lang.html#painless-api-reference-shared-Math
if not op_type:
if isinstance(left_field, str) and isinstance(right_field, str):
"""
(if op_name = '__truediv__')
"script_fields": {
"field_name": {
"script": {
"source": "doc[left_field].value / doc[right_field].value"
}
}
}
"""
if op_name == '__add__':
source = "doc['{0}'].value + doc['{1}'].value".format(left_field, right_field)
elif op_name == '__truediv__':
source = "doc['{0}'].value / doc['{1}'].value".format(left_field, right_field)
elif op_name == '__floordiv__':
source = "Math.floor(doc['{0}'].value / doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__pow__':
source = "Math.pow(doc['{0}'].value, doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__mod__':
source = "doc['{0}'].value % doc['{1}'].value".format(left_field, right_field)
elif op_name == '__mul__':
source = "doc['{0}'].value * doc['{1}'].value".format(left_field, right_field)
elif op_name == '__sub__':
source = "doc['{0}'].value - doc['{1}'].value".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {}
query_params['query_script_fields'][field_name] = {
'script': {
'source': source
}
}
elif isinstance(left_field, str) and np.issubdtype(np.dtype(type(right_field)), np.number):
"""
(if op_name = '__truediv__')
"script_fields": {
"field_name": {
"script": {
"source": "doc[left_field].value / right_field"
}
}
}
"""
if op_name == '__add__':
source = "doc['{0}'].value + {1}".format(left_field, right_field)
elif op_name == '__truediv__':
source = "doc['{0}'].value / {1}".format(left_field, right_field)
elif op_name == '__floordiv__':
source = "Math.floor(doc['{0}'].value / {1})".format(left_field, right_field)
elif op_name == '__pow__':
source = "Math.pow(doc['{0}'].value, {1})".format(left_field, right_field)
elif op_name == '__mod__':
source = "doc['{0}'].value % {1}".format(left_field, right_field)
elif op_name == '__mul__':
source = "doc['{0}'].value * {1}".format(left_field, right_field)
elif op_name == '__sub__':
source = "doc['{0}'].value - {1}".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
elif np.issubdtype(np.dtype(type(left_field)), np.number) and isinstance(right_field, str):
"""
(if op_name = '__truediv__')
"script_fields": {
"field_name": {
"script": {
"source": "left_field / doc['right_field'].value"
}
}
}
"""
if op_name == '__add__':
source = "{0} + doc['{1}'].value".format(left_field, right_field)
elif op_name == '__truediv__':
source = "{0} / doc['{1}'].value".format(left_field, right_field)
elif op_name == '__floordiv__':
source = "Math.floor({0} / doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__pow__':
source = "Math.pow({0}, doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__mod__':
source = "{0} % doc['{1}'].value".format(left_field, right_field)
elif op_name == '__mul__':
source = "{0} * doc['{1}'].value".format(left_field, right_field)
elif op_name == '__sub__':
source = "{0} - doc['{1}'].value".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
else:
raise TypeError("Types for operation inconsistent {} {} {}", type(left_field), type(right_field), op_name)
elif op_type[0] == "string":
# we need to check the type of string addition
if op_type[1] == "s":
"""
(if op_name = '__add__')
"script_fields": {
"field_name": {
"script": {
"source": "doc[left_field].value + doc[right_field].value"
}
}
}
"""
if op_name == '__add__':
source = "doc['{0}'].value + doc['{1}'].value".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
elif op_type[1] == "r":
if isinstance(left_field, str) and isinstance(right_field, str):
"""
(if op_name = '__add__')
"script_fields": {
"field_name": {
"script": {
"source": "doc[left_field].value + right_field"
}
}
}
"""
if op_name == '__add__':
source = "doc['{0}'].value + '{1}'".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
elif op_type[1] == 'l':
if isinstance(left_field, str) and isinstance(right_field, str):
"""
(if op_name = '__add__')
"script_fields": {
"field_name": {
"script": {
"source": "left_field + doc[right_field].value"
}
}
}
"""
if op_name == '__add__':
source = "'{0}' + doc['{1}'].value".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {}
query_params['query_script_fields'][field_name] = {
'script': {
'source': source
}
}
return query_params, post_processing
@staticmethod
def _resolve_post_processing_task(item, query_params, post_processing):
# Just do this in post-processing
if item[0] != 'field_names':
post_processing.append(item)
return query_params, post_processing return query_params, post_processing
@ -1121,5 +715,5 @@ class Operations:
buf.write(" post_processing: {0}\n".format(post_processing)) buf.write(" post_processing: {0}\n".format(post_processing))
def update_query(self, boolean_filter): def update_query(self, boolean_filter):
task = ('boolean_filter', boolean_filter) task = BooleanFilterTask(boolean_filter)
self._tasks.append(task) self._tasks.append(task)

View File

@ -461,6 +461,7 @@ class ElandQueryCompiler:
self._index.info_es(buf) self._index.info_es(buf)
self._mappings.info_es(buf) self._mappings.info_es(buf)
self._operations.info_es(buf) self._operations.info_es(buf)
self._name_mapper.info_es(buf)
def describe(self): def describe(self):
return self._operations.describe(self) return self._operations.describe(self)
@ -548,7 +549,7 @@ class ElandQueryCompiler:
else: else:
raise ValueError( raise ValueError(
"Can not perform arithmetic operations on non aggregatable fields" "Can not perform arithmetic operations on non aggregatable fields"
"One of [{}, {}] is not aggregatable.".format(self.name, right.name) "One of [{}, {}] is not aggregatable.".format(self_field, right_field)
) )
def arithmetic_op_fields(self, new_field_name, op, left_field, right_field, op_type=None): def arithmetic_op_fields(self, new_field_name, op, left_field, right_field, op_type=None):
@ -648,6 +649,9 @@ class ElandQueryCompiler:
display_to_field_names=self._display_to_field_names.copy() display_to_field_names=self._display_to_field_names.copy()
) )
def info_es(self, buf):
buf.write("'field_to_display_names': {}\n".format(self._field_to_display_names))
buf.write("'display_to_field_names': {}\n".format(self._display_to_field_names))
def elasticsearch_date_to_pandas_date(value: Union[int, str], date_format: str) -> pd.Timestamp: def elasticsearch_date_to_pandas_date(value: Union[int, str], date_format: str) -> pd.Timestamp:
""" """

View File

@ -215,7 +215,7 @@ class Series(NDFrame):
Returns Returns
------- -------
pandas.Series pandas.Series
number of occurences of each value in the column number of occurrences of each value in the column
See Also See Also
-------- --------

438
eland/tasks.py Normal file
View File

@ -0,0 +1,438 @@
from abc import ABC, abstractmethod
from enum import Enum
import numpy as np
from eland.actions import HeadAction, TailAction, SortIndexAction
class SortOrder(Enum):
    """Sort direction of an Elasticsearch query (ascending/descending)."""

    ASC = 0
    DESC = 1

    @staticmethod
    def reverse(order):
        """Return the opposite sort order."""
        return SortOrder.DESC if order == SortOrder.ASC else SortOrder.ASC

    @staticmethod
    def to_string(order):
        """Render the order in Elasticsearch string form ("asc"/"desc")."""
        return "asc" if order == SortOrder.ASC else "desc"

    @staticmethod
    def from_string(order):
        """Parse "asc" to ASC; any other string maps to DESC."""
        return SortOrder.ASC if order == "asc" else SortOrder.DESC
# -------------------------------------------------------------------------------------------------------------------- #
# Tasks #
# -------------------------------------------------------------------------------------------------------------------- #
class Task(ABC):
    """
    Base class for tasks in the Operations task list.

    A task either contributes to the server-side Elasticsearch query
    (via ``query_params``) or appends client-side post-processing actions.

    Parameters
    ----------
    task_type: str
        The task type (e.g. head, tail etc.)
    """

    def __init__(self, task_type):
        self._task_type = task_type

    @property
    def type(self):
        # The task type string supplied by the concrete subclass.
        return self._task_type

    @abstractmethod
    def resolve_task(self, query_params, post_processing):
        """Fold this task into (query_params, post_processing) and return both."""
        raise NotImplementedError

    @abstractmethod
    def __repr__(self):
        raise NotImplementedError
class SizeTask(Task):
    """Base class for tasks that limit the number of rows (e.g. head, tail)."""

    def __init__(self, task_type):
        super().__init__(task_type)

    @abstractmethod
    def size(self):
        """Return the row limit this task imposes."""
        raise NotImplementedError
class HeadTask(SizeTask):
    """
    Task returning the first ``count`` rows, ordered ascending on ``sort_field``.
    """

    def __init__(self, sort_field, count):
        # head == ascending sort with size=count
        super().__init__("head")
        self._sort_field = sort_field
        self._count = count

    def __repr__(self):
        return "('{}': ('sort_field': '{}', 'count': {}))".format(self._task_type, self._sort_field, self._count)

    def resolve_task(self, query_params, post_processing):
        # head - sort asc, size n
        # |12345-------------|

        # If we are already postprocessing the query results, we just get 'head' of these
        # (note, currently we just append another head, we don't optimise by
        # overwriting previous head)
        if post_processing:
            post_processing.append(HeadAction(self._count))
            return query_params, post_processing

        # Adopt this task's sort field/order only if the query isn't sorted
        # yet; if it is already sorted we take head of the existing order.
        if query_params['query_sort_field'] is None:
            query_params['query_sort_field'] = self._sort_field
        if query_params['query_sort_order'] is None:
            query_params['query_sort_order'] = SortOrder.ASC

        # Keep the smaller of the existing size and this head's count.
        if query_params['query_size'] is None:
            query_params['query_size'] = self._count
        else:
            query_params['query_size'] = min(self._count, query_params['query_size'])

        return query_params, post_processing

    def size(self):
        return self._count
class TailTask(SizeTask):
    """
    Task returning the last ``count`` rows: query sorted descending on
    ``sort_field`` with size=count, then re-sorted ascending client-side.
    """

    def __init__(self, sort_field, count):
        # Add a task that is descending sort with size=count
        super().__init__("tail")
        self._sort_field = sort_field
        self._count = count

    def __repr__(self):
        return "('{}': ('sort_field': '{}', 'count': {}))".format(self._task_type, self._sort_field, self._count)

    def resolve_task(self, query_params, post_processing):
        # tail - sort desc, size n, post-process sort asc
        # |-------------12345|
        query_sort_field = self._sort_field
        query_sort_order = SortOrder.DESC
        query_size = self._count

        # If this is a tail of a tail adjust settings and return.
        # BUG FIX: post_processing holds action objects after the refactor,
        # so the old comparison `post_processing == ['sort_index']` (a list
        # containing a string) could never be true and this optimisation was
        # dead code. Check for exactly one SortIndexAction instead.
        if query_params['query_size'] is not None and \
                query_params['query_sort_order'] == query_sort_order and \
                (len(post_processing) == 1 and isinstance(post_processing[0], SortIndexAction)):
            if query_size < query_params['query_size']:
                query_params['query_size'] = query_size
            return query_params, post_processing

        # If we are already postprocessing the query results, just get 'tail' of these
        # (note, currently we just append another tail, we don't optimise by
        # overwriting previous tail)
        if len(post_processing) > 0:
            post_processing.append(TailAction(self._count))
            return query_params, post_processing

        # If results are already constrained, just get 'tail' of these
        # (note, currently we just append another tail, we don't optimise by
        # overwriting previous tail)
        if query_params['query_size'] is not None:
            post_processing.append(TailAction(self._count))
            return query_params, post_processing
        else:
            query_params['query_size'] = query_size
        if query_params['query_sort_field'] is None:
            query_params['query_sort_field'] = query_sort_field
        if query_params['query_sort_order'] is None:
            query_params['query_sort_order'] = query_sort_order
        else:
            # reverse sort order
            query_params['query_sort_order'] = SortOrder.reverse(query_sort_order)

        # Results arrive in descending order; restore index order client-side.
        post_processing.append(SortIndexAction())

        return query_params, post_processing

    def size(self):
        return self._count
class QueryIdsTask(Task):
    """Filter the server-side query by document id.

    Parameters
    ----------
    must: bool
        Include or exclude these ids (must/must_not)
    ids: list
        ids for the filter
    """

    def __init__(self, must, ids):
        super().__init__("query_ids")
        self._must = must
        self._ids = ids

    def __repr__(self):
        return "('{}': ('must': {}, 'ids': {}))".format(self._task_type, self._must, self._ids)

    def resolve_task(self, query_params, post_processing):
        # Push the id filter straight into the server-side query.
        query_params['query'].ids(self._ids, must=self._must)
        return query_params, post_processing
class QueryTermsTask(Task):
    """Filter the server-side query by field values (terms).

    Parameters
    ----------
    must: bool
        Include or exclude these ids (must/must_not)
    field: str
        field_name to filter
    terms: list
        field_values for filter
    """

    def __init__(self, must, field, terms):
        super().__init__("query_terms")
        self._must = must
        self._field = field
        self._terms = terms

    def __repr__(self):
        return "('{}': ('must': {}, 'field': '{}', 'terms': {}))".format(self._task_type, self._must, self._field,
                                                                         self._terms)

    def resolve_task(self, query_params, post_processing):
        # Push the terms filter straight into the server-side query.
        query_params['query'].terms(self._field, self._terms, must=self._must)
        return query_params, post_processing
class BooleanFilterTask(Task):
    """Apply a pre-built boolean filter to the server-side query.

    Parameters
    ----------
    boolean_filter: BooleanFilter or str
        The filter to apply
    """

    def __init__(self, boolean_filter):
        super().__init__("boolean_filter")
        self._boolean_filter = boolean_filter

    def __repr__(self):
        return "('{}': ('boolean_filter': {}))".format(self._task_type, repr(self._boolean_filter))

    def resolve_task(self, query_params, post_processing):
        # Merge the stored filter into the query being built.
        query_params['query'].update_boolean_filter(self._boolean_filter)
        return query_params, post_processing
class ArithmeticOpFieldsTask(Task):
    """Create a computed field via an Elasticsearch painless script field.

    Each operand is either an existing field name (str) or a literal
    (numeric, or str for string concatenation). The resulting painless
    expression is added to ``query_params['query_script_fields']`` under
    ``field_name``, e.g. for ``op_name='__truediv__'``::

        "script_fields": {
            "field_name": {
                "script": {
                    "source": "doc['left_field'].value / doc['right_field'].value"
                }
            }
        }

    See
    https://www.elastic.co/guide/en/elasticsearch/painless/current/painless-api-reference-shared-java-lang.html#painless-api-reference-shared-Math
    """

    # Numeric dunder ops that map to a plain painless infix operator.
    # '__floordiv__' and '__pow__' need Math.* calls and are handled separately.
    _INFIX_OPS = {
        '__add__': '+',
        '__truediv__': '/',
        '__mod__': '%',
        '__mul__': '*',
        '__sub__': '-',
    }

    def __init__(self, field_name, op_name, left_field, right_field, op_type):
        """
        Parameters
        ----------
        field_name: str
            Name of the script field to create
        op_name: str
            Python dunder name of the operation (e.g. '__add__')
        left_field: str or number
            Left operand - field name or literal
        right_field: str or number
            Right operand - field name or literal
        op_type: tuple or None
            None for numeric ops; ("string", kind) for string concatenation
            where kind is 's' (field+field), 'r' (field+literal) or
            'l' (literal+field)
        """
        super().__init__("arithmetic_op_fields")
        self._field_name = field_name
        self._op_name = op_name
        self._left_field = left_field
        self._right_field = right_field
        self._op_type = op_type

    def __repr__(self):
        return "('{}': (" \
               "'field_name': {}, " \
               "'op_name': {}, " \
               "'left_field': {}, " \
               "'right_field': {}, " \
               "'op_type': {}" \
               "))" \
            .format(self._task_type, self._field_name, self._op_name, self._left_field, self._right_field,
                    self._op_type)

    @classmethod
    def _numeric_source(cls, op_name, left_expr, right_expr):
        """Combine two painless operand expressions with the numeric op.

        Raises NotImplementedError for unsupported operations.
        """
        if op_name in cls._INFIX_OPS:
            return "{0} {1} {2}".format(left_expr, cls._INFIX_OPS[op_name], right_expr)
        if op_name == '__floordiv__':
            return "Math.floor({0} / {1})".format(left_expr, right_expr)
        if op_name == '__pow__':
            return "Math.pow({0}, {1})".format(left_expr, right_expr)
        raise NotImplementedError("Not implemented operation '{0}'".format(op_name))

    def _numeric_expression(self):
        """Build the painless source for a numeric (op_type is falsy) op."""
        left, right = self._left_field, self._right_field
        if isinstance(left, str) and isinstance(right, str):
            # field <op> field, e.g. "doc['a'].value / doc['b'].value"
            return self._numeric_source(self._op_name,
                                        "doc['{0}'].value".format(left),
                                        "doc['{0}'].value".format(right))
        if isinstance(left, str) and np.issubdtype(np.dtype(type(right)), np.number):
            # field <op> literal, e.g. "doc['a'].value / 2"
            return self._numeric_source(self._op_name,
                                        "doc['{0}'].value".format(left),
                                        "{0}".format(right))
        if np.issubdtype(np.dtype(type(left)), np.number) and isinstance(right, str):
            # literal <op> field, e.g. "2 / doc['a'].value"
            return self._numeric_source(self._op_name,
                                        "{0}".format(left),
                                        "doc['{0}'].value".format(right))
        # Bug fix: the original passed format args to TypeError without
        # calling .format(), so the message was never interpolated.
        raise TypeError("Types for operation inconsistent {0} {1} {2}".format(
            type(left), type(right), self._op_name))

    def _string_expression(self):
        """Build the painless source for a string concatenation op.

        Only '__add__' is supported for strings.
        """
        if self._op_name != '__add__':
            raise NotImplementedError("Not implemented operation '{0}'".format(self._op_name))

        kind = self._op_type[1]
        left, right = self._left_field, self._right_field
        if kind == 's':
            # field + field
            return "doc['{0}'].value + doc['{1}'].value".format(left, right)
        if kind == 'r' and isinstance(left, str) and isinstance(right, str):
            # field + literal
            # TODO encode special characters in the literal better
            return "doc['{0}'].value + '{1}'".format(left, right)
        if kind == 'l' and isinstance(left, str) and isinstance(right, str):
            # literal + field
            return "'{0}' + doc['{1}'].value".format(left, right)
        # Bug fix: the original fell through here leaving 'source' unbound,
        # crashing later with NameError; raise an explicit error instead.
        raise NotImplementedError(
            "Not implemented operation '{0}' for op_type {1}".format(self._op_name, self._op_type))

    def resolve_task(self, query_params, post_processing):
        if not self._op_type:
            source = self._numeric_expression()
        elif self._op_type[0] == "string":
            source = self._string_expression()
        else:
            # Bug fix: the original reached the script_fields assignment with
            # 'source' unbound for unknown op_types (NameError).
            raise NotImplementedError("Not implemented op_type '{0}'".format(self._op_type))

        if query_params['query_script_fields'] is None:
            query_params['query_script_fields'] = {}
        query_params['query_script_fields'][self._field_name] = {
            'script': {
                'source': source
            }
        }
        return query_params, post_processing

View File

@ -27,3 +27,23 @@ class TestDataFrameAggs(TestData):
print(ed_sum_min_std.dtypes) print(ed_sum_min_std.dtypes)
assert_almost_equal(pd_sum_min_std, ed_sum_min_std, check_less_precise=True) assert_almost_equal(pd_sum_min_std, ed_sum_min_std, check_less_precise=True)
def test_terms_aggs(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(['sum', 'min'])
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(['sum', 'min'])
# Eland returns all float values for all metric aggs, pandas can return int
# TODO - investigate this more
pd_sum_min = pd_sum_min.astype('float64')
assert_almost_equal(pd_sum_min, ed_sum_min)
pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg(['sum', 'min', 'std'])
ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(['sum', 'min', 'std'])
print(pd_sum_min_std.dtypes)
print(ed_sum_min_std.dtypes)
assert_almost_equal(pd_sum_min_std, ed_sum_min_std, check_less_precise=True)

View File

@ -85,3 +85,10 @@ class TestDataFrameHeadTail(TestData):
ed_head_0 = ed_flights.head(0) ed_head_0 = ed_flights.head(0)
pd_head_0 = pd_flights.head(0) pd_head_0 = pd_flights.head(0)
assert_pandas_eland_frame_equal(pd_head_0, ed_head_0) assert_pandas_eland_frame_equal(pd_head_0, ed_head_0)
def test_doc_test_tail(self):
df = self.ed_flights()
df = df[(df.OriginAirportID == 'AMS') & (df.FlightDelayMin > 60)]
df = df[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']]
df = df.tail()
print(df)

View File

@ -20,8 +20,7 @@ class TestSeriesArithmetics(TestData):
with pytest.raises(TypeError): with pytest.raises(TypeError):
assert self.ed_ecommerce()['total_quantity'] + self.ed_ecommerce()['currency'] assert self.ed_ecommerce()['total_quantity'] + self.ed_ecommerce()['currency']
def test_str_add_ser(self): def test_ser_add_ser(self):
edadd = self.ed_ecommerce()['customer_first_name'] + self.ed_ecommerce()['customer_last_name'] edadd = self.ed_ecommerce()['customer_first_name'] + self.ed_ecommerce()['customer_last_name']
pdadd = self.pd_ecommerce()['customer_first_name'] + self.pd_ecommerce()['customer_last_name'] pdadd = self.pd_ecommerce()['customer_first_name'] + self.pd_ecommerce()['customer_last_name']
@ -33,12 +32,31 @@ class TestSeriesArithmetics(TestData):
assert_pandas_eland_series_equal(pdadd, edadd) assert_pandas_eland_series_equal(pdadd, edadd)
def test_ser_add_ser(self): def test_str_add_ser(self):
edadd = "The last name is: " + self.ed_ecommerce()['customer_last_name'] edadd = "The last name is: " + self.ed_ecommerce()['customer_last_name']
pdadd = "The last name is: " + self.pd_ecommerce()['customer_last_name'] pdadd = "The last name is: " + self.pd_ecommerce()['customer_last_name']
assert_pandas_eland_series_equal(pdadd, edadd) assert_pandas_eland_series_equal(pdadd, edadd)
def test_bad_str_add_ser(self):
# TODO encode special characters better
# Elasticsearch accepts this, but it will cause problems
edadd = " *" + self.ed_ecommerce()['customer_last_name']
pdadd = " *" + self.pd_ecommerce()['customer_last_name']
assert_pandas_eland_series_equal(pdadd, edadd)
def test_ser_add_str_add_ser(self):
pdadd = self.pd_ecommerce()['customer_first_name'] + self.pd_ecommerce()['customer_last_name']
print(pdadd.name)
edadd = self.ed_ecommerce()['customer_first_name'] + self.ed_ecommerce()['customer_last_name']
print(edadd.name)
print(edadd.info_es())
assert_pandas_eland_series_equal(pdadd, edadd)
def test_non_aggregatable_add_str(self): def test_non_aggregatable_add_str(self):
with pytest.raises(ValueError): with pytest.raises(ValueError):
assert self.ed_ecommerce()['customer_gender'] + "is the gender" assert self.ed_ecommerce()['customer_gender'] + "is the gender"

View File

@ -47,6 +47,14 @@ class TestSeriesValueCounts(TestData):
assert ed_s.value_counts(es_size=-9) assert ed_s.value_counts(es_size=-9)
def test_value_counts_non_aggregatable(self): def test_value_counts_non_aggregatable(self):
ed_s = self.ed_ecommerce()['customer_first_name']
pd_s = self.pd_ecommerce()['customer_first_name']
pd_vc = pd_s.value_counts().head(20).sort_index()
ed_vc = ed_s.value_counts(es_size=20).sort_index()
assert_series_equal(pd_vc, ed_vc)
ed_s = self.ed_ecommerce()['customer_gender'] ed_s = self.ed_ecommerce()['customer_gender']
with pytest.raises(ValueError): with pytest.raises(ValueError):
assert ed_s.value_counts() assert ed_s.value_counts()