Adding smaller test and first effort to implement aggs

This commit is contained in:
Stephen Dodson 2019-08-06 14:58:38 +00:00
parent 67b7aee9c9
commit c6e0c5b92b
18 changed files with 1150 additions and 175 deletions

View File

@@ -469,4 +469,53 @@ class DataFrame(NDFrame):
    def keys(self):
        return self.columns

    def to_csv(
        self,
        path_or_buf=None,
        sep=",",
        na_rep="",
        float_format=None,
        columns=None,
        header=True,
        index=True,
        index_label=None,
        mode="w",
        encoding=None,
        compression="infer",
        quoting=None,
        quotechar='"',
        line_terminator=None,
        chunksize=None,
        tupleize_cols=None,
        date_format=None,
        doublequote=True,
        escapechar=None,
        decimal=".",
        *args,
        **kwargs
    ):
        kwargs = {
            "path_or_buf": path_or_buf,
            "sep": sep,
            "na_rep": na_rep,
            "float_format": float_format,
            "columns": columns,
            "header": header,
            "index": index,
            "index_label": index_label,
            "mode": mode,
            "encoding": encoding,
            "compression": compression,
            "quoting": quoting,
            "quotechar": quotechar,
            "line_terminator": line_terminator,
            "chunksize": chunksize,
            "tupleize_cols": tupleize_cols,
            "date_format": date_format,
            "doublequote": doublequote,
            "escapechar": escapechar,
            "decimal": decimal,
        }

    hist = gfx.ed_hist_frame
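For orientation, a minimal usage sketch of the new method (the delegation from the assembled kwargs to the query compiler is not shown in this hunk; the host, index and file names below are assumptions):

import eland as ed

ed_df = ed.read_es('localhost', 'flights')
# Forwards the pandas-style keyword arguments collected above
ed_df.to_csv('flights.csv', index=False)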

View File

@@ -27,7 +27,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+-------------------------+-------+------------------------------------------------+
| df.head                 |   783 | y                                              |
+-------------------------+-------+------------------------------------------------+
| df.drop                 |   761 | y                                              |
+-------------------------+-------+------------------------------------------------+
| df.sum                  |   755 | y                                              |
+-------------------------+-------+------------------------------------------------+

View File

@@ -70,6 +70,9 @@ class Operations:
    def set_columns(self, columns):
        # Setting columns at different phases of the task list may result in different
        # operations. So instead of setting columns once, set when it happens in call chain
        if not isinstance(columns, list):
            columns = list(columns)

        # TODO - column renaming
        # TODO - validate we are setting columns to a subset of last columns?

        task = ('columns', columns)
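A small illustration of why the list coercion above matters: callers can pass a pandas.Index rather than a plain list, and the task payload is normalised before being queued (a sketch, not part of the commit):

import pandas as pd

columns = pd.Index(['Carrier', 'Dest'])
if not isinstance(columns, list):
    columns = list(columns)  # pandas.Index -> plain list for the ('columns', ...) task
assert columns == ['Carrier', 'Dest']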
@@ -483,6 +486,7 @@ class Operations:
                df = df.iloc[index_indexer, column_indexer]
            elif action[0] == 'squeeze':
                df = df.squeeze(axis=action[1])
            # columns could be in here (and we ignore it)

        return df

View File

@@ -11,6 +11,36 @@ from pandas.core.indexes.range import RangeIndex
class ElandQueryCompiler(BaseQueryCompiler):
    """
    Some notes on what can and can not be mapped:

    1. df.head(10)

       /_search?size=10

    2. df.tail(10)

       /_search?size=10&sort=_doc:desc
       + post_process results (sort_index)

    3. df[['OriginAirportID', 'AvgTicketPrice', 'Carrier']]

       /_search
       { '_source': ['OriginAirportID', 'AvgTicketPrice', 'Carrier']}

    4. df.drop(['1', '2'])

       /_search
       {'query': {'bool': {'must': [], 'must_not': [{'ids': {'values': ['1', '2']}}]}}, 'aggs': {}}

       This doesn't work if size is set (e.g. head/tail) as we don't know in Elasticsearch if values '1' or '2' are
       in the first/last n rows.

       A way to mitigate this would be to post process this drop - TODO
    """

    def __init__(self,
                 client=None,
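The drop mapping in note 4 can be exercised directly against Elasticsearch; a sketch using elasticsearch-py (the index name 'flights' is an assumption):

from elasticsearch import Elasticsearch

es = Elasticsearch()  # defaults to localhost:9200
body = {
    'query': {
        'bool': {
            'must': [],
            'must_not': [{'ids': {'values': ['1', '2']}}]
        }
    }
}
# Matches every document except those with _id '1' or '2'
results = es.search(index='flights', body=body)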
@@ -155,45 +185,6 @@ class ElandQueryCompiler(BaseQueryCompiler):
        if results is None:
            return self._empty_pd_ef()

        def flatten_dict(y):
            out = {}

            def flatten(x, name=''):
                # We flatten into source fields e.g. if type=geo_point
                # location: {lat=52.38, lon=4.90}
                if name == '':
                    is_source_field = False
                    pd_dtype = 'object'
                else:
                    is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(name[:-1])

                if not is_source_field and type(x) is dict:
                    for a in x:
                        flatten(x[a], name + a + '.')
                elif not is_source_field and type(x) is list:
                    for a in x:
                        flatten(a, name)
                elif is_source_field:  # only print source fields from mappings
                    # (TODO - not so efficient for large number of fields and filtered mapping)
                    field_name = name[:-1]

                    # Coerce types - for now just datetime
                    if pd_dtype == 'datetime64[ns]':
                        x = pd.to_datetime(x)

                    # Elasticsearch can have multiple values for a field. These are represented as lists, so
                    # create lists for this pivot (see notes above)
                    if field_name in out:
                        if type(out[field_name]) is not list:
                            out[field_name] = [out[field_name]]
                        out[field_name].append(x)
                    else:
                        out[field_name] = x

            flatten(y)

            return out

        rows = []
        index = []
        if isinstance(results, dict):
@@ -212,7 +203,7 @@ class ElandQueryCompiler(BaseQueryCompiler):
            index.append(index_field)

            # flatten row to map correctly to 2D DataFrame
            rows.append(self._flatten_dict(row))

        # Create pandas DataFrame
        df = pd.DataFrame(data=rows, index=index)
@@ -232,6 +223,100 @@ class ElandQueryCompiler(BaseQueryCompiler):
        return df
    def _to_csv(self, results, **kwargs):
        # Very similar to _es_results_to_pandas except we create partial pandas.DataFrame
        # and write these to csv
        # Use chunksize in kwargs to determine size of partial data frame
        if 'chunksize' in kwargs:
            chunksize = kwargs['chunksize']
        else:
            # If no chunksize is set, default to 1000
            chunksize = 1000

        if results is None:
            return self._empty_pd_ef()

        rows = []
        index = []
        if isinstance(results, dict):
            iterator = results['hits']['hits']
        else:
            iterator = results

        i = 0
        for hit in iterator:
            row = hit['_source']

            # get index value - can be _id or can be field value in source
            if self._index.is_source_field:
                index_field = row[self._index.index_field]
            else:
                index_field = hit[self._index.index_field]
            index.append(index_field)

            # flatten row to map correctly to 2D DataFrame
            rows.append(self._flatten_dict(row))

            i = i + 1
            if i % chunksize == 0:
                # Create pandas DataFrame for this chunk
                df = pd.DataFrame(data=rows, index=index)

                # _source may not contain all columns in the mapping
                # therefore, fill in missing columns
                # (note this returns self.columns NOT IN df.columns)
                missing_columns = list(set(self.columns) - set(df.columns))
                for missing in missing_columns:
                    is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(missing)
                    df[missing] = None
                    # astype returns a new object, so assign it back
                    df[missing] = df[missing].astype(pd_dtype)

                # Sort columns in mapping order
                df = df[self.columns]

        return df
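The hunk assembles a partial frame per chunk but the actual CSV write is not shown; a sketch of how each chunk could be flushed (the helper name and header handling are assumptions, not part of this commit):

def _write_csv_chunk(df, path_or_buf, first_chunk):
    # Overwrite on the first chunk, append afterwards; only the first
    # chunk writes the header row
    df.to_csv(path_or_buf, mode='w' if first_chunk else 'a', header=first_chunk)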
    def _flatten_dict(self, y):
        out = {}

        def flatten(x, name=''):
            # We flatten into source fields e.g. if type=geo_point
            # location: {lat=52.38, lon=4.90}
            if name == '':
                is_source_field = False
                pd_dtype = 'object'
            else:
                is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(name[:-1])

            if not is_source_field and type(x) is dict:
                for a in x:
                    flatten(x[a], name + a + '.')
            elif not is_source_field and type(x) is list:
                for a in x:
                    flatten(a, name)
            elif is_source_field:  # only print source fields from mappings
                # (TODO - not so efficient for large number of fields and filtered mapping)
                field_name = name[:-1]

                # Coerce types - for now just datetime
                if pd_dtype == 'datetime64[ns]':
                    x = pd.to_datetime(x)

                # Elasticsearch can have multiple values for a field. These are represented as lists, so
                # create lists for this pivot (see notes above)
                if field_name in out:
                    if type(out[field_name]) is not list:
                        out[field_name] = [out[field_name]]
                    out[field_name].append(x)
                else:
                    out[field_name] = x

        flatten(y)

        return out
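A worked example of the flattening behaviour (illustrative; which names count as source fields depends on the index mapping - here 'OriginLocation' is assumed to be a geo_point source field and 'user.first'/'user.last' mapped leaf fields):

doc = {
    'Carrier': 'Kibana Airlines',
    'OriginLocation': {'lat': 52.38, 'lon': 4.90},
    'user': {'first': 'A', 'last': 'B'},
}
# _flatten_dict(doc) would yield something like:
# {'Carrier': 'Kibana Airlines',
#  'OriginLocation': {'lat': 52.38, 'lon': 4.90},  # source field kept intact
#  'user.first': 'A',                              # nested object flattened
#  'user.last': 'B'}                               # to dotted field names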
    def _index_count(self):
        """
        Returns

View File

@@ -99,6 +99,11 @@ FLIGHTS_MAPPING = { "mappings" : {
FLIGHTS_FILE_NAME = ROOT_DIR + '/flights.json.gz'
FLIGHTS_DF_FILE_NAME = ROOT_DIR + '/flights_df.json.gz'
FLIGHTS_SMALL_INDEX_NAME = 'flights_small'
FLIGHTS_SMALL_MAPPING = FLIGHTS_MAPPING
FLIGHTS_SMALL_FILE_NAME = ROOT_DIR + '/flights_small.json.gz'
FLIGHTS_SMALL_DF_FILE_NAME = ROOT_DIR + '/flights_small_df.json.gz'
ECOMMERCE_INDEX_NAME = 'ecommerce'
ECOMMERCE_MAPPING = { "mappings" : {
  "properties" : {

View File

@@ -12,6 +12,7 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
# Create pandas and eland data frames
from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_DF_FILE_NAME, FLIGHTS_INDEX_NAME,\
    FLIGHTS_SMALL_INDEX_NAME,\
    ECOMMERCE_DF_FILE_NAME, ECOMMERCE_INDEX_NAME

_pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index()
@@ -20,6 +21,9 @@ _pd_flights['timestamp'] = \
_pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int

_ed_flights = ed.read_es(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME)

_pd_flights_small = _pd_flights.head(48)
_ed_flights_small = ed.read_es(ELASTICSEARCH_HOST, FLIGHTS_SMALL_INDEX_NAME)

_pd_ecommerce = pd.read_json(ECOMMERCE_DF_FILE_NAME).sort_index()
_pd_ecommerce['order_date'] = \
    pd.to_datetime(_pd_ecommerce['order_date'])
@@ -38,6 +42,13 @@ class TestData:
    def ed_flights(self):
        return _ed_flights

    def pd_flights_small(self):
        return _pd_flights_small

    def ed_flights_small(self):
        return _ed_flights_small

    def pd_ecommerce(self):
        return _pd_ecommerce

View File

@@ -31,14 +31,5 @@ class TestDataFrameDescribe(TestData):
    # don't match the mapping types. This is mainly because the products field is
    # nested and so can be treated as a multi-field in ES, but not in pandas

    # We also can not run 'describe' on a truncated ed dataframe
    def test_to_describe2(self):
        pd_flights = self.pd_flights().head()
        ed_flights = self.ed_flights().head()

        pd_describe = pd_flights.describe()
        # This fails as we can not run 'describe' on a truncated ed dataframe
        ed_describe = ed_flights.describe()

        print(pd_describe)
        print(ed_describe)

View File

@@ -14,8 +14,8 @@ import numpy as np
class TestDataFrameDrop(TestData):

    def test_drop1(self):
        ed_flights_small = self.ed_flights_small()
        pd_flights_small = self.pd_flights_small()

        # ['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID',
        #  'DestCityName', 'DestCountry', 'DestLocation', 'DestRegion',
@@ -24,33 +24,17 @@ class TestDataFrameDrop(TestData):
        #  'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
        #  'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather',
        #  'dayOfWeek', 'timestamp']
        pd_col0 = pd_flights_small.drop(['Carrier', 'DestCityName'], axis=1)
        pd_col1 = pd_flights_small.drop(columns=['Carrier', 'DestCityName'])

        ed_col0 = ed_flights_small.drop(['Carrier', 'DestCityName'], axis=1)
        ed_col1 = ed_flights_small.drop(columns=['Carrier', 'DestCityName'])

        assert_pandas_eland_frame_equal(pd_col0, ed_col0)
        assert_pandas_eland_frame_equal(pd_col1, ed_col1)

        # Drop rows by index
        pd_idx0 = pd_flights_small.drop(['1', '2'])
        ed_idx0 = ed_flights_small.drop(['1', '2'])

        print(pd_idx0.info())
        print(ed_idx0.info())

        assert_pandas_eland_frame_equal(pd_idx0, ed_idx0)

        """
        #assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series
        assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1)
        assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2)
        assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3)
        assert_pandas_eland_frame_equal(pd_iloc4, ed_iloc4)
        #assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool
        assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6)
        assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7)
        assert_pandas_eland_frame_equal(pd_iloc8, ed_iloc8)
        assert_pandas_eland_frame_equal(pd_iloc9, ed_iloc9)
        """

View File

@@ -30,3 +30,15 @@ class TestDataFrameHist(TestData):
        # Numbers are slightly different
        assert_almost_equal(pd_bins, ed_bins)
        assert_almost_equal(pd_weights, ed_weights)

    def test_hist2(self):
        pd_df = self.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
        ed_df = self.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]

        num_bins = 10

        ed_bins, ed_weights = ed_df._hist(num_bins=num_bins)

        print(ed_bins)
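_hist is backed by Elasticsearch aggregations; a sketch of the kind of request presumably issued per column (the field name and interval below are illustrative):

body = {
    'size': 0,
    'aggs': {
        'hist_DistanceKilometers': {
            'histogram': {'field': 'DistanceKilometers', 'interval': 1000.0}
        }
    }
}
# Bucket keys become bin edges; each bucket's doc_count becomes a weight.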

View File

@@ -1,8 +1,11 @@
# File called _pytest for PyCharm compatibility
import gzip

import pandas as pd

import eland as ed

from eland.tests.common import TestData


class TestDataFrameReviews(TestData):
@@ -13,5 +16,16 @@ class TestDataFrameReviews(TestData):
        print(ed_reviews.head())
        print(ed_reviews.describe())
        print(ed_reviews.info())
        print(ed_reviews.hist(column="rating", bins=5))
        # print(ed_reviews.head().info_es())

    def test_review(self):
        csv_handle = gzip.open('../anonreviews.csv.gz')

        reviews = pd.read_csv(csv_handle)

        reviews['date'] = pd.to_datetime(reviews['date'])

        g = reviews.groupby('reviewerId')

        print(g.describe())
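The groupby('reviewerId').describe() above is the pandas behaviour the aggs effort targets; in Elasticsearch this is roughly a terms aggregation with nested stats (a sketch; field names follow the reviews CSV):

body = {
    'size': 0,
    'aggs': {
        'by_reviewer': {
            'terms': {'field': 'reviewerId'},
            'aggs': {
                'rating_stats': {'extended_stats': {'field': 'rating'}}
            }
        }
    }
}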

View File

@@ -0,0 +1,14 @@
# File called _pytest for PyCharm compatibility
import numpy as np
import pandas as pd
import eland as ed
from eland.tests.common import ELASTICSEARCH_HOST
from eland.tests.common import TestData
class TestDataFrameToCSV(TestData):

    def test_to_csv(self):
        print("TODO")

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -5,7 +5,7 @@ from eland.tests.common import TestData
from matplotlib.testing.decorators import check_figures_equal


@check_figures_equal(extensions=['png'])
def test_plot_hist(fig_test, fig_ref):
    test_data = TestData()

    pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
@@ -16,4 +16,3 @@ def test_plot(fig_test, fig_ref):
    ed_ax = fig_test.subplots()
    ed_flights.hist(ax=ed_ax)

View File

@@ -6,6 +6,7 @@ from eland.tests import *
DATA_LIST = [
    (FLIGHTS_FILE_NAME, FLIGHTS_INDEX_NAME, FLIGHTS_MAPPING),
    (FLIGHTS_SMALL_FILE_NAME, FLIGHTS_SMALL_INDEX_NAME, FLIGHTS_MAPPING),
    (ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING)
]
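For reference, a sketch of how DATA_LIST is presumably consumed by this setup script (the loader helper below is an assumption, not part of the diff):

import gzip
import json

from elasticsearch import Elasticsearch, helpers

def _load(es, file_name, index_name, mapping):
    # Recreate the index with its mapping, then bulk-index the gzipped JSON docs
    es.indices.delete(index=index_name, ignore=[404])
    es.indices.create(index=index_name, body=mapping)
    with gzip.open(file_name) as f:
        actions = [{'_index': index_name, '_source': json.loads(line)} for line in f]
    helpers.bulk(es, actions)

es = Elasticsearch(ELASTICSEARCH_HOST)
for file_name, index_name, mapping in DATA_LIST:
    _load(es, file_name, index_name, mapping)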