mirror of https://github.com/elastic/eland.git

commit c6e0c5b92b (parent 67b7aee9c9)
Adding smaller test and first effort to implement aggs
@@ -469,4 +469,53 @@ class DataFrame(NDFrame):
     def keys(self):
         return self.columns
 
+    def to_csv(
+            self,
+            path_or_buf=None,
+            sep=",",
+            na_rep="",
+            float_format=None,
+            columns=None,
+            header=True,
+            index=True,
+            index_label=None,
+            mode="w",
+            encoding=None,
+            compression="infer",
+            quoting=None,
+            quotechar='"',
+            line_terminator=None,
+            chunksize=None,
+            tupleize_cols=None,
+            date_format=None,
+            doublequote=True,
+            escapechar=None,
+            decimal=".",
+            *args,
+            **kwargs
+    ):
+        kwargs = {
+            "path_or_buf": path_or_buf,
+            "sep": sep,
+            "na_rep": na_rep,
+            "float_format": float_format,
+            "columns": columns,
+            "header": header,
+            "index": index,
+            "index_label": index_label,
+            "mode": mode,
+            "encoding": encoding,
+            "compression": compression,
+            "quoting": quoting,
+            "quotechar": quotechar,
+            "line_terminator": line_terminator,
+            "chunksize": chunksize,
+            "tupleize_cols": tupleize_cols,
+            "date_format": date_format,
+            "doublequote": doublequote,
+            "escapechar": escapechar,
+            "decimal": decimal,
+        }
 
     hist = gfx.ed_hist_frame
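Note that the hunk ends after collecting the keyword arguments into `kwargs`; the diff does not show them being consumed. A minimal sketch of how they might be forwarded, assuming the query compiler grows a matching `to_csv` entry point (that call is hypothetical, not part of this commit):

    # Hypothetical continuation of DataFrame.to_csv - not part of this commit.
    # Assumes the query compiler exposes to_csv(**kwargs), which scans the
    # index in chunks and appends partial frames to the file (see _to_csv below).
    def to_csv_sketch(self, **kwargs):
        return self._query_compiler.to_csv(**kwargs)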
@@ -27,7 +27,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
 +-------------------------+-------+------------------------------------------------+
 | df.head                 | 783   | y                                              |
 +-------------------------+-------+------------------------------------------------+
-| df.drop                 | 761   |                                                |
+| df.drop                 | 761   | y                                              |
 +-------------------------+-------+------------------------------------------------+
 | df.sum                  | 755   | y                                              |
 +-------------------------+-------+------------------------------------------------+
@@ -70,6 +70,9 @@ class Operations:
     def set_columns(self, columns):
         # Setting columns at different phases of the task list may result in different
        # operations. So instead of setting columns once, set them when it happens in the call chain
+        if type(columns) is not list:
+            columns = list(columns)
+
         # TODO - column renaming
         # TODO - validate we are setting columns to a subset of last columns?
         task = ('columns', columns)
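The `('columns', columns)` tuple is appended to a task list and replayed later rather than applied immediately. A standalone sketch of this deferred-operation pattern (the names are illustrative, not eland's actual internals):

    # Illustrative sketch of the deferred task-list pattern.
    tasks = []

    def set_columns(columns):
        # Record the operation instead of applying it now.
        tasks.append(('columns', list(columns)))

    def apply_tasks(df):
        # Replay recorded operations in call-chain order.
        for action, payload in tasks:
            if action == 'columns':
                df = df[payload]
            elif action == 'squeeze':
                df = df.squeeze(axis=payload)
        return df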
@@ -483,6 +486,7 @@ class Operations:
             df = df.iloc[index_indexer, column_indexer]
         elif action[0] == 'squeeze':
             df = df.squeeze(axis=action[1])
+        # columns could be in here (and we ignore it)
 
         return df
@@ -11,6 +11,36 @@ from pandas.core.indexes.range import RangeIndex
 
 
 class ElandQueryCompiler(BaseQueryCompiler):
+    """
+    Some notes on what can and can not be mapped:
+
+    1. df.head(10)
+
+        /_search?size=10
+
+    2. df.tail(10)
+
+        /_search?size=10&sort=_doc:desc
+        + post_process results (sort_index)
+
+    3. df[['OriginAirportID', 'AvgTicketPrice', 'Carrier']]
+
+        /_search
+        { '_source': ['OriginAirportID', 'AvgTicketPrice', 'Carrier']}
+
+    4. df.drop(['1', '2'])
+
+        /_search
+        {'query': {'bool': {'must': [], 'must_not': [{'ids': {'values': ['1', '2']}}]}}, 'aggs': {}}
+
+        This doesn't work if size is set (e.g. head/tail) as we don't know in Elasticsearch if values '1' or '2' are
+        in the first/last n rows.
+
+        A way to mitigate this would be to post process this drop - TODO
+    """
 
     def __init__(self,
                  client=None,
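The docstring's mappings can be exercised directly against Elasticsearch. A sketch using the elasticsearch-py client (the host and index name are assumptions for illustration):

    # Sketch of the docstring's query mappings via elasticsearch-py.
    from elasticsearch import Elasticsearch

    es = Elasticsearch('localhost:9200')  # host is an assumption

    # 1. df.head(10)  ->  /_search?size=10
    head = es.search(index='flights', body={'size': 10})

    # 2. df.tail(10)  ->  /_search?size=10&sort=_doc:desc, then sort_index()
    tail = es.search(index='flights',
                     body={'size': 10, 'sort': [{'_doc': 'desc'}]})

    # 3. column projection via _source filtering
    cols = es.search(index='flights',
                     body={'_source': ['OriginAirportID', 'AvgTicketPrice', 'Carrier']})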
@@ -155,45 +185,6 @@ class ElandQueryCompiler(BaseQueryCompiler):
         if results is None:
             return self._empty_pd_ef()
 
-        def flatten_dict(y):
-            out = {}
-
-            def flatten(x, name=''):
-                # We flatten into source fields e.g. if type=geo_point
-                # location: {lat=52.38, lon=4.90}
-                if name == '':
-                    is_source_field = False
-                    pd_dtype = 'object'
-                else:
-                    is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(name[:-1])
-
-                if not is_source_field and type(x) is dict:
-                    for a in x:
-                        flatten(x[a], name + a + '.')
-                elif not is_source_field and type(x) is list:
-                    for a in x:
-                        flatten(a, name)
-                elif is_source_field == True:  # only print source fields from mappings (TODO - not so efficient for large number of fields and filtered mapping)
-                    field_name = name[:-1]
-
-                    # Coerce types - for now just datetime
-                    if pd_dtype == 'datetime64[ns]':
-                        x = pd.to_datetime(x)
-
-                    # Elasticsearch can have multiple values for a field. These are represented as lists, so
-                    # create lists for this pivot (see notes above)
-                    if field_name in out:
-                        if type(out[field_name]) is not list:
-                            l = [out[field_name]]
-                            out[field_name] = l
-                        out[field_name].append(x)
-                    else:
-                        out[field_name] = x
-
-            flatten(y)
-
-            return out
-
         rows = []
         index = []
         if isinstance(results, dict):
@@ -212,7 +203,7 @@ class ElandQueryCompiler(BaseQueryCompiler):
             index.append(index_field)
 
             # flatten row to map correctly to 2D DataFrame
-            rows.append(flatten_dict(row))
+            rows.append(self._flatten_dict(row))
 
         # Create pandas DataFrame
         df = pd.DataFrame(data=rows, index=index)
@@ -232,6 +223,100 @@ class ElandQueryCompiler(BaseQueryCompiler):
 
         return df
 
+    def _to_csv(self, results, **kwargs):
+        # Very similar to _es_results_to_pandas except we create partial pandas.DataFrames
+        # and write these to csv
+
+        # Use chunksize in kwargs to determine the size of each partial data frame
+        if 'chunksize' in kwargs:
+            chunksize = kwargs['chunksize']
+        else:
+            # If no chunksize is given, default to 1000
+            chunksize = 1000
+
+        if results is None:
+            return self._empty_pd_ef()
+
+        rows = []
+        index = []
+        if isinstance(results, dict):
+            iterator = results['hits']['hits']
+        else:
+            iterator = results
+
+        i = 0
+        for hit in iterator:
+            row = hit['_source']
+
+            # get index value - can be _id or can be field value in source
+            if self._index.is_source_field:
+                index_field = row[self._index.index_field]
+            else:
+                index_field = hit[self._index.index_field]
+            index.append(index_field)
+
+            # flatten row to map correctly to 2D DataFrame
+            rows.append(self._flatten_dict(row))
+
+            i = i + 1
+            if i % chunksize == 0:
+                # Create partial pandas DataFrame
+                df = pd.DataFrame(data=rows, index=index)
+
+                # _source may not contain all columns in the mapping
+                # therefore, fill in missing columns
+                # (note this returns self.columns NOT IN df.columns)
+                missing_columns = list(set(self.columns) - set(df.columns))
+
+                for missing in missing_columns:
+                    is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(missing)
+                    df[missing] = None
+                    df[missing] = df[missing].astype(pd_dtype)  # astype returns a copy; reassign
+
+                # Sort columns in mapping order
+                df = df[self.columns]
+
+        return df
+
+    def _flatten_dict(self, y):
+        out = {}
+
+        def flatten(x, name=''):
+            # We flatten into source fields e.g. if type=geo_point
+            # location: {lat=52.38, lon=4.90}
+            if name == '':
+                is_source_field = False
+                pd_dtype = 'object'
+            else:
+                is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(name[:-1])
+
+            if not is_source_field and type(x) is dict:
+                for a in x:
+                    flatten(x[a], name + a + '.')
+            elif not is_source_field and type(x) is list:
+                for a in x:
+                    flatten(a, name)
+            elif is_source_field:  # only map source fields from mappings (TODO - not so efficient for large number of fields and filtered mapping)
+                field_name = name[:-1]
+
+                # Coerce types - for now just datetime
+                if pd_dtype == 'datetime64[ns]':
+                    x = pd.to_datetime(x)
+
+                # Elasticsearch can have multiple values for a field. These are represented as lists, so
+                # create lists for this pivot (see notes above)
+                if field_name in out:
+                    if type(out[field_name]) is not list:
+                        out[field_name] = [out[field_name]]
+                    out[field_name].append(x)
+                else:
+                    out[field_name] = x
+
+        flatten(y)
+
+        return out
+
     def _index_count(self):
         """
         Returns
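`_flatten_dict` collapses nested `_source` documents into dotted column names and collects multi-valued fields into lists. A standalone sketch of the same idea, without the mappings dependency that gates descent into source fields like geo_point (the sample document is made up):

    # Standalone sketch of the flattening behaviour; always descends into
    # dicts, unlike the real method which stops at mapped source fields.
    def flatten_dict(y):
        out = {}

        def flatten(x, name=''):
            if isinstance(x, dict):
                for key in x:
                    flatten(x[key], name + key + '.')
            elif isinstance(x, list):
                for item in x:
                    flatten(item, name)  # multi-valued field: keep same name
            else:
                field_name = name[:-1]
                if field_name in out:
                    # Second value for the same field -> promote to a list
                    if not isinstance(out[field_name], list):
                        out[field_name] = [out[field_name]]
                    out[field_name].append(x)
                else:
                    out[field_name] = x

        flatten(y)
        return out

    # {'location.lat': 52.38, 'location.lon': 4.9, 'tags': ['a', 'b']}
    print(flatten_dict({'location': {'lat': 52.38, 'lon': 4.9}, 'tags': ['a', 'b']}))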
@@ -99,6 +99,11 @@ FLIGHTS_MAPPING = { "mappings" : {
 FLIGHTS_FILE_NAME = ROOT_DIR + '/flights.json.gz'
 FLIGHTS_DF_FILE_NAME = ROOT_DIR + '/flights_df.json.gz'
 
+FLIGHTS_SMALL_INDEX_NAME = 'flights_small'
+FLIGHTS_SMALL_MAPPING = FLIGHTS_MAPPING
+FLIGHTS_SMALL_FILE_NAME = ROOT_DIR + '/flights_small.json.gz'
+FLIGHTS_SMALL_DF_FILE_NAME = ROOT_DIR + '/flights_small_df.json.gz'
+
 ECOMMERCE_INDEX_NAME = 'ecommerce'
 ECOMMERCE_MAPPING = { "mappings" : {
     "properties" : {
@@ -12,6 +12,7 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 # Create pandas and eland data frames
 from eland.tests import ELASTICSEARCH_HOST
 from eland.tests import FLIGHTS_DF_FILE_NAME, FLIGHTS_INDEX_NAME,\
+    FLIGHTS_SMALL_INDEX_NAME,\
     ECOMMERCE_DF_FILE_NAME, ECOMMERCE_INDEX_NAME
 
 _pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index()
@@ -20,6 +21,9 @@ _pd_flights['timestamp'] = \
 _pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int
 _ed_flights = ed.read_es(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME)
 
+_pd_flights_small = _pd_flights.head(48)
+_ed_flights_small = ed.read_es(ELASTICSEARCH_HOST, FLIGHTS_SMALL_INDEX_NAME)
+
 _pd_ecommerce = pd.read_json(ECOMMERCE_DF_FILE_NAME).sort_index()
 _pd_ecommerce['order_date'] = \
     pd.to_datetime(_pd_ecommerce['order_date'])
@@ -38,6 +42,13 @@ class TestData:
     def ed_flights(self):
         return _ed_flights
 
+    def pd_flights_small(self):
+        return _pd_flights_small
+
+    def ed_flights_small(self):
+        return _ed_flights_small
+
 
     def pd_ecommerce(self):
         return _pd_ecommerce
@@ -31,14 +31,5 @@ class TestDataFrameDescribe(TestData):
     # don't match the mapping types. This is mainly because the products field is
     # nested and so can be treated as a multi-field in ES, but not in pandas
-
-    def test_to_describe2(self):
-        pd_flights = self.pd_flights().head()
-        ed_flights = self.ed_flights().head()
-
-        pd_describe = pd_flights.describe()
-        # This fails as we can not run 'describe' on a truncated ed DataFrame
-        ed_describe = ed_flights.describe()
-
-        print(pd_describe)
-        print(ed_describe)
+    # We also can not run 'describe' on a truncated ed DataFrame
@@ -14,8 +14,8 @@ import numpy as np
 class TestDataFrameDrop(TestData):
 
     def test_drop1(self):
-        ed_flights = self.ed_flights()
-        pd_flights = self.pd_flights()
+        ed_flights_small = self.ed_flights_small()
+        pd_flights_small = self.pd_flights_small()
 
         # ['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID',
         # 'DestCityName', 'DestCountry', 'DestLocation', 'DestRegion',
@@ -24,33 +24,17 @@ class TestDataFrameDrop(TestData):
         # 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
         # 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather',
         # 'dayOfWeek', 'timestamp']
-        pd_col0 = pd_flights.drop(['Carrier', 'DestCityName'], axis=1)
-        pd_col1 = pd_flights.drop(columns=['Carrier', 'DestCityName'])
+        pd_col0 = pd_flights_small.drop(['Carrier', 'DestCityName'], axis=1)
+        pd_col1 = pd_flights_small.drop(columns=['Carrier', 'DestCityName'])
 
-        ed_col0 = ed_flights.drop(['Carrier', 'DestCityName'], axis=1)
-        ed_col1 = ed_flights.drop(columns=['Carrier', 'DestCityName'])
+        ed_col0 = ed_flights_small.drop(['Carrier', 'DestCityName'], axis=1)
+        ed_col1 = ed_flights_small.drop(columns=['Carrier', 'DestCityName'])
 
-        #assert_pandas_eland_frame_equal(pd_col0, ed_col0)
-        #assert_pandas_eland_frame_equal(pd_col1, ed_col1)
+        assert_pandas_eland_frame_equal(pd_col0, ed_col0)
+        assert_pandas_eland_frame_equal(pd_col1, ed_col1)
 
         # Drop rows by index
-        pd_idx0 = pd_flights.drop(['1', '2'])
-        ed_idx0 = ed_flights.drop(['1', '2'])
-
-        print(pd_idx0.info())
-        print(ed_idx0.info())
+        pd_idx0 = pd_flights_small.drop(['1', '2'])
+        ed_idx0 = ed_flights_small.drop(['1', '2'])
 
         assert_pandas_eland_frame_equal(pd_idx0, ed_idx0)
 
         """
         #assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series
         assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1)
         assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2)
         assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3)
         assert_pandas_eland_frame_equal(pd_iloc4, ed_iloc4)
         #assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool
         assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6)
         assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7)
         assert_pandas_eland_frame_equal(pd_iloc8, ed_iloc8)
         assert_pandas_eland_frame_equal(pd_iloc9, ed_iloc9)
         """
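Switching to the 48-row flights_small index makes drop-by-index testable: per the query-compiler notes above, df.drop(['1', '2']) maps to a must_not/ids clause, which only composes safely when no size limit is in play. A sketch of the raw query behind it (the client call is illustrative):

    from elasticsearch import Elasticsearch

    es = Elasticsearch('localhost:9200')  # host is an assumption

    # Raw query behind df.drop(['1', '2']), per the query-compiler notes.
    query = {'query': {'bool': {'must': [],
                                'must_not': [{'ids': {'values': ['1', '2']}}]}}}
    result = es.search(index='flights_small', body=query)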
@@ -30,3 +30,15 @@ class TestDataFrameHist(TestData):
         # Numbers are slightly different
         assert_almost_equal(pd_bins, ed_bins)
         assert_almost_equal(pd_weights, ed_weights)
+
+    def test_hist2(self):
+        pd_df = self.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
+        ed_df = self.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
+
+        num_bins = 10
+
+        ed_bins, ed_weights = ed_df._hist(num_bins=num_bins)
+
+        print(ed_bins)
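For comparison, the pandas-side bins and weights the earlier test checks against can be derived with numpy (a sketch; `_hist` is eland's internal helper and its exact return shape is not shown in this diff):

    import numpy as np
    import pandas as pd

    # Sketch: compute bins and weights for one column the numpy way,
    # to compare against eland's _hist output.
    s = pd.Series([10.0, 20.0, 20.0, 35.0, 50.0])  # made-up sample data
    weights, bin_edges = np.histogram(s, bins=10)
    print(bin_edges)   # 11 edges for 10 bins
    print(weights)     # counts per bin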
@@ -1,8 +1,11 @@
 # File called _pytest for PyCharm compatibility
 
-from eland.tests.common import TestData
+import gzip
+
+import pandas as pd
+
+import eland as ed
+from eland.tests.common import TestData
 
 
 class TestDataFrameReviews(TestData):
@@ -13,5 +16,16 @@ class TestDataFrameReviews(TestData):
         print(ed_reviews.head())
         print(ed_reviews.describe())
         print(ed_reviews.info())
-        print(ed_reviews.hist(column="rating", bins = 5))
-        #print(ed_reviews.head().info_es())
+        print(ed_reviews.hist(column="rating", bins=5))
+        # print(ed_reviews.head().info_es())
+
+    def test_review(self):
+        csv_handle = gzip.open('../anonreviews.csv.gz')
+
+        reviews = pd.read_csv(csv_handle)
+
+        reviews['date'] = pd.to_datetime(reviews['date'])
+
+        g = reviews.groupby('reviewerId')
+
+        print(g.describe())
eland/tests/dataframe/test_to_csv_pytest.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+# File called _pytest for PyCharm compatibility
+
+import numpy as np
+import pandas as pd
+
+import eland as ed
+from eland.tests.common import ELASTICSEARCH_HOST
+from eland.tests.common import TestData
+
+
+class TestDataFrameToCSV(TestData):
+
+    def test_to_csv(self):
+        print("TODO")
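The test body is still a placeholder. A hedged sketch of the round-trip check it might eventually perform; to_csv support is exactly what this commit starts to build, so this is aspirational, not part of the commit:

    # Aspirational round-trip sketch - not in this commit.
    def test_to_csv_roundtrip(self):
        ed_flights_small = self.ed_flights_small()
        pd_flights_small = self.pd_flights_small()

        # Write the eland frame out in chunks, then read it back with pandas
        ed_flights_small.to_csv('flights_small.csv', chunksize=10)
        roundtrip = pd.read_csv('flights_small.csv', index_col=0)

        assert roundtrip.shape == pd_flights_small.shape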
Binary file not shown.

eland/tests/flights_small.json.gz (new binary file, not shown)

File diff suppressed because one or more lines are too long.
@@ -5,7 +5,7 @@ from eland.tests.common import TestData
 from matplotlib.testing.decorators import check_figures_equal
 
 @check_figures_equal(extensions=['png'])
-def test_plot(fig_test, fig_ref):
+def test_plot_hist(fig_test, fig_ref):
     test_data = TestData()
 
     pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
@@ -16,4 +16,3 @@ def test_plot_hist(fig_test, fig_ref):
 
     ed_ax = fig_test.subplots()
     ed_flights.hist(ax=ed_ax)
-
@@ -6,6 +6,7 @@ from eland.tests import *
 
 DATA_LIST = [
     (FLIGHTS_FILE_NAME, FLIGHTS_INDEX_NAME, FLIGHTS_MAPPING),
+    (FLIGHTS_SMALL_FILE_NAME, FLIGHTS_SMALL_INDEX_NAME, FLIGHTS_MAPPING),
    (ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING)
 ]
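DATA_LIST pairs each source file with an index name and mapping; the setup script presumably iterates over it to recreate and bulk-load each index. A sketch of such a loader (the helper shape and one-JSON-document-per-line file format are assumptions):

    # Sketch of a loader driven by DATA_LIST - helper shape is an assumption.
    import gzip
    import json

    from elasticsearch import Elasticsearch, helpers

    def load_index(es, file_name, index_name, mapping):
        # Recreate the index with the supplied mapping, then bulk-load docs.
        es.indices.delete(index=index_name, ignore=[404])
        es.indices.create(index=index_name, body=mapping)
        with gzip.open(file_name) as f:
            # Assumes one JSON document per line in the gzipped file.
            docs = ({'_index': index_name, '_source': json.loads(line)} for line in f)
            helpers.bulk(es, docs)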