From f5025b9f395d6cd9dad5dea779ffd3285c3a2ce6 Mon Sep 17 00:00:00 2001
From: Stephen Dodson <steve.dodson@elastic.co>
Date: Fri, 15 Nov 2019 11:21:27 +0000
Subject: [PATCH] Renamed ed_to_pd to eland_to_pandas (and pd_to_ed to pandas_to_eland) and added docs.

+ added some entries to .gitignore
+ removed DataFrame.squeeze for now
---
 .gitignore                                    | 10 ++-
 docs/source/reference/api/eland.ed_to_pd.rst  |  6 --
 .../reference/api/eland.eland_to_pandas.rst   |  6 ++
 .../reference/api/eland.pandas_to_eland.rst   |  6 ++
 docs/source/reference/api/eland.pd_to_ed.rst  |  6 --
 docs/source/reference/dataframe.rst           |  4 +
 .../reference/general_utility_functions.rst   |  4 +-
 eland/dataframe.py                            | 74 ++++++++++++++++---
 eland/operations.py                           |  6 --
 eland/query_compiler.py                       |  7 --
 eland/tests/dataframe/test_datetime_pytest.py |  2 +-
 eland/tests/dataframe/test_describe_pytest.py |  6 +-
 eland/tests/dataframe/test_dtypes_pytest.py   | 12 +++
 eland/tests/dataframe/test_query_pytest.py    |  2 +-
 eland/tests/dataframe/test_repr_pytest.py     |  7 +-
 eland/tests/dataframe/test_utils_pytest.py    |  2 +-
 .../plotting/test_dataframe_hist_pytest.py    | 14 +++-
 eland/utils.py                                | 18 ++---
 18 files changed, 130 insertions(+), 62 deletions(-)
 delete mode 100644 docs/source/reference/api/eland.ed_to_pd.rst
 create mode 100644 docs/source/reference/api/eland.eland_to_pandas.rst
 create mode 100644 docs/source/reference/api/eland.pandas_to_eland.rst
 delete mode 100644 docs/source/reference/api/eland.pd_to_ed.rst

diff --git a/.gitignore b/.gitignore
index 8969c38..4de1325 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,13 @@
 *.pyc
 
 # Setuptools distribution folder.
-/dist/
+dist/
+
+# Build folder
+build/
+
+# docs build folder
+docs/build/
 
 # Python egg metadata, regenerated from source files by setuptools.
 /*.egg-info
@@ -36,4 +42,4 @@ env/
 venv/
 ENV/
 env.bak/
-venv.bak/
\ No newline at end of file
+venv.bak/
diff --git a/docs/source/reference/api/eland.ed_to_pd.rst b/docs/source/reference/api/eland.ed_to_pd.rst
deleted file mode 100644
index 55dcf64..0000000
--- a/docs/source/reference/api/eland.ed_to_pd.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-eland.ed_to_pd
-==============
-
-.. currentmodule:: eland
-
-.. autofunction:: ed_to_pd
diff --git a/docs/source/reference/api/eland.eland_to_pandas.rst b/docs/source/reference/api/eland.eland_to_pandas.rst
new file mode 100644
index 0000000..eb87670
--- /dev/null
+++ b/docs/source/reference/api/eland.eland_to_pandas.rst
@@ -0,0 +1,6 @@
+eland.eland_to_pandas
+=====================
+
+.. currentmodule:: eland
+
+.. autofunction:: eland_to_pandas
diff --git a/docs/source/reference/api/eland.pandas_to_eland.rst b/docs/source/reference/api/eland.pandas_to_eland.rst
new file mode 100644
index 0000000..c24836e
--- /dev/null
+++ b/docs/source/reference/api/eland.pandas_to_eland.rst
@@ -0,0 +1,6 @@
+eland.pandas_to_eland
+=====================
+
+.. currentmodule:: eland
+
+.. autofunction:: pandas_to_eland
diff --git a/docs/source/reference/api/eland.pd_to_ed.rst b/docs/source/reference/api/eland.pd_to_ed.rst
deleted file mode 100644
index 615c987..0000000
--- a/docs/source/reference/api/eland.pd_to_ed.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-eland.pd_to_ed
-==============
-
-.. currentmodule:: eland
-
-.. autofunction:: pd_to_ed
diff --git a/docs/source/reference/dataframe.rst b/docs/source/reference/dataframe.rst
index f009456..e1e71fa 100644
--- a/docs/source/reference/dataframe.rst
+++ b/docs/source/reference/dataframe.rst
@@ -24,6 +24,7 @@ Attributes and underlying data
    DataFrame.dtypes   
    DataFrame.select_dtypes   
    DataFrame.empty   
+   DataFrame.shape
 
 Indexing, iteration
 ~~~~~~~~~~~~~~~~~~~
@@ -80,6 +81,9 @@ Serialization / IO / conversion
    :toctree: api/
 
    DataFrame.info
+   DataFrame.to_csv
+   DataFrame.to_html
+   DataFrame.to_string
 
 Elasticsearch utilities
 ~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/reference/general_utility_functions.rst b/docs/source/reference/general_utility_functions.rst
index 63e1865..fd6960d 100644
--- a/docs/source/reference/general_utility_functions.rst
+++ b/docs/source/reference/general_utility_functions.rst
@@ -17,5 +17,5 @@ Pandas and Eland
 .. autosummary::
    :toctree: api/
 
-    pd_to_ed
-    ed_to_pd
+    pandas_to_eland
+    eland_to_pandas
diff --git a/eland/dataframe.py b/eland/dataframe.py
index 34892df..33164da 100644
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@@ -76,6 +76,7 @@ class DataFrame(NDFrame):
     [5 rows x 2 columns]
 
     Constructing DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the  DataFrame index field
+    (TODO - currently index_field must also be a field if not _id)
 
     >>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'], index_field='timestamp')
     >>> df.head()
@@ -529,7 +530,11 @@ class DataFrame(NDFrame):
                 bold_rows=True, classes=None, escape=True, notebook=False,
                 border=None, table_id=None, render_links=False):
         """
-        From pandas - except we set max_rows default to avoid careless extraction of entire index
+        Render Elasticsearch data as an HTML table.
+
+        See Also
+        --------
+        :pandas_api_docs:`to_html` for argument details.
         """
         if max_rows is None:
             warnings.warn("DataFrame.to_string called without max_rows set "
@@ -568,7 +573,13 @@ class DataFrame(NDFrame):
                   max_rows=None, max_cols=None, show_dimensions=False,
                   decimal='.', line_width=None):
         """
-        From pandas - except we set max_rows default to avoid careless extraction of entire index
+        Render a DataFrame to a console-friendly tabular output.
+
+        Follows pandas implementation except we set max_rows default to avoid careless extraction of entire index.
+
+        See Also
+        --------
+        :pandas_api_docs:`to_string` for argument details.
         """
         if max_rows is None:
             warnings.warn("DataFrame.to_string called without max_rows set "
@@ -718,6 +729,13 @@ class DataFrame(NDFrame):
                quotechar='"', line_terminator=None, chunksize=None,
                tupleize_cols=None, date_format=None, doublequote=True,
                escapechar=None, decimal='.'):
+        """
+        Write Elasticsearch data to a comma-separated values (csv) file.
+
+        See Also
+        --------
+        :pandas_api_docs:`to_csv` for argument details.
+        """
         kwargs = {
             "path_or_buf": path_or_buf,
             "sep": sep,
@@ -754,16 +772,34 @@ class DataFrame(NDFrame):
     def _empty_pd_df(self):
         return self._query_compiler._empty_pd_ef()
 
-    def squeeze(self, axis=None):
-        return DataFrame(
-            query_compiler=self._query_compiler.squeeze(axis)
-        )
-
     def select_dtypes(self, include=None, exclude=None):
         """
         Return a subset of the DataFrame's columns based on the column dtypes.
 
         Compatible with :pandas_api_docs:`pandas.DataFrame.select_dtypes`
+
+        Returns
+        -------
+        eland.DataFrame
+            DataFrame containing only the columns of the selected dtypes
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights',
+        ... columns=['AvgTicketPrice', 'Dest', 'Cancelled', 'timestamp', 'dayOfWeek'])
+        >>> df.dtypes
+        AvgTicketPrice           float64
+        Dest                      object
+        Cancelled                   bool
+        timestamp         datetime64[ns]
+        dayOfWeek                  int64
+        dtype: object
+        >>> df = df.select_dtypes(include=[np.number, 'datetime'])
+        >>> df.dtypes
+        AvgTicketPrice           float64
+        timestamp         datetime64[ns]
+        dayOfWeek                  int64
+        dtype: object
         """
         empty_df = self._empty_pd_df()
 
@@ -779,8 +815,20 @@ class DataFrame(NDFrame):
         Returns
         -------
         shape: tuple
-            0 - number of rows
-            1 - number of columns
+
+        0. number of rows
+        1. number of columns
+
+        Notes
+        -----
+        - number of rows ``len(df)`` queries Elasticsearch
+        - number of columns ``len(df.columns)`` is cached. If mappings are updated, DataFrame must be updated.
+
+        Examples
+        --------
+        >>> df = ed.read_es('localhost', 'ecommerce')
+        >>> df.shape
+        (4675, 45)
         """
         num_rows = len(self)
         num_columns = len(self.columns)
@@ -891,9 +939,11 @@ class DataFrame(NDFrame):
 
         Examples
         --------
-        >>> df = ed.DataFrame('localhost', 'flights')
-        >>> df = df.query('FlightDelayMin > 60')
-        >>> df.info()
+        >>> df = ed.read_es('localhost', 'flights')
+        >>> df.shape
+        (13059, 27)
+        >>> df.query('FlightDelayMin > 60').shape
+        (2730, 27)
         """
         if isinstance(expr, BooleanFilter):
             return DataFrame(
diff --git a/eland/operations.py b/eland/operations.py
index 33faba0..aa1aea5 100644
--- a/eland/operations.py
+++ b/eland/operations.py
@@ -539,10 +539,6 @@ class Operations:
         task = ('iloc', (index, columns))
         self._tasks.append(task)
 
-    def squeeze(self, axis):
-        task = ('squeeze', axis)
-        self._tasks.append(task)
-
     def index_count(self, query_compiler, field):
         # field is the index field so count values
         query_params, post_processing = self._resolve_tasks()
@@ -660,8 +656,6 @@ class Operations:
                 if column_indexer is None:
                     column_indexer = slice(None)
                 df = df.iloc[index_indexer, column_indexer]
-            elif action[0] == 'squeeze':
-                df = df.squeeze(axis=action[1])
             # columns could be in here (and we ignore it)
 
         return df
diff --git a/eland/query_compiler.py b/eland/query_compiler.py
index e2cc5a5..ba6423a 100644
--- a/eland/query_compiler.py
+++ b/eland/query_compiler.py
@@ -369,13 +369,6 @@ class ElandQueryCompiler:
 
         return result
 
-    def squeeze(self, axis=None):
-        result = self.copy()
-
-        result._operations.squeeze(axis)
-
-        return result
-
     def view(self, index=None, columns=None):
         result = self.copy()
 
diff --git a/eland/tests/dataframe/test_datetime_pytest.py b/eland/tests/dataframe/test_datetime_pytest.py
index ae7fe8a..77dc2b8 100644
--- a/eland/tests/dataframe/test_datetime_pytest.py
+++ b/eland/tests/dataframe/test_datetime_pytest.py
@@ -37,7 +37,7 @@ class TestDataFrameDateTime(TestData):
         # Now create index
         index_name = 'eland_test_generate_es_mappings'
 
-        ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
+        ed_df = ed.pandas_to_eland(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
         ed_df_head = ed_df.head()
 
         assert_pandas_eland_frame_equal(df, ed_df_head)
diff --git a/eland/tests/dataframe/test_describe_pytest.py b/eland/tests/dataframe/test_describe_pytest.py
index 8cf96b7..af24e66 100644
--- a/eland/tests/dataframe/test_describe_pytest.py
+++ b/eland/tests/dataframe/test_describe_pytest.py
@@ -14,11 +14,11 @@ class TestDataFrameDescribe(TestData):
         pd_describe = pd_flights.describe()
         ed_describe = ed_flights.describe()
 
-        assert_almost_equal(pd_describe[['AvgTicketPrice']],
-                            ed_describe[['AvgTicketPrice']],
+        assert_almost_equal(pd_describe.drop(['25%','50%','75%'], axis='index'),
+                            ed_describe.drop(['25%','50%','75%'], axis='index'),
                             check_less_precise=True)
 
-        # TODO - this fails for all fields now as ES aggregations are approximate
+        # TODO - this fails for percentile fields as ES aggregations are approximate
         #        if ES percentile agg uses
         #        "hdr": {
         #           "number_of_significant_value_digits": 3
diff --git a/eland/tests/dataframe/test_dtypes_pytest.py b/eland/tests/dataframe/test_dtypes_pytest.py
index 2266283..2db1734 100644
--- a/eland/tests/dataframe/test_dtypes_pytest.py
+++ b/eland/tests/dataframe/test_dtypes_pytest.py
@@ -1,8 +1,11 @@
 # File called _pytest for PyCharm compatability
 
+import numpy as np
+
 from pandas.util.testing import assert_series_equal
 
 from eland.tests.common import TestData
+from eland.tests.common import assert_pandas_eland_frame_equal
 
 
 class TestDataFrameDtypes(TestData):
@@ -12,3 +15,12 @@ class TestDataFrameDtypes(TestData):
         pd_flights = self.pd_flights()
 
         assert_series_equal(pd_flights.dtypes, ed_flights.dtypes)
+
+    def test_flights_select_dtypes(self):
+        ed_flights = self.ed_flights_small()
+        pd_flights = self.pd_flights_small()
+
+        assert_pandas_eland_frame_equal(
+            pd_flights.select_dtypes(include=np.number),
+            ed_flights.select_dtypes(include=np.number)
+        )
diff --git a/eland/tests/dataframe/test_query_pytest.py b/eland/tests/dataframe/test_query_pytest.py
index 25cff71..0dcbc5f 100644
--- a/eland/tests/dataframe/test_query_pytest.py
+++ b/eland/tests/dataframe/test_query_pytest.py
@@ -19,7 +19,7 @@ class TestDataFrameQuery(TestData):
         # Now create index
         index_name = 'eland_test_query'
 
-        ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
+        ed_df = ed.pandas_to_eland(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
 
         assert_pandas_eland_frame_equal(pd_df, ed_df)
 
diff --git a/eland/tests/dataframe/test_repr_pytest.py b/eland/tests/dataframe/test_repr_pytest.py
index 456ae49..8210af0 100644
--- a/eland/tests/dataframe/test_repr_pytest.py
+++ b/eland/tests/dataframe/test_repr_pytest.py
@@ -1,5 +1,7 @@
 # File called _pytest for PyCharm compatability
 
+import pytest
+
 from eland.tests.common import TestData
 
 
@@ -12,8 +14,9 @@ class TestDataFrameRepr(TestData):
         ed_head_101 = ed_flights.head(101)
         pd_head_101 = pd_flights.head(101)
 
-        # This sets max_rows=60 by default
-        ed_head_101_str = ed_head_101.to_string()
+        # This sets max_rows=60 by default (but emits a UserWarning)
+        with pytest.warns(UserWarning):
+            ed_head_101_str = ed_head_101.to_string()
         pd_head_101_str = pd_head_101.to_string(max_rows=60)
 
         assert pd_head_101_str == ed_head_101_str
diff --git a/eland/tests/dataframe/test_utils_pytest.py b/eland/tests/dataframe/test_utils_pytest.py
index 021f32e..992dcc7 100644
--- a/eland/tests/dataframe/test_utils_pytest.py
+++ b/eland/tests/dataframe/test_utils_pytest.py
@@ -36,7 +36,7 @@ class TestDataFrameUtils(TestData):
         # Now create index
         index_name = 'eland_test_generate_es_mappings'
 
-        ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
+        ed_df = ed.pandas_to_eland(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
         ed_df_head = ed_df.head()
 
         assert_pandas_eland_frame_equal(df, ed_df_head)
diff --git a/eland/tests/plotting/test_dataframe_hist_pytest.py b/eland/tests/plotting/test_dataframe_hist_pytest.py
index 471b89e..7ee5eb8 100644
--- a/eland/tests/plotting/test_dataframe_hist_pytest.py
+++ b/eland/tests/plotting/test_dataframe_hist_pytest.py
@@ -1,5 +1,7 @@
 # File called _pytest for PyCharm compatability
 
+import pytest
+
 from matplotlib.testing.decorators import check_figures_equal
 
 from eland.tests.common import TestData
@@ -12,8 +14,12 @@ def test_plot_hist(fig_test, fig_ref):
     pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
     ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
 
-    pd_ax = fig_ref.subplots()
-    pd_flights.hist(ax=pd_ax)
+    # This emits a UserWarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
+    with pytest.warns(UserWarning):
+        pd_ax = fig_ref.subplots()
+        pd_flights.hist(ax=pd_ax)
 
-    ed_ax = fig_test.subplots()
-    ed_flights.hist(ax=ed_ax)
+    # This emits a UserWarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
+    with pytest.warns(UserWarning):
+        ed_ax = fig_test.subplots()
+        ed_flights.hist(ax=ed_ax)
diff --git a/eland/utils.py b/eland/utils.py
index e55e348..463e8d4 100644
--- a/eland/utils.py
+++ b/eland/utils.py
@@ -26,13 +26,13 @@ def read_es(es_params, index_pattern):
 
     See Also
     --------
-    eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame
-    eland.ed_to_pd: Create a pandas.Dataframe from eland.DataFrame
+    eland.pandas_to_eland: Create an eland.DataFrame from pandas.DataFrame
+    eland.eland_to_pandas: Create a pandas.DataFrame from eland.DataFrame
     """
     return DataFrame(client=es_params, index_pattern=index_pattern)
 
-def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
-             geo_points=None):
+def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
+                    geo_points=None):
     """
     Append a pandas DataFrame to an Elasticsearch index.
     Mainly used in testing.
@@ -66,11 +66,11 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
     See Also
     --------
     eland.read_es: Create an eland.Dataframe from an Elasticsearch index
-    eland.ed_to_pd: Create a pandas.Dataframe from eland.DataFrame
+    eland.eland_to_pandas: Create a pandas.DataFrame from eland.DataFrame
     """
     client = Client(es_params)
 
-    mapping = Mappings._generate_es_mappings(df, geo_points)
+    mapping = Mappings._generate_es_mappings(pd_df, geo_points)
 
     # If table exists, check if_exists parameter
     if client.index_exists(index=destination_index):
@@ -92,7 +92,7 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
     # Now add data
     actions = []
     n = 0
-    for row in df.iterrows():
+    for row in pd_df.iterrows():
         # Use index as _id
         id = row[0]
 
@@ -118,7 +118,7 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
 
     return ed_df
 
-def ed_to_pd(ed_df):
+def eland_to_pandas(ed_df):
     """
     Convert an eland.Dataframe to a pandas.DataFrame
 
@@ -138,7 +138,7 @@ def ed_to_pd(ed_df):
     See Also
     --------
     eland.read_es: Create an eland.Dataframe from an Elasticsearch index
-    eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame
+    eland.pandas_to_eland: Create an eland.DataFrame from pandas.DataFrame
     """
     return ed_df._to_pandas()