Renamed ed_to_pd to eland_to_pandas (and pd_to_ed to pandas_to_eland) and added docs.

+ added some entries to .gitignore + removed DataFrame.squeeze for now
2025-07-11 00:02:14 +08:00 · 2019-11-15 11:21:27 +00:00 · 2019-11-15 11:21:27 +00:00 · f5025b9f39
commit f5025b9f39
parent 29fe2278b7
18 changed files with 130 additions and 62 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,7 +2,13 @@
 *.pyc

 # Setuptools distribution folder.
-/dist/
+dist/
+
+# Build folder
+build/
+
+# docs build folder
+docs/build/

 # Python egg metadata, regenerated from source files by setuptools.
 /*.egg-info
@ -36,4 +42,4 @@ env/
 venv/
 ENV/
 env.bak/
-venv.bak/
+venv.bak/
--- a/docs/source/reference/api/eland.ed_to_pd.rst
+++ b/docs/source/reference/api/eland.ed_to_pd.rst
@ -1,6 +0,0 @@
-eland.ed_to_pd
-==============
-
-.. currentmodule:: eland
-
-.. autofunction:: ed_to_pd
--- a/docs/source/reference/api/eland.eland_to_pandas.rst
+++ b/docs/source/reference/api/eland.eland_to_pandas.rst
@ -0,0 +1,6 @@
+eland.eland_to_pandas
+=====================
+
+.. currentmodule:: eland
+
+.. autofunction:: eland_to_pandas
--- a/docs/source/reference/api/eland.pandas_to_eland.rst
+++ b/docs/source/reference/api/eland.pandas_to_eland.rst
@ -0,0 +1,6 @@
+eland.pandas_to_eland
+=====================
+
+.. currentmodule:: eland
+
+.. autofunction:: pandas_to_eland
--- a/docs/source/reference/api/eland.pd_to_ed.rst
+++ b/docs/source/reference/api/eland.pd_to_ed.rst
@ -1,6 +0,0 @@
-eland.pd_to_ed
-==============
-
-.. currentmodule:: eland
-
-.. autofunction:: pd_to_ed
--- a/docs/source/reference/dataframe.rst
+++ b/docs/source/reference/dataframe.rst
@ -24,6 +24,7 @@ Attributes and underlying data
   DataFrame.dtypes   
   DataFrame.select_dtypes   
   DataFrame.empty   
+   DataFrame.shape

 Indexing, iteration
 ~~~~~~~~~~~~~~~~~~~
@ -80,6 +81,9 @@ Serialization / IO / conversion
   :toctree: api/

   DataFrame.info
+   DataFrame.to_csv
+   DataFrame.to_html
+   DataFrame.to_string

 Elasticsearch utilities
 ~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/reference/general_utility_functions.rst
+++ b/docs/source/reference/general_utility_functions.rst
@ -17,5 +17,5 @@ Pandas and Eland
 .. autosummary::
   :toctree: api/

-    pd_to_ed
-    ed_to_pd
+    pandas_to_eland
+    eland_to_pandas
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@ -76,6 +76,7 @@ class DataFrame(NDFrame):
    [5 rows x 2 columns]

    Constructing DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the  DataFrame index field
+    (TODO - currently index_field must also be a field if not _id)

    >>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'], index_field='timestamp')
    >>> df.head()
@ -529,7 +530,11 @@ class DataFrame(NDFrame):
                bold_rows=True, classes=None, escape=True, notebook=False,
                border=None, table_id=None, render_links=False):
        """
-        From pandas - except we set max_rows default to avoid careless extraction of entire index
+        Render Elasticsearch data as an HTML table.
+
+        See Also
+        --------
+        :pandas_api_docs:`to_html` for argument details.
        """
        if max_rows is None:
            warnings.warn("DataFrame.to_string called without max_rows set "
@ -568,7 +573,13 @@ class DataFrame(NDFrame):
                  max_rows=None, max_cols=None, show_dimensions=False,
                  decimal='.', line_width=None):
        """
-        From pandas - except we set max_rows default to avoid careless extraction of entire index
+        Render a DataFrame to a console-friendly tabular output.
+
+        Follows pandas implementation except we set max_rows default to avoid careless extraction of entire index.
+
+        See Also
+        --------
+        :pandas_api_docs:`to_string` for argument details.
        """
        if max_rows is None:
            warnings.warn("DataFrame.to_string called without max_rows set "
@ -718,6 +729,13 @@ class DataFrame(NDFrame):
               quotechar='"', line_terminator=None, chunksize=None,
               tupleize_cols=None, date_format=None, doublequote=True,
               escapechar=None, decimal='.'):
+        """
+        Write Elasticsearch data to a comma-separated values (csv) file.
+
+        See Also
+        --------
+        :pandas_api_docs:`to_csv` for argument details.
+        """
        kwargs = {
            "path_or_buf": path_or_buf,
            "sep": sep,
@ -754,16 +772,34 @@ class DataFrame(NDFrame):
    def _empty_pd_df(self):
        return self._query_compiler._empty_pd_ef()

-    def squeeze(self, axis=None):
-        return DataFrame(
-            query_compiler=self._query_compiler.squeeze(axis)
-        )
-
    def select_dtypes(self, include=None, exclude=None):
        """
        Return a subset of the DataFrame's columns based on the column dtypes.

        Compatible with :pandas_api_docs:`pandas.DataFrame.select_dtypes`
+
+        Returns
+        -------
+        eland.DataFrame
+            DataFrame containing only columns of the selected dtypes
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights',
+        ... columns=['AvgTicketPrice', 'Dest', 'Cancelled', 'timestamp', 'dayOfWeek'])
+        >>> df.dtypes
+        AvgTicketPrice           float64
+        Dest                      object
+        Cancelled                   bool
+        timestamp         datetime64[ns]
+        dayOfWeek                  int64
+        dtype: object
+        >>> df = df.select_dtypes(include=[np.number, 'datetime'])
+        >>> df.dtypes
+        AvgTicketPrice           float64
+        timestamp         datetime64[ns]
+        dayOfWeek                  int64
+        dtype: object
        """
        empty_df = self._empty_pd_df()

@ -779,8 +815,20 @@ class DataFrame(NDFrame):
        Returns
        -------
        shape: tuple
-            0 - number of rows
-            1 - number of columns
+
+        0. number of rows
+        1. number of columns
+
+        Notes
+        -----
+        - number of rows ``len(df)`` queries Elasticsearch
+        - number of columns ``len(df.columns)`` is cached. If mappings are updated, DataFrame must be updated.
+
+        Examples
+        --------
+        >>> df = ed.read_es('localhost', 'ecommerce')
+        >>> df.shape
+        (4675, 45)
        """
        num_rows = len(self)
        num_columns = len(self.columns)
@ -891,9 +939,11 @@ class DataFrame(NDFrame):

        Examples
        --------
-        >>> df = ed.DataFrame('localhost', 'flights')
-        >>> df = df.query('FlightDelayMin > 60')
-        >>> df.info()
+        >>> df = ed.read_es('localhost', 'flights')
+        >>> df.shape
+        (13059, 27)
+        >>> df.query('FlightDelayMin > 60').shape
+        (2730, 27)
        """
        if isinstance(expr, BooleanFilter):
            return DataFrame(
--- a/eland/operations.py
+++ b/eland/operations.py
@ -539,10 +539,6 @@ class Operations:
        task = ('iloc', (index, columns))
        self._tasks.append(task)

-    def squeeze(self, axis):
-        task = ('squeeze', axis)
-        self._tasks.append(task)
-
    def index_count(self, query_compiler, field):
        # field is the index field so count values
        query_params, post_processing = self._resolve_tasks()
@ -660,8 +656,6 @@ class Operations:
                if column_indexer is None:
                    column_indexer = slice(None)
                df = df.iloc[index_indexer, column_indexer]
-            elif action[0] == 'squeeze':
-                df = df.squeeze(axis=action[1])
            # columns could be in here (and we ignore it)

        return df
--- a/eland/query_compiler.py
+++ b/eland/query_compiler.py
@ -369,13 +369,6 @@ class ElandQueryCompiler:

        return result

-    def squeeze(self, axis=None):
-        result = self.copy()
-
-        result._operations.squeeze(axis)
-
-        return result
-
    def view(self, index=None, columns=None):
        result = self.copy()

--- a/eland/tests/dataframe/test_datetime_pytest.py
+++ b/eland/tests/dataframe/test_datetime_pytest.py
@ -37,7 +37,7 @@ class TestDataFrameDateTime(TestData):
        # Now create index
        index_name = 'eland_test_generate_es_mappings'

-        ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
+        ed_df = ed.pandas_to_eland(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
        ed_df_head = ed_df.head()

        assert_pandas_eland_frame_equal(df, ed_df_head)
--- a/eland/tests/dataframe/test_describe_pytest.py
+++ b/eland/tests/dataframe/test_describe_pytest.py
@ -14,11 +14,11 @@ class TestDataFrameDescribe(TestData):
        pd_describe = pd_flights.describe()
        ed_describe = ed_flights.describe()

-        assert_almost_equal(pd_describe[['AvgTicketPrice']],
-                            ed_describe[['AvgTicketPrice']],
+        assert_almost_equal(pd_describe.drop(['25%','50%','75%'], axis='index'),
+                            ed_describe.drop(['25%','50%','75%'], axis='index'),
                            check_less_precise=True)

-        # TODO - this fails for all fields now as ES aggregations are approximate
+        # TODO - this fails for percentile fields as ES aggregations are approximate
        #        if ES percentile agg uses
        #        "hdr": {
        #           "number_of_significant_value_digits": 3
--- a/eland/tests/dataframe/test_dtypes_pytest.py
+++ b/eland/tests/dataframe/test_dtypes_pytest.py
@ -1,8 +1,11 @@
 # File called _pytest for PyCharm compatability

+import numpy as np
+
 from pandas.util.testing import assert_series_equal

 from eland.tests.common import TestData
+from eland.tests.common import assert_pandas_eland_frame_equal


 class TestDataFrameDtypes(TestData):
@ -12,3 +15,12 @@ class TestDataFrameDtypes(TestData):
        pd_flights = self.pd_flights()

        assert_series_equal(pd_flights.dtypes, ed_flights.dtypes)
+
+    def test_flights_select_dtypes(self):
+        ed_flights = self.ed_flights_small()
+        pd_flights = self.pd_flights_small()
+
+        assert_pandas_eland_frame_equal(
+            pd_flights.select_dtypes(include=np.number),
+            ed_flights.select_dtypes(include=np.number)
+        )
--- a/eland/tests/dataframe/test_query_pytest.py
+++ b/eland/tests/dataframe/test_query_pytest.py
@ -19,7 +19,7 @@ class TestDataFrameQuery(TestData):
        # Now create index
        index_name = 'eland_test_query'

-        ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
+        ed_df = ed.pandas_to_eland(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)

        assert_pandas_eland_frame_equal(pd_df, ed_df)

--- a/eland/tests/dataframe/test_repr_pytest.py
+++ b/eland/tests/dataframe/test_repr_pytest.py
@ -1,5 +1,7 @@
 # File called _pytest for PyCharm compatability

+import pytest
+
 from eland.tests.common import TestData


@ -12,8 +14,9 @@ class TestDataFrameRepr(TestData):
        ed_head_101 = ed_flights.head(101)
        pd_head_101 = pd_flights.head(101)

-        # This sets max_rows=60 by default
-        ed_head_101_str = ed_head_101.to_string()
+        # This sets max_rows=60 by default (but emits a UserWarning)
+        with pytest.warns(UserWarning):
+            ed_head_101_str = ed_head_101.to_string()
        pd_head_101_str = pd_head_101.to_string(max_rows=60)

        assert pd_head_101_str == ed_head_101_str
--- a/eland/tests/dataframe/test_utils_pytest.py
+++ b/eland/tests/dataframe/test_utils_pytest.py
@ -36,7 +36,7 @@ class TestDataFrameUtils(TestData):
        # Now create index
        index_name = 'eland_test_generate_es_mappings'

-        ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
+        ed_df = ed.pandas_to_eland(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
        ed_df_head = ed_df.head()

        assert_pandas_eland_frame_equal(df, ed_df_head)
--- a/eland/tests/plotting/test_dataframe_hist_pytest.py
+++ b/eland/tests/plotting/test_dataframe_hist_pytest.py
@ -1,5 +1,7 @@
 # File called _pytest for PyCharm compatability

+import pytest
+
 from matplotlib.testing.decorators import check_figures_equal

 from eland.tests.common import TestData
@ -12,8 +14,12 @@ def test_plot_hist(fig_test, fig_ref):
    pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
    ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]

-    pd_ax = fig_ref.subplots()
-    pd_flights.hist(ax=pd_ax)
+    # This throws a userwarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
+    with pytest.warns(UserWarning):
+        pd_ax = fig_ref.subplots()
+        pd_flights.hist(ax=pd_ax)

-    ed_ax = fig_test.subplots()
-    ed_flights.hist(ax=ed_ax)
+    # This throws a userwarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
+    with pytest.warns(UserWarning):
+        ed_ax = fig_test.subplots()
+        ed_flights.hist(ax=ed_ax)
--- a/eland/utils.py
+++ b/eland/utils.py
@ -26,13 +26,13 @@ def read_es(es_params, index_pattern):

    See Also
    --------
-    eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame
-    eland.ed_to_pd: Create a pandas.Dataframe from eland.DataFrame
+    eland.pandas_to_eland: Create an eland.Dataframe from pandas.DataFrame
+    eland.eland_to_pandas: Create a pandas.Dataframe from eland.DataFrame
    """
    return DataFrame(client=es_params, index_pattern=index_pattern)

-def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
-             geo_points=None):
+def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
+                    geo_points=None):
    """
    Append a pandas DataFrame to an Elasticsearch index.
    Mainly used in testing.
@ -66,11 +66,11 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
    See Also
    --------
    eland.read_es: Create an eland.Dataframe from an Elasticsearch index
-    eland.ed_to_pd: Create a pandas.Dataframe from eland.DataFrame
+    eland.eland_to_pandas: Create a pandas.Dataframe from eland.DataFrame
    """
    client = Client(es_params)

-    mapping = Mappings._generate_es_mappings(df, geo_points)
+    mapping = Mappings._generate_es_mappings(pd_df, geo_points)

    # If table exists, check if_exists parameter
    if client.index_exists(index=destination_index):
@ -92,7 +92,7 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
    # Now add data
    actions = []
    n = 0
-    for row in df.iterrows():
+    for row in pd_df.iterrows():
        # Use index as _id
        id = row[0]

@ -118,7 +118,7 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000

    return ed_df

-def ed_to_pd(ed_df):
+def eland_to_pandas(ed_df):
    """
    Convert an eland.Dataframe to a pandas.DataFrame

@ -138,7 +138,7 @@ def ed_to_pd(ed_df):
    See Also
    --------
    eland.read_es: Create an eland.Dataframe from an Elasticsearch index
-    eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame
+    eland.pandas_to_eland: Create an eland.Dataframe from pandas.DataFrame
    """
    return ed_df._to_pandas()