Renamed ed_to_pd to eland_to_pandas and added docs.

+ added some additions to .gitignore + removed DataFrame.squeeze for now
2025-07-11 00:02:14 +08:00 · 2019-11-15 11:21:27 +00:00 · 2019-11-15 11:21:27 +00:00 · f5025b9f39
commit f5025b9f39
parent 29fe2278b7
18 changed files with 130 additions and 62 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,7 +2,13 @@
 *.pyc
 # Setuptools distribution folder.
-/dist/
+dist/
 # Build folder
 build/
 # docs build folder
 docs/build/
 # Python egg metadata, regenerated from source files by setuptools.
 /*.egg-info
@ -36,4 +42,4 @@ env/
 venv/
 ENV/
 env.bak/
-venv.bak/
+venv.bak/
--- a/docs/source/reference/api/eland.ed_to_pd.rst
+++ b/docs/source/reference/api/eland.ed_to_pd.rst
@ -1,6 +0,0 @@
 eland.ed_to_pd
 ==============
 .. currentmodule:: eland
 .. autofunction:: ed_to_pd
--- a/docs/source/reference/api/eland.eland_to_pandas.rst
+++ b/docs/source/reference/api/eland.eland_to_pandas.rst
@ -0,0 +1,6 @@
 eland.eland_to_pandas
 =====================
 .. currentmodule:: eland
 .. autofunction:: eland_to_pandas
--- a/docs/source/reference/api/eland.pandas_to_eland.rst
+++ b/docs/source/reference/api/eland.pandas_to_eland.rst
@ -0,0 +1,6 @@
 eland.pandas_to_eland
 =====================
 .. currentmodule:: eland
 .. autofunction:: pandas_to_eland
--- a/docs/source/reference/api/eland.pd_to_ed.rst
+++ b/docs/source/reference/api/eland.pd_to_ed.rst
@ -1,6 +0,0 @@
 eland.pd_to_ed
 ==============
 .. currentmodule:: eland
 .. autofunction:: pd_to_ed
--- a/docs/source/reference/dataframe.rst
+++ b/docs/source/reference/dataframe.rst
@ -24,6 +24,7 @@ Attributes and underlying data
   DataFrame.dtypes   
   DataFrame.select_dtypes   
   DataFrame.empty   
   DataFrame.shape
 Indexing, iteration
 ~~~~~~~~~~~~~~~~~~~
@ -80,6 +81,9 @@ Serialization / IO / conversion
   :toctree: api/
   DataFrame.info
   DataFrame.to_csv
   DataFrame.to_html
   DataFrame.to_string
 Elasticsearch utilities
 ~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/reference/general_utility_functions.rst
+++ b/docs/source/reference/general_utility_functions.rst
@ -17,5 +17,5 @@ Pandas and Eland
 .. autosummary::
   :toctree: api/
-    pd_to_ed
+    pandas_to_eland
-    ed_to_pd
+    eland_to_pandas
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@ -76,6 +76,7 @@ class DataFrame(NDFrame):
    [5 rows x 2 columns]
    Constructing DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the  DataFrame index field
    (TODO - currently index_field must also be a field if not _id)
    >>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'], index_field='timestamp')
    >>> df.head()
@ -529,7 +530,11 @@ class DataFrame(NDFrame):
                bold_rows=True, classes=None, escape=True, notebook=False,
                border=None, table_id=None, render_links=False):
        """
-        From pandas - except we set max_rows default to avoid careless extraction of entire index
+        Render Elasticsearch data as an HTML table.
        See Also
        --------
        :pandas_api_docs:`to_html` for argument details.
        """
        if max_rows is None:
            warnings.warn("DataFrame.to_string called without max_rows set "
@ -568,7 +573,13 @@ class DataFrame(NDFrame):
                  max_rows=None, max_cols=None, show_dimensions=False,
                  decimal='.', line_width=None):
        """
-        From pandas - except we set max_rows default to avoid careless extraction of entire index
+        Render a DataFrame to a console-friendly tabular output.
        Follows pandas implementation except we set max_rows default to avoid careless extraction of entire index.
        See Also
        --------
        :pandas_api_docs:`to_string` for argument details.
        """
        if max_rows is None:
            warnings.warn("DataFrame.to_string called without max_rows set "
@ -718,6 +729,13 @@ class DataFrame(NDFrame):
               quotechar='"', line_terminator=None, chunksize=None,
               tupleize_cols=None, date_format=None, doublequote=True,
               escapechar=None, decimal='.'):
        """
        Write Elasticsearch data to a comma-separated values (csv) file.
        See Also
        --------
        :pandas_api_docs:`to_csv` for argument details.
        """
        kwargs = {
            "path_or_buf": path_or_buf,
            "sep": sep,
@ -754,16 +772,34 @@ class DataFrame(NDFrame):
    def _empty_pd_df(self):
        return self._query_compiler._empty_pd_ef()
    def squeeze(self, axis=None):
        return DataFrame(
            query_compiler=self._query_compiler.squeeze(axis)
        )
    def select_dtypes(self, include=None, exclude=None):
        """
        Return a subset of the DataFrame's columns based on the column dtypes.
        Compatible with :pandas_api_docs:`pandas.DataFrame.select_dtypes`
        Returns
        -------
        eland.DataFrame
            DataFrame contains only columns of selected dtypes
        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights',
        ... columns=['AvgTicketPrice', 'Dest', 'Cancelled', 'timestamp', 'dayOfWeek'])
        >>> df.dtypes
        AvgTicketPrice           float64
        Dest                      object
        Cancelled                   bool
        timestamp         datetime64[ns]
        dayOfWeek                  int64
        dtype: object
        >>> df = df.select_dtypes(include=[np.number, 'datetime'])
        >>> df.dtypes
        AvgTicketPrice           float64
        timestamp         datetime64[ns]
        dayOfWeek                  int64
        dtype: object
        """
        empty_df = self._empty_pd_df()
@ -779,8 +815,20 @@ class DataFrame(NDFrame):
        Returns
        -------
        shape: tuple
-            0 - number of rows
+
-            1 - number of columns
+        0. number of rows
        1. number of columns
        Notes
        -----
        - number of rows ``len(df)`` queries Elasticsearch
        - number of columns ``len(df.columns)`` is cached. If mappings are updated, DataFrame must be updated.
        Examples
        --------
        >>> df = ed.read_es('localhost', 'ecommerce')
        >>> df.shape
        (4675, 45)
        """
        num_rows = len(self)
        num_columns = len(self.columns)
@ -891,9 +939,11 @@ class DataFrame(NDFrame):
        Examples
        --------
-        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df = ed.read_es('localhost', 'flights')
-        >>> df = df.query('FlightDelayMin > 60')
+        >>> df.shape
-        >>> df.info()
+        (13059, 27)
        >>> df.query('FlightDelayMin > 60').shape
        (2730, 27)
        """
        if isinstance(expr, BooleanFilter):
            return DataFrame(
--- a/eland/operations.py
+++ b/eland/operations.py
@ -539,10 +539,6 @@ class Operations:
        task = ('iloc', (index, columns))
        self._tasks.append(task)
    def squeeze(self, axis):
        task = ('squeeze', axis)
        self._tasks.append(task)
    def index_count(self, query_compiler, field):
        # field is the index field so count values
        query_params, post_processing = self._resolve_tasks()
@ -660,8 +656,6 @@ class Operations:
                if column_indexer is None:
                    column_indexer = slice(None)
                df = df.iloc[index_indexer, column_indexer]
            elif action[0] == 'squeeze':
                df = df.squeeze(axis=action[1])
            # columns could be in here (and we ignore it)
        return df
--- a/eland/query_compiler.py
+++ b/eland/query_compiler.py
@ -369,13 +369,6 @@ class ElandQueryCompiler:
        return result
    def squeeze(self, axis=None):
        result = self.copy()
        result._operations.squeeze(axis)
        return result
    def view(self, index=None, columns=None):
        result = self.copy()
--- a/eland/tests/dataframe/test_datetime_pytest.py
+++ b/eland/tests/dataframe/test_datetime_pytest.py
@ -37,7 +37,7 @@ class TestDataFrameDateTime(TestData):
        # Now create index
        index_name = 'eland_test_generate_es_mappings'
-        ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
+        ed_df = ed.pandas_to_eland(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
        ed_df_head = ed_df.head()
        assert_pandas_eland_frame_equal(df, ed_df_head)
--- a/eland/tests/dataframe/test_describe_pytest.py
+++ b/eland/tests/dataframe/test_describe_pytest.py
@ -14,11 +14,11 @@ class TestDataFrameDescribe(TestData):
        pd_describe = pd_flights.describe()
        ed_describe = ed_flights.describe()
-        assert_almost_equal(pd_describe[['AvgTicketPrice']],
+        assert_almost_equal(pd_describe.drop(['25%','50%','75%'], axis='index'),
-                            ed_describe[['AvgTicketPrice']],
+                            ed_describe.drop(['25%','50%','75%'], axis='index'),
                            check_less_precise=True)
-        # TODO - this fails for all fields now as ES aggregations are approximate
+        # TODO - this fails for percentile fields as ES aggregations are approximate
        #        if ES percentile agg uses
        #        "hdr": {
        #           "number_of_significant_value_digits": 3
--- a/eland/tests/dataframe/test_dtypes_pytest.py
+++ b/eland/tests/dataframe/test_dtypes_pytest.py
@ -1,8 +1,11 @@
 # File called _pytest for PyCharm compatibility
 import numpy as np
 from pandas.util.testing import assert_series_equal
 from eland.tests.common import TestData
 from eland.tests.common import assert_pandas_eland_frame_equal
 class TestDataFrameDtypes(TestData):
@ -12,3 +15,12 @@ class TestDataFrameDtypes(TestData):
        pd_flights = self.pd_flights()
        assert_series_equal(pd_flights.dtypes, ed_flights.dtypes)
    def test_flights_select_dtypes(self):
        ed_flights = self.ed_flights_small()
        pd_flights = self.pd_flights_small()
        assert_pandas_eland_frame_equal(
            pd_flights.select_dtypes(include=np.number),
            ed_flights.select_dtypes(include=np.number)
        )
--- a/eland/tests/dataframe/test_query_pytest.py
+++ b/eland/tests/dataframe/test_query_pytest.py
@ -19,7 +19,7 @@ class TestDataFrameQuery(TestData):
        # Now create index
        index_name = 'eland_test_query'
-        ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
+        ed_df = ed.pandas_to_eland(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
        assert_pandas_eland_frame_equal(pd_df, ed_df)
--- a/eland/tests/dataframe/test_repr_pytest.py
+++ b/eland/tests/dataframe/test_repr_pytest.py
@ -1,5 +1,7 @@
 # File called _pytest for PyCharm compatibility
 import pytest
 from eland.tests.common import TestData
@ -12,8 +14,9 @@ class TestDataFrameRepr(TestData):
        ed_head_101 = ed_flights.head(101)
        pd_head_101 = pd_flights.head(101)
-        # This sets max_rows=60 by default
+        # This sets max_rows=60 by default (but emits a UserWarning)
-        ed_head_101_str = ed_head_101.to_string()
+        with pytest.warns(UserWarning):
            ed_head_101_str = ed_head_101.to_string()
        pd_head_101_str = pd_head_101.to_string(max_rows=60)
        assert pd_head_101_str == ed_head_101_str
--- a/eland/tests/dataframe/test_utils_pytest.py
+++ b/eland/tests/dataframe/test_utils_pytest.py
@ -36,7 +36,7 @@ class TestDataFrameUtils(TestData):
        # Now create index
        index_name = 'eland_test_generate_es_mappings'
-        ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
+        ed_df = ed.pandas_to_eland(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
        ed_df_head = ed_df.head()
        assert_pandas_eland_frame_equal(df, ed_df_head)
--- a/eland/tests/plotting/test_dataframe_hist_pytest.py
+++ b/eland/tests/plotting/test_dataframe_hist_pytest.py
@ -1,5 +1,7 @@
 # File called _pytest for PyCharm compatibility
 import pytest
 from matplotlib.testing.decorators import check_figures_equal
 from eland.tests.common import TestData
@ -12,8 +14,12 @@ def test_plot_hist(fig_test, fig_ref):
    pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
    ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
-    pd_ax = fig_ref.subplots()
+    # This emits a UserWarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
-    pd_flights.hist(ax=pd_ax)
+    with pytest.warns(UserWarning):
        pd_ax = fig_ref.subplots()
        pd_flights.hist(ax=pd_ax)
-    ed_ax = fig_test.subplots()
+    # This emits a UserWarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
-    ed_flights.hist(ax=ed_ax)
+    with pytest.warns(UserWarning):
        ed_ax = fig_test.subplots()
        ed_flights.hist(ax=ed_ax)
--- a/eland/utils.py
+++ b/eland/utils.py
@ -26,13 +26,13 @@ def read_es(es_params, index_pattern):
    See Also
    --------
-    eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame
+    eland.pandas_to_eland: Create an eland.DataFrame from pandas.DataFrame
-    eland.ed_to_pd: Create a pandas.Dataframe from eland.DataFrame
+    eland.eland_to_pandas: Create a pandas.DataFrame from eland.DataFrame
    """
    return DataFrame(client=es_params, index_pattern=index_pattern)
-def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
+def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
-             geo_points=None):
+                    geo_points=None):
    """
    Append a pandas DataFrame to an Elasticsearch index.
    Mainly used in testing.
@ -66,11 +66,11 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
    See Also
    --------
    eland.read_es: Create an eland.DataFrame from an Elasticsearch index
-    eland.ed_to_pd: Create a pandas.Dataframe from eland.DataFrame
+    eland.eland_to_pandas: Create a pandas.DataFrame from eland.DataFrame
    """
    client = Client(es_params)
-    mapping = Mappings._generate_es_mappings(df, geo_points)
+    mapping = Mappings._generate_es_mappings(pd_df, geo_points)
    # If table exists, check if_exists parameter
    if client.index_exists(index=destination_index):
@ -92,7 +92,7 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
    # Now add data
    actions = []
    n = 0
-    for row in df.iterrows():
+    for row in pd_df.iterrows():
        # Use index as _id
        id = row[0]
@ -118,7 +118,7 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
    return ed_df
-def ed_to_pd(ed_df):
+def eland_to_pandas(ed_df):
    """
    Convert an eland.DataFrame to a pandas.DataFrame
@ -138,7 +138,7 @@ def ed_to_pd(ed_df):
    See Also
    --------
    eland.read_es: Create an eland.DataFrame from an Elasticsearch index
-    eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame
+    eland.pandas_to_eland: Create an eland.DataFrame from pandas.DataFrame
    """
    return ed_df._to_pandas()