From f5025b9f395d6cd9dad5dea779ffd3285c3a2ce6 Mon Sep 17 00:00:00 2001 From: Stephen Dodson Date: Fri, 15 Nov 2019 11:21:27 +0000 Subject: [PATCH] Renamed ed_to_pd to eland_to_pandas and added docs. + added some additions to .gitignore + removed DataFrame.squeeze for now --- .gitignore | 10 ++- docs/source/reference/api/eland.ed_to_pd.rst | 6 -- .../reference/api/eland.eland_to_pandas.rst | 6 ++ .../reference/api/eland.pandas_to_eland.rst | 6 ++ docs/source/reference/api/eland.pd_to_ed.rst | 6 -- docs/source/reference/dataframe.rst | 4 + .../reference/general_utility_functions.rst | 4 +- eland/dataframe.py | 74 ++++++++++++++++--- eland/operations.py | 6 -- eland/query_compiler.py | 7 -- eland/tests/dataframe/test_datetime_pytest.py | 2 +- eland/tests/dataframe/test_describe_pytest.py | 6 +- eland/tests/dataframe/test_dtypes_pytest.py | 12 +++ eland/tests/dataframe/test_query_pytest.py | 2 +- eland/tests/dataframe/test_repr_pytest.py | 7 +- eland/tests/dataframe/test_utils_pytest.py | 2 +- .../plotting/test_dataframe_hist_pytest.py | 14 +++- eland/utils.py | 18 ++--- 18 files changed, 130 insertions(+), 62 deletions(-) delete mode 100644 docs/source/reference/api/eland.ed_to_pd.rst create mode 100644 docs/source/reference/api/eland.eland_to_pandas.rst create mode 100644 docs/source/reference/api/eland.pandas_to_eland.rst delete mode 100644 docs/source/reference/api/eland.pd_to_ed.rst diff --git a/.gitignore b/.gitignore index 8969c38..4de1325 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,13 @@ *.pyc # Setuptools distribution folder. -/dist/ +dist/ + +# Build folder +build/ + +# docs build folder +docs/build/ # Python egg metadata, regenerated from source files by setuptools. 
/*.egg-info @@ -36,4 +42,4 @@ env/ venv/ ENV/ env.bak/ -venv.bak/ \ No newline at end of file +venv.bak/ diff --git a/docs/source/reference/api/eland.ed_to_pd.rst b/docs/source/reference/api/eland.ed_to_pd.rst deleted file mode 100644 index 55dcf64..0000000 --- a/docs/source/reference/api/eland.ed_to_pd.rst +++ /dev/null @@ -1,6 +0,0 @@ -eland.ed_to_pd -============== - -.. currentmodule:: eland - -.. autofunction:: ed_to_pd diff --git a/docs/source/reference/api/eland.eland_to_pandas.rst b/docs/source/reference/api/eland.eland_to_pandas.rst new file mode 100644 index 0000000..eb87670 --- /dev/null +++ b/docs/source/reference/api/eland.eland_to_pandas.rst @@ -0,0 +1,6 @@ +eland.eland_to_pandas +===================== + +.. currentmodule:: eland + +.. autofunction:: eland_to_pandas diff --git a/docs/source/reference/api/eland.pandas_to_eland.rst b/docs/source/reference/api/eland.pandas_to_eland.rst new file mode 100644 index 0000000..c24836e --- /dev/null +++ b/docs/source/reference/api/eland.pandas_to_eland.rst @@ -0,0 +1,6 @@ +eland.pandas_to_eland +===================== + +.. currentmodule:: eland + +.. autofunction:: pandas_to_eland diff --git a/docs/source/reference/api/eland.pd_to_ed.rst b/docs/source/reference/api/eland.pd_to_ed.rst deleted file mode 100644 index 615c987..0000000 --- a/docs/source/reference/api/eland.pd_to_ed.rst +++ /dev/null @@ -1,6 +0,0 @@ -eland.pd_to_ed -============== - -.. currentmodule:: eland - -.. 
autofunction:: pd_to_ed diff --git a/docs/source/reference/dataframe.rst b/docs/source/reference/dataframe.rst index f009456..e1e71fa 100644 --- a/docs/source/reference/dataframe.rst +++ b/docs/source/reference/dataframe.rst @@ -24,6 +24,7 @@ Attributes and underlying data DataFrame.dtypes DataFrame.select_dtypes DataFrame.empty + DataFrame.shape Indexing, iteration ~~~~~~~~~~~~~~~~~~~ @@ -80,6 +81,9 @@ Serialization / IO / conversion :toctree: api/ DataFrame.info + DataFrame.to_csv + DataFrame.to_html + DataFrame.to_string Elasticsearch utilities ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/reference/general_utility_functions.rst b/docs/source/reference/general_utility_functions.rst index 63e1865..fd6960d 100644 --- a/docs/source/reference/general_utility_functions.rst +++ b/docs/source/reference/general_utility_functions.rst @@ -17,5 +17,5 @@ Pandas and Eland .. autosummary:: :toctree: api/ - pd_to_ed - ed_to_pd + pandas_to_eland + eland_to_pandas diff --git a/eland/dataframe.py b/eland/dataframe.py index 34892df..33164da 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -76,6 +76,7 @@ class DataFrame(NDFrame): [5 rows x 2 columns] Constructing DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the DataFrame index field + (TODO - currently index_field must also be a field if not _id) >>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'], index_field='timestamp') >>> df.head() @@ -529,7 +530,11 @@ class DataFrame(NDFrame): bold_rows=True, classes=None, escape=True, notebook=False, border=None, table_id=None, render_links=False): """ - From pandas - except we set max_rows default to avoid careless extraction of entire index + Render Elasticsearch data as an HTML table. + + See Also + -------- + :pandas_api_docs:`to_html` for argument details. 
""" if max_rows is None: warnings.warn("DataFrame.to_string called without max_rows set " @@ -568,7 +573,13 @@ class DataFrame(NDFrame): max_rows=None, max_cols=None, show_dimensions=False, decimal='.', line_width=None): """ - From pandas - except we set max_rows default to avoid careless extraction of entire index + Render a DataFrame to a console-friendly tabular output. + + Follows pandas implementation except we set max_rows default to avoid careless extraction of entire index. + + See Also + -------- + :pandas_api_docs:`to_string` for argument details. """ if max_rows is None: warnings.warn("DataFrame.to_string called without max_rows set " @@ -718,6 +729,13 @@ class DataFrame(NDFrame): quotechar='"', line_terminator=None, chunksize=None, tupleize_cols=None, date_format=None, doublequote=True, escapechar=None, decimal='.'): + """ + Write Elasticsearch data to a comma-separated values (csv) file. + + See Also + -------- + :pandas_api_docs:`to_csv` for argument details. + """ kwargs = { "path_or_buf": path_or_buf, "sep": sep, @@ -754,16 +772,34 @@ class DataFrame(NDFrame): def _empty_pd_df(self): return self._query_compiler._empty_pd_ef() - def squeeze(self, axis=None): - return DataFrame( - query_compiler=self._query_compiler.squeeze(axis) - ) - def select_dtypes(self, include=None, exclude=None): """ Return a subset of the DataFrame's columns based on the column dtypes. Compatible with :pandas_api_docs:`pandas.DataFrame.select_dtypes` + + Returns + ------- + eland.DataFrame + DataFrame contains only columns of selected dtypes + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights', + ... 
columns=['AvgTicketPrice', 'Dest', 'Cancelled', 'timestamp', 'dayOfWeek']) + >>> df.dtypes + AvgTicketPrice float64 + Dest object + Cancelled bool + timestamp datetime64[ns] + dayOfWeek int64 + dtype: object + >>> df = df.select_dtypes(include=[np.number, 'datetime']) + >>> df.dtypes + AvgTicketPrice float64 + timestamp datetime64[ns] + dayOfWeek int64 + dtype: object """ empty_df = self._empty_pd_df() @@ -779,8 +815,20 @@ class DataFrame(NDFrame): Returns ------- shape: tuple - 0 - number of rows - 1 - number of columns + + 0. number of rows + 1. number of columns + + Notes + ----- + - number of rows ``len(df)`` queries Elasticsearch + - number of columns ``len(df.columns)`` is cached. If mappings are updated, DataFrame must be updated. + + Examples + -------- + >>> df = ed.read_es('localhost', 'ecommerce') + >>> df.shape + (4675, 45) """ num_rows = len(self) num_columns = len(self.columns) @@ -891,9 +939,11 @@ class DataFrame(NDFrame): Examples -------- - >>> df = ed.DataFrame('localhost', 'flights') - >>> df = df.query('FlightDelayMin > 60') - >>> df.info() + >>> df = ed.read_es('localhost', 'flights') + >>> df.shape + (13059, 27) + >>> df.query('FlightDelayMin > 60').shape + (2730, 27) """ if isinstance(expr, BooleanFilter): return DataFrame( diff --git a/eland/operations.py b/eland/operations.py index 33faba0..aa1aea5 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -539,10 +539,6 @@ class Operations: task = ('iloc', (index, columns)) self._tasks.append(task) - def squeeze(self, axis): - task = ('squeeze', axis) - self._tasks.append(task) - def index_count(self, query_compiler, field): # field is the index field so count values query_params, post_processing = self._resolve_tasks() @@ -660,8 +656,6 @@ class Operations: if column_indexer is None: column_indexer = slice(None) df = df.iloc[index_indexer, column_indexer] - elif action[0] == 'squeeze': - df = df.squeeze(axis=action[1]) # columns could be in here (and we ignore it) return df diff --git 
a/eland/query_compiler.py b/eland/query_compiler.py index e2cc5a5..ba6423a 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -369,13 +369,6 @@ class ElandQueryCompiler: return result - def squeeze(self, axis=None): - result = self.copy() - - result._operations.squeeze(axis) - - return result - def view(self, index=None, columns=None): result = self.copy() diff --git a/eland/tests/dataframe/test_datetime_pytest.py b/eland/tests/dataframe/test_datetime_pytest.py index ae7fe8a..77dc2b8 100644 --- a/eland/tests/dataframe/test_datetime_pytest.py +++ b/eland/tests/dataframe/test_datetime_pytest.py @@ -37,7 +37,7 @@ class TestDataFrameDateTime(TestData): # Now create index index_name = 'eland_test_generate_es_mappings' - ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) + ed_df = ed.pandas_to_eland(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) ed_df_head = ed_df.head() assert_pandas_eland_frame_equal(df, ed_df_head) diff --git a/eland/tests/dataframe/test_describe_pytest.py b/eland/tests/dataframe/test_describe_pytest.py index 8cf96b7..af24e66 100644 --- a/eland/tests/dataframe/test_describe_pytest.py +++ b/eland/tests/dataframe/test_describe_pytest.py @@ -14,11 +14,11 @@ class TestDataFrameDescribe(TestData): pd_describe = pd_flights.describe() ed_describe = ed_flights.describe() - assert_almost_equal(pd_describe[['AvgTicketPrice']], - ed_describe[['AvgTicketPrice']], + assert_almost_equal(pd_describe.drop(['25%','50%','75%'], axis='index'), + ed_describe.drop(['25%','50%','75%'], axis='index'), check_less_precise=True) - # TODO - this fails for all fields now as ES aggregations are approximate + # TODO - this fails for percentile fields as ES aggregations are approximate # if ES percentile agg uses # "hdr": { # "number_of_significant_value_digits": 3 diff --git a/eland/tests/dataframe/test_dtypes_pytest.py b/eland/tests/dataframe/test_dtypes_pytest.py index 2266283..2db1734 100644 --- 
a/eland/tests/dataframe/test_dtypes_pytest.py +++ b/eland/tests/dataframe/test_dtypes_pytest.py @@ -1,8 +1,11 @@ # File called _pytest for PyCharm compatability +import numpy as np + from pandas.util.testing import assert_series_equal from eland.tests.common import TestData +from eland.tests.common import assert_pandas_eland_frame_equal class TestDataFrameDtypes(TestData): @@ -12,3 +15,12 @@ class TestDataFrameDtypes(TestData): pd_flights = self.pd_flights() assert_series_equal(pd_flights.dtypes, ed_flights.dtypes) + + def test_flights_select_dtypes(self): + ed_flights = self.ed_flights_small() + pd_flights = self.pd_flights_small() + + assert_pandas_eland_frame_equal( + pd_flights.select_dtypes(include=np.number), + ed_flights.select_dtypes(include=np.number) + ) diff --git a/eland/tests/dataframe/test_query_pytest.py b/eland/tests/dataframe/test_query_pytest.py index 25cff71..0dcbc5f 100644 --- a/eland/tests/dataframe/test_query_pytest.py +++ b/eland/tests/dataframe/test_query_pytest.py @@ -19,7 +19,7 @@ class TestDataFrameQuery(TestData): # Now create index index_name = 'eland_test_query' - ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) + ed_df = ed.pandas_to_eland(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) assert_pandas_eland_frame_equal(pd_df, ed_df) diff --git a/eland/tests/dataframe/test_repr_pytest.py b/eland/tests/dataframe/test_repr_pytest.py index 456ae49..8210af0 100644 --- a/eland/tests/dataframe/test_repr_pytest.py +++ b/eland/tests/dataframe/test_repr_pytest.py @@ -1,5 +1,7 @@ # File called _pytest for PyCharm compatability +import pytest + from eland.tests.common import TestData @@ -12,8 +14,9 @@ class TestDataFrameRepr(TestData): ed_head_101 = ed_flights.head(101) pd_head_101 = pd_flights.head(101) - # This sets max_rows=60 by default - ed_head_101_str = ed_head_101.to_string() + # This sets max_rows=60 by default (but throws userwarning) + with pytest.warns(UserWarning): 
+ ed_head_101_str = ed_head_101.to_string() pd_head_101_str = pd_head_101.to_string(max_rows=60) assert pd_head_101_str == ed_head_101_str diff --git a/eland/tests/dataframe/test_utils_pytest.py b/eland/tests/dataframe/test_utils_pytest.py index 021f32e..992dcc7 100644 --- a/eland/tests/dataframe/test_utils_pytest.py +++ b/eland/tests/dataframe/test_utils_pytest.py @@ -36,7 +36,7 @@ class TestDataFrameUtils(TestData): # Now create index index_name = 'eland_test_generate_es_mappings' - ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) + ed_df = ed.pandas_to_eland(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) ed_df_head = ed_df.head() assert_pandas_eland_frame_equal(df, ed_df_head) diff --git a/eland/tests/plotting/test_dataframe_hist_pytest.py b/eland/tests/plotting/test_dataframe_hist_pytest.py index 471b89e..7ee5eb8 100644 --- a/eland/tests/plotting/test_dataframe_hist_pytest.py +++ b/eland/tests/plotting/test_dataframe_hist_pytest.py @@ -1,5 +1,7 @@ # File called _pytest for PyCharm compatability +import pytest + from matplotlib.testing.decorators import check_figures_equal from eland.tests.common import TestData @@ -12,8 +14,12 @@ def test_plot_hist(fig_test, fig_ref): pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']] ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']] - pd_ax = fig_ref.subplots() - pd_flights.hist(ax=pd_ax) + # This throws a userwarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222) + with pytest.warns(UserWarning): + pd_ax = fig_ref.subplots() + pd_flights.hist(ax=pd_ax) - ed_ax = fig_test.subplots() - ed_flights.hist(ax=ed_ax) + # This throws a userwarning 
(https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222) + with pytest.warns(UserWarning): + ed_ax = fig_test.subplots() + ed_flights.hist(ax=ed_ax) diff --git a/eland/utils.py b/eland/utils.py index e55e348..463e8d4 100644 --- a/eland/utils.py +++ b/eland/utils.py @@ -26,13 +26,13 @@ def read_es(es_params, index_pattern): See Also -------- - eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame - eland.ed_to_pd: Create a pandas.Dataframe from eland.DataFrame + eland.pandas_to_eland: Create an eland.Dataframe from pandas.DataFrame + eland.eland_to_pandas: Create a pandas.Dataframe from eland.DataFrame """ return DataFrame(client=es_params, index_pattern=index_pattern) -def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False, - geo_points=None): +def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False, + geo_points=None): """ Append a pandas DataFrame to an Elasticsearch index. Mainly used in testing. 
@@ -66,11 +66,11 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000 See Also -------- eland.read_es: Create an eland.Dataframe from an Elasticsearch index - eland.ed_to_pd: Create a pandas.Dataframe from eland.DataFrame + eland.eland_to_pandas: Create a pandas.Dataframe from eland.DataFrame """ client = Client(es_params) - mapping = Mappings._generate_es_mappings(df, geo_points) + mapping = Mappings._generate_es_mappings(pd_df, geo_points) # If table exists, check if_exists parameter if client.index_exists(index=destination_index): @@ -92,7 +92,7 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000 # Now add data actions = [] n = 0 - for row in df.iterrows(): + for row in pd_df.iterrows(): # Use index as _id id = row[0] @@ -118,7 +118,7 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000 return ed_df -def ed_to_pd(ed_df): +def eland_to_pandas(ed_df): """ Convert an eland.Dataframe to a pandas.DataFrame @@ -138,7 +138,7 @@ def ed_to_pd(ed_df): See Also -------- eland.read_es: Create an eland.Dataframe from an Elasticsearch index - eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame + eland.pandas_to_eland: Create an eland.Dataframe from pandas.DataFrame """ return ed_df._to_pandas()