From d92ed94ef00bca996ddb6e4ba01a8b1d992c4f55 Mon Sep 17 00:00:00 2001 From: Stephen Dodson Date: Mon, 18 Nov 2019 12:55:23 +0000 Subject: [PATCH 1/3] Improve to_string/to_html/__repr__/_repr_html_ tests Added more rigorious tests for string representation and fixing issue with to_html. --- eland/conftest.py | 2 +- eland/dataframe.py | 44 +++++-- eland/tests/dataframe/test_repr_pytest.py | 153 +++++++++++++++++----- 3 files changed, 159 insertions(+), 40 deletions(-) diff --git a/eland/conftest.py b/eland/conftest.py index ce62d3b..98cebe5 100644 --- a/eland/conftest.py +++ b/eland/conftest.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd import eland as ed -# Fix console sizxe for consistent test results +# Fix console size for consistent test results pd.set_option('display.max_rows', 10) pd.set_option('display.max_columns', 5) pd.set_option('display.width', 100) diff --git a/eland/dataframe.py b/eland/dataframe.py index f9a18b4..fd7035f 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -19,6 +19,9 @@ from eland import NDFrame from eland import Series from eland.filter import BooleanFilter, ScriptFilter +# Default number of rows displayed (different to pandas where ALL could be displayed) +DEFAULT_NUM_ROWS_DISPLAYED = 60 + class DataFrame(NDFrame): """ @@ -288,10 +291,15 @@ class DataFrame(NDFrame): if pd.get_option("display.notebook_repr_html"): max_rows = pd.get_option("display.max_rows") max_cols = pd.get_option("display.max_columns") + min_rows = pd.get_option("display.min_rows") show_dimensions = pd.get_option("display.show_dimensions") + if len(self) > max_rows: + max_rows = min_rows + return self.to_html(max_rows=max_rows, max_cols=max_cols, - show_dimensions=show_dimensions, notebook=True) + show_dimensions=show_dimensions, notebook=True, + bold_rows=False) # set for consistency with pandas output else: return None @@ -545,11 +553,19 @@ class DataFrame(NDFrame): -------- :pandas_api_docs:`to_html` for argument details. """ - if max_rows is None: + # In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this + # by limiting rows by default. + num_rows = len(self) # avoid multiple calls + if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED: + max_rows = num_rows + elif max_rows is None: warnings.warn("DataFrame.to_string called without max_rows set " "- this will return entire index results. " - "Setting max_rows=60, overwrite if different behaviour is required.") - max_rows = 60 + "Setting max_rows={default}" + " overwrite if different behaviour is required." + .format(default=DEFAULT_NUM_ROWS_DISPLAYED), + UserWarning) + max_rows = DEFAULT_NUM_ROWS_DISPLAYED # Create a slightly bigger dataframe than display df = self._build_repr_df(max_rows + 1, max_cols) @@ -569,7 +585,9 @@ class DataFrame(NDFrame): # Our fake dataframe has incorrect number of rows (max_rows*2+1) - write out # the correct number of rows if show_dimensions: - _buf.write("\n

{nrows} rows x {ncols} columns

" + # TODO - this results in different output to pandas + # TODO - the 'x' character is different and this gets added after the + _buf.write("\n

{nrows} rows x {ncols} columns

" .format(nrows=len(self.index), ncols=len(self.columns))) if buf is None: @@ -590,11 +608,19 @@ class DataFrame(NDFrame): -------- :pandas_api_docs:`to_string` for argument details. """ - if max_rows is None: + # In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this + # by limiting rows by default. + num_rows = len(self) # avoid multiple calls + if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED: + max_rows = num_rows + elif max_rows is None: warnings.warn("DataFrame.to_string called without max_rows set " - "- this will return entire index results. " - "Setting max_rows=60, overwrite if different behaviour is required.") - max_rows = 60 + "- this will return entire index results. " + "Setting max_rows={default}" + " overwrite if different behaviour is required." + .format(default=DEFAULT_NUM_ROWS_DISPLAYED), + UserWarning) + max_rows = DEFAULT_NUM_ROWS_DISPLAYED # Create a slightly bigger dataframe than display df = self._build_repr_df(max_rows + 1, max_cols) diff --git a/eland/tests/dataframe/test_repr_pytest.py b/eland/tests/dataframe/test_repr_pytest.py index 27ce50c..622b88a 100644 --- a/eland/tests/dataframe/test_repr_pytest.py +++ b/eland/tests/dataframe/test_repr_pytest.py @@ -2,57 +2,150 @@ import pytest +import pandas as pd + from eland.tests.common import TestData +from eland.dataframe import DEFAULT_NUM_ROWS_DISPLAYED + class TestDataFrameRepr(TestData): - def test_head_101_to_string(self): - ed_flights = self.ed_flights() - pd_flights = self.pd_flights() + @classmethod + def setup_class(cls): + # conftest.py changes this default - restore to original setting + pd.set_option('display.max_rows', 60) - ed_head_101 = ed_flights.head(101) - pd_head_101 = pd_flights.head(101) + """ + to_string + """ + def test_num_rows_to_string(self): + # check setup works + assert pd.get_option('display.max_rows') == 60 - # This sets max_rows=60 by default (but throws userwarning) + # Test eland.DataFrame.to_string vs pandas.DataFrame.to_string + # In pandas calling 'to_string' without max_rows set, will dump ALL rows + + # Test n-1, n, n+1 for edge cases + self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED-1) + self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED) with pytest.warns(UserWarning): - ed_head_101_str = ed_head_101.to_string() - pd_head_101_str = pd_head_101.to_string(max_rows=60) + # UserWarning displayed by eland here + self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED+1, DEFAULT_NUM_ROWS_DISPLAYED) - assert pd_head_101_str == ed_head_101_str - - def test_head_11_to_string2(self): + def num_rows_to_string(self, rows, max_rows=None): ed_flights = self.ed_flights() pd_flights = self.pd_flights() - ed_head_11 = ed_flights.head(11) - pd_head_11 = pd_flights.head(11) + ed_head = ed_flights.head(rows) + pd_head = pd_flights.head(rows) - ed_head_11_str = ed_head_11.to_string(max_rows=60) - pd_head_11_str = pd_head_11.to_string(max_rows=60) + ed_head_str = ed_head.to_string() + pd_head_str = pd_head.to_string(max_rows=max_rows) - assert pd_head_11_str == ed_head_11_str + #print(ed_head_str) + #print(pd_head_str) - def test_less_than_max_rows_to_string(self): + assert pd_head_str == ed_head_str + + """ + repr + """ + def test_num_rows_repr(self): ed_flights = self.ed_flights() pd_flights = self.pd_flights() - ed_less_than_max = ed_flights[ed_flights['AvgTicketPrice']>1190] - pd_less_than_max = pd_flights[pd_flights['AvgTicketPrice']>1190] + self.num_rows_repr(pd.get_option('display.max_rows')-1, pd.get_option('display.max_rows')-1) + self.num_rows_repr(pd.get_option('display.max_rows'), pd.get_option('display.max_rows')) + self.num_rows_repr(pd.get_option('display.max_rows')+1, pd.get_option('display.min_rows')) - ed_less_than_max_str = ed_less_than_max.to_string() - pd_less_than_max_str = pd_less_than_max.to_string() + def num_rows_repr(self, rows, num_rows_printed): + ed_flights = self.ed_flights() + pd_flights = self.pd_flights() - assert pd_less_than_max_str == ed_less_than_max_str + ed_head = ed_flights.head(rows) + pd_head = pd_flights.head(rows) - def test_repr(self): - ed_ecommerce = self.ed_ecommerce() - pd_ecommerce = self.pd_ecommerce() + ed_head_str = repr(ed_head) + pd_head_str = repr(pd_head) - ed_head_18 = ed_ecommerce.head(18) - pd_head_18 = pd_ecommerce.head(18) + if num_rows_printed < rows: + # add 1 for ellipsis + num_rows_printed = num_rows_printed + 1 - ed_head_18_repr = repr(ed_head_18) - pd_head_18_repr = repr(pd_head_18) + # number of rows is num_rows_printed + 3 (header, summary) + assert (num_rows_printed+3) == len(ed_head_str.splitlines()) - assert ed_head_18_repr == pd_head_18_repr + assert pd_head_str == ed_head_str + + """ + to_html + """ + def test_num_rows_to_html(self): + # check setup works + assert pd.get_option('display.max_rows') == 60 + + # Test eland.DataFrame.to_string vs pandas.DataFrame.to_string + # In pandas calling 'to_string' without max_rows set, will dump ALL rows + + # Test n-1, n, n+1 for edge cases + self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED-1) + self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED) + with pytest.warns(UserWarning): + # UserWarning displayed by eland here + self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED+1, DEFAULT_NUM_ROWS_DISPLAYED) + + def num_rows_to_html(self, rows, max_rows=None): + ed_flights = self.ed_flights() + pd_flights = self.pd_flights() + + ed_head = ed_flights.head(rows) + pd_head = pd_flights.head(rows) + + ed_head_str = ed_head.to_html() + pd_head_str = pd_head.to_html(max_rows=max_rows) + + print(ed_head_str) + print(pd_head_str) + + assert pd_head_str == ed_head_str + + + """ + _repr_html_ + """ + def test_num_rows_repr_html(self): + # check setup works + assert pd.get_option('display.max_rows') == 60 + + show_dimensions = pd.get_option('display.show_dimensions') + + # TODO - there is a bug in 'show_dimensions' as it gets added after the last + # For now test without this + pd.set_option('display.show_dimensions', False) + + # Test eland.DataFrame.to_string vs pandas.DataFrame.to_string + # In pandas calling 'to_string' without max_rows set, will dump ALL rows + + # Test n-1, n, n+1 for edge cases + self.num_rows_repr_html(pd.get_option('display.max_rows')-1) + self.num_rows_repr_html(pd.get_option('display.max_rows')) + self.num_rows_repr_html(pd.get_option('display.max_rows')+1, pd.get_option('display.max_rows')) + + # Restore default + pd.set_option('display.show_dimensions', show_dimensions) + + def num_rows_repr_html(self, rows, max_rows=None): + ed_flights = self.ed_flights() + pd_flights = self.pd_flights() + + ed_head = ed_flights.head(rows) + pd_head = pd_flights.head(rows) + + ed_head_str = ed_head._repr_html_() + pd_head_str = pd_head._repr_html_() + + #print(ed_head_str) + #print(pd_head_str) + + assert pd_head_str == ed_head_str From 327f43d912f33259bf767e10f5e6b5085250606d Mon Sep 17 00:00:00 2001 From: Stephen Dodson Date: Mon, 18 Nov 2019 14:47:35 +0000 Subject: [PATCH 2/3] Fixing issue in to_html/to_string if max_rows is set --- eland/dataframe.py | 12 ++++++--- eland/tests/dataframe/test_repr_pytest.py | 30 ++++++++++++++--------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/eland/dataframe.py b/eland/dataframe.py index fd7035f..81a165d 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -278,7 +278,7 @@ class DataFrame(NDFrame): def _repr_html_(self): """ - From pandas + From pandas - this is called by notebooks """ if self._info_repr(): buf = StringIO("") @@ -557,7 +557,10 @@ class DataFrame(NDFrame): # by limiting rows by default. num_rows = len(self) # avoid multiple calls if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED: - max_rows = num_rows + if max_rows is None: + max_rows = num_rows + else: + max_rows = min(num_rows, max_rows) elif max_rows is None: warnings.warn("DataFrame.to_string called without max_rows set " "- this will return entire index results. " @@ -612,7 +615,10 @@ class DataFrame(NDFrame): # by limiting rows by default. num_rows = len(self) # avoid multiple calls if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED: - max_rows = num_rows + if max_rows is None: + max_rows = num_rows + else: + max_rows = min(num_rows, max_rows) elif max_rows is None: warnings.warn("DataFrame.to_string called without max_rows set " "- this will return entire index results. " diff --git a/eland/tests/dataframe/test_repr_pytest.py b/eland/tests/dataframe/test_repr_pytest.py index 622b88a..bee457a 100644 --- a/eland/tests/dataframe/test_repr_pytest.py +++ b/eland/tests/dataframe/test_repr_pytest.py @@ -30,18 +30,22 @@ class TestDataFrameRepr(TestData): self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED-1) self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED) with pytest.warns(UserWarning): - # UserWarning displayed by eland here - self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED+1, DEFAULT_NUM_ROWS_DISPLAYED) + # UserWarning displayed by eland here (compare to pandas with max_rows set) + self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED+1, None, DEFAULT_NUM_ROWS_DISPLAYED) - def num_rows_to_string(self, rows, max_rows=None): + # Test for where max_rows lt or gt num_rows + self.num_rows_to_string(10, 5, 5) + self.num_rows_to_string(100, 200, 200) + + def num_rows_to_string(self, rows, max_rows_eland=None, max_rows_pandas=None): ed_flights = self.ed_flights() pd_flights = self.pd_flights() ed_head = ed_flights.head(rows) pd_head = pd_flights.head(rows) - ed_head_str = ed_head.to_string() - pd_head_str = pd_head.to_string(max_rows=max_rows) + ed_head_str = ed_head.to_string(max_rows=max_rows_eland) + pd_head_str = pd_head.to_string(max_rows=max_rows_pandas) #print(ed_head_str) #print(pd_head_str) @@ -93,20 +97,24 @@ class TestDataFrameRepr(TestData): self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED) with pytest.warns(UserWarning): # UserWarning displayed by eland here - self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED+1, DEFAULT_NUM_ROWS_DISPLAYED) + self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED+1, None, DEFAULT_NUM_ROWS_DISPLAYED) - def num_rows_to_html(self, rows, max_rows=None): + # Test for where max_rows lt or gt num_rows + self.num_rows_to_html(10, 5, 5) + self.num_rows_to_html(100, 200, 200) + + def num_rows_to_html(self, rows, max_rows_eland=None, max_rows_pandas=None): ed_flights = self.ed_flights() pd_flights = self.pd_flights() ed_head = ed_flights.head(rows) pd_head = pd_flights.head(rows) - ed_head_str = ed_head.to_html() - pd_head_str = pd_head.to_html(max_rows=max_rows) + ed_head_str = ed_head.to_html(max_rows=max_rows_eland) + pd_head_str = pd_head.to_html(max_rows=max_rows_pandas) - print(ed_head_str) - print(pd_head_str) + #print(ed_head_str) + #print(pd_head_str) assert pd_head_str == ed_head_str From fb2a1fae7b060da86d53e5896b5719977b6dc4cf Mon Sep 17 00:00:00 2001 From: Stephen Dodson Date: Mon, 18 Nov 2019 15:27:43 +0000 Subject: [PATCH 3/3] Updated to_string/to_html docs --- eland/dataframe.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/eland/dataframe.py b/eland/dataframe.py index 81a165d..6532138 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -22,6 +22,12 @@ from eland.filter import BooleanFilter, ScriptFilter # Default number of rows displayed (different to pandas where ALL could be displayed) DEFAULT_NUM_ROWS_DISPLAYED = 60 +def docstring_parameter(*sub): + def dec(obj): + obj.__doc__ = obj.__doc__.format(*sub) + return obj + return dec + class DataFrame(NDFrame): """ @@ -540,6 +546,7 @@ class DataFrame(NDFrame): fmt.buffer_put_lines(buf, lines) + @docstring_parameter(DEFAULT_NUM_ROWS_DISPLAYED) def to_html(self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, justify=None, max_rows=None, @@ -549,6 +556,9 @@ class DataFrame(NDFrame): """ Render a Elasticsearch data as an HTML table. + Follows pandas implementation except when ``max_rows=None``. In this scenario, we set ``max_rows={0}`` to avoid + accidentally dumping an entire index. This can be overridden by explicitly setting ``max_rows``. + See Also -------- :pandas_api_docs:`to_html` for argument details. @@ -597,6 +607,7 @@ class DataFrame(NDFrame): result = _buf.getvalue() return result + @docstring_parameter(DEFAULT_NUM_ROWS_DISPLAYED) def to_string(self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, justify=None, @@ -605,7 +616,8 @@ class DataFrame(NDFrame): """ Render a DataFrame to a console-friendly tabular output. - Follows pandas implementation except we set max_rows default to avoid careless extraction of entire index. + Follows pandas implementation except when ``max_rows=None``. In this scenario, we set ``max_rows={0}`` to avoid + accidentally dumping an entire index. This can be overridden by explicitly setting ``max_rows``. See Also --------