diff --git a/eland/conftest.py b/eland/conftest.py index ce62d3b..98cebe5 100644 --- a/eland/conftest.py +++ b/eland/conftest.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd import eland as ed -# Fix console sizxe for consistent test results +# Fix console size for consistent test results pd.set_option('display.max_rows', 10) pd.set_option('display.max_columns', 5) pd.set_option('display.width', 100) diff --git a/eland/dataframe.py b/eland/dataframe.py index f9a18b4..6532138 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -19,6 +19,15 @@ from eland import NDFrame from eland import Series from eland.filter import BooleanFilter, ScriptFilter +# Default number of rows displayed (different to pandas where ALL could be displayed) +DEFAULT_NUM_ROWS_DISPLAYED = 60 + +def docstring_parameter(*sub): + def dec(obj): + obj.__doc__ = obj.__doc__.format(*sub) + return obj + return dec + class DataFrame(NDFrame): """ @@ -275,7 +284,7 @@ class DataFrame(NDFrame): def _repr_html_(self): """ - From pandas + From pandas - this is called by notebooks """ if self._info_repr(): buf = StringIO("") @@ -288,10 +297,15 @@ class DataFrame(NDFrame): if pd.get_option("display.notebook_repr_html"): max_rows = pd.get_option("display.max_rows") max_cols = pd.get_option("display.max_columns") + min_rows = pd.get_option("display.min_rows") show_dimensions = pd.get_option("display.show_dimensions") + if len(self) > max_rows: + max_rows = min_rows + return self.to_html(max_rows=max_rows, max_cols=max_cols, - show_dimensions=show_dimensions, notebook=True) + show_dimensions=show_dimensions, notebook=True, + bold_rows=False) # set for consistency with pandas output else: return None @@ -532,6 +546,7 @@ class DataFrame(NDFrame): fmt.buffer_put_lines(buf, lines) + @docstring_parameter(DEFAULT_NUM_ROWS_DISPLAYED) def to_html(self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, justify=None, max_rows=None, @@ -541,15 +556,29 @@ class DataFrame(NDFrame): """ Render a Elasticsearch data as an HTML table. + Follows pandas implementation except when ``max_rows=None``. In this scenario, we set ``max_rows={0}`` to avoid + accidentally dumping an entire index. This can be overridden by explicitly setting ``max_rows``. + See Also -------- :pandas_api_docs:`to_html` for argument details. """ - if max_rows is None: + # In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this + # by limiting rows by default. + num_rows = len(self) # avoid multiple calls + if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED: + if max_rows is None: + max_rows = num_rows + else: + max_rows = min(num_rows, max_rows) + elif max_rows is None: warnings.warn("DataFrame.to_string called without max_rows set " "- this will return entire index results. " - "Setting max_rows=60, overwrite if different behaviour is required.") - max_rows = 60 + "Setting max_rows={default}" + " overwrite if different behaviour is required." + .format(default=DEFAULT_NUM_ROWS_DISPLAYED), + UserWarning) + max_rows = DEFAULT_NUM_ROWS_DISPLAYED # Create a slightly bigger dataframe than display df = self._build_repr_df(max_rows + 1, max_cols) @@ -569,13 +598,16 @@ class DataFrame(NDFrame): # Our fake dataframe has incorrect number of rows (max_rows*2+1) - write out # the correct number of rows if show_dimensions: - _buf.write("\n

{nrows} rows x {ncols} columns

" + # TODO - this results in different output to pandas + # TODO - the 'x' character is different and this gets added after the + _buf.write("\n

{nrows} rows x {ncols} columns

" .format(nrows=len(self.index), ncols=len(self.columns))) if buf is None: result = _buf.getvalue() return result + @docstring_parameter(DEFAULT_NUM_ROWS_DISPLAYED) def to_string(self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, justify=None, @@ -584,17 +616,29 @@ class DataFrame(NDFrame): """ Render a DataFrame to a console-friendly tabular output. - Follows pandas implementation except we set max_rows default to avoid careless extraction of entire index. + Follows pandas implementation except when ``max_rows=None``. In this scenario, we set ``max_rows={0}`` to avoid + accidentally dumping an entire index. This can be overridden by explicitly setting ``max_rows``. See Also -------- :pandas_api_docs:`to_string` for argument details. """ - if max_rows is None: + # In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this + # by limiting rows by default. + num_rows = len(self) # avoid multiple calls + if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED: + if max_rows is None: + max_rows = num_rows + else: + max_rows = min(num_rows, max_rows) + elif max_rows is None: warnings.warn("DataFrame.to_string called without max_rows set " - "- this will return entire index results. " - "Setting max_rows=60, overwrite if different behaviour is required.") - max_rows = 60 + "- this will return entire index results. " + "Setting max_rows={default}" + " overwrite if different behaviour is required." + .format(default=DEFAULT_NUM_ROWS_DISPLAYED), + UserWarning) + max_rows = DEFAULT_NUM_ROWS_DISPLAYED # Create a slightly bigger dataframe than display df = self._build_repr_df(max_rows + 1, max_cols) diff --git a/eland/tests/dataframe/test_repr_pytest.py b/eland/tests/dataframe/test_repr_pytest.py index 27ce50c..bee457a 100644 --- a/eland/tests/dataframe/test_repr_pytest.py +++ b/eland/tests/dataframe/test_repr_pytest.py @@ -2,57 +2,158 @@ import pytest +import pandas as pd + from eland.tests.common import TestData +from eland.dataframe import DEFAULT_NUM_ROWS_DISPLAYED + class TestDataFrameRepr(TestData): - def test_head_101_to_string(self): - ed_flights = self.ed_flights() - pd_flights = self.pd_flights() + @classmethod + def setup_class(cls): + # conftest.py changes this default - restore to original setting + pd.set_option('display.max_rows', 60) - ed_head_101 = ed_flights.head(101) - pd_head_101 = pd_flights.head(101) + """ + to_string + """ + def test_num_rows_to_string(self): + # check setup works + assert pd.get_option('display.max_rows') == 60 - # This sets max_rows=60 by default (but throws userwarning) + # Test eland.DataFrame.to_string vs pandas.DataFrame.to_string + # In pandas calling 'to_string' without max_rows set, will dump ALL rows + + # Test n-1, n, n+1 for edge cases + self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED-1) + self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED) with pytest.warns(UserWarning): - ed_head_101_str = ed_head_101.to_string() - pd_head_101_str = pd_head_101.to_string(max_rows=60) + # UserWarning displayed by eland here (compare to pandas with max_rows set) + self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED+1, None, DEFAULT_NUM_ROWS_DISPLAYED) - assert pd_head_101_str == ed_head_101_str + # Test for where max_rows lt or gt num_rows + self.num_rows_to_string(10, 5, 5) + self.num_rows_to_string(100, 200, 200) - def test_head_11_to_string2(self): + def num_rows_to_string(self, rows, max_rows_eland=None, max_rows_pandas=None): ed_flights = self.ed_flights() pd_flights = self.pd_flights() - ed_head_11 = ed_flights.head(11) - pd_head_11 = pd_flights.head(11) + ed_head = ed_flights.head(rows) + pd_head = pd_flights.head(rows) - ed_head_11_str = ed_head_11.to_string(max_rows=60) - pd_head_11_str = pd_head_11.to_string(max_rows=60) + ed_head_str = ed_head.to_string(max_rows=max_rows_eland) + pd_head_str = pd_head.to_string(max_rows=max_rows_pandas) - assert pd_head_11_str == ed_head_11_str + #print(ed_head_str) + #print(pd_head_str) - def test_less_than_max_rows_to_string(self): + assert pd_head_str == ed_head_str + + """ + repr + """ + def test_num_rows_repr(self): ed_flights = self.ed_flights() pd_flights = self.pd_flights() - ed_less_than_max = ed_flights[ed_flights['AvgTicketPrice']>1190] - pd_less_than_max = pd_flights[pd_flights['AvgTicketPrice']>1190] + self.num_rows_repr(pd.get_option('display.max_rows')-1, pd.get_option('display.max_rows')-1) + self.num_rows_repr(pd.get_option('display.max_rows'), pd.get_option('display.max_rows')) + self.num_rows_repr(pd.get_option('display.max_rows')+1, pd.get_option('display.min_rows')) - ed_less_than_max_str = ed_less_than_max.to_string() - pd_less_than_max_str = pd_less_than_max.to_string() + def num_rows_repr(self, rows, num_rows_printed): + ed_flights = self.ed_flights() + pd_flights = self.pd_flights() - assert pd_less_than_max_str == ed_less_than_max_str + ed_head = ed_flights.head(rows) + pd_head = pd_flights.head(rows) - def test_repr(self): - ed_ecommerce = self.ed_ecommerce() - pd_ecommerce = self.pd_ecommerce() + ed_head_str = repr(ed_head) + pd_head_str = repr(pd_head) - ed_head_18 = ed_ecommerce.head(18) - pd_head_18 = pd_ecommerce.head(18) + if num_rows_printed < rows: + # add 1 for ellipsis + num_rows_printed = num_rows_printed + 1 - ed_head_18_repr = repr(ed_head_18) - pd_head_18_repr = repr(pd_head_18) + # number of rows is num_rows_printed + 3 (header, summary) + assert (num_rows_printed+3) == len(ed_head_str.splitlines()) - assert ed_head_18_repr == pd_head_18_repr + assert pd_head_str == ed_head_str + + """ + to_html + """ + def test_num_rows_to_html(self): + # check setup works + assert pd.get_option('display.max_rows') == 60 + + # Test eland.DataFrame.to_string vs pandas.DataFrame.to_string + # In pandas calling 'to_string' without max_rows set, will dump ALL rows + + # Test n-1, n, n+1 for edge cases + self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED-1) + self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED) + with pytest.warns(UserWarning): + # UserWarning displayed by eland here + self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED+1, None, DEFAULT_NUM_ROWS_DISPLAYED) + + # Test for where max_rows lt or gt num_rows + self.num_rows_to_html(10, 5, 5) + self.num_rows_to_html(100, 200, 200) + + def num_rows_to_html(self, rows, max_rows_eland=None, max_rows_pandas=None): + ed_flights = self.ed_flights() + pd_flights = self.pd_flights() + + ed_head = ed_flights.head(rows) + pd_head = pd_flights.head(rows) + + ed_head_str = ed_head.to_html(max_rows=max_rows_eland) + pd_head_str = pd_head.to_html(max_rows=max_rows_pandas) + + #print(ed_head_str) + #print(pd_head_str) + + assert pd_head_str == ed_head_str + + + """ + _repr_html_ + """ + def test_num_rows_repr_html(self): + # check setup works + assert pd.get_option('display.max_rows') == 60 + + show_dimensions = pd.get_option('display.show_dimensions') + + # TODO - there is a bug in 'show_dimensions' as it gets added after the last + # For now test without this + pd.set_option('display.show_dimensions', False) + + # Test eland.DataFrame.to_string vs pandas.DataFrame.to_string + # In pandas calling 'to_string' without max_rows set, will dump ALL rows + + # Test n-1, n, n+1 for edge cases + self.num_rows_repr_html(pd.get_option('display.max_rows')-1) + self.num_rows_repr_html(pd.get_option('display.max_rows')) + self.num_rows_repr_html(pd.get_option('display.max_rows')+1, pd.get_option('display.max_rows')) + + # Restore default + pd.set_option('display.show_dimensions', show_dimensions) + + def num_rows_repr_html(self, rows, max_rows=None): + ed_flights = self.ed_flights() + pd_flights = self.pd_flights() + + ed_head = ed_flights.head(rows) + pd_head = pd_flights.head(rows) + + ed_head_str = ed_head._repr_html_() + pd_head_str = pd_head._repr_html_() + + #print(ed_head_str) + #print(pd_head_str) + + assert pd_head_str == ed_head_str