Merge pull request #48 from stevedodson/master

Improve to_string/to_html/__repr__/_repr_html_ tests
This commit is contained in:
stevedodson 2019-11-19 08:22:36 +00:00 committed by GitHub
commit be4055093b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 186 additions and 41 deletions

View File

@ -4,7 +4,7 @@ import numpy as np
import pandas as pd
import eland as ed
# Fix console sizxe for consistent test results
# Fix console size for consistent test results
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 5)
pd.set_option('display.width', 100)

View File

@ -19,6 +19,15 @@ from eland import NDFrame
from eland import Series
from eland.filter import BooleanFilter, ScriptFilter
# Default number of rows displayed (different to pandas where ALL could be displayed)
DEFAULT_NUM_ROWS_DISPLAYED = 60
def docstring_parameter(*sub):
def dec(obj):
obj.__doc__ = obj.__doc__.format(*sub)
return obj
return dec
class DataFrame(NDFrame):
"""
@ -275,7 +284,7 @@ class DataFrame(NDFrame):
def _repr_html_(self):
"""
From pandas
From pandas - this is called by notebooks
"""
if self._info_repr():
buf = StringIO("")
@ -288,10 +297,15 @@ class DataFrame(NDFrame):
if pd.get_option("display.notebook_repr_html"):
max_rows = pd.get_option("display.max_rows")
max_cols = pd.get_option("display.max_columns")
min_rows = pd.get_option("display.min_rows")
show_dimensions = pd.get_option("display.show_dimensions")
if len(self) > max_rows:
max_rows = min_rows
return self.to_html(max_rows=max_rows, max_cols=max_cols,
show_dimensions=show_dimensions, notebook=True)
show_dimensions=show_dimensions, notebook=True,
bold_rows=False) # set for consistency with pandas output
else:
return None
@ -532,6 +546,7 @@ class DataFrame(NDFrame):
fmt.buffer_put_lines(buf, lines)
@docstring_parameter(DEFAULT_NUM_ROWS_DISPLAYED)
def to_html(self, buf=None, columns=None, col_space=None, header=True,
index=True, na_rep='NaN', formatters=None, float_format=None,
sparsify=None, index_names=True, justify=None, max_rows=None,
@ -541,15 +556,29 @@ class DataFrame(NDFrame):
"""
Render a Elasticsearch data as an HTML table.
Follows pandas implementation except when ``max_rows=None``. In this scenario, we set ``max_rows={0}`` to avoid
accidentally dumping an entire index. This can be overridden by explicitly setting ``max_rows``.
See Also
--------
:pandas_api_docs:`to_html` for argument details.
"""
if max_rows is None:
# In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this
# by limiting rows by default.
num_rows = len(self) # avoid multiple calls
if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED:
if max_rows is None:
max_rows = num_rows
else:
max_rows = min(num_rows, max_rows)
elif max_rows is None:
warnings.warn("DataFrame.to_string called without max_rows set "
"- this will return entire index results. "
"Setting max_rows=60, overwrite if different behaviour is required.")
max_rows = 60
"Setting max_rows={default}"
" overwrite if different behaviour is required."
.format(default=DEFAULT_NUM_ROWS_DISPLAYED),
UserWarning)
max_rows = DEFAULT_NUM_ROWS_DISPLAYED
# Create a slightly bigger dataframe than display
df = self._build_repr_df(max_rows + 1, max_cols)
@ -569,13 +598,16 @@ class DataFrame(NDFrame):
# Our fake dataframe has incorrect number of rows (max_rows*2+1) - write out
# the correct number of rows
if show_dimensions:
_buf.write("\n<p>{nrows} rows x {ncols} columns</p>"
# TODO - this results in different output to pandas
# TODO - the 'x' character is different and this gets added after the </div>
_buf.write("\n<p>{nrows} rows x {ncols} columns</p>"
.format(nrows=len(self.index), ncols=len(self.columns)))
if buf is None:
result = _buf.getvalue()
return result
@docstring_parameter(DEFAULT_NUM_ROWS_DISPLAYED)
def to_string(self, buf=None, columns=None, col_space=None, header=True,
index=True, na_rep='NaN', formatters=None, float_format=None,
sparsify=None, index_names=True, justify=None,
@ -584,17 +616,29 @@ class DataFrame(NDFrame):
"""
Render a DataFrame to a console-friendly tabular output.
Follows pandas implementation except we set max_rows default to avoid careless extraction of entire index.
Follows pandas implementation except when ``max_rows=None``. In this scenario, we set ``max_rows={0}`` to avoid
accidentally dumping an entire index. This can be overridden by explicitly setting ``max_rows``.
See Also
--------
:pandas_api_docs:`to_string` for argument details.
"""
if max_rows is None:
# In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this
# by limiting rows by default.
num_rows = len(self) # avoid multiple calls
if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED:
if max_rows is None:
max_rows = num_rows
else:
max_rows = min(num_rows, max_rows)
elif max_rows is None:
warnings.warn("DataFrame.to_string called without max_rows set "
"- this will return entire index results. "
"Setting max_rows=60, overwrite if different behaviour is required.")
max_rows = 60
"- this will return entire index results. "
"Setting max_rows={default}"
" overwrite if different behaviour is required."
.format(default=DEFAULT_NUM_ROWS_DISPLAYED),
UserWarning)
max_rows = DEFAULT_NUM_ROWS_DISPLAYED
# Create a slightly bigger dataframe than display
df = self._build_repr_df(max_rows + 1, max_cols)

View File

@ -2,57 +2,158 @@
import pytest
import pandas as pd
from eland.tests.common import TestData
from eland.dataframe import DEFAULT_NUM_ROWS_DISPLAYED
class TestDataFrameRepr(TestData):
def test_head_101_to_string(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
@classmethod
def setup_class(cls):
# conftest.py changes this default - restore to original setting
pd.set_option('display.max_rows', 60)
ed_head_101 = ed_flights.head(101)
pd_head_101 = pd_flights.head(101)
"""
to_string
"""
def test_num_rows_to_string(self):
# check setup works
assert pd.get_option('display.max_rows') == 60
# This sets max_rows=60 by default (but throws userwarning)
# Test eland.DataFrame.to_string vs pandas.DataFrame.to_string
# In pandas calling 'to_string' without max_rows set, will dump ALL rows
# Test n-1, n, n+1 for edge cases
self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED-1)
self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED)
with pytest.warns(UserWarning):
ed_head_101_str = ed_head_101.to_string()
pd_head_101_str = pd_head_101.to_string(max_rows=60)
# UserWarning displayed by eland here (compare to pandas with max_rows set)
self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED+1, None, DEFAULT_NUM_ROWS_DISPLAYED)
assert pd_head_101_str == ed_head_101_str
# Test for where max_rows lt or gt num_rows
self.num_rows_to_string(10, 5, 5)
self.num_rows_to_string(100, 200, 200)
def test_head_11_to_string2(self):
def num_rows_to_string(self, rows, max_rows_eland=None, max_rows_pandas=None):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
ed_head_11 = ed_flights.head(11)
pd_head_11 = pd_flights.head(11)
ed_head = ed_flights.head(rows)
pd_head = pd_flights.head(rows)
ed_head_11_str = ed_head_11.to_string(max_rows=60)
pd_head_11_str = pd_head_11.to_string(max_rows=60)
ed_head_str = ed_head.to_string(max_rows=max_rows_eland)
pd_head_str = pd_head.to_string(max_rows=max_rows_pandas)
assert pd_head_11_str == ed_head_11_str
#print(ed_head_str)
#print(pd_head_str)
def test_less_than_max_rows_to_string(self):
assert pd_head_str == ed_head_str
"""
repr
"""
def test_num_rows_repr(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
ed_less_than_max = ed_flights[ed_flights['AvgTicketPrice']>1190]
pd_less_than_max = pd_flights[pd_flights['AvgTicketPrice']>1190]
self.num_rows_repr(pd.get_option('display.max_rows')-1, pd.get_option('display.max_rows')-1)
self.num_rows_repr(pd.get_option('display.max_rows'), pd.get_option('display.max_rows'))
self.num_rows_repr(pd.get_option('display.max_rows')+1, pd.get_option('display.min_rows'))
ed_less_than_max_str = ed_less_than_max.to_string()
pd_less_than_max_str = pd_less_than_max.to_string()
def num_rows_repr(self, rows, num_rows_printed):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
assert pd_less_than_max_str == ed_less_than_max_str
ed_head = ed_flights.head(rows)
pd_head = pd_flights.head(rows)
def test_repr(self):
ed_ecommerce = self.ed_ecommerce()
pd_ecommerce = self.pd_ecommerce()
ed_head_str = repr(ed_head)
pd_head_str = repr(pd_head)
ed_head_18 = ed_ecommerce.head(18)
pd_head_18 = pd_ecommerce.head(18)
if num_rows_printed < rows:
# add 1 for ellipsis
num_rows_printed = num_rows_printed + 1
ed_head_18_repr = repr(ed_head_18)
pd_head_18_repr = repr(pd_head_18)
# number of rows is num_rows_printed + 3 (header, summary)
assert (num_rows_printed+3) == len(ed_head_str.splitlines())
assert ed_head_18_repr == pd_head_18_repr
assert pd_head_str == ed_head_str
"""
to_html
"""
def test_num_rows_to_html(self):
# check setup works
assert pd.get_option('display.max_rows') == 60
# Test eland.DataFrame.to_string vs pandas.DataFrame.to_string
# In pandas calling 'to_string' without max_rows set, will dump ALL rows
# Test n-1, n, n+1 for edge cases
self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED-1)
self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED)
with pytest.warns(UserWarning):
# UserWarning displayed by eland here
self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED+1, None, DEFAULT_NUM_ROWS_DISPLAYED)
# Test for where max_rows lt or gt num_rows
self.num_rows_to_html(10, 5, 5)
self.num_rows_to_html(100, 200, 200)
def num_rows_to_html(self, rows, max_rows_eland=None, max_rows_pandas=None):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
ed_head = ed_flights.head(rows)
pd_head = pd_flights.head(rows)
ed_head_str = ed_head.to_html(max_rows=max_rows_eland)
pd_head_str = pd_head.to_html(max_rows=max_rows_pandas)
#print(ed_head_str)
#print(pd_head_str)
assert pd_head_str == ed_head_str
"""
_repr_html_
"""
def test_num_rows_repr_html(self):
# check setup works
assert pd.get_option('display.max_rows') == 60
show_dimensions = pd.get_option('display.show_dimensions')
# TODO - there is a bug in 'show_dimensions' as it gets added after the last </div>
# For now test without this
pd.set_option('display.show_dimensions', False)
# Test eland.DataFrame.to_string vs pandas.DataFrame.to_string
# In pandas calling 'to_string' without max_rows set, will dump ALL rows
# Test n-1, n, n+1 for edge cases
self.num_rows_repr_html(pd.get_option('display.max_rows')-1)
self.num_rows_repr_html(pd.get_option('display.max_rows'))
self.num_rows_repr_html(pd.get_option('display.max_rows')+1, pd.get_option('display.max_rows'))
# Restore default
pd.set_option('display.show_dimensions', show_dimensions)
def num_rows_repr_html(self, rows, max_rows=None):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
ed_head = ed_flights.head(rows)
pd_head = pd_flights.head(rows)
ed_head_str = ed_head._repr_html_()
pd_head_str = pd_head._repr_html_()
#print(ed_head_str)
#print(pd_head_str)
assert pd_head_str == ed_head_str