Merge pull request #48 from stevedodson/master

Improve to_string/to_html/__repr__/_repr_html_ tests
2025-07-24 00:00:39 +08:00 · 2019-11-19 08:22:36 +00:00 · 2019-11-19 08:22:36 +00:00 · be4055093b
commit be4055093b
parent c93d07981d fb2a1fae7b
3 changed files with 186 additions and 41 deletions
--- a/eland/conftest.py
+++ b/eland/conftest.py
@ -4,7 +4,7 @@ import numpy as np
 import pandas as pd
 import eland as ed
-# Fix console sizxe for consistent test results
+# Fix console size for consistent test results
 pd.set_option('display.max_rows', 10)
 pd.set_option('display.max_columns', 5)
 pd.set_option('display.width', 100)
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@ -19,6 +19,15 @@ from eland import NDFrame
 from eland import Series
 from eland.filter import BooleanFilter, ScriptFilter
 # Default number of rows displayed (different to pandas where ALL could be displayed)
 DEFAULT_NUM_ROWS_DISPLAYED = 60
 def docstring_parameter(*sub):
    def dec(obj):
        obj.__doc__ = obj.__doc__.format(*sub)
        return obj
    return dec
 class DataFrame(NDFrame):
    """
@ -275,7 +284,7 @@ class DataFrame(NDFrame):
    def _repr_html_(self):
        """
-        From pandas
+        From pandas - this is called by notebooks
        """
        if self._info_repr():
            buf = StringIO("")
@ -288,10 +297,15 @@ class DataFrame(NDFrame):
        if pd.get_option("display.notebook_repr_html"):
            max_rows = pd.get_option("display.max_rows")
            max_cols = pd.get_option("display.max_columns")
            min_rows = pd.get_option("display.min_rows")
            show_dimensions = pd.get_option("display.show_dimensions")
            if len(self) > max_rows:
                max_rows = min_rows
            return self.to_html(max_rows=max_rows, max_cols=max_cols,
-                                show_dimensions=show_dimensions, notebook=True)
+                                show_dimensions=show_dimensions, notebook=True,
                                bold_rows=False) # set for consistency with pandas output
        else:
            return None
@ -532,6 +546,7 @@ class DataFrame(NDFrame):
        fmt.buffer_put_lines(buf, lines)
    @docstring_parameter(DEFAULT_NUM_ROWS_DISPLAYED)
    def to_html(self, buf=None, columns=None, col_space=None, header=True,
                index=True, na_rep='NaN', formatters=None, float_format=None,
                sparsify=None, index_names=True, justify=None, max_rows=None,
@ -541,15 +556,29 @@ class DataFrame(NDFrame):
        """
        Render Elasticsearch data as an HTML table.
        Follows pandas implementation except when ``max_rows=None``. In this scenario, we set ``max_rows={0}`` to avoid
        accidentally dumping an entire index. This can be overridden by explicitly setting ``max_rows``.
        See Also
        --------
        :pandas_api_docs:`to_html` for argument details.
        """
-        if max_rows is None:
+        # In pandas calling 'to_html' without max_rows set, will dump ALL rows - we avoid this
        # by limiting rows by default.
        num_rows = len(self) # avoid multiple calls
        if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED:
            if max_rows is None:
                max_rows = num_rows
            else:
                max_rows = min(num_rows, max_rows)
        elif max_rows is None:
            warnings.warn("DataFrame.to_string called without max_rows set "
                          "- this will return entire index results. "
-                          "Setting max_rows=60, overwrite if different behaviour is required.")
+                          "Setting max_rows={default}"
-            max_rows = 60
+                          " overwrite if different behaviour is required."
                          .format(default=DEFAULT_NUM_ROWS_DISPLAYED),
                          UserWarning)
            max_rows = DEFAULT_NUM_ROWS_DISPLAYED
        # Create a slightly bigger dataframe than display
        df = self._build_repr_df(max_rows + 1, max_cols)
@ -569,13 +598,16 @@ class DataFrame(NDFrame):
        # Our fake dataframe has incorrect number of rows (max_rows*2+1) - write out
        # the correct number of rows
        if show_dimensions:
-            _buf.write("\n<p>{nrows} rows x {ncols} columns</p>"
+                # TODO - this results in different output to pandas
                # TODO - pandas writes the multiplication sign (U+00D7) rather than 'x', and this gets added after the </div>
                _buf.write("\n<p>{nrows} rows x {ncols} columns</p>"
                       .format(nrows=len(self.index), ncols=len(self.columns)))
        if buf is None:
            result = _buf.getvalue()
            return result
    @docstring_parameter(DEFAULT_NUM_ROWS_DISPLAYED)
    def to_string(self, buf=None, columns=None, col_space=None, header=True,
                  index=True, na_rep='NaN', formatters=None, float_format=None,
                  sparsify=None, index_names=True, justify=None,
@ -584,17 +616,29 @@ class DataFrame(NDFrame):
        """
        Render a DataFrame to a console-friendly tabular output.
-        Follows pandas implementation except we set max_rows default to avoid careless extraction of entire index.
+        Follows pandas implementation except when ``max_rows=None``. In this scenario, we set ``max_rows={0}`` to avoid
        accidentally dumping an entire index. This can be overridden by explicitly setting ``max_rows``.
        See Also
        --------
        :pandas_api_docs:`to_string` for argument details.
        """
-        if max_rows is None:
+        # In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this
        # by limiting rows by default.
        num_rows = len(self) # avoid multiple calls
        if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED:
            if max_rows is None:
                max_rows = num_rows
            else:
                max_rows = min(num_rows, max_rows)
        elif max_rows is None:
            warnings.warn("DataFrame.to_string called without max_rows set "
-                          "- this will return entire index results. "
+                      "- this will return entire index results. "
-                          "Setting max_rows=60, overwrite if different behaviour is required.")
+                      "Setting max_rows={default}"
-            max_rows = 60
+                      " overwrite if different behaviour is required."
                          .format(default=DEFAULT_NUM_ROWS_DISPLAYED),
                          UserWarning)
            max_rows = DEFAULT_NUM_ROWS_DISPLAYED
        # Create a slightly bigger dataframe than display
        df = self._build_repr_df(max_rows + 1, max_cols)
--- a/eland/tests/dataframe/test_repr_pytest.py
+++ b/eland/tests/dataframe/test_repr_pytest.py
@ -2,57 +2,158 @@
 import pytest
 import pandas as pd
 from eland.tests.common import TestData
 from eland.dataframe import DEFAULT_NUM_ROWS_DISPLAYED
 class TestDataFrameRepr(TestData):
-    def test_head_101_to_string(self):
+    @classmethod
-        ed_flights = self.ed_flights()
+    def setup_class(cls):
-        pd_flights = self.pd_flights()
+        # conftest.py changes this default - restore to original setting
        pd.set_option('display.max_rows', 60)
-        ed_head_101 = ed_flights.head(101)
+    """
-        pd_head_101 = pd_flights.head(101)
+    to_string
    """
    def test_num_rows_to_string(self):
        # check setup works
        assert pd.get_option('display.max_rows') == 60
-        # This sets max_rows=60 by default (but throws userwarning)
+        # Test eland.DataFrame.to_string vs pandas.DataFrame.to_string
        # In pandas calling 'to_string' without max_rows set, will dump ALL rows
        # Test n-1, n, n+1 for edge cases
        self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED-1)
        self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED)
        with pytest.warns(UserWarning):
-            ed_head_101_str = ed_head_101.to_string()
+            # UserWarning displayed by eland here (compare to pandas with max_rows set)
-        pd_head_101_str = pd_head_101.to_string(max_rows=60)
+            self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED+1, None, DEFAULT_NUM_ROWS_DISPLAYED)
-        assert pd_head_101_str == ed_head_101_str
+        # Test for where max_rows lt or gt num_rows
        self.num_rows_to_string(10, 5, 5)
        self.num_rows_to_string(100, 200, 200)
-    def test_head_11_to_string2(self):
+    def num_rows_to_string(self, rows, max_rows_eland=None, max_rows_pandas=None):
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
-        ed_head_11 = ed_flights.head(11)
+        ed_head = ed_flights.head(rows)
-        pd_head_11 = pd_flights.head(11)
+        pd_head = pd_flights.head(rows)
-        ed_head_11_str = ed_head_11.to_string(max_rows=60)
+        ed_head_str = ed_head.to_string(max_rows=max_rows_eland)
-        pd_head_11_str = pd_head_11.to_string(max_rows=60)
+        pd_head_str = pd_head.to_string(max_rows=max_rows_pandas)
-        assert pd_head_11_str == ed_head_11_str
+        #print(ed_head_str)
        #print(pd_head_str)
-    def test_less_than_max_rows_to_string(self):
+        assert pd_head_str == ed_head_str
    """
    repr
    """
    def test_num_rows_repr(self):
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
-        ed_less_than_max = ed_flights[ed_flights['AvgTicketPrice']>1190]
+        self.num_rows_repr(pd.get_option('display.max_rows')-1, pd.get_option('display.max_rows')-1)
-        pd_less_than_max = pd_flights[pd_flights['AvgTicketPrice']>1190]
+        self.num_rows_repr(pd.get_option('display.max_rows'), pd.get_option('display.max_rows'))
        self.num_rows_repr(pd.get_option('display.max_rows')+1, pd.get_option('display.min_rows'))
-        ed_less_than_max_str = ed_less_than_max.to_string()
+    def num_rows_repr(self, rows, num_rows_printed):
-        pd_less_than_max_str = pd_less_than_max.to_string()
+        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
-        assert pd_less_than_max_str == ed_less_than_max_str
+        ed_head = ed_flights.head(rows)
        pd_head = pd_flights.head(rows)
-    def test_repr(self):
+        ed_head_str = repr(ed_head)
-        ed_ecommerce = self.ed_ecommerce()
+        pd_head_str = repr(pd_head)
        pd_ecommerce = self.pd_ecommerce()
-        ed_head_18 = ed_ecommerce.head(18)
+        if num_rows_printed < rows:
-        pd_head_18 = pd_ecommerce.head(18)
+            # add 1 for ellipsis
            num_rows_printed = num_rows_printed + 1
-        ed_head_18_repr = repr(ed_head_18)
+        # number of rows is num_rows_printed + 3 (header, summary)
-        pd_head_18_repr = repr(pd_head_18)
+        assert (num_rows_printed+3) == len(ed_head_str.splitlines())
-        assert ed_head_18_repr == pd_head_18_repr
+        assert pd_head_str == ed_head_str
    """
    to_html 
    """
    def test_num_rows_to_html(self):
        # check setup works
        assert pd.get_option('display.max_rows') == 60
        # Test eland.DataFrame.to_html vs pandas.DataFrame.to_html
        # In pandas calling 'to_html' without max_rows set, will dump ALL rows
        # Test n-1, n, n+1 for edge cases
        self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED-1)
        self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED)
        with pytest.warns(UserWarning):
            # UserWarning displayed by eland here
            self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED+1, None, DEFAULT_NUM_ROWS_DISPLAYED)
        # Test for where max_rows lt or gt num_rows
        self.num_rows_to_html(10, 5, 5)
        self.num_rows_to_html(100, 200, 200)
    def num_rows_to_html(self, rows, max_rows_eland=None, max_rows_pandas=None):
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        ed_head = ed_flights.head(rows)
        pd_head = pd_flights.head(rows)
        ed_head_str = ed_head.to_html(max_rows=max_rows_eland)
        pd_head_str = pd_head.to_html(max_rows=max_rows_pandas)
        #print(ed_head_str)
        #print(pd_head_str)
        assert pd_head_str == ed_head_str
    """
    _repr_html_
    """
    def test_num_rows_repr_html(self):
        # check setup works
        assert pd.get_option('display.max_rows') == 60
        show_dimensions = pd.get_option('display.show_dimensions')
        # TODO - there is a bug in 'show_dimensions' as it gets added after the last </div>
        # For now test without this
        pd.set_option('display.show_dimensions', False)
        # Test eland.DataFrame._repr_html_ vs pandas.DataFrame._repr_html_
        # _repr_html_ truncates output using the pandas 'display.max_rows' / 'display.min_rows' options
        # Test n-1, n, n+1 for edge cases
        self.num_rows_repr_html(pd.get_option('display.max_rows')-1)
        self.num_rows_repr_html(pd.get_option('display.max_rows'))
        self.num_rows_repr_html(pd.get_option('display.max_rows')+1, pd.get_option('display.max_rows'))
        # Restore default
        pd.set_option('display.show_dimensions', show_dimensions)
    def num_rows_repr_html(self, rows, max_rows=None):
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        ed_head = ed_flights.head(rows)
        pd_head = pd_flights.head(rows)
        ed_head_str = ed_head._repr_html_()
        pd_head_str = pd_head._repr_html_()
        #print(ed_head_str)
        #print(pd_head_str)
        assert pd_head_str == ed_head_str