Refactor df.info() for better readability

This commit is contained in:
P. Sai Vinay 2021-09-29 01:42:29 +05:30 committed by GitHub
parent bc201e22dd
commit dabb327b8b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -154,7 +154,8 @@ class DataFrame(NDFrame):
_query_compiler=_query_compiler,
)
def _get_columns(self) -> pd.Index:
@property
def columns(self) -> pd.Index:
"""
The column labels of the DataFrame.
@ -182,8 +183,6 @@ class DataFrame(NDFrame):
"""
return self._query_compiler.columns
columns = property(_get_columns)
@property
def empty(self) -> bool:
"""Determines if the DataFrame is empty.
@ -808,7 +807,12 @@ class DataFrame(NDFrame):
return f"{name}: {len(self)} entries{index_summary}"
def info(
self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None
self,
verbose: Optional[bool] = None,
buf: Optional[StringIO] = None,
max_cols: Optional[int] = None,
memory_usage: Optional[bool] = None,
show_counts: Optional[bool] = None,
) -> None:
"""
Print a concise summary of a DataFrame.
@ -844,42 +848,41 @@ class DataFrame(NDFrame):
lines = [str(type(self)), self._index_summary()]
if len(self.columns) == 0:
columns: pd.Index = self.columns
number_of_columns: int = len(columns)
if number_of_columns == 0:
lines.append(f"Empty {type(self).__name__}")
fmt.buffer_put_lines(buf, lines)
return
cols = self.columns
col_count = len(self.columns)
# hack
if max_cols is None:
max_cols = pd.get_option("display.max_info_columns", len(self.columns) + 1)
max_cols = pd.get_option("display.max_info_columns", number_of_columns + 1)
max_rows = pd.get_option("display.max_info_rows", len(self) + 1)
if null_counts is None:
show_counts = (len(self.columns) <= max_cols) and (len(self) < max_rows)
else:
show_counts = null_counts
exceeds_info_cols = len(self.columns) > max_cols
if show_counts is None:
show_counts = (number_of_columns <= max_cols) and (len(self) < max_rows)
exceeds_info_cols = number_of_columns > max_cols
# From pandas.DataFrame
def _put_str(s, space) -> str:
return f"{s}"[:space].ljust(space)
def _verbose_repr() -> None:
lines.append(f"Data columns (total {len(self.columns)} columns):")
def _verbose_repr(number_of_columns: int) -> None:
lines.append(f"Data columns (total {number_of_columns} columns):")
id_head = " # "
column_head = "Column"
col_space = 2
max_col = max(len(pprint_thing(k)) for k in cols)
max_col = max(len(pprint_thing(k)) for k in columns)
len_column = len(pprint_thing(column_head))
space = max(max_col, len_column) + col_space
max_id = len(pprint_thing(col_count))
max_id = len(pprint_thing(number_of_columns))
len_id = len(pprint_thing(id_head))
space_num = max(max_id, len_id) + col_space
counts = None
@ -887,9 +890,9 @@ class DataFrame(NDFrame):
header = _put_str(id_head, space_num) + _put_str(column_head, space)
if show_counts:
counts = self.count()
if len(cols) != len(counts): # pragma: no cover
if number_of_columns != len(counts): # pragma: no cover
raise AssertionError(
f"Columns must equal counts ({len(cols):d} != {len(counts):d})"
f"Columns must equal counts ({number_of_columns:d} != {len(counts):d})"
)
count_header = "Non-Null Count"
len_count = len(count_header)
@ -920,7 +923,7 @@ class DataFrame(NDFrame):
)
dtypes = self.dtypes
for i, col in enumerate(self.columns):
for i, col in enumerate(columns):
dtype = dtypes.iloc[i]
col = pprint_thing(col)
@ -938,7 +941,7 @@ class DataFrame(NDFrame):
)
def _non_verbose_repr() -> None:
lines.append(self.columns._summary(name="Columns"))
lines.append(columns._summary(name="Columns"))
def _sizeof_fmt(num: float, size_qualifier: str) -> str:
# returns size in human readable format
@ -949,14 +952,13 @@ class DataFrame(NDFrame):
return f"{num:3.3f}{size_qualifier} PB"
if verbose:
_verbose_repr()
_verbose_repr(number_of_columns)
elif verbose is False: # specifically set to False, not necessarily None
_non_verbose_repr()
else:
if exceeds_info_cols:
_non_verbose_repr()
else:
_verbose_repr()
_non_verbose_repr() if exceeds_info_cols else _verbose_repr(
number_of_columns
)
# pandas 0.25.1 uses get_dtype_counts() here. This
# returns a Series with strings as the index NOT dtypes.