Update supported Pandas to v1.0

Daniel Mesejo-León 2020-03-27 18:21:15 +01:00 committed by GitHub
parent 0c1d7222fe
commit e27a508c59
11 changed files with 249 additions and 155 deletions

.gitignore
View File

@@ -43,6 +43,7 @@ ipython_config.py
# Environments
.env
.venv
.nox
env/
venv/
ENV/

View File

@@ -1,5 +1,5 @@
elasticsearch>=7.0.5
pandas==0.25.3
pandas>=1
matplotlib
pytest>=5.2.1
git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master

File diff suppressed because one or more lines are too long

View File

@@ -22,7 +22,7 @@ from pandas.core.common import apply_if_callable, is_bool_indexer
from pandas.core.computation.eval import eval
from pandas.core.dtypes.common import is_list_like
from pandas.core.indexing import check_bool_indexer
from pandas.io.common import _expand_user, _stringify_path
from pandas.io.common import _expand_user, stringify_path
from pandas.io.formats import console
from pandas.io.formats import format as fmt
from pandas.io.formats.printing import pprint_thing
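
pandas 1.0 promotes the path helper to a public name: `_stringify_path` becomes `stringify_path` in `pandas.io.common`, which is all the import change above (and the matching call-site hunks further down) does. As a point of reference only, and not something this commit adds, a fallback import that tolerates both versions could look like:

# Illustrative fallback import (not in this commit): accept both
# pandas 0.25.x (_stringify_path) and pandas 1.0+ (stringify_path).
try:
    from pandas.io.common import stringify_path  # pandas >= 1.0
except ImportError:  # pragma: no cover - older pandas
    from pandas.io.common import _stringify_path as stringify_path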
@@ -249,12 +249,19 @@ class DataFrame(NDFrame):
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest'])
>>> df.tail()
Origin Dest
13054 Pisa International Airport Xi'an Xianyang International Airport
13055 Winnipeg / James Armstrong Richardson Internat... Zurich Airport
13056 Licenciado Benito Juarez International Airport Ukrainka Air Base
13057 Itami Airport Ministro Pistarini International Airport
13058 Adelaide International Airport Washington Dulles International Airport
Origin \\
13054 Pisa International Airport
13055 Winnipeg / James Armstrong Richardson International Airport
13056 Licenciado Benito Juarez International Airport
13057 Itami Airport
13058 Adelaide International Airport
<BLANKLINE>
Dest
13054 Xi'an Xianyang International Airport
13055 Zurich Airport
13056 Ukrainka Air Base
13057 Ministro Pistarini International Airport
13058 Washington Dulles International Airport
<BLANKLINE>
[5 rows x 2 columns]
"""
@@ -602,8 +609,10 @@ class DataFrame(NDFrame):
<class 'eland.dataframe.DataFrame'>
Index: 4675 entries, 0 to 4674
Data columns (total 2 columns):
customer_first_name 4675 non-null object
geoip.city_name 4094 non-null object
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customer_first_name 4675 non-null object
1 geoip.city_name 4094 non-null object
dtypes: object(2)
memory usage: ...
"""
@@ -618,6 +627,7 @@ class DataFrame(NDFrame):
return
cols = self.columns
col_count = len(self.columns)
# hack
if max_cols is None:
@@ -637,30 +647,74 @@ class DataFrame(NDFrame):
def _verbose_repr():
lines.append(f"Data columns (total {len(self.columns)} columns):")
space = max(len(pprint_thing(k)) for k in self.columns) + 4
id_head = " # "
column_head = "Column"
col_space = 2
max_col = max(len(pprint_thing(k)) for k in cols)
len_column = len(pprint_thing(column_head))
space = max(max_col, len_column) + col_space
max_id = len(pprint_thing(col_count))
len_id = len(pprint_thing(id_head))
space_num = max(max_id, len_id) + col_space
counts = None
tmpl = "{count}{dtype}"
header = _put_str(id_head, space_num) + _put_str(column_head, space)
if show_counts:
counts = self.count()
if len(cols) != len(counts): # pragma: no cover
raise AssertionError(
f"Columns must equal counts "
f"({len(cols):d} != {len(counts):d})"
"Columns must equal counts "
"({cols:d} != {counts:d})".format(
cols=len(cols), counts=len(counts)
)
)
count_header = "Non-Null Count"
len_count = len(count_header)
non_null = " non-null"
max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null)
space_count = max(len_count, max_count) + col_space
count_temp = "{count}" + non_null
else:
count_header = ""
space_count = len(count_header)
len_count = space_count
count_temp = "{count}"
dtype_header = "Dtype"
len_dtype = len(dtype_header)
max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes)
space_dtype = max(len_dtype, max_dtypes)
header += _put_str(count_header, space_count) + _put_str(
dtype_header, space_dtype
)
lines.append(header)
lines.append(
_put_str("-" * len_id, space_num)
+ _put_str("-" * len_column, space)
+ _put_str("-" * len_count, space_count)
+ _put_str("-" * len_dtype, space_dtype)
)
tmpl = "{count} non-null {dtype}"
dtypes = self.dtypes
for i, col in enumerate(self.columns):
dtype = dtypes.iloc[i]
col = pprint_thing(col)
line_no = _put_str(" {num}".format(num=i), space_num)
count = ""
if show_counts:
count = counts.iloc[i]
lines.append(
_put_str(col, space) + tmpl.format(count=count, dtype=dtype)
line_no
+ _put_str(col, space)
+ _put_str(count_temp.format(count=count), space_count)
+ _put_str(dtype, space_dtype)
)
def _non_verbose_repr():
@@ -769,7 +823,7 @@ class DataFrame(NDFrame):
df = self._build_repr(max_rows + 1)
if buf is not None:
_buf = _expand_user(_stringify_path(buf))
_buf = _expand_user(stringify_path(buf))
else:
_buf = StringIO()
@@ -866,7 +920,7 @@ class DataFrame(NDFrame):
df = self._build_repr(max_rows + 1)
if buf is not None:
_buf = _expand_user(_stringify_path(buf))
_buf = _expand_user(stringify_path(buf))
else:
_buf = StringIO()
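
The rewritten `_verbose_repr` above reproduces pandas 1.0's four-column `info()` layout (`#`, `Column`, `Non-Null Count`, `Dtype`) and leans on a `_put_str` padding helper for every cell. For context only, a sketch of that helper, assuming it behaves like the one in pandas 1.0:

def _put_str(s, space):
    # Truncate to the column width, then left-justify, e.g.
    # _put_str("Column", 8) -> "Column  ".
    return str(s)[:space].ljust(space)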

View File

@@ -238,16 +238,16 @@ class NDFrame(ABC):
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.min()
AvgTicketPrice 100.020531
Cancelled 0.000000
DistanceKilometers 0.000000
DistanceMiles 0.000000
FlightDelay 0.000000
FlightDelayMin 0.000000
FlightTimeHour 0.000000
FlightTimeMin 0.000000
dayOfWeek 0.000000
dtype: float64
AvgTicketPrice 100.021
Cancelled False
DistanceKilometers 0
DistanceMiles 0
FlightDelay False
FlightDelayMin 0
FlightTimeHour 0
FlightTimeMin 0
dayOfWeek 0
dtype: object
"""
return self._query_compiler.min(numeric_only=numeric_only)
@@ -270,16 +270,16 @@ class NDFrame(ABC):
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.max()
AvgTicketPrice 1199.729004
Cancelled 1.000000
DistanceKilometers 19881.482422
DistanceMiles 12353.780273
FlightDelay 1.000000
FlightDelayMin 360.000000
FlightTimeHour 31.715034
FlightTimeMin 1902.901978
dayOfWeek 6.000000
dtype: float64
AvgTicketPrice 1199.73
Cancelled True
DistanceKilometers 19881.5
DistanceMiles 12353.8
FlightDelay True
FlightDelayMin 360
FlightTimeHour 31.715
FlightTimeMin 1902.9
dayOfWeek 6
dtype: object
"""
return self._query_compiler.max(numeric_only=numeric_only)
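
The new doctest output above reflects a behavioural change rather than just pandas 1.0 formatting: with `keep_original_dtype=True` (see the `Operations` hunks below), boolean and integer fields come back in their original domain, so the result mixes dtypes and pandas stores it in an object Series instead of float64. A small illustration, not taken from the commit:

import numpy as np
import pandas as pd

# Mixing booleans, integers and floats prevents a common numeric dtype,
# so pandas falls back to an object-dtype Series.
results = {
    "AvgTicketPrice": np.float64(1199.73),
    "Cancelled": np.bool_(True),
    "dayOfWeek": np.int64(6),
}
print(pd.Series(results).dtype)  # object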

View File

@@ -126,10 +126,14 @@ class Operations:
return self._metric_aggs(query_compiler, "sum", numeric_only=numeric_only)
def max(self, query_compiler, numeric_only=True):
return self._metric_aggs(query_compiler, "max", numeric_only=numeric_only)
return self._metric_aggs(
query_compiler, "max", numeric_only=numeric_only, keep_original_dtype=True
)
def min(self, query_compiler, numeric_only=True):
return self._metric_aggs(query_compiler, "min", numeric_only=numeric_only)
return self._metric_aggs(
query_compiler, "min", numeric_only=numeric_only, keep_original_dtype=True
)
def nunique(self, query_compiler):
return self._metric_aggs(
@@ -142,13 +146,22 @@ class Operations:
def hist(self, query_compiler, bins):
return self._hist_aggs(query_compiler, bins)
def _metric_aggs(self, query_compiler, func, field_types=None, numeric_only=None):
def _metric_aggs(
self,
query_compiler,
func,
field_types=None,
numeric_only=None,
keep_original_dtype=False,
):
"""
Parameters
----------
field_types: str, default None
If `aggregatable`, use only field_names whose fields in Elasticsearch are aggregatable.
If `None`, use only numeric fields.
keep_original_dtype : bool, default False
If `True`, the output values keep the same domain as the input values, e.g. booleans stay booleans.
Returns
-------
@@ -235,6 +248,10 @@ class Operations:
results[field] = elasticsearch_date_to_pandas_date(
response["aggregations"][field]["value_as_string"], date_format
)
elif keep_original_dtype:
results[field] = pd_dtype.type(
response["aggregations"][field]["value"]
)
else:
results[field] = response["aggregations"][field]["value"]
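
Elasticsearch reports the min/max of a boolean field as a number (0.0/1.0), so the new `keep_original_dtype` branch routes the raw aggregation value through the column's numpy dtype to restore its original domain. A quick illustration of what `pd_dtype.type(...)` does, not part of the commit:

import numpy as np

# A numpy dtype's `type` attribute is the scalar constructor for that dtype,
# so it converts the raw float returned by Elasticsearch back into the
# field's domain.
print(np.dtype("bool").type(1.0))     # True (numpy.bool_)
print(np.dtype("int64").type(360.0))  # 360 (numpy.int64)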

View File

@@ -35,7 +35,7 @@ from io import StringIO
import numpy as np
import pandas as pd
from pandas.io.common import _expand_user, _stringify_path
from pandas.io.common import _expand_user, stringify_path
import eland.plotting
from eland import NDFrame
@@ -365,7 +365,7 @@ class Series(NDFrame):
temp_series = self._build_repr(max_rows + 1)
if buf is not None:
_buf = _expand_user(_stringify_path(buf))
_buf = _expand_user(stringify_path(buf))
else:
_buf = StringIO()

View File

@@ -20,7 +20,7 @@ def blacken(session):
def lint(session):
session.install("black", "flake8")
session.run("black", "--check", "--target-version=py36", *SOURCE_FILES)
session.run("flake8", "--ignore=E501,W503,E402,E712", *SOURCE_FILES)
session.run("flake8", "--ignore=W291,E501,W503,E402,E712", *SOURCE_FILES)
@nox.session(python=["3.6", "3.7", "3.8"])

View File

@@ -1,5 +1,5 @@
elasticsearch>=7.0.5
pandas==0.25.3
pandas>=1
matplotlib
pytest>=5.2.1
nbval

View File

@@ -1,3 +1,3 @@
elasticsearch>=7.0.5
pandas==0.25.3
pandas>=1
matplotlib

View File

@@ -187,6 +187,6 @@ setup(
classifiers=CLASSIFIERS,
keywords="elastic eland pandas python",
packages=find_packages(include=["eland", "eland.*"]),
install_requires=["elasticsearch>=7.0.5, <8", "pandas==0.25.3", "matplotlib"],
install_requires=["elasticsearch>=7.0.5, <8", "pandas>=1", "matplotlib"],
python_requires=">=3.6",
)
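
The dependency pin moves from the exact `pandas==0.25.3` to the open-ended `pandas>=1`, kept consistent across `setup.py` and the requirements files above. As an aside, and not something the commit ships, one way to sanity-check that an environment satisfies the relaxed constraints:

import pkg_resources

# Raises DistributionNotFound or VersionConflict if the installed packages
# do not satisfy the updated constraints.
for requirement in ("elasticsearch>=7.0.5,<8", "pandas>=1", "matplotlib"):
    pkg_resources.require(requirement)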