Update supported Pandas to v1.0

This commit is contained in:
Daniel Mesejo-León 2020-03-27 18:21:15 +01:00 committed by GitHub
parent 0c1d7222fe
commit e27a508c59
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 249 additions and 155 deletions

1
.gitignore vendored
View File

@ -43,6 +43,7 @@ ipython_config.py
# Environments # Environments
.env .env
.venv .venv
.nox
env/ env/
venv/ venv/
ENV/ ENV/

View File

@ -1,5 +1,5 @@
elasticsearch>=7.0.5 elasticsearch>=7.0.5
pandas==0.25.3 pandas>=1
matplotlib matplotlib
pytest>=5.2.1 pytest>=5.2.1
git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master

File diff suppressed because one or more lines are too long

View File

@ -22,7 +22,7 @@ from pandas.core.common import apply_if_callable, is_bool_indexer
from pandas.core.computation.eval import eval from pandas.core.computation.eval import eval
from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.common import is_list_like
from pandas.core.indexing import check_bool_indexer from pandas.core.indexing import check_bool_indexer
from pandas.io.common import _expand_user, _stringify_path from pandas.io.common import _expand_user, stringify_path
from pandas.io.formats import console from pandas.io.formats import console
from pandas.io.formats import format as fmt from pandas.io.formats import format as fmt
from pandas.io.formats.printing import pprint_thing from pandas.io.formats.printing import pprint_thing
@ -249,12 +249,19 @@ class DataFrame(NDFrame):
-------- --------
>>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest']) >>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest'])
>>> df.tail() >>> df.tail()
Origin Dest Origin \\
13054 Pisa International Airport Xi'an Xianyang International Airport 13054 Pisa International Airport
13055 Winnipeg / James Armstrong Richardson Internat... Zurich Airport 13055 Winnipeg / James Armstrong Richardson International Airport
13056 Licenciado Benito Juarez International Airport Ukrainka Air Base 13056 Licenciado Benito Juarez International Airport
13057 Itami Airport Ministro Pistarini International Airport 13057 Itami Airport
13058 Adelaide International Airport Washington Dulles International Airport 13058 Adelaide International Airport
<BLANKLINE>
Dest
13054 Xi'an Xianyang International Airport
13055 Zurich Airport
13056 Ukrainka Air Base
13057 Ministro Pistarini International Airport
13058 Washington Dulles International Airport
<BLANKLINE> <BLANKLINE>
[5 rows x 2 columns] [5 rows x 2 columns]
""" """
@ -602,8 +609,10 @@ class DataFrame(NDFrame):
<class 'eland.dataframe.DataFrame'> <class 'eland.dataframe.DataFrame'>
Index: 4675 entries, 0 to 4674 Index: 4675 entries, 0 to 4674
Data columns (total 2 columns): Data columns (total 2 columns):
customer_first_name 4675 non-null object # Column Non-Null Count Dtype
geoip.city_name 4094 non-null object --- ------ -------------- -----
0 customer_first_name 4675 non-null object
1 geoip.city_name 4094 non-null object
dtypes: object(2) dtypes: object(2)
memory usage: ... memory usage: ...
""" """
@ -618,6 +627,7 @@ class DataFrame(NDFrame):
return return
cols = self.columns cols = self.columns
col_count = len(self.columns)
# hack # hack
if max_cols is None: if max_cols is None:
@ -637,30 +647,74 @@ class DataFrame(NDFrame):
def _verbose_repr(): def _verbose_repr():
lines.append(f"Data columns (total {len(self.columns)} columns):") lines.append(f"Data columns (total {len(self.columns)} columns):")
space = max(len(pprint_thing(k)) for k in self.columns) + 4
id_head = " # "
column_head = "Column"
col_space = 2
max_col = max(len(pprint_thing(k)) for k in cols)
len_column = len(pprint_thing(column_head))
space = max(max_col, len_column) + col_space
max_id = len(pprint_thing(col_count))
len_id = len(pprint_thing(id_head))
space_num = max(max_id, len_id) + col_space
counts = None counts = None
tmpl = "{count}{dtype}" header = _put_str(id_head, space_num) + _put_str(column_head, space)
if show_counts: if show_counts:
counts = self.count() counts = self.count()
if len(cols) != len(counts): # pragma: no cover if len(cols) != len(counts): # pragma: no cover
raise AssertionError( raise AssertionError(
f"Columns must equal counts " "Columns must equal counts "
f"({len(cols):d} != {len(counts):d})" "({cols:d} != {counts:d})".format(
cols=len(cols), counts=len(counts)
)
) )
tmpl = "{count} non-null {dtype}" count_header = "Non-Null Count"
len_count = len(count_header)
non_null = " non-null"
max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null)
space_count = max(len_count, max_count) + col_space
count_temp = "{count}" + non_null
else:
count_header = ""
space_count = len(count_header)
len_count = space_count
count_temp = "{count}"
dtype_header = "Dtype"
len_dtype = len(dtype_header)
max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes)
space_dtype = max(len_dtype, max_dtypes)
header += _put_str(count_header, space_count) + _put_str(
dtype_header, space_dtype
)
lines.append(header)
lines.append(
_put_str("-" * len_id, space_num)
+ _put_str("-" * len_column, space)
+ _put_str("-" * len_count, space_count)
+ _put_str("-" * len_dtype, space_dtype)
)
dtypes = self.dtypes dtypes = self.dtypes
for i, col in enumerate(self.columns): for i, col in enumerate(self.columns):
dtype = dtypes.iloc[i] dtype = dtypes.iloc[i]
col = pprint_thing(col) col = pprint_thing(col)
line_no = _put_str(" {num}".format(num=i), space_num)
count = "" count = ""
if show_counts: if show_counts:
count = counts.iloc[i] count = counts.iloc[i]
lines.append( lines.append(
_put_str(col, space) + tmpl.format(count=count, dtype=dtype) line_no
+ _put_str(col, space)
+ _put_str(count_temp.format(count=count), space_count)
+ _put_str(dtype, space_dtype)
) )
def _non_verbose_repr(): def _non_verbose_repr():
@ -769,7 +823,7 @@ class DataFrame(NDFrame):
df = self._build_repr(max_rows + 1) df = self._build_repr(max_rows + 1)
if buf is not None: if buf is not None:
_buf = _expand_user(_stringify_path(buf)) _buf = _expand_user(stringify_path(buf))
else: else:
_buf = StringIO() _buf = StringIO()
@ -866,7 +920,7 @@ class DataFrame(NDFrame):
df = self._build_repr(max_rows + 1) df = self._build_repr(max_rows + 1)
if buf is not None: if buf is not None:
_buf = _expand_user(_stringify_path(buf)) _buf = _expand_user(stringify_path(buf))
else: else:
_buf = StringIO() _buf = StringIO()

View File

@ -238,16 +238,16 @@ class NDFrame(ABC):
-------- --------
>>> df = ed.DataFrame('localhost', 'flights') >>> df = ed.DataFrame('localhost', 'flights')
>>> df.min() >>> df.min()
AvgTicketPrice 100.020531 AvgTicketPrice 100.021
Cancelled 0.000000 Cancelled False
DistanceKilometers 0.000000 DistanceKilometers 0
DistanceMiles 0.000000 DistanceMiles 0
FlightDelay 0.000000 FlightDelay False
FlightDelayMin 0.000000 FlightDelayMin 0
FlightTimeHour 0.000000 FlightTimeHour 0
FlightTimeMin 0.000000 FlightTimeMin 0
dayOfWeek 0.000000 dayOfWeek 0
dtype: float64 dtype: object
""" """
return self._query_compiler.min(numeric_only=numeric_only) return self._query_compiler.min(numeric_only=numeric_only)
@ -270,16 +270,16 @@ class NDFrame(ABC):
-------- --------
>>> df = ed.DataFrame('localhost', 'flights') >>> df = ed.DataFrame('localhost', 'flights')
>>> df.max() >>> df.max()
AvgTicketPrice 1199.729004 AvgTicketPrice 1199.73
Cancelled 1.000000 Cancelled True
DistanceKilometers 19881.482422 DistanceKilometers 19881.5
DistanceMiles 12353.780273 DistanceMiles 12353.8
FlightDelay 1.000000 FlightDelay True
FlightDelayMin 360.000000 FlightDelayMin 360
FlightTimeHour 31.715034 FlightTimeHour 31.715
FlightTimeMin 1902.901978 FlightTimeMin 1902.9
dayOfWeek 6.000000 dayOfWeek 6
dtype: float64 dtype: object
""" """
return self._query_compiler.max(numeric_only=numeric_only) return self._query_compiler.max(numeric_only=numeric_only)

View File

@ -126,10 +126,14 @@ class Operations:
return self._metric_aggs(query_compiler, "sum", numeric_only=numeric_only) return self._metric_aggs(query_compiler, "sum", numeric_only=numeric_only)
def max(self, query_compiler, numeric_only=True): def max(self, query_compiler, numeric_only=True):
return self._metric_aggs(query_compiler, "max", numeric_only=numeric_only) return self._metric_aggs(
query_compiler, "max", numeric_only=numeric_only, keep_original_dtype=True
)
def min(self, query_compiler, numeric_only=True): def min(self, query_compiler, numeric_only=True):
return self._metric_aggs(query_compiler, "min", numeric_only=numeric_only) return self._metric_aggs(
query_compiler, "min", numeric_only=numeric_only, keep_original_dtype=True
)
def nunique(self, query_compiler): def nunique(self, query_compiler):
return self._metric_aggs( return self._metric_aggs(
@ -142,13 +146,22 @@ class Operations:
def hist(self, query_compiler, bins): def hist(self, query_compiler, bins):
return self._hist_aggs(query_compiler, bins) return self._hist_aggs(query_compiler, bins)
def _metric_aggs(self, query_compiler, func, field_types=None, numeric_only=None): def _metric_aggs(
self,
query_compiler,
func,
field_types=None,
numeric_only=None,
keep_original_dtype=False,
):
""" """
Parameters Parameters
---------- ----------
field_types: str, default None field_types: str, default None
if `aggregatable` use only field_names whose fields in elasticsearch are aggregatable. if `aggregatable` use only field_names whose fields in elasticsearch are aggregatable.
If `None`, use only numeric fields. If `None`, use only numeric fields.
keep_original_dtype : bool, default False
if `True` the output values should keep the same domain as the input values, i.e. booleans should be booleans
Returns Returns
------- -------
@ -235,6 +248,10 @@ class Operations:
results[field] = elasticsearch_date_to_pandas_date( results[field] = elasticsearch_date_to_pandas_date(
response["aggregations"][field]["value_as_string"], date_format response["aggregations"][field]["value_as_string"], date_format
) )
elif keep_original_dtype:
results[field] = pd_dtype.type(
response["aggregations"][field]["value"]
)
else: else:
results[field] = response["aggregations"][field]["value"] results[field] = response["aggregations"][field]["value"]

View File

@ -35,7 +35,7 @@ from io import StringIO
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from pandas.io.common import _expand_user, _stringify_path from pandas.io.common import _expand_user, stringify_path
import eland.plotting import eland.plotting
from eland import NDFrame from eland import NDFrame
@ -365,7 +365,7 @@ class Series(NDFrame):
temp_series = self._build_repr(max_rows + 1) temp_series = self._build_repr(max_rows + 1)
if buf is not None: if buf is not None:
_buf = _expand_user(_stringify_path(buf)) _buf = _expand_user(stringify_path(buf))
else: else:
_buf = StringIO() _buf = StringIO()

View File

@ -20,7 +20,7 @@ def blacken(session):
def lint(session): def lint(session):
session.install("black", "flake8") session.install("black", "flake8")
session.run("black", "--check", "--target-version=py36", *SOURCE_FILES) session.run("black", "--check", "--target-version=py36", *SOURCE_FILES)
session.run("flake8", "--ignore=E501,W503,E402,E712", *SOURCE_FILES) session.run("flake8", "--ignore=W291,E501,W503,E402,E712", *SOURCE_FILES)
@nox.session(python=["3.6", "3.7", "3.8"]) @nox.session(python=["3.6", "3.7", "3.8"])

View File

@ -1,5 +1,5 @@
elasticsearch>=7.0.5 elasticsearch>=7.0.5
pandas==0.25.3 pandas>=1
matplotlib matplotlib
pytest>=5.2.1 pytest>=5.2.1
nbval nbval

View File

@ -1,3 +1,3 @@
elasticsearch>=7.0.5 elasticsearch>=7.0.5
pandas==0.25.3 pandas>=1
matplotlib matplotlib

View File

@ -187,6 +187,6 @@ setup(
classifiers=CLASSIFIERS, classifiers=CLASSIFIERS,
keywords="elastic eland pandas python", keywords="elastic eland pandas python",
packages=find_packages(include=["eland", "eland.*"]), packages=find_packages(include=["eland", "eland.*"]),
install_requires=["elasticsearch>=7.0.5, <8", "pandas==0.25.3", "matplotlib"], install_requires=["elasticsearch>=7.0.5, <8", "pandas>=1", "matplotlib"],
python_requires=">=3.6", python_requires=">=3.6",
) )