Update supported Pandas to v1.0
This commit is contained in:
parent 0c1d7222fe
commit e27a508c59
.gitignore (vendored): 1 line changed
@@ -43,6 +43,7 @@ ipython_config.py
 # Environments
 .env
 .venv
+.nox
 env/
 venv/
 ENV/
@@ -1,5 +1,5 @@
 elasticsearch>=7.0.5
-pandas==0.25.3
+pandas>=1
 matplotlib
 pytest>=5.2.1
 git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master
File diff suppressed because one or more lines are too long
@@ -22,7 +22,7 @@ from pandas.core.common import apply_if_callable, is_bool_indexer
 from pandas.core.computation.eval import eval
 from pandas.core.dtypes.common import is_list_like
 from pandas.core.indexing import check_bool_indexer
-from pandas.io.common import _expand_user, _stringify_path
+from pandas.io.common import _expand_user, stringify_path
 from pandas.io.formats import console
 from pandas.io.formats import format as fmt
 from pandas.io.formats.printing import pprint_thing
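Side note on the import change above: pandas 1.0 renamed the private helper `_stringify_path` in `pandas.io.common` to `stringify_path`, which is why every usage in this commit is updated. A minimal compatibility sketch, assuming the rename is the only difference between the two pandas lines (this shim is not part of the commit):

    # Hypothetical shim: prefer the pandas>=1.0 name, fall back to the pre-1.0 private name.
    try:
        from pandas.io.common import stringify_path
    except ImportError:
        from pandas.io.common import _stringify_path as stringify_path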
@@ -249,12 +249,19 @@ class DataFrame(NDFrame):
     --------
     >>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest'])
     >>> df.tail()
-                                                   Origin                                       Dest
-    13054                      Pisa International Airport       Xi'an Xianyang International Airport
-    13055  Winnipeg / James Armstrong Richardson Internat...                             Zurich Airport
-    13056     Licenciado Benito Juarez International Airport                          Ukrainka Air Base
-    13057                                    Itami Airport   Ministro Pistarini International Airport
-    13058                   Adelaide International Airport    Washington Dulles International Airport
+                                                              Origin  \\
+    13054                                    Pisa International Airport
+    13055  Winnipeg / James Armstrong Richardson International Airport
+    13056               Licenciado Benito Juarez International Airport
+    13057                                                 Itami Airport
+    13058                                Adelaide International Airport
+    <BLANKLINE>
+                                                 Dest
+    13054        Xi'an Xianyang International Airport
+    13055                              Zurich Airport
+    13056                           Ukrainka Air Base
+    13057    Ministro Pistarini International Airport
+    13058     Washington Dulles International Airport
     <BLANKLINE>
     [5 rows x 2 columns]
     """
@@ -602,8 +609,10 @@ class DataFrame(NDFrame):
         <class 'eland.dataframe.DataFrame'>
         Index: 4675 entries, 0 to 4674
         Data columns (total 2 columns):
-        customer_first_name    4675 non-null object
-        geoip.city_name        4094 non-null object
+         #   Column               Non-Null Count  Dtype
+        ---  ------               --------------  -----
+         0   customer_first_name  4675 non-null   object
+         1   geoip.city_name      4094 non-null   object
         dtypes: object(2)
         memory usage: ...
         """
@@ -618,6 +627,7 @@ class DataFrame(NDFrame):
             return

         cols = self.columns
+        col_count = len(self.columns)

         # hack
         if max_cols is None:
@@ -637,30 +647,74 @@ class DataFrame(NDFrame):

         def _verbose_repr():
             lines.append(f"Data columns (total {len(self.columns)} columns):")
-            space = max(len(pprint_thing(k)) for k in self.columns) + 4
+
+            id_head = " # "
+            column_head = "Column"
+            col_space = 2
+
+            max_col = max(len(pprint_thing(k)) for k in cols)
+            len_column = len(pprint_thing(column_head))
+            space = max(max_col, len_column) + col_space
+
+            max_id = len(pprint_thing(col_count))
+            len_id = len(pprint_thing(id_head))
+            space_num = max(max_id, len_id) + col_space
             counts = None

-            tmpl = "{count}{dtype}"
+            header = _put_str(id_head, space_num) + _put_str(column_head, space)
             if show_counts:
                 counts = self.count()
                 if len(cols) != len(counts):  # pragma: no cover
                     raise AssertionError(
-                        f"Columns must equal counts "
-                        f"({len(cols):d} != {len(counts):d})"
+                        "Columns must equal counts "
+                        "({cols:d} != {counts:d})".format(
+                            cols=len(cols), counts=len(counts)
+                        )
                     )
-                tmpl = "{count} non-null {dtype}"
+                count_header = "Non-Null Count"
+                len_count = len(count_header)
+                non_null = " non-null"
+                max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null)
+                space_count = max(len_count, max_count) + col_space
+                count_temp = "{count}" + non_null
+            else:
+                count_header = ""
+                space_count = len(count_header)
+                len_count = space_count
+                count_temp = "{count}"
+
+            dtype_header = "Dtype"
+            len_dtype = len(dtype_header)
+            max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes)
+            space_dtype = max(len_dtype, max_dtypes)
+            header += _put_str(count_header, space_count) + _put_str(
+                dtype_header, space_dtype
+            )
+
+            lines.append(header)
+            lines.append(
+                _put_str("-" * len_id, space_num)
+                + _put_str("-" * len_column, space)
+                + _put_str("-" * len_count, space_count)
+                + _put_str("-" * len_dtype, space_dtype)
+            )

             dtypes = self.dtypes
             for i, col in enumerate(self.columns):
                 dtype = dtypes.iloc[i]
                 col = pprint_thing(col)

+                line_no = _put_str(" {num}".format(num=i), space_num)
+
                 count = ""
                 if show_counts:
                     count = counts.iloc[i]

                 lines.append(
-                    _put_str(col, space) + tmpl.format(count=count, dtype=dtype)
+                    line_no
+                    + _put_str(col, space)
+                    + _put_str(count_temp.format(count=count), space_count)
+                    + _put_str(dtype, space_dtype)
                 )

         def _non_verbose_repr():
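The new `_verbose_repr` builds the pandas 1.0-style `info()` table out of fixed-width columns via `_put_str`; where eland obtains that helper is not shown in this hunk. A minimal sketch of the padding behaviour it relies on, assuming simple truncate-and-left-justify semantics:

    def _put_str(s, space):
        # Render the value, cut it to the column width, then left-justify
        # so every column of the info() table lines up.
        return str(s)[:space].ljust(space)

With that, the header row is simply the concatenation `_put_str(id_head, space_num) + _put_str(column_head, space) + ...`, producing the ` #   Column   Non-Null Count  Dtype` line shown in the docstring above.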
@@ -769,7 +823,7 @@ class DataFrame(NDFrame):
         df = self._build_repr(max_rows + 1)

         if buf is not None:
-            _buf = _expand_user(_stringify_path(buf))
+            _buf = _expand_user(stringify_path(buf))
         else:
             _buf = StringIO()

@@ -866,7 +920,7 @@ class DataFrame(NDFrame):
         df = self._build_repr(max_rows + 1)

         if buf is not None:
-            _buf = _expand_user(_stringify_path(buf))
+            _buf = _expand_user(stringify_path(buf))
         else:
             _buf = StringIO()

@@ -238,16 +238,16 @@ class NDFrame(ABC):
         --------
         >>> df = ed.DataFrame('localhost', 'flights')
         >>> df.min()
-        AvgTicketPrice        100.020531
-        Cancelled               0.000000
-        DistanceKilometers      0.000000
-        DistanceMiles           0.000000
-        FlightDelay             0.000000
-        FlightDelayMin          0.000000
-        FlightTimeHour          0.000000
-        FlightTimeMin           0.000000
-        dayOfWeek               0.000000
-        dtype: float64
+        AvgTicketPrice        100.021
+        Cancelled               False
+        DistanceKilometers          0
+        DistanceMiles               0
+        FlightDelay             False
+        FlightDelayMin              0
+        FlightTimeHour              0
+        FlightTimeMin               0
+        dayOfWeek                   0
+        dtype: object
         """
         return self._query_compiler.min(numeric_only=numeric_only)

@@ -270,16 +270,16 @@ class NDFrame(ABC):
         --------
         >>> df = ed.DataFrame('localhost', 'flights')
         >>> df.max()
-        AvgTicketPrice         1199.729004
-        Cancelled                 1.000000
-        DistanceKilometers    19881.482422
-        DistanceMiles         12353.780273
-        FlightDelay               1.000000
-        FlightDelayMin          360.000000
-        FlightTimeHour           31.715034
-        FlightTimeMin          1902.901978
-        dayOfWeek                 6.000000
-        dtype: float64
+        AvgTicketPrice        1199.73
+        Cancelled                True
+        DistanceKilometers    19881.5
+        DistanceMiles         12353.8
+        FlightDelay              True
+        FlightDelayMin            360
+        FlightTimeHour         31.715
+        FlightTimeMin          1902.9
+        dayOfWeek                   6
+        dtype: object
         """
         return self._query_compiler.max(numeric_only=numeric_only)

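The docstring changes above reflect that `min()`/`max()` now preserve each column's original dtype, so the result mixes booleans, integers, and floats, and pandas reports the Series as `dtype: object` instead of `float64`. A small illustration of that pandas behaviour (values invented for the example, not taken from the flights index):

    import pandas as pd

    # A Series holding a float, a bool and an int can only have object dtype.
    result = pd.Series({"AvgTicketPrice": 1199.73, "Cancelled": True, "dayOfWeek": 6})
    print(result.dtype)  # object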
@@ -126,10 +126,14 @@ class Operations:
         return self._metric_aggs(query_compiler, "sum", numeric_only=numeric_only)

     def max(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(query_compiler, "max", numeric_only=numeric_only)
+        return self._metric_aggs(
+            query_compiler, "max", numeric_only=numeric_only, keep_original_dtype=True
+        )

     def min(self, query_compiler, numeric_only=True):
-        return self._metric_aggs(query_compiler, "min", numeric_only=numeric_only)
+        return self._metric_aggs(
+            query_compiler, "min", numeric_only=numeric_only, keep_original_dtype=True
+        )

     def nunique(self, query_compiler):
         return self._metric_aggs(
@@ -142,13 +146,22 @@ class Operations:
     def hist(self, query_compiler, bins):
         return self._hist_aggs(query_compiler, bins)

-    def _metric_aggs(self, query_compiler, func, field_types=None, numeric_only=None):
+    def _metric_aggs(
+        self,
+        query_compiler,
+        func,
+        field_types=None,
+        numeric_only=None,
+        keep_original_dtype=False,
+    ):
         """
         Parameters
         ----------
         field_types: str, default None
             if `aggregatable` use only field_names whose fields in elasticseach are aggregatable.
             If `None`, use only numeric fields.
+        keep_original_dtype : bool, default False
+            if `True` the output values should keep the same domain as the input values, i.e. booleans should be booleans

         Returns
         -------
@@ -235,6 +248,10 @@ class Operations:
                     results[field] = elasticsearch_date_to_pandas_date(
                         response["aggregations"][field]["value_as_string"], date_format
                     )
+                elif keep_original_dtype:
+                    results[field] = pd_dtype.type(
+                        response["aggregations"][field]["value"]
+                    )
                 else:
                     results[field] = response["aggregations"][field]["value"]

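The `elif keep_original_dtype:` branch above coerces the aggregation result back into the column's original domain: Elasticsearch returns min/max values as JSON numbers, so a boolean field arrives as 0.0/1.0 and an integer field as a float, and calling the numpy scalar constructor of the stored dtype (`pd_dtype.type`) restores the original type. A hedged illustration, with the dtypes below assumed for the example rather than taken from the diff:

    import numpy as np

    # Assume these are the original mapped dtypes of two fields.
    bool_dtype = np.dtype(bool)
    int_dtype = np.dtype("int64")

    # Aggregation values come back from Elasticsearch as floats...
    print(bool_dtype.type(1.0))    # True -> booleans stay booleans
    print(int_dtype.type(360.0))   # 360  -> integers stay integers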
@@ -35,7 +35,7 @@ from io import StringIO

 import numpy as np
 import pandas as pd
-from pandas.io.common import _expand_user, _stringify_path
+from pandas.io.common import _expand_user, stringify_path

 import eland.plotting
 from eland import NDFrame
@@ -365,7 +365,7 @@ class Series(NDFrame):
         temp_series = self._build_repr(max_rows + 1)

         if buf is not None:
-            _buf = _expand_user(_stringify_path(buf))
+            _buf = _expand_user(stringify_path(buf))
         else:
             _buf = StringIO()

@@ -20,7 +20,7 @@ def blacken(session):
 def lint(session):
     session.install("black", "flake8")
     session.run("black", "--check", "--target-version=py36", *SOURCE_FILES)
-    session.run("flake8", "--ignore=E501,W503,E402,E712", *SOURCE_FILES)
+    session.run("flake8", "--ignore=W291,E501,W503,E402,E712", *SOURCE_FILES)


 @nox.session(python=["3.6", "3.7", "3.8"])
@@ -1,5 +1,5 @@
 elasticsearch>=7.0.5
-pandas==0.25.3
+pandas>=1
 matplotlib
 pytest>=5.2.1
 nbval
@@ -1,3 +1,3 @@
 elasticsearch>=7.0.5
-pandas==0.25.3
+pandas>=1
 matplotlib
setup.py: 2 lines changed
@@ -187,6 +187,6 @@ setup(
     classifiers=CLASSIFIERS,
     keywords="elastic eland pandas python",
     packages=find_packages(include=["eland", "eland.*"]),
-    install_requires=["elasticsearch>=7.0.5, <8", "pandas==0.25.3", "matplotlib"],
+    install_requires=["elasticsearch>=7.0.5, <8", "pandas>=1", "matplotlib"],
     python_requires=">=3.6",
 )