Renamed ed_to_pd to eland_to_pandas and added docs.

+ added some additions to .gitignore
+ removed DataFrame.squeeze for now
This commit is contained in:
Stephen Dodson 2019-11-15 11:21:27 +00:00
parent 29fe2278b7
commit f5025b9f39
18 changed files with 130 additions and 62 deletions

10
.gitignore vendored
View File

@ -2,7 +2,13 @@
*.pyc
# Setuptools distribution folder.
/dist/
dist/
# Build folder
build/
# docs build folder
docs/build/
# Python egg metadata, regenerated from source files by setuptools.
/*.egg-info
@ -36,4 +42,4 @@ env/
venv/
ENV/
env.bak/
venv.bak/
venv.bak/

View File

@ -1,6 +0,0 @@
eland.ed_to_pd
==============
.. currentmodule:: eland
.. autofunction:: ed_to_pd

View File

@ -0,0 +1,6 @@
eland.eland_to_pandas
=====================
.. currentmodule:: eland
.. autofunction:: eland_to_pandas

View File

@ -0,0 +1,6 @@
eland.pandas_to_eland
=====================
.. currentmodule:: eland
.. autofunction:: pandas_to_eland

View File

@ -1,6 +0,0 @@
eland.pd_to_ed
==============
.. currentmodule:: eland
.. autofunction:: pd_to_ed

View File

@ -24,6 +24,7 @@ Attributes and underlying data
DataFrame.dtypes
DataFrame.select_dtypes
DataFrame.empty
DataFrame.shape
Indexing, iteration
~~~~~~~~~~~~~~~~~~~
@ -80,6 +81,9 @@ Serialization / IO / conversion
:toctree: api/
DataFrame.info
DataFrame.to_csv
DataFrame.to_html
DataFrame.to_string
Elasticsearch utilities
~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -17,5 +17,5 @@ Pandas and Eland
.. autosummary::
:toctree: api/
pd_to_ed
ed_to_pd
pandas_to_eland
eland_to_pandas

View File

@ -76,6 +76,7 @@ class DataFrame(NDFrame):
[5 rows x 2 columns]
Constructing DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the DataFrame index field
(TODO - currently index_field must also be a field if not _id)
>>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'], index_field='timestamp')
>>> df.head()
@ -529,7 +530,11 @@ class DataFrame(NDFrame):
bold_rows=True, classes=None, escape=True, notebook=False,
border=None, table_id=None, render_links=False):
"""
From pandas - except we set max_rows default to avoid careless extraction of entire index
Render Elasticsearch data as an HTML table.
See Also
--------
:pandas_api_docs:`to_html` for argument details.
"""
if max_rows is None:
warnings.warn("DataFrame.to_string called without max_rows set "
@ -568,7 +573,13 @@ class DataFrame(NDFrame):
max_rows=None, max_cols=None, show_dimensions=False,
decimal='.', line_width=None):
"""
From pandas - except we set max_rows default to avoid careless extraction of entire index
Render a DataFrame to a console-friendly tabular output.
Follows pandas implementation except we set max_rows default to avoid careless extraction of entire index.
See Also
--------
:pandas_api_docs:`to_string` for argument details.
"""
if max_rows is None:
warnings.warn("DataFrame.to_string called without max_rows set "
@ -718,6 +729,13 @@ class DataFrame(NDFrame):
quotechar='"', line_terminator=None, chunksize=None,
tupleize_cols=None, date_format=None, doublequote=True,
escapechar=None, decimal='.'):
"""
Write Elasticsearch data to a comma-separated values (csv) file.
See Also
--------
:pandas_api_docs:`to_csv` for argument details.
"""
kwargs = {
"path_or_buf": path_or_buf,
"sep": sep,
@ -754,16 +772,34 @@ class DataFrame(NDFrame):
def _empty_pd_df(self):
return self._query_compiler._empty_pd_ef()
def squeeze(self, axis=None):
return DataFrame(
query_compiler=self._query_compiler.squeeze(axis)
)
def select_dtypes(self, include=None, exclude=None):
"""
Return a subset of the DataFrame's columns based on the column dtypes.
Compatible with :pandas_api_docs:`pandas.DataFrame.select_dtypes`
Returns
-------
eland.DataFrame
DataFrame contains only columns of selected dtypes
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights',
... columns=['AvgTicketPrice', 'Dest', 'Cancelled', 'timestamp', 'dayOfWeek'])
>>> df.dtypes
AvgTicketPrice float64
Dest object
Cancelled bool
timestamp datetime64[ns]
dayOfWeek int64
dtype: object
>>> df = df.select_dtypes(include=[np.number, 'datetime'])
>>> df.dtypes
AvgTicketPrice float64
timestamp datetime64[ns]
dayOfWeek int64
dtype: object
"""
empty_df = self._empty_pd_df()
@ -779,8 +815,20 @@ class DataFrame(NDFrame):
Returns
-------
shape: tuple
0 - number of rows
1 - number of columns
0. number of rows
1. number of columns
Notes
-----
- number of rows ``len(df)`` queries Elasticsearch
- number of columns ``len(df.columns)`` is cached. If mappings are updated, DataFrame must be updated.
Examples
--------
>>> df = ed.read_es('localhost', 'ecommerce')
>>> df.shape
(4675, 45)
"""
num_rows = len(self)
num_columns = len(self.columns)
@ -891,9 +939,11 @@ class DataFrame(NDFrame):
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df = df.query('FlightDelayMin > 60')
>>> df.info()
>>> df = ed.read_es('localhost', 'flights')
>>> df.shape
(13059, 27)
>>> df.query('FlightDelayMin > 60').shape
(2730, 27)
"""
if isinstance(expr, BooleanFilter):
return DataFrame(

View File

@ -539,10 +539,6 @@ class Operations:
task = ('iloc', (index, columns))
self._tasks.append(task)
def squeeze(self, axis):
task = ('squeeze', axis)
self._tasks.append(task)
def index_count(self, query_compiler, field):
# field is the index field so count values
query_params, post_processing = self._resolve_tasks()
@ -660,8 +656,6 @@ class Operations:
if column_indexer is None:
column_indexer = slice(None)
df = df.iloc[index_indexer, column_indexer]
elif action[0] == 'squeeze':
df = df.squeeze(axis=action[1])
# columns could be in here (and we ignore it)
return df

View File

@ -369,13 +369,6 @@ class ElandQueryCompiler:
return result
def squeeze(self, axis=None):
result = self.copy()
result._operations.squeeze(axis)
return result
def view(self, index=None, columns=None):
result = self.copy()

View File

@ -37,7 +37,7 @@ class TestDataFrameDateTime(TestData):
# Now create index
index_name = 'eland_test_generate_es_mappings'
ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
ed_df = ed.pandas_to_eland(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
ed_df_head = ed_df.head()
assert_pandas_eland_frame_equal(df, ed_df_head)

View File

@ -14,11 +14,11 @@ class TestDataFrameDescribe(TestData):
pd_describe = pd_flights.describe()
ed_describe = ed_flights.describe()
assert_almost_equal(pd_describe[['AvgTicketPrice']],
ed_describe[['AvgTicketPrice']],
assert_almost_equal(pd_describe.drop(['25%','50%','75%'], axis='index'),
ed_describe.drop(['25%','50%','75%'], axis='index'),
check_less_precise=True)
# TODO - this fails for all fields now as ES aggregations are approximate
# TODO - this fails for percentile fields as ES aggregations are approximate
# if ES percentile agg uses
# "hdr": {
# "number_of_significant_value_digits": 3

View File

@ -1,8 +1,11 @@
# File called _pytest for PyCharm compatibility
import numpy as np
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
from eland.tests.common import assert_pandas_eland_frame_equal
class TestDataFrameDtypes(TestData):
@ -12,3 +15,12 @@ class TestDataFrameDtypes(TestData):
pd_flights = self.pd_flights()
assert_series_equal(pd_flights.dtypes, ed_flights.dtypes)
def test_flights_select_dtypes(self):
ed_flights = self.ed_flights_small()
pd_flights = self.pd_flights_small()
assert_pandas_eland_frame_equal(
pd_flights.select_dtypes(include=np.number),
ed_flights.select_dtypes(include=np.number)
)

View File

@ -19,7 +19,7 @@ class TestDataFrameQuery(TestData):
# Now create index
index_name = 'eland_test_query'
ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
ed_df = ed.pandas_to_eland(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
assert_pandas_eland_frame_equal(pd_df, ed_df)

View File

@ -1,5 +1,7 @@
# File called _pytest for PyCharm compatibility
import pytest
from eland.tests.common import TestData
@ -12,8 +14,9 @@ class TestDataFrameRepr(TestData):
ed_head_101 = ed_flights.head(101)
pd_head_101 = pd_flights.head(101)
# This sets max_rows=60 by default
ed_head_101_str = ed_head_101.to_string()
# This sets max_rows=60 by default (but emits a UserWarning)
with pytest.warns(UserWarning):
ed_head_101_str = ed_head_101.to_string()
pd_head_101_str = pd_head_101.to_string(max_rows=60)
assert pd_head_101_str == ed_head_101_str

View File

@ -36,7 +36,7 @@ class TestDataFrameUtils(TestData):
# Now create index
index_name = 'eland_test_generate_es_mappings'
ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
ed_df = ed.pandas_to_eland(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
ed_df_head = ed_df.head()
assert_pandas_eland_frame_equal(df, ed_df_head)

View File

@ -1,5 +1,7 @@
# File called _pytest for PyCharm compatibility
import pytest
from matplotlib.testing.decorators import check_figures_equal
from eland.tests.common import TestData
@ -12,8 +14,12 @@ def test_plot_hist(fig_test, fig_ref):
pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
pd_ax = fig_ref.subplots()
pd_flights.hist(ax=pd_ax)
# This emits a UserWarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
with pytest.warns(UserWarning):
pd_ax = fig_ref.subplots()
pd_flights.hist(ax=pd_ax)
ed_ax = fig_test.subplots()
ed_flights.hist(ax=ed_ax)
# This emits a UserWarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
with pytest.warns(UserWarning):
ed_ax = fig_test.subplots()
ed_flights.hist(ax=ed_ax)

View File

@ -26,13 +26,13 @@ def read_es(es_params, index_pattern):
See Also
--------
eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame
eland.ed_to_pd: Create a pandas.Dataframe from eland.DataFrame
eland.pandas_to_eland: Create an eland.DataFrame from a pandas.DataFrame
eland.eland_to_pandas: Create a pandas.DataFrame from an eland.DataFrame
"""
return DataFrame(client=es_params, index_pattern=index_pattern)
def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
geo_points=None):
def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
geo_points=None):
"""
Append a pandas DataFrame to an Elasticsearch index.
Mainly used in testing.
@ -66,11 +66,11 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
See Also
--------
eland.read_es: Create an eland.DataFrame from an Elasticsearch index
eland.ed_to_pd: Create a pandas.Dataframe from eland.DataFrame
eland.eland_to_pandas: Create a pandas.DataFrame from an eland.DataFrame
"""
client = Client(es_params)
mapping = Mappings._generate_es_mappings(df, geo_points)
mapping = Mappings._generate_es_mappings(pd_df, geo_points)
# If table exists, check if_exists parameter
if client.index_exists(index=destination_index):
@ -92,7 +92,7 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
# Now add data
actions = []
n = 0
for row in df.iterrows():
for row in pd_df.iterrows():
# Use index as _id
id = row[0]
@ -118,7 +118,7 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
return ed_df
def ed_to_pd(ed_df):
def eland_to_pandas(ed_df):
"""
Convert an eland.DataFrame to a pandas.DataFrame
@ -138,7 +138,7 @@ def ed_to_pd(ed_df):
See Also
--------
eland.read_es: Create an eland.DataFrame from an Elasticsearch index
eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame
eland.pandas_to_eland: Create an eland.DataFrame from a pandas.DataFrame
"""
return ed_df._to_pandas()