Renamed ed_to_pd to eland_to_pandas (and pd_to_ed to pandas_to_eland) and added docs.

+ added build folder entries to .gitignore
+ removed DataFrame.squeeze for now
This commit is contained in:
Stephen Dodson 2019-11-15 11:21:27 +00:00
parent 29fe2278b7
commit f5025b9f39
18 changed files with 130 additions and 62 deletions

10
.gitignore vendored
View File

@ -2,7 +2,13 @@
*.pyc *.pyc
# Setuptools distribution folder. # Setuptools distribution folder.
/dist/ dist/
# Build folder
build/
# docs build folder
docs/build/
# Python egg metadata, regenerated from source files by setuptools. # Python egg metadata, regenerated from source files by setuptools.
/*.egg-info /*.egg-info
@ -36,4 +42,4 @@ env/
venv/ venv/
ENV/ ENV/
env.bak/ env.bak/
venv.bak/ venv.bak/

View File

@ -1,6 +0,0 @@
eland.ed_to_pd
==============
.. currentmodule:: eland
.. autofunction:: ed_to_pd

View File

@ -0,0 +1,6 @@
eland.eland_to_pandas
=====================
.. currentmodule:: eland
.. autofunction:: eland_to_pandas

View File

@ -0,0 +1,6 @@
eland.pandas_to_eland
=====================
.. currentmodule:: eland
.. autofunction:: pandas_to_eland

View File

@ -1,6 +0,0 @@
eland.pd_to_ed
==============
.. currentmodule:: eland
.. autofunction:: pd_to_ed

View File

@ -24,6 +24,7 @@ Attributes and underlying data
DataFrame.dtypes DataFrame.dtypes
DataFrame.select_dtypes DataFrame.select_dtypes
DataFrame.empty DataFrame.empty
DataFrame.shape
Indexing, iteration Indexing, iteration
~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~
@ -80,6 +81,9 @@ Serialization / IO / conversion
:toctree: api/ :toctree: api/
DataFrame.info DataFrame.info
DataFrame.to_csv
DataFrame.to_html
DataFrame.to_string
Elasticsearch utilities Elasticsearch utilities
~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -17,5 +17,5 @@ Pandas and Eland
.. autosummary:: .. autosummary::
:toctree: api/ :toctree: api/
pd_to_ed pandas_to_eland
ed_to_pd eland_to_pandas

View File

@ -76,6 +76,7 @@ class DataFrame(NDFrame):
[5 rows x 2 columns] [5 rows x 2 columns]
Constructing DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the DataFrame index field Constructing DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the DataFrame index field
(TODO - currently index_field must also be a field if not _id)
>>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'], index_field='timestamp') >>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'], index_field='timestamp')
>>> df.head() >>> df.head()
@ -529,7 +530,11 @@ class DataFrame(NDFrame):
bold_rows=True, classes=None, escape=True, notebook=False, bold_rows=True, classes=None, escape=True, notebook=False,
border=None, table_id=None, render_links=False): border=None, table_id=None, render_links=False):
""" """
From pandas - except we set max_rows default to avoid careless extraction of entire index Render Elasticsearch data as an HTML table.
See Also
--------
:pandas_api_docs:`to_html` for argument details.
""" """
if max_rows is None: if max_rows is None:
warnings.warn("DataFrame.to_string called without max_rows set " warnings.warn("DataFrame.to_string called without max_rows set "
@ -568,7 +573,13 @@ class DataFrame(NDFrame):
max_rows=None, max_cols=None, show_dimensions=False, max_rows=None, max_cols=None, show_dimensions=False,
decimal='.', line_width=None): decimal='.', line_width=None):
""" """
From pandas - except we set max_rows default to avoid careless extraction of entire index Render a DataFrame to a console-friendly tabular output.
Follows pandas implementation except we set max_rows default to avoid careless extraction of entire index.
See Also
--------
:pandas_api_docs:`to_string` for argument details.
""" """
if max_rows is None: if max_rows is None:
warnings.warn("DataFrame.to_string called without max_rows set " warnings.warn("DataFrame.to_string called without max_rows set "
@ -718,6 +729,13 @@ class DataFrame(NDFrame):
quotechar='"', line_terminator=None, chunksize=None, quotechar='"', line_terminator=None, chunksize=None,
tupleize_cols=None, date_format=None, doublequote=True, tupleize_cols=None, date_format=None, doublequote=True,
escapechar=None, decimal='.'): escapechar=None, decimal='.'):
"""
Write Elasticsearch data to a comma-separated values (csv) file.
See Also
--------
:pandas_api_docs:`to_csv` for argument details.
"""
kwargs = { kwargs = {
"path_or_buf": path_or_buf, "path_or_buf": path_or_buf,
"sep": sep, "sep": sep,
@ -754,16 +772,34 @@ class DataFrame(NDFrame):
def _empty_pd_df(self): def _empty_pd_df(self):
return self._query_compiler._empty_pd_ef() return self._query_compiler._empty_pd_ef()
def squeeze(self, axis=None):
return DataFrame(
query_compiler=self._query_compiler.squeeze(axis)
)
def select_dtypes(self, include=None, exclude=None): def select_dtypes(self, include=None, exclude=None):
""" """
Return a subset of the DataFrame's columns based on the column dtypes. Return a subset of the DataFrame's columns based on the column dtypes.
Compatible with :pandas_api_docs:`pandas.DataFrame.select_dtypes` Compatible with :pandas_api_docs:`pandas.DataFrame.select_dtypes`
Returns
-------
eland.DataFrame
DataFrame contains only columns of selected dtypes
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights',
... columns=['AvgTicketPrice', 'Dest', 'Cancelled', 'timestamp', 'dayOfWeek'])
>>> df.dtypes
AvgTicketPrice float64
Dest object
Cancelled bool
timestamp datetime64[ns]
dayOfWeek int64
dtype: object
>>> df = df.select_dtypes(include=[np.number, 'datetime'])
>>> df.dtypes
AvgTicketPrice float64
timestamp datetime64[ns]
dayOfWeek int64
dtype: object
""" """
empty_df = self._empty_pd_df() empty_df = self._empty_pd_df()
@ -779,8 +815,20 @@ class DataFrame(NDFrame):
Returns Returns
------- -------
shape: tuple shape: tuple
0 - number of rows
1 - number of columns 0. number of rows
1. number of columns
Notes
-----
- number of rows ``len(df)`` queries Elasticsearch
- number of columns ``len(df.columns)`` is cached. If mappings are updated, DataFrame must be updated.
Examples
--------
>>> df = ed.read_es('localhost', 'ecommerce')
>>> df.shape
(4675, 45)
""" """
num_rows = len(self) num_rows = len(self)
num_columns = len(self.columns) num_columns = len(self.columns)
@ -891,9 +939,11 @@ class DataFrame(NDFrame):
Examples Examples
-------- --------
>>> df = ed.DataFrame('localhost', 'flights') >>> df = ed.read_es('localhost', 'flights')
>>> df = df.query('FlightDelayMin > 60') >>> df.shape
>>> df.info() (13059, 27)
>>> df.query('FlightDelayMin > 60').shape
(2730, 27)
""" """
if isinstance(expr, BooleanFilter): if isinstance(expr, BooleanFilter):
return DataFrame( return DataFrame(

View File

@ -539,10 +539,6 @@ class Operations:
task = ('iloc', (index, columns)) task = ('iloc', (index, columns))
self._tasks.append(task) self._tasks.append(task)
def squeeze(self, axis):
task = ('squeeze', axis)
self._tasks.append(task)
def index_count(self, query_compiler, field): def index_count(self, query_compiler, field):
# field is the index field so count values # field is the index field so count values
query_params, post_processing = self._resolve_tasks() query_params, post_processing = self._resolve_tasks()
@ -660,8 +656,6 @@ class Operations:
if column_indexer is None: if column_indexer is None:
column_indexer = slice(None) column_indexer = slice(None)
df = df.iloc[index_indexer, column_indexer] df = df.iloc[index_indexer, column_indexer]
elif action[0] == 'squeeze':
df = df.squeeze(axis=action[1])
# columns could be in here (and we ignore it) # columns could be in here (and we ignore it)
return df return df

View File

@ -369,13 +369,6 @@ class ElandQueryCompiler:
return result return result
def squeeze(self, axis=None):
result = self.copy()
result._operations.squeeze(axis)
return result
def view(self, index=None, columns=None): def view(self, index=None, columns=None):
result = self.copy() result = self.copy()

View File

@ -37,7 +37,7 @@ class TestDataFrameDateTime(TestData):
# Now create index # Now create index
index_name = 'eland_test_generate_es_mappings' index_name = 'eland_test_generate_es_mappings'
ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) ed_df = ed.pandas_to_eland(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
ed_df_head = ed_df.head() ed_df_head = ed_df.head()
assert_pandas_eland_frame_equal(df, ed_df_head) assert_pandas_eland_frame_equal(df, ed_df_head)

View File

@ -14,11 +14,11 @@ class TestDataFrameDescribe(TestData):
pd_describe = pd_flights.describe() pd_describe = pd_flights.describe()
ed_describe = ed_flights.describe() ed_describe = ed_flights.describe()
assert_almost_equal(pd_describe[['AvgTicketPrice']], assert_almost_equal(pd_describe.drop(['25%','50%','75%'], axis='index'),
ed_describe[['AvgTicketPrice']], ed_describe.drop(['25%','50%','75%'], axis='index'),
check_less_precise=True) check_less_precise=True)
# TODO - this fails for all fields now as ES aggregations are approximate # TODO - this fails for percentile fields as ES aggregations are approximate
# if ES percentile agg uses # if ES percentile agg uses
# "hdr": { # "hdr": {
# "number_of_significant_value_digits": 3 # "number_of_significant_value_digits": 3

View File

@ -1,8 +1,11 @@
# File called _pytest for PyCharm compatibility # File called _pytest for PyCharm compatibility
import numpy as np
from pandas.util.testing import assert_series_equal from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData from eland.tests.common import TestData
from eland.tests.common import assert_pandas_eland_frame_equal
class TestDataFrameDtypes(TestData): class TestDataFrameDtypes(TestData):
@ -12,3 +15,12 @@ class TestDataFrameDtypes(TestData):
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
assert_series_equal(pd_flights.dtypes, ed_flights.dtypes) assert_series_equal(pd_flights.dtypes, ed_flights.dtypes)
def test_flights_select_dtypes(self):
ed_flights = self.ed_flights_small()
pd_flights = self.pd_flights_small()
assert_pandas_eland_frame_equal(
pd_flights.select_dtypes(include=np.number),
ed_flights.select_dtypes(include=np.number)
)

View File

@ -19,7 +19,7 @@ class TestDataFrameQuery(TestData):
# Now create index # Now create index
index_name = 'eland_test_query' index_name = 'eland_test_query'
ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) ed_df = ed.pandas_to_eland(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
assert_pandas_eland_frame_equal(pd_df, ed_df) assert_pandas_eland_frame_equal(pd_df, ed_df)

View File

@ -1,5 +1,7 @@
# File called _pytest for PyCharm compatibility # File called _pytest for PyCharm compatibility
import pytest
from eland.tests.common import TestData from eland.tests.common import TestData
@ -12,8 +14,9 @@ class TestDataFrameRepr(TestData):
ed_head_101 = ed_flights.head(101) ed_head_101 = ed_flights.head(101)
pd_head_101 = pd_flights.head(101) pd_head_101 = pd_flights.head(101)
# This sets max_rows=60 by default # This sets max_rows=60 by default (but raises a UserWarning)
ed_head_101_str = ed_head_101.to_string() with pytest.warns(UserWarning):
ed_head_101_str = ed_head_101.to_string()
pd_head_101_str = pd_head_101.to_string(max_rows=60) pd_head_101_str = pd_head_101.to_string(max_rows=60)
assert pd_head_101_str == ed_head_101_str assert pd_head_101_str == ed_head_101_str

View File

@ -36,7 +36,7 @@ class TestDataFrameUtils(TestData):
# Now create index # Now create index
index_name = 'eland_test_generate_es_mappings' index_name = 'eland_test_generate_es_mappings'
ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) ed_df = ed.pandas_to_eland(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
ed_df_head = ed_df.head() ed_df_head = ed_df.head()
assert_pandas_eland_frame_equal(df, ed_df_head) assert_pandas_eland_frame_equal(df, ed_df_head)

View File

@ -1,5 +1,7 @@
# File called _pytest for PyCharm compatibility # File called _pytest for PyCharm compatibility
import pytest
from matplotlib.testing.decorators import check_figures_equal from matplotlib.testing.decorators import check_figures_equal
from eland.tests.common import TestData from eland.tests.common import TestData
@ -12,8 +14,12 @@ def test_plot_hist(fig_test, fig_ref):
pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']] pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']] ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
pd_ax = fig_ref.subplots() # This throws a userwarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
pd_flights.hist(ax=pd_ax) with pytest.warns(UserWarning):
pd_ax = fig_ref.subplots()
pd_flights.hist(ax=pd_ax)
ed_ax = fig_test.subplots() # This throws a userwarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
ed_flights.hist(ax=ed_ax) with pytest.warns(UserWarning):
ed_ax = fig_test.subplots()
ed_flights.hist(ax=ed_ax)

View File

@ -26,13 +26,13 @@ def read_es(es_params, index_pattern):
See Also See Also
-------- --------
eland.pd_to_ed: Create an eland.DataFrame from pandas.DataFrame eland.pandas_to_eland: Create an eland.DataFrame from pandas.DataFrame
eland.ed_to_pd: Create a pandas.DataFrame from eland.DataFrame eland.eland_to_pandas: Create a pandas.DataFrame from eland.DataFrame
""" """
return DataFrame(client=es_params, index_pattern=index_pattern) return DataFrame(client=es_params, index_pattern=index_pattern)
def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False, def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
geo_points=None): geo_points=None):
""" """
Append a pandas DataFrame to an Elasticsearch index. Append a pandas DataFrame to an Elasticsearch index.
Mainly used in testing. Mainly used in testing.
@ -66,11 +66,11 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
See Also See Also
-------- --------
eland.read_es: Create an eland.DataFrame from an Elasticsearch index eland.read_es: Create an eland.DataFrame from an Elasticsearch index
eland.ed_to_pd: Create a pandas.DataFrame from eland.DataFrame eland.eland_to_pandas: Create a pandas.DataFrame from eland.DataFrame
""" """
client = Client(es_params) client = Client(es_params)
mapping = Mappings._generate_es_mappings(df, geo_points) mapping = Mappings._generate_es_mappings(pd_df, geo_points)
# If table exists, check if_exists parameter # If table exists, check if_exists parameter
if client.index_exists(index=destination_index): if client.index_exists(index=destination_index):
@ -92,7 +92,7 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
# Now add data # Now add data
actions = [] actions = []
n = 0 n = 0
for row in df.iterrows(): for row in pd_df.iterrows():
# Use index as _id # Use index as _id
id = row[0] id = row[0]
@ -118,7 +118,7 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
return ed_df return ed_df
def ed_to_pd(ed_df): def eland_to_pandas(ed_df):
""" """
Convert an eland.DataFrame to a pandas.DataFrame Convert an eland.DataFrame to a pandas.DataFrame
@ -138,7 +138,7 @@ def ed_to_pd(ed_df):
See Also See Also
-------- --------
eland.read_es: Create an eland.DataFrame from an Elasticsearch index eland.read_es: Create an eland.DataFrame from an Elasticsearch index
eland.pd_to_ed: Create an eland.DataFrame from pandas.DataFrame eland.pandas_to_eland: Create an eland.DataFrame from pandas.DataFrame
""" """
return ed_df._to_pandas() return ed_df._to_pandas()