mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Renamed ed_to_pd to eland_to_pandas and added docs.
+ added some additions to .gitignore + removed DataFrame.squeeze for now
This commit is contained in:
parent
29fe2278b7
commit
f5025b9f39
8
.gitignore
vendored
8
.gitignore
vendored
@ -2,7 +2,13 @@
|
||||
*.pyc
|
||||
|
||||
# Setuptools distribution folder.
|
||||
/dist/
|
||||
dist/
|
||||
|
||||
# Build folder
|
||||
build/
|
||||
|
||||
# docs build folder
|
||||
docs/build/
|
||||
|
||||
# Python egg metadata, regenerated from source files by setuptools.
|
||||
/*.egg-info
|
||||
|
@ -1,6 +0,0 @@
|
||||
eland.ed_to_pd
|
||||
==============
|
||||
|
||||
.. currentmodule:: eland
|
||||
|
||||
.. autofunction:: ed_to_pd
|
6
docs/source/reference/api/eland.eland_to_pandas.rst
Normal file
6
docs/source/reference/api/eland.eland_to_pandas.rst
Normal file
@ -0,0 +1,6 @@
|
||||
eland.eland_to_pandas
|
||||
=====================
|
||||
|
||||
.. currentmodule:: eland
|
||||
|
||||
.. autofunction:: eland_to_pandas
|
6
docs/source/reference/api/eland.pandas_to_eland.rst
Normal file
6
docs/source/reference/api/eland.pandas_to_eland.rst
Normal file
@ -0,0 +1,6 @@
|
||||
eland.pandas_to_eland
|
||||
=====================
|
||||
|
||||
.. currentmodule:: eland
|
||||
|
||||
.. autofunction:: pandas_to_eland
|
@ -1,6 +0,0 @@
|
||||
eland.pd_to_ed
|
||||
==============
|
||||
|
||||
.. currentmodule:: eland
|
||||
|
||||
.. autofunction:: pd_to_ed
|
@ -24,6 +24,7 @@ Attributes and underlying data
|
||||
DataFrame.dtypes
|
||||
DataFrame.select_dtypes
|
||||
DataFrame.empty
|
||||
DataFrame.shape
|
||||
|
||||
Indexing, iteration
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
@ -80,6 +81,9 @@ Serialization / IO / conversion
|
||||
:toctree: api/
|
||||
|
||||
DataFrame.info
|
||||
DataFrame.to_csv
|
||||
DataFrame.to_html
|
||||
DataFrame.to_string
|
||||
|
||||
Elasticsearch utilities
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -17,5 +17,5 @@ Pandas and Eland
|
||||
.. autosummary::
|
||||
:toctree: api/
|
||||
|
||||
pd_to_ed
|
||||
ed_to_pd
|
||||
pandas_to_eland
|
||||
eland_to_pandas
|
||||
|
@ -76,6 +76,7 @@ class DataFrame(NDFrame):
|
||||
[5 rows x 2 columns]
|
||||
|
||||
Constructing DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the DataFrame index field
|
||||
(TODO - currently index_field must also be a field if not _id)
|
||||
|
||||
>>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'], index_field='timestamp')
|
||||
>>> df.head()
|
||||
@ -529,7 +530,11 @@ class DataFrame(NDFrame):
|
||||
bold_rows=True, classes=None, escape=True, notebook=False,
|
||||
border=None, table_id=None, render_links=False):
|
||||
"""
|
||||
From pandas - except we set max_rows default to avoid careless extraction of entire index
|
||||
Render Elasticsearch data as an HTML table.
|
||||
|
||||
See Also
|
||||
--------
|
||||
:pandas_api_docs:`to_html` for argument details.
|
||||
"""
|
||||
if max_rows is None:
|
||||
warnings.warn("DataFrame.to_string called without max_rows set "
|
||||
@ -568,7 +573,13 @@ class DataFrame(NDFrame):
|
||||
max_rows=None, max_cols=None, show_dimensions=False,
|
||||
decimal='.', line_width=None):
|
||||
"""
|
||||
From pandas - except we set max_rows default to avoid careless extraction of entire index
|
||||
Render a DataFrame to a console-friendly tabular output.
|
||||
|
||||
Follows pandas implementation except we set max_rows default to avoid careless extraction of entire index.
|
||||
|
||||
See Also
|
||||
--------
|
||||
:pandas_api_docs:`to_string` for argument details.
|
||||
"""
|
||||
if max_rows is None:
|
||||
warnings.warn("DataFrame.to_string called without max_rows set "
|
||||
@ -718,6 +729,13 @@ class DataFrame(NDFrame):
|
||||
quotechar='"', line_terminator=None, chunksize=None,
|
||||
tupleize_cols=None, date_format=None, doublequote=True,
|
||||
escapechar=None, decimal='.'):
|
||||
"""
|
||||
Write Elasticsearch data to a comma-separated values (csv) file.
|
||||
|
||||
See Also
|
||||
--------
|
||||
:pandas_api_docs:`to_csv` for argument details.
|
||||
"""
|
||||
kwargs = {
|
||||
"path_or_buf": path_or_buf,
|
||||
"sep": sep,
|
||||
@ -754,16 +772,34 @@ class DataFrame(NDFrame):
|
||||
def _empty_pd_df(self):
|
||||
return self._query_compiler._empty_pd_ef()
|
||||
|
||||
def squeeze(self, axis=None):
|
||||
return DataFrame(
|
||||
query_compiler=self._query_compiler.squeeze(axis)
|
||||
)
|
||||
|
||||
def select_dtypes(self, include=None, exclude=None):
|
||||
"""
|
||||
Return a subset of the DataFrame's columns based on the column dtypes.
|
||||
|
||||
Compatible with :pandas_api_docs:`pandas.DataFrame.select_dtypes`
|
||||
|
||||
Returns
|
||||
-------
|
||||
eland.DataFrame
|
||||
DataFrame contains only columns of selected dtypes
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights',
|
||||
... columns=['AvgTicketPrice', 'Dest', 'Cancelled', 'timestamp', 'dayOfWeek'])
|
||||
>>> df.dtypes
|
||||
AvgTicketPrice float64
|
||||
Dest object
|
||||
Cancelled bool
|
||||
timestamp datetime64[ns]
|
||||
dayOfWeek int64
|
||||
dtype: object
|
||||
>>> df = df.select_dtypes(include=[np.number, 'datetime'])
|
||||
>>> df.dtypes
|
||||
AvgTicketPrice float64
|
||||
timestamp datetime64[ns]
|
||||
dayOfWeek int64
|
||||
dtype: object
|
||||
"""
|
||||
empty_df = self._empty_pd_df()
|
||||
|
||||
@ -779,8 +815,20 @@ class DataFrame(NDFrame):
|
||||
Returns
|
||||
-------
|
||||
shape: tuple
|
||||
0 - number of rows
|
||||
1 - number of columns
|
||||
|
||||
0. number of rows
|
||||
1. number of columns
|
||||
|
||||
Notes
|
||||
-----
|
||||
- number of rows ``len(df)`` queries Elasticsearch
|
||||
- number of columns ``len(df.columns)`` is cached. If mappings are updated, DataFrame must be updated.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.read_es('localhost', 'ecommerce')
|
||||
>>> df.shape
|
||||
(4675, 45)
|
||||
"""
|
||||
num_rows = len(self)
|
||||
num_columns = len(self.columns)
|
||||
@ -891,9 +939,11 @@ class DataFrame(NDFrame):
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = ed.DataFrame('localhost', 'flights')
|
||||
>>> df = df.query('FlightDelayMin > 60')
|
||||
>>> df.info()
|
||||
>>> df = ed.read_es('localhost', 'flights')
|
||||
>>> df.shape
|
||||
(13059, 27)
|
||||
>>> df.query('FlightDelayMin > 60').shape
|
||||
(2730, 27)
|
||||
"""
|
||||
if isinstance(expr, BooleanFilter):
|
||||
return DataFrame(
|
||||
|
@ -539,10 +539,6 @@ class Operations:
|
||||
task = ('iloc', (index, columns))
|
||||
self._tasks.append(task)
|
||||
|
||||
def squeeze(self, axis):
|
||||
task = ('squeeze', axis)
|
||||
self._tasks.append(task)
|
||||
|
||||
def index_count(self, query_compiler, field):
|
||||
# field is the index field so count values
|
||||
query_params, post_processing = self._resolve_tasks()
|
||||
@ -660,8 +656,6 @@ class Operations:
|
||||
if column_indexer is None:
|
||||
column_indexer = slice(None)
|
||||
df = df.iloc[index_indexer, column_indexer]
|
||||
elif action[0] == 'squeeze':
|
||||
df = df.squeeze(axis=action[1])
|
||||
# columns could be in here (and we ignore it)
|
||||
|
||||
return df
|
||||
|
@ -369,13 +369,6 @@ class ElandQueryCompiler:
|
||||
|
||||
return result
|
||||
|
||||
def squeeze(self, axis=None):
|
||||
result = self.copy()
|
||||
|
||||
result._operations.squeeze(axis)
|
||||
|
||||
return result
|
||||
|
||||
def view(self, index=None, columns=None):
|
||||
result = self.copy()
|
||||
|
||||
|
@ -37,7 +37,7 @@ class TestDataFrameDateTime(TestData):
|
||||
# Now create index
|
||||
index_name = 'eland_test_generate_es_mappings'
|
||||
|
||||
ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
|
||||
ed_df = ed.pandas_to_eland(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
|
||||
ed_df_head = ed_df.head()
|
||||
|
||||
assert_pandas_eland_frame_equal(df, ed_df_head)
|
||||
|
@ -14,11 +14,11 @@ class TestDataFrameDescribe(TestData):
|
||||
pd_describe = pd_flights.describe()
|
||||
ed_describe = ed_flights.describe()
|
||||
|
||||
assert_almost_equal(pd_describe[['AvgTicketPrice']],
|
||||
ed_describe[['AvgTicketPrice']],
|
||||
assert_almost_equal(pd_describe.drop(['25%','50%','75%'], axis='index'),
|
||||
ed_describe.drop(['25%','50%','75%'], axis='index'),
|
||||
check_less_precise=True)
|
||||
|
||||
# TODO - this fails for all fields now as ES aggregations are approximate
|
||||
# TODO - this fails for percentile fields as ES aggregations are approximate
|
||||
# if ES percentile agg uses
|
||||
# "hdr": {
|
||||
# "number_of_significant_value_digits": 3
|
||||
|
@ -1,8 +1,11 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import assert_pandas_eland_frame_equal
|
||||
|
||||
|
||||
class TestDataFrameDtypes(TestData):
|
||||
@ -12,3 +15,12 @@ class TestDataFrameDtypes(TestData):
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
assert_series_equal(pd_flights.dtypes, ed_flights.dtypes)
|
||||
|
||||
def test_flights_select_dtypes(self):
|
||||
ed_flights = self.ed_flights_small()
|
||||
pd_flights = self.pd_flights_small()
|
||||
|
||||
assert_pandas_eland_frame_equal(
|
||||
pd_flights.select_dtypes(include=np.number),
|
||||
ed_flights.select_dtypes(include=np.number)
|
||||
)
|
||||
|
@ -19,7 +19,7 @@ class TestDataFrameQuery(TestData):
|
||||
# Now create index
|
||||
index_name = 'eland_test_query'
|
||||
|
||||
ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
|
||||
ed_df = ed.pandas_to_eland(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
|
||||
|
||||
assert_pandas_eland_frame_equal(pd_df, ed_df)
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
import pytest
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
@ -12,7 +14,8 @@ class TestDataFrameRepr(TestData):
|
||||
ed_head_101 = ed_flights.head(101)
|
||||
pd_head_101 = pd_flights.head(101)
|
||||
|
||||
# This sets max_rows=60 by default
|
||||
# This sets max_rows=60 by default (but emits a UserWarning)
|
||||
with pytest.warns(UserWarning):
|
||||
ed_head_101_str = ed_head_101.to_string()
|
||||
pd_head_101_str = pd_head_101.to_string(max_rows=60)
|
||||
|
||||
|
@ -36,7 +36,7 @@ class TestDataFrameUtils(TestData):
|
||||
# Now create index
|
||||
index_name = 'eland_test_generate_es_mappings'
|
||||
|
||||
ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
|
||||
ed_df = ed.pandas_to_eland(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
|
||||
ed_df_head = ed_df.head()
|
||||
|
||||
assert_pandas_eland_frame_equal(df, ed_df_head)
|
||||
|
@ -1,5 +1,7 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
import pytest
|
||||
|
||||
from matplotlib.testing.decorators import check_figures_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
@ -12,8 +14,12 @@ def test_plot_hist(fig_test, fig_ref):
|
||||
pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
|
||||
ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
|
||||
|
||||
# This emits a UserWarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
|
||||
with pytest.warns(UserWarning):
|
||||
pd_ax = fig_ref.subplots()
|
||||
pd_flights.hist(ax=pd_ax)
|
||||
|
||||
# This emits a UserWarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
|
||||
with pytest.warns(UserWarning):
|
||||
ed_ax = fig_test.subplots()
|
||||
ed_flights.hist(ax=ed_ax)
|
||||
|
@ -26,12 +26,12 @@ def read_es(es_params, index_pattern):
|
||||
|
||||
See Also
|
||||
--------
|
||||
eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame
|
||||
eland.ed_to_pd: Create a pandas.Dataframe from eland.DataFrame
|
||||
eland.pandas_to_eland: Create an eland.DataFrame from pandas.DataFrame
|
||||
eland.eland_to_pandas: Create a pandas.DataFrame from eland.DataFrame
|
||||
"""
|
||||
return DataFrame(client=es_params, index_pattern=index_pattern)
|
||||
|
||||
def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
|
||||
def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
|
||||
geo_points=None):
|
||||
"""
|
||||
Append a pandas DataFrame to an Elasticsearch index.
|
||||
@ -66,11 +66,11 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
|
||||
See Also
|
||||
--------
|
||||
eland.read_es: Create an eland.DataFrame from an Elasticsearch index
|
||||
eland.ed_to_pd: Create a pandas.Dataframe from eland.DataFrame
|
||||
eland.eland_to_pandas: Create a pandas.DataFrame from eland.DataFrame
|
||||
"""
|
||||
client = Client(es_params)
|
||||
|
||||
mapping = Mappings._generate_es_mappings(df, geo_points)
|
||||
mapping = Mappings._generate_es_mappings(pd_df, geo_points)
|
||||
|
||||
# If table exists, check if_exists parameter
|
||||
if client.index_exists(index=destination_index):
|
||||
@ -92,7 +92,7 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
|
||||
# Now add data
|
||||
actions = []
|
||||
n = 0
|
||||
for row in df.iterrows():
|
||||
for row in pd_df.iterrows():
|
||||
# Use index as _id
|
||||
id = row[0]
|
||||
|
||||
@ -118,7 +118,7 @@ def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=1000
|
||||
|
||||
return ed_df
|
||||
|
||||
def ed_to_pd(ed_df):
|
||||
def eland_to_pandas(ed_df):
|
||||
"""
|
||||
Convert an eland.DataFrame to a pandas.DataFrame
|
||||
|
||||
@ -138,7 +138,7 @@ def ed_to_pd(ed_df):
|
||||
See Also
|
||||
--------
|
||||
eland.read_es: Create an eland.DataFrame from an Elasticsearch index
|
||||
eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame
|
||||
eland.pandas_to_eland: Create an eland.DataFrame from pandas.DataFrame
|
||||
"""
|
||||
return ed_df._to_pandas()
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user