From e181476dfe25a2abbe3dcf16ea663d2cbf09eab2 Mon Sep 17 00:00:00 2001 From: Stephen Dodson Date: Tue, 12 Nov 2019 20:26:59 +0000 Subject: [PATCH] First effort at tidying up docs. Still work-in-progress. --- NOTES.md | 2 +- docs/Makefile | 20 +++ docs/make.bat | 35 ++++ docs/source/conf.py | 79 +++++++++ docs/source/index.rst | 30 ++++ .../reference/api/eland.DataFrame.columns.rst | 6 + .../reference/api/eland.DataFrame.head.rst | 6 + .../reference/api/eland.DataFrame.index.rst | 6 + docs/source/reference/api/eland.DataFrame.rst | 18 ++ .../reference/api/eland.DataFrame.tail.rst | 6 + docs/source/reference/api/eland.ed_to_pd.rst | 6 + docs/source/reference/api/eland.pd_to_ed.rst | 6 + docs/source/reference/api/eland.read_es.rst | 6 + docs/source/reference/dataframe.rst | 35 ++++ .../reference/general_utility_functions.rst | 21 +++ docs/source/reference/index.rst | 14 ++ eland/dataframe.py | 165 +++++++++++++++++- eland/ndframe.py | 6 + eland/series.py | 4 +- eland/tests/dataframe/test_datetime_pytest.py | 4 +- eland/tests/dataframe/test_init_pytest.py | 31 ++++ eland/tests/dataframe/test_query_pytest.py | 3 +- eland/tests/dataframe/test_utils_pytest.py | 8 +- eland/utils.py | 109 +++++++++--- make_docs.sh | 9 + requirements-dev.txt | 3 + 26 files changed, 591 insertions(+), 47 deletions(-) create mode 100644 docs/Makefile create mode 100644 docs/make.bat create mode 100644 docs/source/conf.py create mode 100644 docs/source/index.rst create mode 100644 docs/source/reference/api/eland.DataFrame.columns.rst create mode 100644 docs/source/reference/api/eland.DataFrame.head.rst create mode 100644 docs/source/reference/api/eland.DataFrame.index.rst create mode 100644 docs/source/reference/api/eland.DataFrame.rst create mode 100644 docs/source/reference/api/eland.DataFrame.tail.rst create mode 100644 docs/source/reference/api/eland.ed_to_pd.rst create mode 100644 docs/source/reference/api/eland.pd_to_ed.rst create mode 100644 docs/source/reference/api/eland.read_es.rst 
create mode 100644 docs/source/reference/dataframe.rst create mode 100644 docs/source/reference/general_utility_functions.rst create mode 100644 docs/source/reference/index.rst create mode 100644 eland/tests/dataframe/test_init_pytest.py create mode 100644 make_docs.sh diff --git a/NOTES.md b/NOTES.md index 6b71e1a..7fa3635 100644 --- a/NOTES.md +++ b/NOTES.md @@ -47,7 +47,7 @@ the `pandas.DataFrame` API. This resolves some of the issues above as: than a new index * Instead of supporting the enitre `pandas.DataFrame` API we can support a subset appropriate for -Elasticsearch. If addition calls are required, we could to create a `eland.DataFrame.to_pandas()` +Elasticsearch. If additional calls are required, we could create an `eland.DataFrame._to_pandas()` method which would explicitly export all data to a `pandas.DataFrame` * Creating a new `eland.DataFrame` API gives us full flexibility in terms of implementation. However, diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..6247f7e --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..f37c4d8 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,79 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +import os +import sys +sys.path.insert(0, os.path.abspath("../sphinxext")) +sys.path.extend( + [ + # numpy standard doc extensions + os.path.join(os.path.dirname(__file__), "..", "../..", "sphinxext") + ] +) + + + +# -- Project information ----------------------------------------------------- + +project = 'eland' +copyright = '2019, Stephen Dodson' +author = 'Stephen Dodson' + +# The full version, including alpha/beta/rc tags +release = '0.1' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + "sphinx.ext.doctest", + 'numpydoc' +] + +doctest_global_setup = ''' +try: + import eland as ed +except ImportError: + ed = None +try: + import pandas as pd +except ImportError: + pd = None +''' + +numpydoc_attributes_as_param_list = False + + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..871dd61 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,30 @@ +.. 
eland documentation master file, created by + +.. module:: eland + +**************************************************************** +eland: pandas-like data analysis toolkit backed by Elasticsearch +**************************************************************** + +**Date**: |today| **Version**: |version| + +**Useful links**: +`Source Repository `__ | +`Issues & Ideas `__ | +`Q&A Support `__ | + +:mod:`eland` is an open source, Apache2-licensed elasticsearch Python client to analyse, explore and manipulate data that resides in elasticsearch. +Where possible the package uses existing Python APIs and data structures to make it easy to switch between Numpy, Pandas, Scikit-learn to their elasticsearch powered equivalents. +In general, the data resides in elasticsearch and not in memory, which allows eland to access large datasets stored in elasticsearch. + + +.. toctree:: + :maxdepth: 2 + :hidden: + + reference/index + +* :doc:`reference/index` + + * :doc:`reference/general_utility_functions` + * :doc:`reference/dataframe` diff --git a/docs/source/reference/api/eland.DataFrame.columns.rst b/docs/source/reference/api/eland.DataFrame.columns.rst new file mode 100644 index 0000000..8bcdf83 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.columns.rst @@ -0,0 +1,6 @@ +eland.DataFrame.columns +======================= + +.. currentmodule:: eland + +.. autoattribute:: DataFrame.columns diff --git a/docs/source/reference/api/eland.DataFrame.head.rst b/docs/source/reference/api/eland.DataFrame.head.rst new file mode 100644 index 0000000..16d4173 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.head.rst @@ -0,0 +1,6 @@ +eland.DataFrame.head +==================== + +.. currentmodule:: eland + +.. 
automethod:: DataFrame.head diff --git a/docs/source/reference/api/eland.DataFrame.index.rst b/docs/source/reference/api/eland.DataFrame.index.rst new file mode 100644 index 0000000..c3d0ab0 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.index.rst @@ -0,0 +1,6 @@ +eland.DataFrame.index +===================== + +.. currentmodule:: eland + +.. autoattribute:: DataFrame.index diff --git a/docs/source/reference/api/eland.DataFrame.rst b/docs/source/reference/api/eland.DataFrame.rst new file mode 100644 index 0000000..8929d81 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.rst @@ -0,0 +1,18 @@ +eland.DataFrame +================ + +.. currentmodule:: eland + +.. autoclass:: DataFrame + + + + +.. + HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + + DataFrame.abs + DataFrame.add + diff --git a/docs/source/reference/api/eland.DataFrame.tail.rst b/docs/source/reference/api/eland.DataFrame.tail.rst new file mode 100644 index 0000000..b4ec087 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.tail.rst @@ -0,0 +1,6 @@ +eland.DataFrame.tail +==================== + +.. currentmodule:: eland + +.. automethod:: DataFrame.tail diff --git a/docs/source/reference/api/eland.ed_to_pd.rst b/docs/source/reference/api/eland.ed_to_pd.rst new file mode 100644 index 0000000..55dcf64 --- /dev/null +++ b/docs/source/reference/api/eland.ed_to_pd.rst @@ -0,0 +1,6 @@ +eland.ed_to_pd +============== + +.. currentmodule:: eland + +.. autofunction:: ed_to_pd diff --git a/docs/source/reference/api/eland.pd_to_ed.rst b/docs/source/reference/api/eland.pd_to_ed.rst new file mode 100644 index 0000000..615c987 --- /dev/null +++ b/docs/source/reference/api/eland.pd_to_ed.rst @@ -0,0 +1,6 @@ +eland.pd_to_ed +============== + +.. currentmodule:: eland + +.. 
autofunction:: pd_to_ed diff --git a/docs/source/reference/api/eland.read_es.rst b/docs/source/reference/api/eland.read_es.rst new file mode 100644 index 0000000..e31751e --- /dev/null +++ b/docs/source/reference/api/eland.read_es.rst @@ -0,0 +1,6 @@ +eland.read_es +============= + +.. currentmodule:: eland + +.. autofunction:: read_es diff --git a/docs/source/reference/dataframe.rst b/docs/source/reference/dataframe.rst new file mode 100644 index 0000000..f4510b3 --- /dev/null +++ b/docs/source/reference/dataframe.rst @@ -0,0 +1,35 @@ +.. _api.dataframe: + +========= +DataFrame +========= +.. currentmodule:: eland + +Constructor +~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame + +Attributes and underlying data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +**Axes** + +.. autosummary:: + :toctree: api/ + + DataFrame.index + DataFrame.columns + +Indexing, iteration +~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.head + DataFrame.tail + + + + diff --git a/docs/source/reference/general_utility_functions.rst b/docs/source/reference/general_utility_functions.rst new file mode 100644 index 0000000..63e1865 --- /dev/null +++ b/docs/source/reference/general_utility_functions.rst @@ -0,0 +1,21 @@ +.. _api.general_utility_functions: + +========================= +General utility functions +========================= +.. currentmodule:: eland + +Elasticsearch access +~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + read_es + +Pandas and Eland +~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + pd_to_ed + ed_to_pd diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst new file mode 100644 index 0000000..8f79abe --- /dev/null +++ b/docs/source/reference/index.rst @@ -0,0 +1,14 @@ +.. _api: + +============= +API reference +============= + +This page gives an overview of all public eland objects, functions and +methods. All classes and functions exposed in ``eland.*`` namespace are public. + +.. 
toctree:: + :maxdepth: 2 + + general_utility_functions + dataframe diff --git a/eland/dataframe.py b/eland/dataframe.py index 9e58759..516391a 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -19,17 +19,86 @@ from eland import NDFrame from eland import Series from eland.filter import BooleanFilter, ScriptFilter - class DataFrame(NDFrame): - # This is effectively 2 constructors - # 1. client, index_pattern, columns, index_field - # 2. query_compiler + """ + Two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes + (rows and columns) referencing data stored in Elasticsearch indices. + Where possible APIs mirror pandas.DataFrame APIs. + The underlying data is stored in Elasticsearch rather than core memory. + + Parameters + ---------- + client: Elasticsearch client argument(s) (e.g. 'localhost:9200') + - elasticsearch-py parameters or + - elasticsearch-py instance or + - eland.Client instance + index_pattern: str + Elasticsearch index pattern (e.g. 'flights' or 'filebeat-\*') + columns: list of str, optional + List of DataFrame columns. A subset of the Elasticsearch index's fields. + index_field: str, optional + The Elasticsearch index field to use as the DataFrame index. Defaults to _id if None is used. + + Examples + -------- + Constructing DataFrame from an Elasticsearch configuration arguments and an Elasticsearch index + + >>> df = ed.DataFrame('localhost:9200', 'flights') + >>> df.head() + AvgTicketPrice Cancelled Carrier Dest ... OriginRegion OriginWeather dayOfWeek timestamp + 0 841.265642 False Kibana Airlines Sydney Kingsford Smith International Airport ... DE-HE Sunny 0 2018-01-01 00:00:00 + 1 882.982662 False Logstash Airways Venice Marco Polo Airport ... SE-BD Clear 0 2018-01-01 18:27:00 + 2 190.636904 False Logstash Airways Venice Marco Polo Airport ... IT-34 Rain 0 2018-01-01 17:11:14 + 3 181.694216 True Kibana Airlines Treviso-Sant'Angelo Airport ... 
IT-72 Thunder & Lightning 0 2018-01-01 10:33:28 + 4 730.041778 False Kibana Airlines Xi'an Xianyang International Airport ... MX-DIF Damaging Wind 0 2018-01-01 05:13:00 + + [5 rows x 27 columns] + + Constructing DataFrame from an Elasticsearch client and an Elasticsearch index + + >>> from elasticsearch import Elasticsearch + >>> es = Elasticsearch("localhost:9200") + >>> df = ed.DataFrame(client=es, index_pattern='flights', columns=['AvgTicketPrice', 'Cancelled']) + >>> df.head() + AvgTicketPrice Cancelled + 0 841.265642 False + 1 882.982662 False + 2 190.636904 False + 3 181.694216 True + 4 730.041778 False + + [5 rows x 2 columns] + + Constructing DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the DataFrame index field + + >>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'], index_field='timestamp') + >>> df.head() + AvgTicketPrice timestamp + 2018-01-01T00:00:00 841.265642 2018-01-01 00:00:00 + 2018-01-01T00:02:06 772.100846 2018-01-01 00:02:06 + 2018-01-01T00:06:27 159.990962 2018-01-01 00:06:27 + 2018-01-01T00:33:31 800.217104 2018-01-01 00:33:31 + 2018-01-01T00:36:51 803.015200 2018-01-01 00:36:51 + + [5 rows x 2 columns] + """ def __init__(self, client=None, index_pattern=None, columns=None, index_field=None, query_compiler=None): + """ + There are effectively 2 constructors: + + 1. client, index_pattern, columns, index_field + 2. query_compiler (eland.ElandQueryCompiler) + + The constructor with 'query_compiler' is for internal use only. + """ + if query_compiler is None: + if client is None or index_pattern is None: + raise ValueError("client and index_pattern must be defined in DataFrame constructor") # python 3 syntax super().__init__( client=client, @@ -39,6 +108,27 @@ class DataFrame(NDFrame): query_compiler=query_compiler) def _get_columns(self): + """ + The column labels of the DataFrame. 
+ + Returns + ------- + Elasticsearch field names as pandas.Index + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights') + >>> assert isinstance(df.columns, pd.Index) + >>> df.columns + Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', + ... 'DestCityName', 'DestCountry', 'DestLocation', 'DestRegion', + ... 'DestWeather', 'DistanceKilometers', 'DistanceMiles', 'FlightDelay', + ... 'FlightDelayMin', 'FlightDelayType', 'FlightNum', 'FlightTimeHour', + ... 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName', + ... 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', + ... 'dayOfWeek', 'timestamp'], + ... dtype='object') + """ return self._query_compiler.columns columns = property(_get_columns) @@ -51,14 +141,70 @@ class DataFrame(NDFrame): True if the DataFrame is empty. False otherwise. """ - # TODO - this is called on every attribute get (most methods) from modin/pandas/base.py:3337 - # (as Index.__len__ performs an query) we may want to cache self.index.empty() return len(self.columns) == 0 or len(self.index) == 0 def head(self, n=5): + """ + Return the first n rows. + + This function returns the first n rows for the object based on position. + The row order is sorted by index field. + It is useful for quickly testing if your object has the right type of data in it. + + Parameters + ---------- + n: int, default 5 + Number of rows to select. 
+ + Returns + ------- + eland.DataFrame + eland DataFrame filtered on first n rows sorted by index field + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest']) + >>> df.head(3) + Origin Dest + 0 Frankfurt am Main Airport Sydney Kingsford Smith International Airport + 1 Cape Town International Airport Venice Marco Polo Airport + 2 Venice Marco Polo Airport Venice Marco Polo Airport + + [3 rows x 2 columns] + """ return DataFrame(query_compiler=self._query_compiler.head(n)) def tail(self, n=5): + """ + Return the last n rows. + + This function returns the last n rows for the object based on position. + The row order is sorted by index field. + It is useful for quickly testing if your object has the right type of data in it. + + Parameters + ---------- + n: int, default 5 + Number of rows to select. + + Returns + ------- + eland.DataFrame: + eland DataFrame filtered on last n rows sorted by index field + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest']) + >>> df.tail() + Origin Dest + 13054 Pisa International Airport Xi'an Xianyang International Airport + 13055 Winnipeg / James Armstrong Richardson Internat... 
Zurich Airport + 13056 Licenciado Benito Juarez International Airport Ukrainka Air Base + 13057 Itami Airport Ministro Pistarini International Airport + 13058 Adelaide International Airport Washington Dulles International Airport + + [5 rows x 2 columns] + """ return DataFrame(query_compiler=self._query_compiler.tail(n)) def __repr__(self): @@ -459,6 +605,13 @@ class DataFrame(NDFrame): return self._query_compiler.to_csv(**kwargs) def _to_pandas(self): + """ + Utility method to convert eland.Dataframe to pandas.Dataframe + + Returns + ------- + pandas.DataFrame + """ return self._query_compiler.to_pandas() def _empty_pd_df(self): diff --git a/eland/ndframe.py b/eland/ndframe.py index 4b46e63..3c8f53b 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -56,6 +56,12 @@ class NDFrame: self._query_compiler = query_compiler def _get_index(self): + """ + + Returns + ------- + + """ return self._query_compiler.index index = property(_get_index) diff --git a/eland/series.py b/eland/series.py index 198f5b5..66f27e3 100644 --- a/eland/series.py +++ b/eland/series.py @@ -91,8 +91,6 @@ class Series(NDFrame): True if the Series is empty. False otherwise. 
""" - # TODO - this is called on every attribute get (most methods) from modin/pandas/base.py:3337 - # (as Index.__len__ performs an query) we may want to cache self.index.empty() return len(self.index) == 0 def _get_name(self): @@ -152,7 +150,7 @@ class Series(NDFrame): ) def _to_pandas(self): - return self._query_compiler.to_pandas()[self.name] + return self._query_compiler._to_pandas()[self.name] def __gt__(self, other): if isinstance(other, Series): diff --git a/eland/tests/dataframe/test_datetime_pytest.py b/eland/tests/dataframe/test_datetime_pytest.py index 5f4d580..ae7fe8a 100644 --- a/eland/tests/dataframe/test_datetime_pytest.py +++ b/eland/tests/dataframe/test_datetime_pytest.py @@ -37,9 +37,7 @@ class TestDataFrameDateTime(TestData): # Now create index index_name = 'eland_test_generate_es_mappings' - ed.pandas_to_es(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) - - ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name) + ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) ed_df_head = ed_df.head() assert_pandas_eland_frame_equal(df, ed_df_head) diff --git a/eland/tests/dataframe/test_init_pytest.py b/eland/tests/dataframe/test_init_pytest.py new file mode 100644 index 0000000..9754b0f --- /dev/null +++ b/eland/tests/dataframe/test_init_pytest.py @@ -0,0 +1,31 @@ +# File called _pytest for PyCharm compatability + +import eland as ed + +import pytest + +from eland.tests import ELASTICSEARCH_HOST +from eland.tests import FLIGHTS_INDEX_NAME + +class TestDataFrameInit: + + def test_init(self): + # Construct empty DataFrame (throws) + with pytest.raises(ValueError): + df = ed.DataFrame() + + # Construct invalid DataFrame (throws) + with pytest.raises(ValueError): + df = ed.DataFrame(client=ELASTICSEARCH_HOST) + + # Construct invalid DataFrame (throws) + with pytest.raises(ValueError): + df = ed.DataFrame(index_pattern=FLIGHTS_INDEX_NAME) + + # Good constructors + df0 = 
ed.DataFrame(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME) + df1 = ed.DataFrame(client=ELASTICSEARCH_HOST, index_pattern=FLIGHTS_INDEX_NAME) + + qc = ed.ElandQueryCompiler(client=ELASTICSEARCH_HOST, index_pattern=FLIGHTS_INDEX_NAME) + df2 = ed.DataFrame(query_compiler=qc) + diff --git a/eland/tests/dataframe/test_query_pytest.py b/eland/tests/dataframe/test_query_pytest.py index 7cab24b..cabac07 100644 --- a/eland/tests/dataframe/test_query_pytest.py +++ b/eland/tests/dataframe/test_query_pytest.py @@ -19,8 +19,7 @@ class TestDataFrameQuery(TestData): # Now create index index_name = 'eland_test_query1' - ed.pandas_to_es(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) - ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name) + ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) assert_pandas_eland_frame_equal(pd_df, ed_df) diff --git a/eland/tests/dataframe/test_utils_pytest.py b/eland/tests/dataframe/test_utils_pytest.py index a2ce298..021f32e 100644 --- a/eland/tests/dataframe/test_utils_pytest.py +++ b/eland/tests/dataframe/test_utils_pytest.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd import eland as ed -from eland.tests.common import ELASTICSEARCH_HOST +from eland.tests.common import ELASTICSEARCH_HOST, assert_pandas_eland_frame_equal from eland.tests.common import TestData @@ -36,9 +36,7 @@ class TestDataFrameUtils(TestData): # Now create index index_name = 'eland_test_generate_es_mappings' - ed.pandas_to_es(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) - - ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name) + ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) ed_df_head = ed_df.head() - # assert_frame_equal(df, ed_df_head) + assert_pandas_eland_frame_equal(df, ed_df_head) diff --git a/eland/utils.py b/eland/utils.py index 98774c3..1299f6c 100644 --- a/eland/utils.py +++ b/eland/utils.py @@ -2,44 +2,71 @@ from eland import 
Client from eland import DataFrame from eland import Mappings +import pandas as pd + def read_es(es_params, index_pattern): - return DataFrame(client=es_params, index_pattern=index_pattern) - - -def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False, - geo_points=None): """ - Append a pandas DataFrame to an Elasticsearch index. - Mainly used in testing. + Utility method to create an eland.Dataframe from an Elasticsearch index_pattern. + (Similar to pandas.read_csv, but source data is an Elasticsearch index rather than + a csv file) Parameters ---------- - es_params : Elasticsearch client argument - elasticsearch-py parameters or - elasticsearch-py instance or - eland.Client instance + es_params: Elasticsearch client argument(s) + - elasticsearch-py parameters or + - elasticsearch-py instance or + - eland.Client instance + index_pattern: str + Elasticsearch index pattern - destination_index : str - Name of Elasticsearch index to be written + Returns + ------- + eland.DataFrame - if_exists : str, default 'fail' - Behavior when the destination index exists. Value can be one of: - ``'fail'`` - If table exists, do nothing. - ``'replace'`` - If table exists, drop it, recreate it, and insert data. - ``'append'`` - If table exists, insert data. Create if does not exist. + See Also + -------- + eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame + eland.ed_to_pd: Create a pandas.Dataframe from eland.DataFrame + """ + return DataFrame(client=es_params, index_pattern=index_pattern) - dropna : bool - ``'True'`` - Remove missing values (see pandas.Series.dropna) - ``'False;`` - Include missing values - may cause bulk to fail +def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False, + geo_points=None): + """ + Append a pandas DataFrame to an Elasticsearch index. + Mainly used in testing. 
+ Modifies the elasticsearch destination index - geo_points : list or None + Parameters + ---------- + es_params: Elasticsearch client argument(s) + - elasticsearch-py parameters or + - elasticsearch-py instance or + - eland.Client instance + destination_index: str + Name of Elasticsearch index to be appended to + if_exists : {'fail', 'replace', 'append'}, default 'fail' + How to behave if the index already exists. + + - fail: Raise a ValueError. + - replace: Delete the index before inserting new values. + - append: Insert new values to the existing index. Create if does not exist. + dropna: bool, default 'False' + * True: Remove missing values (see pandas.Series.dropna) + * False: Include missing values - may cause bulk to fail + geo_points: list, default None List of columns to map to geo_point data type + + Returns + ------- + eland.Dataframe + eland.DataFrame referencing data in destination_index + + See Also + -------- + eland.read_es: Create an eland.Dataframe from an Elasticsearch index + eland.ed_to_pd: Create a pandas.Dataframe from eland.DataFrame """ client = Client(es_params) @@ -86,3 +113,31 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size= actions = [] client.bulk(actions, refresh=refresh) + + ed_df = DataFrame(client, destination_index) + + return ed_df + +def ed_to_pd(ed_df): + """ + Convert an eland.Dataframe to a pandas.DataFrame + + **Note: this loads the entire Elasticsearch index into in core pandas.DataFrame structures. 
For large + indices this can create significant load on the Elasticsearch cluster and require significant memory** + + Parameters + ---------- + ed_df: eland.DataFrame + The source eland.DataFrame referencing the Elasticsearch index + + Returns + ------- + pandas.DataFrame + pandas.DataFrame contains all rows and columns in eland.DataFrame + + See Also + -------- + eland.read_es: Create an eland.DataFrame from an Elasticsearch index + eland.pd_to_ed: Create an eland.DataFrame from pandas.DataFrame + """ + return ed_df._to_pandas() diff --git a/make_docs.sh b/make_docs.sh new file mode 100644 index 0000000..5134e70 --- /dev/null +++ b/make_docs.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +python setup.py install + +cd docs + +make clean +make html + diff --git a/requirements-dev.txt b/requirements-dev.txt index d18de0e..f6a7ec9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,6 @@ elasticsearch>=7.0.5 pandas==0.25.1 +matplotlib pytest>=5.2.1 +sphinx_rtd_theme +numpydoc==0.8