Mirror of https://github.com/elastic/eland.git (synced 2025-07-11 00:02:14 +08:00)

Merge pull request #31 from stevedodson/master

Creating docs framework.

Commit fd35fbd9f5

NOTES.md (2 lines)
@@ -47,7 +47,7 @@ the `pandas.DataFrame` API. This resolves some of the issues above as:
  than a new index
* Instead of supporting the entire `pandas.DataFrame` API we can support a subset appropriate for
  Elasticsearch. If additional calls are required, we could create a `eland.DataFrame.to_pandas()`
  Elasticsearch. If additional calls are required, we could create a `eland.DataFrame._to_pandas()`
  method which would explicitly export all data to a `pandas.DataFrame`

* Creating a new `eland.DataFrame` API gives us full flexibility in terms of implementation. However,
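The design point above (supporting a pandas-like subset, with an explicit export for anything else) can be sketched as follows. This is a minimal illustration, not part of the commit, assuming a local cluster with the Kibana `flights` sample data; `ed_to_pd` is the public export helper added in this commit's `eland/utils.py`:

    import eland as ed

    # Supported subset: operations are executed against Elasticsearch, data stays in the cluster
    df = ed.DataFrame('localhost:9200', 'flights')
    print(df.head())

    # Anything outside the subset: explicitly export to an in-memory pandas.DataFrame first
    pd_df = ed.ed_to_pd(df)
    print(pd_df.describe())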
docs/Makefile (new file, 20 lines)
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
docs/make.bat (new file, 35 lines)
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
docs/source/conf.py (new file, 79 lines)
@@ -0,0 +1,79 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath("../sphinxext"))
sys.path.extend(
    [
        # numpy standard doc extensions
        os.path.join(os.path.dirname(__file__), "..", "../..", "sphinxext")
    ]
)


# -- Project information -----------------------------------------------------

project = 'eland'
copyright = '2019, Stephen Dodson'
author = 'Stephen Dodson'

# The full version, including alpha/beta/rc tags
release = '0.1'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.doctest',
    'numpydoc'
]

doctest_global_setup = '''
try:
    import eland as ed
except ImportError:
    ed = None
try:
    import pandas as pd
except ImportError:
    pd = None
'''

numpydoc_attributes_as_param_list = False


# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
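The `doctest_global_setup` above is what lets the docstring examples elsewhere in this commit refer to `ed` and `pd` without importing them in every example. A small, hypothetical docstring written against that setup (illustrative only, not from the repository):

    def example():
        """
        Docstring examples can use ``ed`` and ``pd`` directly, because conf.py
        injects them via ``doctest_global_setup`` when sphinx.ext.doctest runs.

        Examples
        --------
        >>> pd_df = pd.DataFrame({'a': [1, 2, 3]})
        >>> pd_df.shape
        (3, 1)
        """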
docs/source/index.rst (new file, 30 lines)
@@ -0,0 +1,30 @@
.. eland documentation master file, created by

.. module:: eland

****************************************************************
eland: pandas-like data analysis toolkit backed by Elasticsearch
****************************************************************

**Date**: |today| **Version**: |version|

**Useful links**:
`Source Repository <https://github.com/elastic/eland>`__ |
`Issues & Ideas <https://github.com/elastic/eland/issues>`__ |
`Q&A Support <https://discuss.elastic.co>`__

:mod:`eland` is an open source, Apache2-licensed Elasticsearch Python client to analyse, explore and manipulate data that resides in Elasticsearch.
Where possible the package uses existing Python APIs and data structures to make it easy to switch between NumPy, pandas or scikit-learn and their Elasticsearch-powered equivalents.
In general, the data resides in Elasticsearch and not in memory, which allows eland to access large datasets stored in Elasticsearch.


.. toctree::
   :maxdepth: 2
   :hidden:

   reference/index

* :doc:`reference/index`

  * :doc:`reference/general_utility_functions`
  * :doc:`reference/dataframe`
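As a quick illustration of the workflow described above (a minimal sketch, not part of the commit, assuming a local Elasticsearch node with the Kibana flights sample data indexed as 'flights'):

    import eland as ed

    # The data stays in Elasticsearch; eland fetches only what is needed for display
    df = ed.read_es('localhost:9200', 'flights')
    print(df.head())
    print(df.columns)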
docs/source/reference/api/eland.DataFrame.columns.rst (new file, 6 lines)
@@ -0,0 +1,6 @@
eland.DataFrame.columns
=======================

.. currentmodule:: eland

.. autoattribute:: DataFrame.columns
docs/source/reference/api/eland.DataFrame.head.rst (new file, 6 lines)
@@ -0,0 +1,6 @@
eland.DataFrame.head
====================

.. currentmodule:: eland

.. automethod:: DataFrame.head
docs/source/reference/api/eland.DataFrame.index.rst (new file, 6 lines)
@@ -0,0 +1,6 @@
eland.DataFrame.index
=====================

.. currentmodule:: eland

.. autoattribute:: DataFrame.index
docs/source/reference/api/eland.DataFrame.rst (new file, 18 lines)
@@ -0,0 +1,18 @@
eland.DataFrame
===============

.. currentmodule:: eland

.. autoclass:: DataFrame

..
   HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.

   .. autosummary::
      :toctree:

      DataFrame.abs
      DataFrame.add
docs/source/reference/api/eland.DataFrame.tail.rst (new file, 6 lines)
@@ -0,0 +1,6 @@
eland.DataFrame.tail
====================

.. currentmodule:: eland

.. automethod:: DataFrame.tail
docs/source/reference/api/eland.ed_to_pd.rst (new file, 6 lines)
@@ -0,0 +1,6 @@
eland.ed_to_pd
==============

.. currentmodule:: eland

.. autofunction:: ed_to_pd
docs/source/reference/api/eland.pd_to_ed.rst (new file, 6 lines)
@@ -0,0 +1,6 @@
eland.pd_to_ed
==============

.. currentmodule:: eland

.. autofunction:: pd_to_ed
docs/source/reference/api/eland.read_es.rst (new file, 6 lines)
@@ -0,0 +1,6 @@
eland.read_es
=============

.. currentmodule:: eland

.. autofunction:: read_es
docs/source/reference/dataframe.rst (new file, 35 lines)
@@ -0,0 +1,35 @@
.. _api.dataframe:

=========
DataFrame
=========
.. currentmodule:: eland

Constructor
~~~~~~~~~~~
.. autosummary::
   :toctree: api/

   DataFrame

Attributes and underlying data
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**Axes**

.. autosummary::
   :toctree: api/

   DataFrame.index
   DataFrame.columns

Indexing, iteration
~~~~~~~~~~~~~~~~~~~
.. autosummary::
   :toctree: api/

   DataFrame.head
   DataFrame.tail
docs/source/reference/general_utility_functions.rst (new file, 21 lines)
@@ -0,0 +1,21 @@
.. _api.general_utility_functions:

=========================
General utility functions
=========================
.. currentmodule:: eland

Elasticsearch access
~~~~~~~~~~~~~~~~~~~~
.. autosummary::
   :toctree: api/

   read_es

Pandas and Eland
~~~~~~~~~~~~~~~~
.. autosummary::
   :toctree: api/

   pd_to_ed
   ed_to_pd
docs/source/reference/index.rst (new file, 14 lines)
@@ -0,0 +1,14 @@
.. _api:

=============
API reference
=============

This page gives an overview of all public eland objects, functions and
methods. All classes and functions exposed in the ``eland.*`` namespace are public.

.. toctree::
   :maxdepth: 2

   general_utility_functions
   dataframe
@@ -1,14 +1,14 @@
from __future__ import absolute_import

from eland.client import *
from eland.dataframe import *
from eland.filter import *
from eland.index import *
from eland.mappings import *
from eland.ndframe import *
from eland.operations import *
from eland.plotting import *
from eland.query import *
from eland.operations import *
from eland.query_compiler import *
from eland.plotting import *
from eland.ndframe import *
from eland.series import *
from eland.dataframe import *
from eland.utils import *
@@ -1,6 +1,5 @@
import sys
import warnings
from distutils.version import LooseVersion
from io import StringIO

import numpy as np
@@ -20,17 +19,86 @@ from eland import NDFrame
from eland import Series
from eland.filter import BooleanFilter, ScriptFilter


class DataFrame(NDFrame):
    # This is effectively 2 constructors
    # 1. client, index_pattern, columns, index_field
    # 2. query_compiler
    """
    Two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes
    (rows and columns) referencing data stored in Elasticsearch indices.
    Where possible APIs mirror pandas.DataFrame APIs.
    The underlying data is stored in Elasticsearch rather than core memory.

    Parameters
    ----------
    client: Elasticsearch client argument(s) (e.g. 'localhost:9200')
        - elasticsearch-py parameters or
        - elasticsearch-py instance or
        - eland.Client instance
    index_pattern: str
        Elasticsearch index pattern (e.g. 'flights' or 'filebeat-\*')
    columns: list of str, optional
        List of DataFrame columns. A subset of the Elasticsearch index's fields.
    index_field: str, optional
        The Elasticsearch index field to use as the DataFrame index. Defaults to _id if None is used.

    Examples
    --------
    Constructing a DataFrame from Elasticsearch configuration arguments and an Elasticsearch index

    >>> df = ed.DataFrame('localhost:9200', 'flights')
    >>> df.head()
       AvgTicketPrice  Cancelled           Carrier                                           Dest  ...  OriginRegion        OriginWeather  dayOfWeek            timestamp
    0      841.265642      False   Kibana Airlines  Sydney Kingsford Smith International Airport   ...         DE-HE                Sunny          0  2018-01-01 00:00:00
    1      882.982662      False  Logstash Airways                      Venice Marco Polo Airport  ...         SE-BD                Clear          0  2018-01-01 18:27:00
    2      190.636904      False  Logstash Airways                      Venice Marco Polo Airport  ...         IT-34                 Rain          0  2018-01-01 17:11:14
    3      181.694216       True   Kibana Airlines                    Treviso-Sant'Angelo Airport  ...         IT-72  Thunder & Lightning          0  2018-01-01 10:33:28
    4      730.041778      False   Kibana Airlines           Xi'an Xianyang International Airport  ...        MX-DIF        Damaging Wind          0  2018-01-01 05:13:00
    <BLANKLINE>
    [5 rows x 27 columns]

    Constructing a DataFrame from an Elasticsearch client and an Elasticsearch index

    >>> from elasticsearch import Elasticsearch
    >>> es = Elasticsearch("localhost:9200")
    >>> df = ed.DataFrame(client=es, index_pattern='flights', columns=['AvgTicketPrice', 'Cancelled'])
    >>> df.head()
       AvgTicketPrice  Cancelled
    0      841.265642      False
    1      882.982662      False
    2      190.636904      False
    3      181.694216       True
    4      730.041778      False
    <BLANKLINE>
    [5 rows x 2 columns]

    Constructing a DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the DataFrame index field

    >>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'], index_field='timestamp')
    >>> df.head()
                         AvgTicketPrice           timestamp
    2018-01-01T00:00:00      841.265642 2018-01-01 00:00:00
    2018-01-01T00:02:06      772.100846 2018-01-01 00:02:06
    2018-01-01T00:06:27      159.990962 2018-01-01 00:06:27
    2018-01-01T00:33:31      800.217104 2018-01-01 00:33:31
    2018-01-01T00:36:51      803.015200 2018-01-01 00:36:51
    <BLANKLINE>
    [5 rows x 2 columns]
    """
    def __init__(self,
                 client=None,
                 index_pattern=None,
                 columns=None,
                 index_field=None,
                 query_compiler=None):
        """
        There are effectively 2 constructors:

        1. client, index_pattern, columns, index_field
        2. query_compiler (eland.ElandQueryCompiler)

        The constructor with 'query_compiler' is for internal use only.
        """
        if query_compiler is None:
            if client is None or index_pattern is None:
                raise ValueError("client and index_pattern must be defined in DataFrame constructor")
        # python 3 syntax
        super().__init__(
            client=client,
@@ -40,6 +108,27 @@ class DataFrame(NDFrame):
            query_compiler=query_compiler)

    def _get_columns(self):
        """
        The column labels of the DataFrame.

        Returns
        -------
        Elasticsearch field names as pandas.Index

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights')
        >>> assert isinstance(df.columns, pd.Index)
        >>> df.columns
        Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID',
        ...    'DestCityName', 'DestCountry', 'DestLocation', 'DestRegion',
        ...    'DestWeather', 'DistanceKilometers', 'DistanceMiles', 'FlightDelay',
        ...    'FlightDelayMin', 'FlightDelayType', 'FlightNum', 'FlightTimeHour',
        ...    'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
        ...    'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather',
        ...    'dayOfWeek', 'timestamp'],
        ...   dtype='object')
        """
        return self._query_compiler.columns

    columns = property(_get_columns)
@@ -52,14 +141,70 @@ class DataFrame(NDFrame):
        True if the DataFrame is empty.
        False otherwise.
        """
        # TODO - this is called on every attribute get (most methods) from modin/pandas/base.py:3337
        # (as Index.__len__ performs a query) we may want to cache self.index.empty()
        return len(self.columns) == 0 or len(self.index) == 0

    def head(self, n=5):
        """
        Return the first n rows.

        This function returns the first n rows for the object based on position.
        The row order is sorted by index field.
        It is useful for quickly testing if your object has the right type of data in it.

        Parameters
        ----------
        n: int, default 5
            Number of rows to select.

        Returns
        -------
        eland.DataFrame
            eland DataFrame filtered on first n rows sorted by index field

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest'])
        >>> df.head(3)
                                    Origin                                           Dest
        0        Frankfurt am Main Airport  Sydney Kingsford Smith International Airport
        1  Cape Town International Airport                      Venice Marco Polo Airport
        2        Venice Marco Polo Airport                      Venice Marco Polo Airport
        <BLANKLINE>
        [3 rows x 2 columns]
        """
        return DataFrame(query_compiler=self._query_compiler.head(n))

    def tail(self, n=5):
        """
        Return the last n rows.

        This function returns the last n rows for the object based on position.
        The row order is sorted by index field.
        It is useful for quickly testing if your object has the right type of data in it.

        Parameters
        ----------
        n: int, default 5
            Number of rows to select.

        Returns
        -------
        eland.DataFrame:
            eland DataFrame filtered on last n rows sorted by index field

        Examples
        --------
        >>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest'])
        >>> df.tail()
                                                          Origin                                      Dest
        13054                         Pisa International Airport      Xi'an Xianyang International Airport
        13055  Winnipeg / James Armstrong Richardson Internat...                            Zurich Airport
        13056     Licenciado Benito Juarez International Airport                         Ukrainka Air Base
        13057                                      Itami Airport  Ministro Pistarini International Airport
        13058                     Adelaide International Airport   Washington Dulles International Airport
        <BLANKLINE>
        [5 rows x 2 columns]
        """
        return DataFrame(query_compiler=self._query_compiler.tail(n))

    def __repr__(self):
@@ -92,18 +237,8 @@ class DataFrame(NDFrame):
        """
        From pandas
        """
        try:
            import IPython
        except ImportError:
            pass
        else:
            if LooseVersion(IPython.__version__) < LooseVersion('3.0'):
                if console.in_qtconsole():
                    # 'HTML output is disabled in QtConsole'
                    return None

        if self._info_repr():
            buf = StringIO()
            buf = StringIO("")
            self.info(buf=buf)
            # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace('<', r'&lt;', 1)
@@ -138,7 +273,7 @@ class DataFrame(NDFrame):
    def info_es(self):
        buf = StringIO()

        super().info_es(buf)
        super()._info_es(buf)

        return buf.getvalue()
@@ -470,6 +605,13 @@ class DataFrame(NDFrame):
        return self._query_compiler.to_csv(**kwargs)

    def _to_pandas(self):
        """
        Utility method to convert an eland.DataFrame to a pandas.DataFrame

        Returns
        -------
        pandas.DataFrame
        """
        return self._query_compiler.to_pandas()

    def _empty_pd_df(self):
@@ -529,7 +671,7 @@ class DataFrame(NDFrame):
            - string function name
            - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
            - dict of axis labels -> functions, function names or list of such.
        %(axis)s
        axis
        *args
            Positional arguments to pass to `func`.
        **kwargs
@@ -570,7 +712,7 @@ class DataFrame(NDFrame):
        """
        if isinstance(expr, BooleanFilter):
            return DataFrame(
                query_compiler=self._query_compiler._update_query(key)
                query_compiler=self._query_compiler._update_query(BooleanFilter(expr))
            )
        elif isinstance(expr, six.string_types):
            return DataFrame(
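The change above routes a `BooleanFilter` passed to `DataFrame.query` through the query compiler. A minimal sketch of the string form, assuming the flights demo index and assuming string expressions follow pandas `DataFrame.query` syntax (neither is confirmed by this diff):

    import eland as ed

    df = ed.DataFrame('localhost:9200', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin'])

    # A string expression takes the six.string_types branch shown above
    delayed = df.query('FlightDelayMin > 60')
    print(delayed.head())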
@@ -56,6 +56,12 @@ class NDFrame:
        self._query_compiler = query_compiler

    def _get_index(self):
        """

        Returns
        -------

        """
        return self._query_compiler.index

    index = property(_get_index)
@@ -114,14 +120,7 @@ class NDFrame:
        """
        return len(self.index)

    @property
    def iloc(self):
        """Purely integer-location based indexing for selection by position.

        """
        return _iLocIndexer(self)

    def info_es(self, buf):
    def _info_es(self, buf):
        self._query_compiler.info_es(buf)

    def drop(
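For orientation, the public `DataFrame.info_es()` shown earlier now delegates to the renamed `_info_es`, and `__len__` delegates to `len(self.index)`. A minimal sketch of using both, assuming the flights demo index; the exact `info_es` output format is not specified by this diff:

    import eland as ed

    df = ed.DataFrame('localhost:9200', 'flights')
    print(len(df))       # row count derived from the index (a query against Elasticsearch)
    print(df.info_es())  # textual summary of how the frame maps onto the Elasticsearch index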
@@ -436,34 +436,6 @@ class ElandQueryCompiler:
    def _hist(self, num_bins):
        return self._operations.hist(self, num_bins)

    def apply(self, func, axis, *args, **kwargs):
        """Apply func across given axis.

        Args:
            func: The function to apply.
            axis: Target axis to apply the function along.

        Returns:
            A new QueryCompiler.
        """
        """Apply func across given axis.

        Args:
            func: The function to apply.
            axis: Target axis to apply the function along.

        Returns:
            A new PandasQueryCompiler.
        """
        if callable(func):
            return self._callable_func(func, axis, *args, **kwargs)
        elif isinstance(func, dict):
            return self._dict_func(func, axis, *args, **kwargs)
        elif is_list_like(func):
            return self._list_like_func(func, axis, *args, **kwargs)
        else:
            pass

    def _update_query(self, boolean_filter):
        result = self.copy()
@@ -35,7 +35,7 @@ class Series(NDFrame):
    index_pattern : str
        An Elasticsearch index pattern. This can contain wildcards (e.g. filebeat-*).

    field_name : str
    index_field : str
        The field to base the series on

    See Also
@@ -91,8 +91,6 @@ class Series(NDFrame):
        True if the Series is empty.
        False otherwise.
        """
        # TODO - this is called on every attribute get (most methods) from modin/pandas/base.py:3337
        # (as Index.__len__ performs a query) we may want to cache self.index.empty()
        return len(self.index) == 0

    def _get_name(self):
@@ -152,7 +150,7 @@ class Series(NDFrame):
        )

    def _to_pandas(self):
        return self._query_compiler.to_pandas()[self.name]
        return self._query_compiler._to_pandas()[self.name]

    def __gt__(self, other):
        if isinstance(other, Series):
File diff suppressed because one or more lines are too long
@@ -37,9 +37,7 @@ class TestDataFrameDateTime(TestData):
        # Now create index
        index_name = 'eland_test_generate_es_mappings'

        ed.pandas_to_es(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)

        ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name)
        ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
        ed_df_head = ed_df.head()

        assert_pandas_eland_frame_equal(df, ed_df_head)
eland/tests/dataframe/test_init_pytest.py (new file, 31 lines)
@@ -0,0 +1,31 @@
# File called _pytest for PyCharm compatibility

import eland as ed

import pytest

from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME


class TestDataFrameInit:

    def test_init(self):
        # Construct empty DataFrame (throws)
        with pytest.raises(ValueError):
            df = ed.DataFrame()

        # Construct invalid DataFrame (throws)
        with pytest.raises(ValueError):
            df = ed.DataFrame(client=ELASTICSEARCH_HOST)

        # Construct invalid DataFrame (throws)
        with pytest.raises(ValueError):
            df = ed.DataFrame(index_pattern=FLIGHTS_INDEX_NAME)

        # Good constructors
        df0 = ed.DataFrame(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME)
        df1 = ed.DataFrame(client=ELASTICSEARCH_HOST, index_pattern=FLIGHTS_INDEX_NAME)

        qc = ed.ElandQueryCompiler(client=ELASTICSEARCH_HOST, index_pattern=FLIGHTS_INDEX_NAME)
        df2 = ed.DataFrame(query_compiler=qc)
@@ -19,8 +19,7 @@ class TestDataFrameQuery(TestData):
        # Now create index
        index_name = 'eland_test_query1'

        ed.pandas_to_es(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
        ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name)
        ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)

        assert_pandas_eland_frame_equal(pd_df, ed_df)
@@ -4,7 +4,7 @@ import numpy as np
import pandas as pd

import eland as ed
from eland.tests.common import ELASTICSEARCH_HOST
from eland.tests.common import ELASTICSEARCH_HOST, assert_pandas_eland_frame_equal
from eland.tests.common import TestData


@@ -36,9 +36,7 @@ class TestDataFrameUtils(TestData):
        # Now create index
        index_name = 'eland_test_generate_es_mappings'

        ed.pandas_to_es(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)

        ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name)
        ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
        ed_df_head = ed_df.head()

        # assert_frame_equal(df, ed_df_head)
        assert_pandas_eland_frame_equal(df, ed_df_head)
@@ -7144,7 +7144,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
eland/utils.py (105 lines)
@@ -2,44 +2,71 @@ from eland import Client
from eland import DataFrame
from eland import Mappings

import pandas as pd


def read_es(es_params, index_pattern):
    """
    Utility method to create an eland.DataFrame from an Elasticsearch index_pattern.
    (Similar to pandas.read_csv, but the source data is an Elasticsearch index rather than
    a csv file)

    Parameters
    ----------
    es_params: Elasticsearch client argument(s)
        - elasticsearch-py parameters or
        - elasticsearch-py instance or
        - eland.Client instance
    index_pattern: str
        Elasticsearch index pattern

    Returns
    -------
    eland.DataFrame

    See Also
    --------
    eland.pd_to_ed: Create an eland.DataFrame from a pandas.DataFrame
    eland.ed_to_pd: Create a pandas.DataFrame from an eland.DataFrame
    """
    return DataFrame(client=es_params, index_pattern=index_pattern)


def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
             geo_points=None):
    """
    Append a pandas DataFrame to an Elasticsearch index.
    Mainly used in testing.
    Modifies the Elasticsearch destination index

    Parameters
    ----------
    es_params : Elasticsearch client argument
        elasticsearch-py parameters or
        elasticsearch-py instance or
        eland.Client instance
    es_params: Elasticsearch client argument(s)
        - elasticsearch-py parameters or
        - elasticsearch-py instance or
        - eland.Client instance
    destination_index: str
        Name of Elasticsearch index to be appended to
    if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the index already exists.

    destination_index : str
        Name of Elasticsearch index to be written

    if_exists : str, default 'fail'
        Behavior when the destination index exists. Value can be one of:
        ``'fail'``
            If table exists, do nothing.
        ``'replace'``
            If table exists, drop it, recreate it, and insert data.
        ``'append'``
            If table exists, insert data. Create if does not exist.

    dropna : bool
        ``'True'``
            Remove missing values (see pandas.Series.dropna)
        ``'False'``
            Include missing values - may cause bulk to fail

    geo_points : list or None
        - fail: Raise a ValueError.
        - replace: Delete the index before inserting new values.
        - append: Insert new values to the existing index. Create if does not exist.
    dropna: bool, default 'False'
        * True: Remove missing values (see pandas.Series.dropna)
        * False: Include missing values - may cause bulk to fail
    geo_points: list, default None
        List of columns to map to geo_point data type

    Returns
    -------
    eland.DataFrame
        eland.DataFrame referencing data in destination_index

    See Also
    --------
    eland.read_es: Create an eland.DataFrame from an Elasticsearch index
    eland.ed_to_pd: Create a pandas.DataFrame from an eland.DataFrame
    """
    client = Client(es_params)

@@ -86,3 +113,31 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
            actions = []

    client.bulk(actions, refresh=refresh)

    ed_df = DataFrame(client, destination_index)

    return ed_df


def ed_to_pd(ed_df):
    """
    Convert an eland.DataFrame to a pandas.DataFrame

    **Note: this loads the entire Elasticsearch index into in-core pandas.DataFrame structures. For large
    indices this can create significant load on the Elasticsearch cluster and require significant memory**

    Parameters
    ----------
    ed_df: eland.DataFrame
        The source eland.DataFrame referencing the Elasticsearch index

    Returns
    -------
    pandas.DataFrame
        pandas.DataFrame containing all rows and columns in the eland.DataFrame

    See Also
    --------
    eland.read_es: Create an eland.DataFrame from an Elasticsearch index
    eland.pd_to_ed: Create an eland.DataFrame from a pandas.DataFrame
    """
    return ed_df._to_pandas()
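Taken together, `read_es`, `pd_to_ed` and `ed_to_pd` give a simple round trip between pandas and Elasticsearch. A minimal sketch, not part of the commit, assuming a local cluster at localhost:9200; the index name 'eland_example_roundtrip' is made up for illustration:

    import pandas as pd
    import eland as ed

    pd_df = pd.DataFrame({'name': ['a', 'b', 'c'], 'value': [1.0, 2.0, 3.0]})

    # pandas -> Elasticsearch; returns an eland.DataFrame referencing the new index
    ed_df = ed.pd_to_ed(pd_df, 'localhost:9200', 'eland_example_roundtrip', if_exists='replace', refresh=True)

    # Work against Elasticsearch using the pandas-like subset
    print(ed_df.head())

    # Elasticsearch -> pandas; pulls the whole index into memory
    round_tripped = ed.ed_to_pd(ed_df)
    print(round_tripped)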
make_docs.sh (new file, 9 lines)
@@ -0,0 +1,9 @@
#!/bin/sh

python setup.py install

cd docs

make clean
make html
@@ -1,3 +1,6 @@
elasticsearch>=7.0.5
pandas==0.25.1
matplotlib
pytest>=5.2.1
sphinx_rtd_theme
numpydoc==0.8