mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Merge pull request #31 from stevedodson/master
Creating docs framework.
This commit is contained in:
commit
fd35fbd9f5
2
NOTES.md
2
NOTES.md
@ -47,7 +47,7 @@ the `pandas.DataFrame` API. This resolves some of the issues above as:
|
|||||||
than a new index
|
than a new index
|
||||||
|
|
||||||
* Instead of supporting the entire `pandas.DataFrame` API we can support a subset appropriate for
|
* Instead of supporting the entire `pandas.DataFrame` API we can support a subset appropriate for
|
||||||
Elasticsearch. If additional calls are required, we could create an `eland.DataFrame.to_pandas()`
|
Elasticsearch. If additional calls are required, we could create an `eland.DataFrame._to_pandas()`
|
||||||
method which would explicitly export all data to a `pandas.DataFrame`
|
method which would explicitly export all data to a `pandas.DataFrame`
|
||||||
|
|
||||||
* Creating a new `eland.DataFrame` API gives us full flexibility in terms of implementation. However,
|
* Creating a new `eland.DataFrame` API gives us full flexibility in terms of implementation. However,
|
||||||
|
20
docs/Makefile
Normal file
20
docs/Makefile
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
# Minimal makefile for Sphinx documentation
|
||||||
|
#
|
||||||
|
|
||||||
|
# You can set these variables from the command line, and also
|
||||||
|
# from the environment for the first two.
|
||||||
|
SPHINXOPTS ?=
|
||||||
|
SPHINXBUILD ?= sphinx-build
|
||||||
|
SOURCEDIR = source
|
||||||
|
BUILDDIR = build
|
||||||
|
|
||||||
|
# Put it first so that "make" without argument is like "make help".
|
||||||
|
help:
|
||||||
|
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
|
|
||||||
|
.PHONY: help Makefile
|
||||||
|
|
||||||
|
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||||
|
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||||
|
%: Makefile
|
||||||
|
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
35
docs/make.bat
Normal file
35
docs/make.bat
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
@ECHO OFF
|
||||||
|
|
||||||
|
pushd %~dp0
|
||||||
|
|
||||||
|
REM Command file for Sphinx documentation
|
||||||
|
|
||||||
|
if "%SPHINXBUILD%" == "" (
|
||||||
|
set SPHINXBUILD=sphinx-build
|
||||||
|
)
|
||||||
|
set SOURCEDIR=source
|
||||||
|
set BUILDDIR=build
|
||||||
|
|
||||||
|
if "%1" == "" goto help
|
||||||
|
|
||||||
|
%SPHINXBUILD% >NUL 2>NUL
|
||||||
|
if errorlevel 9009 (
|
||||||
|
echo.
|
||||||
|
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||||
|
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||||
|
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||||
|
echo.may add the Sphinx directory to PATH.
|
||||||
|
echo.
|
||||||
|
echo.If you don't have Sphinx installed, grab it from
|
||||||
|
echo.http://sphinx-doc.org/
|
||||||
|
exit /b 1
|
||||||
|
)
|
||||||
|
|
||||||
|
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||||
|
goto end
|
||||||
|
|
||||||
|
:help
|
||||||
|
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||||
|
|
||||||
|
:end
|
||||||
|
popd
|
79
docs/source/conf.py
Normal file
79
docs/source/conf.py
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
# Configuration file for the Sphinx documentation builder.
|
||||||
|
#
|
||||||
|
# This file only contains a selection of the most common options. For a full
|
||||||
|
# list see the documentation:
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||||
|
|
||||||
|
# -- Path setup --------------------------------------------------------------
|
||||||
|
|
||||||
|
# If extensions (or modules to document with autodoc) are in another directory,
|
||||||
|
# add these directories to sys.path here. If the directory is relative to the
|
||||||
|
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||||
|
#
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, os.path.abspath("../sphinxext"))
|
||||||
|
sys.path.extend(
|
||||||
|
[
|
||||||
|
# numpy standard doc extensions
|
||||||
|
os.path.join(os.path.dirname(__file__), "..", "../..", "sphinxext")
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# -- Project information -----------------------------------------------------
|
||||||
|
|
||||||
|
project = 'eland'
|
||||||
|
copyright = '2019, Stephen Dodson'
|
||||||
|
author = 'Stephen Dodson'
|
||||||
|
|
||||||
|
# The full version, including alpha/beta/rc tags
|
||||||
|
release = '0.1'
|
||||||
|
|
||||||
|
|
||||||
|
# -- General configuration ---------------------------------------------------
|
||||||
|
|
||||||
|
# Add any Sphinx extension module names here, as strings. They can be
|
||||||
|
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||||
|
# ones.
|
||||||
|
extensions = [
|
||||||
|
'sphinx.ext.autodoc',
|
||||||
|
"sphinx.ext.doctest",
|
||||||
|
'numpydoc'
|
||||||
|
]
|
||||||
|
|
||||||
|
doctest_global_setup = '''
|
||||||
|
try:
|
||||||
|
import eland as ed
|
||||||
|
except ImportError:
|
||||||
|
ed = None
|
||||||
|
try:
|
||||||
|
import pandas as pd
|
||||||
|
except ImportError:
|
||||||
|
pd = None
|
||||||
|
'''
|
||||||
|
|
||||||
|
numpydoc_attributes_as_param_list = False
|
||||||
|
|
||||||
|
|
||||||
|
# Add any paths that contain templates here, relative to this directory.
|
||||||
|
templates_path = ['_templates']
|
||||||
|
|
||||||
|
# List of patterns, relative to source directory, that match files and
|
||||||
|
# directories to ignore when looking for source files.
|
||||||
|
# This pattern also affects html_static_path and html_extra_path.
|
||||||
|
exclude_patterns = []
|
||||||
|
|
||||||
|
|
||||||
|
# -- Options for HTML output -------------------------------------------------
|
||||||
|
|
||||||
|
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||||
|
# a list of builtin themes.
|
||||||
|
#
|
||||||
|
html_theme = 'sphinx_rtd_theme'
|
||||||
|
|
||||||
|
# Add any paths that contain custom static files (such as style sheets) here,
|
||||||
|
# relative to this directory. They are copied after the builtin static files,
|
||||||
|
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||||
|
html_static_path = ['_static']
|
30
docs/source/index.rst
Normal file
30
docs/source/index.rst
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
.. eland documentation master file, created by
|
||||||
|
|
||||||
|
.. module:: eland
|
||||||
|
|
||||||
|
****************************************************************
|
||||||
|
eland: pandas-like data analysis toolkit backed by Elasticsearch
|
||||||
|
****************************************************************
|
||||||
|
|
||||||
|
**Date**: |today| **Version**: |version|
|
||||||
|
|
||||||
|
**Useful links**:
|
||||||
|
`Source Repository <https://github.com/elastic/eland>`__ |
|
||||||
|
`Issues & Ideas <https://github.com/elastic/eland/issues>`__ |
|
||||||
|
`Q&A Support <https://discuss.elastic.co>`__ |
|
||||||
|
|
||||||
|
:mod:`eland` is an open source, Apache2-licensed Elasticsearch Python client to analyse, explore and manipulate data that resides in Elasticsearch.
|
||||||
|
Where possible the package uses existing Python APIs and data structures to make it easy to switch from NumPy, pandas and scikit-learn to their Elasticsearch-powered equivalents.
|
||||||
|
In general, the data resides in Elasticsearch and not in memory, which allows eland to access large datasets stored in Elasticsearch.
|
||||||
|
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 2
|
||||||
|
:hidden:
|
||||||
|
|
||||||
|
reference/index
|
||||||
|
|
||||||
|
* :doc:`reference/index`
|
||||||
|
|
||||||
|
* :doc:`reference/general_utility_functions`
|
||||||
|
* :doc:`reference/dataframe`
|
6
docs/source/reference/api/eland.DataFrame.columns.rst
Normal file
6
docs/source/reference/api/eland.DataFrame.columns.rst
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
eland.DataFrame.columns
|
||||||
|
=======================
|
||||||
|
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
.. autoattribute:: DataFrame.columns
|
6
docs/source/reference/api/eland.DataFrame.head.rst
Normal file
6
docs/source/reference/api/eland.DataFrame.head.rst
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
eland.DataFrame.head
|
||||||
|
====================
|
||||||
|
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
.. automethod:: DataFrame.head
|
6
docs/source/reference/api/eland.DataFrame.index.rst
Normal file
6
docs/source/reference/api/eland.DataFrame.index.rst
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
eland.DataFrame.index
|
||||||
|
=====================
|
||||||
|
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
.. autoattribute:: DataFrame.index
|
18
docs/source/reference/api/eland.DataFrame.rst
Normal file
18
docs/source/reference/api/eland.DataFrame.rst
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
eland.DataFrame
|
||||||
|
================
|
||||||
|
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
.. autoclass:: DataFrame
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
..
|
||||||
|
HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages.
|
||||||
|
.. autosummary::
|
||||||
|
:toctree:
|
||||||
|
|
||||||
|
DataFrame.abs
|
||||||
|
DataFrame.add
|
||||||
|
|
6
docs/source/reference/api/eland.DataFrame.tail.rst
Normal file
6
docs/source/reference/api/eland.DataFrame.tail.rst
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
eland.DataFrame.tail
|
||||||
|
====================
|
||||||
|
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
.. automethod:: DataFrame.tail
|
6
docs/source/reference/api/eland.ed_to_pd.rst
Normal file
6
docs/source/reference/api/eland.ed_to_pd.rst
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
eland.ed_to_pd
|
||||||
|
==============
|
||||||
|
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
.. autofunction:: ed_to_pd
|
6
docs/source/reference/api/eland.pd_to_ed.rst
Normal file
6
docs/source/reference/api/eland.pd_to_ed.rst
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
eland.pd_to_ed
|
||||||
|
==============
|
||||||
|
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
.. autofunction:: pd_to_ed
|
6
docs/source/reference/api/eland.read_es.rst
Normal file
6
docs/source/reference/api/eland.read_es.rst
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
eland.read_es
|
||||||
|
=============
|
||||||
|
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
.. autofunction:: read_es
|
35
docs/source/reference/dataframe.rst
Normal file
35
docs/source/reference/dataframe.rst
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
.. _api.dataframe:
|
||||||
|
|
||||||
|
=========
|
||||||
|
DataFrame
|
||||||
|
=========
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
Constructor
|
||||||
|
~~~~~~~~~~~
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: api/
|
||||||
|
|
||||||
|
DataFrame
|
||||||
|
|
||||||
|
Attributes and underlying data
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
**Axes**
|
||||||
|
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: api/
|
||||||
|
|
||||||
|
DataFrame.index
|
||||||
|
DataFrame.columns
|
||||||
|
|
||||||
|
Indexing, iteration
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: api/
|
||||||
|
|
||||||
|
DataFrame.head
|
||||||
|
DataFrame.tail
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
21
docs/source/reference/general_utility_functions.rst
Normal file
21
docs/source/reference/general_utility_functions.rst
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
.. _api.general_utility_functions:
|
||||||
|
|
||||||
|
=========================
|
||||||
|
General utility functions
|
||||||
|
=========================
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
Elasticsearch access
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: api/
|
||||||
|
|
||||||
|
read_es
|
||||||
|
|
||||||
|
Pandas and Eland
|
||||||
|
~~~~~~~~~~~~~~~~
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: api/
|
||||||
|
|
||||||
|
pd_to_ed
|
||||||
|
ed_to_pd
|
14
docs/source/reference/index.rst
Normal file
14
docs/source/reference/index.rst
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
.. _api:
|
||||||
|
|
||||||
|
=============
|
||||||
|
API reference
|
||||||
|
=============
|
||||||
|
|
||||||
|
This page gives an overview of all public eland objects, functions and
|
||||||
|
methods. All classes and functions exposed in the ``eland.*`` namespace are public.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 2
|
||||||
|
|
||||||
|
general_utility_functions
|
||||||
|
dataframe
|
@ -1,14 +1,14 @@
|
|||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
from eland.client import *
|
from eland.client import *
|
||||||
from eland.dataframe import *
|
|
||||||
from eland.filter import *
|
from eland.filter import *
|
||||||
from eland.index import *
|
from eland.index import *
|
||||||
from eland.mappings import *
|
from eland.mappings import *
|
||||||
from eland.ndframe import *
|
|
||||||
from eland.operations import *
|
|
||||||
from eland.plotting import *
|
|
||||||
from eland.query import *
|
from eland.query import *
|
||||||
|
from eland.operations import *
|
||||||
from eland.query_compiler import *
|
from eland.query_compiler import *
|
||||||
|
from eland.plotting import *
|
||||||
|
from eland.ndframe import *
|
||||||
from eland.series import *
|
from eland.series import *
|
||||||
|
from eland.dataframe import *
|
||||||
from eland.utils import *
|
from eland.utils import *
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
from distutils.version import LooseVersion
|
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -20,17 +19,86 @@ from eland import NDFrame
|
|||||||
from eland import Series
|
from eland import Series
|
||||||
from eland.filter import BooleanFilter, ScriptFilter
|
from eland.filter import BooleanFilter, ScriptFilter
|
||||||
|
|
||||||
|
|
||||||
class DataFrame(NDFrame):
|
class DataFrame(NDFrame):
|
||||||
# This is effectively 2 constructors
|
"""
|
||||||
# 1. client, index_pattern, columns, index_field
|
Two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes
|
||||||
# 2. query_compiler
|
(rows and columns) referencing data stored in Elasticsearch indices.
|
||||||
|
Where possible APIs mirror pandas.DataFrame APIs.
|
||||||
|
The underlying data is stored in Elasticsearch rather than core memory.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
client: Elasticsearch client argument(s) (e.g. 'localhost:9200')
|
||||||
|
- elasticsearch-py parameters or
|
||||||
|
- elasticsearch-py instance or
|
||||||
|
- eland.Client instance
|
||||||
|
index_pattern: str
|
||||||
|
Elasticsearch index pattern (e.g. 'flights' or 'filebeat-\*')
|
||||||
|
columns: list of str, optional
|
||||||
|
List of DataFrame columns. A subset of the Elasticsearch index's fields.
|
||||||
|
index_field: str, optional
|
||||||
|
The Elasticsearch index field to use as the DataFrame index. Defaults to _id if None is used.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
Constructing DataFrame from Elasticsearch configuration arguments and an Elasticsearch index
|
||||||
|
|
||||||
|
>>> df = ed.DataFrame('localhost:9200', 'flights')
|
||||||
|
>>> df.head()
|
||||||
|
AvgTicketPrice Cancelled Carrier Dest ... OriginRegion OriginWeather dayOfWeek timestamp
|
||||||
|
0 841.265642 False Kibana Airlines Sydney Kingsford Smith International Airport ... DE-HE Sunny 0 2018-01-01 00:00:00
|
||||||
|
1 882.982662 False Logstash Airways Venice Marco Polo Airport ... SE-BD Clear 0 2018-01-01 18:27:00
|
||||||
|
2 190.636904 False Logstash Airways Venice Marco Polo Airport ... IT-34 Rain 0 2018-01-01 17:11:14
|
||||||
|
3 181.694216 True Kibana Airlines Treviso-Sant'Angelo Airport ... IT-72 Thunder & Lightning 0 2018-01-01 10:33:28
|
||||||
|
4 730.041778 False Kibana Airlines Xi'an Xianyang International Airport ... MX-DIF Damaging Wind 0 2018-01-01 05:13:00
|
||||||
|
<BLANKLINE>
|
||||||
|
[5 rows x 27 columns]
|
||||||
|
|
||||||
|
Constructing DataFrame from an Elasticsearch client and an Elasticsearch index
|
||||||
|
|
||||||
|
>>> from elasticsearch import Elasticsearch
|
||||||
|
>>> es = Elasticsearch("localhost:9200")
|
||||||
|
>>> df = ed.DataFrame(client=es, index_pattern='flights', columns=['AvgTicketPrice', 'Cancelled'])
|
||||||
|
>>> df.head()
|
||||||
|
AvgTicketPrice Cancelled
|
||||||
|
0 841.265642 False
|
||||||
|
1 882.982662 False
|
||||||
|
2 190.636904 False
|
||||||
|
3 181.694216 True
|
||||||
|
4 730.041778 False
|
||||||
|
<BLANKLINE>
|
||||||
|
[5 rows x 2 columns]
|
||||||
|
|
||||||
|
Constructing DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the DataFrame index field
|
||||||
|
|
||||||
|
>>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'], index_field='timestamp')
|
||||||
|
>>> df.head()
|
||||||
|
AvgTicketPrice timestamp
|
||||||
|
2018-01-01T00:00:00 841.265642 2018-01-01 00:00:00
|
||||||
|
2018-01-01T00:02:06 772.100846 2018-01-01 00:02:06
|
||||||
|
2018-01-01T00:06:27 159.990962 2018-01-01 00:06:27
|
||||||
|
2018-01-01T00:33:31 800.217104 2018-01-01 00:33:31
|
||||||
|
2018-01-01T00:36:51 803.015200 2018-01-01 00:36:51
|
||||||
|
<BLANKLINE>
|
||||||
|
[5 rows x 2 columns]
|
||||||
|
"""
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
client=None,
|
client=None,
|
||||||
index_pattern=None,
|
index_pattern=None,
|
||||||
columns=None,
|
columns=None,
|
||||||
index_field=None,
|
index_field=None,
|
||||||
query_compiler=None):
|
query_compiler=None):
|
||||||
|
"""
|
||||||
|
There are effectively 2 constructors:
|
||||||
|
|
||||||
|
1. client, index_pattern, columns, index_field
|
||||||
|
2. query_compiler (eland.ElandQueryCompiler)
|
||||||
|
|
||||||
|
The constructor with 'query_compiler' is for internal use only.
|
||||||
|
"""
|
||||||
|
if query_compiler is None:
|
||||||
|
if client is None or index_pattern is None:
|
||||||
|
raise ValueError("client and index_pattern must be defined in DataFrame constructor")
|
||||||
# python 3 syntax
|
# python 3 syntax
|
||||||
super().__init__(
|
super().__init__(
|
||||||
client=client,
|
client=client,
|
||||||
@ -40,6 +108,27 @@ class DataFrame(NDFrame):
|
|||||||
query_compiler=query_compiler)
|
query_compiler=query_compiler)
|
||||||
|
|
||||||
def _get_columns(self):
|
def _get_columns(self):
|
||||||
|
"""
|
||||||
|
The column labels of the DataFrame.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Elasticsearch field names as pandas.Index
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame('localhost', 'flights')
|
||||||
|
>>> assert isinstance(df.columns, pd.Index)
|
||||||
|
>>> df.columns
|
||||||
|
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID',
|
||||||
|
... 'DestCityName', 'DestCountry', 'DestLocation', 'DestRegion',
|
||||||
|
... 'DestWeather', 'DistanceKilometers', 'DistanceMiles', 'FlightDelay',
|
||||||
|
... 'FlightDelayMin', 'FlightDelayType', 'FlightNum', 'FlightTimeHour',
|
||||||
|
... 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
|
||||||
|
... 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather',
|
||||||
|
... 'dayOfWeek', 'timestamp'],
|
||||||
|
... dtype='object')
|
||||||
|
"""
|
||||||
return self._query_compiler.columns
|
return self._query_compiler.columns
|
||||||
|
|
||||||
columns = property(_get_columns)
|
columns = property(_get_columns)
|
||||||
@ -52,14 +141,70 @@ class DataFrame(NDFrame):
|
|||||||
True if the DataFrame is empty.
|
True if the DataFrame is empty.
|
||||||
False otherwise.
|
False otherwise.
|
||||||
"""
|
"""
|
||||||
# TODO - this is called on every attribute get (most methods) from modin/pandas/base.py:3337
|
|
||||||
# (as Index.__len__ performs an query) we may want to cache self.index.empty()
|
|
||||||
return len(self.columns) == 0 or len(self.index) == 0
|
return len(self.columns) == 0 or len(self.index) == 0
|
||||||
|
|
||||||
def head(self, n=5):
|
def head(self, n=5):
|
||||||
|
"""
|
||||||
|
Return the first n rows.
|
||||||
|
|
||||||
|
This function returns the first n rows for the object based on position.
|
||||||
|
The row order is sorted by index field.
|
||||||
|
It is useful for quickly testing if your object has the right type of data in it.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
n: int, default 5
|
||||||
|
Number of rows to select.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
eland.DataFrame
|
||||||
|
eland DataFrame filtered on first n rows sorted by index field
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest'])
|
||||||
|
>>> df.head(3)
|
||||||
|
Origin Dest
|
||||||
|
0 Frankfurt am Main Airport Sydney Kingsford Smith International Airport
|
||||||
|
1 Cape Town International Airport Venice Marco Polo Airport
|
||||||
|
2 Venice Marco Polo Airport Venice Marco Polo Airport
|
||||||
|
<BLANKLINE>
|
||||||
|
[3 rows x 2 columns]
|
||||||
|
"""
|
||||||
return DataFrame(query_compiler=self._query_compiler.head(n))
|
return DataFrame(query_compiler=self._query_compiler.head(n))
|
||||||
|
|
||||||
def tail(self, n=5):
|
def tail(self, n=5):
|
||||||
|
"""
|
||||||
|
Return the last n rows.
|
||||||
|
|
||||||
|
This function returns the last n rows for the object based on position.
|
||||||
|
The row order is sorted by index field.
|
||||||
|
It is useful for quickly testing if your object has the right type of data in it.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
n: int, default 5
|
||||||
|
Number of rows to select.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
eland.DataFrame:
|
||||||
|
eland DataFrame filtered on last n rows sorted by index field
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest'])
|
||||||
|
>>> df.tail()
|
||||||
|
Origin Dest
|
||||||
|
13054 Pisa International Airport Xi'an Xianyang International Airport
|
||||||
|
13055 Winnipeg / James Armstrong Richardson Internat... Zurich Airport
|
||||||
|
13056 Licenciado Benito Juarez International Airport Ukrainka Air Base
|
||||||
|
13057 Itami Airport Ministro Pistarini International Airport
|
||||||
|
13058 Adelaide International Airport Washington Dulles International Airport
|
||||||
|
<BLANKLINE>
|
||||||
|
[5 rows x 2 columns]
|
||||||
|
"""
|
||||||
return DataFrame(query_compiler=self._query_compiler.tail(n))
|
return DataFrame(query_compiler=self._query_compiler.tail(n))
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
@ -92,18 +237,8 @@ class DataFrame(NDFrame):
|
|||||||
"""
|
"""
|
||||||
From pandas
|
From pandas
|
||||||
"""
|
"""
|
||||||
try:
|
|
||||||
import IPython
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
if LooseVersion(IPython.__version__) < LooseVersion('3.0'):
|
|
||||||
if console.in_qtconsole():
|
|
||||||
# 'HTML output is disabled in QtConsole'
|
|
||||||
return None
|
|
||||||
|
|
||||||
if self._info_repr():
|
if self._info_repr():
|
||||||
buf = StringIO()
|
buf = StringIO("")
|
||||||
self.info(buf=buf)
|
self.info(buf=buf)
|
||||||
# need to escape the <class>, should be the first line.
|
# need to escape the <class>, should be the first line.
|
||||||
val = buf.getvalue().replace('<', r'<', 1)
|
val = buf.getvalue().replace('<', r'<', 1)
|
||||||
@ -138,7 +273,7 @@ class DataFrame(NDFrame):
|
|||||||
def info_es(self):
|
def info_es(self):
|
||||||
buf = StringIO()
|
buf = StringIO()
|
||||||
|
|
||||||
super().info_es(buf)
|
super()._info_es(buf)
|
||||||
|
|
||||||
return buf.getvalue()
|
return buf.getvalue()
|
||||||
|
|
||||||
@ -470,6 +605,13 @@ class DataFrame(NDFrame):
|
|||||||
return self._query_compiler.to_csv(**kwargs)
|
return self._query_compiler.to_csv(**kwargs)
|
||||||
|
|
||||||
def _to_pandas(self):
|
def _to_pandas(self):
|
||||||
|
"""
|
||||||
|
Utility method to convert eland.DataFrame to pandas.DataFrame
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pandas.DataFrame
|
||||||
|
"""
|
||||||
return self._query_compiler.to_pandas()
|
return self._query_compiler.to_pandas()
|
||||||
|
|
||||||
def _empty_pd_df(self):
|
def _empty_pd_df(self):
|
||||||
@ -529,7 +671,7 @@ class DataFrame(NDFrame):
|
|||||||
- string function name
|
- string function name
|
||||||
- list of functions and/or function names, e.g. ``[np.sum, 'mean']``
|
- list of functions and/or function names, e.g. ``[np.sum, 'mean']``
|
||||||
- dict of axis labels -> functions, function names or list of such.
|
- dict of axis labels -> functions, function names or list of such.
|
||||||
%(axis)s
|
axis
|
||||||
*args
|
*args
|
||||||
Positional arguments to pass to `func`.
|
Positional arguments to pass to `func`.
|
||||||
**kwargs
|
**kwargs
|
||||||
@ -570,7 +712,7 @@ class DataFrame(NDFrame):
|
|||||||
"""
|
"""
|
||||||
if isinstance(expr, BooleanFilter):
|
if isinstance(expr, BooleanFilter):
|
||||||
return DataFrame(
|
return DataFrame(
|
||||||
query_compiler=self._query_compiler._update_query(key)
|
query_compiler=self._query_compiler._update_query(BooleanFilter(expr))
|
||||||
)
|
)
|
||||||
elif isinstance(expr, six.string_types):
|
elif isinstance(expr, six.string_types):
|
||||||
return DataFrame(
|
return DataFrame(
|
||||||
|
@ -56,6 +56,12 @@ class NDFrame:
|
|||||||
self._query_compiler = query_compiler
|
self._query_compiler = query_compiler
|
||||||
|
|
||||||
def _get_index(self):
|
def _get_index(self):
|
||||||
|
"""
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
|
||||||
|
"""
|
||||||
return self._query_compiler.index
|
return self._query_compiler.index
|
||||||
|
|
||||||
index = property(_get_index)
|
index = property(_get_index)
|
||||||
@ -114,14 +120,7 @@ class NDFrame:
|
|||||||
"""
|
"""
|
||||||
return len(self.index)
|
return len(self.index)
|
||||||
|
|
||||||
@property
|
def _info_es(self, buf):
|
||||||
def iloc(self):
|
|
||||||
"""Purely integer-location based indexing for selection by position.
|
|
||||||
|
|
||||||
"""
|
|
||||||
return _iLocIndexer(self)
|
|
||||||
|
|
||||||
def info_es(self, buf):
|
|
||||||
self._query_compiler.info_es(buf)
|
self._query_compiler.info_es(buf)
|
||||||
|
|
||||||
def drop(
|
def drop(
|
||||||
|
@ -436,34 +436,6 @@ class ElandQueryCompiler:
|
|||||||
def _hist(self, num_bins):
|
def _hist(self, num_bins):
|
||||||
return self._operations.hist(self, num_bins)
|
return self._operations.hist(self, num_bins)
|
||||||
|
|
||||||
def apply(self, func, axis, *args, **kwargs):
|
|
||||||
"""Apply func across given axis.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
func: The function to apply.
|
|
||||||
axis: Target axis to apply the function along.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A new QueryCompiler.
|
|
||||||
"""
|
|
||||||
"""Apply func across given axis.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
func: The function to apply.
|
|
||||||
axis: Target axis to apply the function along.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A new PandasQueryCompiler.
|
|
||||||
"""
|
|
||||||
if callable(func):
|
|
||||||
return self._callable_func(func, axis, *args, **kwargs)
|
|
||||||
elif isinstance(func, dict):
|
|
||||||
return self._dict_func(func, axis, *args, **kwargs)
|
|
||||||
elif is_list_like(func):
|
|
||||||
return self._list_like_func(func, axis, *args, **kwargs)
|
|
||||||
else:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def _update_query(self, boolean_filter):
|
def _update_query(self, boolean_filter):
|
||||||
result = self.copy()
|
result = self.copy()
|
||||||
|
|
||||||
|
@ -35,7 +35,7 @@ class Series(NDFrame):
|
|||||||
index_pattern : str
|
index_pattern : str
|
||||||
An Elasticsearch index pattern. This can contain wildcards (e.g. filebeat-*).
|
An Elasticsearch index pattern. This can contain wildcards (e.g. filebeat-*).
|
||||||
|
|
||||||
field_name : str
|
index_field : str
|
||||||
The field to base the series on
|
The field to base the series on
|
||||||
|
|
||||||
See Also
|
See Also
|
||||||
@ -91,8 +91,6 @@ class Series(NDFrame):
|
|||||||
True if the Series is empty.
|
True if the Series is empty.
|
||||||
False otherwise.
|
False otherwise.
|
||||||
"""
|
"""
|
||||||
# TODO - this is called on every attribute get (most methods) from modin/pandas/base.py:3337
|
|
||||||
# (as Index.__len__ performs an query) we may want to cache self.index.empty()
|
|
||||||
return len(self.index) == 0
|
return len(self.index) == 0
|
||||||
|
|
||||||
def _get_name(self):
|
def _get_name(self):
|
||||||
@ -152,7 +150,7 @@ class Series(NDFrame):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def _to_pandas(self):
|
def _to_pandas(self):
|
||||||
return self._query_compiler.to_pandas()[self.name]
|
return self._query_compiler._to_pandas()[self.name]
|
||||||
|
|
||||||
def __gt__(self, other):
|
def __gt__(self, other):
|
||||||
if isinstance(other, Series):
|
if isinstance(other, Series):
|
||||||
|
File diff suppressed because one or more lines are too long
@ -37,9 +37,7 @@ class TestDataFrameDateTime(TestData):
|
|||||||
# Now create index
|
# Now create index
|
||||||
index_name = 'eland_test_generate_es_mappings'
|
index_name = 'eland_test_generate_es_mappings'
|
||||||
|
|
||||||
ed.pandas_to_es(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
|
ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
|
||||||
|
|
||||||
ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name)
|
|
||||||
ed_df_head = ed_df.head()
|
ed_df_head = ed_df.head()
|
||||||
|
|
||||||
assert_pandas_eland_frame_equal(df, ed_df_head)
|
assert_pandas_eland_frame_equal(df, ed_df_head)
|
||||||
|
31
eland/tests/dataframe/test_init_pytest.py
Normal file
31
eland/tests/dataframe/test_init_pytest.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
# File called _pytest for PyCharm compatibility
|
||||||
|
|
||||||
|
import eland as ed
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from eland.tests import ELASTICSEARCH_HOST
|
||||||
|
from eland.tests import FLIGHTS_INDEX_NAME
|
||||||
|
|
||||||
|
class TestDataFrameInit:
|
||||||
|
|
||||||
|
def test_init(self):
|
||||||
|
# Construct empty DataFrame (throws)
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
df = ed.DataFrame()
|
||||||
|
|
||||||
|
# Construct invalid DataFrame (throws)
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
df = ed.DataFrame(client=ELASTICSEARCH_HOST)
|
||||||
|
|
||||||
|
# Construct invalid DataFrame (throws)
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
df = ed.DataFrame(index_pattern=FLIGHTS_INDEX_NAME)
|
||||||
|
|
||||||
|
# Good constructors
|
||||||
|
df0 = ed.DataFrame(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME)
|
||||||
|
df1 = ed.DataFrame(client=ELASTICSEARCH_HOST, index_pattern=FLIGHTS_INDEX_NAME)
|
||||||
|
|
||||||
|
qc = ed.ElandQueryCompiler(client=ELASTICSEARCH_HOST, index_pattern=FLIGHTS_INDEX_NAME)
|
||||||
|
df2 = ed.DataFrame(query_compiler=qc)
|
||||||
|
|
@ -19,8 +19,7 @@ class TestDataFrameQuery(TestData):
|
|||||||
# Now create index
|
# Now create index
|
||||||
index_name = 'eland_test_query1'
|
index_name = 'eland_test_query1'
|
||||||
|
|
||||||
ed.pandas_to_es(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
|
ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
|
||||||
ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name)
|
|
||||||
|
|
||||||
assert_pandas_eland_frame_equal(pd_df, ed_df)
|
assert_pandas_eland_frame_equal(pd_df, ed_df)
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@ import numpy as np
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
import eland as ed
|
import eland as ed
|
||||||
from eland.tests.common import ELASTICSEARCH_HOST
|
from eland.tests.common import ELASTICSEARCH_HOST, assert_pandas_eland_frame_equal
|
||||||
from eland.tests.common import TestData
|
from eland.tests.common import TestData
|
||||||
|
|
||||||
|
|
||||||
@ -36,9 +36,7 @@ class TestDataFrameUtils(TestData):
|
|||||||
# Now create index
|
# Now create index
|
||||||
index_name = 'eland_test_generate_es_mappings'
|
index_name = 'eland_test_generate_es_mappings'
|
||||||
|
|
||||||
ed.pandas_to_es(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
|
ed_df = ed.pd_to_ed(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
|
||||||
|
|
||||||
ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name)
|
|
||||||
ed_df_head = ed_df.head()
|
ed_df_head = ed_df.head()
|
||||||
|
|
||||||
# assert_frame_equal(df, ed_df_head)
|
assert_pandas_eland_frame_equal(df, ed_df_head)
|
||||||
|
@ -7144,7 +7144,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.6.8"
|
"version": "3.7.4"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
109
eland/utils.py
109
eland/utils.py
@ -2,44 +2,71 @@ from eland import Client
|
|||||||
from eland import DataFrame
|
from eland import DataFrame
|
||||||
from eland import Mappings
|
from eland import Mappings
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
def read_es(es_params, index_pattern):
|
def read_es(es_params, index_pattern):
|
||||||
return DataFrame(client=es_params, index_pattern=index_pattern)
|
|
||||||
|
|
||||||
|
|
||||||
def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
|
|
||||||
geo_points=None):
|
|
||||||
"""
|
"""
|
||||||
Append a pandas DataFrame to an Elasticsearch index.
|
Utility method to create an eland.DataFrame from an Elasticsearch index_pattern.
|
||||||
Mainly used in testing.
|
(Similar to pandas.read_csv, but source data is an Elasticsearch index rather than
|
||||||
|
a csv file)
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
es_params : Elasticsearch client argument
|
es_params: Elasticsearch client argument(s)
|
||||||
elasticsearch-py parameters or
|
- elasticsearch-py parameters or
|
||||||
elasticsearch-py instance or
|
- elasticsearch-py instance or
|
||||||
eland.Client instance
|
- eland.Client instance
|
||||||
|
index_pattern: str
|
||||||
|
Elasticsearch index pattern
|
||||||
|
|
||||||
destination_index : str
|
Returns
|
||||||
Name of Elasticsearch index to be written
|
-------
|
||||||
|
eland.DataFrame
|
||||||
|
|
||||||
if_exists : str, default 'fail'
|
See Also
|
||||||
Behavior when the destination index exists. Value can be one of:
|
--------
|
||||||
``'fail'``
|
eland.pd_to_ed: Create an eland.DataFrame from pandas.DataFrame
|
||||||
If table exists, do nothing.
|
eland.ed_to_pd: Create a pandas.DataFrame from eland.DataFrame
|
||||||
``'replace'``
|
"""
|
||||||
If table exists, drop it, recreate it, and insert data.
|
return DataFrame(client=es_params, index_pattern=index_pattern)
|
||||||
``'append'``
|
|
||||||
If table exists, insert data. Create if does not exist.
|
|
||||||
|
|
||||||
dropna : bool
|
def pd_to_ed(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
|
||||||
``'True'``
|
geo_points=None):
|
||||||
Remove missing values (see pandas.Series.dropna)
|
"""
|
||||||
``'False;``
|
Append a pandas DataFrame to an Elasticsearch index.
|
||||||
Include missing values - may cause bulk to fail
|
Mainly used in testing.
|
||||||
|
Modifies the elasticsearch destination index
|
||||||
|
|
||||||
geo_points : list or None
|
Parameters
|
||||||
|
----------
|
||||||
|
es_params: Elasticsearch client argument(s)
|
||||||
|
- elasticsearch-py parameters or
|
||||||
|
- elasticsearch-py instance or
|
||||||
|
- eland.Client instance
|
||||||
|
destination_index: str
|
||||||
|
Name of Elasticsearch index to be appended to
|
||||||
|
if_exists : {'fail', 'replace', 'append'}, default 'fail'
|
||||||
|
How to behave if the index already exists.
|
||||||
|
|
||||||
|
- fail: Raise a ValueError.
|
||||||
|
- replace: Delete the index before inserting new values.
|
||||||
|
- append: Insert new values to the existing index. Create it if it does not exist.
|
||||||
|
dropna: bool, default False
|
||||||
|
* True: Remove missing values (see pandas.Series.dropna)
|
||||||
|
* False: Include missing values - may cause bulk to fail
|
||||||
|
geo_points: list, default None
|
||||||
List of columns to map to geo_point data type
|
List of columns to map to geo_point data type
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
eland.DataFrame
|
||||||
|
eland.DataFrame referencing data in destination_index
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
eland.read_es: Create an eland.DataFrame from an Elasticsearch index
|
||||||
|
eland.ed_to_pd: Create a pandas.DataFrame from eland.DataFrame
|
||||||
"""
|
"""
|
||||||
client = Client(es_params)
|
client = Client(es_params)
|
||||||
|
|
||||||
@ -86,3 +113,31 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
|
|||||||
actions = []
|
actions = []
|
||||||
|
|
||||||
client.bulk(actions, refresh=refresh)
|
client.bulk(actions, refresh=refresh)
|
||||||
|
|
||||||
|
ed_df = DataFrame(client, destination_index)
|
||||||
|
|
||||||
|
return ed_df
|
||||||
|
|
||||||
|
def ed_to_pd(ed_df):
|
||||||
|
"""
|
||||||
|
Convert an eland.DataFrame to a pandas.DataFrame
|
||||||
|
|
||||||
|
**Note: this loads the entire Elasticsearch index into in-core pandas.DataFrame structures. For large
|
||||||
|
indices this can create significant load on the Elasticsearch cluster and require significant memory**
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
ed_df: eland.DataFrame
|
||||||
|
The source eland.DataFrame referencing the Elasticsearch index
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pandas.DataFrame
|
||||||
|
pandas.DataFrame contains all rows and columns in eland.DataFrame
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
eland.read_es: Create an eland.DataFrame from an Elasticsearch index
|
||||||
|
eland.pd_to_ed: Create an eland.DataFrame from pandas.DataFrame
|
||||||
|
"""
|
||||||
|
return ed_df._to_pandas()
|
||||||
|
9
make_docs.sh
Normal file
9
make_docs.sh
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
python setup.py install
|
||||||
|
|
||||||
|
cd docs
|
||||||
|
|
||||||
|
make clean
|
||||||
|
make html
|
||||||
|
|
@ -1,3 +1,6 @@
|
|||||||
elasticsearch>=7.0.5
|
elasticsearch>=7.0.5
|
||||||
pandas==0.25.1
|
pandas==0.25.1
|
||||||
|
matplotlib
|
||||||
pytest>=5.2.1
|
pytest>=5.2.1
|
||||||
|
sphinx_rtd_theme
|
||||||
|
numpydoc==0.8
|
||||||
|
Loading…
x
Reference in New Issue
Block a user