More doc updates.

This commit is contained in:
Stephen Dodson 2019-11-13 18:23:43 +00:00
parent d8c1e18161
commit dff49d01fe
27 changed files with 518 additions and 144 deletions

View File

@ -40,7 +40,10 @@ release = '0.1'
extensions = [
'sphinx.ext.autodoc',
"sphinx.ext.doctest",
'numpydoc'
"sphinx.ext.extlinks",
'numpydoc',
"matplotlib.sphinxext.plot_directive",
"sphinx.ext.todo",
]
doctest_global_setup = '''
@ -54,7 +57,18 @@ except ImportError:
pd = None
'''
extlinks = {'pandas_docs': ('https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/%s.html', '')}
numpydoc_attributes_as_param_list = False
numpydoc_show_class_members = False
# matplotlib plot directive
plot_include_source = True
plot_formats = [("png", 90)]
plot_html_show_formats = False
plot_html_show_source_link = False
plot_pre_code = """import numpy as np
import eland as ed"""
# Add any paths that contain templates here, relative to this directory.

View File

@ -0,0 +1,6 @@
eland.DataFrame.agg
===================
.. currentmodule:: eland
.. automethod:: DataFrame.agg

View File

@ -0,0 +1,6 @@
eland.DataFrame.aggregate
=========================
.. currentmodule:: eland
.. automethod:: DataFrame.aggregate

View File

@ -0,0 +1,6 @@
eland.DataFrame.count
=====================
.. currentmodule:: eland
.. automethod:: DataFrame.count

View File

@ -0,0 +1,6 @@
eland.DataFrame.describe
========================
.. currentmodule:: eland
.. automethod:: DataFrame.describe

View File

@ -0,0 +1,6 @@
eland.DataFrame.drop
====================
.. currentmodule:: eland
.. automethod:: DataFrame.drop

View File

@ -0,0 +1,6 @@
eland.DataFrame.dtypes
======================
.. currentmodule:: eland
.. autoattribute:: DataFrame.dtypes

View File

@ -0,0 +1,6 @@
eland.DataFrame.empty
=====================
.. currentmodule:: eland
.. autoattribute:: DataFrame.empty

View File

@ -0,0 +1,6 @@
eland.DataFrame.get
===================
.. currentmodule:: eland
.. automethod:: DataFrame.get

View File

@ -0,0 +1,6 @@
eland.DataFrame.hist
====================
.. currentmodule:: eland
.. automethod:: DataFrame.hist

View File

@ -0,0 +1,6 @@
eland.DataFrame.info
====================
.. currentmodule:: eland
.. automethod:: DataFrame.info

View File

@ -0,0 +1,6 @@
eland.DataFrame.select_dtypes
=============================
.. currentmodule:: eland
.. automethod:: DataFrame.select_dtypes

View File

@ -0,0 +1,6 @@
eland.Index
===========
.. currentmodule:: eland
.. autoclass:: Index

View File

@ -21,6 +21,9 @@ Attributes and underlying data
DataFrame.index
DataFrame.columns
DataFrame.dtypes
DataFrame.select_dtypes
DataFrame.empty
Indexing, iteration
~~~~~~~~~~~~~~~~~~~
@ -29,7 +32,45 @@ Indexing, iteration
DataFrame.head
DataFrame.tail
DataFrame.get
Function application, GroupBy & window
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
DataFrame.agg
DataFrame.aggregate
.. _api.dataframe.stats:
Computations / descriptive stats
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
DataFrame.count
DataFrame.describe
DataFrame.info
Reindexing / selection / label manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
DataFrame.drop
Plotting
~~~~~~~~
.. autosummary::
:toctree: api/
DataFrame.hist
Serialization / IO / conversion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
DataFrame.info

View File

@ -12,3 +12,4 @@ methods. All classes and functions exposed in ``eland.*`` namespace are public.
general_utility_functions
dataframe
indexing

View File

@ -0,0 +1,15 @@
.. _api.index:
=====
Index
=====
.. currentmodule:: eland
**Many of these methods or variants thereof are available on the objects
that contain an index (Series/DataFrame) and those should most likely be
used before calling these methods directly.**
.. autosummary::
:toctree: api/
Index

17
eland/conftest.py Normal file
View File

@ -0,0 +1,17 @@
import pytest
import numpy as np
import pandas as pd
import eland as ed
# Fix console size for consistent test results
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 5)
pd.set_option('display.width', 100)
@pytest.fixture(autouse=True)
def add_imports(doctest_namespace):
    """Inject the ``np``, ``pd`` and ``ed`` module aliases into the doctest namespace.

    The fixture is ``autouse``, so it is applied without being requested
    explicitly and docstring examples can reference these names directly.
    """
    doctest_namespace.update({"np": np, "pd": pd, "ed": ed})

View File

@ -8,7 +8,6 @@ import six
from pandas.core.common import apply_if_callable, is_bool_indexer
from pandas.core.dtypes.common import is_list_like
from pandas.core.indexing import check_bool_indexer
from pandas.io.common import _expand_user, _stringify_path
from pandas.io.formats import console
from pandas.io.formats import format as fmt
@ -19,6 +18,7 @@ from eland import NDFrame
from eland import Series
from eland.filter import BooleanFilter, ScriptFilter
class DataFrame(NDFrame):
"""
Two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes
@ -39,21 +39,26 @@ class DataFrame(NDFrame):
index_field: str, optional
The Elasticsearch index field to use as the DataFrame index. Defaults to _id if None is used.
See Also
--------
:pandas_docs:`pandas.DataFrame`
Examples
--------
Constructing DataFrame from Elasticsearch configuration arguments and an Elasticsearch index
>>> df = ed.DataFrame('localhost:9200', 'flights')
>>> df.head()
AvgTicketPrice Cancelled Carrier Dest ... OriginRegion OriginWeather dayOfWeek timestamp
0 841.265642 False Kibana Airlines Sydney Kingsford Smith International Airport ... DE-HE Sunny 0 2018-01-01 00:00:00
1 882.982662 False Logstash Airways Venice Marco Polo Airport ... SE-BD Clear 0 2018-01-01 18:27:00
2 190.636904 False Logstash Airways Venice Marco Polo Airport ... IT-34 Rain 0 2018-01-01 17:11:14
3 181.694216 True Kibana Airlines Treviso-Sant'Angelo Airport ... IT-72 Thunder & Lightning 0 2018-01-01 10:33:28
4 730.041778 False Kibana Airlines Xi'an Xianyang International Airport ... MX-DIF Damaging Wind 0 2018-01-01 05:13:00
AvgTicketPrice Cancelled ... dayOfWeek timestamp
0 841.265642 False ... 0 2018-01-01 00:00:00
1 882.982662 False ... 0 2018-01-01 18:27:00
2 190.636904 False ... 0 2018-01-01 17:11:14
3 181.694216 True ... 0 2018-01-01 10:33:28
4 730.041778 False ... 0 2018-01-01 05:13:00
<BLANKLINE>
[5 rows x 27 columns]
Constructing DataFrame from an Elasticsearch client and an Elasticsearch index
>>> from elasticsearch import Elasticsearch
@ -82,6 +87,7 @@ class DataFrame(NDFrame):
<BLANKLINE>
[5 rows x 2 columns]
"""
def __init__(self,
client=None,
index_pattern=None,
@ -115,18 +121,21 @@ class DataFrame(NDFrame):
-------
Elasticsearch field names as pandas.Index
See Also
--------
:pandas_docs:`pandas.DataFrame.columns`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> assert isinstance(df.columns, pd.Index)
>>> df.columns
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID',
... 'DestCityName', 'DestCountry', 'DestLocation', 'DestRegion',
... 'DestWeather', 'DistanceKilometers', 'DistanceMiles', 'FlightDelay',
... 'FlightDelayMin', 'FlightDelayType', 'FlightNum', 'FlightTimeHour',
... 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
... 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather',
... 'dayOfWeek', 'timestamp'],
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
... 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
... 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
... 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
... 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
... 'timestamp'],
... dtype='object')
"""
return self._query_compiler.columns
@ -137,9 +146,20 @@ class DataFrame(NDFrame):
def empty(self):
"""Determines if the DataFrame is empty.
Returns:
True if the DataFrame is empty.
False otherwise.
Returns
-------
bool
If DataFrame is empty, return True, if not return False.
See Also
--------
:pandas_docs:`pandas.DataFrame.empty`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.empty
False
"""
return len(self.columns) == 0 or len(self.index) == 0
@ -161,6 +181,10 @@ class DataFrame(NDFrame):
eland.DataFrame
eland DataFrame filtered on first n rows sorted by index field
See Also
--------
:pandas_docs:`pandas.DataFrame.head`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest'])
@ -192,6 +216,10 @@ class DataFrame(NDFrame):
eland.DataFrame:
eland DataFrame filtered on last n rows sorted by index field
See Also
--------
:pandas_docs:`pandas.DataFrame.tail`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest'])
@ -257,20 +285,45 @@ class DataFrame(NDFrame):
def count(self):
"""
Count non-NA cells for each column (TODO row)
Count non-NA cells for each column.
Counts are based on exists queries against ES
Counts are based on exists queries against ES.
This is inefficient, as it creates N queries (N is number of fields).
An alternative approach is to use value_count aggregations. However, they have issues in that:
1. They can only be used with aggregatable fields (e.g. keyword not text)
2. For list fields they return multiple counts. E.g. tags=['elastic', 'ml'] returns value_count=2
for a single document.
- They can only be used with aggregatable fields (e.g. keyword not text)
- For list fields they return multiple counts. E.g. tags=['elastic', 'ml'] returns value_count=2 for a single document.
TODO - add additional pandas.DataFrame.count features
Returns
-------
pandas.Series:
Summary of column counts
See Also
--------
:pandas_docs:`pandas.DataFrame.count`
Examples
--------
>>> df = ed.DataFrame('localhost', 'ecommerce', columns=['customer_first_name', 'geoip.city_name'])
>>> df.count()
customer_first_name 4675
geoip.city_name 4094
dtype: int64
"""
return self._query_compiler.count()
def info_es(self):
"""
Returns
-------
None
This method prints a debug summary of the task list Elasticsearch
"""
buf = StringIO()
super()._info_es(buf)
@ -297,9 +350,25 @@ class DataFrame(NDFrame):
This method prints information about a DataFrame including
the index dtype and column dtypes, non-null values and memory usage.
See :pandas_docs:`pandas.DataFrame.info` for full details.
Notes
-----
This copies a lot of code from pandas.DataFrame.info, as it is difficult
to split out the appropriate code, and creating a SparseDataFrame instead gives
incorrect results on types and counts.
Examples
--------
>>> df = ed.DataFrame('localhost', 'ecommerce', columns=['customer_first_name', 'geoip.city_name'])
>>> df.info()
<class 'eland.dataframe.DataFrame'>
Index: 4675 entries, 0 to 4674
Data columns (total 2 columns):
customer_first_name 4675 non-null object
geoip.city_name 4094 non-null object
dtypes: object(2)
memory usage: 96.0 bytes
"""
if buf is None: # pragma: no cover
buf = sys.stdout
@ -386,7 +455,7 @@ class DataFrame(NDFrame):
else:
_verbose_repr()
counts = self.get_dtype_counts()
counts = self.dtypes.value_counts()
dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k
in sorted(counts.items())]
lines.append('dtypes: {types}'.format(types=', '.join(dtypes)))
@ -623,7 +692,11 @@ class DataFrame(NDFrame):
)
def select_dtypes(self, include=None, exclude=None):
# get empty df
"""
Return a subset of the DataFrame's columns based on the column dtypes.
Compatible with :pandas_docs:`pandas.DataFrame.select_dtypes`
"""
empty_df = self._empty_pd_df()
empty_df = empty_df.select_dtypes(include=include, exclude=exclude)
@ -649,19 +722,13 @@ class DataFrame(NDFrame):
def keys(self):
return self.columns
def groupby(self, by=None, axis=0, *args, **kwargs):
axis = pd.DataFrame._get_axis_number(axis)
if axis == 1:
raise NotImplementedError("Aggregating via index not currently implemented - needs index transform")
def aggregate(self, func, axis=0, *args, **kwargs):
"""
Aggregate using one or more operations over the specified axis.
Parameters
----------
func : function, str, list or dict
func: function, str, list or dict
Function to use for aggregating the data. If a function, must either
work when passed a %(klass)s or when passed to %(klass)s.apply.
@ -671,11 +738,15 @@ class DataFrame(NDFrame):
- string function name
- list of functions and/or function names, e.g. ``[np.sum, 'mean']``
- dict of axis labels -> functions, function names or list of such.
Currently, we only support ``['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile',
'rank', 'sem', 'skew', 'sum', 'std', 'var']``
axis
Currently, we only support axis=0 (index)
*args
Positional arguments to pass to `func`.
Positional arguments to pass to `func`
**kwargs
Keyword arguments to pass to `func`.
Keyword arguments to pass to `func`
Returns
-------
@ -684,6 +755,19 @@ class DataFrame(NDFrame):
if DataFrame.agg is called with several functions, returns a DataFrame
if Series.agg is called with single function, returns a scalar
if Series.agg is called with several functions, returns a Series
See Also
--------
:pandas_docs:`pandas.DataFrame.aggregate`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std'])
DistanceKilometers AvgTicketPrice
sum 9.261629e+07 8.204365e+06
min 0.000000e+00 1.000205e+02
std 4.578263e+03 2.663867e+02
"""
axis = pd.DataFrame._get_axis_number(axis)
@ -722,16 +806,38 @@ class DataFrame(NDFrame):
raise NotImplementedError(expr, type(expr))
def get(self, key, default=None):
"""Get item from object for given key (DataFrame column, Panel
slice, etc.). Returns default value if not found.
"""
Get item from object for given key (ex: DataFrame column).
Returns default value if not found.
Args:
key (DataFrame column, Panel slice) : the key for which value
to get
Parameters
----------
key: object
Returns:
value (type of items contained in object) : A value that is
stored at the key
Returns
-------
value: same type as items contained in object
See Also
--------
:pandas_docs:`pandas.DataFrame.get`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.get('Carrier')
0 Kibana Airlines
1 Logstash Airways
2 Logstash Airways
3 Kibana Airlines
4 Kibana Airlines
...
13054 Logstash Airways
13055 Logstash Airways
13056 Logstash Airways
13057 JetBeats
13058 JetBeats
Name: Carrier, Length: 13059, dtype: object
"""
if key in self.keys():
return self._getitem(key)

View File

@ -1,27 +1,23 @@
"""
class Index
The index for an eland.DataFrame.
Currently, the index is a field that exists in every document in an Elasticsearch index.
For slicing and sorting operations it must be a docvalues field. By default _id is used,
which can't be used for range queries and is inefficient for sorting:
https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-id-field.html
(The value of the _id field is also accessible in aggregations or for sorting,
but doing so is discouraged as it requires to load a lot of data in memory.
In case sorting or aggregating on the _id field is required, it is advised to duplicate
the content of the _id field in another field that has doc_values enabled.)
"""
class Index:
"""
The index for an eland.DataFrame.
TODO - This currently has very different behaviour than pandas.Index
Currently, the index is a field that exists in every document in an Elasticsearch index.
For slicing and sorting operations it must be a docvalues field. By default _id is used,
which can't be used for range queries and is inefficient for sorting:
https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-id-field.html
(The value of the _id field is also accessible in aggregations or for sorting,
but doing so is discouraged as it requires to load a lot of data in memory.
In case sorting or aggregating on the _id field is required, it is advised to duplicate
the content of the _id field in another field that has doc_values enabled.)
"""
ID_INDEX_FIELD = '_id'
ID_SORT_FIELD = '_doc' # if index field is _id, sort by _doc
def __init__(self, query_compiler, index_field=None):
# Calls setter
self.index_field = index_field
self._query_compiler = query_compiler

View File

@ -420,13 +420,13 @@ class Mappings:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))].loc[
columns].index.tolist()
(self._mappings_capabilities.pd_dtype == 'bool'))].reindex(
columns).index.tolist()
else:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64'))].loc[
columns].index.tolist()
(self._mappings_capabilities.pd_dtype == 'float64'))].reindex(
columns).index.tolist()
else:
if include_bool == True:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
@ -469,26 +469,6 @@ class Mappings:
return pd.Series(self._source_field_pd_dtypes)
def get_dtype_counts(self, columns=None):
"""
Return counts of unique dtypes in this object.
Returns
-------
get_dtype_counts : Series
Series with the count of columns with each dtype.
"""
if columns is not None:
return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True]
.loc[columns]
.groupby('pd_dtype')['_source']
.count().to_dict())
return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True]
.groupby('pd_dtype')['_source']
.count().to_dict())
def info_es(self, buf):
buf.write("Mappings:\n")
buf.write("\tcapabilities: {0}\n".format(self._mappings_capabilities))

View File

@ -57,10 +57,23 @@ class NDFrame:
def _get_index(self):
"""
Return eland index referencing Elasticsearch field to index a DataFrame/Series
Returns
-------
eland.Index:
Note eland.Index has a very limited API compared to pandas.Index
See Also
--------
:pandas_docs:`pandas.DataFrame.index`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> assert isinstance(df.index, ed.Index)
>>> df.index.index_field
'_id'
"""
return self._query_compiler.index
@ -68,10 +81,30 @@ class NDFrame:
@property
def dtypes(self):
return self._query_compiler.dtypes
"""
Return the pandas dtypes in the DataFrame. Elasticsearch types are mapped
to pandas dtypes via Mappings._es_dtype_to_pd_dtype.__doc__
def get_dtype_counts(self):
return self._query_compiler.get_dtype_counts()
Returns
-------
pandas.Series
The data type of each column.
See Also
--------
:pandas_docs:`pandas.DataFrame.dtypes`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'AvgTicketPrice', 'timestamp', 'dayOfWeek'])
>>> df.dtypes
Origin object
AvgTicketPrice float64
timestamp datetime64[ns]
dayOfWeek int64
dtype: object
"""
return self._query_compiler.dtypes
def _build_repr_df(self, num_rows, num_cols):
# Overridden version of BasePandasDataset._build_repr_df
@ -134,21 +167,71 @@ class NDFrame:
errors="raise",
):
"""Return new object with labels in requested axis removed.
Args:
labels: Index or column labels to drop.
axis: Whether to drop labels from the index (0 / 'index') or
columns (1 / 'columns').
index, columns: Alternative to specifying axis (labels, axis=1 is
equivalent to columns=labels).
level: For MultiIndex
inplace: If True, do operation inplace and return None.
errors: If 'ignore', suppress error and existing labels are
dropped.
Returns:
dropped : type of caller
(derived from modin.base.BasePandasDataset)
Parameters
----------
labels:
Index or column labels to drop.
axis:
Whether to drop labels from the index (0 / 'index') or columns (1 / 'columns').
index, columns:
Alternative to specifying axis (labels, axis=1 is equivalent to columns=labels).
level:
For MultiIndex - not supported
inplace:
If True, do operation inplace and return None.
errors:
If 'ignore', suppress error and existing labels are dropped.
Returns
-------
dropped:
type of caller
See Also
--------
:pandas_docs:`pandas.DataFrame.drop`
Examples
--------
Drop a column
>>> df = ed.DataFrame('localhost', 'ecommerce', columns=['customer_first_name', 'email', 'user'])
>>> df.drop(columns=['user'])
customer_first_name email
0 Eddie eddie@underwood-family.zzz
1 Mary mary@bailey-family.zzz
2 Gwen gwen@butler-family.zzz
3 Diane diane@chandler-family.zzz
4 Eddie eddie@weber-family.zzz
... ... ...
4670 Mary mary@lambert-family.zzz
4671 Jim jim@gilbert-family.zzz
4672 Yahya yahya@rivera-family.zzz
4673 Mary mary@hampton-family.zzz
4674 Jackson jackson@hopkins-family.zzz
<BLANKLINE>
[4675 rows x 2 columns]
Drop rows by index value (axis=0)
>>> df.drop(['1', '2'])
customer_first_name email user
0 Eddie eddie@underwood-family.zzz eddie
3 Diane diane@chandler-family.zzz diane
4 Eddie eddie@weber-family.zzz eddie
5 Diane diane@goodwin-family.zzz diane
6 Oliver oliver@rios-family.zzz oliver
... ... ... ...
4670 Mary mary@lambert-family.zzz mary
4671 Jim jim@gilbert-family.zzz jim
4672 Yahya yahya@rivera-family.zzz yahya
4673 Mary mary@hampton-family.zzz mary
4674 Jackson jackson@hopkins-family.zzz jackson
<BLANKLINE>
[4673 rows x 3 columns]
"""
#(derived from modin.base.BasePandasDataset)
# Level not supported
if level is not None:
raise NotImplementedError("level not supported {}".format(level))
@ -242,4 +325,36 @@ class NDFrame:
return self._query_compiler._hist(num_bins)
def describe(self):
"""
Generate descriptive statistics that summarize the central tendency, dispersion and shape of a
dataset's distribution, excluding NaN values.
Analyzes both numeric and object series, as well as DataFrame column sets of mixed data types.
The output will vary depending on what is provided. Refer to the notes below for more detail.
TODO - add additional arguments (currently only numeric values are supported)
Returns
-------
pandas.DataFrame:
Summary information
See Also
--------
:pandas_docs:`pandas.DataFrame.describe`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelay'])
>>> df.describe() # ignoring percentiles as they don't generate consistent results
AvgTicketPrice FlightDelay
count 13059.000000 13059.000000
mean 628.253689 0.251168
std 266.386661 0.433685
min 100.020531 0.000000
...
...
...
max 1199.729004 1.000000
"""
return self._query_compiler.describe()

View File

@ -10,36 +10,42 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
sharey=False, figsize=None, layout=None, bins=10, **kwds):
"""
Derived from pandas.plotting._core.hist_frame 0.24.2 - TODO update to 0.25.1
See :pandas_docs:`pandas.DataFrame.hist` for usage.
Ideally, we'd call hist_frame directly with histogram data,
Notes
-----
Derived from ``pandas.plotting._core.hist_frame 0.24.2`` - TODO update to ``0.25.1``
Ideally, we'd call `hist_frame` directly with histogram data,
but weights are applied to ALL series. For example, we can
plot a histogram of pre-binned data via:
.. code-block:: python
counts, bins = np.histogram(data)
plt.hist(bins[:-1], bins, weights=counts)
However,
.. code-block:: python
ax.hist(data[col].dropna().values, bins=bins, **kwds)
is for [col] and weights are a single array.
is for ``[col]`` and weights are a single array.
We therefore cut/paste code.
Examples
--------
.. plot::
:context: close-figs
>>> df = ed.DataFrame('localhost', 'flights')
>>> hist = df.select_dtypes(include=[np.number]).hist(figsize=[10,10])
"""
# Start with empty pandas data frame derived from
ed_df_bins, ed_df_weights = ed_df._hist(num_bins=bins)
if by is not None:
raise NotImplementedError("TODO")
"""
axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid,
figsize=figsize, sharex=sharex, sharey=sharey,
layout=layout, bins=bins, xlabelsize=xlabelsize,
xrot=xrot, ylabelsize=ylabelsize,
yrot=yrot, **kwds)
"""
return axes
if column is not None:
if not isinstance(column, (list, np.ndarray, ABCIndexClass)):

View File

@ -84,11 +84,6 @@ class ElandQueryCompiler:
return self._mappings.dtypes(columns)
def get_dtype_counts(self):
columns = self._operations.get_columns()
return self._mappings.get_dtype_counts(columns)
# END Index, columns, and dtypes objects
def _es_results_to_pandas(self, results, batch_size=None):

View File

@ -150,7 +150,7 @@ class Series(NDFrame):
)
def _to_pandas(self):
return self._query_compiler._to_pandas()[self.name]
return self._query_compiler.to_pandas()[self.name]
def __gt__(self, other):
if isinstance(other, Series):

View File

@ -4,6 +4,7 @@ from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
import pandas as pd
class TestDataFrameCount(TestData):

View File

@ -24,22 +24,3 @@ class TestMappingsDtypes(TestData):
ed_dtypes = ed_flights._query_compiler._mappings.dtypes(columns=['Carrier', 'AvgTicketPrice', 'Cancelled'])
assert_series_equal(pd_dtypes, ed_dtypes)
def test_flights_get_dtype_counts_all(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
pd_dtypes = pd_flights.get_dtype_counts().sort_index()
ed_dtypes = ed_flights._query_compiler._mappings.get_dtype_counts().sort_index()
assert_series_equal(pd_dtypes, ed_dtypes)
def test_flights_get_dtype_counts_columns(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]
pd_dtypes = pd_flights.get_dtype_counts().sort_index()
ed_dtypes = ed_flights._query_compiler._mappings. \
get_dtype_counts(columns=['Carrier', 'AvgTicketPrice', 'Cancelled']).sort_index()
assert_series_equal(pd_dtypes, ed_dtypes)

View File

@ -141,3 +141,37 @@ def ed_to_pd(ed_df):
eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame
"""
return ed_df._to_pandas()
def _inherit_docstrings(parent, excluded=[]):
"""Creates a decorator which overwrites a decorated class' __doc__
attribute with parent's __doc__ attribute. Also overwrites __doc__ of
methods and properties defined in the class with the __doc__ of matching
methods and properties in parent.
Args:
parent (object): Class from which the decorated class inherits __doc__.
excluded (list): List of parent objects from which the class does not
inherit docstrings.
Returns:
function: decorator which replaces the decorated class' documentation
parent's documentation.
"""
def decorator(cls):
if parent not in excluded:
cls.__doc__ = parent.__doc__
for attr, obj in cls.__dict__.items():
parent_obj = getattr(parent, attr, None)
if parent_obj in excluded or (
not callable(parent_obj) and not isinstance(parent_obj, property)
):
continue
if callable(obj):
obj.__doc__ = parent_obj.__doc__
elif isinstance(obj, property) and obj.fget is not None:
p = property(obj.fget, obj.fset, obj.fdel, parent_obj.__doc__)
setattr(cls, attr, p)
return cls
return decorator