diff --git a/docs/source/conf.py b/docs/source/conf.py index f37c4d8..9eba27d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -40,7 +40,10 @@ release = '0.1' extensions = [ 'sphinx.ext.autodoc', "sphinx.ext.doctest", - 'numpydoc' + "sphinx.ext.extlinks", + 'numpydoc', + "matplotlib.sphinxext.plot_directive", + "sphinx.ext.todo", ] doctest_global_setup = ''' @@ -54,7 +57,18 @@ except ImportError: pd = None ''' +extlinks = {'pandas_docs': ('https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/%s.html', '')} + numpydoc_attributes_as_param_list = False +numpydoc_show_class_members = False + +# matplotlib plot directive +plot_include_source = True +plot_formats = [("png", 90)] +plot_html_show_formats = False +plot_html_show_source_link = False +plot_pre_code = """import numpy as np +import eland as ed""" # Add any paths that contain templates here, relative to this directory. diff --git a/docs/source/reference/api/eland.DataFrame.agg.rst b/docs/source/reference/api/eland.DataFrame.agg.rst new file mode 100644 index 0000000..ef8c092 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.agg.rst @@ -0,0 +1,6 @@ +eland.DataFrame.agg +=================== + +.. currentmodule:: eland + +.. automethod:: DataFrame.agg diff --git a/docs/source/reference/api/eland.DataFrame.aggregate.rst b/docs/source/reference/api/eland.DataFrame.aggregate.rst new file mode 100644 index 0000000..2e3468f --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.aggregate.rst @@ -0,0 +1,6 @@ +eland.DataFrame.aggregate +========================= + +.. currentmodule:: eland + +.. automethod:: DataFrame.aggregate diff --git a/docs/source/reference/api/eland.DataFrame.count.rst b/docs/source/reference/api/eland.DataFrame.count.rst new file mode 100644 index 0000000..a2d74fd --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.count.rst @@ -0,0 +1,6 @@ +eland.DataFrame.count +===================== + +.. currentmodule:: eland + +.. 
automethod:: DataFrame.count diff --git a/docs/source/reference/api/eland.DataFrame.describe.rst b/docs/source/reference/api/eland.DataFrame.describe.rst new file mode 100644 index 0000000..41a5d0c --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.describe.rst @@ -0,0 +1,6 @@ +eland.DataFrame.describe +======================== + +.. currentmodule:: eland + +.. automethod:: DataFrame.describe diff --git a/docs/source/reference/api/eland.DataFrame.drop.rst b/docs/source/reference/api/eland.DataFrame.drop.rst new file mode 100644 index 0000000..a01d5ce --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.drop.rst @@ -0,0 +1,6 @@ +eland.DataFrame.drop +==================== + +.. currentmodule:: eland + +.. automethod:: DataFrame.drop diff --git a/docs/source/reference/api/eland.DataFrame.dtypes.rst b/docs/source/reference/api/eland.DataFrame.dtypes.rst new file mode 100644 index 0000000..6ec2883 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.dtypes.rst @@ -0,0 +1,6 @@ +eland.DataFrame.dtypes +====================== + +.. currentmodule:: eland + +.. autoattribute:: DataFrame.dtypes diff --git a/docs/source/reference/api/eland.DataFrame.empty.rst b/docs/source/reference/api/eland.DataFrame.empty.rst new file mode 100644 index 0000000..e693934 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.empty.rst @@ -0,0 +1,6 @@ +eland.DataFrame.empty +===================== + +.. currentmodule:: eland + +.. autoattribute:: DataFrame.empty diff --git a/docs/source/reference/api/eland.DataFrame.get.rst b/docs/source/reference/api/eland.DataFrame.get.rst new file mode 100644 index 0000000..dc069ad --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.get.rst @@ -0,0 +1,6 @@ +eland.DataFrame.get +=================== + +.. currentmodule:: eland + +.. 
automethod:: DataFrame.get diff --git a/docs/source/reference/api/eland.DataFrame.hist.rst b/docs/source/reference/api/eland.DataFrame.hist.rst new file mode 100644 index 0000000..73c478c --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.hist.rst @@ -0,0 +1,6 @@ +eland.DataFrame.hist +==================== + +.. currentmodule:: eland + +.. automethod:: DataFrame.hist diff --git a/docs/source/reference/api/eland.DataFrame.info.rst b/docs/source/reference/api/eland.DataFrame.info.rst new file mode 100644 index 0000000..452adf2 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.info.rst @@ -0,0 +1,6 @@ +eland.DataFrame.info +==================== + +.. currentmodule:: eland + +.. automethod:: DataFrame.info diff --git a/docs/source/reference/api/eland.DataFrame.select_dtypes.rst b/docs/source/reference/api/eland.DataFrame.select_dtypes.rst new file mode 100644 index 0000000..3a8272b --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.select_dtypes.rst @@ -0,0 +1,6 @@ +eland.DataFrame.select_dtypes +============================= + +.. currentmodule:: eland + +.. automethod:: DataFrame.select_dtypes diff --git a/docs/source/reference/api/eland.Index.rst b/docs/source/reference/api/eland.Index.rst new file mode 100644 index 0000000..20c53d8 --- /dev/null +++ b/docs/source/reference/api/eland.Index.rst @@ -0,0 +1,6 @@ +eland.Index +=========== + +.. currentmodule:: eland + +.. autoclass:: Index diff --git a/docs/source/reference/dataframe.rst b/docs/source/reference/dataframe.rst index f4510b3..aaed5a3 100644 --- a/docs/source/reference/dataframe.rst +++ b/docs/source/reference/dataframe.rst @@ -21,6 +21,9 @@ Attributes and underlying data DataFrame.index DataFrame.columns + DataFrame.dtypes + DataFrame.select_dtypes + DataFrame.empty Indexing, iteration ~~~~~~~~~~~~~~~~~~~ @@ -29,7 +32,45 @@ Indexing, iteration DataFrame.head DataFrame.tail + DataFrame.get +Function application, GroupBy & window +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: api/ + DataFrame.agg + DataFrame.aggregate +.. _api.dataframe.stats: + +Computations / descriptive stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.count + DataFrame.describe + DataFrame.info + +Reindexing / selection / label manipulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.drop + +Plotting +~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.hist + +Serialization / IO / conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.info diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 8f79abe..a623800 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -12,3 +12,4 @@ methods. All classes and functions exposed in ``eland.*`` namespace are public. general_utility_functions dataframe + indexing diff --git a/docs/source/reference/indexing.rst b/docs/source/reference/indexing.rst new file mode 100644 index 0000000..1824209 --- /dev/null +++ b/docs/source/reference/indexing.rst @@ -0,0 +1,15 @@ +.. _api.index: + +===== +Index +===== +.. currentmodule:: eland + +**Many of these methods or variants thereof are available on the objects +that contain an index (Series/DataFrame) and those should most likely be +used before calling these methods directly.** + +.. 
autosummary:: + :toctree: api/ + + Index diff --git a/eland/conftest.py b/eland/conftest.py new file mode 100644 index 0000000..ce62d3b --- /dev/null +++ b/eland/conftest.py @@ -0,0 +1,17 @@ +import pytest + +import numpy as np +import pandas as pd +import eland as ed + +# Fix console size for consistent test results +pd.set_option('display.max_rows', 10) +pd.set_option('display.max_columns', 5) +pd.set_option('display.width', 100) + +@pytest.fixture(autouse=True) +def add_imports(doctest_namespace): + doctest_namespace["np"] = np + doctest_namespace["pd"] = pd + doctest_namespace["ed"] = ed + diff --git a/eland/dataframe.py b/eland/dataframe.py index 516391a..bf476a9 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -8,7 +8,6 @@ import six from pandas.core.common import apply_if_callable, is_bool_indexer from pandas.core.dtypes.common import is_list_like from pandas.core.indexing import check_bool_indexer - from pandas.io.common import _expand_user, _stringify_path from pandas.io.formats import console from pandas.io.formats import format as fmt @@ -19,6 +18,7 @@ from eland import NDFrame from eland import Series from eland.filter import BooleanFilter, ScriptFilter + class DataFrame(NDFrame): """ Two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes @@ -39,21 +39,26 @@ class DataFrame(NDFrame): index_field: str, optional The Elasticsearch index field to use as the DataFrame index. Defaults to _id if None is used. + See Also + -------- + :pandas_docs:`pandas.DataFrame` + Examples -------- Constructing DataFrame from an Elasticsearch configuration arguments and an Elasticsearch index >>> df = ed.DataFrame('localhost:9200', 'flights') >>> df.head() - AvgTicketPrice Cancelled Carrier Dest ... 
DE-HE Sunny 0 2018-01-01 00:00:00 - 1 882.982662 False Logstash Airways Venice Marco Polo Airport ... SE-BD Clear 0 2018-01-01 18:27:00 - 2 190.636904 False Logstash Airways Venice Marco Polo Airport ... IT-34 Rain 0 2018-01-01 17:11:14 - 3 181.694216 True Kibana Airlines Treviso-Sant'Angelo Airport ... IT-72 Thunder & Lightning 0 2018-01-01 10:33:28 - 4 730.041778 False Kibana Airlines Xi'an Xianyang International Airport ... MX-DIF Damaging Wind 0 2018-01-01 05:13:00 + AvgTicketPrice Cancelled ... dayOfWeek timestamp + 0 841.265642 False ... 0 2018-01-01 00:00:00 + 1 882.982662 False ... 0 2018-01-01 18:27:00 + 2 190.636904 False ... 0 2018-01-01 17:11:14 + 3 181.694216 True ... 0 2018-01-01 10:33:28 + 4 730.041778 False ... 0 2018-01-01 05:13:00 [5 rows x 27 columns] + Constructing DataFrame from an Elasticsearch client and an Elasticsearch index >>> from elasticsearch import Elasticsearch @@ -82,6 +87,7 @@ class DataFrame(NDFrame): [5 rows x 2 columns] """ + def __init__(self, client=None, index_pattern=None, @@ -115,19 +121,22 @@ class DataFrame(NDFrame): ------- Elasticsearch field names as pandas.Index + See Also + -------- + :pandas_docs:`pandas.DataFrame.columns` + Examples -------- >>> df = ed.DataFrame('localhost', 'flights') >>> assert isinstance(df.columns, pd.Index) >>> df.columns - Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', - ... 'DestCityName', 'DestCountry', 'DestLocation', 'DestRegion', - ... 'DestWeather', 'DistanceKilometers', 'DistanceMiles', 'FlightDelay', - ... 'FlightDelayMin', 'FlightDelayType', 'FlightNum', 'FlightTimeHour', - ... 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName', - ... 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', - ... 'dayOfWeek', 'timestamp'], - ... dtype='object') + Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName', + ... 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers', + ... 
'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum', + ... 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName', + ... 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek', + ... 'timestamp'], + ... dtype='object') """ return self._query_compiler.columns @@ -137,9 +146,20 @@ class DataFrame(NDFrame): def empty(self): """Determines if the DataFrame is empty. - Returns: - True if the DataFrame is empty. - False otherwise. + Returns + ------- + bool + If DataFrame is empty, return True, if not return False. + + See Also + -------- + :pandas_docs:`pandas.DataFrame.empty` + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.empty + False """ return len(self.columns) == 0 or len(self.index) == 0 @@ -161,6 +181,10 @@ class DataFrame(NDFrame): eland.DataFrame eland DataFrame filtered on first n rows sorted by index field + See Also + -------- + :pandas_docs:`pandas.DataFrame.head` + Examples -------- >>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest']) @@ -192,6 +216,10 @@ class DataFrame(NDFrame): eland.DataFrame: eland DataFrame filtered on last n rows sorted by index field + See Also + -------- + :pandas_docs:`pandas.DataFrame.tail` + Examples -------- >>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest']) @@ -257,20 +285,45 @@ class DataFrame(NDFrame): def count(self): """ - Count non-NA cells for each column (TODO row) + Count non-NA cells for each column. - Counts are based on exists queries against ES + Counts are based on exists queries against ES. This is inefficient, as it creates N queries (N is number of fields). - An alternative approach is to use value_count aggregations. However, they have issues in that: - 1. They can only be used with aggregatable fields (e.g. keyword not text) - 2. For list fields they return multiple counts. E.g. tags=['elastic', 'ml'] returns value_count=2 - for a single document. 
+ + - They can only be used with aggregatable fields (e.g. keyword not text) + - For list fields they return multiple counts. E.g. tags=['elastic', 'ml'] returns value_count=2 for a single document. + + TODO - add additional pandas.DataFrame.count features + + Returns + ------- + pandas.Series: + Summary of column counts + + See Also + -------- + :pandas_docs:`pandas.DataFrame.count` + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'ecommerce', columns=['customer_first_name', 'geoip.city_name']) + >>> df.count() + customer_first_name 4675 + geoip.city_name 4094 + dtype: int64 """ return self._query_compiler.count() def info_es(self): + """ + + Returns + ------- + None + This method prints a debug summary of the task list Elasticsearch + """ buf = StringIO() super()._info_es(buf) @@ -297,9 +350,25 @@ class DataFrame(NDFrame): This method prints information about a DataFrame including the index dtype and column dtypes, non-null values and memory usage. + See :pandas_docs:`pandas.DataFrame.info` for full details. + + Notes + ----- This copies a lot of code from pandas.DataFrame.info as it is difficult to split out the appropriate code or creating a SparseDataFrame gives incorrect results on types and counts. 
+ + Examples + -------- + >>> df = ed.DataFrame('localhost', 'ecommerce', columns=['customer_first_name', 'geoip.city_name']) + >>> df.info() + + Index: 4675 entries, 0 to 4674 + Data columns (total 2 columns): + customer_first_name 4675 non-null object + geoip.city_name 4094 non-null object + dtypes: object(2) + memory usage: 96.0 bytes """ if buf is None: # pragma: no cover buf = sys.stdout @@ -386,7 +455,7 @@ class DataFrame(NDFrame): else: _verbose_repr() - counts = self.get_dtype_counts() + counts = self.dtypes.value_counts() dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k in sorted(counts.items())] lines.append('dtypes: {types}'.format(types=', '.join(dtypes))) @@ -623,7 +692,11 @@ class DataFrame(NDFrame): ) def select_dtypes(self, include=None, exclude=None): - # get empty df + """ + Return a subset of the DataFrame's columns based on the column dtypes. + + Compatible with :pandas_docs:`pandas.DataFrame.select_dtypes` + """ empty_df = self._empty_pd_df() empty_df = empty_df.select_dtypes(include=include, exclude=exclude) @@ -649,19 +722,13 @@ class DataFrame(NDFrame): def keys(self): return self.columns - def groupby(self, by=None, axis=0, *args, **kwargs): - axis = pd.DataFrame._get_axis_number(axis) - - if axis == 1: - raise NotImplementedError("Aggregating via index not currently implemented - needs index transform") - def aggregate(self, func, axis=0, *args, **kwargs): """ Aggregate using one or more operations over the specified axis. Parameters ---------- - func : function, str, list or dict + func: function, str, list or dict Function to use for aggregating the data. If a function, must either work when passed a %(klass)s or when passed to %(klass)s.apply. @@ -671,11 +738,15 @@ class DataFrame(NDFrame): - string function name - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - dict of axis labels -> functions, function names or list of such. 
+ + Currently, we only support ``['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', + 'rank', 'sem', 'skew', 'sum', 'std', 'var']`` axis + Currently, we only support axis=0 (index) *args - Positional arguments to pass to `func`. + Positional arguments to pass to `func` **kwargs - Keyword arguments to pass to `func`. + Keyword arguments to pass to `func` Returns ------- @@ -684,6 +755,19 @@ class DataFrame(NDFrame): if DataFrame.agg is called with several functions, returns a DataFrame if Series.agg is called with single function, returns a scalar if Series.agg is called with several functions, returns a Series + + See Also + -------- + :pandas_docs:`pandas.DataFrame.aggregate` + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights') + >>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']) + DistanceKilometers AvgTicketPrice + sum 9.261629e+07 8.204365e+06 + min 0.000000e+00 1.000205e+02 + std 4.578263e+03 2.663867e+02 """ axis = pd.DataFrame._get_axis_number(axis) @@ -722,17 +806,39 @@ class DataFrame(NDFrame): raise NotImplementedError(expr, type(expr)) def get(self, key, default=None): - """Get item from object for given key (DataFrame column, Panel - slice, etc.). Returns default value if not found. + """ + Get item from object for given key (ex: DataFrame column). + Returns default value if not found. - Args: - key (DataFrame column, Panel slice) : the key for which value - to get + Parameters + ---------- + key: object - Returns: - value (type of items contained in object) : A value that is - stored at the key - """ + Returns + ------- + value: same type as items contained in object + + See Also + -------- + :pandas_docs:`pandas.DataFrame.get` + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.get('Carrier') + 0 Kibana Airlines + 1 Logstash Airways + 2 Logstash Airways + 3 Kibana Airlines + 4 Kibana Airlines + ... 
+ 13054 Logstash Airways + 13055 Logstash Airways + 13056 Logstash Airways + 13057 JetBeats + 13058 JetBeats + Name: Carrier, Length: 13059, dtype: object + """ if key in self.keys(): return self._getitem(key) else: diff --git a/eland/index.py b/eland/index.py index 7d4a355..dfd0846 100644 --- a/eland/index.py +++ b/eland/index.py @@ -1,27 +1,23 @@ -""" -class Index - -The index for an eland.DataFrame. - -Currently, the index is a field that exists in every document in an Elasticsearch index. -For slicing and sorting operations it must be a docvalues field. By default _id is used, -which can't be used for range queries and is inefficient for sorting: - -https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-id-field.html -(The value of the _id field is also accessible in aggregations or for sorting, -but doing so is discouraged as it requires to load a lot of data in memory. -In case sorting or aggregating on the _id field is required, it is advised to duplicate -the content of the _id field in another field that has doc_values enabled.) - -""" - - class Index: + """ + The index for an eland.DataFrame. + + TODO - This currently has very different behaviour than pandas.Index + + Currently, the index is a field that exists in every document in an Elasticsearch index. + For slicing and sorting operations it must be a docvalues field. By default _id is used, + which can't be used for range queries and is inefficient for sorting: + + https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-id-field.html + (The value of the _id field is also accessible in aggregations or for sorting, + but doing so is discouraged as it requires to load a lot of data in memory. + In case sorting or aggregating on the _id field is required, it is advised to duplicate + the content of the _id field in another field that has doc_values enabled.) 
+ """ ID_INDEX_FIELD = '_id' ID_SORT_FIELD = '_doc' # if index field is _id, sort by _doc def __init__(self, query_compiler, index_field=None): - # Calls setter self.index_field = index_field self._query_compiler = query_compiler diff --git a/eland/mappings.py b/eland/mappings.py index 2b61b6c..ec7072e 100644 --- a/eland/mappings.py +++ b/eland/mappings.py @@ -420,13 +420,13 @@ class Mappings: return self._mappings_capabilities[(self._mappings_capabilities._source == True) & ((self._mappings_capabilities.pd_dtype == 'int64') | (self._mappings_capabilities.pd_dtype == 'float64') | - (self._mappings_capabilities.pd_dtype == 'bool'))].loc[ - columns].index.tolist() + (self._mappings_capabilities.pd_dtype == 'bool'))].reindex( + columns).index.tolist() else: return self._mappings_capabilities[(self._mappings_capabilities._source == True) & ((self._mappings_capabilities.pd_dtype == 'int64') | - (self._mappings_capabilities.pd_dtype == 'float64'))].loc[ - columns].index.tolist() + (self._mappings_capabilities.pd_dtype == 'float64'))].reindex( + columns).index.tolist() else: if include_bool == True: return self._mappings_capabilities[(self._mappings_capabilities._source == True) & @@ -469,26 +469,6 @@ class Mappings: return pd.Series(self._source_field_pd_dtypes) - def get_dtype_counts(self, columns=None): - """ - Return counts of unique dtypes in this object. - - Returns - ------- - get_dtype_counts : Series - Series with the count of columns with each dtype. 
- """ - - if columns is not None: - return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True] - .loc[columns] - .groupby('pd_dtype')['_source'] - .count().to_dict()) - - return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True] - .groupby('pd_dtype')['_source'] - .count().to_dict()) - def info_es(self, buf): buf.write("Mappings:\n") buf.write("\tcapabilities: {0}\n".format(self._mappings_capabilities)) diff --git a/eland/ndframe.py b/eland/ndframe.py index 3c8f53b..605ce43 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -57,10 +57,23 @@ class NDFrame: def _get_index(self): """ + Return eland index referencing Elasticsearch field to index a DataFrame/Series Returns ------- + eland.Index: + Note eland.Index has a very limited API compared to pandas.Index + See Also + -------- + :pandas_docs:`pandas.DataFrame.index` + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights') + >>> assert isinstance(df.index, ed.Index) + >>> df.index.index_field + '_id' """ return self._query_compiler.index @@ -68,10 +81,30 @@ class NDFrame: @property def dtypes(self): - return self._query_compiler.dtypes + """ + Return the pandas dtypes in the DataFrame. Elasticsearch types are mapped + to pandas dtypes via Mappings._es_dtype_to_pd_dtype.__doc__ - def get_dtype_counts(self): - return self._query_compiler.get_dtype_counts() + Returns + ------- + pandas.Series + The data type of each column. 
+ + See Also + -------- + :pandas_docs:`pandas.DataFrame.dtypes` + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'AvgTicketPrice', 'timestamp', 'dayOfWeek']) + >>> df.dtypes + Origin object + AvgTicketPrice float64 + timestamp datetime64[ns] + dayOfWeek int64 + dtype: object + """ + return self._query_compiler.dtypes def _build_repr_df(self, num_rows, num_cols): # Overriden version of BasePandasDataset._build_repr_df @@ -134,21 +167,71 @@ class NDFrame: errors="raise", ): """Return new object with labels in requested axis removed. - Args: - labels: Index or column labels to drop. - axis: Whether to drop labels from the index (0 / 'index') or - columns (1 / 'columns'). - index, columns: Alternative to specifying axis (labels, axis=1 is - equivalent to columns=labels). - level: For MultiIndex - inplace: If True, do operation inplace and return None. - errors: If 'ignore', suppress error and existing labels are - dropped. - Returns: - dropped : type of caller - (derived from modin.base.BasePandasDataset) + Parameters + ---------- + labels: + Index or column labels to drop. + axis: + Whether to drop labels from the index (0 / 'index') or columns (1 / 'columns'). + index, columns: + Alternative to specifying axis (labels, axis=1 is equivalent to columns=labels). + level: + For MultiIndex - not supported + inplace: + If True, do operation inplace and return None. + errors: + If 'ignore', suppress error and existing labels are dropped. + + Returns + ------- + dropped: + type of caller + + See Also + -------- + :pandas_docs:`pandas.DataFrame.drop` + + Examples + -------- + Drop a column + + >>> df = ed.DataFrame('localhost', 'ecommerce', columns=['customer_first_name', 'email', 'user']) + >>> df.drop(columns=['user']) + customer_first_name email + 0 Eddie eddie@underwood-family.zzz + 1 Mary mary@bailey-family.zzz + 2 Gwen gwen@butler-family.zzz + 3 Diane diane@chandler-family.zzz + 4 Eddie eddie@weber-family.zzz + ... ... ... 
+ 4670 Mary mary@lambert-family.zzz + 4671 Jim jim@gilbert-family.zzz + 4672 Yahya yahya@rivera-family.zzz + 4673 Mary mary@hampton-family.zzz + 4674 Jackson jackson@hopkins-family.zzz + + [4675 rows x 2 columns] + + Drop rows by index value (axis=0) + + >>> df.drop(['1', '2']) + customer_first_name email user + 0 Eddie eddie@underwood-family.zzz eddie + 3 Diane diane@chandler-family.zzz diane + 4 Eddie eddie@weber-family.zzz eddie + 5 Diane diane@goodwin-family.zzz diane + 6 Oliver oliver@rios-family.zzz oliver + ... ... ... ... + 4670 Mary mary@lambert-family.zzz mary + 4671 Jim jim@gilbert-family.zzz jim + 4672 Yahya yahya@rivera-family.zzz yahya + 4673 Mary mary@hampton-family.zzz mary + 4674 Jackson jackson@hopkins-family.zzz jackson + + [4673 rows x 3 columns] """ + # (derived from modin.base.BasePandasDataset) # Level not supported if level is not None: raise NotImplementedError("level not supported {}".format(level)) @@ -242,4 +325,36 @@ class NDFrame: return self._query_compiler._hist(num_bins) def describe(self): + """ + Generate descriptive statistics that summarize the central tendency, dispersion and shape of a + dataset's distribution, excluding NaN values. + + Analyzes both numeric and object series, as well as DataFrame column sets of mixed data types. + The output will vary depending on what is provided. Refer to the notes below for more detail. + + TODO - add additional arguments (currently only numeric values supported) + + Returns + ------- + pandas.DataFrame: + Summary information + + See Also + -------- + :pandas_docs:`pandas.DataFrame.describe` + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelay']) + >>> df.describe() # ignoring percentiles as they don't generate consistent results + AvgTicketPrice FlightDelay + count 13059.000000 13059.000000 + mean 628.253689 0.251168 + std 266.386661 0.433685 + min 100.020531 0.000000 + ... + ... + ... 
+ max 1199.729004 1.000000 + """ return self._query_compiler.describe() diff --git a/eland/plotting.py b/eland/plotting.py index 09f9c7f..0b5e4c7 100644 --- a/eland/plotting.py +++ b/eland/plotting.py @@ -10,36 +10,42 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, sharey=False, figsize=None, layout=None, bins=10, **kwds): """ - Derived from pandas.plotting._core.hist_frame 0.24.2 - TODO update to 0.25.1 + See :pandas_docs:`pandas.DataFrame.hist` for usage. - Ideally, we'd call hist_frame directly with histogram data, + Notes + ----- + Derived from ``pandas.plotting._core.hist_frame 0.24.2`` - TODO update to ``0.25.1`` + + Ideally, we'd call `hist_frame` directly with histogram data, but weights are applied to ALL series. For example, we can plot a histogram of pre-binned data via: - counts, bins = np.histogram(data) - plt.hist(bins[:-1], bins, weights=counts) + .. code-block:: python + + counts, bins = np.histogram(data) + plt.hist(bins[:-1], bins, weights=counts) However, - ax.hist(data[col].dropna().values, bins=bins, **kwds) + .. code-block:: python - is for [col] and weights are a single array. + ax.hist(data[col].dropna().values, bins=bins, **kwds) - We therefore cut/paste code. + is for ``[col]`` and weights are a single array. + + Examples + -------- + .. 
plot:: + :context: close-figs + + >>> df = ed.DataFrame('localhost', 'flights') + >>> hist = df.select_dtypes(include=[np.number]).hist(figsize=[10,10]) """ # Start with empty pandas data frame derived from ed_df_bins, ed_df_weights = ed_df._hist(num_bins=bins) if by is not None: raise NotImplementedError("TODO") - """ - axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid, - figsize=figsize, sharex=sharex, sharey=sharey, - layout=layout, bins=bins, xlabelsize=xlabelsize, - xrot=xrot, ylabelsize=ylabelsize, - yrot=yrot, **kwds) - """ - return axes if column is not None: if not isinstance(column, (list, np.ndarray, ABCIndexClass)): diff --git a/eland/query_compiler.py b/eland/query_compiler.py index 7f78614..e2cc5a5 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -84,11 +84,6 @@ class ElandQueryCompiler: return self._mappings.dtypes(columns) - def get_dtype_counts(self): - columns = self._operations.get_columns() - - return self._mappings.get_dtype_counts(columns) - # END Index, columns, and dtypes objects def _es_results_to_pandas(self, results, batch_size=None): diff --git a/eland/series.py b/eland/series.py index 66f27e3..28b5d23 100644 --- a/eland/series.py +++ b/eland/series.py @@ -150,7 +150,7 @@ class Series(NDFrame): ) def _to_pandas(self): - return self._query_compiler._to_pandas()[self.name] + return self._query_compiler.to_pandas()[self.name] def __gt__(self, other): if isinstance(other, Series): diff --git a/eland/tests/dataframe/test_count_pytest.py b/eland/tests/dataframe/test_count_pytest.py index 72d09af..3dab08e 100644 --- a/eland/tests/dataframe/test_count_pytest.py +++ b/eland/tests/dataframe/test_count_pytest.py @@ -4,6 +4,7 @@ from pandas.util.testing import assert_series_equal from eland.tests.common import TestData +import pandas as pd class TestDataFrameCount(TestData): diff --git a/eland/tests/mappings/test_dtypes_pytest.py b/eland/tests/mappings/test_dtypes_pytest.py index 0987169..43d3e3e 100644 --- 
a/eland/tests/mappings/test_dtypes_pytest.py +++ b/eland/tests/mappings/test_dtypes_pytest.py @@ -24,22 +24,3 @@ class TestMappingsDtypes(TestData): ed_dtypes = ed_flights._query_compiler._mappings.dtypes(columns=['Carrier', 'AvgTicketPrice', 'Cancelled']) assert_series_equal(pd_dtypes, ed_dtypes) - - def test_flights_get_dtype_counts_all(self): - ed_flights = self.ed_flights() - pd_flights = self.pd_flights() - - pd_dtypes = pd_flights.get_dtype_counts().sort_index() - ed_dtypes = ed_flights._query_compiler._mappings.get_dtype_counts().sort_index() - - assert_series_equal(pd_dtypes, ed_dtypes) - - def test_flights_get_dtype_counts_columns(self): - ed_flights = self.ed_flights() - pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']] - - pd_dtypes = pd_flights.get_dtype_counts().sort_index() - ed_dtypes = ed_flights._query_compiler._mappings. \ - get_dtype_counts(columns=['Carrier', 'AvgTicketPrice', 'Cancelled']).sort_index() - - assert_series_equal(pd_dtypes, ed_dtypes) diff --git a/eland/utils.py b/eland/utils.py index 1299f6c..e55e348 100644 --- a/eland/utils.py +++ b/eland/utils.py @@ -141,3 +141,37 @@ def ed_to_pd(ed_df): eland.pd_to_ed: Create an eland.Dataframe from pandas.DataFrame """ return ed_df._to_pandas() + +def _inherit_docstrings(parent, excluded=[]): + """Creates a decorator which overwrites a decorated class' __doc__ + attribute with parent's __doc__ attribute. Also overwrites __doc__ of + methods and properties defined in the class with the __doc__ of matching + methods and properties in parent. + + Args: + parent (object): Class from which the decorated class inherits __doc__. + excluded (list): List of parent objects from which the class does not + inherit docstrings. + + Returns: + function: decorator which replaces the decorated class' documentation + with parent's documentation. 
+ """ + + def decorator(cls): + if parent not in excluded: + cls.__doc__ = parent.__doc__ + for attr, obj in cls.__dict__.items(): + parent_obj = getattr(parent, attr, None) + if parent_obj in excluded or ( + not callable(parent_obj) and not isinstance(parent_obj, property) + ): + continue + if callable(obj): + obj.__doc__ = parent_obj.__doc__ + elif isinstance(obj, property) and obj.fget is not None: + p = property(obj.fget, obj.fset, obj.fdel, parent_obj.__doc__) + setattr(cls, attr, p) + return cls + + return decorator