From 84e23ab5d1ee3956648b7bb9fb1ec0cb9e17389f Mon Sep 17 00:00:00 2001 From: Stephen Dodson Date: Fri, 22 Nov 2019 15:44:55 +0000 Subject: [PATCH] Added Series metric aggs + Series docs Also, improved Series.to_string() --- .../source/reference/api/eland.Series.add.rst | 6 + .../reference/api/eland.Series.describe.rst | 6 + .../source/reference/api/eland.Series.div.rst | 6 + .../reference/api/eland.Series.empty.rst | 6 + .../reference/api/eland.Series.floordiv.rst | 6 + .../reference/api/eland.Series.head.rst | 6 + .../reference/api/eland.Series.index.rst | 6 + .../source/reference/api/eland.Series.max.rst | 6 + .../reference/api/eland.Series.mean.rst | 6 + .../source/reference/api/eland.Series.min.rst | 6 + .../source/reference/api/eland.Series.mod.rst | 6 + .../source/reference/api/eland.Series.mul.rst | 6 + .../reference/api/eland.Series.name.rst | 6 + .../reference/api/eland.Series.nunique.rst | 6 + .../source/reference/api/eland.Series.pow.rst | 6 + .../reference/api/eland.Series.rename.rst | 6 + docs/source/reference/api/eland.Series.rst | 6 + .../reference/api/eland.Series.shape.rst | 6 + .../source/reference/api/eland.Series.sub.rst | 6 + .../source/reference/api/eland.Series.sum.rst | 6 + .../reference/api/eland.Series.tail.rst | 6 + .../reference/api/eland.Series.to_string.rst | 6 + .../reference/api/eland.Series.truediv.rst | 6 + .../api/eland.Series.value_counts.rst | 2 +- docs/source/reference/dataframe.rst | 2 - docs/source/reference/series.rst | 70 +- eland/__init__.py | 1 + eland/common.py | 8 + eland/dataframe.py | 62 +- eland/ndframe.py | 11 +- eland/operations.py | 40 +- eland/query_compiler.py | 6 +- eland/series.py | 620 ++++++++++++++++-- eland/tests/__init__.py | 4 +- eland/tests/series/test_arithmetics_pytest.py | 50 +- eland/tests/series/test_info_es_pytest.py | 17 + eland/tests/series/test_metrics_pytest.py | 44 ++ eland/tests/series/test_repr_pytest.py | 14 +- 38 files changed, 973 insertions(+), 116 deletions(-) create mode 100644 
docs/source/reference/api/eland.Series.add.rst create mode 100644 docs/source/reference/api/eland.Series.describe.rst create mode 100644 docs/source/reference/api/eland.Series.div.rst create mode 100644 docs/source/reference/api/eland.Series.empty.rst create mode 100644 docs/source/reference/api/eland.Series.floordiv.rst create mode 100644 docs/source/reference/api/eland.Series.head.rst create mode 100644 docs/source/reference/api/eland.Series.index.rst create mode 100644 docs/source/reference/api/eland.Series.max.rst create mode 100644 docs/source/reference/api/eland.Series.mean.rst create mode 100644 docs/source/reference/api/eland.Series.min.rst create mode 100644 docs/source/reference/api/eland.Series.mod.rst create mode 100644 docs/source/reference/api/eland.Series.mul.rst create mode 100644 docs/source/reference/api/eland.Series.name.rst create mode 100644 docs/source/reference/api/eland.Series.nunique.rst create mode 100644 docs/source/reference/api/eland.Series.pow.rst create mode 100644 docs/source/reference/api/eland.Series.rename.rst create mode 100644 docs/source/reference/api/eland.Series.rst create mode 100644 docs/source/reference/api/eland.Series.shape.rst create mode 100644 docs/source/reference/api/eland.Series.sub.rst create mode 100644 docs/source/reference/api/eland.Series.sum.rst create mode 100644 docs/source/reference/api/eland.Series.tail.rst create mode 100644 docs/source/reference/api/eland.Series.to_string.rst create mode 100644 docs/source/reference/api/eland.Series.truediv.rst create mode 100644 eland/common.py create mode 100644 eland/tests/series/test_info_es_pytest.py create mode 100644 eland/tests/series/test_metrics_pytest.py diff --git a/docs/source/reference/api/eland.Series.add.rst b/docs/source/reference/api/eland.Series.add.rst new file mode 100644 index 0000000..da552b7 --- /dev/null +++ b/docs/source/reference/api/eland.Series.add.rst @@ -0,0 +1,6 @@ +eland.Series.add +================ + +.. currentmodule:: eland + +.. 
automethod:: Series.add diff --git a/docs/source/reference/api/eland.Series.describe.rst b/docs/source/reference/api/eland.Series.describe.rst new file mode 100644 index 0000000..195c410 --- /dev/null +++ b/docs/source/reference/api/eland.Series.describe.rst @@ -0,0 +1,6 @@ +eland.Series.describe +===================== + +.. currentmodule:: eland + +.. automethod:: Series.describe diff --git a/docs/source/reference/api/eland.Series.div.rst b/docs/source/reference/api/eland.Series.div.rst new file mode 100644 index 0000000..0d9698b --- /dev/null +++ b/docs/source/reference/api/eland.Series.div.rst @@ -0,0 +1,6 @@ +eland.Series.div +================ + +.. currentmodule:: eland + +.. automethod:: Series.div diff --git a/docs/source/reference/api/eland.Series.empty.rst b/docs/source/reference/api/eland.Series.empty.rst new file mode 100644 index 0000000..6ca71ec --- /dev/null +++ b/docs/source/reference/api/eland.Series.empty.rst @@ -0,0 +1,6 @@ +eland.Series.empty +================== + +.. currentmodule:: eland + +.. autoattribute:: Series.empty diff --git a/docs/source/reference/api/eland.Series.floordiv.rst b/docs/source/reference/api/eland.Series.floordiv.rst new file mode 100644 index 0000000..543f47d --- /dev/null +++ b/docs/source/reference/api/eland.Series.floordiv.rst @@ -0,0 +1,6 @@ +eland.Series.floordiv +===================== + +.. currentmodule:: eland + +.. automethod:: Series.floordiv diff --git a/docs/source/reference/api/eland.Series.head.rst b/docs/source/reference/api/eland.Series.head.rst new file mode 100644 index 0000000..78bcdbb --- /dev/null +++ b/docs/source/reference/api/eland.Series.head.rst @@ -0,0 +1,6 @@ +eland.Series.head +================= + +.. currentmodule:: eland + +.. 
automethod:: Series.head diff --git a/docs/source/reference/api/eland.Series.index.rst b/docs/source/reference/api/eland.Series.index.rst new file mode 100644 index 0000000..e996294 --- /dev/null +++ b/docs/source/reference/api/eland.Series.index.rst @@ -0,0 +1,6 @@ +eland.Series.index +================== + +.. currentmodule:: eland + +.. autoattribute:: Series.index diff --git a/docs/source/reference/api/eland.Series.max.rst b/docs/source/reference/api/eland.Series.max.rst new file mode 100644 index 0000000..8deec8a --- /dev/null +++ b/docs/source/reference/api/eland.Series.max.rst @@ -0,0 +1,6 @@ +eland.Series.max +================ + +.. currentmodule:: eland + +.. automethod:: Series.max diff --git a/docs/source/reference/api/eland.Series.mean.rst b/docs/source/reference/api/eland.Series.mean.rst new file mode 100644 index 0000000..5d5f2de --- /dev/null +++ b/docs/source/reference/api/eland.Series.mean.rst @@ -0,0 +1,6 @@ +eland.Series.mean +================= + +.. currentmodule:: eland + +.. automethod:: Series.mean diff --git a/docs/source/reference/api/eland.Series.min.rst b/docs/source/reference/api/eland.Series.min.rst new file mode 100644 index 0000000..484e077 --- /dev/null +++ b/docs/source/reference/api/eland.Series.min.rst @@ -0,0 +1,6 @@ +eland.Series.min +================ + +.. currentmodule:: eland + +.. automethod:: Series.min diff --git a/docs/source/reference/api/eland.Series.mod.rst b/docs/source/reference/api/eland.Series.mod.rst new file mode 100644 index 0000000..2d63164 --- /dev/null +++ b/docs/source/reference/api/eland.Series.mod.rst @@ -0,0 +1,6 @@ +eland.Series.mod +================ + +.. currentmodule:: eland + +.. automethod:: Series.mod diff --git a/docs/source/reference/api/eland.Series.mul.rst b/docs/source/reference/api/eland.Series.mul.rst new file mode 100644 index 0000000..91b0c4f --- /dev/null +++ b/docs/source/reference/api/eland.Series.mul.rst @@ -0,0 +1,6 @@ +eland.Series.mul +================ + +.. 
currentmodule:: eland + +.. automethod:: Series.mul diff --git a/docs/source/reference/api/eland.Series.name.rst b/docs/source/reference/api/eland.Series.name.rst new file mode 100644 index 0000000..a4086ab --- /dev/null +++ b/docs/source/reference/api/eland.Series.name.rst @@ -0,0 +1,6 @@ +eland.Series.name +================= + +.. currentmodule:: eland + +.. autoattribute:: Series.name diff --git a/docs/source/reference/api/eland.Series.nunique.rst b/docs/source/reference/api/eland.Series.nunique.rst new file mode 100644 index 0000000..40e75ea --- /dev/null +++ b/docs/source/reference/api/eland.Series.nunique.rst @@ -0,0 +1,6 @@ +eland.Series.nunique +==================== + +.. currentmodule:: eland + +.. automethod:: Series.nunique diff --git a/docs/source/reference/api/eland.Series.pow.rst b/docs/source/reference/api/eland.Series.pow.rst new file mode 100644 index 0000000..858e518 --- /dev/null +++ b/docs/source/reference/api/eland.Series.pow.rst @@ -0,0 +1,6 @@ +eland.Series.pow +================ + +.. currentmodule:: eland + +.. automethod:: Series.pow diff --git a/docs/source/reference/api/eland.Series.rename.rst b/docs/source/reference/api/eland.Series.rename.rst new file mode 100644 index 0000000..19e38ac --- /dev/null +++ b/docs/source/reference/api/eland.Series.rename.rst @@ -0,0 +1,6 @@ +eland.Series.rename +=================== + +.. currentmodule:: eland + +.. automethod:: Series.rename diff --git a/docs/source/reference/api/eland.Series.rst b/docs/source/reference/api/eland.Series.rst new file mode 100644 index 0000000..451bfc9 --- /dev/null +++ b/docs/source/reference/api/eland.Series.rst @@ -0,0 +1,6 @@ +eland.Series +============ + +.. currentmodule:: eland + +.. 
autoclass:: Series diff --git a/docs/source/reference/api/eland.Series.shape.rst b/docs/source/reference/api/eland.Series.shape.rst new file mode 100644 index 0000000..fe1a581 --- /dev/null +++ b/docs/source/reference/api/eland.Series.shape.rst @@ -0,0 +1,6 @@ +eland.Series.shape +================== + +.. currentmodule:: eland + +.. autoattribute:: Series.shape diff --git a/docs/source/reference/api/eland.Series.sub.rst b/docs/source/reference/api/eland.Series.sub.rst new file mode 100644 index 0000000..e2d0a21 --- /dev/null +++ b/docs/source/reference/api/eland.Series.sub.rst @@ -0,0 +1,6 @@ +eland.Series.sub +================ + +.. currentmodule:: eland + +.. automethod:: Series.sub diff --git a/docs/source/reference/api/eland.Series.sum.rst b/docs/source/reference/api/eland.Series.sum.rst new file mode 100644 index 0000000..28ef324 --- /dev/null +++ b/docs/source/reference/api/eland.Series.sum.rst @@ -0,0 +1,6 @@ +eland.Series.sum +================ + +.. currentmodule:: eland + +.. automethod:: Series.sum diff --git a/docs/source/reference/api/eland.Series.tail.rst b/docs/source/reference/api/eland.Series.tail.rst new file mode 100644 index 0000000..109fd8a --- /dev/null +++ b/docs/source/reference/api/eland.Series.tail.rst @@ -0,0 +1,6 @@ +eland.Series.tail +================= + +.. currentmodule:: eland + +.. automethod:: Series.tail diff --git a/docs/source/reference/api/eland.Series.to_string.rst b/docs/source/reference/api/eland.Series.to_string.rst new file mode 100644 index 0000000..ed1ac41 --- /dev/null +++ b/docs/source/reference/api/eland.Series.to_string.rst @@ -0,0 +1,6 @@ +eland.Series.to_string +====================== + +.. currentmodule:: eland + +.. 
automethod:: Series.to_string diff --git a/docs/source/reference/api/eland.Series.truediv.rst b/docs/source/reference/api/eland.Series.truediv.rst new file mode 100644 index 0000000..f89cf08 --- /dev/null +++ b/docs/source/reference/api/eland.Series.truediv.rst @@ -0,0 +1,6 @@ +eland.Series.truediv +==================== + +.. currentmodule:: eland + +.. automethod:: Series.truediv diff --git a/docs/source/reference/api/eland.Series.value_counts.rst b/docs/source/reference/api/eland.Series.value_counts.rst index 8d020b0..930d9db 100644 --- a/docs/source/reference/api/eland.Series.value_counts.rst +++ b/docs/source/reference/api/eland.Series.value_counts.rst @@ -1,5 +1,5 @@ eland.Series.value_counts -=========================== +========================= .. currentmodule:: eland diff --git a/docs/source/reference/dataframe.rst b/docs/source/reference/dataframe.rst index e1e71fa..64f5b29 100644 --- a/docs/source/reference/dataframe.rst +++ b/docs/source/reference/dataframe.rst @@ -91,5 +91,3 @@ Elasticsearch utilities :toctree: api/ DataFrame.info_es - - diff --git a/docs/source/reference/series.rst b/docs/source/reference/series.rst index cbc8898..366e57a 100644 --- a/docs/source/reference/series.rst +++ b/docs/source/reference/series.rst @@ -5,9 +5,77 @@ Series ========= .. currentmodule:: eland +Constructor +~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Series + +Attributes and underlying data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +**Axes** + +.. autosummary:: + :toctree: api/ + + Series.index + Series.shape + Series.name + Series.empty + +Indexing, iteration +~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Series.head + Series.tail + +Binary operator functions +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Series.add + Series.sub + Series.mul + Series.div + Series.truediv + Series.floordiv + Series.mod + Series.pow + Computations / descriptive stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
autosummary:: :toctree: api/ - Series.value_counts \ No newline at end of file + Series.describe + Series.max + Series.mean + Series.min + Series.sum + Series.nunique + Series.value_counts + +Reindexing / selection / label manipulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Series.rename + +Serialization / IO / conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Series.to_string + +Elasticsearch utilities +~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Series.info_es diff --git a/eland/__init__.py b/eland/__init__.py index 79b89f9..699e880 100644 --- a/eland/__init__.py +++ b/eland/__init__.py @@ -1,5 +1,6 @@ from __future__ import absolute_import +from eland.common import * from eland.client import * from eland.filter import * from eland.index import * diff --git a/eland/common.py b/eland/common.py new file mode 100644 index 0000000..ff36d08 --- /dev/null +++ b/eland/common.py @@ -0,0 +1,8 @@ +# Default number of rows displayed (different to pandas where ALL could be displayed) +DEFAULT_NUM_ROWS_DISPLAYED = 60 + +def docstring_parameter(*sub): + def dec(obj): + obj.__doc__ = obj.__doc__.format(*sub) + return obj + return dec diff --git a/eland/dataframe.py b/eland/dataframe.py index 51373ed..53a7f76 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -18,15 +18,7 @@ import eland.plotting as gfx from eland import NDFrame from eland import Series from eland.filter import BooleanFilter, ScriptFilter - -# Default number of rows displayed (different to pandas where ALL could be displayed) -DEFAULT_NUM_ROWS_DISPLAYED = 60 - -def docstring_parameter(*sub): - def dec(obj): - obj.__doc__ = obj.__doc__.format(*sub) - return obj - return dec +from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter class DataFrame(NDFrame): @@ -43,7 +35,7 @@ class DataFrame(NDFrame): - elasticsearch-py instance or - eland.Client instance index_pattern: str - Elasticsearch index 
pattern (e.g. 'flights' or 'filebeat-*') + Elasticsearch index pattern (e.g. 'flights' or 'filebeat-\*') columns: list of str, optional List of DataFrame columns. A subset of the Elasticsearch index's fields. index_field: str, optional @@ -98,7 +90,6 @@ class DataFrame(NDFrame): [5 rows x 2 columns] """ - def __init__(self, client=None, index_pattern=None, @@ -586,7 +577,7 @@ class DataFrame(NDFrame): max_rows = 1 # Create a slightly bigger dataframe than display - df = self._build_repr_df(max_rows + 1, max_cols) + df = self._build_repr(max_rows + 1) if buf is not None: _buf = _expand_user(_stringify_path(buf)) @@ -651,7 +642,7 @@ class DataFrame(NDFrame): max_rows = 1 # Create a slightly bigger dataframe than display - df = self._build_repr_df(max_rows + 1, max_cols) + df = self._build_repr(max_rows + 1) if buf is not None: _buf = _expand_user(_stringify_path(buf)) @@ -1064,3 +1055,48 @@ class DataFrame(NDFrame): return self._getitem(key) else: return default + + @property + def values(self): + """ + Not implemented. + + In pandas this returns a Numpy representation of the DataFrame. This would involve scan/scrolling the + entire index. 
+ + If this is required, call ``ed.eland_to_pandas(ed_df).values``, _but beware this will scan/scroll the entire + Elasticsearch index(s) into memory_ + + See Also + -------- + :pandas_api_docs:`pandas.DataFrame.values` + + Examples + -------- + >>> ed_df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'Carrier']).head(5) + >>> pd_df = ed.eland_to_pandas(ed_df) + >>> print("type(ed_df)={0}\\ntype(pd_df)={1}".format(type(ed_df), type(pd_df))) + type(ed_df)= + type(pd_df)= + >>> ed_df + AvgTicketPrice Carrier + 0 841.265642 Kibana Airlines + 1 882.982662 Logstash Airways + 2 190.636904 Logstash Airways + 3 181.694216 Kibana Airlines + 4 730.041778 Kibana Airlines + + [5 rows x 2 columns] + >>> pd_df.values + array([[841.2656419677076, 'Kibana Airlines'], + [882.9826615595518, 'Logstash Airways'], + [190.6369038508356, 'Logstash Airways'], + [181.69421554118, 'Kibana Airlines'], + [730.041778346198, 'Kibana Airlines']], dtype=object) + """ + raise NotImplementedError( + "This method would scan/scroll the entire Elasticsearch index(s) into memory." 
+ "If this is explicitly required and there is sufficient memory, call `ed.eland_to_pandas(ed_df).values`" + ) + + to_numpy = values diff --git a/eland/ndframe.py b/eland/ndframe.py index c98dd22..3abfed6 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -31,7 +31,6 @@ from pandas.util._validators import validate_bool_kwarg from eland import ElandQueryCompiler - class NDFrame: def __init__(self, @@ -65,6 +64,7 @@ class NDFrame: See Also -------- :pandas_api_docs:`pandas.DataFrame.index` + :pandas_api_docs:`pandas.Series.index` Examples -------- @@ -72,6 +72,10 @@ class NDFrame: >>> assert isinstance(df.index, ed.Index) >>> df.index.index_field '_id' + >>> s = df['Carrier'] + >>> assert isinstance(s.index, ed.Index) + >>> s.index.index_field + '_id' """ return self._query_compiler.index @@ -104,9 +108,8 @@ class NDFrame: """ return self._query_compiler.dtypes - def _build_repr_df(self, num_rows, num_cols): - # Overriden version of BasePandasDataset._build_repr_df - # to avoid issues with concat + def _build_repr(self, num_rows): + # self could be Series or DataFrame if len(self.index) <= num_rows: return self._to_pandas() diff --git a/eland/operations.py b/eland/operations.py index 5c40778..20dfe14 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -588,6 +588,7 @@ class Operations: df = self._apply_df_post_processing(df, post_processing) collector.collect(df) + def iloc(self, index, field_names): # index and field_names are indexers task = ('iloc', (index, field_names)) @@ -881,9 +882,10 @@ class Operations: left_field = item[1][1][1][0] right_field = item[1][1][1][1] + # https://www.elastic.co/guide/en/elasticsearch/painless/current/painless-api-reference-shared-java-lang.html#painless-api-reference-shared-Math if isinstance(right_field, str): """ - (if op_name = 'truediv') + (if op_name = '__truediv__') "script_fields": { "field_name": { @@ -893,12 +895,23 @@ class Operations: } } """ - if op_name == 'truediv': - op = '/' + if op_name == 
'__add__': + source = "doc['{0}'].value + doc['{1}'].value".format(left_field, right_field) + elif op_name == '__truediv__': + source = "doc['{0}'].value / doc['{1}'].value".format(left_field, right_field) + elif op_name == '__floordiv__': + source = "Math.floor(doc['{0}'].value / doc['{1}'].value)".format(left_field, right_field) + elif op_name == '__pow__': + source = "Math.pow(doc['{0}'].value, doc['{1}'].value)".format(left_field, right_field) + elif op_name == '__mod__': + source = "doc['{0}'].value % doc['{1}'].value".format(left_field, right_field) + elif op_name == '__mul__': + source = "doc['{0}'].value * doc['{1}'].value".format(left_field, right_field) + elif op_name == '__sub__': + source = "doc['{0}'].value - doc['{1}'].value".format(left_field, right_field) else: raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) - source = "doc['{0}'].value {1} doc['{2}'].value".format(left_field, op, right_field) if query_params['query_script_fields'] is None: query_params['query_script_fields'] = {} @@ -909,7 +922,7 @@ class Operations: } else: """ - (if op_name = 'truediv') + (if op_name = '__truediv__') "script_fields": { "field_name": { @@ -919,12 +932,23 @@ class Operations: } } """ - if op_name == 'truediv': - op = '/' + if op_name == '__add__': + source = "doc['{0}'].value + {1}".format(left_field, right_field) + elif op_name == '__truediv__': + source = "doc['{0}'].value / {1}".format(left_field, right_field) + elif op_name == '__floordiv__': + source = "Math.floor(doc['{0}'].value / {1})".format(left_field, right_field) + elif op_name == '__pow__': + source = "Math.pow(doc['{0}'].value, {1})".format(left_field, right_field) + elif op_name == '__mod__': + source = "doc['{0}'].value % {1}".format(left_field, right_field) + elif op_name == '__mul__': + source = "doc['{0}'].value * {1}".format(left_field, right_field) + elif op_name == '__sub__': + source = "doc['{0}'].value - {1}".format(left_field, right_field) else: raise 
NotImplementedError("Not implemented operation '{0}'".format(op_name)) - source = "doc['{0}'].value {1} {2}".format(left_field, op, right_field) if query_params['query_script_fields'] is None: query_params['query_script_fields'] = {} diff --git a/eland/query_compiler.py b/eland/query_compiler.py index ef01756..e057807 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -239,9 +239,9 @@ class ElandQueryCompiler: # Create pandas DataFrame df = pd.DataFrame(data=rows, index=index) - # _source may not contain all columns in the mapping - # therefore, fill in missing columns - # (note this returns self.columns NOT IN df.columns) + # _source may not contain all field_names in the mapping + # therefore, fill in missing field_names + # (note this returns self.field_names NOT IN df.columns) missing_field_names = list(set(self.field_names) - set(df.columns)) for missing in missing_field_names: diff --git a/eland/series.py b/eland/series.py index 4918b2d..4e39e85 100644 --- a/eland/series.py +++ b/eland/series.py @@ -11,19 +11,26 @@ without storing the dataset in local memory. Implementation Details ---------------------- -Based on NDFrame which underpins eland.1DataFrame +Based on NDFrame which underpins eland.DataFrame """ +import sys +import warnings from io import StringIO import pandas as pd -import numpy as np +from pandas.io.common import _expand_user, _stringify_path from eland import NDFrame +from eland.common import DEFAULT_NUM_ROWS_DISPLAYED from eland.filter import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter, IsIn +def _get_method_name(): + return sys._getframe(1).f_code.co_name + + class Series(NDFrame): """ pandas.Series like API that proxies into Elasticsearch index(es). @@ -34,35 +41,35 @@ class Series(NDFrame): A reference to a Elasticsearch python client index_pattern : str - An Elasticsearch index pattern. This can contain wildcards (e.g. filebeat-*). + An Elasticsearch index pattern. 
This can contain wildcards (e.g. filebeat-\*\). index_field : str The field to base the series on - See Also - -------- - - Examples - -------- - - import eland as ed - client = ed.Client(Elasticsearch()) - s = ed.DataFrame(client, 'reviews', 'date') - df.head() - reviewerId vendorId rating date - 0 0 0 5 2006-04-07 17:08 - 1 1 1 5 2006-05-04 12:16 - 2 2 2 4 2006-04-21 12:26 - 3 3 3 5 2006-04-18 15:48 - 4 3 4 5 2006-04-18 15:49 - - Notice that the types are based on Elasticsearch mappings - Notes ----- If the Elasticsearch index is deleted or index mappings are changed after this object is created, the object is not rebuilt and so inconsistencies can occur. + See Also + -------- + :pandas_api_docs:`pandas.Series` + + Examples + -------- + >>> ed.Series(client='localhost', index_pattern='flights', name='Carrier') + 0 Kibana Airlines + 1 Logstash Airways + 2 Logstash Airways + 3 Kibana Airlines + 4 Kibana Airlines + ... + 13054 Logstash Airways + 13055 Logstash Airways + 13056 Logstash Airways + 13057 JetBeats + 13058 JetBeats + Name: Carrier, Length: 13059, dtype: object """ def __init__(self, @@ -94,6 +101,34 @@ class Series(NDFrame): """ return len(self.index) == 0 + @property + def shape(self): + """ + Return a tuple representing the dimensionality of the Series. + + Returns + ------- + shape: tuple + + 0. number of rows + 1. 
number of columns + + Notes + ----- + - number of rows ``len(series)`` queries Elasticsearch + - number of columns == 1 + + Examples + -------- + >>> df = ed.Series('localhost', 'ecommerce', name='total_quantity') + >>> df.shape + (4675, 1) + """ + num_rows = len(self) + num_columns = 1 + + return num_rows, num_columns + def _get_name(self): return self._query_compiler.columns[0] @@ -118,7 +153,7 @@ class Series(NDFrame): See Also -------- - :pandas_api_docs:pandas.Series.rename + :pandas_api_docs:`pandas.Series.rename` Examples -------- @@ -200,12 +235,39 @@ class Series(NDFrame): return self._query_compiler.value_counts(es_size) + # dtype not implemented for Series as causes query to fail + # in pandas.core.computation.ops.Term.type + # ---------------------------------------------------------------------- # Rendering Methods def __repr__(self): - num_rows = pd.get_option("max_rows") or 60 + """ + Return a string representation for a particular Series. + """ + buf = StringIO() - return self.to_string(max_rows=num_rows) + # max_rows and max_cols determine the maximum size of the pretty printed tabular + # representation of the series. pandas defaults are 60 and 20 respectively. + # series where len(series) > max_rows shows a truncated view with 10 rows shown. + max_rows = pd.get_option("display.max_rows") + min_rows = pd.get_option("display.min_rows") + + if len(self) > max_rows: + max_rows = min_rows + + show_dimensions = pd.get_option("display.show_dimensions") + + self.to_string( + buf=buf, + name=self.name, + dtype=True, + min_rows=min_rows, + max_rows=max_rows, + length=show_dimensions, + ) + result = buf.getvalue() + + return result def to_string( self, @@ -217,33 +279,69 @@ class Series(NDFrame): length=False, dtype=False, name=False, - max_rows=None): - - if max_rows is None: + max_rows=None, + min_rows=None, + ): + # In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this + # by limiting rows by default. 
+ num_rows = len(self) # avoid multiple calls + if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED: + if max_rows is None: + max_rows = num_rows + else: + max_rows = min(num_rows, max_rows) + elif max_rows is None: warnings.warn("Series.to_string called without max_rows set " "- this will return entire index results. " - "Setting max_rows=60, overwrite if different behaviour is required.") - max_rows = 60 + "Setting max_rows={default}" + " overwrite if different behaviour is required." + .format(default=DEFAULT_NUM_ROWS_DISPLAYED), + UserWarning) + max_rows = DEFAULT_NUM_ROWS_DISPLAYED + + # because of the way pandas handles max_rows=0, not having this throws an error + # see eland issue #56 + if max_rows == 0: + max_rows = 1 # Create a slightly bigger dataframe than display - temp_df = self._build_repr_df(max_rows + 1, None) - if isinstance(temp_df, pd.DataFrame): - temp_df = temp_df[self.name] - temp_str = repr(temp_df) - if self.name is not None: - name_str = "Name: {}, ".format(str(self.name)) + temp_series = self._build_repr(max_rows + 1) + + if buf is not None: + _buf = _expand_user(_stringify_path(buf)) else: - name_str = "" - if len(self.index) > max_rows: - len_str = "Length: {}, ".format(len(self.index)) - else: - len_str = "" - dtype_str = "dtype: {}".format(temp_str.rsplit("dtype: ", 1)[-1]) - if len(self) == 0: - return "Series([], {}{}".format(name_str, dtype_str) - return temp_str.rsplit("\nName:", 1)[0] + "\n{}{}{}".format( - name_str, len_str, dtype_str - ) + _buf = StringIO() + + # Create repr of fake series without name, length, dtype summary + temp_str = temp_series.to_string(buf=_buf, + na_rep=na_rep, + float_format=float_format, + header=header, + index=index, + length=False, + dtype=False, + name=False, + max_rows=max_rows) + + # Create the summary + footer = "" + if name and self.name is not None: + footer += "Name: {}".format(str(self.name)) + if length and len(self) > max_rows: + if footer: + footer += ", " + footer += "Length: 
{}".format(len(self.index)) + if dtype: + if footer: + footer += ", " + footer += "dtype: {}".format(temp_series.dtype) + + if len(footer) > 0: + _buf.write("\n{}".format(footer)) + + if buf is None: + result = _buf.getvalue() + return result def _to_pandas(self): return self._query_compiler.to_pandas()[self.name] @@ -321,13 +419,16 @@ class Series(NDFrame): @property def ndim(self): """ - Returns 1 by definition of a Series1 + Returns 1 by definition of a Series Returns ------- int By definition 1 + See Also + -------- + :pandas_api_docs:`pandas.Series.ndim` """ return 1 @@ -338,34 +439,317 @@ class Series(NDFrame): return buf.getvalue() - def __truediv__(self, right): - return self.truediv(right) - - def truediv(self, right): + def __add__(self, right): """ - return a / b + Return addition of series and right, element-wise (binary operator add). + + Parameters + ---------- + right: eland.Series + + Returns + ------- + eland.Series + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'ecommerce').head(5) + >>> df.taxful_total_price + 0 36.98 + 1 53.98 + 2 199.98 + 3 174.98 + 4 80.98 + Name: taxful_total_price, dtype: float64 + >>> df.total_quantity + 0 2 + 1 2 + 2 2 + 3 2 + 4 2 + Name: total_quantity, dtype: int64 + >>> df.taxful_total_price + df.total_quantity + 0 38.980000 + 1 55.980000 + 2 201.979996 + 3 176.979996 + 4 82.980003 + dtype: float64 + """ + return self._numeric_op(right, _get_method_name()) + + def __truediv__(self, right): + """ + Return floating division of series and right, element-wise (binary operator truediv). 
+ + Parameters + ---------- + right: eland.Series + + Returns + ------- + eland.Series + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'ecommerce').head(5) + >>> df.taxful_total_price + 0 36.98 + 1 53.98 + 2 199.98 + 3 174.98 + 4 80.98 + Name: taxful_total_price, dtype: float64 + >>> df.total_quantity + 0 2 + 1 2 + 2 2 + 3 2 + 4 2 + Name: total_quantity, dtype: int64 + >>> df.taxful_total_price / df.total_quantity + 0 18.490000 + 1 26.990000 + 2 99.989998 + 3 87.489998 + 4 40.490002 + dtype: float64 + """ + return self._numeric_op(right, _get_method_name()) + + def __floordiv__(self, right): + """ + Return integer division of series and right, element-wise (binary operator floordiv //). + + Parameters + ---------- + right: eland.Series + + Returns + ------- + eland.Series + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'ecommerce').head(5) + >>> df.taxful_total_price + 0 36.98 + 1 53.98 + 2 199.98 + 3 174.98 + 4 80.98 + Name: taxful_total_price, dtype: float64 + >>> df.total_quantity + 0 2 + 1 2 + 2 2 + 3 2 + 4 2 + Name: total_quantity, dtype: int64 + >>> df.taxful_total_price // df.total_quantity + 0 18.0 + 1 26.0 + 2 99.0 + 3 87.0 + 4 40.0 + dtype: float64 + """ + return self._numeric_op(right, _get_method_name()) + + def __mod__(self, right): + """ + Return modulo of series and right, element-wise (binary operator mod %). 
+ + Parameters + ---------- + right: eland.Series + + Returns + ------- + eland.Series + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'ecommerce').head(5) + >>> df.taxful_total_price + 0 36.98 + 1 53.98 + 2 199.98 + 3 174.98 + 4 80.98 + Name: taxful_total_price, dtype: float64 + >>> df.total_quantity + 0 2 + 1 2 + 2 2 + 3 2 + 4 2 + Name: total_quantity, dtype: int64 + >>> df.taxful_total_price % df.total_quantity + 0 0.980000 + 1 1.980000 + 2 1.979996 + 3 0.979996 + 4 0.980003 + dtype: float64 + """ + return self._numeric_op(right, _get_method_name()) + + def __mul__(self, right): + """ + Return multiplication of series and right, element-wise (binary operator mul). + + Parameters + ---------- + right: eland.Series + + Returns + ------- + eland.Series + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'ecommerce').head(5) + >>> df.taxful_total_price + 0 36.98 + 1 53.98 + 2 199.98 + 3 174.98 + 4 80.98 + Name: taxful_total_price, dtype: float64 + >>> df.total_quantity + 0 2 + 1 2 + 2 2 + 3 2 + 4 2 + Name: total_quantity, dtype: int64 + >>> df.taxful_total_price * df.total_quantity + 0 73.959999 + 1 107.959999 + 2 399.959991 + 3 349.959991 + 4 161.960007 + dtype: float64 + """ + return self._numeric_op(right, _get_method_name()) + + def __sub__(self, right): + """ + Return subtraction of series and right, element-wise (binary operator sub). 
+ + Parameters + ---------- + right: eland.Series + + Returns + ------- + eland.Series + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'ecommerce').head(5) + >>> df.taxful_total_price + 0 36.98 + 1 53.98 + 2 199.98 + 3 174.98 + 4 80.98 + Name: taxful_total_price, dtype: float64 + >>> df.total_quantity + 0 2 + 1 2 + 2 2 + 3 2 + 4 2 + Name: total_quantity, dtype: int64 + >>> df.taxful_total_price - df.total_quantity + 0 34.980000 + 1 51.980000 + 2 197.979996 + 3 172.979996 + 4 78.980003 + dtype: float64 + """ + return self._numeric_op(right, _get_method_name()) + + def __pow__(self, right): + """ + Return exponential power of series and right, element-wise (binary operator pow \**\). + + Parameters + ---------- + right: eland.Series + + Returns + ------- + eland.Series + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'ecommerce').head(5) + >>> df.taxful_total_price + 0 36.98 + 1 53.98 + 2 199.98 + 3 174.98 + 4 80.98 + Name: taxful_total_price, dtype: float64 + >>> df.total_quantity + 0 2 + 1 2 + 2 2 + 3 2 + 4 2 + Name: total_quantity, dtype: int64 + >>> df.taxful_total_price ** df.total_quantity + 0 1367.520366 + 1 2913.840351 + 2 39991.998691 + 3 30617.998905 + 4 6557.760944 + dtype: float64 + """ + return self._numeric_op(right, _get_method_name()) + + add = __add__ + div = __truediv__ + divide = __truediv__ + floordiv = __floordiv__ + mod = __mod__ + mul = __mul__ + multiply = __mul__ + pow = __pow__ + sub = __sub__ + subtract = __sub__ + truediv = __truediv__ + + def _numeric_op(self, right, method_name): + """ + return a op b a & b == Series a & b must share same eland.Client, index_pattern and index_field + a == Series, b == numeric """ if isinstance(right, Series): # Check compatibility self._query_compiler.check_arithmetics(right._query_compiler) - new_field_name = "{0}_{1}_{2}".format(self.name, "truediv", right.name) + new_field_name = "{0}_{1}_{2}".format(self.name, method_name, right.name) # Compatible, so create new Series series 
= Series(query_compiler=self._query_compiler.arithmetic_op_fields( - new_field_name, 'truediv', self.name, right.name)) + new_field_name, method_name, self.name, right.name)) series.name = None return series - elif isinstance(right, (int, float)): # TODO extend to numpy types - new_field_name = "{0}_{1}_{2}".format(self.name, "truediv", str(right).replace('.','_')) + elif isinstance(right, (int, float)): # TODO extend to numpy types + new_field_name = "{0}_{1}_{2}".format(self.name, method_name, str(right).replace('.', '_')) # Compatible, so create new Series series = Series(query_compiler=self._query_compiler.arithmetic_op_fields( - new_field_name, 'truediv', self.name, float(right))) # force rhs to float + new_field_name, method_name, self.name, float(right))) # force rhs to float # name of Series remains original name series.name = self.name @@ -374,5 +758,123 @@ class Series(NDFrame): else: raise TypeError( "Can only perform arithmetic operation on selected types " - "{0} != {1}".format(type(self), type(right)) + "{0} != {1} for {2}".format(type(self), type(right), method_name) ) + + def max(self): + """ + Return the maximum of the Series values + + TODO - implement remainder of pandas arguments, currently non-numerics are not supported + + Returns + ------- + float + max value + + See Also + -------- + :pandas_api_docs:`pandas.Series.max` + + Examples + -------- + >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice') + >>> int(s.max()) + 1199 + """ + results = super().max() + return results.squeeze() + + def mean(self): + """ + Return the mean of the Series values + + TODO - implement remainder of pandas arguments, currently non-numerics are not supported + + Returns + ------- + float + mean value + + See Also + -------- + :pandas_api_docs:`pandas.Series.mean` + + Examples + -------- + >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice') + >>> int(s.mean()) + 628 + """ + results = super().mean() + return results.squeeze() + + def
min(self): + """ + Return the minimum of the Series values + + TODO - implement remainder of pandas arguments, currently non-numerics are not supported + + Returns + ------- + float + min value + + See Also + -------- + :pandas_api_docs:`pandas.Series.min` + + Examples + -------- + >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice') + >>> int(s.min()) + 100 + """ + results = super().min() + return results.squeeze() + + def sum(self): + """ + Return the sum of the Series values + + TODO - implement remainder of pandas arguments, currently non-numerics are not supported + + Returns + ------- + float + sum of values + + See Also + -------- + :pandas_api_docs:`pandas.Series.sum` + + Examples + -------- + >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice') + >>> int(s.sum()) + 8204364 + """ + results = super().sum() + return results.squeeze() + + def nunique(self): + """ + Return the number of unique values in the Series + + Returns + ------- + int + number of unique values + + See Also + -------- + :pandas_api_docs:`pandas.Series.nunique` + + Examples + -------- + >>> s = ed.Series('localhost', 'flights', name='Carrier') + >>> s.nunique() + 4 + """ + results = super().nunique() + return results.squeeze() diff --git a/eland/tests/__init__.py b/eland/tests/__init__.py index f5dfb16..4791380 100644 --- a/eland/tests/__init__.py +++ b/eland/tests/__init__.py @@ -279,10 +279,10 @@ ECOMMERCE_MAPPING = {"mappings": { "type": "keyword" }, "taxful_total_price": { - "type": "half_float" + "type": "float" }, "taxless_total_price": { - "type": "half_float" + "type": "float" }, "total_quantity": { "type": "integer" diff --git a/eland/tests/series/test_arithmetics_pytest.py b/eland/tests/series/test_arithmetics_pytest.py index 5a510ea..d13595f 100644 --- a/eland/tests/series/test_arithmetics_pytest.py +++ b/eland/tests/series/test_arithmetics_pytest.py @@ -4,6 +4,8 @@ from eland.tests.common import TestData, assert_pandas_eland_series_equal from pandas.util.testing import
assert_series_equal import pytest +import numpy as np + class TestSeriesArithmetics(TestData): @@ -15,29 +17,35 @@ class TestSeriesArithmetics(TestData): with pytest.raises(TypeError): ed_df['total_quantity'] / pd_df['taxful_total_price'] - def test_ecommerce_series_div(self): - pd_df = self.pd_ecommerce() - ed_df = self.ed_ecommerce() + def test_ecommerce_series_basic_arithmetics(self): + pd_df = self.pd_ecommerce().head(100) + ed_df = self.ed_ecommerce().head(100) - pd_avg_price = pd_df['total_quantity'] / pd_df['taxful_total_price'] - ed_avg_price = ed_df['total_quantity'] / ed_df['taxful_total_price'] + ops = ['__add__', + '__truediv__', + '__floordiv__', + '__pow__', + '__mod__', + '__mul__', + '__sub__', + 'add', + 'truediv', + 'floordiv', + 'pow', + 'mod', + 'mul', + 'sub'] - assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True) + for op in ops: + pd_series = getattr(pd_df['taxful_total_price'], op)(pd_df['total_quantity']) + ed_series = getattr(ed_df['taxful_total_price'], op)(ed_df['total_quantity']) + assert_pandas_eland_series_equal(pd_series, ed_series, check_less_precise=True) - def test_ecommerce_series_div_float(self): - pd_df = self.pd_ecommerce() - ed_df = self.ed_ecommerce() + pd_series = getattr(pd_df['taxful_total_price'], op)(10.56) + ed_series = getattr(ed_df['taxful_total_price'], op)(10.56) + assert_pandas_eland_series_equal(pd_series, ed_series, check_less_precise=True) - pd_avg_price = pd_df['total_quantity'] / 10.0 - ed_avg_price = ed_df['total_quantity'] / 10.0 + pd_series = getattr(pd_df['taxful_total_price'], op)(int(8)) + ed_series = getattr(ed_df['taxful_total_price'], op)(int(8)) + assert_pandas_eland_series_equal(pd_series, ed_series, check_less_precise=True) - assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True) - - def test_ecommerce_series_div_int(self): - pd_df = self.pd_ecommerce() - ed_df = self.ed_ecommerce() - - pd_avg_price = pd_df['total_quantity'] / int(10) 
- ed_avg_price = ed_df['total_quantity'] / int(10) - - assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True) diff --git a/eland/tests/series/test_info_es_pytest.py b/eland/tests/series/test_info_es_pytest.py new file mode 100644 index 0000000..cc6b633 --- /dev/null +++ b/eland/tests/series/test_info_es_pytest.py @@ -0,0 +1,17 @@ +# File called _pytest for PyCharm compatibility + +from pandas.util.testing import assert_almost_equal + +from eland.tests.common import TestData + +import eland as ed + + +class TestSeriesInfoEs(TestData): + + def test_flights_info_es(self): + ed_flights = self.ed_flights()['AvgTicketPrice'] + + # No assertion, just test it can be called + info_es = ed_flights.info_es() + diff --git a/eland/tests/series/test_metrics_pytest.py b/eland/tests/series/test_metrics_pytest.py new file mode 100644 index 0000000..ef221ba --- /dev/null +++ b/eland/tests/series/test_metrics_pytest.py @@ -0,0 +1,44 @@ +# File called _pytest for PyCharm compatibility + +from pandas.util.testing import assert_almost_equal + +from eland.tests.common import TestData + +import eland as ed + + +class TestSeriesMetrics(TestData): + + funcs = ['max', 'min', 'mean', 'sum'] + + def test_flights_metrics(self): + pd_flights = self.pd_flights()['AvgTicketPrice'] + ed_flights = self.ed_flights()['AvgTicketPrice'] + + for func in self.funcs: + pd_metric = getattr(pd_flights, func)() + ed_metric = getattr(ed_flights, func)() + assert_almost_equal(pd_metric, ed_metric, check_less_precise=True) + + def test_ecommerce_selected_non_numeric_source_fields(self): + # None of these are numeric + column = 'category' + + ed_ecommerce = self.ed_ecommerce()[column] + + for func in self.funcs: + ed_metric = getattr(ed_ecommerce, func)() + assert ed_metric.empty + + + def test_ecommerce_selected_all_numeric_source_fields(self): + # All of these are numeric + columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price'] + + for column in columns: +
pd_ecommerce = self.pd_ecommerce()[column] + ed_ecommerce = self.ed_ecommerce()[column] + + for func in self.funcs: + assert_almost_equal(getattr(pd_ecommerce, func)(), getattr(ed_ecommerce, func)(), + check_less_precise=True) diff --git a/eland/tests/series/test_repr_pytest.py b/eland/tests/series/test_repr_pytest.py index 9b937c8..e83d6e9 100644 --- a/eland/tests/series/test_repr_pytest.py +++ b/eland/tests/series/test_repr_pytest.py @@ -1,13 +1,14 @@ # File called _pytest for PyCharm compatability import eland as ed +import pandas as pd from eland.tests import ELASTICSEARCH_HOST -from eland.tests import FLIGHTS_INDEX_NAME +from eland.tests import FLIGHTS_INDEX_NAME, ECOMMERCE_INDEX_NAME from eland.tests.common import TestData class TestSeriesRepr(TestData): - def test_repr(self): + def test_repr_flights_carrier(self): pd_s = self.pd_flights()['Carrier'] ed_s = ed.Series(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME, 'Carrier') @@ -15,3 +16,12 @@ class TestSeriesRepr(TestData): ed_repr = repr(ed_s) assert pd_repr == ed_repr + + def test_repr_flights_carrier_5(self): + pd_s = self.pd_flights()['Carrier'].head(5) + ed_s = ed.Series(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME, 'Carrier').head(5) + + pd_repr = repr(pd_s) + ed_repr = repr(ed_s) + + assert pd_repr == ed_repr