Added Series metric aggs + Series docs

Also, improved Series.to_string()
This commit is contained in:
Stephen Dodson 2019-11-22 15:44:55 +00:00
parent 5d119215f8
commit 84e23ab5d1
38 changed files with 973 additions and 116 deletions

View File

@ -0,0 +1,6 @@
eland.Series.add
================
.. currentmodule:: eland
.. automethod:: Series.add

View File

@ -0,0 +1,6 @@
eland.Series.describe
=====================
.. currentmodule:: eland
.. automethod:: Series.describe

View File

@ -0,0 +1,6 @@
eland.Series.div
================
.. currentmodule:: eland
.. automethod:: Series.div

View File

@ -0,0 +1,6 @@
eland.Series.empty
==================
.. currentmodule:: eland
.. autoattribute:: Series.empty

View File

@ -0,0 +1,6 @@
eland.Series.floordiv
=====================
.. currentmodule:: eland
.. automethod:: Series.floordiv

View File

@ -0,0 +1,6 @@
eland.Series.head
=================
.. currentmodule:: eland
.. automethod:: Series.head

View File

@ -0,0 +1,6 @@
eland.Series.index
==================
.. currentmodule:: eland
.. autoattribute:: Series.index

View File

@ -0,0 +1,6 @@
eland.Series.max
================
.. currentmodule:: eland
.. automethod:: Series.max

View File

@ -0,0 +1,6 @@
eland.Series.mean
=================
.. currentmodule:: eland
.. automethod:: Series.mean

View File

@ -0,0 +1,6 @@
eland.Series.min
================
.. currentmodule:: eland
.. automethod:: Series.min

View File

@ -0,0 +1,6 @@
eland.Series.mod
================
.. currentmodule:: eland
.. automethod:: Series.mod

View File

@ -0,0 +1,6 @@
eland.Series.mul
================
.. currentmodule:: eland
.. automethod:: Series.mul

View File

@ -0,0 +1,6 @@
eland.Series.name
=================
.. currentmodule:: eland
.. autoattribute:: Series.name

View File

@ -0,0 +1,6 @@
eland.Series.nunique
====================
.. currentmodule:: eland
.. automethod:: Series.nunique

View File

@ -0,0 +1,6 @@
eland.Series.pow
================
.. currentmodule:: eland
.. automethod:: Series.pow

View File

@ -0,0 +1,6 @@
eland.Series.rename
===================
.. currentmodule:: eland
.. automethod:: Series.rename

View File

@ -0,0 +1,6 @@
eland.Series
============
.. currentmodule:: eland
.. autoclass:: Series

View File

@ -0,0 +1,6 @@
eland.Series.shape
==================
.. currentmodule:: eland
.. autoattribute:: Series.shape

View File

@ -0,0 +1,6 @@
eland.Series.sub
================
.. currentmodule:: eland
.. automethod:: Series.sub

View File

@ -0,0 +1,6 @@
eland.Series.sum
================
.. currentmodule:: eland
.. automethod:: Series.sum

View File

@ -0,0 +1,6 @@
eland.Series.tail
=================
.. currentmodule:: eland
.. automethod:: Series.tail

View File

@ -0,0 +1,6 @@
eland.Series.to_string
======================
.. currentmodule:: eland
.. automethod:: Series.to_string

View File

@ -0,0 +1,6 @@
eland.Series.truediv
====================
.. currentmodule:: eland
.. automethod:: Series.truediv

View File

@ -1,5 +1,5 @@
eland.Series.value_counts eland.Series.value_counts
=========================== =========================
.. currentmodule:: eland .. currentmodule:: eland

View File

@ -91,5 +91,3 @@ Elasticsearch utilities
:toctree: api/ :toctree: api/
DataFrame.info_es DataFrame.info_es

View File

@ -5,9 +5,77 @@ Series
========= =========
.. currentmodule:: eland .. currentmodule:: eland
Constructor
~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series
Attributes and underlying data
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**Axes**
.. autosummary::
:toctree: api/
Series.index
Series.shape
Series.name
Series.empty
Indexing, iteration
~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series.head
Series.tail
Binary operator functions
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series.add
Series.sub
Series.mul
Series.div
Series.truediv
Series.floordiv
Series.mod
Series.pow
Computations / descriptive stats Computations / descriptive stats
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary:: .. autosummary::
:toctree: api/ :toctree: api/
Series.describe
Series.max
Series.mean
Series.min
Series.sum
Series.nunique
Series.value_counts Series.value_counts
Reindexing / selection / label manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series.rename
Serialization / IO / conversion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series.to_string
Elasticsearch utilities
~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series.info_es

View File

@ -1,5 +1,6 @@
from __future__ import absolute_import from __future__ import absolute_import
from eland.common import *
from eland.client import * from eland.client import *
from eland.filter import * from eland.filter import *
from eland.index import * from eland.index import *

8
eland/common.py Normal file
View File

@ -0,0 +1,8 @@
# Default number of rows displayed (different to pandas where ALL could be displayed)
DEFAULT_NUM_ROWS_DISPLAYED = 60
def docstring_parameter(*sub):
    """Build a decorator that fills placeholders in the target's docstring.

    The decorated object's ``__doc__`` is used as a ``str.format`` template
    and the positional values in ``sub`` are substituted into it.

    Parameters
    ----------
    sub
        Positional values handed to ``str.format``.

    Returns
    -------
    callable
        Decorator that rewrites ``obj.__doc__`` in place and returns ``obj``.
    """
    def dec(obj):
        # __doc__ is None for undocumented objects and when docstrings are
        # stripped (python -OO); leave it alone rather than fail on None.format.
        if obj.__doc__ is not None:
            obj.__doc__ = obj.__doc__.format(*sub)
        return obj

    return dec

View File

@ -18,15 +18,7 @@ import eland.plotting as gfx
from eland import NDFrame from eland import NDFrame
from eland import Series from eland import Series
from eland.filter import BooleanFilter, ScriptFilter from eland.filter import BooleanFilter, ScriptFilter
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
# Default number of rows displayed (different to pandas where ALL could be displayed)
DEFAULT_NUM_ROWS_DISPLAYED = 60
def docstring_parameter(*sub):
def dec(obj):
obj.__doc__ = obj.__doc__.format(*sub)
return obj
return dec
class DataFrame(NDFrame): class DataFrame(NDFrame):
@ -43,7 +35,7 @@ class DataFrame(NDFrame):
- elasticsearch-py instance or - elasticsearch-py instance or
- eland.Client instance - eland.Client instance
index_pattern: str index_pattern: str
Elasticsearch index pattern (e.g. 'flights' or 'filebeat-*') Elasticsearch index pattern (e.g. 'flights' or 'filebeat-\*')
columns: list of str, optional columns: list of str, optional
List of DataFrame columns. A subset of the Elasticsearch index's fields. List of DataFrame columns. A subset of the Elasticsearch index's fields.
index_field: str, optional index_field: str, optional
@ -98,7 +90,6 @@ class DataFrame(NDFrame):
<BLANKLINE> <BLANKLINE>
[5 rows x 2 columns] [5 rows x 2 columns]
""" """
def __init__(self, def __init__(self,
client=None, client=None,
index_pattern=None, index_pattern=None,
@ -586,7 +577,7 @@ class DataFrame(NDFrame):
max_rows = 1 max_rows = 1
# Create a slightly bigger dataframe than display # Create a slightly bigger dataframe than display
df = self._build_repr_df(max_rows + 1, max_cols) df = self._build_repr(max_rows + 1)
if buf is not None: if buf is not None:
_buf = _expand_user(_stringify_path(buf)) _buf = _expand_user(_stringify_path(buf))
@ -651,7 +642,7 @@ class DataFrame(NDFrame):
max_rows = 1 max_rows = 1
# Create a slightly bigger dataframe than display # Create a slightly bigger dataframe than display
df = self._build_repr_df(max_rows + 1, max_cols) df = self._build_repr(max_rows + 1)
if buf is not None: if buf is not None:
_buf = _expand_user(_stringify_path(buf)) _buf = _expand_user(_stringify_path(buf))
@ -1064,3 +1055,48 @@ class DataFrame(NDFrame):
return self._getitem(key) return self._getitem(key)
else: else:
return default return default
@property
def values(self):
    """
    Not implemented.

    In pandas this returns a Numpy representation of the DataFrame. This would involve scan/scrolling the
    entire index.

    If this is required, call ``ed.eland_to_pandas(ed_df).values``, *but beware this will scan/scroll the entire
    Elasticsearch index(s) into memory*

    Raises
    ------
    NotImplementedError
        Always; this accessor is deliberately unsupported.

    See Also
    --------
    :pandas_api_docs:`pandas.DataFrame.values`

    Examples
    --------
    >>> ed_df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'Carrier']).head(5)
    >>> pd_df = ed.eland_to_pandas(ed_df)
    >>> print("type(ed_df)={0}\\ntype(pd_df)={1}".format(type(ed_df), type(pd_df)))
    type(ed_df)=<class 'eland.dataframe.DataFrame'>
    type(pd_df)=<class 'pandas.core.frame.DataFrame'>
    >>> ed_df
    AvgTicketPrice Carrier
    0 841.265642 Kibana Airlines
    1 882.982662 Logstash Airways
    2 190.636904 Logstash Airways
    3 181.694216 Kibana Airlines
    4 730.041778 Kibana Airlines
    <BLANKLINE>
    [5 rows x 2 columns]
    >>> pd_df.values
    array([[841.2656419677076, 'Kibana Airlines'],
    [882.9826615595518, 'Logstash Airways'],
    [190.6369038508356, 'Logstash Airways'],
    [181.69421554118, 'Kibana Airlines'],
    [730.041778346198, 'Kibana Airlines']], dtype=object)
    """
    # Adjacent string literals are concatenated verbatim, so the first one
    # must end with a space to keep the two sentences apart.
    raise NotImplementedError(
        "This method would scan/scroll the entire Elasticsearch index(s) into memory. "
        "If this is explicitly required and there is sufficient memory, call `ed.eland_to_pandas(ed_df).values`"
    )
to_numpy = values

View File

@ -31,7 +31,6 @@ from pandas.util._validators import validate_bool_kwarg
from eland import ElandQueryCompiler from eland import ElandQueryCompiler
class NDFrame: class NDFrame:
def __init__(self, def __init__(self,
@ -65,6 +64,7 @@ class NDFrame:
See Also See Also
-------- --------
:pandas_api_docs:`pandas.DataFrame.index` :pandas_api_docs:`pandas.DataFrame.index`
:pandas_api_docs:`pandas.Series.index`
Examples Examples
-------- --------
@ -72,6 +72,10 @@ class NDFrame:
>>> assert isinstance(df.index, ed.Index) >>> assert isinstance(df.index, ed.Index)
>>> df.index.index_field >>> df.index.index_field
'_id' '_id'
>>> s = df['Carrier']
>>> assert isinstance(s.index, ed.Index)
>>> s.index.index_field
'_id'
""" """
return self._query_compiler.index return self._query_compiler.index
@ -104,9 +108,8 @@ class NDFrame:
""" """
return self._query_compiler.dtypes return self._query_compiler.dtypes
def _build_repr_df(self, num_rows, num_cols): def _build_repr(self, num_rows):
# Overriden version of BasePandasDataset._build_repr_df # self could be Series or DataFrame
# to avoid issues with concat
if len(self.index) <= num_rows: if len(self.index) <= num_rows:
return self._to_pandas() return self._to_pandas()

View File

@ -588,6 +588,7 @@ class Operations:
df = self._apply_df_post_processing(df, post_processing) df = self._apply_df_post_processing(df, post_processing)
collector.collect(df) collector.collect(df)
def iloc(self, index, field_names): def iloc(self, index, field_names):
# index and field_names are indexers # index and field_names are indexers
task = ('iloc', (index, field_names)) task = ('iloc', (index, field_names))
@ -881,9 +882,10 @@ class Operations:
left_field = item[1][1][1][0] left_field = item[1][1][1][0]
right_field = item[1][1][1][1] right_field = item[1][1][1][1]
# https://www.elastic.co/guide/en/elasticsearch/painless/current/painless-api-reference-shared-java-lang.html#painless-api-reference-shared-Math
if isinstance(right_field, str): if isinstance(right_field, str):
""" """
(if op_name = 'truediv') (if op_name = '__truediv__')
"script_fields": { "script_fields": {
"field_name": { "field_name": {
@ -893,12 +895,23 @@ class Operations:
} }
} }
""" """
if op_name == 'truediv': if op_name == '__add__':
op = '/' source = "doc['{0}'].value + doc['{1}'].value".format(left_field, right_field)
elif op_name == '__truediv__':
source = "doc['{0}'].value / doc['{1}'].value".format(left_field, right_field)
elif op_name == '__floordiv__':
source = "Math.floor(doc['{0}'].value / doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__pow__':
source = "Math.pow(doc['{0}'].value, doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__mod__':
source = "doc['{0}'].value % doc['{1}'].value".format(left_field, right_field)
elif op_name == '__mul__':
source = "doc['{0}'].value * doc['{1}'].value".format(left_field, right_field)
elif op_name == '__sub__':
source = "doc['{0}'].value - doc['{1}'].value".format(left_field, right_field)
else: else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
source = "doc['{0}'].value {1} doc['{2}'].value".format(left_field, op, right_field)
if query_params['query_script_fields'] is None: if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {} query_params['query_script_fields'] = {}
@ -909,7 +922,7 @@ class Operations:
} }
else: else:
""" """
(if op_name = 'truediv') (if op_name = '__truediv__')
"script_fields": { "script_fields": {
"field_name": { "field_name": {
@ -919,12 +932,23 @@ class Operations:
} }
} }
""" """
if op_name == 'truediv': if op_name == '__add__':
op = '/' source = "doc['{0}'].value + {1}".format(left_field, right_field)
elif op_name == '__truediv__':
source = "doc['{0}'].value / {1}".format(left_field, right_field)
elif op_name == '__floordiv__':
source = "Math.floor(doc['{0}'].value / {1})".format(left_field, right_field)
elif op_name == '__pow__':
source = "Math.pow(doc['{0}'].value, {1})".format(left_field, right_field)
elif op_name == '__mod__':
source = "doc['{0}'].value % {1}".format(left_field, right_field)
elif op_name == '__mul__':
source = "doc['{0}'].value * {1}".format(left_field, right_field)
elif op_name == '__sub__':
source = "doc['{0}'].value - {1}".format(left_field, right_field)
else: else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
source = "doc['{0}'].value {1} {2}".format(left_field, op, right_field)
if query_params['query_script_fields'] is None: if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {} query_params['query_script_fields'] = {}

View File

@ -239,9 +239,9 @@ class ElandQueryCompiler:
# Create pandas DataFrame # Create pandas DataFrame
df = pd.DataFrame(data=rows, index=index) df = pd.DataFrame(data=rows, index=index)
# _source may not contain all columns in the mapping # _source may not contain all field_names in the mapping
# therefore, fill in missing columns # therefore, fill in missing field_names
# (note this returns self.columns NOT IN df.columns) # (note this returns self.field_names NOT IN df.columns)
missing_field_names = list(set(self.field_names) - set(df.columns)) missing_field_names = list(set(self.field_names) - set(df.columns))
for missing in missing_field_names: for missing in missing_field_names:

View File

@ -11,19 +11,26 @@ without storing the dataset in local memory.
Implementation Details Implementation Details
---------------------- ----------------------
Based on NDFrame which underpins eland.1DataFrame Based on NDFrame which underpins eland.DataFrame
""" """
import sys
import warnings
from io import StringIO from io import StringIO
import pandas as pd import pandas as pd
import numpy as np from pandas.io.common import _expand_user, _stringify_path
from eland import NDFrame from eland import NDFrame
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED
from eland.filter import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter, IsIn from eland.filter import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter, IsIn
def _get_method_name():
    """Return the name of the function that called this helper."""
    # Frame 0 is this helper itself; frame 1 is its immediate caller.
    caller_frame = sys._getframe(1)
    caller_code = caller_frame.f_code
    return caller_code.co_name
class Series(NDFrame): class Series(NDFrame):
""" """
pandas.Series like API that proxies into Elasticsearch index(es). pandas.Series like API that proxies into Elasticsearch index(es).
@ -34,35 +41,35 @@ class Series(NDFrame):
A reference to a Elasticsearch python client A reference to a Elasticsearch python client
index_pattern : str index_pattern : str
An Elasticsearch index pattern. This can contain wildcards (e.g. filebeat-*). An Elasticsearch index pattern. This can contain wildcards (e.g. filebeat-\*\).
index_field : str index_field : str
The field to base the series on The field to base the series on
See Also
--------
Examples
--------
import eland as ed
client = ed.Client(Elasticsearch())
s = ed.DataFrame(client, 'reviews', 'date')
df.head()
reviewerId vendorId rating date
0 0 0 5 2006-04-07 17:08
1 1 1 5 2006-05-04 12:16
2 2 2 4 2006-04-21 12:26
3 3 3 5 2006-04-18 15:48
4 3 4 5 2006-04-18 15:49
Notice that the types are based on Elasticsearch mappings
Notes Notes
----- -----
If the Elasticsearch index is deleted or index mappings are changed after this If the Elasticsearch index is deleted or index mappings are changed after this
object is created, the object is not rebuilt and so inconsistencies can occur. object is created, the object is not rebuilt and so inconsistencies can occur.
See Also
--------
:pandas_api_docs:`pandas.Series`
Examples
--------
>>> ed.Series(client='localhost', index_pattern='flights', name='Carrier')
0 Kibana Airlines
1 Logstash Airways
2 Logstash Airways
3 Kibana Airlines
4 Kibana Airlines
...
13054 Logstash Airways
13055 Logstash Airways
13056 Logstash Airways
13057 JetBeats
13058 JetBeats
Name: Carrier, Length: 13059, dtype: object
""" """
def __init__(self, def __init__(self,
@ -94,6 +101,34 @@ class Series(NDFrame):
""" """
return len(self.index) == 0 return len(self.index) == 0
@property
def shape(self):
    """
    Dimensions of the Series as a ``(rows, columns)`` tuple.

    Returns
    -------
    shape: tuple
        0. number of rows
        1. number of columns

    Notes
    -----
    - number of rows ``len(series)`` queries Elasticsearch
    - number of columns == 1

    Examples
    --------
    >>> df = ed.Series('localhost', 'ecommerce', name='total_quantity')
    >>> df.shape
    (4675, 1)
    """
    # The column count is fixed at 1; only the row count is looked up.
    return len(self), 1
def _get_name(self): def _get_name(self):
return self._query_compiler.columns[0] return self._query_compiler.columns[0]
@ -118,7 +153,7 @@ class Series(NDFrame):
See Also See Also
-------- --------
:pandas_api_docs:pandas.Series.rename :pandas_api_docs:`pandas.Series.rename`
Examples Examples
-------- --------
@ -200,12 +235,39 @@ class Series(NDFrame):
return self._query_compiler.value_counts(es_size) return self._query_compiler.value_counts(es_size)
# dtype not implemented for Series as causes query to fail
# in pandas.core.computation.ops.Term.type
# ---------------------------------------------------------------------- # ----------------------------------------------------------------------
# Rendering Methods # Rendering Methods
def __repr__(self): def __repr__(self):
num_rows = pd.get_option("max_rows") or 60 """
Return a string representation for a particular Series.
"""
buf = StringIO()
return self.to_string(max_rows=num_rows) # max_rows and max_cols determine the maximum size of the pretty printed tabular
# representation of the series. pandas defaults are 60 and 20 respectively.
# series where len(series) > max_rows shows a truncated view with 10 rows shown.
max_rows = pd.get_option("display.max_rows")
min_rows = pd.get_option("display.min_rows")
if len(self) > max_rows:
max_rows = min_rows
show_dimensions = pd.get_option("display.show_dimensions")
self.to_string(
buf=buf,
name=self.name,
dtype=True,
min_rows=min_rows,
max_rows=max_rows,
length=show_dimensions,
)
result = buf.getvalue()
return result
def to_string( def to_string(
self, self,
@ -217,33 +279,69 @@ class Series(NDFrame):
length=False, length=False,
dtype=False, dtype=False,
name=False, name=False,
max_rows=None): max_rows=None,
min_rows=None,
if max_rows is None: ):
# In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this
# by limiting rows by default.
num_rows = len(self) # avoid multiple calls
if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED:
if max_rows is None:
max_rows = num_rows
else:
max_rows = min(num_rows, max_rows)
elif max_rows is None:
warnings.warn("Series.to_string called without max_rows set " warnings.warn("Series.to_string called without max_rows set "
"- this will return entire index results. " "- this will return entire index results. "
"Setting max_rows=60, overwrite if different behaviour is required.") "Setting max_rows={default}"
max_rows = 60 " overwrite if different behaviour is required."
.format(default=DEFAULT_NUM_ROWS_DISPLAYED),
UserWarning)
max_rows = DEFAULT_NUM_ROWS_DISPLAYED
# because of the way pandas handles max_rows=0, not having this throws an error
# see eland issue #56
if max_rows == 0:
max_rows = 1
# Create a slightly bigger dataframe than display # Create a slightly bigger dataframe than display
temp_df = self._build_repr_df(max_rows + 1, None) temp_series = self._build_repr(max_rows + 1)
if isinstance(temp_df, pd.DataFrame):
temp_df = temp_df[self.name] if buf is not None:
temp_str = repr(temp_df) _buf = _expand_user(_stringify_path(buf))
if self.name is not None:
name_str = "Name: {}, ".format(str(self.name))
else: else:
name_str = "" _buf = StringIO()
if len(self.index) > max_rows:
len_str = "Length: {}, ".format(len(self.index)) # Create repr of fake series without name, length, dtype summary
else: temp_str = temp_series.to_string(buf=_buf,
len_str = "" na_rep=na_rep,
dtype_str = "dtype: {}".format(temp_str.rsplit("dtype: ", 1)[-1]) float_format=float_format,
if len(self) == 0: header=header,
return "Series([], {}{}".format(name_str, dtype_str) index=index,
return temp_str.rsplit("\nName:", 1)[0] + "\n{}{}{}".format( length=False,
name_str, len_str, dtype_str dtype=False,
) name=False,
max_rows=max_rows)
# Create the summary
footer = ""
if name and self.name is not None:
footer += "Name: {}".format(str(self.name))
if length and len(self) > max_rows:
if footer:
footer += ", "
footer += "Length: {}".format(len(self.index))
if dtype:
if footer:
footer += ", "
footer += "dtype: {}".format(temp_series.dtype)
if len(footer) > 0:
_buf.write("\n{}".format(footer))
if buf is None:
result = _buf.getvalue()
return result
def _to_pandas(self): def _to_pandas(self):
return self._query_compiler.to_pandas()[self.name] return self._query_compiler.to_pandas()[self.name]
@ -321,13 +419,16 @@ class Series(NDFrame):
@property @property
def ndim(self): def ndim(self):
""" """
Returns 1 by definition of a Series1 Returns 1 by definition of a Series
Returns Returns
------- -------
int int
By definition 1 By definition 1
See Also
--------
:pandas_api_docs:`pandas.Series.ndim`
""" """
return 1 return 1
@ -338,34 +439,317 @@ class Series(NDFrame):
return buf.getvalue() return buf.getvalue()
def __truediv__(self, right): def __add__(self, right):
return self.truediv(right)
def truediv(self, right):
""" """
return a / b Return addition of series and right, element-wise (binary operator add).
Parameters
----------
right: eland.Series
Returns
-------
eland.Series
Examples
--------
>>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
>>> df.taxful_total_price
0 36.98
1 53.98
2 199.98
3 174.98
4 80.98
Name: taxful_total_price, dtype: float64
>>> df.total_quantity
0 2
1 2
2 2
3 2
4 2
Name: total_quantity, dtype: int64
>>> df.taxful_total_price + df.total_quantity
0 38.980000
1 55.980000
2 201.979996
3 176.979996
4 82.980003
dtype: float64
"""
return self._numeric_op(right, _get_method_name())
def __truediv__(self, right):
    """
    Element-wise floating point division of this series by ``right`` (binary operator ``/``).

    Parameters
    ----------
    right: eland.Series, int or float
        Divisor; handed unchanged to ``_numeric_op``.

    Returns
    -------
    eland.Series

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
    >>> df.taxful_total_price
    0 36.98
    1 53.98
    2 199.98
    3 174.98
    4 80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0 2
    1 2
    2 2
    3 2
    4 2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price / df.total_quantity
    0 18.490000
    1 26.990000
    2 99.989998
    3 87.489998
    4 40.490002
    dtype: float64
    """
    # The helper reports its caller's name, so it must be invoked directly here.
    op_name = _get_method_name()
    return self._numeric_op(right, op_name)
def __floordiv__(self, right):
    """
    Element-wise integer (floor) division of this series by ``right`` (binary operator ``//``).

    Parameters
    ----------
    right: eland.Series, int or float
        Divisor; handed unchanged to ``_numeric_op``.

    Returns
    -------
    eland.Series

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
    >>> df.taxful_total_price
    0 36.98
    1 53.98
    2 199.98
    3 174.98
    4 80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0 2
    1 2
    2 2
    3 2
    4 2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price // df.total_quantity
    0 18.0
    1 26.0
    2 99.0
    3 87.0
    4 40.0
    dtype: float64
    """
    # The helper reports its caller's name, so it must be invoked directly here.
    op_name = _get_method_name()
    return self._numeric_op(right, op_name)
def __mod__(self, right):
    """
    Element-wise modulo of this series and ``right`` (binary operator ``%``).

    Parameters
    ----------
    right: eland.Series, int or float
        Right-hand operand; handed unchanged to ``_numeric_op``.

    Returns
    -------
    eland.Series

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
    >>> df.taxful_total_price
    0 36.98
    1 53.98
    2 199.98
    3 174.98
    4 80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0 2
    1 2
    2 2
    3 2
    4 2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price % df.total_quantity
    0 0.980000
    1 1.980000
    2 1.979996
    3 0.979996
    4 0.980003
    dtype: float64
    """
    # The helper reports its caller's name, so it must be invoked directly here.
    op_name = _get_method_name()
    return self._numeric_op(right, op_name)
def __mul__(self, right):
    """
    Element-wise product of this series and ``right`` (binary operator ``*``).

    Parameters
    ----------
    right: eland.Series, int or float
        Right-hand operand; handed unchanged to ``_numeric_op``.

    Returns
    -------
    eland.Series

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
    >>> df.taxful_total_price
    0 36.98
    1 53.98
    2 199.98
    3 174.98
    4 80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0 2
    1 2
    2 2
    3 2
    4 2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price * df.total_quantity
    0 73.959999
    1 107.959999
    2 399.959991
    3 349.959991
    4 161.960007
    dtype: float64
    """
    # The helper reports its caller's name, so it must be invoked directly here.
    op_name = _get_method_name()
    return self._numeric_op(right, op_name)
def __sub__(self, right):
    """
    Element-wise difference of this series and ``right`` (binary operator ``-``).

    Parameters
    ----------
    right: eland.Series, int or float
        Right-hand operand; handed unchanged to ``_numeric_op``.

    Returns
    -------
    eland.Series

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
    >>> df.taxful_total_price
    0 36.98
    1 53.98
    2 199.98
    3 174.98
    4 80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0 2
    1 2
    2 2
    3 2
    4 2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price - df.total_quantity
    0 34.980000
    1 51.980000
    2 197.979996
    3 172.979996
    4 78.980003
    dtype: float64
    """
    # The helper reports its caller's name, so it must be invoked directly here.
    op_name = _get_method_name()
    return self._numeric_op(right, op_name)
def __pow__(self, right):
    """
    Element-wise exponentiation of this series by ``right`` (binary operator ``**``).

    Parameters
    ----------
    right: eland.Series, int or float
        Exponent; handed unchanged to ``_numeric_op``.

    Returns
    -------
    eland.Series

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
    >>> df.taxful_total_price
    0 36.98
    1 53.98
    2 199.98
    3 174.98
    4 80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0 2
    1 2
    2 2
    3 2
    4 2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price ** df.total_quantity
    0 1367.520366
    1 2913.840351
    2 39991.998691
    3 30617.998905
    4 6557.760944
    dtype: float64
    """
    # The helper reports its caller's name, so it must be invoked directly here.
    op_name = _get_method_name()
    return self._numeric_op(right, op_name)
add = __add__
div = __truediv__
divide = __truediv__
floordiv = __floordiv__
mod = __mod__
mul = __mul__
multiply = __mul__
pow = __pow__
sub = __sub__
subtract = __sub__
truediv = __truediv__
def _numeric_op(self, right, method_name):
"""
return a op b
a & b == Series a & b == Series
a & b must share same eland.Client, index_pattern and index_field a & b must share same eland.Client, index_pattern and index_field
a == Series, b == numeric
""" """
if isinstance(right, Series): if isinstance(right, Series):
# Check compatibility # Check compatibility
self._query_compiler.check_arithmetics(right._query_compiler) self._query_compiler.check_arithmetics(right._query_compiler)
new_field_name = "{0}_{1}_{2}".format(self.name, "truediv", right.name) new_field_name = "{0}_{1}_{2}".format(self.name, method_name, right.name)
# Compatible, so create new Series # Compatible, so create new Series
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields( series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
new_field_name, 'truediv', self.name, right.name)) new_field_name, method_name, self.name, right.name))
series.name = None series.name = None
return series return series
elif isinstance(right, (int, float)): # TODO extend to numpy types elif isinstance(right, (int, float)): # TODO extend to numpy types
new_field_name = "{0}_{1}_{2}".format(self.name, "truediv", str(right).replace('.','_')) new_field_name = "{0}_{1}_{2}".format(self.name, method_name, str(right).replace('.', '_'))
# Compatible, so create new Series # Compatible, so create new Series
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields( series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
new_field_name, 'truediv', self.name, float(right))) # force rhs to float new_field_name, method_name, self.name, float(right))) # force rhs to float
# name of Series remains original name # name of Series remains original name
series.name = self.name series.name = self.name
@ -374,5 +758,123 @@ class Series(NDFrame):
else: else:
raise TypeError( raise TypeError(
"Can only perform arithmetic operation on selected types " "Can only perform arithmetic operation on selected types "
"{0} != {1}".format(type(self), type(right)) "{0} != {1} for {2}".format(type(self), type(right), method_name)
) )
def max(self):
    """
    Return the maximum of the Series values

    TODO - implement remainder of pandas arguments, currently non-numerics are not supported

    Returns
    -------
    float
        max value

    See Also
    --------
    :pandas_api_docs:`pandas.Series.max`

    Examples
    --------
    >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
    >>> int(s.max())
    1199
    """
    # Squeeze the parent implementation's result before returning it.
    return super().max().squeeze()
def mean(self):
    """
    Return the mean of the Series values

    TODO - implement remainder of pandas arguments, currently non-numerics are not supported

    Returns
    -------
    float
        mean value

    See Also
    --------
    :pandas_api_docs:`pandas.Series.mean`

    Examples
    --------
    >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
    >>> int(s.mean())
    628
    """
    # Squeeze the parent implementation's result before returning it.
    return super().mean().squeeze()
def min(self):
    """
    Return the minimum of the Series values

    TODO - implement remainder of pandas arguments, currently non-numerics are not supported

    Returns
    -------
    float
        min value

    See Also
    --------
    :pandas_api_docs:`pandas.Series.min`

    Examples
    --------
    >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
    >>> int(s.min())
    100
    """
    # Squeeze the parent implementation's result before returning it.
    return super().min().squeeze()
def sum(self):
    """
    Return the sum of the Series values

    TODO - implement remainder of pandas arguments, currently non-numerics are not supported

    Returns
    -------
    float
        sum of the values

    See Also
    --------
    :pandas_api_docs:`pandas.Series.sum`

    Examples
    --------
    >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
    >>> int(s.sum())
    8204364
    """
    # Squeeze the parent implementation's result before returning it.
    return super().sum().squeeze()
def nunique(self):
    """
    Return the number of unique values in the Series

    Returns
    -------
    int
        number of unique values

    See Also
    --------
    :pandas_api_docs:`pandas.Series.nunique`

    Examples
    --------
    >>> s = ed.Series('localhost', 'flights', name='Carrier')
    >>> s.nunique()
    4
    """
    # Squeeze the parent implementation's result before returning it.
    return super().nunique().squeeze()

View File

@ -279,10 +279,10 @@ ECOMMERCE_MAPPING = {"mappings": {
"type": "keyword" "type": "keyword"
}, },
"taxful_total_price": { "taxful_total_price": {
"type": "half_float" "type": "float"
}, },
"taxless_total_price": { "taxless_total_price": {
"type": "half_float" "type": "float"
}, },
"total_quantity": { "total_quantity": {
"type": "integer" "type": "integer"

View File

@ -4,6 +4,8 @@ from eland.tests.common import TestData, assert_pandas_eland_series_equal
from pandas.util.testing import assert_series_equal from pandas.util.testing import assert_series_equal
import pytest import pytest
import numpy as np
class TestSeriesArithmetics(TestData): class TestSeriesArithmetics(TestData):
@ -15,29 +17,35 @@ class TestSeriesArithmetics(TestData):
with pytest.raises(TypeError): with pytest.raises(TypeError):
ed_df['total_quantity'] / pd_df['taxful_total_price'] ed_df['total_quantity'] / pd_df['taxful_total_price']
def test_ecommerce_series_div(self): def test_ecommerce_series_basic_arithmetics(self):
pd_df = self.pd_ecommerce() pd_df = self.pd_ecommerce().head(100)
ed_df = self.ed_ecommerce() ed_df = self.ed_ecommerce().head(100)
pd_avg_price = pd_df['total_quantity'] / pd_df['taxful_total_price'] ops = ['__add__',
ed_avg_price = ed_df['total_quantity'] / ed_df['taxful_total_price'] '__truediv__',
'__floordiv__',
'__pow__',
'__mod__',
'__mul__',
'__sub__',
'add',
'truediv',
'floordiv',
'pow',
'mod',
'mul',
'sub']
assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True) for op in ops:
pd_series = getattr(pd_df['taxful_total_price'], op)(pd_df['total_quantity'])
ed_series = getattr(ed_df['taxful_total_price'], op)(ed_df['total_quantity'])
assert_pandas_eland_series_equal(pd_series, ed_series, check_less_precise=True)
def test_ecommerce_series_div_float(self): pd_series = getattr(pd_df['taxful_total_price'], op)(10.56)
pd_df = self.pd_ecommerce() ed_series = getattr(ed_df['taxful_total_price'], op)(10.56)
ed_df = self.ed_ecommerce() assert_pandas_eland_series_equal(pd_series, ed_series, check_less_precise=True)
pd_avg_price = pd_df['total_quantity'] / 10.0 pd_series = getattr(pd_df['taxful_total_price'], op)(int(8))
ed_avg_price = ed_df['total_quantity'] / 10.0 ed_series = getattr(ed_df['taxful_total_price'], op)(int(8))
assert_pandas_eland_series_equal(pd_series, ed_series, check_less_precise=True)
assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True)
def test_ecommerce_series_div_int(self):
pd_df = self.pd_ecommerce()
ed_df = self.ed_ecommerce()
pd_avg_price = pd_df['total_quantity'] / int(10)
ed_avg_price = ed_df['total_quantity'] / int(10)
assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True)

View File

@ -0,0 +1,17 @@
# File called _pytest for PyCharm compatibility
from pandas.util.testing import assert_almost_equal
from eland.tests.common import TestData
import eland as ed
class TestSeriesInfoEs(TestData):
    """Smoke test for calling ``info_es()`` on a Series taken from the flights data."""

    def test_flights_info_es(self):
        """Call ``info_es()`` on the 'AvgTicketPrice' Series; passes if nothing is raised."""
        ed_flights = self.ed_flights()['AvgTicketPrice']

        # No assertion, just test it can be called
        ed_flights.info_es()

View File

@ -0,0 +1,44 @@
# File called _pytest for PyCharm compatibility
from pandas.util.testing import assert_almost_equal
from eland.tests.common import TestData
import eland as ed
class TestSeriesMetrics(TestData):
    """Compare Series metric aggregations from eland against the pandas results."""

    # Names of the metric methods exercised by every test in this class
    funcs = ['max', 'min', 'mean', 'sum']

    def test_flights_metrics(self):
        """Each metric on flights 'AvgTicketPrice' should approximately match pandas."""
        pd_prices = self.pd_flights()['AvgTicketPrice']
        ed_prices = self.ed_flights()['AvgTicketPrice']

        for name in self.funcs:
            expected = getattr(pd_prices, name)()
            actual = getattr(ed_prices, name)()
            assert_almost_equal(expected, actual, check_less_precise=True)

    def test_ecommerce_selected_non_numeric_source_fields(self):
        """Each metric on the non-numeric 'category' field should report ``empty``."""
        # None of these are numeric
        ed_category = self.ed_ecommerce()['category']

        for name in self.funcs:
            result = getattr(ed_category, name)()
            assert result.empty

    def test_ecommerce_selected_all_numeric_source_fields(self):
        """Each metric on the numeric ecommerce fields should approximately match pandas."""
        # All of these are numeric
        numeric_columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price']

        for column in numeric_columns:
            pd_column = self.pd_ecommerce()[column]
            ed_column = self.ed_ecommerce()[column]

            for name in self.funcs:
                expected = getattr(pd_column, name)()
                actual = getattr(ed_column, name)()
                assert_almost_equal(expected, actual, check_less_precise=True)

View File

@ -1,13 +1,14 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import eland as ed import eland as ed
import pandas as pd
from eland.tests import ELASTICSEARCH_HOST from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME from eland.tests import FLIGHTS_INDEX_NAME, ECOMMERCE_INDEX_NAME
from eland.tests.common import TestData from eland.tests.common import TestData
class TestSeriesRepr(TestData): class TestSeriesRepr(TestData):
def test_repr(self): def test_repr_flights_carrier(self):
pd_s = self.pd_flights()['Carrier'] pd_s = self.pd_flights()['Carrier']
ed_s = ed.Series(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME, 'Carrier') ed_s = ed.Series(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME, 'Carrier')
@ -15,3 +16,12 @@ class TestSeriesRepr(TestData):
ed_repr = repr(ed_s) ed_repr = repr(ed_s)
assert pd_repr == ed_repr assert pd_repr == ed_repr
def test_repr_flights_carrier_5(self):
pd_s = self.pd_flights()['Carrier'].head(5)
ed_s = ed.Series(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME, 'Carrier').head(5)
pd_repr = repr(pd_s)
ed_repr = repr(ed_s)
assert pd_repr == ed_repr