Added Series metric aggs + Series docs

Also, improved Series.to_string()
This commit is contained in:
Stephen Dodson 2019-11-22 15:44:55 +00:00
parent 5d119215f8
commit 84e23ab5d1
38 changed files with 973 additions and 116 deletions

View File

@ -0,0 +1,6 @@
eland.Series.add
================
.. currentmodule:: eland
.. automethod:: Series.add

View File

@ -0,0 +1,6 @@
eland.Series.describe
=====================
.. currentmodule:: eland
.. automethod:: Series.describe

View File

@ -0,0 +1,6 @@
eland.Series.div
================
.. currentmodule:: eland
.. automethod:: Series.div

View File

@ -0,0 +1,6 @@
eland.Series.empty
==================
.. currentmodule:: eland
.. autoattribute:: Series.empty

View File

@ -0,0 +1,6 @@
eland.Series.floordiv
=====================
.. currentmodule:: eland
.. automethod:: Series.floordiv

View File

@ -0,0 +1,6 @@
eland.Series.head
=================
.. currentmodule:: eland
.. automethod:: Series.head

View File

@ -0,0 +1,6 @@
eland.Series.index
==================
.. currentmodule:: eland
.. autoattribute:: Series.index

View File

@ -0,0 +1,6 @@
eland.Series.max
================
.. currentmodule:: eland
.. automethod:: Series.max

View File

@ -0,0 +1,6 @@
eland.Series.mean
=================
.. currentmodule:: eland
.. automethod:: Series.mean

View File

@ -0,0 +1,6 @@
eland.Series.min
================
.. currentmodule:: eland
.. automethod:: Series.min

View File

@ -0,0 +1,6 @@
eland.Series.mod
================
.. currentmodule:: eland
.. automethod:: Series.mod

View File

@ -0,0 +1,6 @@
eland.Series.mul
================
.. currentmodule:: eland
.. automethod:: Series.mul

View File

@ -0,0 +1,6 @@
eland.Series.name
=================
.. currentmodule:: eland
.. autoattribute:: Series.name

View File

@ -0,0 +1,6 @@
eland.Series.nunique
====================
.. currentmodule:: eland
.. automethod:: Series.nunique

View File

@ -0,0 +1,6 @@
eland.Series.pow
================
.. currentmodule:: eland
.. automethod:: Series.pow

View File

@ -0,0 +1,6 @@
eland.Series.rename
===================
.. currentmodule:: eland
.. automethod:: Series.rename

View File

@ -0,0 +1,6 @@
eland.Series
============
.. currentmodule:: eland
.. autoclass:: Series

View File

@ -0,0 +1,6 @@
eland.Series.shape
==================
.. currentmodule:: eland
.. autoattribute:: Series.shape

View File

@ -0,0 +1,6 @@
eland.Series.sub
================
.. currentmodule:: eland
.. automethod:: Series.sub

View File

@ -0,0 +1,6 @@
eland.Series.sum
================
.. currentmodule:: eland
.. automethod:: Series.sum

View File

@ -0,0 +1,6 @@
eland.Series.tail
=================
.. currentmodule:: eland
.. automethod:: Series.tail

View File

@ -0,0 +1,6 @@
eland.Series.to_string
======================
.. currentmodule:: eland
.. automethod:: Series.to_string

View File

@ -0,0 +1,6 @@
eland.Series.truediv
====================
.. currentmodule:: eland
.. automethod:: Series.truediv

View File

@ -1,5 +1,5 @@
eland.Series.value_counts
===========================
=========================
.. currentmodule:: eland

View File

@ -91,5 +91,3 @@ Elasticsearch utilities
:toctree: api/
DataFrame.info_es

View File

@ -5,9 +5,77 @@ Series
=========
.. currentmodule:: eland
Constructor
~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series
Attributes and underlying data
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**Axes**
.. autosummary::
:toctree: api/
Series.index
Series.shape
Series.name
Series.empty
Indexing, iteration
~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series.head
Series.tail
Binary operator functions
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series.add
Series.sub
Series.mul
Series.div
Series.truediv
Series.floordiv
Series.mod
Series.pow
Computations / descriptive stats
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series.value_counts
Series.describe
Series.max
Series.mean
Series.min
Series.sum
Series.nunique
Series.value_counts
Reindexing / selection / label manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series.rename
Serialization / IO / conversion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series.to_string
Elasticsearch utilities
~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
Series.info_es

View File

@ -1,5 +1,6 @@
from __future__ import absolute_import
from eland.common import *
from eland.client import *
from eland.filter import *
from eland.index import *

8
eland/common.py Normal file
View File

@ -0,0 +1,8 @@
# Default number of rows displayed (different to pandas where ALL could be displayed)
DEFAULT_NUM_ROWS_DISPLAYED = 60
def docstring_parameter(*sub):
    """
    Build a decorator that fills ``str.format`` placeholders in a docstring.

    Parameters
    ----------
    *sub
        Positional values substituted into the decorated object's ``__doc__``
        via ``str.format`` (``{0}``, ``{1}``, ...).

    Returns
    -------
    callable
        Decorator that rewrites ``obj.__doc__`` in place and returns ``obj``.
    """
    def dec(obj):
        # __doc__ is None when the object has no docstring (for example when
        # running under ``python -OO``); there is nothing to format then.
        if obj.__doc__ is not None:
            obj.__doc__ = obj.__doc__.format(*sub)
        return obj

    return dec

View File

@ -18,15 +18,7 @@ import eland.plotting as gfx
from eland import NDFrame
from eland import Series
from eland.filter import BooleanFilter, ScriptFilter
# Default number of rows displayed (different to pandas where ALL could be displayed)
DEFAULT_NUM_ROWS_DISPLAYED = 60
def docstring_parameter(*sub):
def dec(obj):
obj.__doc__ = obj.__doc__.format(*sub)
return obj
return dec
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
class DataFrame(NDFrame):
@ -43,7 +35,7 @@ class DataFrame(NDFrame):
- elasticsearch-py instance or
- eland.Client instance
index_pattern: str
Elasticsearch index pattern (e.g. 'flights' or 'filebeat-*')
Elasticsearch index pattern (e.g. 'flights' or 'filebeat-\*')
columns: list of str, optional
List of DataFrame columns. A subset of the Elasticsearch index's fields.
index_field: str, optional
@ -98,7 +90,6 @@ class DataFrame(NDFrame):
<BLANKLINE>
[5 rows x 2 columns]
"""
def __init__(self,
client=None,
index_pattern=None,
@ -586,7 +577,7 @@ class DataFrame(NDFrame):
max_rows = 1
# Create a slightly bigger dataframe than display
df = self._build_repr_df(max_rows + 1, max_cols)
df = self._build_repr(max_rows + 1)
if buf is not None:
_buf = _expand_user(_stringify_path(buf))
@ -651,7 +642,7 @@ class DataFrame(NDFrame):
max_rows = 1
# Create a slightly bigger dataframe than display
df = self._build_repr_df(max_rows + 1, max_cols)
df = self._build_repr(max_rows + 1)
if buf is not None:
_buf = _expand_user(_stringify_path(buf))
@ -1064,3 +1055,48 @@ class DataFrame(NDFrame):
return self._getitem(key)
else:
return default
@property
def values(self):
    """
    Not implemented.

    In pandas this returns a Numpy representation of the DataFrame. This would involve scan/scrolling the
    entire index.

    If this is required, call ``ed.eland_to_pandas(ed_df).values``, *but beware this will scan/scroll the entire
    Elasticsearch index(s) into memory*

    Raises
    ------
    NotImplementedError
        Always raised when the property is accessed.

    See Also
    --------
    :pandas_api_docs:`pandas.DataFrame.values`

    Examples
    --------
    >>> ed_df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'Carrier']).head(5)
    >>> pd_df = ed.eland_to_pandas(ed_df)
    >>> print("type(ed_df)={0}\\ntype(pd_df)={1}".format(type(ed_df), type(pd_df)))
    type(ed_df)=<class 'eland.dataframe.DataFrame'>
    type(pd_df)=<class 'pandas.core.frame.DataFrame'>
    >>> ed_df
       AvgTicketPrice           Carrier
    0      841.265642   Kibana Airlines
    1      882.982662  Logstash Airways
    2      190.636904  Logstash Airways
    3      181.694216   Kibana Airlines
    4      730.041778   Kibana Airlines
    <BLANKLINE>
    [5 rows x 2 columns]
    >>> pd_df.values
    array([[841.2656419677076, 'Kibana Airlines'],
           [882.9826615595518, 'Logstash Airways'],
           [190.6369038508356, 'Logstash Airways'],
           [181.69421554118, 'Kibana Airlines'],
           [730.041778346198, 'Kibana Airlines']], dtype=object)
    """
    # The two literals are concatenated by the parser, so the first one must
    # end with a space to keep the sentences apart in the final message.
    raise NotImplementedError(
        "This method would scan/scroll the entire Elasticsearch index(s) into memory. "
        "If this is explicitly required and there is sufficient memory, call `ed.eland_to_pandas(ed_df).values`"
    )
to_numpy = values

View File

@ -31,7 +31,6 @@ from pandas.util._validators import validate_bool_kwarg
from eland import ElandQueryCompiler
class NDFrame:
def __init__(self,
@ -65,6 +64,7 @@ class NDFrame:
See Also
--------
:pandas_api_docs:`pandas.DataFrame.index`
:pandas_api_docs:`pandas.Series.index`
Examples
--------
@ -72,6 +72,10 @@ class NDFrame:
>>> assert isinstance(df.index, ed.Index)
>>> df.index.index_field
'_id'
>>> s = df['Carrier']
>>> assert isinstance(s.index, ed.Index)
>>> s.index.index_field
'_id'
"""
return self._query_compiler.index
@ -104,9 +108,8 @@ class NDFrame:
"""
return self._query_compiler.dtypes
def _build_repr_df(self, num_rows, num_cols):
# Overriden version of BasePandasDataset._build_repr_df
# to avoid issues with concat
def _build_repr(self, num_rows):
# self could be Series or DataFrame
if len(self.index) <= num_rows:
return self._to_pandas()

View File

@ -588,6 +588,7 @@ class Operations:
df = self._apply_df_post_processing(df, post_processing)
collector.collect(df)
def iloc(self, index, field_names):
# index and field_names are indexers
task = ('iloc', (index, field_names))
@ -881,9 +882,10 @@ class Operations:
left_field = item[1][1][1][0]
right_field = item[1][1][1][1]
# https://www.elastic.co/guide/en/elasticsearch/painless/current/painless-api-reference-shared-java-lang.html#painless-api-reference-shared-Math
if isinstance(right_field, str):
"""
(if op_name = 'truediv')
(if op_name = '__truediv__')
"script_fields": {
"field_name": {
@ -893,12 +895,23 @@ class Operations:
}
}
"""
if op_name == 'truediv':
op = '/'
if op_name == '__add__':
source = "doc['{0}'].value + doc['{1}'].value".format(left_field, right_field)
elif op_name == '__truediv__':
source = "doc['{0}'].value / doc['{1}'].value".format(left_field, right_field)
elif op_name == '__floordiv__':
source = "Math.floor(doc['{0}'].value / doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__pow__':
source = "Math.pow(doc['{0}'].value, doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__mod__':
source = "doc['{0}'].value % doc['{1}'].value".format(left_field, right_field)
elif op_name == '__mul__':
source = "doc['{0}'].value * doc['{1}'].value".format(left_field, right_field)
elif op_name == '__sub__':
source = "doc['{0}'].value - doc['{1}'].value".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
source = "doc['{0}'].value {1} doc['{2}'].value".format(left_field, op, right_field)
if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {}
@ -909,7 +922,7 @@ class Operations:
}
else:
"""
(if op_name = 'truediv')
(if op_name = '__truediv__')
"script_fields": {
"field_name": {
@ -919,12 +932,23 @@ class Operations:
}
}
"""
if op_name == 'truediv':
op = '/'
if op_name == '__add__':
source = "doc['{0}'].value + {1}".format(left_field, right_field)
elif op_name == '__truediv__':
source = "doc['{0}'].value / {1}".format(left_field, right_field)
elif op_name == '__floordiv__':
source = "Math.floor(doc['{0}'].value / {1})".format(left_field, right_field)
elif op_name == '__pow__':
source = "Math.pow(doc['{0}'].value, {1})".format(left_field, right_field)
elif op_name == '__mod__':
source = "doc['{0}'].value % {1}".format(left_field, right_field)
elif op_name == '__mul__':
source = "doc['{0}'].value * {1}".format(left_field, right_field)
elif op_name == '__sub__':
source = "doc['{0}'].value - {1}".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
source = "doc['{0}'].value {1} {2}".format(left_field, op, right_field)
if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {}

View File

@ -239,9 +239,9 @@ class ElandQueryCompiler:
# Create pandas DataFrame
df = pd.DataFrame(data=rows, index=index)
# _source may not contain all columns in the mapping
# therefore, fill in missing columns
# (note this returns self.columns NOT IN df.columns)
# _source may not contain all field_names in the mapping
# therefore, fill in missing field_names
# (note this returns self.field_names NOT IN df.columns)
missing_field_names = list(set(self.field_names) - set(df.columns))
for missing in missing_field_names:

View File

@ -11,19 +11,26 @@ without storing the dataset in local memory.
Implementation Details
----------------------
Based on NDFrame which underpins eland.1DataFrame
Based on NDFrame which underpins eland.DataFrame
"""
import sys
import warnings
from io import StringIO
import pandas as pd
import numpy as np
from pandas.io.common import _expand_user, _stringify_path
from eland import NDFrame
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED
from eland.filter import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter, IsIn
def _get_method_name():
return sys._getframe(1).f_code.co_name
class Series(NDFrame):
"""
pandas.Series like API that proxies into Elasticsearch index(es).
@ -34,35 +41,35 @@ class Series(NDFrame):
A reference to a Elasticsearch python client
index_pattern : str
An Elasticsearch index pattern. This can contain wildcards (e.g. filebeat-*).
An Elasticsearch index pattern. This can contain wildcards (e.g. filebeat-\*).
index_field : str
The field to base the series on
See Also
--------
Examples
--------
import eland as ed
client = ed.Client(Elasticsearch())
s = ed.DataFrame(client, 'reviews', 'date')
df.head()
reviewerId vendorId rating date
0 0 0 5 2006-04-07 17:08
1 1 1 5 2006-05-04 12:16
2 2 2 4 2006-04-21 12:26
3 3 3 5 2006-04-18 15:48
4 3 4 5 2006-04-18 15:49
Notice that the types are based on Elasticsearch mappings
Notes
-----
If the Elasticsearch index is deleted or index mappings are changed after this
object is created, the object is not rebuilt and so inconsistencies can occur.
See Also
--------
:pandas_api_docs:`pandas.Series`
Examples
--------
>>> ed.Series(client='localhost', index_pattern='flights', name='Carrier')
0 Kibana Airlines
1 Logstash Airways
2 Logstash Airways
3 Kibana Airlines
4 Kibana Airlines
...
13054 Logstash Airways
13055 Logstash Airways
13056 Logstash Airways
13057 JetBeats
13058 JetBeats
Name: Carrier, Length: 13059, dtype: object
"""
def __init__(self,
@ -94,6 +101,34 @@ class Series(NDFrame):
"""
return len(self.index) == 0
@property
def shape(self):
    """
    Return a tuple representing the dimensionality of the Series.

    Returns
    -------
    shape: tuple
        0. number of rows
        1. number of columns

    Notes
    -----
    - number of rows ``len(series)`` queries Elasticsearch
    - number of columns == 1

    Examples
    --------
    >>> s = ed.Series('localhost', 'ecommerce', name='total_quantity')
    >>> s.shape
    (4675, 1)
    """
    # A Series always has exactly one column; only the row count varies.
    return len(self), 1
def _get_name(self):
return self._query_compiler.columns[0]
@ -118,7 +153,7 @@ class Series(NDFrame):
See Also
--------
:pandas_api_docs:pandas.Series.rename
:pandas_api_docs:`pandas.Series.rename`
Examples
--------
@ -200,12 +235,39 @@ class Series(NDFrame):
return self._query_compiler.value_counts(es_size)
# dtype not implemented for Series as causes query to fail
# in pandas.core.computation.ops.Term.type
# ----------------------------------------------------------------------
# Rendering Methods
def __repr__(self):
num_rows = pd.get_option("max_rows") or 60
"""
Return a string representation for a particular Series.
"""
buf = StringIO()
return self.to_string(max_rows=num_rows)
# max_rows and max_cols determine the maximum size of the pretty printed tabular
# representation of the series. pandas defaults are 60 and 20 respectively.
# series where len(series) > max_rows shows a truncated view with 10 rows shown.
max_rows = pd.get_option("display.max_rows")
min_rows = pd.get_option("display.min_rows")
if len(self) > max_rows:
max_rows = min_rows
show_dimensions = pd.get_option("display.show_dimensions")
self.to_string(
buf=buf,
name=self.name,
dtype=True,
min_rows=min_rows,
max_rows=max_rows,
length=show_dimensions,
)
result = buf.getvalue()
return result
def to_string(
self,
@ -217,33 +279,69 @@ class Series(NDFrame):
length=False,
dtype=False,
name=False,
max_rows=None):
if max_rows is None:
max_rows=None,
min_rows=None,
):
# In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this
# by limiting rows by default.
num_rows = len(self) # avoid multiple calls
if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED:
if max_rows is None:
max_rows = num_rows
else:
max_rows = min(num_rows, max_rows)
elif max_rows is None:
warnings.warn("Series.to_string called without max_rows set "
"- this will return entire index results. "
"Setting max_rows=60, overwrite if different behaviour is required.")
max_rows = 60
"Setting max_rows={default}"
" overwrite if different behaviour is required."
.format(default=DEFAULT_NUM_ROWS_DISPLAYED),
UserWarning)
max_rows = DEFAULT_NUM_ROWS_DISPLAYED
# because of the way pandas handles max_rows=0, not having this throws an error
# see eland issue #56
if max_rows == 0:
max_rows = 1
# Create a slightly bigger dataframe than display
temp_df = self._build_repr_df(max_rows + 1, None)
if isinstance(temp_df, pd.DataFrame):
temp_df = temp_df[self.name]
temp_str = repr(temp_df)
if self.name is not None:
name_str = "Name: {}, ".format(str(self.name))
temp_series = self._build_repr(max_rows + 1)
if buf is not None:
_buf = _expand_user(_stringify_path(buf))
else:
name_str = ""
if len(self.index) > max_rows:
len_str = "Length: {}, ".format(len(self.index))
else:
len_str = ""
dtype_str = "dtype: {}".format(temp_str.rsplit("dtype: ", 1)[-1])
if len(self) == 0:
return "Series([], {}{}".format(name_str, dtype_str)
return temp_str.rsplit("\nName:", 1)[0] + "\n{}{}{}".format(
name_str, len_str, dtype_str
)
_buf = StringIO()
# Create repr of fake series without name, length, dtype summary
temp_str = temp_series.to_string(buf=_buf,
na_rep=na_rep,
float_format=float_format,
header=header,
index=index,
length=False,
dtype=False,
name=False,
max_rows=max_rows)
# Create the summary
footer = ""
if name and self.name is not None:
footer += "Name: {}".format(str(self.name))
if length and len(self) > max_rows:
if footer:
footer += ", "
footer += "Length: {}".format(len(self.index))
if dtype:
if footer:
footer += ", "
footer += "dtype: {}".format(temp_series.dtype)
if len(footer) > 0:
_buf.write("\n{}".format(footer))
if buf is None:
result = _buf.getvalue()
return result
def _to_pandas(self):
return self._query_compiler.to_pandas()[self.name]
@ -321,13 +419,16 @@ class Series(NDFrame):
@property
def ndim(self):
"""
Returns 1 by definition of a Series1
Returns 1 by definition of a Series
Returns
-------
int
By definition 1
See Also
--------
:pandas_api_docs:`pandas.Series.ndim`
"""
return 1
@ -338,34 +439,317 @@ class Series(NDFrame):
return buf.getvalue()
def __truediv__(self, right):
return self.truediv(right)
def truediv(self, right):
def __add__(self, right):
    """
    Return addition of series and right, element-wise (binary operator add).

    Parameters
    ----------
    right: eland.Series

    Returns
    -------
    eland.Series

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price + df.total_quantity
    0     38.980000
    1     55.980000
    2    201.979996
    3    176.979996
    4     82.980003
    dtype: float64
    """
    # Must be resolved in this frame: the helper reports its direct caller's name.
    op_name = _get_method_name()
    return self._numeric_op(right, op_name)
def __truediv__(self, right):
    """
    Return floating division of series and right, element-wise (binary operator truediv).

    Parameters
    ----------
    right: eland.Series

    Returns
    -------
    eland.Series

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price / df.total_quantity
    0    18.490000
    1    26.990000
    2    99.989998
    3    87.489998
    4    40.490002
    dtype: float64
    """
    # Must be resolved in this frame: the helper reports its direct caller's name.
    op_name = _get_method_name()
    return self._numeric_op(right, op_name)
def __floordiv__(self, right):
    """
    Return integer division of series and right, element-wise (binary operator floordiv //).

    Parameters
    ----------
    right: eland.Series

    Returns
    -------
    eland.Series

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price // df.total_quantity
    0    18.0
    1    26.0
    2    99.0
    3    87.0
    4    40.0
    dtype: float64
    """
    # Must be resolved in this frame: the helper reports its direct caller's name.
    op_name = _get_method_name()
    return self._numeric_op(right, op_name)
def __mod__(self, right):
    """
    Return modulo of series and right, element-wise (binary operator mod %).

    Parameters
    ----------
    right: eland.Series

    Returns
    -------
    eland.Series

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price % df.total_quantity
    0    0.980000
    1    1.980000
    2    1.979996
    3    0.979996
    4    0.980003
    dtype: float64
    """
    # Must be resolved in this frame: the helper reports its direct caller's name.
    op_name = _get_method_name()
    return self._numeric_op(right, op_name)
def __mul__(self, right):
    """
    Return multiplication of series and right, element-wise (binary operator mul).

    Parameters
    ----------
    right: eland.Series

    Returns
    -------
    eland.Series

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price * df.total_quantity
    0     73.959999
    1    107.959999
    2    399.959991
    3    349.959991
    4    161.960007
    dtype: float64
    """
    # Must be resolved in this frame: the helper reports its direct caller's name.
    op_name = _get_method_name()
    return self._numeric_op(right, op_name)
def __sub__(self, right):
    """
    Return subtraction of series and right, element-wise (binary operator sub).

    Parameters
    ----------
    right: eland.Series

    Returns
    -------
    eland.Series

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price - df.total_quantity
    0     34.980000
    1     51.980000
    2    197.979996
    3    172.979996
    4     78.980003
    dtype: float64
    """
    # Must be resolved in this frame: the helper reports its direct caller's name.
    op_name = _get_method_name()
    return self._numeric_op(right, op_name)
def __pow__(self, right):
    """
    Return exponential power of series and right, element-wise (binary operator pow, ``**``).

    Parameters
    ----------
    right: eland.Series

    Returns
    -------
    eland.Series

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'ecommerce').head(5)
    >>> df.taxful_total_price
    0     36.98
    1     53.98
    2    199.98
    3    174.98
    4     80.98
    Name: taxful_total_price, dtype: float64
    >>> df.total_quantity
    0    2
    1    2
    2    2
    3    2
    4    2
    Name: total_quantity, dtype: int64
    >>> df.taxful_total_price ** df.total_quantity
    0     1367.520366
    1     2913.840351
    2    39991.998691
    3    30617.998905
    4     6557.760944
    dtype: float64
    """
    # The helper reports its direct caller's name, so it is called from this frame.
    return self._numeric_op(right, _get_method_name())
add = __add__
div = __truediv__
divide = __truediv__
floordiv = __floordiv__
mod = __mod__
mul = __mul__
multiply = __mul__
pow = __pow__
sub = __sub__
subtract = __sub__
truediv = __truediv__
def _numeric_op(self, right, method_name):
"""
return a op b
a & b == Series
a & b must share same eland.Client, index_pattern and index_field
a == Series, b == numeric
"""
if isinstance(right, Series):
# Check compatibility
self._query_compiler.check_arithmetics(right._query_compiler)
new_field_name = "{0}_{1}_{2}".format(self.name, "truediv", right.name)
new_field_name = "{0}_{1}_{2}".format(self.name, method_name, right.name)
# Compatible, so create new Series
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
new_field_name, 'truediv', self.name, right.name))
new_field_name, method_name, self.name, right.name))
series.name = None
return series
elif isinstance(right, (int, float)): # TODO extend to numpy types
new_field_name = "{0}_{1}_{2}".format(self.name, "truediv", str(right).replace('.','_'))
elif isinstance(right, (int, float)): # TODO extend to numpy types
new_field_name = "{0}_{1}_{2}".format(self.name, method_name, str(right).replace('.', '_'))
# Compatible, so create new Series
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
new_field_name, 'truediv', self.name, float(right))) # force rhs to float
new_field_name, method_name, self.name, float(right))) # force rhs to float
# name of Series remains original name
series.name = self.name
@ -374,5 +758,123 @@ class Series(NDFrame):
else:
raise TypeError(
"Can only perform arithmetic operation on selected types "
"{0} != {1}".format(type(self), type(right))
"{0} != {1} for {2}".format(type(self), type(right), method_name)
)
def max(self):
    """
    Return the maximum of the Series values

    TODO - implement remainder of pandas arguments, currently non-numerics are not supported

    Returns
    -------
    float
        max value

    See Also
    --------
    :pandas_api_docs:`pandas.Series.max`

    Examples
    --------
    >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
    >>> int(s.max())
    1199
    """
    # Delegate to the base class, then squeeze its result
    # (presumably a one-element pandas object - confirm against NDFrame.max).
    return super().max().squeeze()
def mean(self):
    """
    Return the mean of the Series values

    TODO - implement remainder of pandas arguments, currently non-numerics are not supported

    Returns
    -------
    float
        mean value

    See Also
    --------
    :pandas_api_docs:`pandas.Series.mean`

    Examples
    --------
    >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
    >>> int(s.mean())
    628
    """
    # Delegate to the base class, then squeeze its result.
    results = super().mean()
    return results.squeeze()
def min(self):
    """
    Return the minimum of the Series values

    TODO - implement remainder of pandas arguments, currently non-numerics are not supported

    Returns
    -------
    float
        min value

    See Also
    --------
    :pandas_api_docs:`pandas.Series.min`

    Examples
    --------
    >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
    >>> int(s.min())
    100
    """
    # Delegate to the base class, then squeeze its result.
    results = super().min()
    return results.squeeze()
def sum(self):
    """
    Return the sum of the Series values

    TODO - implement remainder of pandas arguments, currently non-numerics are not supported

    Returns
    -------
    float
        sum of the values

    See Also
    --------
    :pandas_api_docs:`pandas.Series.sum`

    Examples
    --------
    >>> s = ed.Series('localhost', 'flights', name='AvgTicketPrice')
    >>> int(s.sum())
    8204364
    """
    # Delegate to the base class, then squeeze its result.
    results = super().sum()
    return results.squeeze()
def nunique(self):
    """
    Return the number of unique values in the Series

    Returns
    -------
    int
        number of unique values

    See Also
    --------
    :pandas_api_docs:`pandas.Series.nunique`

    Examples
    --------
    >>> s = ed.Series('localhost', 'flights', name='Carrier')
    >>> s.nunique()
    4
    """
    # Delegate to the base class, then squeeze its result.
    results = super().nunique()
    return results.squeeze()

View File

@ -279,10 +279,10 @@ ECOMMERCE_MAPPING = {"mappings": {
"type": "keyword"
},
"taxful_total_price": {
"type": "half_float"
"type": "float"
},
"taxless_total_price": {
"type": "half_float"
"type": "float"
},
"total_quantity": {
"type": "integer"

View File

@ -4,6 +4,8 @@ from eland.tests.common import TestData, assert_pandas_eland_series_equal
from pandas.util.testing import assert_series_equal
import pytest
import numpy as np
class TestSeriesArithmetics(TestData):
@ -15,29 +17,35 @@ class TestSeriesArithmetics(TestData):
with pytest.raises(TypeError):
ed_df['total_quantity'] / pd_df['taxful_total_price']
def test_ecommerce_series_div(self):
pd_df = self.pd_ecommerce()
ed_df = self.ed_ecommerce()
def test_ecommerce_series_basic_arithmetics(self):
pd_df = self.pd_ecommerce().head(100)
ed_df = self.ed_ecommerce().head(100)
pd_avg_price = pd_df['total_quantity'] / pd_df['taxful_total_price']
ed_avg_price = ed_df['total_quantity'] / ed_df['taxful_total_price']
ops = ['__add__',
'__truediv__',
'__floordiv__',
'__pow__',
'__mod__',
'__mul__',
'__sub__',
'add',
'truediv',
'floordiv',
'pow',
'mod',
'mul',
'sub']
assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True)
for op in ops:
pd_series = getattr(pd_df['taxful_total_price'], op)(pd_df['total_quantity'])
ed_series = getattr(ed_df['taxful_total_price'], op)(ed_df['total_quantity'])
assert_pandas_eland_series_equal(pd_series, ed_series, check_less_precise=True)
def test_ecommerce_series_div_float(self):
pd_df = self.pd_ecommerce()
ed_df = self.ed_ecommerce()
pd_series = getattr(pd_df['taxful_total_price'], op)(10.56)
ed_series = getattr(ed_df['taxful_total_price'], op)(10.56)
assert_pandas_eland_series_equal(pd_series, ed_series, check_less_precise=True)
pd_avg_price = pd_df['total_quantity'] / 10.0
ed_avg_price = ed_df['total_quantity'] / 10.0
pd_series = getattr(pd_df['taxful_total_price'], op)(int(8))
ed_series = getattr(ed_df['taxful_total_price'], op)(int(8))
assert_pandas_eland_series_equal(pd_series, ed_series, check_less_precise=True)
assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True)
def test_ecommerce_series_div_int(self):
pd_df = self.pd_ecommerce()
ed_df = self.ed_ecommerce()
pd_avg_price = pd_df['total_quantity'] / int(10)
ed_avg_price = ed_df['total_quantity'] / int(10)
assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True)

View File

@ -0,0 +1,17 @@
# File called _pytest for PyCharm compatibility
from pandas.util.testing import assert_almost_equal
from eland.tests.common import TestData
import eland as ed
class TestSeriesInfoEs(TestData):
    """Smoke test for ``Series.info_es``."""

    def test_flights_info_es(self):
        avg_ticket_price = self.ed_flights()['AvgTicketPrice']

        # Nothing is asserted: the test only verifies the call completes.
        avg_ticket_price.info_es()

View File

@ -0,0 +1,44 @@
# File called _pytest for PyCharm compatibility
from pandas.util.testing import assert_almost_equal
from eland.tests.common import TestData
import eland as ed
class TestSeriesMetrics(TestData):
    """Compare eland Series metric aggregations against their pandas equivalents."""

    # Names of the aggregation methods exercised by every test below
    funcs = ['max', 'min', 'mean', 'sum']

    def test_flights_metrics(self):
        pd_series = self.pd_flights()['AvgTicketPrice']
        ed_series = self.ed_flights()['AvgTicketPrice']

        for name in self.funcs:
            expected = getattr(pd_series, name)()
            actual = getattr(ed_series, name)()
            assert_almost_equal(expected, actual, check_less_precise=True)

    def test_ecommerce_selected_non_numeric_source_fields(self):
        # 'category' is not a numeric field
        ed_series = self.ed_ecommerce()['category']

        for name in self.funcs:
            result = getattr(ed_series, name)()
            assert result.empty

    def test_ecommerce_selected_all_numeric_source_fields(self):
        # Every field listed here is numeric
        for field in ['total_quantity', 'taxful_total_price', 'taxless_total_price']:
            pd_series = self.pd_ecommerce()[field]
            ed_series = self.ed_ecommerce()[field]

            for name in self.funcs:
                expected = getattr(pd_series, name)()
                actual = getattr(ed_series, name)()
                assert_almost_equal(expected, actual, check_less_precise=True)

View File

@ -1,13 +1,14 @@
# File called _pytest for PyCharm compatibility
import eland as ed
import pandas as pd
from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME
from eland.tests import FLIGHTS_INDEX_NAME, ECOMMERCE_INDEX_NAME
from eland.tests.common import TestData
class TestSeriesRepr(TestData):
def test_repr(self):
def test_repr_flights_carrier(self):
pd_s = self.pd_flights()['Carrier']
ed_s = ed.Series(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME, 'Carrier')
@ -15,3 +16,12 @@ class TestSeriesRepr(TestData):
ed_repr = repr(ed_s)
assert pd_repr == ed_repr
def test_repr_flights_carrier_5(self):
pd_s = self.pd_flights()['Carrier'].head(5)
ed_s = ed.Series(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME, 'Carrier').head(5)
pd_repr = repr(pd_s)
ed_repr = repr(ed_s)
assert pd_repr == ed_repr