Adding __r* operations and resolving issues with df.info()

This commit is contained in:
Stephen Dodson 2019-11-25 15:00:02 +00:00
parent ac8cb302de
commit b99f25e4ee
4 changed files with 157 additions and 20 deletions

View File

@ -519,7 +519,12 @@ class DataFrame(NDFrame):
else: else:
_verbose_repr() _verbose_repr()
# pandas 0.25.1 uses get_dtype_counts() here. This
# returns a Series with strings as the index NOT dtypes.
# Therefore, to get consistent ordering we need to
# align types with pandas method.
counts = self.dtypes.value_counts() counts = self.dtypes.value_counts()
counts.index = counts.index.astype(str)
dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k
in sorted(counts.items())] in sorted(counts.items())]
lines.append('dtypes: {types}'.format(types=', '.join(dtypes))) lines.append('dtypes: {types}'.format(types=', '.join(dtypes)))

View File

@ -1,6 +1,7 @@
import copy import copy
from enum import Enum from enum import Enum
import numpy as np
import pandas as pd import pandas as pd
from eland import Index from eland import Index
@ -538,7 +539,6 @@ class Operations:
return collector.ret return collector.ret
def _es_results(self, query_compiler, collector): def _es_results(self, query_compiler, collector):
query_params, post_processing = self._resolve_tasks() query_params, post_processing = self._resolve_tasks()
@ -561,12 +561,24 @@ class Operations:
is_scan = False is_scan = False
if size is not None and size <= 10000: if size is not None and size <= 10000:
if size > 0: if size > 0:
try:
es_results = query_compiler._client.search( es_results = query_compiler._client.search(
index=query_compiler._index_pattern, index=query_compiler._index_pattern,
size=size, size=size,
sort=sort_params, sort=sort_params,
body=body, body=body,
_source=field_names) _source=field_names)
except:
# Catch ES error and print debug (currently to stdout)
error = {
'index': query_compiler._index_pattern,
'size': size,
'sort': sort_params,
'body': body,
'_source': field_names
}
print("Elasticsearch error:", error)
raise
else: else:
is_scan = True is_scan = True
es_results = query_compiler._client.scan( es_results = query_compiler._client.scan(
@ -589,7 +601,6 @@ class Operations:
df = self._apply_df_post_processing(df, post_processing) df = self._apply_df_post_processing(df, post_processing)
collector.collect(df) collector.collect(df)
def iloc(self, index, field_names): def iloc(self, index, field_names):
# index and field_names are indexers # index and field_names are indexers
task = ('iloc', (index, field_names)) task = ('iloc', (index, field_names))
@ -884,7 +895,7 @@ class Operations:
right_field = item[1][1][1][1] right_field = item[1][1][1][1]
# https://www.elastic.co/guide/en/elasticsearch/painless/current/painless-api-reference-shared-java-lang.html#painless-api-reference-shared-Math # https://www.elastic.co/guide/en/elasticsearch/painless/current/painless-api-reference-shared-java-lang.html#painless-api-reference-shared-Math
if isinstance(right_field, str): if isinstance(left_field, str) and isinstance(right_field, str):
""" """
(if op_name = '__truediv__') (if op_name = '__truediv__')
@ -913,7 +924,6 @@ class Operations:
else: else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
if query_params['query_script_fields'] is None: if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {} query_params['query_script_fields'] = {}
query_params['query_script_fields'][field_name] = { query_params['query_script_fields'][field_name] = {
@ -921,7 +931,7 @@ class Operations:
'source': source 'source': source
} }
} }
else: elif isinstance(left_field, str) and np.issubdtype(np.dtype(type(right_field)), np.number):
""" """
(if op_name = '__truediv__') (if op_name = '__truediv__')
@ -949,7 +959,36 @@ class Operations:
source = "doc['{0}'].value - {1}".format(left_field, right_field) source = "doc['{0}'].value - {1}".format(left_field, right_field)
else: else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
elif np.issubdtype(np.dtype(type(left_field)), np.number) and isinstance(right_field, str):
"""
(if op_name = '__truediv__')
"script_fields": {
"field_name": {
"script": {
"source": "left_field / doc['right_field'].value"
}
}
}
"""
if op_name == '__add__':
source = "{0} + doc['{1}'].value".format(left_field, right_field)
elif op_name == '__truediv__':
source = "{0} / doc['{1}'].value".format(left_field, right_field)
elif op_name == '__floordiv__':
source = "Math.floor({0} / doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__pow__':
source = "Math.pow({0}, doc['{1}'].value)".format(left_field, right_field)
elif op_name == '__mod__':
source = "{0} % doc['{1}'].value".format(left_field, right_field)
elif op_name == '__mul__':
source = "{0} * doc['{1}'].value".format(left_field, right_field)
elif op_name == '__sub__':
source = "{0} - doc['{1}'].value".format(left_field, right_field)
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
else:
raise TypeError("Types for operation inconsistent {} {} {}", type(left_field), type(right_field), op_name)
if query_params['query_script_fields'] is None: if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {} query_params['query_script_fields'] = {}
@ -961,6 +1000,7 @@ class Operations:
return query_params, post_processing return query_params, post_processing
def _resolve_post_processing_task(self, item, query_params, post_processing): def _resolve_post_processing_task(self, item, query_params, post_processing):
# Just do this in post-processing # Just do this in post-processing
if item[0] != 'field_names': if item[0] != 'field_names':
@ -968,6 +1008,7 @@ class Operations:
return query_params, post_processing return query_params, post_processing
def _size(self, query_params, post_processing): def _size(self, query_params, post_processing):
# Shrink wrap code around checking if size parameter is set # Shrink wrap code around checking if size parameter is set
size = query_params['query_size'] # can be None size = query_params['query_size'] # can be None
@ -982,6 +1023,7 @@ class Operations:
# This can return None # This can return None
return size return size
def info_es(self, buf): def info_es(self, buf):
buf.write("Operations:\n") buf.write("Operations:\n")
buf.write(" tasks: {0}\n".format(self._tasks)) buf.write(" tasks: {0}\n".format(self._tasks))
@ -1002,6 +1044,7 @@ class Operations:
buf.write(" body: {0}\n".format(body)) buf.write(" body: {0}\n".format(body))
buf.write(" post_processing: {0}\n".format(post_processing)) buf.write(" post_processing: {0}\n".format(post_processing))
def update_query(self, boolean_filter): def update_query(self, boolean_filter):
task = ('boolean_filter', boolean_filter) task = ('boolean_filter', boolean_filter)
self._tasks.append(task) self._tasks.append(task)

View File

@ -499,6 +499,7 @@ class Series(NDFrame):
""" """
return self._numeric_op(right, _get_method_name()) return self._numeric_op(right, _get_method_name())
def __truediv__(self, right): def __truediv__(self, right):
""" """
Return floating division of series and right, element-wise (binary operator truediv). Return floating division of series and right, element-wise (binary operator truediv).
@ -528,7 +529,7 @@ class Series(NDFrame):
3 2 3 2
4 2 4 2
Name: total_quantity, dtype: int64 Name: total_quantity, dtype: int64
>>> df.taxful_total_price / df.total_quantity >>> df.taxful_total_price / df.total_quantity # doctest: +SKIP
0 18.490000 0 18.490000
1 26.990000 1 26.990000
2 99.989998 2 99.989998
@ -733,6 +734,21 @@ class Series(NDFrame):
""" """
return self._numeric_op(right, _get_method_name()) return self._numeric_op(right, _get_method_name())
def __radd__(self, left):
    """
    Return addition of left and series, element-wise (reflected binary operator radd).

    Invoked for ``left + series``. Delegates to ``_numeric_rop`` with ``left``
    and the name returned by ``_get_method_name()``.
    """
    # NOTE(review): _get_method_name() is defined elsewhere; presumably it reports
    # the name of the calling method, so the call must stay in this body - confirm.
    return self._numeric_rop(left, _get_method_name())
def __rtruediv__(self, left):
    """
    Return floating division of left and series, element-wise (reflected binary operator rtruediv).

    Invoked for ``left / series``. Delegates to ``_numeric_rop`` with ``left``
    and the name returned by ``_get_method_name()``.
    """
    # NOTE(review): _get_method_name() is defined elsewhere; presumably it reports
    # the name of the calling method, so the call must stay in this body - confirm.
    return self._numeric_rop(left, _get_method_name())
def __rfloordiv__(self, left):
    """
    Return integer division of left and series, element-wise (reflected binary operator rfloordiv).

    Invoked for ``left // series``. Delegates to ``_numeric_rop`` with ``left``
    and the name returned by ``_get_method_name()``.
    """
    # NOTE(review): _get_method_name() is defined elsewhere; presumably it reports
    # the name of the calling method, so the call must stay in this body - confirm.
    return self._numeric_rop(left, _get_method_name())
def __rmod__(self, left):
    """
    Return modulo of left and series, element-wise (reflected binary operator rmod).

    Invoked for ``left % series``. Delegates to ``_numeric_rop`` with ``left``
    and the name returned by ``_get_method_name()``.
    """
    # NOTE(review): _get_method_name() is defined elsewhere; presumably it reports
    # the name of the calling method, so the call must stay in this body - confirm.
    return self._numeric_rop(left, _get_method_name())
def __rmul__(self, left):
    """
    Return multiplication of left and series, element-wise (reflected binary operator rmul).

    Invoked for ``left * series``. Delegates to ``_numeric_rop`` with ``left``
    and the name returned by ``_get_method_name()``.
    """
    # NOTE(review): _get_method_name() is defined elsewhere; presumably it reports
    # the name of the calling method, so the call must stay in this body - confirm.
    return self._numeric_rop(left, _get_method_name())
def __rpow__(self, left):
    """
    Return exponential power of left and series, element-wise (reflected binary operator rpow).

    Invoked for ``left ** series``. Delegates to ``_numeric_rop`` with ``left``
    and the name returned by ``_get_method_name()``.
    """
    # NOTE(review): _get_method_name() is defined elsewhere; presumably it reports
    # the name of the calling method, so the call must stay in this body - confirm.
    return self._numeric_rop(left, _get_method_name())
def __rsub__(self, left):
    """
    Return subtraction of left and series, element-wise (reflected binary operator rsub).

    Invoked for ``left - series``. Delegates to ``_numeric_rop`` with ``left``
    and the name returned by ``_get_method_name()``.
    """
    # NOTE(review): _get_method_name() is defined elsewhere; presumably it reports
    # the name of the calling method, so the call must stay in this body - confirm.
    return self._numeric_rop(left, _get_method_name())
add = __add__ add = __add__
div = __truediv__ div = __truediv__
divide = __truediv__ divide = __truediv__
@ -745,6 +761,18 @@ class Series(NDFrame):
subtract = __sub__ subtract = __sub__
truediv = __truediv__ truediv = __truediv__
# Named-method aliases for the reflected operators. Each alias is bound to the
# very same function object as its __r*__ counterpart, so e.g. series.radd(x)
# runs the body of __radd__.
radd = __radd__
rdiv = __rtruediv__
rdivide = __rtruediv__
rfloordiv = __rfloordiv__
rmod = __rmod__
rmul = __rmul__
rmultiply = __rmul__
rpow = __rpow__
rsub = __rsub__
rsubtract = __rsub__
rtruediv = __rtruediv__
def _numeric_op(self, right, method_name): def _numeric_op(self, right, method_name):
""" """
return a op b return a op b
@ -787,6 +815,31 @@ class Series(NDFrame):
"unsupported operand type(s) for '{}' {} '{}'".format(type(self), method_name, type(right)) "unsupported operand type(s) for '{}' {} '{}'".format(type(self), method_name, type(right))
) )
def _numeric_rop(self, left, method_name):
    """
    Evaluate a reflected arithmetic operation, i.e. ``left op self``
    (e.g. ``1 + ed.Series``).

    Parameters
    ----------
    left: Series or numeric scalar
        Left-hand operand. A scalar is accepted when the NumPy dtype derived
        from its type is a sub-dtype of ``np.number`` (so NumPy scalar types
        are allowed as well as Python ``int``/``float``).
    method_name: str
        Name of the reflected method being evaluated, e.g. ``'__radd__'``.

    Returns
    -------
    Series
        When ``left`` is a Series, whatever ``left._numeric_op(self, op)``
        returns; otherwise a new Series whose ``name`` is ``self.name``.

    Raises
    ------
    TypeError
        If ``left`` is neither a Series nor a numeric scalar.
    """
    # Drop the reflected marker so the forward operator name can be reused,
    # e.g. '__radd__' -> '__add__'.
    forward_op = str(method_name).replace('__r', '__')

    # Series (op) Series: swap the operands and reuse the forward implementation.
    if isinstance(left, Series):
        return left._numeric_op(self, forward_op)

    # Anything that is not a numeric scalar (np types included) is rejected.
    if not np.issubdtype(np.dtype(type(left)), np.number):
        raise TypeError(
            "unsupported operand type(s) for '{}' {} '{}'".format(type(self), method_name, type(left))
        )

    # Build the derived field name: 'f_' prefix, and '.' in the scalar replaced
    # by '_' (intended to keep the name usable as an ES field name).
    scalar_token = str(left).replace('.', '_')
    derived_field = "f_{0}_{1}_{2}".format(scalar_token, forward_op, self.name)

    # The scalar is passed as the left operand and this series' field as the right.
    compiler = self._query_compiler.arithmetic_op_fields(
        derived_field, forward_op, left, self.name)
    result = Series(query_compiler=compiler)

    # Keep the original series name on the result (like pandas).
    result.name = self.name
    return result
def max(self): def max(self):
""" """
Return the maximum of the Series values Return the maximum of the Series values

View File

@ -104,3 +104,39 @@ class TestSeriesArithmetics(TestData):
pd_series = getattr(pd_df['total_quantity'], op)(pd_df['currency']) pd_series = getattr(pd_df['total_quantity'], op)(pd_df['currency'])
with pytest.raises(TypeError): with pytest.raises(TypeError):
ed_series = getattr(ed_df['total_quantity'], op)(ed_df['currency']) ed_series = getattr(ed_df['total_quantity'], op)(ed_df['currency'])
def test_ecommerce_series_basic_rarithmetics(self):
    """
    Check that the reflected arithmetic operators of an eland Series give the
    same results as pandas, for both the ``__r*__`` methods and their named
    aliases, against a Series operand and several scalar operand types.
    """
    pd_df = self.pd_ecommerce().head(10)
    ed_df = self.ed_ecommerce().head(10)

    # Dunder forms first, then the named aliases - same order for both groups.
    base_names = ('add', 'truediv', 'floordiv', 'pow', 'mod', 'mul', 'sub')
    reflected_ops = ['__r{0}__'.format(base) for base in base_names]
    reflected_ops += ['r{0}'.format(base) for base in base_names]

    for op in reflected_ops:
        # Series operand
        expected = getattr(pd_df['taxful_total_price'], op)(pd_df['total_quantity'])
        actual = getattr(ed_df['taxful_total_price'], op)(ed_df['total_quantity'])
        assert_pandas_eland_series_equal(expected, actual, check_less_precise=True)

        # Scalar operands: Python float, NumPy float32, Python int
        for operand in (3.141, np.float32(2.879), int(6)):
            expected = getattr(pd_df['taxful_total_price'], op)(operand)
            actual = getattr(ed_df['taxful_total_price'], op)(operand)
            assert_pandas_eland_series_equal(expected, actual, check_less_precise=True)