Fixing rename and truediv issues

tests pass
TODO - implement additional orithmetic ops
This commit is contained in:
Stephen Dodson 2019-11-21 20:37:54 +00:00
parent c12bf9357b
commit 5d119215f8
7 changed files with 172 additions and 62 deletions

View File

@ -389,10 +389,10 @@ class DataFrame(NDFrame):
<BLANKLINE>
[27 rows x 5 columns]
Operations:
tasks: [('boolean_filter', {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}), ('columns', ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']), ('tail', ('_doc', 5))]
tasks: [('boolean_filter', {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}), ('field_names', ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']), ('tail', ('_doc', 5))]
size: 5
sort_params: _doc:desc
columns: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
field_names: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
post_processing: ['sort_index']
<BLANKLINE>
"""

View File

@ -881,31 +881,58 @@ class Operations:
left_field = item[1][1][1][0]
right_field = item[1][1][1][1]
"""
(if op_name = 'truediv')
"script_fields": {
"field_name": {
"script": {
"source": "doc[left_field].value / doc[right_field].value"
}
if isinstance(right_field, str):
"""
(if op_name = 'truediv')
"script_fields": {
"field_name": {
"script": {
"source": "doc[left_field].value / doc[right_field].value"
}
}
}
"""
if op_name == 'truediv':
op = '/'
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
source = "doc['{0}'].value {1} doc['{2}'].value".format(left_field, op, right_field)
if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {}
query_params['query_script_fields'][field_name] = {
'script': {
'source': source
}
}
}
"""
if op_name == 'truediv':
op = '/'
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
"""
(if op_name = 'truediv')
source = "doc['{0}'].value {1} doc['{2}'].value".format(left_field, op, right_field)
if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {}
query_params['query_script_fields'][field_name] = {
'script': {
'source': source
"script_fields": {
"field_name": {
"script": {
"source": "doc[left_field].value / right_field"
}
}
}
"""
if op_name == 'truediv':
op = '/'
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
source = "doc['{0}'].value {1} {2}".format(left_field, op, right_field)
if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {}
query_params['query_script_fields'][field_name] = {
'script': {
'source': source
}
}
}
return query_params, post_processing

View File

@ -96,6 +96,7 @@ class ElandQueryCompiler:
self._operations.set_field_names(columns)
columns = property(_get_columns, _set_columns)
index = property(_get_index)
@property
@ -241,9 +242,9 @@ class ElandQueryCompiler:
# _source may not contain all columns in the mapping
# therefore, fill in missing columns
# (note this returns self.columns NOT IN df.columns)
missing_columns = list(set(self.columns) - set(df.columns))
missing_field_names = list(set(self.field_names) - set(df.columns))
for missing in missing_columns:
for missing in missing_field_names:
is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(missing)
df[missing] = pd.Series(dtype=pd_dtype)
@ -252,7 +253,8 @@ class ElandQueryCompiler:
df.rename(columns=self._name_mapper.display_names_mapper(), inplace=True)
# Sort columns in mapping order
df = df[self.columns]
if len(self.columns) > 1:
df = df[self.columns]
return partial_result, df
@ -343,12 +345,14 @@ class ElandQueryCompiler:
index_field=self._index.index_field, operations=self._operations.copy(),
name_mapper=self._name_mapper.copy())
def rename(self, renames):
result = self.copy()
result._name_mapper.rename_display_name(renames)
return result
def rename(self, renames, inplace=False):
if inplace:
self._name_mapper.rename_display_name(renames)
return self
else:
result = self.copy()
result._name_mapper.rename_display_name(renames)
return result
def head(self, n):
result = self.copy()
@ -503,10 +507,10 @@ class ElandQueryCompiler:
"{0} != {1}".format(self._index_pattern, right._index_pattern)
)
def arithmetic_op_fields(self, field_name, op, left_field, right_field):
def arithmetic_op_fields(self, new_field_name, op, left_field, right_field):
result = self.copy()
result._operations.arithmetic_op_fields(field_name, op, left_field, right_field)
result._operations.arithmetic_op_fields(new_field_name, op, left_field, right_field)
return result
@ -547,10 +551,10 @@ class ElandQueryCompiler:
self._field_to_display_names[field_name] = new_display_name
def field_names_to_list(self):
return self._field_to_display_names.keys()
return sorted(list(self._field_to_display_names.keys()))
def display_names_to_list(self):
return self._display_to_field_names.keys()
return sorted(list(self._display_to_field_names.keys()))
# Return mapper values as dict
def display_names_mapper(self):
@ -595,7 +599,7 @@ class ElandQueryCompiler:
def copy(self):
return self.__constructor__(
field_to_display_names=self._field_to_display_names,
display_to_field_names = self._display_to_field_names
field_to_display_names=self._field_to_display_names.copy(),
display_to_field_names = self._display_to_field_names.copy()
)

View File

@ -18,6 +18,7 @@ Based on NDFrame which underpins eland.1DataFrame
from io import StringIO
import pandas as pd
import numpy as np
from eland import NDFrame
from eland.filter import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter, IsIn
@ -96,19 +97,58 @@ class Series(NDFrame):
def _get_name(self):
return self._query_compiler.columns[0]
name = property(_get_name)
def _set_name(self, name):
self._query_compiler.rename({self.name: name}, inplace=True)
name = property(_get_name, _set_name)
def rename(self, new_name):
"""
ONLY COLUMN rename supported
Rename name of series. Only column rename is supported. This does not change the underlying
Elasticsearch index, but adds a soft link from the new name (column) to the Elasticsearch field name
Parameters
----------
new_name
new_name: str
Returns
-------
eland.Series
eland.Series with new name.
See Also
--------
:pandas_api_docs:pandas.Series.rename
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.Carrier
0 Kibana Airlines
1 Logstash Airways
2 Logstash Airways
3 Kibana Airlines
4 Kibana Airlines
...
13054 Logstash Airways
13055 Logstash Airways
13056 Logstash Airways
13057 JetBeats
13058 JetBeats
Name: Carrier, Length: 13059, dtype: object
>>> df.Carrier.rename('Airline')
0 Kibana Airlines
1 Logstash Airways
2 Logstash Airways
3 Kibana Airlines
4 Kibana Airlines
...
13054 Logstash Airways
13055 Logstash Airways
13056 Logstash Airways
13057 JetBeats
13058 JetBeats
Name: Airline, Length: 13059, dtype: object
"""
return Series(query_compiler=self._query_compiler.rename({self.name: new_name}))
@ -312,11 +352,25 @@ class Series(NDFrame):
# Check compatibility
self._query_compiler.check_arithmetics(right._query_compiler)
field_name = "{0}_{1}_{2}".format(self.name, "truediv", right.name)
new_field_name = "{0}_{1}_{2}".format(self.name, "truediv", right.name)
# Compatible, so create new Series
return Series(query_compiler=self._query_compiler.arithmetic_op_fields(
field_name, 'truediv', self.name, right.name))
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
new_field_name, 'truediv', self.name, right.name))
series.name = None
return series
elif isinstance(right, (int, float)): # TODO extend to numpy types
new_field_name = "{0}_{1}_{2}".format(self.name, "truediv", str(right).replace('.','_'))
# Compatible, so create new Series
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
new_field_name, 'truediv', self.name, float(right))) # force rhs to float
# name of Series remains original name
series.name = self.name
return series
else:
raise TypeError(
"Can only perform arithmetic operation on selected types "

View File

@ -53,23 +53,23 @@ class TestQueryCompilerRename(TestData):
update_A = {'a' : 'A'}
mapper.rename_display_name(update_A)
assert display_names == mapper.display_names(columns)
assert display_names == mapper.field_to_display_names(columns)
# Invalid update
display_names = ['A', 'b', 'c', 'd']
update_ZZ = {'a' : 'ZZ'}
mapper.rename_display_name(update_ZZ)
assert display_names == mapper.display_names(columns)
assert display_names == mapper.field_to_display_names(columns)
display_names = ['AA', 'b', 'c', 'd']
update_AA = {'A' : 'AA'} # already renamed to 'A'
mapper.rename_display_name(update_AA)
assert display_names == mapper.display_names(columns)
assert display_names == mapper.field_to_display_names(columns)
display_names = ['AA', 'b', 'C', 'd']
update_AA_C = {'a' : 'AA', 'c' : 'C'} # 'a' rename ignored
mapper.rename_display_name(update_AA_C)
assert display_names == mapper.display_names(columns)
assert display_names == mapper.field_to_display_names(columns)

View File

@ -20,10 +20,7 @@ class TestSeriesArithmetics(TestData):
ed_df = self.ed_ecommerce()
pd_avg_price = pd_df['total_quantity'] / pd_df['taxful_total_price']
print(pd_avg_price) # this has None as name
ed_avg_price = ed_df['total_quantity'] / ed_df['taxful_total_price']
print(ed_avg_price)
assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True)
@ -32,19 +29,15 @@ class TestSeriesArithmetics(TestData):
ed_df = self.ed_ecommerce()
pd_avg_price = pd_df['total_quantity'] / 10.0
print(pd_avg_price)
ed_avg_price = ed_df['total_quantity'] / 10.0
print(ed_avg_price)
def test_ecommerce_series_div_other(self):
assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True)
def test_ecommerce_series_div_int(self):
pd_df = self.pd_ecommerce()
ed_df = self.ed_ecommerce()
ed_s1 = ed_df.total_quantity
ed_s2 = ed_df.taxful_total_price
pd_avg_price = pd_df['total_quantity'] / int(10)
ed_avg_price = ed_df['total_quantity'] / int(10)
print(ed_s1)
print(ed_s2)
print(ed_s1)
print(ed_s2)
assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True)

View File

@ -0,0 +1,32 @@
# File called _pytest for PyCharm compatability
import eland as ed
from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME
from eland.tests.common import TestData
from eland.tests.common import assert_pandas_eland_series_equal
class TestSeriesName(TestData):
def test_name(self):
# deep copy pandas DataFrame as .name alters this reference frame
pd_series = self.pd_flights()['Carrier'].copy(deep=True)
ed_series = ed.Series(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME, 'Carrier')
assert_pandas_eland_series_equal(pd_series, ed_series)
assert ed_series.name == pd_series.name
pd_series.name = "renamed1"
ed_series.name = "renamed1"
assert_pandas_eland_series_equal(pd_series, ed_series)
assert ed_series.name == pd_series.name
pd_series.name = "renamed2"
ed_series.name = "renamed2"
assert_pandas_eland_series_equal(pd_series, ed_series)
assert ed_series.name == pd_series.name