Fixing rename and truediv issues

tests pass
TODO - implement additional orithmetic ops
This commit is contained in:
Stephen Dodson 2019-11-21 20:37:54 +00:00
parent c12bf9357b
commit 5d119215f8
7 changed files with 172 additions and 62 deletions

View File

@ -389,10 +389,10 @@ class DataFrame(NDFrame):
<BLANKLINE> <BLANKLINE>
[27 rows x 5 columns] [27 rows x 5 columns]
Operations: Operations:
tasks: [('boolean_filter', {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}), ('columns', ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']), ('tail', ('_doc', 5))] tasks: [('boolean_filter', {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}), ('field_names', ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']), ('tail', ('_doc', 5))]
size: 5 size: 5
sort_params: _doc:desc sort_params: _doc:desc
columns: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin'] field_names: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
post_processing: ['sort_index'] post_processing: ['sort_index']
<BLANKLINE> <BLANKLINE>
""" """

View File

@ -881,31 +881,58 @@ class Operations:
left_field = item[1][1][1][0] left_field = item[1][1][1][0]
right_field = item[1][1][1][1] right_field = item[1][1][1][1]
""" if isinstance(right_field, str):
(if op_name = 'truediv') """
(if op_name = 'truediv')
"script_fields": { "script_fields": {
"field_name": { "field_name": {
"script": { "script": {
"source": "doc[left_field].value / doc[right_field].value" "source": "doc[left_field].value / doc[right_field].value"
} }
}
}
"""
if op_name == 'truediv':
op = '/'
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
source = "doc['{0}'].value {1} doc['{2}'].value".format(left_field, op, right_field)
if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {}
query_params['query_script_fields'][field_name] = {
'script': {
'source': source
}
} }
}
"""
if op_name == 'truediv':
op = '/'
else: else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) """
(if op_name = 'truediv')
source = "doc['{0}'].value {1} doc['{2}'].value".format(left_field, op, right_field) "script_fields": {
"field_name": {
if query_params['query_script_fields'] is None: "script": {
query_params['query_script_fields'] = {} "source": "doc[left_field].value / right_field"
query_params['query_script_fields'][field_name] = { }
'script': { }
'source': source }
"""
if op_name == 'truediv':
op = '/'
else:
raise NotImplementedError("Not implemented operation '{0}'".format(op_name))
source = "doc['{0}'].value {1} {2}".format(left_field, op, right_field)
if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {}
query_params['query_script_fields'][field_name] = {
'script': {
'source': source
}
} }
}
return query_params, post_processing return query_params, post_processing

View File

@ -96,6 +96,7 @@ class ElandQueryCompiler:
self._operations.set_field_names(columns) self._operations.set_field_names(columns)
columns = property(_get_columns, _set_columns) columns = property(_get_columns, _set_columns)
index = property(_get_index) index = property(_get_index)
@property @property
@ -241,9 +242,9 @@ class ElandQueryCompiler:
# _source may not contain all columns in the mapping # _source may not contain all columns in the mapping
# therefore, fill in missing columns # therefore, fill in missing columns
# (note this returns self.columns NOT IN df.columns) # (note this returns self.columns NOT IN df.columns)
missing_columns = list(set(self.columns) - set(df.columns)) missing_field_names = list(set(self.field_names) - set(df.columns))
for missing in missing_columns: for missing in missing_field_names:
is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(missing) is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(missing)
df[missing] = pd.Series(dtype=pd_dtype) df[missing] = pd.Series(dtype=pd_dtype)
@ -252,7 +253,8 @@ class ElandQueryCompiler:
df.rename(columns=self._name_mapper.display_names_mapper(), inplace=True) df.rename(columns=self._name_mapper.display_names_mapper(), inplace=True)
# Sort columns in mapping order # Sort columns in mapping order
df = df[self.columns] if len(self.columns) > 1:
df = df[self.columns]
return partial_result, df return partial_result, df
@ -343,12 +345,14 @@ class ElandQueryCompiler:
index_field=self._index.index_field, operations=self._operations.copy(), index_field=self._index.index_field, operations=self._operations.copy(),
name_mapper=self._name_mapper.copy()) name_mapper=self._name_mapper.copy())
def rename(self, renames): def rename(self, renames, inplace=False):
result = self.copy() if inplace:
self._name_mapper.rename_display_name(renames)
result._name_mapper.rename_display_name(renames) return self
else:
return result result = self.copy()
result._name_mapper.rename_display_name(renames)
return result
def head(self, n): def head(self, n):
result = self.copy() result = self.copy()
@ -503,10 +507,10 @@ class ElandQueryCompiler:
"{0} != {1}".format(self._index_pattern, right._index_pattern) "{0} != {1}".format(self._index_pattern, right._index_pattern)
) )
def arithmetic_op_fields(self, field_name, op, left_field, right_field): def arithmetic_op_fields(self, new_field_name, op, left_field, right_field):
result = self.copy() result = self.copy()
result._operations.arithmetic_op_fields(field_name, op, left_field, right_field) result._operations.arithmetic_op_fields(new_field_name, op, left_field, right_field)
return result return result
@ -547,10 +551,10 @@ class ElandQueryCompiler:
self._field_to_display_names[field_name] = new_display_name self._field_to_display_names[field_name] = new_display_name
def field_names_to_list(self): def field_names_to_list(self):
return self._field_to_display_names.keys() return sorted(list(self._field_to_display_names.keys()))
def display_names_to_list(self): def display_names_to_list(self):
return self._display_to_field_names.keys() return sorted(list(self._display_to_field_names.keys()))
# Return mapper values as dict # Return mapper values as dict
def display_names_mapper(self): def display_names_mapper(self):
@ -595,7 +599,7 @@ class ElandQueryCompiler:
def copy(self): def copy(self):
return self.__constructor__( return self.__constructor__(
field_to_display_names=self._field_to_display_names, field_to_display_names=self._field_to_display_names.copy(),
display_to_field_names = self._display_to_field_names display_to_field_names = self._display_to_field_names.copy()
) )

View File

@ -18,6 +18,7 @@ Based on NDFrame which underpins eland.1DataFrame
from io import StringIO from io import StringIO
import pandas as pd import pandas as pd
import numpy as np
from eland import NDFrame from eland import NDFrame
from eland.filter import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter, IsIn from eland.filter import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter, IsIn
@ -96,19 +97,58 @@ class Series(NDFrame):
def _get_name(self): def _get_name(self):
return self._query_compiler.columns[0] return self._query_compiler.columns[0]
name = property(_get_name) def _set_name(self, name):
self._query_compiler.rename({self.name: name}, inplace=True)
name = property(_get_name, _set_name)
def rename(self, new_name): def rename(self, new_name):
""" """
ONLY COLUMN rename supported Rename name of series. Only column rename is supported. This does not change the underlying
Elasticsearch index, but adds a soft link from the new name (column) to the Elasticsearch field name
Parameters Parameters
---------- ----------
new_name new_name: str
Returns Returns
------- -------
eland.Series
eland.Series with new name.
See Also
--------
:pandas_api_docs:pandas.Series.rename
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.Carrier
0 Kibana Airlines
1 Logstash Airways
2 Logstash Airways
3 Kibana Airlines
4 Kibana Airlines
...
13054 Logstash Airways
13055 Logstash Airways
13056 Logstash Airways
13057 JetBeats
13058 JetBeats
Name: Carrier, Length: 13059, dtype: object
>>> df.Carrier.rename('Airline')
0 Kibana Airlines
1 Logstash Airways
2 Logstash Airways
3 Kibana Airlines
4 Kibana Airlines
...
13054 Logstash Airways
13055 Logstash Airways
13056 Logstash Airways
13057 JetBeats
13058 JetBeats
Name: Airline, Length: 13059, dtype: object
""" """
return Series(query_compiler=self._query_compiler.rename({self.name: new_name})) return Series(query_compiler=self._query_compiler.rename({self.name: new_name}))
@ -312,11 +352,25 @@ class Series(NDFrame):
# Check compatibility # Check compatibility
self._query_compiler.check_arithmetics(right._query_compiler) self._query_compiler.check_arithmetics(right._query_compiler)
field_name = "{0}_{1}_{2}".format(self.name, "truediv", right.name) new_field_name = "{0}_{1}_{2}".format(self.name, "truediv", right.name)
# Compatible, so create new Series # Compatible, so create new Series
return Series(query_compiler=self._query_compiler.arithmetic_op_fields( series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
field_name, 'truediv', self.name, right.name)) new_field_name, 'truediv', self.name, right.name))
series.name = None
return series
elif isinstance(right, (int, float)): # TODO extend to numpy types
new_field_name = "{0}_{1}_{2}".format(self.name, "truediv", str(right).replace('.','_'))
# Compatible, so create new Series
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
new_field_name, 'truediv', self.name, float(right))) # force rhs to float
# name of Series remains original name
series.name = self.name
return series
else: else:
raise TypeError( raise TypeError(
"Can only perform arithmetic operation on selected types " "Can only perform arithmetic operation on selected types "

View File

@ -53,23 +53,23 @@ class TestQueryCompilerRename(TestData):
update_A = {'a' : 'A'} update_A = {'a' : 'A'}
mapper.rename_display_name(update_A) mapper.rename_display_name(update_A)
assert display_names == mapper.display_names(columns) assert display_names == mapper.field_to_display_names(columns)
# Invalid update # Invalid update
display_names = ['A', 'b', 'c', 'd'] display_names = ['A', 'b', 'c', 'd']
update_ZZ = {'a' : 'ZZ'} update_ZZ = {'a' : 'ZZ'}
mapper.rename_display_name(update_ZZ) mapper.rename_display_name(update_ZZ)
assert display_names == mapper.display_names(columns) assert display_names == mapper.field_to_display_names(columns)
display_names = ['AA', 'b', 'c', 'd'] display_names = ['AA', 'b', 'c', 'd']
update_AA = {'A' : 'AA'} # already renamed to 'A' update_AA = {'A' : 'AA'} # already renamed to 'A'
mapper.rename_display_name(update_AA) mapper.rename_display_name(update_AA)
assert display_names == mapper.display_names(columns) assert display_names == mapper.field_to_display_names(columns)
display_names = ['AA', 'b', 'C', 'd'] display_names = ['AA', 'b', 'C', 'd']
update_AA_C = {'a' : 'AA', 'c' : 'C'} # 'a' rename ignored update_AA_C = {'a' : 'AA', 'c' : 'C'} # 'a' rename ignored
mapper.rename_display_name(update_AA_C) mapper.rename_display_name(update_AA_C)
assert display_names == mapper.display_names(columns) assert display_names == mapper.field_to_display_names(columns)

View File

@ -20,10 +20,7 @@ class TestSeriesArithmetics(TestData):
ed_df = self.ed_ecommerce() ed_df = self.ed_ecommerce()
pd_avg_price = pd_df['total_quantity'] / pd_df['taxful_total_price'] pd_avg_price = pd_df['total_quantity'] / pd_df['taxful_total_price']
print(pd_avg_price) # this has None as name
ed_avg_price = ed_df['total_quantity'] / ed_df['taxful_total_price'] ed_avg_price = ed_df['total_quantity'] / ed_df['taxful_total_price']
print(ed_avg_price)
assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True) assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True)
@ -32,19 +29,15 @@ class TestSeriesArithmetics(TestData):
ed_df = self.ed_ecommerce() ed_df = self.ed_ecommerce()
pd_avg_price = pd_df['total_quantity'] / 10.0 pd_avg_price = pd_df['total_quantity'] / 10.0
print(pd_avg_price)
ed_avg_price = ed_df['total_quantity'] / 10.0 ed_avg_price = ed_df['total_quantity'] / 10.0
print(ed_avg_price)
def test_ecommerce_series_div_other(self): assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True)
def test_ecommerce_series_div_int(self):
pd_df = self.pd_ecommerce()
ed_df = self.ed_ecommerce() ed_df = self.ed_ecommerce()
ed_s1 = ed_df.total_quantity pd_avg_price = pd_df['total_quantity'] / int(10)
ed_s2 = ed_df.taxful_total_price ed_avg_price = ed_df['total_quantity'] / int(10)
print(ed_s1) assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True)
print(ed_s2)
print(ed_s1)
print(ed_s2)

View File

@ -0,0 +1,32 @@
# File called _pytest for PyCharm compatability
import eland as ed
from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME
from eland.tests.common import TestData
from eland.tests.common import assert_pandas_eland_series_equal
class TestSeriesName(TestData):
def test_name(self):
# deep copy pandas DataFrame as .name alters this reference frame
pd_series = self.pd_flights()['Carrier'].copy(deep=True)
ed_series = ed.Series(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME, 'Carrier')
assert_pandas_eland_series_equal(pd_series, ed_series)
assert ed_series.name == pd_series.name
pd_series.name = "renamed1"
ed_series.name = "renamed1"
assert_pandas_eland_series_equal(pd_series, ed_series)
assert ed_series.name == pd_series.name
pd_series.name = "renamed2"
ed_series.name = "renamed2"
assert_pandas_eland_series_equal(pd_series, ed_series)
assert ed_series.name == pd_series.name