From 5d119215f8e24b542ee1a7b3f6ea9906e71cfbf0 Mon Sep 17 00:00:00 2001 From: Stephen Dodson Date: Thu, 21 Nov 2019 20:37:54 +0000 Subject: [PATCH] Fixing rename and truediv issues tests pass TODO - implement additional orithmetic ops --- eland/dataframe.py | 4 +- eland/operations.py | 69 +++++++++++++------ eland/query_compiler.py | 34 +++++---- eland/series.py | 66 ++++++++++++++++-- .../query_compiler/test_rename_pytest.py | 8 +-- eland/tests/series/test_arithmetics_pytest.py | 21 ++---- eland/tests/series/test_name_pytest.py | 32 +++++++++ 7 files changed, 172 insertions(+), 62 deletions(-) create mode 100644 eland/tests/series/test_name_pytest.py diff --git a/eland/dataframe.py b/eland/dataframe.py index 852e06a..51373ed 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -389,10 +389,10 @@ class DataFrame(NDFrame): [27 rows x 5 columns] Operations: - tasks: [('boolean_filter', {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}), ('columns', ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']), ('tail', ('_doc', 5))] + tasks: [('boolean_filter', {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}), ('field_names', ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']), ('tail', ('_doc', 5))] size: 5 sort_params: _doc:desc - columns: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin'] + field_names: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin'] post_processing: ['sort_index'] """ diff --git a/eland/operations.py b/eland/operations.py index aa9ba2f..5c40778 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -881,31 +881,58 @@ class Operations: left_field = item[1][1][1][0] right_field = item[1][1][1][1] - """ - (if op_name = 'truediv') - - "script_fields": { - "field_name": { - "script": { - "source": "doc[left_field].value / doc[right_field].value" - } + if isinstance(right_field, str): + """ + (if op_name = 'truediv') + + "script_fields": { + "field_name": { + "script": { + "source": "doc[left_field].value / doc[right_field].value" + } + } + } + """ + if op_name == 'truediv': + op = '/' + else: + raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) + + source = "doc['{0}'].value {1} doc['{2}'].value".format(left_field, op, right_field) + + if query_params['query_script_fields'] is None: + query_params['query_script_fields'] = {} + query_params['query_script_fields'][field_name] = { + 'script': { + 'source': source + } } - } - """ - if op_name == 'truediv': - op = '/' else: - raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) + """ + (if op_name = 'truediv') - source = "doc['{0}'].value {1} doc['{2}'].value".format(left_field, op, right_field) - - if query_params['query_script_fields'] is None: - query_params['query_script_fields'] = {} - query_params['query_script_fields'][field_name] = { - 'script': { - 'source': source + "script_fields": { + "field_name": { + "script": { + "source": "doc[left_field].value / right_field" + } + } + } + """ + if op_name == 'truediv': + op = '/' + else: + raise NotImplementedError("Not implemented operation '{0}'".format(op_name)) + + source = "doc['{0}'].value {1} {2}".format(left_field, op, right_field) + + if query_params['query_script_fields'] is None: + query_params['query_script_fields'] = {} + query_params['query_script_fields'][field_name] = { + 'script': { + 'source': source + } } - } return query_params, post_processing diff --git a/eland/query_compiler.py b/eland/query_compiler.py index e94e605..ef01756 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -96,6 +96,7 @@ class ElandQueryCompiler: self._operations.set_field_names(columns) columns = property(_get_columns, _set_columns) + index = property(_get_index) @property @@ -241,9 +242,9 @@ class ElandQueryCompiler: # _source may not contain all columns in the mapping # therefore, fill in missing columns # (note this returns self.columns NOT IN df.columns) - missing_columns = list(set(self.columns) - set(df.columns)) + missing_field_names = list(set(self.field_names) - set(df.columns)) - for missing in missing_columns: + for missing in missing_field_names: is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(missing) df[missing] = pd.Series(dtype=pd_dtype) @@ -252,7 +253,8 @@ class ElandQueryCompiler: df.rename(columns=self._name_mapper.display_names_mapper(), inplace=True) # Sort columns in mapping order - df = df[self.columns] + if len(self.columns) > 1: + df = df[self.columns] return partial_result, df @@ -343,12 +345,14 @@ class ElandQueryCompiler: index_field=self._index.index_field, operations=self._operations.copy(), name_mapper=self._name_mapper.copy()) - def rename(self, renames): - result = self.copy() - - result._name_mapper.rename_display_name(renames) - - return result + def rename(self, renames, inplace=False): + if inplace: + self._name_mapper.rename_display_name(renames) + return self + else: + result = self.copy() + result._name_mapper.rename_display_name(renames) + return result def head(self, n): result = self.copy() @@ -503,10 +507,10 @@ class ElandQueryCompiler: "{0} != {1}".format(self._index_pattern, right._index_pattern) ) - def arithmetic_op_fields(self, field_name, op, left_field, right_field): + def arithmetic_op_fields(self, new_field_name, op, left_field, right_field): result = self.copy() - result._operations.arithmetic_op_fields(field_name, op, left_field, right_field) + result._operations.arithmetic_op_fields(new_field_name, op, left_field, right_field) return result @@ -547,10 +551,10 @@ class ElandQueryCompiler: self._field_to_display_names[field_name] = new_display_name def field_names_to_list(self): - return self._field_to_display_names.keys() + return sorted(list(self._field_to_display_names.keys())) def display_names_to_list(self): - return self._display_to_field_names.keys() + return sorted(list(self._display_to_field_names.keys())) # Return mapper values as dict def display_names_mapper(self): @@ -595,7 +599,7 @@ class ElandQueryCompiler: def copy(self): return self.__constructor__( - field_to_display_names=self._field_to_display_names, - display_to_field_names = self._display_to_field_names + field_to_display_names=self._field_to_display_names.copy(), + display_to_field_names = self._display_to_field_names.copy() ) diff --git a/eland/series.py b/eland/series.py index 9c96003..4918b2d 100644 --- a/eland/series.py +++ b/eland/series.py @@ -18,6 +18,7 @@ Based on NDFrame which underpins eland.1DataFrame from io import StringIO import pandas as pd +import numpy as np from eland import NDFrame from eland.filter import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter, IsIn @@ -96,19 +97,58 @@ class Series(NDFrame): def _get_name(self): return self._query_compiler.columns[0] - name = property(_get_name) + def _set_name(self, name): + self._query_compiler.rename({self.name: name}, inplace=True) + + name = property(_get_name, _set_name) def rename(self, new_name): """ - ONLY COLUMN rename supported + Rename name of series. Only column rename is supported. This does not change the underlying + Elasticsearch index, but adds a soft link from the new name (column) to the Elasticsearch field name Parameters ---------- - new_name + new_name: str Returns ------- + eland.Series + eland.Series with new name. + See Also + -------- + :pandas_api_docs:pandas.Series.rename + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.Carrier + 0 Kibana Airlines + 1 Logstash Airways + 2 Logstash Airways + 3 Kibana Airlines + 4 Kibana Airlines + ... + 13054 Logstash Airways + 13055 Logstash Airways + 13056 Logstash Airways + 13057 JetBeats + 13058 JetBeats + Name: Carrier, Length: 13059, dtype: object + >>> df.Carrier.rename('Airline') + 0 Kibana Airlines + 1 Logstash Airways + 2 Logstash Airways + 3 Kibana Airlines + 4 Kibana Airlines + ... + 13054 Logstash Airways + 13055 Logstash Airways + 13056 Logstash Airways + 13057 JetBeats + 13058 JetBeats + Name: Airline, Length: 13059, dtype: object """ return Series(query_compiler=self._query_compiler.rename({self.name: new_name})) @@ -312,11 +352,25 @@ class Series(NDFrame): # Check compatibility self._query_compiler.check_arithmetics(right._query_compiler) - field_name = "{0}_{1}_{2}".format(self.name, "truediv", right.name) + new_field_name = "{0}_{1}_{2}".format(self.name, "truediv", right.name) # Compatible, so create new Series - return Series(query_compiler=self._query_compiler.arithmetic_op_fields( - field_name, 'truediv', self.name, right.name)) + series = Series(query_compiler=self._query_compiler.arithmetic_op_fields( + new_field_name, 'truediv', self.name, right.name)) + series.name = None + + return series + elif isinstance(right, (int, float)): # TODO extend to numpy types + new_field_name = "{0}_{1}_{2}".format(self.name, "truediv", str(right).replace('.','_')) + + # Compatible, so create new Series + series = Series(query_compiler=self._query_compiler.arithmetic_op_fields( + new_field_name, 'truediv', self.name, float(right))) # force rhs to float + + # name of Series remains original name + series.name = self.name + + return series else: raise TypeError( "Can only perform arithmetic operation on selected types " diff --git a/eland/tests/query_compiler/test_rename_pytest.py b/eland/tests/query_compiler/test_rename_pytest.py index 3948044..40f0534 100644 --- a/eland/tests/query_compiler/test_rename_pytest.py +++ b/eland/tests/query_compiler/test_rename_pytest.py @@ -53,23 +53,23 @@ class TestQueryCompilerRename(TestData): update_A = {'a' : 'A'} mapper.rename_display_name(update_A) - assert display_names == mapper.display_names(columns) + assert display_names == mapper.field_to_display_names(columns) # Invalid update display_names = ['A', 'b', 'c', 'd'] update_ZZ = {'a' : 'ZZ'} mapper.rename_display_name(update_ZZ) - assert display_names == mapper.display_names(columns) + assert display_names == mapper.field_to_display_names(columns) display_names = ['AA', 'b', 'c', 'd'] update_AA = {'A' : 'AA'} # already renamed to 'A' mapper.rename_display_name(update_AA) - assert display_names == mapper.display_names(columns) + assert display_names == mapper.field_to_display_names(columns) display_names = ['AA', 'b', 'C', 'd'] update_AA_C = {'a' : 'AA', 'c' : 'C'} # 'a' rename ignored mapper.rename_display_name(update_AA_C) - assert display_names == mapper.display_names(columns) + assert display_names == mapper.field_to_display_names(columns) diff --git a/eland/tests/series/test_arithmetics_pytest.py b/eland/tests/series/test_arithmetics_pytest.py index 589853c..5a510ea 100644 --- a/eland/tests/series/test_arithmetics_pytest.py +++ b/eland/tests/series/test_arithmetics_pytest.py @@ -20,10 +20,7 @@ class TestSeriesArithmetics(TestData): ed_df = self.ed_ecommerce() pd_avg_price = pd_df['total_quantity'] / pd_df['taxful_total_price'] - print(pd_avg_price) # this has None as name - ed_avg_price = ed_df['total_quantity'] / ed_df['taxful_total_price'] - print(ed_avg_price) assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True) @@ -32,19 +29,15 @@ class TestSeriesArithmetics(TestData): ed_df = self.ed_ecommerce() pd_avg_price = pd_df['total_quantity'] / 10.0 - print(pd_avg_price) - ed_avg_price = ed_df['total_quantity'] / 10.0 - print(ed_avg_price) - def test_ecommerce_series_div_other(self): + assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True) + + def test_ecommerce_series_div_int(self): + pd_df = self.pd_ecommerce() ed_df = self.ed_ecommerce() - ed_s1 = ed_df.total_quantity - ed_s2 = ed_df.taxful_total_price + pd_avg_price = pd_df['total_quantity'] / int(10) + ed_avg_price = ed_df['total_quantity'] / int(10) - print(ed_s1) - print(ed_s2) - - print(ed_s1) - print(ed_s2) + assert_pandas_eland_series_equal(pd_avg_price, ed_avg_price, check_less_precise=True) diff --git a/eland/tests/series/test_name_pytest.py b/eland/tests/series/test_name_pytest.py new file mode 100644 index 0000000..5e757a8 --- /dev/null +++ b/eland/tests/series/test_name_pytest.py @@ -0,0 +1,32 @@ +# File called _pytest for PyCharm compatability +import eland as ed +from eland.tests import ELASTICSEARCH_HOST +from eland.tests import FLIGHTS_INDEX_NAME +from eland.tests.common import TestData +from eland.tests.common import assert_pandas_eland_series_equal + + +class TestSeriesName(TestData): + + def test_name(self): + # deep copy pandas DataFrame as .name alters this reference frame + pd_series = self.pd_flights()['Carrier'].copy(deep=True) + ed_series = ed.Series(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME, 'Carrier') + + assert_pandas_eland_series_equal(pd_series, ed_series) + assert ed_series.name == pd_series.name + + pd_series.name = "renamed1" + ed_series.name = "renamed1" + + assert_pandas_eland_series_equal(pd_series, ed_series) + assert ed_series.name == pd_series.name + + pd_series.name = "renamed2" + ed_series.name = "renamed2" + + assert_pandas_eland_series_equal(pd_series, ed_series) + assert ed_series.name == pd_series.name + + +