From f263e21b8a4c22f24dec430380d3320ce55f4d09 Mon Sep 17 00:00:00 2001 From: Michael Hirsch Date: Fri, 6 Dec 2019 03:20:09 -0500 Subject: [PATCH] Better Handling of Non Aggregatable Fields (#85) * updates ecommerce mapping to include non-aggregatable text field * updates exists tests and adds new tests for non-aggregatable field * better handling on non-aggregatable fields * fixes formatting * swaps series in assertion * adds newline --- eland/mappings.py | 16 +++++++++------- eland/operations.py | 2 +- eland/series.py | 11 ++++++++--- eland/tests/__init__.py | 2 +- eland/tests/dataframe/test_nunique_pytest.py | 2 +- .../tests/mappings/test_aggregatables_pytest.py | 1 - .../tests/series/test_str_arithmetics_pytest.py | 16 ++++++++++++++++ eland/tests/series/test_value_counts_pytest.py | 5 +++++ 8 files changed, 41 insertions(+), 14 deletions(-) diff --git a/eland/mappings.py b/eland/mappings.py index 794c3ce..78c3b34 100644 --- a/eland/mappings.py +++ b/eland/mappings.py @@ -373,7 +373,11 @@ class Mappings: aggregatable: bool Is the field aggregatable in Elasticsearch? """ - return self._mappings_capabilities.loc[field_name] + try: + field_capabilities = self._mappings_capabilities.loc[field_name] + except KeyError: + field_capabilities = pd.Series() + return field_capabilities def get_date_field_format(self, field_name): """ @@ -447,9 +451,7 @@ class Mappings: """ if field_names is None: field_names = self.source_fields() - aggregatables = {} - for field_name in field_names: capabilities = self.field_capabilities(field_name) if capabilities['aggregatable']: @@ -458,11 +460,11 @@ class Mappings: # Try 'field_name.keyword' field_name_keyword = field_name + '.keyword' capabilities = self.field_capabilities(field_name_keyword) - if capabilities['aggregatable']: + if not capabilities.empty and capabilities.get('aggregatable'): aggregatables[field_name_keyword] = field_name - else: - # Aggregations not supported for this field - raise ValueError("Aggregations not supported for ", field_name) + + if not aggregatables: + raise ValueError("Aggregations not supported for ", field_name) return aggregatables diff --git a/eland/operations.py b/eland/operations.py index 2be5217..5868854 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -249,7 +249,7 @@ class Operations: results = {} for key, value in aggregatable_field_names.items(): - for bucket in response['aggregations'][field_names[0]]['buckets']: + for bucket in response['aggregations'][key]['buckets']: results[bucket['key']] = bucket['doc_count'] try: diff --git a/eland/series.py b/eland/series.py index caa64e3..7e1d7de 100644 --- a/eland/series.py +++ b/eland/series.py @@ -1051,7 +1051,10 @@ class Series(NDFrame): else: # TODO - support limited ops on strings https://github.com/elastic/eland/issues/65 - raise TypeError("Unsupported operation: '{}' {} '{}'".format(self._dtype, method_name, right._dtype)) + raise TypeError( + "unsupported operation type(s) ['{}'] for operands ['{}' with dtype '{}', '{}']" + .format(method_name, type(self), self._dtype, type(right).__name__) + ) # check left number and right numeric series elif np.issubdtype(np.dtype(type(right)), np.number) and np.issubdtype(self._dtype, np.number): @@ -1085,7 +1088,8 @@ class Series(NDFrame): else: # TODO - support limited ops on strings https://github.com/elastic/eland/issues/65 raise TypeError( - "unsupported operand type(s) for '{}' {} '{}'".format(type(self), method_name, type(right)) + "unsupported operation type(s) ['{}'] for operands ['{}' with dtype '{}', '{}']" + .format(method_name, type(self), self._dtype, type(right).__name__) ) def _numeric_rop(self, left, method_name, op_type=None): @@ -1127,7 +1131,8 @@ class Series(NDFrame): else: # TODO - support limited ops on strings https://github.com/elastic/eland/issues/65 raise TypeError( - "unsupported operand type(s) for '{}' {} '{}'".format(type(self), method_name, type(left)) + "unsupported operation type(s) ['{}'] for operands ['{}' with dtype '{}', '{}']" + .format(op_method_name, type(self), self._dtype, type(left).__name__) ) def max(self): diff --git a/eland/tests/__init__.py b/eland/tests/__init__.py index 04a4ab6..46a924c 100644 --- a/eland/tests/__init__.py +++ b/eland/tests/__init__.py @@ -147,7 +147,7 @@ ECOMMERCE_MAPPING = {"mappings": { } }, "customer_gender": { - "type": "keyword" + "type": "text" }, "customer_id": { "type": "keyword" diff --git a/eland/tests/dataframe/test_nunique_pytest.py b/eland/tests/dataframe/test_nunique_pytest.py index e160c6a..fc3cf6e 100644 --- a/eland/tests/dataframe/test_nunique_pytest.py +++ b/eland/tests/dataframe/test_nunique_pytest.py @@ -22,7 +22,7 @@ class TestDataFrameNUnique(TestData): # assert_series_equal(pd_nunique, ed_nunique) def test_ecommerce_nunique(self): - columns = ['customer_first_name', 'customer_gender', 'day_of_week_i'] + columns = ['customer_first_name', 'customer_last_name', 'day_of_week_i'] pd_ecommerce = self.pd_ecommerce()[columns] ed_ecommerce = self.ed_ecommerce()[columns] diff --git a/eland/tests/mappings/test_aggregatables_pytest.py b/eland/tests/mappings/test_aggregatables_pytest.py index a2f7111..84f19a4 100644 --- a/eland/tests/mappings/test_aggregatables_pytest.py +++ b/eland/tests/mappings/test_aggregatables_pytest.py @@ -15,7 +15,6 @@ class TestMappingsAggregatables(TestData): 'customer_birth_date': 'customer_birth_date', 'customer_first_name.keyword': 'customer_first_name', 'customer_full_name.keyword': 'customer_full_name', - 'customer_gender': 'customer_gender', 'customer_id': 'customer_id', 'customer_last_name.keyword': 'customer_last_name', 'customer_phone': 'customer_phone', diff --git a/eland/tests/series/test_str_arithmetics_pytest.py b/eland/tests/series/test_str_arithmetics_pytest.py index 682d9ac..d5b8341 100644 --- a/eland/tests/series/test_str_arithmetics_pytest.py +++ b/eland/tests/series/test_str_arithmetics_pytest.py @@ -38,3 +38,19 @@ class TestSeriesArithmetics(TestData): pdadd = "The last name is: " + self.pd_ecommerce()['customer_last_name'] assert_pandas_eland_series_equal(pdadd, edadd) + + def test_non_aggregatable_add_str(self): + with pytest.raises(ValueError): + assert self.ed_ecommerce()['customer_gender'] + "is the gender" + + def teststr_add_non_aggregatable(self): + with pytest.raises(ValueError): + assert "The gender is: " + self.ed_ecommerce()['customer_gender'] + + def test_non_aggregatable_add_aggregatable(self): + with pytest.raises(ValueError): + assert self.ed_ecommerce()['customer_gender'] + self.ed_ecommerce()['customer_first_name'] + + def test_aggregatable_add_non_aggregatable(self): + with pytest.raises(ValueError): + assert self.ed_ecommerce()['customer_first_name'] + self.ed_ecommerce()['customer_gender'] diff --git a/eland/tests/series/test_value_counts_pytest.py b/eland/tests/series/test_value_counts_pytest.py index 20187e8..97effbf 100644 --- a/eland/tests/series/test_value_counts_pytest.py +++ b/eland/tests/series/test_value_counts_pytest.py @@ -45,3 +45,8 @@ class TestSeriesValueCounts(TestData): ed_s = self.ed_flights()['Carrier'] with pytest.raises(ValueError): assert ed_s.value_counts(es_size=-9) + + def test_value_counts_non_aggregatable(self): + ed_s = self.ed_ecommerce()['customer_gender'] + with pytest.raises(ValueError): + assert ed_s.value_counts()