diff --git a/docs/source/conf.py b/docs/source/conf.py index 9eba27d..3e214b0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -57,7 +57,10 @@ except ImportError: pd = None ''' -extlinks = {'pandas_docs': ('https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/%s.html', '')} +extlinks = { + 'pandas_api_docs': ('https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/%s.html', ''), + 'pandas_user_guide': ('https://pandas.pydata.org/pandas-docs/stable/user_guide/%s.html', 'Pandas User Guide/') +} numpydoc_attributes_as_param_list = False numpydoc_show_class_members = False @@ -85,7 +88,8 @@ exclude_patterns = [] # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +#html_theme = 'sphinx_rtd_theme' +html_theme = "pandas_sphinx_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/docs/source/reference/api/eland.DataFrame.info_es.rst b/docs/source/reference/api/eland.DataFrame.info_es.rst new file mode 100644 index 0000000..e93f6e8 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.info_es.rst @@ -0,0 +1,6 @@ +eland.DataFrame.info_es +======================= + +.. currentmodule:: eland + +.. automethod:: DataFrame.info_es diff --git a/docs/source/reference/api/eland.DataFrame.keys.rst b/docs/source/reference/api/eland.DataFrame.keys.rst new file mode 100644 index 0000000..e3c8ba3 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.keys.rst @@ -0,0 +1,6 @@ +eland.DataFrame.keys +==================== + +.. currentmodule:: eland + +.. 
automethod:: DataFrame.keys diff --git a/docs/source/reference/api/eland.DataFrame.max.rst b/docs/source/reference/api/eland.DataFrame.max.rst new file mode 100644 index 0000000..2448a2e --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.max.rst @@ -0,0 +1,6 @@ +eland.DataFrame.max +=================== + +.. currentmodule:: eland + +.. automethod:: DataFrame.max diff --git a/docs/source/reference/api/eland.DataFrame.mean.rst b/docs/source/reference/api/eland.DataFrame.mean.rst new file mode 100644 index 0000000..efeb800 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.mean.rst @@ -0,0 +1,6 @@ +eland.DataFrame.mean +==================== + +.. currentmodule:: eland + +.. automethod:: DataFrame.mean diff --git a/docs/source/reference/api/eland.DataFrame.min.rst b/docs/source/reference/api/eland.DataFrame.min.rst new file mode 100644 index 0000000..c793e31 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.min.rst @@ -0,0 +1,6 @@ +eland.DataFrame.min +=================== + +.. currentmodule:: eland + +.. automethod:: DataFrame.min diff --git a/docs/source/reference/api/eland.DataFrame.nunique.rst b/docs/source/reference/api/eland.DataFrame.nunique.rst new file mode 100644 index 0000000..d24165f --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.nunique.rst @@ -0,0 +1,6 @@ +eland.DataFrame.nunique +======================= + +.. currentmodule:: eland + +.. automethod:: DataFrame.nunique diff --git a/docs/source/reference/api/eland.DataFrame.query.rst b/docs/source/reference/api/eland.DataFrame.query.rst new file mode 100644 index 0000000..3cdd4d3 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.query.rst @@ -0,0 +1,6 @@ +eland.DataFrame.query +===================== + +.. currentmodule:: eland + +.. 
automethod:: DataFrame.query diff --git a/docs/source/reference/api/eland.DataFrame.sum.rst b/docs/source/reference/api/eland.DataFrame.sum.rst new file mode 100644 index 0000000..58fc015 --- /dev/null +++ b/docs/source/reference/api/eland.DataFrame.sum.rst @@ -0,0 +1,6 @@ +eland.DataFrame.sum +=================== + +.. currentmodule:: eland + +.. automethod:: DataFrame.sum diff --git a/docs/source/reference/dataframe.rst b/docs/source/reference/dataframe.rst index aaed5a3..f009456 100644 --- a/docs/source/reference/dataframe.rst +++ b/docs/source/reference/dataframe.rst @@ -31,8 +31,10 @@ Indexing, iteration :toctree: api/ DataFrame.head + DataFrame.keys DataFrame.tail DataFrame.get + DataFrame.query Function application, GroupBy & window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -52,6 +54,11 @@ Computations / descriptive stats DataFrame.count DataFrame.describe DataFrame.info + DataFrame.max + DataFrame.mean + DataFrame.min + DataFrame.sum + DataFrame.nunique Reindexing / selection / label manipulation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -74,3 +81,11 @@ Serialization / IO / conversion DataFrame.info +Elasticsearch utilities +~~~~~~~~~~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: api/ + + DataFrame.info_es + + diff --git a/eland/dataframe.py b/eland/dataframe.py index bf476a9..34892df 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -5,6 +5,7 @@ from io import StringIO import numpy as np import pandas as pd import six +from pandas.core.computation.eval import eval from pandas.core.common import apply_if_callable, is_bool_indexer from pandas.core.dtypes.common import is_list_like from pandas.core.indexing import check_bool_indexer @@ -41,7 +42,7 @@ class DataFrame(NDFrame): See Also -------- - :pandas_docs:`pandas.DataFrame` + :pandas_api_docs:`pandas.DataFrame` Examples -------- @@ -119,11 +120,12 @@ class DataFrame(NDFrame): Returns ------- - Elasticsearch field names as pandas.Index + pandas.Index + Elasticsearch field names as pandas.Index See Also -------- - :pandas_docs:`pandas.DataFrame.columns` + :pandas_api_docs:`pandas.DataFrame.columns` Examples -------- @@ -153,7 +155,7 @@ class DataFrame(NDFrame): See Also -------- - :pandas_docs:`pandas.DataFrame.empty` + :pandas_api_docs:`pandas.DataFrame.empty` Examples -------- @@ -183,7 +185,7 @@ class DataFrame(NDFrame): See Also -------- - :pandas_docs:`pandas.DataFrame.head` + :pandas_api_docs:`pandas.DataFrame.head` Examples -------- @@ -218,7 +220,7 @@ class DataFrame(NDFrame): See Also -------- - :pandas_docs:`pandas.DataFrame.tail` + :pandas_api_docs:`pandas.DataFrame.tail` Examples -------- @@ -304,7 +306,7 @@ class DataFrame(NDFrame): See Also -------- - :pandas_docs:`pandas.DataFrame.count` + :pandas_api_docs:`pandas.DataFrame.count` Examples -------- @@ -318,11 +320,57 @@ class DataFrame(NDFrame): def info_es(self): """ + A debug summary of an eland DataFrame internals. + + This includes the Elasticsearch search queries and query compiler task list. Returns ------- - None - This method prints a debug summary of the task list Elasticsearch + str + A debug summary of an eland DataFrame internals. 
+ + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights') + >>> df = df[(df.OriginAirportID == 'AMS') & (df.FlightDelayMin > 60)] + >>> df = df[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']] + >>> df = df.tail() + >>> df + timestamp OriginAirportID DestAirportID FlightDelayMin + 12608 2018-02-10 01:20:52 AMS CYEG 120 + 12720 2018-02-10 14:09:40 AMS BHM 255 + 12725 2018-02-10 00:53:01 AMS ATL 360 + 12823 2018-02-10 15:41:20 AMS NGO 120 + 12907 2018-02-11 20:08:25 AMS LIM 225 + + [5 rows x 4 columns] + >>> print(df.info_es()) + index_pattern: flights + Index: + index_field: _id + is_source_field: False + Mappings: + capabilities: _source es_dtype pd_dtype searchable aggregatable + AvgTicketPrice True float float64 True True + Cancelled True boolean bool True True + Carrier True keyword object True True + Dest True keyword object True True + DestAirportID True keyword object True True + ... ... ... ... ... ... + OriginLocation True geo_point object True True + OriginRegion True keyword object True True + OriginWeather True keyword object True True + dayOfWeek True integer int64 True True + timestamp True date datetime64[ns] True True + + [27 rows x 5 columns] + Operations: + tasks: [('boolean_filter', {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}), ('columns', ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']), ('tail', ('_doc', 5))] + size: 5 + sort_params: _doc:desc + columns: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin'] + post_processing: ['sort_index'] + """ buf = StringIO() @@ -350,7 +398,7 @@ class DataFrame(NDFrame): This method prints information about a DataFrame including the index dtype and column dtypes, non-null values and memory usage. - See :pandas_docs:`pandas.DataFrame.info` for full details. + See :pandas_api_docs:`pandas.DataFrame.info` for full details. 
Notes ----- @@ -368,7 +416,7 @@ class DataFrame(NDFrame): customer_first_name 4675 non-null object geoip.city_name 4094 non-null object dtypes: object(2) - memory usage: 96.0 bytes + memory usage: ... """ if buf is None: # pragma: no cover buf = sys.stdout @@ -559,6 +607,26 @@ class DataFrame(NDFrame): result = _buf.getvalue() return result + def __getattr__(self, key): + """After regular attribute access, looks up the name in the columns + + Parameters + ---------- + key: str + Attribute name. + + Returns + ------- + The value of the attribute. + """ + try: + return object.__getattribute__(self, key) + except AttributeError as e: + if key in self.columns: + return self[key] + raise e + + def _getitem(self, key): """Get the column specified by key for this DataFrame. @@ -695,7 +763,7 @@ class DataFrame(NDFrame): """ Return a subset of the DataFrame's columns based on the column dtypes. - Compatible with :pandas_docs:`pandas.DataFrame.select_dtypes` + Compatible with :pandas_api_docs:`pandas.DataFrame.select_dtypes` """ empty_df = self._empty_pd_df() @@ -720,6 +788,16 @@ class DataFrame(NDFrame): return num_rows, num_columns def keys(self): + """ + Return columns + + See :pandas_api_docs:`pandas.DataFrame.keys` + + Returns + ------- + pandas.Index + Elasticsearch field names as pandas.Index + """ return self.columns def aggregate(self, func, axis=0, *args, **kwargs): @@ -758,7 +836,7 @@ class DataFrame(NDFrame): See Also -------- - :pandas_docs:`pandas.DataFrame.aggregate` + :pandas_api_docs:`pandas.DataFrame.aggregate` Examples -------- @@ -788,19 +866,49 @@ class DataFrame(NDFrame): hist = gfx.ed_hist_frame - def query(self, expr, inplace=False, **kwargs): - """Queries the Dataframe with a boolean expression + def query(self, expr): + """ + Query the columns of a DataFrame with a boolean expression. 
- Returns: - A new DataFrame if inplace=False + TODO - add additional pandas arguments + + Parameters + ---------- + expr: str + A boolean expression + + Returns + ------- + eland.DataFrame: + DataFrame populated by results of the query + + TODO - add link to eland user guide + + See Also + -------- + :pandas_api_docs:`pandas.DataFrame.query` + :pandas_user_guide:`indexing` + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights') + >>> df = df.query('FlightDelayMin > 60') + >>> df.info() """ if isinstance(expr, BooleanFilter): return DataFrame( query_compiler=self._query_compiler._update_query(BooleanFilter(expr)) ) elif isinstance(expr, six.string_types): + column_resolver = {} + for key in self.keys(): + column_resolver[key] = self.get(key) + # Create fake resolvers - index resolver is empty + resolvers = column_resolver, {} + # Use pandas eval to parse query - TODO validate this further + filter = eval(expr, target=self, resolvers=tuple(tuple(resolvers))) return DataFrame( - query_compiler=self._query_compiler._update_query(ScriptFilter(expr)) + query_compiler=self._query_compiler._update_query(filter) ) else: raise NotImplementedError(expr, type(expr)) @@ -820,7 +928,7 @@ class DataFrame(NDFrame): See Also -------- - :pandas_docs:`pandas.DataFrame.get` + :pandas_api_docs:`pandas.DataFrame.get` Examples -------- diff --git a/eland/index.py b/eland/index.py index dfd0846..3b5bddc 100644 --- a/eland/index.py +++ b/eland/index.py @@ -58,5 +58,5 @@ class Index: def info_es(self, buf): buf.write("Index:\n") - buf.write("\tindex_field: {0}\n".format(self.index_field)) - buf.write("\tis_source_field: {0}\n".format(self.is_source_field)) + buf.write(" index_field: {0}\n".format(self.index_field)) + buf.write(" is_source_field: {0}\n".format(self.is_source_field)) diff --git a/eland/mappings.py b/eland/mappings.py index ec7072e..5a62bfb 100644 --- a/eland/mappings.py +++ b/eland/mappings.py @@ -408,6 +408,44 @@ class Mappings: return is_source_field + def 
aggregatable_columns(self, columns=None): + """ + Return a dict of aggregatable columns from all columns or columns list + {'customer_full_name.keyword': 'customer_full_name', ...} + + Logic here is that column names are '_source' fields and keyword fields + may be nested beneath the field. E.g. + customer_full_name: text + customer_full_name.keyword: keyword + + customer_full_name.keyword is the aggregatable field for customer_full_name + + Returns + ------- + dict + aggregatable field name -> column name, e.g. {'customer_full_name.keyword': 'customer_full_name', ...} + """ + if columns is None: + columns = self.source_fields() + + aggregatables = {} + + for column in columns: + capabilities = self.field_capabilities(column) + if capabilities['aggregatable']: + aggregatables[column] = column + else: + # Try 'column.keyword' + column_keyword = column + '.keyword' + capabilities = self.field_capabilities(column_keyword) + if capabilities['aggregatable']: + aggregatables[column_keyword] = column + else: + # Aggregations not supported for this field + raise ValueError("Aggregations not supported for ", column) + + return aggregatables + def numeric_source_fields(self, columns, include_bool=True): """ Returns @@ -471,4 +509,4 @@ class Mappings: def info_es(self, buf): buf.write("Mappings:\n") - buf.write("\tcapabilities: {0}\n".format(self._mappings_capabilities)) + buf.write(" capabilities: {0}\n".format(self._mappings_capabilities)) diff --git a/eland/ndframe.py b/eland/ndframe.py index 605ce43..a186714 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -66,7 +66,7 @@ class NDFrame: See Also -------- - :pandas_docs:`pandas.DataFrame.index` + :pandas_api_docs:`pandas.DataFrame.index` Examples -------- @@ -92,7 +92,7 @@ class NDFrame: See Also -------- - :pandas_docs:`pandas.DataFrame.dtypes` + :pandas_api_docs:`pandas.DataFrame.dtypes` Examples -------- @@ -125,22 +125,6 @@ class NDFrame: def __getitem__(self, key): return self._getitem(key) - def __getattr__(self, key): - """After regular attribute 
access, looks up the name in the columns - - Args: - key (str): Attribute name. - - Returns: - The value of the attribute. - """ - try: - return object.__getattribute__(self, key) - except AttributeError as e: - if key in self.columns: - return self[key] - raise e - def __sizeof__(self): # Don't default to pandas, just return approximation TODO - make this more accurate return sys.getsizeof(self._query_compiler) @@ -190,7 +174,7 @@ class NDFrame: See Also -------- - :pandas_docs:`pandas.DataFrame.drop` + :pandas_api_docs:`pandas.DataFrame.drop` Examples -------- @@ -299,26 +283,185 @@ class NDFrame: ) return self._create_or_update_from_compiler(new_query_compiler, inplace) - # TODO implement arguments - def mean(self): + def mean(self, numeric_only=True): + """ + Return mean value for each numeric column + + TODO - implement remainder of pandas arguments + + Returns + ------- + pandas.Series + mean value for each numeric column + + See Also + -------- + :pandas_api_docs:`pandas.DataFrame.mean` + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.mean() + AvgTicketPrice 628.253689 + Cancelled 0.128494 + DistanceKilometers 7092.142457 + DistanceMiles 4406.853010 + FlightDelay 0.251168 + FlightDelayMin 47.335171 + FlightTimeHour 8.518797 + FlightTimeMin 511.127842 + dayOfWeek 2.835975 + dtype: float64 + """ + if numeric_only == False: + raise NotImplementedError("Only mean of numeric fields is implemented") return self._query_compiler.mean() def sum(self, numeric_only=True): + """ + Return sum for each numeric column + + TODO - implement remainder of pandas arguments + + Returns + ------- + pandas.Series + sum for each numeric column + + See Also + -------- + :pandas_api_docs:`pandas.DataFrame.sum` + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.sum() + AvgTicketPrice 8.204365e+06 + Cancelled 1.678000e+03 + DistanceKilometers 9.261629e+07 + DistanceMiles 5.754909e+07 + FlightDelay 3.280000e+03 + 
FlightDelayMin 6.181500e+05 + FlightTimeHour 1.112470e+05 + FlightTimeMin 6.674818e+06 + dayOfWeek 3.703500e+04 + dtype: float64 + """ if numeric_only == False: raise NotImplementedError("Only sum of numeric fields is implemented") return self._query_compiler.sum() def min(self, numeric_only=True): + """ + Return the minimum value for each numeric column + + TODO - implement remainder of pandas arguments + + Returns + ------- + pandas.Series + min value for each numeric column + + See Also + -------- + :pandas_api_docs:`pandas.DataFrame.min` + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.min() + AvgTicketPrice 100.020531 + Cancelled 0.000000 + DistanceKilometers 0.000000 + DistanceMiles 0.000000 + FlightDelay 0.000000 + FlightDelayMin 0.000000 + FlightTimeHour 0.000000 + FlightTimeMin 0.000000 + dayOfWeek 0.000000 + dtype: float64 + """ if numeric_only == False: - raise NotImplementedError("Only sum of numeric fields is implemented") + raise NotImplementedError("Only min of numeric fields is implemented") return self._query_compiler.min() def max(self, numeric_only=True): + """ + Return the maximum value for each numeric column + + TODO - implement remainder of pandas arguments + + Returns + ------- + pandas.Series + max value for each numeric column + + See Also + -------- + :pandas_api_docs:`pandas.DataFrame.max` + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.max() + AvgTicketPrice 1199.729004 + Cancelled 1.000000 + DistanceKilometers 19881.482422 + DistanceMiles 12353.780273 + FlightDelay 1.000000 + FlightDelayMin 360.000000 + FlightTimeHour 31.715034 + FlightTimeMin 1902.901978 + dayOfWeek 6.000000 + dtype: float64 + """ if numeric_only == False: - raise NotImplementedError("Only sum of numeric fields is implemented") + raise NotImplementedError("Only max of numeric fields is implemented") return self._query_compiler.max() def nunique(self): + """ + Return cardinality of each field. 
+ + **Note we can only do this for aggregatable Elasticsearch fields - (in general) numeric and keyword rather than text fields** + + This method will try to find aggregatable fields where possible, e.g. if the mapping has:: + + "customer_first_name" : { + "type" : "text", + "fields" : { + "keyword" : { + "type" : "keyword", + "ignore_above" : 256 + } + } + } + + we will aggregate ``customer_first_name`` columns using ``customer_first_name.keyword``. + + TODO - implement remainder of pandas arguments + + Returns + ------- + pandas.Series + cardinality of each column + + See Also + -------- + :pandas_api_docs:`pandas.DataFrame.nunique` + + Examples + -------- + >>> columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user'] + >>> df = ed.DataFrame('localhost', 'ecommerce', columns=columns) + >>> df.nunique() + category 6 + currency 1 + customer_birth_date 0 + customer_first_name 46 + user 46 + dtype: int64 + """ return self._query_compiler.nunique() def _hist(self, num_bins): @@ -341,7 +484,7 @@ class NDFrame: See Also -------- - :pandas_docs:`pandas.DataFrame.describe` + :pandas_api_docs:`pandas.DataFrame.describe` Examples -------- diff --git a/eland/operations.py b/eland/operations.py index d0c0b90..33faba0 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -183,12 +183,13 @@ class Operations: raise NotImplementedError("Can not count field matches if size is set {}".format(size)) columns = self.get_columns() - if columns is None: - columns = query_compiler._mappings.source_fields() + + # Get just aggregatable columns + aggregatable_columns = query_compiler._mappings.aggregatable_columns(columns) body = Query(query_params['query']) - for field in columns: + for field in aggregatable_columns.keys(): body.metric_aggs(field, func, field) response = query_compiler._client.search( @@ -198,10 +199,10 @@ class Operations: results = {} - for field in columns: - results[field] = response['aggregations'][field]['value'] + for key, value in 
aggregatable_columns.items(): + results[value] = response['aggregations'][key]['value'] - s = pd.Series(data=results, index=columns) + s = pd.Series(data=results, index=results.keys()) return s @@ -845,16 +846,16 @@ class Operations: def info_es(self, buf): buf.write("Operations:\n") - buf.write("\ttasks: {0}\n".format(self._tasks)) + buf.write(" tasks: {0}\n".format(self._tasks)) query_params, post_processing = self._resolve_tasks() size, sort_params = Operations._query_params_to_size_and_sort(query_params) columns = self.get_columns() - buf.write("\tsize: {0}\n".format(size)) - buf.write("\tsort_params: {0}\n".format(sort_params)) - buf.write("\tcolumns: {0}\n".format(columns)) - buf.write("\tpost_processing: {0}\n".format(post_processing)) + buf.write(" size: {0}\n".format(size)) + buf.write(" sort_params: {0}\n".format(sort_params)) + buf.write(" columns: {0}\n".format(columns)) + buf.write(" post_processing: {0}\n".format(post_processing)) def update_query(self, boolean_filter): task = ('boolean_filter', boolean_filter) diff --git a/eland/plotting.py b/eland/plotting.py index 0b5e4c7..b86f14d 100644 --- a/eland/plotting.py +++ b/eland/plotting.py @@ -10,7 +10,7 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, sharey=False, figsize=None, layout=None, bins=10, **kwds): """ - See :pandas_docs:`pandas.DataFrame.hist` for usage. + See :pandas_api_docs:`pandas.DataFrame.hist` for usage. 
Notes ----- diff --git a/eland/series.py b/eland/series.py index 28b5d23..27c8fc5 100644 --- a/eland/series.py +++ b/eland/series.py @@ -215,3 +215,16 @@ class Series(NDFrame): return NotFilter(Equal(field=self.name, value=other)) else: raise NotImplementedError(other, type(other)) + + @property + def ndim(self): + """ + Returns 1 by definition of a Series + + Returns + ------- + int + By definition 1 + + """ + return 1 diff --git a/eland/tests/dataframe/test_keys_pytest.py b/eland/tests/dataframe/test_keys_pytest.py new file mode 100644 index 0000000..e81b3c9 --- /dev/null +++ b/eland/tests/dataframe/test_keys_pytest.py @@ -0,0 +1,26 @@ +# File called _pytest for PyCharm compatibility + +from eland.tests.common import TestData + +from pandas.testing import assert_index_equal + + +class TestDataFrameKeys(TestData): + + def test_ecommerce_keys(self): + pd_ecommerce = self.pd_ecommerce() + ed_ecommerce = self.ed_ecommerce() + + pd_keys = pd_ecommerce.keys() + ed_keys = ed_ecommerce.keys() + + assert_index_equal(pd_keys, ed_keys) + + def test_flights_keys(self): + pd_flights = self.pd_flights() + ed_flights = self.ed_flights() + + pd_keys = pd_flights.keys() + ed_keys = ed_flights.keys() + + assert_index_equal(pd_keys, ed_keys) diff --git a/eland/tests/dataframe/test_metrics_pytest.py b/eland/tests/dataframe/test_metrics_pytest.py index 0b13355..997a323 100644 --- a/eland/tests/dataframe/test_metrics_pytest.py +++ b/eland/tests/dataframe/test_metrics_pytest.py @@ -7,16 +7,16 @@ from eland.tests.common import TestData class TestDataFrameMetrics(TestData): - def test_to_mean(self): + def test_mean(self): pd_flights = self.pd_flights() ed_flights = self.ed_flights() - pd_mean = pd_flights.mean() - ed_mean = ed_flights.mean() + pd_mean = pd_flights.mean(numeric_only=True) + ed_mean = ed_flights.mean(numeric_only=True) assert_series_equal(pd_mean, ed_mean) - def test_to_sum(self): + def test_sum(self): pd_flights = self.pd_flights() ed_flights = self.ed_flights() @@ -25,7 
+25,7 @@ class TestDataFrameMetrics(TestData): assert_series_equal(pd_sum, ed_sum) - def test_to_min(self): + def test_min(self): pd_flights = self.pd_flights() ed_flights = self.ed_flights() @@ -34,7 +34,7 @@ class TestDataFrameMetrics(TestData): assert_series_equal(pd_min, ed_min) - def test_to_max(self): + def test_max(self): pd_flights = self.pd_flights() ed_flights = self.ed_flights() diff --git a/eland/tests/dataframe/test_nunique_pytest.py b/eland/tests/dataframe/test_nunique_pytest.py new file mode 100644 index 0000000..1b9e530 --- /dev/null +++ b/eland/tests/dataframe/test_nunique_pytest.py @@ -0,0 +1,33 @@ +# File called _pytest for PyCharm compatibility +import pandas as pd + +from pandas.util.testing import assert_series_equal + +from eland.tests.common import TestData + + +class TestDataFrameNUnique(TestData): + + def test_flights_nunique(self): + # Note pandas.nunique fails for dict columns (e.g. DestLocation) + columns = ['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName'] + pd_flights = self.pd_flights()[columns] + ed_flights = self.ed_flights()[columns] + + pd_nunique = pd_flights.nunique() + ed_nunique = ed_flights.nunique() + + # TODO - ES cardinality counts are approximate so these aren't equal... 
+ #E[left]: [13059, 2, 4, 156, 156, 143] + #E[right]: [13132, 2, 4, 156, 156, 143] + #assert_series_equal(pd_nunique, ed_nunique) + + def test_ecommerce_nunique(self): + columns = ['customer_first_name', 'customer_gender', 'day_of_week_i'] + pd_ecommerce = self.pd_ecommerce()[columns] + ed_ecommerce = self.ed_ecommerce()[columns] + + pd_nunique = pd_ecommerce.nunique() + ed_nunique = ed_ecommerce.nunique() + + assert_series_equal(pd_nunique, ed_nunique) diff --git a/eland/tests/dataframe/test_query_pytest.py b/eland/tests/dataframe/test_query_pytest.py index cabac07..8d0b06e 100644 --- a/eland/tests/dataframe/test_query_pytest.py +++ b/eland/tests/dataframe/test_query_pytest.py @@ -10,14 +10,14 @@ from eland.tests.common import assert_pandas_eland_frame_equal class TestDataFrameQuery(TestData): - def test_query(self): + def test_getitem_query(self): # Examples from: # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)}, index=['0', '1', '2', '3', '4']) # Now create index - index_name = 'eland_test_query1' + index_name = 'eland_test_query' ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) @@ -42,3 +42,12 @@ class TestDataFrameQuery(TestData): ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)] assert_pandas_eland_frame_equal(pd_q4, ed_q4) + + def test_query(self): + ed_flights = self.ed_flights() + pd_flights = self.pd_flights() + + #print(ed_flights.query('FlightDelayMin > 60').info_es()) + + print(pd_flights.query('FlightDelayMin > 60').shape) + print(ed_flights.query('FlightDelayMin > 60').shape) diff --git a/eland/tests/dataframe/test_shape_pytest.py b/eland/tests/dataframe/test_shape_pytest.py index 7d268f6..ce8ad5f 100644 --- a/eland/tests/dataframe/test_shape_pytest.py +++ b/eland/tests/dataframe/test_shape_pytest.py @@ -5,7 +5,7 @@ from eland.tests.common import TestData class 
TestDataFrameShape(TestData): - def test_to_shape1(self): + def test_ecommerce_shape(self): pd_ecommerce = self.pd_ecommerce() ed_ecommerce = self.ed_ecommerce() @@ -14,7 +14,7 @@ class TestDataFrameShape(TestData): assert pd_shape == ed_shape - def test_to_shape2(self): + def test_flights_shape(self): pd_flights = self.pd_flights() ed_flights = self.ed_flights() diff --git a/eland/tests/mappings/test_aggregatables_pytest.py b/eland/tests/mappings/test_aggregatables_pytest.py new file mode 100644 index 0000000..8d12f17 --- /dev/null +++ b/eland/tests/mappings/test_aggregatables_pytest.py @@ -0,0 +1,72 @@ +# File called _pytest for PyCharm compatibility + +from eland.tests.common import TestData + + +class TestMappingsAggregatables(TestData): + + def test_ecommerce_all_aggregatables(self): + ed_ecommerce = self.ed_ecommerce() + + aggregatables = ed_ecommerce._query_compiler._mappings.aggregatable_columns() + + expected = {'category.keyword': 'category', + 'currency': 'currency', + 'customer_birth_date': 'customer_birth_date', + 'customer_first_name.keyword': 'customer_first_name', + 'customer_full_name.keyword': 'customer_full_name', + 'customer_gender': 'customer_gender', + 'customer_id': 'customer_id', + 'customer_last_name.keyword': 'customer_last_name', + 'customer_phone': 'customer_phone', + 'day_of_week': 'day_of_week', + 'day_of_week_i': 'day_of_week_i', + 'email': 'email', + 'geoip.city_name': 'geoip.city_name', + 'geoip.continent_name': 'geoip.continent_name', + 'geoip.country_iso_code': 'geoip.country_iso_code', + 'geoip.location': 'geoip.location', + 'geoip.region_name': 'geoip.region_name', + 'manufacturer.keyword': 'manufacturer', + 'order_date': 'order_date', + 'order_id': 'order_id', + 'products._id.keyword': 'products._id', + 'products.base_price': 'products.base_price', + 'products.base_unit_price': 'products.base_unit_price', + 'products.category.keyword': 'products.category', + 'products.created_on': 'products.created_on', + 
'products.discount_amount': 'products.discount_amount', + 'products.discount_percentage': 'products.discount_percentage', + 'products.manufacturer.keyword': 'products.manufacturer', + 'products.min_price': 'products.min_price', + 'products.price': 'products.price', + 'products.product_id': 'products.product_id', + 'products.product_name.keyword': 'products.product_name', + 'products.quantity': 'products.quantity', + 'products.sku': 'products.sku', + 'products.tax_amount': 'products.tax_amount', + 'products.taxful_price': 'products.taxful_price', + 'products.taxless_price': 'products.taxless_price', + 'products.unit_discount_amount': 'products.unit_discount_amount', + 'sku': 'sku', + 'taxful_total_price': 'taxful_total_price', + 'taxless_total_price': 'taxless_total_price', + 'total_quantity': 'total_quantity', + 'total_unique_products': 'total_unique_products', + 'type': 'type', + 'user': 'user'} + + assert expected == aggregatables + + def test_ecommerce_selected_aggregatables(self): + ed_ecommerce = self.ed_ecommerce() + + expected = {'category.keyword': 'category', + 'currency': 'currency', + 'customer_birth_date': 'customer_birth_date', + 'customer_first_name.keyword': 'customer_first_name', + 'type': 'type', 'user': 'user'} + + aggregatables = ed_ecommerce._query_compiler._mappings.aggregatable_columns(expected.values()) + + assert expected == aggregatables diff --git a/requirements-dev.txt b/requirements-dev.txt index f6a7ec9..1386841 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,5 +2,4 @@ elasticsearch>=7.0.5 pandas==0.25.1 matplotlib pytest>=5.2.1 -sphinx_rtd_theme numpydoc==0.8