Merge pull request #37 from stevedodson/master

Resolve DataFrame.query issues + more docs
This commit is contained in:
stevedodson 2019-11-14 21:07:11 +01:00 committed by GitHub
commit dc2b1acbc2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 580 additions and 71 deletions

View File

@ -57,7 +57,10 @@ except ImportError:
pd = None pd = None
''' '''
extlinks = {'pandas_docs': ('https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/%s.html', '')} extlinks = {
'pandas_api_docs': ('https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/%s.html', ''),
'pandas_user_guide': ('https://pandas.pydata.org/pandas-docs/stable/user_guide/%s.html', 'Pandas User Guide/')
}
numpydoc_attributes_as_param_list = False numpydoc_attributes_as_param_list = False
numpydoc_show_class_members = False numpydoc_show_class_members = False
@ -85,7 +88,8 @@ exclude_patterns = []
# The theme to use for HTML and HTML Help pages. See the documentation for # The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes. # a list of builtin themes.
# #
html_theme = 'sphinx_rtd_theme' #html_theme = 'sphinx_rtd_theme'
html_theme = "pandas_sphinx_theme"
# Add any paths that contain custom static files (such as style sheets) here, # Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files, # relative to this directory. They are copied after the builtin static files,

View File

@ -0,0 +1,6 @@
eland.DataFrame.info_es
=======================
.. currentmodule:: eland
.. automethod:: DataFrame.info_es

View File

@ -0,0 +1,6 @@
eland.DataFrame.keys
====================
.. currentmodule:: eland
.. automethod:: DataFrame.keys

View File

@ -0,0 +1,6 @@
eland.DataFrame.max
===================
.. currentmodule:: eland
.. automethod:: DataFrame.max

View File

@ -0,0 +1,6 @@
eland.DataFrame.mean
====================
.. currentmodule:: eland
.. automethod:: DataFrame.mean

View File

@ -0,0 +1,6 @@
eland.DataFrame.min
===================
.. currentmodule:: eland
.. automethod:: DataFrame.min

View File

@ -0,0 +1,6 @@
eland.DataFrame.nunique
=======================
.. currentmodule:: eland
.. automethod:: DataFrame.nunique

View File

@ -0,0 +1,6 @@
eland.DataFrame.query
=====================
.. currentmodule:: eland
.. automethod:: DataFrame.query

View File

@ -0,0 +1,6 @@
eland.DataFrame.sum
===================
.. currentmodule:: eland
.. automethod:: DataFrame.sum

View File

@ -31,8 +31,10 @@ Indexing, iteration
:toctree: api/ :toctree: api/
DataFrame.head DataFrame.head
DataFrame.keys
DataFrame.tail DataFrame.tail
DataFrame.get DataFrame.get
DataFrame.query
Function application, GroupBy & window Function application, GroupBy & window
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -52,6 +54,11 @@ Computations / descriptive stats
DataFrame.count DataFrame.count
DataFrame.describe DataFrame.describe
DataFrame.info DataFrame.info
DataFrame.max
DataFrame.mean
DataFrame.min
DataFrame.sum
DataFrame.nunique
Reindexing / selection / label manipulation Reindexing / selection / label manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -74,3 +81,11 @@ Serialization / IO / conversion
DataFrame.info DataFrame.info
Elasticsearch utilities
~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
DataFrame.info_es

View File

@ -5,6 +5,7 @@ from io import StringIO
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import six import six
from pandas.core.computation.eval import eval
from pandas.core.common import apply_if_callable, is_bool_indexer from pandas.core.common import apply_if_callable, is_bool_indexer
from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.common import is_list_like
from pandas.core.indexing import check_bool_indexer from pandas.core.indexing import check_bool_indexer
@ -41,7 +42,7 @@ class DataFrame(NDFrame):
See Also See Also
-------- --------
:pandas_docs:`pandas.DataFrame` :pandas_api_docs:`pandas.DataFrame`
Examples Examples
-------- --------
@ -119,11 +120,12 @@ class DataFrame(NDFrame):
Returns Returns
------- -------
pandas.Index
Elasticsearch field names as pandas.Index Elasticsearch field names as pandas.Index
See Also See Also
-------- --------
:pandas_docs:`pandas.DataFrame.columns` :pandas_api_docs:`pandas.DataFrame.columns`
Examples Examples
-------- --------
@ -153,7 +155,7 @@ class DataFrame(NDFrame):
See Also See Also
-------- --------
:pandas_docs:`pandas.DataFrame.empty` :pandas_api_docs:`pandas.DataFrame.empty`
Examples Examples
-------- --------
@ -183,7 +185,7 @@ class DataFrame(NDFrame):
See Also See Also
-------- --------
:pandas_docs:`pandas.DataFrame.head` :pandas_api_docs:`pandas.DataFrame.head`
Examples Examples
-------- --------
@ -218,7 +220,7 @@ class DataFrame(NDFrame):
See Also See Also
-------- --------
:pandas_docs:`pandas.DataFrame.tail` :pandas_api_docs:`pandas.DataFrame.tail`
Examples Examples
-------- --------
@ -304,7 +306,7 @@ class DataFrame(NDFrame):
See Also See Also
-------- --------
:pandas_docs:`pandas.DataFrame.count` :pandas_api_docs:`pandas.DataFrame.count`
Examples Examples
-------- --------
@ -318,11 +320,57 @@ class DataFrame(NDFrame):
def info_es(self): def info_es(self):
""" """
A debug summary of an eland DataFrame internals.
This includes the Elasticsearch search queries and query compiler task list.
Returns Returns
------- -------
None str
This method prints a debug summary of the task list Elasticsearch A debug summary of an eland DataFrame internals.
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df = df[(df.OriginAirportID == 'AMS') & (df.FlightDelayMin > 60)]
>>> df = df[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']]
>>> df = df.tail()
>>> df
timestamp OriginAirportID DestAirportID FlightDelayMin
12608 2018-02-10 01:20:52 AMS CYEG 120
12720 2018-02-10 14:09:40 AMS BHM 255
12725 2018-02-10 00:53:01 AMS ATL 360
12823 2018-02-10 15:41:20 AMS NGO 120
12907 2018-02-11 20:08:25 AMS LIM 225
<BLANKLINE>
[5 rows x 4 columns]
>>> print(df.info_es())
index_pattern: flights
Index:
index_field: _id
is_source_field: False
Mappings:
capabilities: _source es_dtype pd_dtype searchable aggregatable
AvgTicketPrice True float float64 True True
Cancelled True boolean bool True True
Carrier True keyword object True True
Dest True keyword object True True
DestAirportID True keyword object True True
... ... ... ... ... ...
OriginLocation True geo_point object True True
OriginRegion True keyword object True True
OriginWeather True keyword object True True
dayOfWeek True integer int64 True True
timestamp True date datetime64[ns] True True
<BLANKLINE>
[27 rows x 5 columns]
Operations:
tasks: [('boolean_filter', {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}), ('columns', ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']), ('tail', ('_doc', 5))]
size: 5
sort_params: _doc:desc
columns: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
post_processing: ['sort_index']
<BLANKLINE>
""" """
buf = StringIO() buf = StringIO()
@ -350,7 +398,7 @@ class DataFrame(NDFrame):
This method prints information about a DataFrame including This method prints information about a DataFrame including
the index dtype and column dtypes, non-null values and memory usage. the index dtype and column dtypes, non-null values and memory usage.
See :pandas_docs:`pandas.DataFrame.info` for full details. See :pandas_api_docs:`pandas.DataFrame.info` for full details.
Notes Notes
----- -----
@ -368,7 +416,7 @@ class DataFrame(NDFrame):
customer_first_name 4675 non-null object customer_first_name 4675 non-null object
geoip.city_name 4094 non-null object geoip.city_name 4094 non-null object
dtypes: object(2) dtypes: object(2)
memory usage: 96.0 bytes memory usage: ...
""" """
if buf is None: # pragma: no cover if buf is None: # pragma: no cover
buf = sys.stdout buf = sys.stdout
@ -559,6 +607,26 @@ class DataFrame(NDFrame):
result = _buf.getvalue() result = _buf.getvalue()
return result return result
def __getattr__(self, key):
"""After regular attribute access, looks up the name in the columns
Parameters
----------
key: str
Attribute name.
Returns
-------
The value of the attribute.
"""
try:
return object.__getattribute__(self, key)
except AttributeError as e:
if key in self.columns:
return self[key]
raise e
def _getitem(self, key): def _getitem(self, key):
"""Get the column specified by key for this DataFrame. """Get the column specified by key for this DataFrame.
@ -695,7 +763,7 @@ class DataFrame(NDFrame):
""" """
Return a subset of the DataFrame's columns based on the column dtypes. Return a subset of the DataFrame's columns based on the column dtypes.
Compatible with :pandas_docs:`pandas.DataFrame.select_dtypes` Compatible with :pandas_api_docs:`pandas.DataFrame.select_dtypes`
""" """
empty_df = self._empty_pd_df() empty_df = self._empty_pd_df()
@ -720,6 +788,16 @@ class DataFrame(NDFrame):
return num_rows, num_columns return num_rows, num_columns
def keys(self): def keys(self):
"""
Return columns
See :pandas_api_docs:`pandas.DataFrame.keys`
Returns
-------
pandas.Index
Elasticsearch field names as pandas.Index
"""
return self.columns return self.columns
def aggregate(self, func, axis=0, *args, **kwargs): def aggregate(self, func, axis=0, *args, **kwargs):
@ -758,7 +836,7 @@ class DataFrame(NDFrame):
See Also See Also
-------- --------
:pandas_docs:`pandas.DataFrame.aggregate` :pandas_api_docs:`pandas.DataFrame.aggregate`
Examples Examples
-------- --------
@ -788,19 +866,49 @@ class DataFrame(NDFrame):
hist = gfx.ed_hist_frame hist = gfx.ed_hist_frame
def query(self, expr, inplace=False, **kwargs): def query(self, expr):
"""Queries the Dataframe with a boolean expression """
Query the columns of a DataFrame with a boolean expression.
Returns: TODO - add additional pandas arguments
A new DataFrame if inplace=False
Parameters
----------
expr: str
A boolean expression
Returns
-------
eland.DataFrame:
DataFrame populated by results of the query
TODO - add link to eland user guide
See Also
--------
:pandas_api_docs:`pandas.DataFrame.query`
:pandas_user_guide:`indexing`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df = df.query('FlightDelayMin > 60')
>>> df.info()
""" """
if isinstance(expr, BooleanFilter): if isinstance(expr, BooleanFilter):
return DataFrame( return DataFrame(
query_compiler=self._query_compiler._update_query(BooleanFilter(expr)) query_compiler=self._query_compiler._update_query(BooleanFilter(expr))
) )
elif isinstance(expr, six.string_types): elif isinstance(expr, six.string_types):
column_resolver = {}
for key in self.keys():
column_resolver[key] = self.get(key)
# Create fake resolvers - index resolver is empty
resolvers = column_resolver, {}
# Use pandas eval to parse query - TODO validate this further
filter = eval(expr, target=self, resolvers=tuple(tuple(resolvers)))
return DataFrame( return DataFrame(
query_compiler=self._query_compiler._update_query(ScriptFilter(expr)) query_compiler=self._query_compiler._update_query(filter)
) )
else: else:
raise NotImplementedError(expr, type(expr)) raise NotImplementedError(expr, type(expr))
@ -820,7 +928,7 @@ class DataFrame(NDFrame):
See Also See Also
-------- --------
:pandas_docs:`pandas.DataFrame.get` :pandas_api_docs:`pandas.DataFrame.get`
Examples Examples
-------- --------

View File

@ -58,5 +58,5 @@ class Index:
def info_es(self, buf): def info_es(self, buf):
buf.write("Index:\n") buf.write("Index:\n")
buf.write("\tindex_field: {0}\n".format(self.index_field)) buf.write(" index_field: {0}\n".format(self.index_field))
buf.write("\tis_source_field: {0}\n".format(self.is_source_field)) buf.write(" is_source_field: {0}\n".format(self.is_source_field))

View File

@ -408,6 +408,44 @@ class Mappings:
return is_source_field return is_source_field
def aggregatable_columns(self, columns=None):
"""
Return a dict of aggregatable columns from all columns or columns list
{'customer_full_name': 'customer_full_name.keyword', ...}
Logic here is that column names are '_source' fields and keyword fields
may be nested beneath the field. E.g.
customer_full_name: text
customer_full_name.keyword: keyword
customer_full_name.keyword is the aggregatable field for customer_full_name
Returns
-------
dict
e.g. {'customer_full_name': 'customer_full_name.keyword', ...}
"""
if columns is None:
columns = self.source_fields()
aggregatables = {}
for column in columns:
capabilities = self.field_capabilities(column)
if capabilities['aggregatable']:
aggregatables[column] = column
else:
# Try 'column.keyword'
column_keyword = column + '.keyword'
capabilities = self.field_capabilities(column_keyword)
if capabilities['aggregatable']:
aggregatables[column_keyword] = column
else:
# Aggregations not supported for this field
raise ValueError("Aggregations not supported for ", column)
return aggregatables
def numeric_source_fields(self, columns, include_bool=True): def numeric_source_fields(self, columns, include_bool=True):
""" """
Returns Returns
@ -471,4 +509,4 @@ class Mappings:
def info_es(self, buf): def info_es(self, buf):
buf.write("Mappings:\n") buf.write("Mappings:\n")
buf.write("\tcapabilities: {0}\n".format(self._mappings_capabilities)) buf.write(" capabilities: {0}\n".format(self._mappings_capabilities))

View File

@ -66,7 +66,7 @@ class NDFrame:
See Also See Also
-------- --------
:pandas_docs:`pandas.DataFrame.index` :pandas_api_docs:`pandas.DataFrame.index`
Examples Examples
-------- --------
@ -92,7 +92,7 @@ class NDFrame:
See Also See Also
-------- --------
:pandas_docs:`pandas.DataFrame.dtypes` :pandas_api_docs:`pandas.DataFrame.dtypes`
Examples Examples
-------- --------
@ -125,22 +125,6 @@ class NDFrame:
def __getitem__(self, key): def __getitem__(self, key):
return self._getitem(key) return self._getitem(key)
def __getattr__(self, key):
"""After regular attribute access, looks up the name in the columns
Args:
key (str): Attribute name.
Returns:
The value of the attribute.
"""
try:
return object.__getattribute__(self, key)
except AttributeError as e:
if key in self.columns:
return self[key]
raise e
def __sizeof__(self): def __sizeof__(self):
# Don't default to pandas, just return approximation TODO - make this more accurate # Don't default to pandas, just return approximation TODO - make this more accurate
return sys.getsizeof(self._query_compiler) return sys.getsizeof(self._query_compiler)
@ -190,7 +174,7 @@ class NDFrame:
See Also See Also
-------- --------
:pandas_docs:`pandas.DataFrame.drop` :pandas_api_docs:`pandas.DataFrame.drop`
Examples Examples
-------- --------
@ -299,26 +283,185 @@ class NDFrame:
) )
return self._create_or_update_from_compiler(new_query_compiler, inplace) return self._create_or_update_from_compiler(new_query_compiler, inplace)
# TODO implement arguments def mean(self, numeric_only=True):
def mean(self): """
Return mean value for each numeric column
TODO - implement remainder of pandas arguments
Returns
-------
pandas.Series
mean value for each numeric column
See Also
--------
:pandas_api_docs:`pandas.DataFrame.mean`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.mean()
AvgTicketPrice 628.253689
Cancelled 0.128494
DistanceKilometers 7092.142457
DistanceMiles 4406.853010
FlightDelay 0.251168
FlightDelayMin 47.335171
FlightTimeHour 8.518797
FlightTimeMin 511.127842
dayOfWeek 2.835975
dtype: float64
"""
if numeric_only == False:
raise NotImplementedError("Only mean of numeric fields is implemented")
return self._query_compiler.mean() return self._query_compiler.mean()
def sum(self, numeric_only=True): def sum(self, numeric_only=True):
"""
Return sum for each numeric column
TODO - implement remainder of pandas arguments
Returns
-------
pandas.Series
sum for each numeric column
See Also
--------
:pandas_api_docs:`pandas.DataFrame.sum`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.sum()
AvgTicketPrice 8.204365e+06
Cancelled 1.678000e+03
DistanceKilometers 9.261629e+07
DistanceMiles 5.754909e+07
FlightDelay 3.280000e+03
FlightDelayMin 6.181500e+05
FlightTimeHour 1.112470e+05
FlightTimeMin 6.674818e+06
dayOfWeek 3.703500e+04
dtype: float64
"""
if numeric_only == False: if numeric_only == False:
raise NotImplementedError("Only sum of numeric fields is implemented") raise NotImplementedError("Only sum of numeric fields is implemented")
return self._query_compiler.sum() return self._query_compiler.sum()
def min(self, numeric_only=True): def min(self, numeric_only=True):
"""
Return the minimum value for each numeric column
TODO - implement remainder of pandas arguments
Returns
-------
pandas.Series
min value for each numeric column
See Also
--------
:pandas_api_docs:`pandas.DataFrame.min`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.min()
AvgTicketPrice 100.020531
Cancelled 0.000000
DistanceKilometers 0.000000
DistanceMiles 0.000000
FlightDelay 0.000000
FlightDelayMin 0.000000
FlightTimeHour 0.000000
FlightTimeMin 0.000000
dayOfWeek 0.000000
dtype: float64
"""
if numeric_only == False: if numeric_only == False:
raise NotImplementedError("Only sum of numeric fields is implemented") raise NotImplementedError("Only min of numeric fields is implemented")
return self._query_compiler.min() return self._query_compiler.min()
def max(self, numeric_only=True): def max(self, numeric_only=True):
"""
Return the maximum value for each numeric column
TODO - implement remainder of pandas arguments
Returns
-------
pandas.Series
max value for each numeric column
See Also
--------
:pandas_api_docs:`pandas.DataFrame.max`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.max()
AvgTicketPrice 1199.729004
Cancelled 1.000000
DistanceKilometers 19881.482422
DistanceMiles 12353.780273
FlightDelay 1.000000
FlightDelayMin 360.000000
FlightTimeHour 31.715034
FlightTimeMin 1902.901978
dayOfWeek 6.000000
dtype: float64
"""
if numeric_only == False: if numeric_only == False:
raise NotImplementedError("Only sum of numeric fields is implemented") raise NotImplementedError("Only max of numeric fields is implemented")
return self._query_compiler.max() return self._query_compiler.max()
def nunique(self): def nunique(self):
"""
Return cardinality of each field.
**Note we can only do this for aggregatable Elasticsearch fields - (in general) numeric and keyword rather than text fields**
This method will try and field aggregatable fields if possible if mapping has::
"customer_first_name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
we will aggregate ``customer_first_name`` columns using ``customer_first_name.keyword``.
TODO - implement remainder of pandas arguments
Returns
-------
pandas.Series
cardinality of each column
See Also
--------
:pandas_api_docs:`pandas.DataFrame.nunique`
Examples
--------
>>> columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user']
>>> df = ed.DataFrame('localhost', 'ecommerce', columns=columns)
>>> df.nunique()
category 6
currency 1
customer_birth_date 0
customer_first_name 46
user 46
dtype: int64
"""
return self._query_compiler.nunique() return self._query_compiler.nunique()
def _hist(self, num_bins): def _hist(self, num_bins):
@ -341,7 +484,7 @@ class NDFrame:
See Also See Also
-------- --------
:pandas_docs:`pandas.DataFrame.describe` :pandas_api_docs:`pandas.DataFrame.describe`
Examples Examples
-------- --------

View File

@ -183,12 +183,13 @@ class Operations:
raise NotImplementedError("Can not count field matches if size is set {}".format(size)) raise NotImplementedError("Can not count field matches if size is set {}".format(size))
columns = self.get_columns() columns = self.get_columns()
if columns is None:
columns = query_compiler._mappings.source_fields() # Get just aggregatable columns
aggregatable_columns = query_compiler._mappings.aggregatable_columns(columns)
body = Query(query_params['query']) body = Query(query_params['query'])
for field in columns: for field in aggregatable_columns.keys():
body.metric_aggs(field, func, field) body.metric_aggs(field, func, field)
response = query_compiler._client.search( response = query_compiler._client.search(
@ -198,10 +199,10 @@ class Operations:
results = {} results = {}
for field in columns: for key, value in aggregatable_columns.items():
results[field] = response['aggregations'][field]['value'] results[value] = response['aggregations'][key]['value']
s = pd.Series(data=results, index=columns) s = pd.Series(data=results, index=results.keys())
return s return s
@ -845,16 +846,16 @@ class Operations:
def info_es(self, buf): def info_es(self, buf):
buf.write("Operations:\n") buf.write("Operations:\n")
buf.write("\ttasks: {0}\n".format(self._tasks)) buf.write(" tasks: {0}\n".format(self._tasks))
query_params, post_processing = self._resolve_tasks() query_params, post_processing = self._resolve_tasks()
size, sort_params = Operations._query_params_to_size_and_sort(query_params) size, sort_params = Operations._query_params_to_size_and_sort(query_params)
columns = self.get_columns() columns = self.get_columns()
buf.write("\tsize: {0}\n".format(size)) buf.write(" size: {0}\n".format(size))
buf.write("\tsort_params: {0}\n".format(sort_params)) buf.write(" sort_params: {0}\n".format(sort_params))
buf.write("\tcolumns: {0}\n".format(columns)) buf.write(" columns: {0}\n".format(columns))
buf.write("\tpost_processing: {0}\n".format(post_processing)) buf.write(" post_processing: {0}\n".format(post_processing))
def update_query(self, boolean_filter): def update_query(self, boolean_filter):
task = ('boolean_filter', boolean_filter) task = ('boolean_filter', boolean_filter)

View File

@ -10,7 +10,7 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
sharey=False, figsize=None, layout=None, bins=10, **kwds): sharey=False, figsize=None, layout=None, bins=10, **kwds):
""" """
See :pandas_docs:`pandas.DataFrame.hist` for usage. See :pandas_api_docs:`pandas.DataFrame.hist` for usage.
Notes Notes
----- -----

View File

@ -215,3 +215,16 @@ class Series(NDFrame):
return NotFilter(Equal(field=self.name, value=other)) return NotFilter(Equal(field=self.name, value=other))
else: else:
raise NotImplementedError(other, type(other)) raise NotImplementedError(other, type(other))
@property
def ndim(self):
"""
Returns 1 by definition of a Series1
Returns
-------
int
By definition 1
"""
return 1

View File

@ -0,0 +1,26 @@
# File called _pytest for PyCharm compatability
from eland.tests.common import TestData
from pandas.testing import assert_index_equal
class TestDataFrameKeys(TestData):
def test_ecommerce_keys(self):
pd_ecommerce = self.pd_ecommerce()
ed_ecommerce = self.ed_ecommerce()
pd_keys = pd_ecommerce.keys()
ed_keys = ed_ecommerce.keys()
assert_index_equal(pd_keys, ed_keys)
def test_flights_keys(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
pd_keys = pd_flights.keys()
ed_keys = ed_flights.keys()
assert_index_equal(pd_keys, ed_keys)

View File

@ -7,16 +7,16 @@ from eland.tests.common import TestData
class TestDataFrameMetrics(TestData): class TestDataFrameMetrics(TestData):
def test_to_mean(self): def test_mean(self):
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_mean = pd_flights.mean() pd_mean = pd_flights.mean(numeric_only=True)
ed_mean = ed_flights.mean() ed_mean = ed_flights.mean(numeric_only=True)
assert_series_equal(pd_mean, ed_mean) assert_series_equal(pd_mean, ed_mean)
def test_to_sum(self): def test_sum(self):
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
@ -25,7 +25,7 @@ class TestDataFrameMetrics(TestData):
assert_series_equal(pd_sum, ed_sum) assert_series_equal(pd_sum, ed_sum)
def test_to_min(self): def test_min(self):
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
@ -34,7 +34,7 @@ class TestDataFrameMetrics(TestData):
assert_series_equal(pd_min, ed_min) assert_series_equal(pd_min, ed_min)
def test_to_max(self): def test_max(self):
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
ed_flights = self.ed_flights() ed_flights = self.ed_flights()

View File

@ -0,0 +1,33 @@
# File called _pytest for PyCharm compatability
import pandas as pd
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
class TestDataFrameNUnique(TestData):
def test_flights_nunique(self):
# Note pandas.nunique fails for dict columns (e.g. DestLocation)
columns = ['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName']
pd_flights = self.pd_flights()[columns]
ed_flights = self.ed_flights()[columns]
pd_nunique = pd_flights.nunique()
ed_nunique = ed_flights.nunique()
# TODO - ES is approximate counts so these aren't equal...
#E[left]: [13059, 2, 4, 156, 156, 143]
#E[right]: [13132, 2, 4, 156, 156, 143]
#assert_series_equal(pd_nunique, ed_nunique)
def test_ecommerce_nunique(self):
columns = ['customer_first_name', 'customer_gender', 'day_of_week_i']
pd_ecommerce = self.pd_ecommerce()[columns]
ed_ecommerce = self.ed_ecommerce()[columns]
pd_nunique = pd_ecommerce.nunique()
ed_nunique = ed_ecommerce.nunique()
assert_series_equal(pd_nunique, ed_nunique)

View File

@ -10,14 +10,14 @@ from eland.tests.common import assert_pandas_eland_frame_equal
class TestDataFrameQuery(TestData): class TestDataFrameQuery(TestData):
def test_query(self): def test_getitem_query(self):
# Examples from: # Examples from:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)}, pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)},
index=['0', '1', '2', '3', '4']) index=['0', '1', '2', '3', '4'])
# Now create index # Now create index
index_name = 'eland_test_query1' index_name = 'eland_test_query'
ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True) ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
@ -42,3 +42,12 @@ class TestDataFrameQuery(TestData):
ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)] ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)]
assert_pandas_eland_frame_equal(pd_q4, ed_q4) assert_pandas_eland_frame_equal(pd_q4, ed_q4)
def test_query(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
#print(ed_flights.query('FlightDelayMin > 60').info_es())
print(pd_flights.query('FlightDelayMin > 60').shape)
print(ed_flights.query('FlightDelayMin > 60').shape)

View File

@ -5,7 +5,7 @@ from eland.tests.common import TestData
class TestDataFrameShape(TestData): class TestDataFrameShape(TestData):
def test_to_shape1(self): def test_ecommerce_shape(self):
pd_ecommerce = self.pd_ecommerce() pd_ecommerce = self.pd_ecommerce()
ed_ecommerce = self.ed_ecommerce() ed_ecommerce = self.ed_ecommerce()
@ -14,7 +14,7 @@ class TestDataFrameShape(TestData):
assert pd_shape == ed_shape assert pd_shape == ed_shape
def test_to_shape2(self): def test_flights_shape(self):
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
ed_flights = self.ed_flights() ed_flights = self.ed_flights()

View File

@ -0,0 +1,72 @@
# File called _pytest for PyCharm compatability
from eland.tests.common import TestData
class TestMappingsAggregatables(TestData):
def test_ecommerce_all_aggregatables(self):
ed_ecommerce = self.ed_ecommerce()
aggregatables = ed_ecommerce._query_compiler._mappings.aggregatable_columns()
expected = {'category.keyword': 'category',
'currency': 'currency',
'customer_birth_date': 'customer_birth_date',
'customer_first_name.keyword': 'customer_first_name',
'customer_full_name.keyword': 'customer_full_name',
'customer_gender': 'customer_gender',
'customer_id': 'customer_id',
'customer_last_name.keyword': 'customer_last_name',
'customer_phone': 'customer_phone',
'day_of_week': 'day_of_week',
'day_of_week_i': 'day_of_week_i',
'email': 'email',
'geoip.city_name': 'geoip.city_name',
'geoip.continent_name': 'geoip.continent_name',
'geoip.country_iso_code': 'geoip.country_iso_code',
'geoip.location': 'geoip.location',
'geoip.region_name': 'geoip.region_name',
'manufacturer.keyword': 'manufacturer',
'order_date': 'order_date',
'order_id': 'order_id',
'products._id.keyword': 'products._id',
'products.base_price': 'products.base_price',
'products.base_unit_price': 'products.base_unit_price',
'products.category.keyword': 'products.category',
'products.created_on': 'products.created_on',
'products.discount_amount': 'products.discount_amount',
'products.discount_percentage': 'products.discount_percentage',
'products.manufacturer.keyword': 'products.manufacturer',
'products.min_price': 'products.min_price',
'products.price': 'products.price',
'products.product_id': 'products.product_id',
'products.product_name.keyword': 'products.product_name',
'products.quantity': 'products.quantity',
'products.sku': 'products.sku',
'products.tax_amount': 'products.tax_amount',
'products.taxful_price': 'products.taxful_price',
'products.taxless_price': 'products.taxless_price',
'products.unit_discount_amount': 'products.unit_discount_amount',
'sku': 'sku',
'taxful_total_price': 'taxful_total_price',
'taxless_total_price': 'taxless_total_price',
'total_quantity': 'total_quantity',
'total_unique_products': 'total_unique_products',
'type': 'type',
'user': 'user'}
assert expected == aggregatables
def test_ecommerce_selected_aggregatables(self):
ed_ecommerce = self.ed_ecommerce()
expected = {'category.keyword': 'category',
'currency': 'currency',
'customer_birth_date': 'customer_birth_date',
'customer_first_name.keyword': 'customer_first_name',
'type': 'type', 'user': 'user'}
aggregatables = ed_ecommerce._query_compiler._mappings.aggregatable_columns(expected.values())
assert expected == aggregatables

View File

@ -2,5 +2,4 @@ elasticsearch>=7.0.5
pandas==0.25.1 pandas==0.25.1
matplotlib matplotlib
pytest>=5.2.1 pytest>=5.2.1
sphinx_rtd_theme
numpydoc==0.8 numpydoc==0.8