Resolving DataFrame.query issues + more docs

This commit is contained in:
Stephen Dodson 2019-11-14 20:04:38 +00:00
parent e76a4de79d
commit 5a546577f4
24 changed files with 580 additions and 71 deletions

View File

@ -57,7 +57,10 @@ except ImportError:
pd = None
'''
extlinks = {'pandas_docs': ('https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/%s.html', '')}
extlinks = {
'pandas_api_docs': ('https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/%s.html', ''),
'pandas_user_guide': ('https://pandas.pydata.org/pandas-docs/stable/user_guide/%s.html', 'Pandas User Guide/')
}
numpydoc_attributes_as_param_list = False
numpydoc_show_class_members = False
@ -85,7 +88,8 @@ exclude_patterns = []
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
#html_theme = 'sphinx_rtd_theme'
html_theme = "pandas_sphinx_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,

View File

@ -0,0 +1,6 @@
eland.DataFrame.info_es
=======================
.. currentmodule:: eland
.. automethod:: DataFrame.info_es

View File

@ -0,0 +1,6 @@
eland.DataFrame.keys
====================
.. currentmodule:: eland
.. automethod:: DataFrame.keys

View File

@ -0,0 +1,6 @@
eland.DataFrame.max
===================
.. currentmodule:: eland
.. automethod:: DataFrame.max

View File

@ -0,0 +1,6 @@
eland.DataFrame.mean
====================
.. currentmodule:: eland
.. automethod:: DataFrame.mean

View File

@ -0,0 +1,6 @@
eland.DataFrame.min
===================
.. currentmodule:: eland
.. automethod:: DataFrame.min

View File

@ -0,0 +1,6 @@
eland.DataFrame.nunique
=======================
.. currentmodule:: eland
.. automethod:: DataFrame.nunique

View File

@ -0,0 +1,6 @@
eland.DataFrame.query
=====================
.. currentmodule:: eland
.. automethod:: DataFrame.query

View File

@ -0,0 +1,6 @@
eland.DataFrame.sum
===================
.. currentmodule:: eland
.. automethod:: DataFrame.sum

View File

@ -31,8 +31,10 @@ Indexing, iteration
:toctree: api/
DataFrame.head
DataFrame.keys
DataFrame.tail
DataFrame.get
DataFrame.query
Function application, GroupBy & window
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -52,6 +54,11 @@ Computations / descriptive stats
DataFrame.count
DataFrame.describe
DataFrame.info
DataFrame.max
DataFrame.mean
DataFrame.min
DataFrame.sum
DataFrame.nunique
Reindexing / selection / label manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -74,3 +81,11 @@ Serialization / IO / conversion
DataFrame.info
Elasticsearch utilities
~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: api/
DataFrame.info_es

View File

@ -5,6 +5,7 @@ from io import StringIO
import numpy as np
import pandas as pd
import six
from pandas.core.computation.eval import eval
from pandas.core.common import apply_if_callable, is_bool_indexer
from pandas.core.dtypes.common import is_list_like
from pandas.core.indexing import check_bool_indexer
@ -41,7 +42,7 @@ class DataFrame(NDFrame):
See Also
--------
:pandas_docs:`pandas.DataFrame`
:pandas_api_docs:`pandas.DataFrame`
Examples
--------
@ -119,11 +120,12 @@ class DataFrame(NDFrame):
Returns
-------
Elasticsearch field names as pandas.Index
pandas.Index
Elasticsearch field names as pandas.Index
See Also
--------
:pandas_docs:`pandas.DataFrame.columns`
:pandas_api_docs:`pandas.DataFrame.columns`
Examples
--------
@ -153,7 +155,7 @@ class DataFrame(NDFrame):
See Also
--------
:pandas_docs:`pandas.DataFrame.empty`
:pandas_api_docs:`pandas.DataFrame.empty`
Examples
--------
@ -183,7 +185,7 @@ class DataFrame(NDFrame):
See Also
--------
:pandas_docs:`pandas.DataFrame.head`
:pandas_api_docs:`pandas.DataFrame.head`
Examples
--------
@ -218,7 +220,7 @@ class DataFrame(NDFrame):
See Also
--------
:pandas_docs:`pandas.DataFrame.tail`
:pandas_api_docs:`pandas.DataFrame.tail`
Examples
--------
@ -304,7 +306,7 @@ class DataFrame(NDFrame):
See Also
--------
:pandas_docs:`pandas.DataFrame.count`
:pandas_api_docs:`pandas.DataFrame.count`
Examples
--------
@ -318,11 +320,57 @@ class DataFrame(NDFrame):
def info_es(self):
"""
A debug summary of an eland DataFrame internals.
This includes the Elasticsearch search queries and query compiler task list.
Returns
-------
None
This method prints a debug summary of the task list Elasticsearch
str
A debug summary of an eland DataFrame internals.
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df = df[(df.OriginAirportID == 'AMS') & (df.FlightDelayMin > 60)]
>>> df = df[['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']]
>>> df = df.tail()
>>> df
timestamp OriginAirportID DestAirportID FlightDelayMin
12608 2018-02-10 01:20:52 AMS CYEG 120
12720 2018-02-10 14:09:40 AMS BHM 255
12725 2018-02-10 00:53:01 AMS ATL 360
12823 2018-02-10 15:41:20 AMS NGO 120
12907 2018-02-11 20:08:25 AMS LIM 225
<BLANKLINE>
[5 rows x 4 columns]
>>> print(df.info_es())
index_pattern: flights
Index:
index_field: _id
is_source_field: False
Mappings:
capabilities: _source es_dtype pd_dtype searchable aggregatable
AvgTicketPrice True float float64 True True
Cancelled True boolean bool True True
Carrier True keyword object True True
Dest True keyword object True True
DestAirportID True keyword object True True
... ... ... ... ... ...
OriginLocation True geo_point object True True
OriginRegion True keyword object True True
OriginWeather True keyword object True True
dayOfWeek True integer int64 True True
timestamp True date datetime64[ns] True True
<BLANKLINE>
[27 rows x 5 columns]
Operations:
tasks: [('boolean_filter', {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}), ('columns', ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']), ('tail', ('_doc', 5))]
size: 5
sort_params: _doc:desc
columns: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
post_processing: ['sort_index']
<BLANKLINE>
"""
buf = StringIO()
@ -350,7 +398,7 @@ class DataFrame(NDFrame):
This method prints information about a DataFrame including
the index dtype and column dtypes, non-null values and memory usage.
See :pandas_docs:`pandas.DataFrame.info` for full details.
See :pandas_api_docs:`pandas.DataFrame.info` for full details.
Notes
-----
@ -368,7 +416,7 @@ class DataFrame(NDFrame):
customer_first_name 4675 non-null object
geoip.city_name 4094 non-null object
dtypes: object(2)
memory usage: 96.0 bytes
memory usage: ...
"""
if buf is None: # pragma: no cover
buf = sys.stdout
@ -559,6 +607,26 @@ class DataFrame(NDFrame):
result = _buf.getvalue()
return result
def __getattr__(self, key):
"""After regular attribute access, looks up the name in the columns
Parameters
----------
key: str
Attribute name.
Returns
-------
The value of the attribute.
"""
try:
return object.__getattribute__(self, key)
except AttributeError as e:
if key in self.columns:
return self[key]
raise e
def _getitem(self, key):
"""Get the column specified by key for this DataFrame.
@ -695,7 +763,7 @@ class DataFrame(NDFrame):
"""
Return a subset of the DataFrame's columns based on the column dtypes.
Compatible with :pandas_docs:`pandas.DataFrame.select_dtypes`
Compatible with :pandas_api_docs:`pandas.DataFrame.select_dtypes`
"""
empty_df = self._empty_pd_df()
@ -720,6 +788,16 @@ class DataFrame(NDFrame):
return num_rows, num_columns
def keys(self):
"""
Return columns
See :pandas_api_docs:`pandas.DataFrame.keys`
Returns
-------
pandas.Index
Elasticsearch field names as pandas.Index
"""
return self.columns
def aggregate(self, func, axis=0, *args, **kwargs):
@ -758,7 +836,7 @@ class DataFrame(NDFrame):
See Also
--------
:pandas_docs:`pandas.DataFrame.aggregate`
:pandas_api_docs:`pandas.DataFrame.aggregate`
Examples
--------
@ -788,19 +866,49 @@ class DataFrame(NDFrame):
hist = gfx.ed_hist_frame
def query(self, expr, inplace=False, **kwargs):
"""Queries the Dataframe with a boolean expression
def query(self, expr):
"""
Query the columns of a DataFrame with a boolean expression.
Returns:
A new DataFrame if inplace=False
TODO - add additional pandas arguments
Parameters
----------
expr: str
A boolean expression
Returns
-------
eland.DataFrame:
DataFrame populated by results of the query
TODO - add link to eland user guide
See Also
--------
:pandas_api_docs:`pandas.DataFrame.query`
:pandas_user_guide:`indexing`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df = df.query('FlightDelayMin > 60')
>>> df.info()
"""
if isinstance(expr, BooleanFilter):
return DataFrame(
query_compiler=self._query_compiler._update_query(BooleanFilter(expr))
)
elif isinstance(expr, six.string_types):
column_resolver = {}
for key in self.keys():
column_resolver[key] = self.get(key)
# Create fake resolvers - index resolver is empty
resolvers = column_resolver, {}
# Use pandas eval to parse query - TODO validate this further
filter = eval(expr, target=self, resolvers=tuple(tuple(resolvers)))
return DataFrame(
query_compiler=self._query_compiler._update_query(ScriptFilter(expr))
query_compiler=self._query_compiler._update_query(filter)
)
else:
raise NotImplementedError(expr, type(expr))
@ -820,7 +928,7 @@ class DataFrame(NDFrame):
See Also
--------
:pandas_docs:`pandas.DataFrame.get`
:pandas_api_docs:`pandas.DataFrame.get`
Examples
--------

View File

@ -58,5 +58,5 @@ class Index:
def info_es(self, buf):
buf.write("Index:\n")
buf.write("\tindex_field: {0}\n".format(self.index_field))
buf.write("\tis_source_field: {0}\n".format(self.is_source_field))
buf.write(" index_field: {0}\n".format(self.index_field))
buf.write(" is_source_field: {0}\n".format(self.is_source_field))

View File

@ -408,6 +408,44 @@ class Mappings:
return is_source_field
def aggregatable_columns(self, columns=None):
"""
Return a dict of aggregatable columns from all columns or columns list
{'customer_full_name': 'customer_full_name.keyword', ...}
Logic here is that column names are '_source' fields and keyword fields
may be nested beneath the field. E.g.
customer_full_name: text
customer_full_name.keyword: keyword
customer_full_name.keyword is the aggregatable field for customer_full_name
Returns
-------
dict
e.g. {'customer_full_name': 'customer_full_name.keyword', ...}
"""
if columns is None:
columns = self.source_fields()
aggregatables = {}
for column in columns:
capabilities = self.field_capabilities(column)
if capabilities['aggregatable']:
aggregatables[column] = column
else:
# Try 'column.keyword'
column_keyword = column + '.keyword'
capabilities = self.field_capabilities(column_keyword)
if capabilities['aggregatable']:
aggregatables[column_keyword] = column
else:
# Aggregations not supported for this field
raise ValueError("Aggregations not supported for ", column)
return aggregatables
def numeric_source_fields(self, columns, include_bool=True):
"""
Returns
@ -471,4 +509,4 @@ class Mappings:
def info_es(self, buf):
buf.write("Mappings:\n")
buf.write("\tcapabilities: {0}\n".format(self._mappings_capabilities))
buf.write(" capabilities: {0}\n".format(self._mappings_capabilities))

View File

@ -66,7 +66,7 @@ class NDFrame:
See Also
--------
:pandas_docs:`pandas.DataFrame.index`
:pandas_api_docs:`pandas.DataFrame.index`
Examples
--------
@ -92,7 +92,7 @@ class NDFrame:
See Also
--------
:pandas_docs:`pandas.DataFrame.dtypes`
:pandas_api_docs:`pandas.DataFrame.dtypes`
Examples
--------
@ -125,22 +125,6 @@ class NDFrame:
def __getitem__(self, key):
return self._getitem(key)
def __getattr__(self, key):
"""After regular attribute access, looks up the name in the columns
Args:
key (str): Attribute name.
Returns:
The value of the attribute.
"""
try:
return object.__getattribute__(self, key)
except AttributeError as e:
if key in self.columns:
return self[key]
raise e
def __sizeof__(self):
# Don't default to pandas, just return approximation TODO - make this more accurate
return sys.getsizeof(self._query_compiler)
@ -190,7 +174,7 @@ class NDFrame:
See Also
--------
:pandas_docs:`pandas.DataFrame.drop`
:pandas_api_docs:`pandas.DataFrame.drop`
Examples
--------
@ -299,26 +283,185 @@ class NDFrame:
)
return self._create_or_update_from_compiler(new_query_compiler, inplace)
# TODO implement arguments
def mean(self):
def mean(self, numeric_only=True):
"""
Return mean value for each numeric column
TODO - implement remainder of pandas arguments
Returns
-------
pandas.Series
mean value for each numeric column
See Also
--------
:pandas_api_docs:`pandas.DataFrame.mean`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.mean()
AvgTicketPrice 628.253689
Cancelled 0.128494
DistanceKilometers 7092.142457
DistanceMiles 4406.853010
FlightDelay 0.251168
FlightDelayMin 47.335171
FlightTimeHour 8.518797
FlightTimeMin 511.127842
dayOfWeek 2.835975
dtype: float64
"""
if numeric_only == False:
raise NotImplementedError("Only mean of numeric fields is implemented")
return self._query_compiler.mean()
def sum(self, numeric_only=True):
"""
Return sum for each numeric column
TODO - implement remainder of pandas arguments
Returns
-------
pandas.Series
sum for each numeric column
See Also
--------
:pandas_api_docs:`pandas.DataFrame.sum`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.sum()
AvgTicketPrice 8.204365e+06
Cancelled 1.678000e+03
DistanceKilometers 9.261629e+07
DistanceMiles 5.754909e+07
FlightDelay 3.280000e+03
FlightDelayMin 6.181500e+05
FlightTimeHour 1.112470e+05
FlightTimeMin 6.674818e+06
dayOfWeek 3.703500e+04
dtype: float64
"""
if numeric_only == False:
raise NotImplementedError("Only sum of numeric fields is implemented")
return self._query_compiler.sum()
def min(self, numeric_only=True):
"""
Return the minimum value for each numeric column
TODO - implement remainder of pandas arguments
Returns
-------
pandas.Series
min value for each numeric column
See Also
--------
:pandas_api_docs:`pandas.DataFrame.min`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.min()
AvgTicketPrice 100.020531
Cancelled 0.000000
DistanceKilometers 0.000000
DistanceMiles 0.000000
FlightDelay 0.000000
FlightDelayMin 0.000000
FlightTimeHour 0.000000
FlightTimeMin 0.000000
dayOfWeek 0.000000
dtype: float64
"""
if numeric_only == False:
raise NotImplementedError("Only sum of numeric fields is implemented")
raise NotImplementedError("Only min of numeric fields is implemented")
return self._query_compiler.min()
def max(self, numeric_only=True):
"""
Return the maximum value for each numeric column
TODO - implement remainder of pandas arguments
Returns
-------
pandas.Series
max value for each numeric column
See Also
--------
:pandas_api_docs:`pandas.DataFrame.max`
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.max()
AvgTicketPrice 1199.729004
Cancelled 1.000000
DistanceKilometers 19881.482422
DistanceMiles 12353.780273
FlightDelay 1.000000
FlightDelayMin 360.000000
FlightTimeHour 31.715034
FlightTimeMin 1902.901978
dayOfWeek 6.000000
dtype: float64
"""
if numeric_only == False:
raise NotImplementedError("Only sum of numeric fields is implemented")
raise NotImplementedError("Only max of numeric fields is implemented")
return self._query_compiler.max()
def nunique(self):
"""
Return cardinality of each field.
**Note we can only do this for aggregatable Elasticsearch fields - (in general) numeric and keyword rather than text fields**
This method will try and field aggregatable fields if possible if mapping has::
"customer_first_name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
we will aggregate ``customer_first_name`` columns using ``customer_first_name.keyword``.
TODO - implement remainder of pandas arguments
Returns
-------
pandas.Series
cardinality of each column
See Also
--------
:pandas_api_docs:`pandas.DataFrame.nunique`
Examples
--------
>>> columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user']
>>> df = ed.DataFrame('localhost', 'ecommerce', columns=columns)
>>> df.nunique()
category 6
currency 1
customer_birth_date 0
customer_first_name 46
user 46
dtype: int64
"""
return self._query_compiler.nunique()
def _hist(self, num_bins):
@ -341,7 +484,7 @@ class NDFrame:
See Also
--------
:pandas_docs:`pandas.DataFrame.describe`
:pandas_api_docs:`pandas.DataFrame.describe`
Examples
--------

View File

@ -183,12 +183,13 @@ class Operations:
raise NotImplementedError("Can not count field matches if size is set {}".format(size))
columns = self.get_columns()
if columns is None:
columns = query_compiler._mappings.source_fields()
# Get just aggregatable columns
aggregatable_columns = query_compiler._mappings.aggregatable_columns(columns)
body = Query(query_params['query'])
for field in columns:
for field in aggregatable_columns.keys():
body.metric_aggs(field, func, field)
response = query_compiler._client.search(
@ -198,10 +199,10 @@ class Operations:
results = {}
for field in columns:
results[field] = response['aggregations'][field]['value']
for key, value in aggregatable_columns.items():
results[value] = response['aggregations'][key]['value']
s = pd.Series(data=results, index=columns)
s = pd.Series(data=results, index=results.keys())
return s
@ -845,16 +846,16 @@ class Operations:
def info_es(self, buf):
buf.write("Operations:\n")
buf.write("\ttasks: {0}\n".format(self._tasks))
buf.write(" tasks: {0}\n".format(self._tasks))
query_params, post_processing = self._resolve_tasks()
size, sort_params = Operations._query_params_to_size_and_sort(query_params)
columns = self.get_columns()
buf.write("\tsize: {0}\n".format(size))
buf.write("\tsort_params: {0}\n".format(sort_params))
buf.write("\tcolumns: {0}\n".format(columns))
buf.write("\tpost_processing: {0}\n".format(post_processing))
buf.write(" size: {0}\n".format(size))
buf.write(" sort_params: {0}\n".format(sort_params))
buf.write(" columns: {0}\n".format(columns))
buf.write(" post_processing: {0}\n".format(post_processing))
def update_query(self, boolean_filter):
task = ('boolean_filter', boolean_filter)

View File

@ -10,7 +10,7 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
sharey=False, figsize=None, layout=None, bins=10, **kwds):
"""
See :pandas_docs:`pandas.DataFrame.hist` for usage.
See :pandas_api_docs:`pandas.DataFrame.hist` for usage.
Notes
-----

View File

@ -215,3 +215,16 @@ class Series(NDFrame):
return NotFilter(Equal(field=self.name, value=other))
else:
raise NotImplementedError(other, type(other))
@property
def ndim(self):
"""
Returns 1 by definition of a Series1
Returns
-------
int
By definition 1
"""
return 1

View File

@ -0,0 +1,26 @@
# File called _pytest for PyCharm compatability
from eland.tests.common import TestData
from pandas.testing import assert_index_equal
class TestDataFrameKeys(TestData):
def test_ecommerce_keys(self):
pd_ecommerce = self.pd_ecommerce()
ed_ecommerce = self.ed_ecommerce()
pd_keys = pd_ecommerce.keys()
ed_keys = ed_ecommerce.keys()
assert_index_equal(pd_keys, ed_keys)
def test_flights_keys(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
pd_keys = pd_flights.keys()
ed_keys = ed_flights.keys()
assert_index_equal(pd_keys, ed_keys)

View File

@ -7,16 +7,16 @@ from eland.tests.common import TestData
class TestDataFrameMetrics(TestData):
def test_to_mean(self):
def test_mean(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
pd_mean = pd_flights.mean()
ed_mean = ed_flights.mean()
pd_mean = pd_flights.mean(numeric_only=True)
ed_mean = ed_flights.mean(numeric_only=True)
assert_series_equal(pd_mean, ed_mean)
def test_to_sum(self):
def test_sum(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
@ -25,7 +25,7 @@ class TestDataFrameMetrics(TestData):
assert_series_equal(pd_sum, ed_sum)
def test_to_min(self):
def test_min(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
@ -34,7 +34,7 @@ class TestDataFrameMetrics(TestData):
assert_series_equal(pd_min, ed_min)
def test_to_max(self):
def test_max(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()

View File

@ -0,0 +1,33 @@
# File called _pytest for PyCharm compatability
import pandas as pd
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
class TestDataFrameNUnique(TestData):
def test_flights_nunique(self):
# Note pandas.nunique fails for dict columns (e.g. DestLocation)
columns = ['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName']
pd_flights = self.pd_flights()[columns]
ed_flights = self.ed_flights()[columns]
pd_nunique = pd_flights.nunique()
ed_nunique = ed_flights.nunique()
# TODO - ES is approximate counts so these aren't equal...
#E[left]: [13059, 2, 4, 156, 156, 143]
#E[right]: [13132, 2, 4, 156, 156, 143]
#assert_series_equal(pd_nunique, ed_nunique)
def test_ecommerce_nunique(self):
columns = ['customer_first_name', 'customer_gender', 'day_of_week_i']
pd_ecommerce = self.pd_ecommerce()[columns]
ed_ecommerce = self.ed_ecommerce()[columns]
pd_nunique = pd_ecommerce.nunique()
ed_nunique = ed_ecommerce.nunique()
assert_series_equal(pd_nunique, ed_nunique)

View File

@ -10,14 +10,14 @@ from eland.tests.common import assert_pandas_eland_frame_equal
class TestDataFrameQuery(TestData):
def test_query(self):
def test_getitem_query(self):
# Examples from:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)},
index=['0', '1', '2', '3', '4'])
# Now create index
index_name = 'eland_test_query1'
index_name = 'eland_test_query'
ed_df = ed.pd_to_ed(pd_df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)
@ -42,3 +42,12 @@ class TestDataFrameQuery(TestData):
ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)]
assert_pandas_eland_frame_equal(pd_q4, ed_q4)
def test_query(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
#print(ed_flights.query('FlightDelayMin > 60').info_es())
print(pd_flights.query('FlightDelayMin > 60').shape)
print(ed_flights.query('FlightDelayMin > 60').shape)

View File

@ -5,7 +5,7 @@ from eland.tests.common import TestData
class TestDataFrameShape(TestData):
def test_to_shape1(self):
def test_ecommerce_shape(self):
pd_ecommerce = self.pd_ecommerce()
ed_ecommerce = self.ed_ecommerce()
@ -14,7 +14,7 @@ class TestDataFrameShape(TestData):
assert pd_shape == ed_shape
def test_to_shape2(self):
def test_flights_shape(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()

View File

@ -0,0 +1,72 @@
# File called _pytest for PyCharm compatability
from eland.tests.common import TestData
class TestMappingsAggregatables(TestData):
def test_ecommerce_all_aggregatables(self):
ed_ecommerce = self.ed_ecommerce()
aggregatables = ed_ecommerce._query_compiler._mappings.aggregatable_columns()
expected = {'category.keyword': 'category',
'currency': 'currency',
'customer_birth_date': 'customer_birth_date',
'customer_first_name.keyword': 'customer_first_name',
'customer_full_name.keyword': 'customer_full_name',
'customer_gender': 'customer_gender',
'customer_id': 'customer_id',
'customer_last_name.keyword': 'customer_last_name',
'customer_phone': 'customer_phone',
'day_of_week': 'day_of_week',
'day_of_week_i': 'day_of_week_i',
'email': 'email',
'geoip.city_name': 'geoip.city_name',
'geoip.continent_name': 'geoip.continent_name',
'geoip.country_iso_code': 'geoip.country_iso_code',
'geoip.location': 'geoip.location',
'geoip.region_name': 'geoip.region_name',
'manufacturer.keyword': 'manufacturer',
'order_date': 'order_date',
'order_id': 'order_id',
'products._id.keyword': 'products._id',
'products.base_price': 'products.base_price',
'products.base_unit_price': 'products.base_unit_price',
'products.category.keyword': 'products.category',
'products.created_on': 'products.created_on',
'products.discount_amount': 'products.discount_amount',
'products.discount_percentage': 'products.discount_percentage',
'products.manufacturer.keyword': 'products.manufacturer',
'products.min_price': 'products.min_price',
'products.price': 'products.price',
'products.product_id': 'products.product_id',
'products.product_name.keyword': 'products.product_name',
'products.quantity': 'products.quantity',
'products.sku': 'products.sku',
'products.tax_amount': 'products.tax_amount',
'products.taxful_price': 'products.taxful_price',
'products.taxless_price': 'products.taxless_price',
'products.unit_discount_amount': 'products.unit_discount_amount',
'sku': 'sku',
'taxful_total_price': 'taxful_total_price',
'taxless_total_price': 'taxless_total_price',
'total_quantity': 'total_quantity',
'total_unique_products': 'total_unique_products',
'type': 'type',
'user': 'user'}
assert expected == aggregatables
def test_ecommerce_selected_aggregatables(self):
ed_ecommerce = self.ed_ecommerce()
expected = {'category.keyword': 'category',
'currency': 'currency',
'customer_birth_date': 'customer_birth_date',
'customer_first_name.keyword': 'customer_first_name',
'type': 'type', 'user': 'user'}
aggregatables = ed_ecommerce._query_compiler._mappings.aggregatable_columns(expected.values())
assert expected == aggregatables

View File

@ -2,5 +2,4 @@ elasticsearch>=7.0.5
pandas==0.25.1
matplotlib
pytest>=5.2.1
sphinx_rtd_theme
numpydoc==0.8