Merge pull request #67 from stevedodson/feature/user_guide

Reformat and cleanup based on PyCharm inspect and reformat
This commit is contained in:
stevedodson 2019-11-26 12:17:00 +00:00 committed by GitHub
commit 33f5495352
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
42 changed files with 227 additions and 7413 deletions

1
.gitignore vendored
View File

@ -12,7 +12,6 @@ docs/build/
# pytest results # pytest results
eland/tests/dataframe/results/ eland/tests/dataframe/results/
eland/tests/dataframe/results/
result_images/ result_images/

View File

@ -12,6 +12,7 @@
# #
import os import os
import sys import sys
sys.path.insert(0, os.path.abspath("../sphinxext")) sys.path.insert(0, os.path.abspath("../sphinxext"))
sys.path.extend( sys.path.extend(
[ [
@ -20,8 +21,6 @@ sys.path.extend(
] ]
) )
# -- Project information ----------------------------------------------------- # -- Project information -----------------------------------------------------
project = 'eland' project = 'eland'
@ -30,7 +29,6 @@ copyright = '2019, Elasticsearch B.V.'
# The full version, including alpha/beta/rc tags # The full version, including alpha/beta/rc tags
release = '0.1' release = '0.1'
# -- General configuration --------------------------------------------------- # -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be # Add any Sphinx extension module names here, as strings. They can be
@ -73,7 +71,6 @@ plot_html_show_source_link = False
plot_pre_code = """import numpy as np plot_pre_code = """import numpy as np
import eland as ed""" import eland as ed"""
# Add any paths that contain templates here, relative to this directory. # Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates'] templates_path = ['_templates']
@ -82,16 +79,15 @@ templates_path = ['_templates']
# This pattern also affects html_static_path and html_extra_path. # This pattern also affects html_static_path and html_extra_path.
exclude_patterns = [] exclude_patterns = []
# -- Options for HTML output ------------------------------------------------- # -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for # The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes. # a list of builtin themes.
# #
#html_theme = 'sphinx_rtd_theme' # html_theme = 'sphinx_rtd_theme'
html_theme = "pandas_sphinx_theme" html_theme = "pandas_sphinx_theme"
# Add any paths that contain custom static files (such as style sheets) here, # Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files, # relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css". # so a file named "default.css" will overwrite the builtin "default.css".
#html_static_path = ['_static'] # html_static_path = ['_static']

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

View File

@ -4,3 +4,5 @@ eland.DataFrame.hist
.. currentmodule:: eland .. currentmodule:: eland
.. automethod:: DataFrame.hist .. automethod:: DataFrame.hist
.. image:: eland-DataFrame-hist-1.png

View File

@ -1,8 +1,10 @@
# Default number of rows displayed (different to pandas where ALL could be displayed) # Default number of rows displayed (different to pandas where ALL could be displayed)
DEFAULT_NUM_ROWS_DISPLAYED = 60 DEFAULT_NUM_ROWS_DISPLAYED = 60
def docstring_parameter(*sub): def docstring_parameter(*sub):
def dec(obj): def dec(obj):
obj.__doc__ = obj.__doc__.format(*sub) obj.__doc__ = obj.__doc__.format(*sub)
return obj return obj
return dec return dec

View File

@ -1,7 +1,7 @@
import pytest
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import pytest
import eland as ed import eland as ed
# Fix console size for consistent test results # Fix console size for consistent test results
@ -9,9 +9,9 @@ pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 5) pd.set_option('display.max_columns', 5)
pd.set_option('display.width', 100) pd.set_option('display.width', 100)
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def add_imports(doctest_namespace): def add_imports(doctest_namespace):
doctest_namespace["np"] = np doctest_namespace["np"] = np
doctest_namespace["pd"] = pd doctest_namespace["pd"] = pd
doctest_namespace["ed"] = ed doctest_namespace["ed"] = ed

View File

@ -5,8 +5,8 @@ from io import StringIO
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import six import six
from pandas.core.computation.eval import eval
from pandas.core.common import apply_if_callable, is_bool_indexer from pandas.core.common import apply_if_callable, is_bool_indexer
from pandas.core.computation.eval import eval
from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.common import is_list_like
from pandas.core.indexing import check_bool_indexer from pandas.core.indexing import check_bool_indexer
from pandas.io.common import _expand_user, _stringify_path from pandas.io.common import _expand_user, _stringify_path
@ -17,8 +17,8 @@ from pandas.io.formats.printing import pprint_thing
import eland.plotting as gfx import eland.plotting as gfx
from eland import NDFrame from eland import NDFrame
from eland import Series from eland import Series
from eland.filter import BooleanFilter, ScriptFilter
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
from eland.filter import BooleanFilter
class DataFrame(NDFrame): class DataFrame(NDFrame):
@ -35,7 +35,7 @@ class DataFrame(NDFrame):
- elasticsearch-py instance or - elasticsearch-py instance or
- eland.Client instance - eland.Client instance
index_pattern: str index_pattern: str
Elasticsearch index pattern (e.g. 'flights' or 'filebeat-\*') Elasticsearch index pattern. This can contain wildcards. (e.g. 'flights')
columns: list of str, optional columns: list of str, optional
List of DataFrame columns. A subset of the Elasticsearch index's fields. List of DataFrame columns. A subset of the Elasticsearch index's fields.
index_field: str, optional index_field: str, optional
@ -76,10 +76,12 @@ class DataFrame(NDFrame):
<BLANKLINE> <BLANKLINE>
[5 rows x 2 columns] [5 rows x 2 columns]
Constructing DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the DataFrame index field Constructing DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the DataFrame
index field
(TODO - currently index_field must also be a field if not _id) (TODO - currently index_field must also be a field if not _id)
>>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'], index_field='timestamp') >>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'],
... index_field='timestamp')
>>> df.head() >>> df.head()
AvgTicketPrice timestamp AvgTicketPrice timestamp
2018-01-01T00:00:00 841.265642 2018-01-01 00:00:00 2018-01-01T00:00:00 841.265642 2018-01-01 00:00:00
@ -90,6 +92,7 @@ class DataFrame(NDFrame):
<BLANKLINE> <BLANKLINE>
[5 rows x 2 columns] [5 rows x 2 columns]
""" """
def __init__(self, def __init__(self,
client=None, client=None,
index_pattern=None, index_pattern=None,
@ -310,7 +313,8 @@ class DataFrame(NDFrame):
An alternative approach is to use value_count aggregations. However, they have issues in that: An alternative approach is to use value_count aggregations. However, they have issues in that:
- They can only be used with aggregatable fields (e.g. keyword not text) - They can only be used with aggregatable fields (e.g. keyword not text)
- For list fields they return multiple counts. E.g. tags=['elastic', 'ml'] returns value_count=2 for a single document. - For list fields they return multiple counts. E.g. tags=['elastic', 'ml'] returns value_count=2 for a
single document.
TODO - add additional pandas.DataFrame.count features TODO - add additional pandas.DataFrame.count features
@ -334,6 +338,7 @@ class DataFrame(NDFrame):
return self._query_compiler.count() return self._query_compiler.count()
def info_es(self): def info_es(self):
# noinspection PyPep8
""" """
A debug summary of an eland DataFrame internals. A debug summary of an eland DataFrame internals.
@ -437,10 +442,7 @@ class DataFrame(NDFrame):
if buf is None: # pragma: no cover if buf is None: # pragma: no cover
buf = sys.stdout buf = sys.stdout
lines = [] lines = [str(type(self)), self._index_summary()]
lines.append(str(type(self)))
lines.append(self._index_summary())
if len(self.columns) == 0: if len(self.columns) == 0:
lines.append('Empty {name}'.format(name=type(self).__name__)) lines.append('Empty {name}'.format(name=type(self).__name__))
@ -697,7 +699,6 @@ class DataFrame(NDFrame):
return self[key] return self[key]
raise e raise e
def _getitem(self, key): def _getitem(self, key):
"""Get the column specified by key for this DataFrame. """Get the column specified by key for this DataFrame.
@ -780,7 +781,8 @@ class DataFrame(NDFrame):
else: else:
self._query_compiler = new_query_compiler self._query_compiler = new_query_compiler
def _reduce_dimension(self, query_compiler): @staticmethod
def _reduce_dimension(query_compiler):
return Series(query_compiler=query_compiler) return Series(query_compiler=query_compiler)
def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
@ -961,7 +963,8 @@ class DataFrame(NDFrame):
raise NotImplementedError("Aggregating via index not currently implemented - needs index transform") raise NotImplementedError("Aggregating via index not currently implemented - needs index transform")
# currently we only support a subset of functions that aggregate columns. # currently we only support a subset of functions that aggregate columns.
# ['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique'] # ['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile',
# 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique']
if isinstance(func, str): if isinstance(func, str):
# wrap in list # wrap in list
func = [func] func = [func]
@ -1031,6 +1034,7 @@ class DataFrame(NDFrame):
Parameters Parameters
---------- ----------
key: object key: object
default: default value if not found
Returns Returns
------- -------
@ -1079,7 +1083,7 @@ class DataFrame(NDFrame):
eland_to_pandas eland_to_pandas
to_numpy to_numpy
""" """
self.to_numpy() return self.to_numpy()
def to_numpy(self): def to_numpy(self):
""" """
@ -1123,4 +1127,3 @@ class DataFrame(NDFrame):
"This method would scan/scroll the entire Elasticsearch index(s) into memory. " "This method would scan/scroll the entire Elasticsearch index(s) into memory. "
"If this is explicitly required, and there is sufficient memory, call `ed.eland_to_pandas(ed_df).values`" "If this is explicitly required, and there is sufficient memory, call `ed.eland_to_pandas(ed_df).values`"
) )

View File

@ -38,7 +38,7 @@ class Index:
@index_field.setter @index_field.setter
def index_field(self, index_field): def index_field(self, index_field):
if index_field == None or index_field == Index.ID_INDEX_FIELD: if index_field is None or index_field == Index.ID_INDEX_FIELD:
self._index_field = Index.ID_INDEX_FIELD self._index_field = Index.ID_INDEX_FIELD
self._is_source_field = False self._is_source_field = False
else: else:

View File

@ -13,7 +13,7 @@ class Mappings:
Attributes Attributes
---------- ----------
mappings_capabilities: pandas.DataFrame _mappings_capabilities: pandas.DataFrame
A data frame summarising the capabilities of the index mapping A data frame summarising the capabilities of the index mapping
_source - is top level field (i.e. not a multi-field sub-field) _source - is top level field (i.e. not a multi-field sub-field)
@ -71,7 +71,7 @@ class Mappings:
# (this massively improves performance of DataFrame.flatten) # (this massively improves performance of DataFrame.flatten)
self._source_field_pd_dtypes = {} self._source_field_pd_dtypes = {}
for field_name in self._mappings_capabilities[self._mappings_capabilities._source == True].index: for field_name in self._mappings_capabilities[self._mappings_capabilities._source].index:
pd_dtype = self._mappings_capabilities.loc[field_name]['pd_dtype'] pd_dtype = self._mappings_capabilities.loc[field_name]['pd_dtype']
self._source_field_pd_dtypes[field_name] = pd_dtype self._source_field_pd_dtypes[field_name] = pd_dtype
@ -324,8 +324,7 @@ class Mappings:
} }
""" """
mappings = {} mappings = {'properties': {}}
mappings['properties'] = {}
for field_name_name, dtype in dataframe.dtypes.iteritems(): for field_name_name, dtype in dataframe.dtypes.iteritems():
if geo_points is not None and field_name_name in geo_points: if geo_points is not None and field_name_name in geo_points:
es_dtype = 'geo_point' es_dtype = 'geo_point'
@ -453,13 +452,13 @@ class Mappings:
numeric_source_fields: list of str numeric_source_fields: list of str
List of source fields where pd_dtype == (int64 or float64 or bool) List of source fields where pd_dtype == (int64 or float64 or bool)
""" """
if include_bool == True: if include_bool:
df = self._mappings_capabilities[(self._mappings_capabilities._source == True) & df = self._mappings_capabilities[self._mappings_capabilities._source &
((self._mappings_capabilities.pd_dtype == 'int64') | ((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') | (self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))] (self._mappings_capabilities.pd_dtype == 'bool'))]
else: else:
df = self._mappings_capabilities[(self._mappings_capabilities._source == True) & df = self._mappings_capabilities[self._mappings_capabilities._source &
((self._mappings_capabilities.pd_dtype == 'int64') | ((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64'))] (self._mappings_capabilities.pd_dtype == 'float64'))]
# if field_names exists, filter index with field_names # if field_names exists, filter index with field_names
@ -487,7 +486,7 @@ class Mappings:
count_source_fields: int count_source_fields: int
Number of source fields in mapping Number of source fields in mapping
""" """
return len(self.source_fields()) return len(self._source_field_pd_dtypes)
def dtypes(self, field_names=None): def dtypes(self, field_names=None):
""" """

View File

@ -31,6 +31,7 @@ from pandas.util._validators import validate_bool_kwarg
from eland import ElandQueryCompiler from eland import ElandQueryCompiler
class NDFrame: class NDFrame:
def __init__(self, def __init__(self,
@ -216,7 +217,7 @@ class NDFrame:
<BLANKLINE> <BLANKLINE>
[4673 rows x 3 columns] [4673 rows x 3 columns]
""" """
#(derived from modin.base.BasePandasDataset) # (derived from modin.base.BasePandasDataset)
# Level not supported # Level not supported
if level is not None: if level is not None:
raise NotImplementedError("level not supported {}".format(level)) raise NotImplementedError("level not supported {}".format(level))
@ -314,7 +315,7 @@ class NDFrame:
dayOfWeek 2.835975 dayOfWeek 2.835975
dtype: float64 dtype: float64
""" """
if numeric_only == False: if not numeric_only:
raise NotImplementedError("Only mean of numeric fields is implemented") raise NotImplementedError("Only mean of numeric fields is implemented")
return self._query_compiler.mean() return self._query_compiler.mean()
@ -348,7 +349,7 @@ class NDFrame:
dayOfWeek 3.703500e+04 dayOfWeek 3.703500e+04
dtype: float64 dtype: float64
""" """
if numeric_only == False: if not numeric_only:
raise NotImplementedError("Only sum of numeric fields is implemented") raise NotImplementedError("Only sum of numeric fields is implemented")
return self._query_compiler.sum() return self._query_compiler.sum()
@ -382,7 +383,7 @@ class NDFrame:
dayOfWeek 0.000000 dayOfWeek 0.000000
dtype: float64 dtype: float64
""" """
if numeric_only == False: if not numeric_only:
raise NotImplementedError("Only min of numeric fields is implemented") raise NotImplementedError("Only min of numeric fields is implemented")
return self._query_compiler.min() return self._query_compiler.min()
@ -416,7 +417,7 @@ class NDFrame:
dayOfWeek 6.000000 dayOfWeek 6.000000
dtype: float64 dtype: float64
""" """
if numeric_only == False: if not numeric_only:
raise NotImplementedError("Only max of numeric fields is implemented") raise NotImplementedError("Only max of numeric fields is implemented")
return self._query_compiler.max() return self._query_compiler.max()
@ -424,7 +425,8 @@ class NDFrame:
""" """
Return cardinality of each field. Return cardinality of each field.
**Note we can only do this for aggregatable Elasticsearch fields - (in general) numeric and keyword rather than text fields** **Note we can only do this for aggregatable Elasticsearch fields - (in general) numeric and keyword
rather than text fields**
This method will try and field aggregatable fields if possible if mapping has:: This method will try and field aggregatable fields if possible if mapping has::

View File

@ -39,6 +39,7 @@ class Operations:
return "desc" return "desc"
@staticmethod
def from_string(order): def from_string(order):
if order == "asc": if order == "asc":
return Operations.SortOrder.ASC return Operations.SortOrder.ASC
@ -46,7 +47,7 @@ class Operations:
return Operations.SortOrder.DESC return Operations.SortOrder.DESC
def __init__(self, tasks=None): def __init__(self, tasks=None):
if tasks == None: if tasks is None:
self._tasks = [] self._tasks = []
else: else:
self._tasks = tasks self._tasks = tasks
@ -105,7 +106,8 @@ class Operations:
query_params, post_processing = self._resolve_tasks() query_params, post_processing = self._resolve_tasks()
# Elasticsearch _count is very efficient and so used to return results here. This means that # Elasticsearch _count is very efficient and so used to return results here. This means that
# data frames that have restricted size or sort params will not return valid results (_count doesn't support size). # data frames that have restricted size or sort params will not return valid results
# (_count doesn't support size).
# Longer term we may fall back to pandas, but this may result in loading all index into memory. # Longer term we may fall back to pandas, but this may result in loading all index into memory.
if self._size(query_params, post_processing) is not None: if self._size(query_params, post_processing) is not None:
raise NotImplementedError("Requesting count with additional query and processing parameters " raise NotImplementedError("Requesting count with additional query and processing parameters "
@ -497,10 +499,14 @@ class Operations:
def to_pandas(self, query_compiler): def to_pandas(self, query_compiler):
class PandasDataFrameCollector: class PandasDataFrameCollector:
def __init__(self):
self.df = None
def collect(self, df): def collect(self, df):
self.df = df self.df = df
def batch_size(self): @staticmethod
def batch_size():
return None return None
collector = PandasDataFrameCollector() collector = PandasDataFrameCollector()
@ -528,7 +534,8 @@ class Operations:
self.kwargs['mode'] = 'a' self.kwargs['mode'] = 'a'
df.to_csv(**self.kwargs) df.to_csv(**self.kwargs)
def batch_size(self): @staticmethod
def batch_size():
# By default read 10000 docs to csv # By default read 10000 docs to csv
batch_size = 10000 batch_size = 10000
return batch_size return batch_size
@ -568,8 +575,8 @@ class Operations:
sort=sort_params, sort=sort_params,
body=body, body=body,
_source=field_names) _source=field_names)
except: except Exception:
# Catch ES error and print debug (currently to stdout) # Catch all ES errors and print debug (currently to stdout)
error = { error = {
'index': query_compiler._index_pattern, 'index': query_compiler._index_pattern,
'size': size, 'size': size,
@ -594,7 +601,7 @@ class Operations:
partial_result, df = query_compiler._es_results_to_pandas(es_results, collector.batch_size()) partial_result, df = query_compiler._es_results_to_pandas(es_results, collector.batch_size())
df = self._apply_df_post_processing(df, post_processing) df = self._apply_df_post_processing(df, post_processing)
collector.collect(df) collector.collect(df)
if partial_result == False: if not partial_result:
break break
else: else:
partial_result, df = query_compiler._es_results_to_pandas(es_results) partial_result, df = query_compiler._es_results_to_pandas(es_results)
@ -761,7 +768,8 @@ class Operations:
return query_params, post_processing return query_params, post_processing
def _resolve_head(self, item, query_params, post_processing): @staticmethod
def _resolve_head(item, query_params, post_processing):
# head - sort asc, size n # head - sort asc, size n
# |12345-------------| # |12345-------------|
query_sort_field = item[1][0] query_sort_field = item[1][0]
@ -792,7 +800,8 @@ class Operations:
return query_params, post_processing return query_params, post_processing
def _resolve_tail(self, item, query_params, post_processing): @staticmethod
def _resolve_tail(item, query_params, post_processing):
# tail - sort desc, size n, post-process sort asc # tail - sort desc, size n, post-process sort asc
# |-------------12345| # |-------------12345|
query_sort_field = item[1][0] query_sort_field = item[1][0]
@ -802,7 +811,7 @@ class Operations:
# If this is a tail of a tail adjust settings and return # If this is a tail of a tail adjust settings and return
if query_params['query_size'] is not None and \ if query_params['query_size'] is not None and \
query_params['query_sort_order'] == query_sort_order and \ query_params['query_sort_order'] == query_sort_order and \
post_processing == [('sort_index')]: post_processing == ['sort_index']:
if query_size < query_params['query_size']: if query_size < query_params['query_size']:
query_params['query_size'] = query_size query_params['query_size'] = query_size
return query_params, post_processing return query_params, post_processing
@ -830,11 +839,12 @@ class Operations:
# reverse sort order # reverse sort order
query_params['query_sort_order'] = Operations.SortOrder.reverse(query_sort_order) query_params['query_sort_order'] = Operations.SortOrder.reverse(query_sort_order)
post_processing.append(('sort_index')) post_processing.append('sort_index')
return query_params, post_processing return query_params, post_processing
def _resolve_iloc(self, item, query_params, post_processing): @staticmethod
def _resolve_iloc(item, query_params, post_processing):
# tail - sort desc, size n, post-process sort asc # tail - sort desc, size n, post-process sort asc
# |---4--7-9---------| # |---4--7-9---------|
@ -854,7 +864,8 @@ class Operations:
return query_params, post_processing return query_params, post_processing
def _resolve_query_ids(self, item, query_params, post_processing): @staticmethod
def _resolve_query_ids(item, query_params, post_processing):
# task = ('query_ids', ('must_not', items)) # task = ('query_ids', ('must_not', items))
must_clause = item[1][0] must_clause = item[1][0]
ids = item[1][1] ids = item[1][1]
@ -866,7 +877,8 @@ class Operations:
return query_params, post_processing return query_params, post_processing
def _resolve_query_terms(self, item, query_params, post_processing): @staticmethod
def _resolve_query_terms(item, query_params, post_processing):
# task = ('query_terms', ('must_not', (field, terms))) # task = ('query_terms', ('must_not', (field, terms)))
must_clause = item[1][0] must_clause = item[1][0]
field = item[1][1][0] field = item[1][1][0]
@ -879,7 +891,8 @@ class Operations:
return query_params, post_processing return query_params, post_processing
def _resolve_boolean_filter(self, item, query_params, post_processing): @staticmethod
def _resolve_boolean_filter(item, query_params, post_processing):
# task = ('boolean_filter', object) # task = ('boolean_filter', object)
boolean_filter = item[1] boolean_filter = item[1]
@ -1000,15 +1013,14 @@ class Operations:
return query_params, post_processing return query_params, post_processing
@staticmethod
def _resolve_post_processing_task(self, item, query_params, post_processing): def _resolve_post_processing_task(item, query_params, post_processing):
# Just do this in post-processing # Just do this in post-processing
if item[0] != 'field_names': if item[0] != 'field_names':
post_processing.append(item) post_processing.append(item)
return query_params, post_processing return query_params, post_processing
def _size(self, query_params, post_processing): def _size(self, query_params, post_processing):
# Shrink wrap code around checking if size parameter is set # Shrink wrap code around checking if size parameter is set
size = query_params['query_size'] # can be None size = query_params['query_size'] # can be None
@ -1023,7 +1035,6 @@ class Operations:
# This can return None # This can return None
return size return size
def info_es(self, buf): def info_es(self, buf):
buf.write("Operations:\n") buf.write("Operations:\n")
buf.write(" tasks: {0}\n".format(self._tasks)) buf.write(" tasks: {0}\n".format(self._tasks))
@ -1044,7 +1055,6 @@ class Operations:
buf.write(" body: {0}\n".format(body)) buf.write(" body: {0}\n".format(body))
buf.write(" post_processing: {0}\n".format(post_processing)) buf.write(" post_processing: {0}\n".format(post_processing))
def update_query(self, boolean_filter): def update_query(self, boolean_filter):
task = ('boolean_filter', boolean_filter) task = ('boolean_filter', boolean_filter)
self._tasks.append(task) self._tasks.append(task)

View File

@ -35,11 +35,8 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
Examples Examples
-------- --------
.. plot::
:context: close-figs
>>> df = ed.DataFrame('localhost', 'flights') >>> df = ed.DataFrame('localhost', 'flights')
>>> hist = df.select_dtypes(include=[np.number]).hist(figsize=[10,10]) >>> hist = df.select_dtypes(include=[np.number]).hist(figsize=[10,10]) # doctest: +SKIP
""" """
# Start with empty pandas data frame derived from # Start with empty pandas data frame derived from
ed_df_bins, ed_df_weights = ed_df._hist(num_bins=bins) ed_df_bins, ed_df_weights = ed_df._hist(num_bins=bins)

View File

@ -169,4 +169,3 @@ class Query:
def __repr__(self): def __repr__(self):
return repr(self.to_search_body()) return repr(self.to_search_body())

View File

@ -1,5 +1,5 @@
import pandas as pd
import numpy as np import numpy as np
import pandas as pd
from eland import Client from eland import Client
from eland import Index from eland import Index
@ -188,8 +188,10 @@ class ElandQueryCompiler:
} }
} }
``` ```
TODO - explain how lists are handled (https://www.elastic.co/guide/en/elasticsearch/reference/current/array.html) TODO - explain how lists are handled
TODO - an option here is to use Elasticsearch's multi-field matching instead of pandas treatment of lists (which isn't great) (https://www.elastic.co/guide/en/elasticsearch/reference/current/array.html)
TODO - an option here is to use Elasticsearch's multi-field matching instead of pandas treatment of lists
(which isn't great)
NOTE - using this lists is generally not a good way to use this API NOTE - using this lists is generally not a good way to use this API
""" """
partial_result = False partial_result = False
@ -274,7 +276,8 @@ class ElandQueryCompiler:
elif not is_source_field and type(x) is list: elif not is_source_field and type(x) is list:
for a in x: for a in x:
flatten(a, name) flatten(a, name)
elif is_source_field == True: # only print source fields from mappings (TODO - not so efficient for large number of fields and filtered mapping) elif is_source_field: # only print source fields from mappings
# (TODO - not so efficient for large number of fields and filtered mapping)
field_name = name[:-1] field_name = name[:-1]
# Coerce types - for now just datetime # Coerce types - for now just datetime
@ -292,8 +295,8 @@ class ElandQueryCompiler:
# create lists for this pivot (see notes above) # create lists for this pivot (see notes above)
if field_name in out: if field_name in out:
if type(out[field_name]) is not list: if type(out[field_name]) is not list:
l = [out[field_name]] field_as_list = [out[field_name]]
out[field_name] = l out[field_name] = field_as_list
out[field_name].append(x) out[field_name].append(x)
else: else:
out[field_name] = x out[field_name] = x
@ -524,6 +527,7 @@ class ElandQueryCompiler:
""" """
Internal class to deal with column renaming and script_fields Internal class to deal with column renaming and script_fields
""" """
class DisplayNameToFieldNameMapper: class DisplayNameToFieldNameMapper:
def __init__(self, def __init__(self,
field_to_display_names=None, field_to_display_names=None,

View File

@ -20,7 +20,6 @@ import warnings
from io import StringIO from io import StringIO
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from pandas.io.common import _expand_user, _stringify_path from pandas.io.common import _expand_user, _stringify_path
@ -43,7 +42,7 @@ class Series(NDFrame):
A reference to a Elasticsearch python client A reference to a Elasticsearch python client
index_pattern : str index_pattern : str
An Elasticsearch index pattern. This can contain wildcards (e.g. filebeat-\*\). An Elasticsearch index pattern. This can contain wildcards.
index_field : str index_field : str
The field to base the series on The field to base the series on
@ -201,7 +200,8 @@ class Series(NDFrame):
""" """
Return the value counts for the specified field. Return the value counts for the specified field.
**Note we can only do this for aggregatable Elasticsearch fields - (in general) numeric and keyword rather than text fields** **Note we can only do this for aggregatable Elasticsearch fields - (in general) numeric and keyword
rather than text fields**
TODO - implement remainder of pandas arguments TODO - implement remainder of pandas arguments
@ -506,7 +506,6 @@ class Series(NDFrame):
""" """
return self._numeric_op(right, _get_method_name()) return self._numeric_op(right, _get_method_name())
def __truediv__(self, right): def __truediv__(self, right):
""" """
Return floating division of series and right, element-wise (binary operator truediv). Return floating division of series and right, element-wise (binary operator truediv).
@ -704,7 +703,7 @@ class Series(NDFrame):
def __pow__(self, right): def __pow__(self, right):
""" """
Return exponential power of series and right, element-wise (binary operator pow \**\). Return exponential power of series and right, element-wise (binary operator pow).
Parameters Parameters
---------- ----------
@ -772,6 +771,7 @@ class Series(NDFrame):
Name: taxful_total_price, dtype: float64 Name: taxful_total_price, dtype: float64
""" """
return self._numeric_rop(left, _get_method_name()) return self._numeric_rop(left, _get_method_name())
def __rtruediv__(self, left): def __rtruediv__(self, left):
""" """
Return division of series and left, element-wise (binary operator div). Return division of series and left, element-wise (binary operator div).
@ -803,6 +803,7 @@ class Series(NDFrame):
Name: taxful_total_price, dtype: float64 Name: taxful_total_price, dtype: float64
""" """
return self._numeric_rop(left, _get_method_name()) return self._numeric_rop(left, _get_method_name())
def __rfloordiv__(self, left): def __rfloordiv__(self, left):
""" """
Return integer division of series and left, element-wise (binary operator floordiv //). Return integer division of series and left, element-wise (binary operator floordiv //).
@ -834,6 +835,7 @@ class Series(NDFrame):
Name: taxful_total_price, dtype: float64 Name: taxful_total_price, dtype: float64
""" """
return self._numeric_rop(left, _get_method_name()) return self._numeric_rop(left, _get_method_name())
def __rmod__(self, left): def __rmod__(self, left):
""" """
Return modulo of series and left, element-wise (binary operator mod %). Return modulo of series and left, element-wise (binary operator mod %).
@ -865,6 +867,7 @@ class Series(NDFrame):
Name: taxful_total_price, dtype: float64 Name: taxful_total_price, dtype: float64
""" """
return self._numeric_rop(left, _get_method_name()) return self._numeric_rop(left, _get_method_name())
def __rmul__(self, left): def __rmul__(self, left):
""" """
Return multiplication of series and left, element-wise (binary operator mul). Return multiplication of series and left, element-wise (binary operator mul).
@ -896,9 +899,10 @@ class Series(NDFrame):
Name: taxful_total_price, dtype: float64 Name: taxful_total_price, dtype: float64
""" """
return self._numeric_rop(left, _get_method_name()) return self._numeric_rop(left, _get_method_name())
def __rpow__(self, left): def __rpow__(self, left):
""" """
Return exponential power of series and left, element-wise (binary operator pow \**\). Return exponential power of series and left, element-wise (binary operator pow).
Parameters Parameters
---------- ----------
@ -927,6 +931,7 @@ class Series(NDFrame):
Name: total_quantity, dtype: float64 Name: total_quantity, dtype: float64
""" """
return self._numeric_rop(left, _get_method_name()) return self._numeric_rop(left, _get_method_name())
def __rsub__(self, left): def __rsub__(self, left):
""" """
Return subtraction of series and left, element-wise (binary operator sub). Return subtraction of series and left, element-wise (binary operator sub).
@ -1170,7 +1175,7 @@ class Series(NDFrame):
results = super().nunique() results = super().nunique()
return results.squeeze() return results.squeeze()
#def values TODO - not implemented as causes current implementation of query to fail # def values TODO - not implemented as causes current implementation of query to fail
def to_numpy(self): def to_numpy(self):
""" """

View File

@ -1,23 +0,0 @@
https://docs.google.com/presentation/d/1A3S5aIJC8SuEbi80PhEzyxTUNMjWJ7-_Om92yU9p3yo/edit#slide=id.g5f8a4bcb09_0_3
https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
https://nbviewer.jupyter.org/github/parente/nbestimate/blob/master/estimate.ipynb
https://stackoverflow.blog/2017/09/14/python-growing-quickly/
https://github.com/elastic/eland
http://localhost:8889/notebooks/eland/tests/demo_day_20190815.ipynb
http://localhost:5601/app/kibana#/dev_tools/console?_g=()
devtool console:
```
GET _cat/indices
# Clean demo
DELETE ed_jetbeats_routes
# Demo day schema
GET flights
GET flights/_search
GET ed_jetbeats_routes
GET ed_jetbeats_routes/_search
```

View File

@ -4,8 +4,6 @@ from elasticsearch import Elasticsearch
import eland as ed import eland as ed
from eland.tests.common import TestData from eland.tests.common import TestData
import pytest
class TestClientEq(TestData): class TestClientEq(TestData):

View File

@ -4,7 +4,6 @@ from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData from eland.tests.common import TestData
import pandas as pd
class TestDataFrameCount(TestData): class TestDataFrameCount(TestData):

View File

@ -14,8 +14,8 @@ class TestDataFrameDescribe(TestData):
pd_describe = pd_flights.describe() pd_describe = pd_flights.describe()
ed_describe = ed_flights.describe() ed_describe = ed_flights.describe()
assert_almost_equal(pd_describe.drop(['25%','50%','75%'], axis='index'), assert_almost_equal(pd_describe.drop(['25%', '50%', '75%'], axis='index'),
ed_describe.drop(['25%','50%','75%'], axis='index'), ed_describe.drop(['25%', '50%', '75%'], axis='index'),
check_less_precise=True) check_less_precise=True)
# TODO - this fails for percentile fields as ES aggregations are approximate # TODO - this fails for percentile fields as ES aggregations are approximate

View File

@ -1,7 +1,6 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import numpy as np import numpy as np
from pandas.util.testing import assert_series_equal from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData from eland.tests.common import TestData
@ -16,8 +15,8 @@ class TestDataFrameDtypes(TestData):
assert_series_equal(pd_flights.dtypes, ed_flights.dtypes) assert_series_equal(pd_flights.dtypes, ed_flights.dtypes)
for i in range(0, len(pd_flights.dtypes)-1): for i in range(0, len(pd_flights.dtypes) - 1):
assert type(pd_flights.dtypes[i]) == type(ed_flights.dtypes[i]) assert isinstance(pd_flights.dtypes[i], type(ed_flights.dtypes[i]))
def test_flights_select_dtypes(self): def test_flights_select_dtypes(self):
ed_flights = self.ed_flights_small() ed_flights = self.ed_flights_small()

View File

@ -1,12 +1,12 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import eland as ed
import pytest import pytest
import eland as ed
from eland.tests import ELASTICSEARCH_HOST from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME from eland.tests import FLIGHTS_INDEX_NAME
class TestDataFrameInit: class TestDataFrameInit:
def test_init(self): def test_init(self):
@ -28,4 +28,3 @@ class TestDataFrameInit:
qc = ed.ElandQueryCompiler(client=ELASTICSEARCH_HOST, index_pattern=FLIGHTS_INDEX_NAME) qc = ed.ElandQueryCompiler(client=ELASTICSEARCH_HOST, index_pattern=FLIGHTS_INDEX_NAME)
df2 = ed.DataFrame(query_compiler=qc) df2 = ed.DataFrame(query_compiler=qc)

View File

@ -1,9 +1,9 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
from eland.tests.common import TestData
from pandas.testing import assert_index_equal from pandas.testing import assert_index_equal
from eland.tests.common import TestData
class TestDataFrameKeys(TestData): class TestDataFrameKeys(TestData):

View File

@ -4,11 +4,8 @@ from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData from eland.tests.common import TestData
import eland as ed
class TestDataFrameMetrics(TestData): class TestDataFrameMetrics(TestData):
funcs = ['max', 'min', 'mean', 'sum'] funcs = ['max', 'min', 'mean', 'sum']
def test_flights_metrics(self): def test_flights_metrics(self):
@ -29,7 +26,8 @@ class TestDataFrameMetrics(TestData):
ed_ecommerce = self.ed_ecommerce()[columns] ed_ecommerce = self.ed_ecommerce()[columns]
for func in self.funcs: for func in self.funcs:
assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True), assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True),
getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True) check_less_precise=True)
def test_ecommerce_selected_mixed_numeric_source_fields(self): def test_ecommerce_selected_mixed_numeric_source_fields(self):
@ -41,10 +39,10 @@ class TestDataFrameMetrics(TestData):
ed_ecommerce = self.ed_ecommerce()[columns] ed_ecommerce = self.ed_ecommerce()[columns]
for func in self.funcs: for func in self.funcs:
assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True), assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True),
getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True) check_less_precise=True)
def test_ecommerce_selected_all_numeric_source_fields(self): def test_ecommerce_selected_all_numeric_source_fields(self):
# All of these are numeric # All of these are numeric
columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price'] columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price']
@ -53,5 +51,6 @@ class TestDataFrameMetrics(TestData):
ed_ecommerce = self.ed_ecommerce()[columns] ed_ecommerce = self.ed_ecommerce()[columns]
for func in self.funcs: for func in self.funcs:
assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True), assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True),
getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True) check_less_precise=True)

View File

@ -1,5 +1,4 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import pandas as pd
from pandas.util.testing import assert_series_equal from pandas.util.testing import assert_series_equal
@ -18,9 +17,9 @@ class TestDataFrameNUnique(TestData):
ed_nunique = ed_flights.nunique() ed_nunique = ed_flights.nunique()
# TODO - ES is approximate counts so these aren't equal... # TODO - ES is approximate counts so these aren't equal...
#E[left]: [13059, 2, 4, 156, 156, 143] # E[left]: [13059, 2, 4, 156, 156, 143]
#E[right]: [13132, 2, 4, 156, 156, 143] # E[right]: [13132, 2, 4, 156, 156, 143]
#assert_series_equal(pd_nunique, ed_nunique) # assert_series_equal(pd_nunique, ed_nunique)
def test_ecommerce_nunique(self): def test_ecommerce_nunique(self):
columns = ['customer_first_name', 'customer_gender', 'day_of_week_i'] columns = ['customer_first_name', 'customer_gender', 'day_of_week_i']

View File

@ -47,12 +47,11 @@ class TestDataFrameQuery(TestData):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
assert pd_flights.query('FlightDelayMin > 60').shape == \ assert pd_flights.query('FlightDelayMin > 60').shape == ed_flights.query('FlightDelayMin > 60').shape
ed_flights.query('FlightDelayMin > 60').shape
def test_isin_query(self): def test_isin_query(self):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
assert pd_flights[pd_flights.OriginAirportID.isin(['LHR','SYD'])].shape == \ assert pd_flights[pd_flights.OriginAirportID.isin(['LHR', 'SYD'])].shape == \
ed_flights[ed_flights.OriginAirportID.isin(['LHR','SYD'])].shape ed_flights[ed_flights.OriginAirportID.isin(['LHR', 'SYD'])].shape

View File

@ -1,12 +1,10 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import pandas as pd
import pytest import pytest
import pandas as pd
from eland.tests.common import TestData
from eland.dataframe import DEFAULT_NUM_ROWS_DISPLAYED from eland.dataframe import DEFAULT_NUM_ROWS_DISPLAYED
from eland.tests.common import TestData
class TestDataFrameRepr(TestData): class TestDataFrameRepr(TestData):
@ -19,6 +17,7 @@ class TestDataFrameRepr(TestData):
""" """
to_string to_string
""" """
def test_num_rows_to_string(self): def test_num_rows_to_string(self):
# check setup works # check setup works
assert pd.get_option('display.max_rows') == 60 assert pd.get_option('display.max_rows') == 60
@ -27,11 +26,11 @@ class TestDataFrameRepr(TestData):
# In pandas calling 'to_string' without max_rows set, will dump ALL rows # In pandas calling 'to_string' without max_rows set, will dump ALL rows
# Test n-1, n, n+1 for edge cases # Test n-1, n, n+1 for edge cases
self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED-1) self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED - 1)
self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED) self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED)
with pytest.warns(UserWarning): with pytest.warns(UserWarning):
# UserWarning displayed by eland here (compare to pandas with max_rows set) # UserWarning displayed by eland here (compare to pandas with max_rows set)
self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED+1, None, DEFAULT_NUM_ROWS_DISPLAYED) self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED + 1, None, DEFAULT_NUM_ROWS_DISPLAYED)
# Test for where max_rows lt or gt num_rows # Test for where max_rows lt or gt num_rows
self.num_rows_to_string(10, 5, 5) self.num_rows_to_string(10, 5, 5)
@ -47,8 +46,8 @@ class TestDataFrameRepr(TestData):
ed_head_str = ed_head.to_string(max_rows=max_rows_eland) ed_head_str = ed_head.to_string(max_rows=max_rows_eland)
pd_head_str = pd_head.to_string(max_rows=max_rows_pandas) pd_head_str = pd_head.to_string(max_rows=max_rows_pandas)
#print(ed_head_str) # print(ed_head_str)
#print(pd_head_str) # print(pd_head_str)
assert pd_head_str == ed_head_str assert pd_head_str == ed_head_str
@ -64,13 +63,14 @@ class TestDataFrameRepr(TestData):
""" """
repr repr
""" """
def test_num_rows_repr(self): def test_num_rows_repr(self):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
self.num_rows_repr(pd.get_option('display.max_rows')-1, pd.get_option('display.max_rows')-1) self.num_rows_repr(pd.get_option('display.max_rows') - 1, pd.get_option('display.max_rows') - 1)
self.num_rows_repr(pd.get_option('display.max_rows'), pd.get_option('display.max_rows')) self.num_rows_repr(pd.get_option('display.max_rows'), pd.get_option('display.max_rows'))
self.num_rows_repr(pd.get_option('display.max_rows')+1, pd.get_option('display.min_rows')) self.num_rows_repr(pd.get_option('display.max_rows') + 1, pd.get_option('display.min_rows'))
def num_rows_repr(self, rows, num_rows_printed): def num_rows_repr(self, rows, num_rows_printed):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
@ -87,7 +87,7 @@ class TestDataFrameRepr(TestData):
num_rows_printed = num_rows_printed + 1 num_rows_printed = num_rows_printed + 1
# number of rows is num_rows_printed + 3 (header, summary) # number of rows is num_rows_printed + 3 (header, summary)
assert (num_rows_printed+3) == len(ed_head_str.splitlines()) assert (num_rows_printed + 3) == len(ed_head_str.splitlines())
assert pd_head_str == ed_head_str assert pd_head_str == ed_head_str
@ -103,6 +103,7 @@ class TestDataFrameRepr(TestData):
""" """
to_html to_html
""" """
def test_num_rows_to_html(self): def test_num_rows_to_html(self):
# check setup works # check setup works
assert pd.get_option('display.max_rows') == 60 assert pd.get_option('display.max_rows') == 60
@ -111,11 +112,11 @@ class TestDataFrameRepr(TestData):
# In pandas calling 'to_string' without max_rows set, will dump ALL rows # In pandas calling 'to_string' without max_rows set, will dump ALL rows
# Test n-1, n, n+1 for edge cases # Test n-1, n, n+1 for edge cases
self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED-1) self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED - 1)
self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED) self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED)
with pytest.warns(UserWarning): with pytest.warns(UserWarning):
# UserWarning displayed by eland here # UserWarning displayed by eland here
self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED+1, None, DEFAULT_NUM_ROWS_DISPLAYED) self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED + 1, None, DEFAULT_NUM_ROWS_DISPLAYED)
# Test for where max_rows lt or gt num_rows # Test for where max_rows lt or gt num_rows
self.num_rows_to_html(10, 5, 5) self.num_rows_to_html(10, 5, 5)
@ -131,8 +132,8 @@ class TestDataFrameRepr(TestData):
ed_head_str = ed_head.to_html(max_rows=max_rows_eland) ed_head_str = ed_head.to_html(max_rows=max_rows_eland)
pd_head_str = pd_head.to_html(max_rows=max_rows_pandas) pd_head_str = pd_head.to_html(max_rows=max_rows_pandas)
#print(ed_head_str) # print(ed_head_str)
#print(pd_head_str) # print(pd_head_str)
assert pd_head_str == ed_head_str assert pd_head_str == ed_head_str
@ -145,10 +146,10 @@ class TestDataFrameRepr(TestData):
assert ed_ecom_h == pd_ecom_h assert ed_ecom_h == pd_ecom_h
""" """
_repr_html_ _repr_html_
""" """
def test_num_rows_repr_html(self): def test_num_rows_repr_html(self):
# check setup works # check setup works
assert pd.get_option('display.max_rows') == 60 assert pd.get_option('display.max_rows') == 60
@ -163,9 +164,9 @@ class TestDataFrameRepr(TestData):
# In pandas calling 'to_string' without max_rows set, will dump ALL rows # In pandas calling 'to_string' without max_rows set, will dump ALL rows
# Test n-1, n, n+1 for edge cases # Test n-1, n, n+1 for edge cases
self.num_rows_repr_html(pd.get_option('display.max_rows')-1) self.num_rows_repr_html(pd.get_option('display.max_rows') - 1)
self.num_rows_repr_html(pd.get_option('display.max_rows')) self.num_rows_repr_html(pd.get_option('display.max_rows'))
self.num_rows_repr_html(pd.get_option('display.max_rows')+1, pd.get_option('display.max_rows')) self.num_rows_repr_html(pd.get_option('display.max_rows') + 1, pd.get_option('display.max_rows'))
# Restore default # Restore default
pd.set_option('display.show_dimensions', show_dimensions) pd.set_option('display.show_dimensions', show_dimensions)
@ -180,13 +181,12 @@ class TestDataFrameRepr(TestData):
ed_head_str = ed_head._repr_html_() ed_head_str = ed_head._repr_html_()
pd_head_str = pd_head._repr_html_() pd_head_str = pd_head._repr_html_()
#print(ed_head_str) # print(ed_head_str)
#print(pd_head_str) # print(pd_head_str)
assert pd_head_str == ed_head_str assert pd_head_str == ed_head_str
def test_empty_dataframe_repr_html(self): def test_empty_dataframe_repr_html(self):
# TODO - there is a bug in 'show_dimensions' as it gets added after the last </div> # TODO - there is a bug in 'show_dimensions' as it gets added after the last </div>
# For now test without this # For now test without this
show_dimensions = pd.get_option('display.show_dimensions') show_dimensions = pd.get_option('display.show_dimensions')

View File

@ -3,20 +3,15 @@
import ast import ast
import time import time
import eland as ed
from elasticsearch import Elasticsearch
import pandas as pd import pandas as pd
from elasticsearch import Elasticsearch
from pandas.util.testing import assert_frame_equal from pandas.util.testing import assert_frame_equal
from eland.tests.common import ROOT_DIR import eland as ed
from eland.tests.common import TestData
from eland.tests import ELASTICSEARCH_HOST from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME from eland.tests import FLIGHTS_INDEX_NAME
from eland.tests.common import ROOT_DIR
from eland.tests.common import assert_pandas_eland_frame_equal from eland.tests.common import TestData
class TestDataFrameToCSV(TestData): class TestDataFrameToCSV(TestData):

File diff suppressed because one or more lines are too long

View File

@ -2,8 +2,6 @@
import numpy as np import numpy as np
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData from eland.tests.common import TestData
@ -32,7 +30,8 @@ class TestMappingsNumericSourceFields(TestData):
ed_ecommerce = self.ed_ecommerce()[field_names] ed_ecommerce = self.ed_ecommerce()[field_names]
pd_ecommerce = self.pd_ecommerce()[field_names] pd_ecommerce = self.pd_ecommerce()[field_names]
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names, include_bool=False) ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names,
include_bool=False)
pd_numeric = pd_ecommerce.select_dtypes(include=np.number) pd_numeric = pd_ecommerce.select_dtypes(include=np.number)
assert pd_numeric.columns.to_list() == ed_numeric assert pd_numeric.columns.to_list() == ed_numeric
@ -53,7 +52,8 @@ class TestMappingsNumericSourceFields(TestData):
ed_ecommerce = self.ed_ecommerce()[field_names] ed_ecommerce = self.ed_ecommerce()[field_names]
pd_ecommerce = self.pd_ecommerce()[field_names] pd_ecommerce = self.pd_ecommerce()[field_names]
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names, include_bool=False) ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names,
include_bool=False)
pd_numeric = pd_ecommerce.select_dtypes(include=np.number) pd_numeric = pd_ecommerce.select_dtypes(include=np.number)
assert pd_numeric.columns.to_list() == ed_numeric assert pd_numeric.columns.to_list() == ed_numeric
@ -71,7 +71,8 @@ class TestMappingsNumericSourceFields(TestData):
ed_ecommerce = self.ed_ecommerce()[field_names] ed_ecommerce = self.ed_ecommerce()[field_names]
pd_ecommerce = self.pd_ecommerce()[field_names] pd_ecommerce = self.pd_ecommerce()[field_names]
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names, include_bool=False) ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names,
include_bool=False)
pd_numeric = pd_ecommerce.select_dtypes(include=np.number) pd_numeric = pd_ecommerce.select_dtypes(include=np.number)
assert pd_numeric.columns.to_list() == ed_numeric assert pd_numeric.columns.to_list() == ed_numeric

View File

@ -2,7 +2,7 @@
from eland.filter import * from eland.filter import *
class TestOperators(): class TestOperators:
def test_leaf_boolean_filter(self): def test_leaf_boolean_filter(self):
assert GreaterEqual('a', 2).build() == {"range": {"a": {"gte": 2}}} assert GreaterEqual('a', 2).build() == {"range": {"a": {"gte": 2}}}
assert LessEqual('a', 2).build() == {"range": {"a": {"lte": 2}}} assert LessEqual('a', 2).build() == {"range": {"a": {"lte": 2}}}

View File

@ -1,7 +1,6 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import pytest import pytest
from matplotlib.testing.decorators import check_figures_equal from matplotlib.testing.decorators import check_figures_equal
from eland.tests.common import TestData from eland.tests.common import TestData
@ -14,12 +13,14 @@ def test_plot_hist(fig_test, fig_ref):
pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']] pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']] ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
# This throws a userwarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222) # This throws a userwarning
# (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
with pytest.warns(UserWarning): with pytest.warns(UserWarning):
pd_ax = fig_ref.subplots() pd_ax = fig_ref.subplots()
pd_flights.hist(ax=pd_ax) pd_flights.hist(ax=pd_ax)
# This throws a userwarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222) # This throws a userwarning
# (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
with pytest.warns(UserWarning): with pytest.warns(UserWarning):
ed_ax = fig_test.subplots() ed_ax = fig_test.subplots()
ed_flights.hist(ax=ed_ax) ed_flights.hist(ax=ed_ax)

View File

@ -1,7 +1,4 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import pandas as pd
from pandas.util.testing import assert_series_equal
from eland import ElandQueryCompiler from eland import ElandQueryCompiler
from eland.tests.common import TestData from eland.tests.common import TestData
@ -20,7 +17,7 @@ class TestQueryCompilerRename(TestData):
field_names = ['a'] field_names = ['a']
display_names = ['A'] display_names = ['A']
update_A = {'a' : 'A'} update_A = {'a': 'A'}
mapper.rename_display_name(update_A) mapper.rename_display_name(update_A)
assert field_names == mapper.field_names_to_list() assert field_names == mapper.field_names_to_list()
@ -29,7 +26,7 @@ class TestQueryCompilerRename(TestData):
field_names = ['a', 'b'] field_names = ['a', 'b']
display_names = ['A', 'B'] display_names = ['A', 'B']
update_B = {'b' : 'B'} update_B = {'b': 'B'}
mapper.rename_display_name(update_B) mapper.rename_display_name(update_B)
assert field_names == mapper.field_names_to_list() assert field_names == mapper.field_names_to_list()
@ -38,7 +35,7 @@ class TestQueryCompilerRename(TestData):
field_names = ['a', 'b'] field_names = ['a', 'b']
display_names = ['AA', 'B'] display_names = ['AA', 'B']
update_AA = {'A' : 'AA'} update_AA = {'A': 'AA'}
mapper.rename_display_name(update_AA) mapper.rename_display_name(update_AA)
assert field_names == mapper.field_names_to_list() assert field_names == mapper.field_names_to_list()
@ -50,26 +47,26 @@ class TestQueryCompilerRename(TestData):
mapper = ElandQueryCompiler.DisplayNameToFieldNameMapper() mapper = ElandQueryCompiler.DisplayNameToFieldNameMapper()
display_names = ['A', 'b', 'c', 'd'] display_names = ['A', 'b', 'c', 'd']
update_A = {'a' : 'A'} update_A = {'a': 'A'}
mapper.rename_display_name(update_A) mapper.rename_display_name(update_A)
assert display_names == mapper.field_to_display_names(columns) assert display_names == mapper.field_to_display_names(columns)
# Invalid update # Invalid update
display_names = ['A', 'b', 'c', 'd'] display_names = ['A', 'b', 'c', 'd']
update_ZZ = {'a' : 'ZZ'} update_ZZ = {'a': 'ZZ'}
mapper.rename_display_name(update_ZZ) mapper.rename_display_name(update_ZZ)
assert display_names == mapper.field_to_display_names(columns) assert display_names == mapper.field_to_display_names(columns)
display_names = ['AA', 'b', 'c', 'd'] display_names = ['AA', 'b', 'c', 'd']
update_AA = {'A' : 'AA'} # already renamed to 'A' update_AA = {'A': 'AA'} # already renamed to 'A'
mapper.rename_display_name(update_AA) mapper.rename_display_name(update_AA)
assert display_names == mapper.field_to_display_names(columns) assert display_names == mapper.field_to_display_names(columns)
display_names = ['AA', 'b', 'C', 'd'] display_names = ['AA', 'b', 'C', 'd']
update_AA_C = {'a' : 'AA', 'c' : 'C'} # 'a' rename ignored update_AA_C = {'a': 'AA', 'c': 'C'} # 'a' rename ignored
mapper.rename_display_name(update_AA_C) mapper.rename_display_name(update_AA_C)
assert display_names == mapper.field_to_display_names(columns) assert display_names == mapper.field_to_display_names(columns)

View File

@ -1,7 +1,6 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import pytest
import numpy as np import numpy as np
import pytest
from eland.tests.common import TestData, assert_pandas_eland_series_equal from eland.tests.common import TestData, assert_pandas_eland_series_equal

View File

@ -1,11 +1,7 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
from pandas.util.testing import assert_almost_equal
from eland.tests.common import TestData from eland.tests.common import TestData
import eland as ed
class TestSeriesInfoEs(TestData): class TestSeriesInfoEs(TestData):
@ -14,4 +10,3 @@ class TestSeriesInfoEs(TestData):
# No assertion, just test it can be called # No assertion, just test it can be called
info_es = ed_flights.info_es() info_es = ed_flights.info_es()

View File

@ -4,11 +4,8 @@ from pandas.util.testing import assert_almost_equal
from eland.tests.common import TestData from eland.tests.common import TestData
import eland as ed
class TestSeriesMetrics(TestData): class TestSeriesMetrics(TestData):
funcs = ['max', 'min', 'mean', 'sum'] funcs = ['max', 'min', 'mean', 'sum']
def test_flights_metrics(self): def test_flights_metrics(self):
@ -30,7 +27,6 @@ class TestSeriesMetrics(TestData):
ed_metric = getattr(ed_ecommerce, func)() ed_metric = getattr(ed_ecommerce, func)()
assert ed_metric.empty assert ed_metric.empty
def test_ecommerce_selected_all_numeric_source_fields(self): def test_ecommerce_selected_all_numeric_source_fields(self):
# All of these are numeric # All of these are numeric
columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price'] columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price']

View File

@ -27,6 +27,3 @@ class TestSeriesName(TestData):
assert_pandas_eland_series_equal(pd_series, ed_series) assert_pandas_eland_series_equal(pd_series, ed_series)
assert ed_series.name == pd_series.name assert ed_series.name == pd_series.name

View File

@ -18,6 +18,3 @@ class TestSeriesRename(TestData):
ed_renamed = ed_carrier.rename("renamed") ed_renamed = ed_carrier.rename("renamed")
assert_pandas_eland_series_equal(pd_renamed, ed_renamed) assert_pandas_eland_series_equal(pd_renamed, ed_renamed)

View File

@ -1,8 +1,7 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import eland as ed import eland as ed
import pandas as pd
from eland.tests import ELASTICSEARCH_HOST from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME, ECOMMERCE_INDEX_NAME from eland.tests import FLIGHTS_INDEX_NAME
from eland.tests.common import TestData from eland.tests.common import TestData

View File

@ -1,8 +1,8 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import eland as ed
from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal
import pytest import pytest
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
class TestSeriesValueCounts(TestData): class TestSeriesValueCounts(TestData):

View File

@ -1,9 +1,8 @@
import pandas as pd
import csv import csv
import pandas as pd
from pandas.io.parsers import _c_parser_defaults from pandas.io.parsers import _c_parser_defaults
from eland import Client from eland import Client
from eland import DataFrame from eland import DataFrame
from eland import Mappings from eland import Mappings
@ -339,4 +338,3 @@ def read_csv(filepath_or_buffer,
ed_df = DataFrame(client, es_dest_index) ed_df = DataFrame(client, es_dest_index)
return ed_df return ed_df

View File

@ -4,6 +4,7 @@ import csv
from elasticsearch import Elasticsearch, helpers from elasticsearch import Elasticsearch, helpers
from elasticsearch.exceptions import TransportError from elasticsearch.exceptions import TransportError
def create_index(es, index): def create_index(es, index):
mapping = { mapping = {
"mappings": { "mappings": {
@ -30,6 +31,7 @@ def create_index(es, index):
else: else:
raise raise
def parse_date(date): def parse_date(date):
""" """
we need to convert dates to conform to the mapping in the following way: we need to convert dates to conform to the mapping in the following way:
@ -55,6 +57,7 @@ def parse_date(date):
return date return date
def parse_line(line): def parse_line(line):
""" """
creates the document to be indexed creates the document to be indexed
@ -72,6 +75,7 @@ def parse_line(line):
return obj return obj
def load_data(es): def load_data(es):
""" """
generate one document per line of online-retail.csv generate one document per line of online-retail.csv
@ -85,7 +89,7 @@ def load_data(es):
reader = csv.reader(f, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL) reader = csv.reader(f, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL)
for line in reader: for line in reader:
if header: if header:
header=False header = False
continue continue
doc = parse_line(line) doc = parse_line(line)
@ -106,7 +110,7 @@ if __name__ == "__main__":
# create the elasticsearch client, pointing to the host parameter # create the elasticsearch client, pointing to the host parameter
es = Elasticsearch(args.host) es = Elasticsearch(args.host)
index='online-retail' index = 'online-retail'
# load data from online retail csv in data directory # load data from online retail csv in data directory
stream = load_data(es) stream = load_data(es)