Merge pull request #67 from stevedodson/feature/user_guide

Reformat and cleanup based on PyCharm inspect and reformat
This commit is contained in:
stevedodson 2019-11-26 12:17:00 +00:00 committed by GitHub
commit 33f5495352
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
42 changed files with 227 additions and 7413 deletions

1
.gitignore vendored
View File

@ -12,7 +12,6 @@ docs/build/
# pytest results
eland/tests/dataframe/results/
eland/tests/dataframe/results/
result_images/

View File

@ -12,6 +12,7 @@
#
import os
import sys
sys.path.insert(0, os.path.abspath("../sphinxext"))
sys.path.extend(
[
@ -20,8 +21,6 @@ sys.path.extend(
]
)
# -- Project information -----------------------------------------------------
project = 'eland'
@ -30,7 +29,6 @@ copyright = '2019, Elasticsearch B.V.'
# The full version, including alpha/beta/rc tags
release = '0.1'
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
@ -73,7 +71,6 @@ plot_html_show_source_link = False
plot_pre_code = """import numpy as np
import eland as ed"""
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
@ -82,7 +79,6 @@ templates_path = ['_templates']
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

View File

@ -4,3 +4,5 @@ eland.DataFrame.hist
.. currentmodule:: eland
.. automethod:: DataFrame.hist
.. image:: eland-DataFrame-hist-1.png

View File

@ -1,8 +1,10 @@
# Default number of rows displayed (different to pandas where ALL could be displayed)
DEFAULT_NUM_ROWS_DISPLAYED = 60
def docstring_parameter(*sub):
    """Build a decorator that fills ``str.format`` placeholders in a docstring.

    Parameters
    ----------
    *sub
        Positional values substituted into the decorated object's
        ``__doc__`` through ``str.format`` (``{0}``, ``{1}``, ...).

    Returns
    -------
    callable
        A decorator that rewrites ``obj.__doc__`` in place and returns the
        same ``obj``.
    """
    def dec(obj):
        # __doc__ is None when the object has no docstring (and when Python
        # runs with -OO, which strips docstrings); formatting None would
        # raise AttributeError, so leave such objects untouched.
        if obj.__doc__ is not None:
            obj.__doc__ = obj.__doc__.format(*sub)
        return obj

    return dec

View File

@ -1,7 +1,7 @@
import pytest
import numpy as np
import pandas as pd
import pytest
import eland as ed
# Fix console size for consistent test results
@ -9,9 +9,9 @@ pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 5)
pd.set_option('display.width', 100)
@pytest.fixture(autouse=True)
def add_imports(doctest_namespace):
    """Make the ``np``, ``pd`` and ``ed`` module aliases available to doctests.

    The aliases are stored in ``doctest_namespace`` under the same names the
    docstring examples use.
    """
    doctest_namespace.update({"np": np, "pd": pd, "ed": ed})

View File

@ -5,8 +5,8 @@ from io import StringIO
import numpy as np
import pandas as pd
import six
from pandas.core.computation.eval import eval
from pandas.core.common import apply_if_callable, is_bool_indexer
from pandas.core.computation.eval import eval
from pandas.core.dtypes.common import is_list_like
from pandas.core.indexing import check_bool_indexer
from pandas.io.common import _expand_user, _stringify_path
@ -17,8 +17,8 @@ from pandas.io.formats.printing import pprint_thing
import eland.plotting as gfx
from eland import NDFrame
from eland import Series
from eland.filter import BooleanFilter, ScriptFilter
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
from eland.filter import BooleanFilter
class DataFrame(NDFrame):
@ -35,7 +35,7 @@ class DataFrame(NDFrame):
- elasticsearch-py instance or
- eland.Client instance
index_pattern: str
Elasticsearch index pattern (e.g. 'flights' or 'filebeat-\*')
Elasticsearch index pattern. This can contain wildcards. (e.g. 'flights')
columns: list of str, optional
List of DataFrame columns. A subset of the Elasticsearch index's fields.
index_field: str, optional
@ -76,10 +76,12 @@ class DataFrame(NDFrame):
<BLANKLINE>
[5 rows x 2 columns]
Constructing DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the DataFrame index field
Constructing DataFrame from an Elasticsearch client and an Elasticsearch index, with 'timestamp' as the DataFrame
index field
(TODO - currently index_field must also be a field if not _id)
>>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'], index_field='timestamp')
>>> df = ed.DataFrame(client='localhost', index_pattern='flights', columns=['AvgTicketPrice', 'timestamp'],
... index_field='timestamp')
>>> df.head()
AvgTicketPrice timestamp
2018-01-01T00:00:00 841.265642 2018-01-01 00:00:00
@ -90,6 +92,7 @@ class DataFrame(NDFrame):
<BLANKLINE>
[5 rows x 2 columns]
"""
def __init__(self,
client=None,
index_pattern=None,
@ -310,7 +313,8 @@ class DataFrame(NDFrame):
An alternative approach is to use value_count aggregations. However, they have issues in that:
- They can only be used with aggregatable fields (e.g. keyword not text)
- For list fields they return multiple counts. E.g. tags=['elastic', 'ml'] returns value_count=2 for a single document.
- For list fields they return multiple counts. E.g. tags=['elastic', 'ml'] returns value_count=2 for a
single document.
TODO - add additional pandas.DataFrame.count features
@ -334,6 +338,7 @@ class DataFrame(NDFrame):
return self._query_compiler.count()
def info_es(self):
# noinspection PyPep8
"""
A debug summary of an eland DataFrame internals.
@ -437,10 +442,7 @@ class DataFrame(NDFrame):
if buf is None: # pragma: no cover
buf = sys.stdout
lines = []
lines.append(str(type(self)))
lines.append(self._index_summary())
lines = [str(type(self)), self._index_summary()]
if len(self.columns) == 0:
lines.append('Empty {name}'.format(name=type(self).__name__))
@ -697,7 +699,6 @@ class DataFrame(NDFrame):
return self[key]
raise e
def _getitem(self, key):
"""Get the column specified by key for this DataFrame.
@ -780,7 +781,8 @@ class DataFrame(NDFrame):
else:
self._query_compiler = new_query_compiler
def _reduce_dimension(self, query_compiler):
@staticmethod
def _reduce_dimension(query_compiler):
return Series(query_compiler=query_compiler)
def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
@ -961,7 +963,8 @@ class DataFrame(NDFrame):
raise NotImplementedError("Aggregating via index not currently implemented - needs index transform")
# currently we only support a subset of functions that aggregate columns.
# ['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique']
# ['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile',
# 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique']
if isinstance(func, str):
# wrap in list
func = [func]
@ -1031,6 +1034,7 @@ class DataFrame(NDFrame):
Parameters
----------
key: object
default: default value if not found
Returns
-------
@ -1079,7 +1083,7 @@ class DataFrame(NDFrame):
eland_to_pandas
to_numpy
"""
self.to_numpy()
return self.to_numpy()
def to_numpy(self):
"""
@ -1123,4 +1127,3 @@ class DataFrame(NDFrame):
"This method would scan/scroll the entire Elasticsearch index(s) into memory. "
"If this is explicitly required, and there is sufficient memory, call `ed.eland_to_pandas(ed_df).values`"
)

View File

@ -38,7 +38,7 @@ class Index:
@index_field.setter
def index_field(self, index_field):
if index_field == None or index_field == Index.ID_INDEX_FIELD:
if index_field is None or index_field == Index.ID_INDEX_FIELD:
self._index_field = Index.ID_INDEX_FIELD
self._is_source_field = False
else:

View File

@ -13,7 +13,7 @@ class Mappings:
Attributes
----------
mappings_capabilities: pandas.DataFrame
_mappings_capabilities: pandas.DataFrame
A data frame summarising the capabilities of the index mapping
_source - is top level field (i.e. not a multi-field sub-field)
@ -71,7 +71,7 @@ class Mappings:
# (this massively improves performance of DataFrame.flatten)
self._source_field_pd_dtypes = {}
for field_name in self._mappings_capabilities[self._mappings_capabilities._source == True].index:
for field_name in self._mappings_capabilities[self._mappings_capabilities._source].index:
pd_dtype = self._mappings_capabilities.loc[field_name]['pd_dtype']
self._source_field_pd_dtypes[field_name] = pd_dtype
@ -324,8 +324,7 @@ class Mappings:
}
"""
mappings = {}
mappings['properties'] = {}
mappings = {'properties': {}}
for field_name_name, dtype in dataframe.dtypes.iteritems():
if geo_points is not None and field_name_name in geo_points:
es_dtype = 'geo_point'
@ -453,13 +452,13 @@ class Mappings:
numeric_source_fields: list of str
List of source fields where pd_dtype == (int64 or float64 or bool)
"""
if include_bool == True:
df = self._mappings_capabilities[(self._mappings_capabilities._source == True) &
if include_bool:
df = self._mappings_capabilities[self._mappings_capabilities._source &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))]
else:
df = self._mappings_capabilities[(self._mappings_capabilities._source == True) &
df = self._mappings_capabilities[self._mappings_capabilities._source &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64'))]
# if field_names exists, filter index with field_names
@ -487,7 +486,7 @@ class Mappings:
count_source_fields: int
Number of source fields in mapping
"""
return len(self.source_fields())
return len(self._source_field_pd_dtypes)
def dtypes(self, field_names=None):
"""

View File

@ -31,6 +31,7 @@ from pandas.util._validators import validate_bool_kwarg
from eland import ElandQueryCompiler
class NDFrame:
def __init__(self,
@ -314,7 +315,7 @@ class NDFrame:
dayOfWeek 2.835975
dtype: float64
"""
if numeric_only == False:
if not numeric_only:
raise NotImplementedError("Only mean of numeric fields is implemented")
return self._query_compiler.mean()
@ -348,7 +349,7 @@ class NDFrame:
dayOfWeek 3.703500e+04
dtype: float64
"""
if numeric_only == False:
if not numeric_only:
raise NotImplementedError("Only sum of numeric fields is implemented")
return self._query_compiler.sum()
@ -382,7 +383,7 @@ class NDFrame:
dayOfWeek 0.000000
dtype: float64
"""
if numeric_only == False:
if not numeric_only:
raise NotImplementedError("Only min of numeric fields is implemented")
return self._query_compiler.min()
@ -416,7 +417,7 @@ class NDFrame:
dayOfWeek 6.000000
dtype: float64
"""
if numeric_only == False:
if not numeric_only:
raise NotImplementedError("Only max of numeric fields is implemented")
return self._query_compiler.max()
@ -424,7 +425,8 @@ class NDFrame:
"""
Return cardinality of each field.
**Note we can only do this for aggregatable Elasticsearch fields - (in general) numeric and keyword rather than text fields**
**Note we can only do this for aggregatable Elasticsearch fields - (in general) numeric and keyword
rather than text fields**
This method will try to find aggregatable fields if possible if mapping has::

View File

@ -39,6 +39,7 @@ class Operations:
return "desc"
@staticmethod
def from_string(order):
if order == "asc":
return Operations.SortOrder.ASC
@ -46,7 +47,7 @@ class Operations:
return Operations.SortOrder.DESC
def __init__(self, tasks=None):
if tasks == None:
if tasks is None:
self._tasks = []
else:
self._tasks = tasks
@ -105,7 +106,8 @@ class Operations:
query_params, post_processing = self._resolve_tasks()
# Elasticsearch _count is very efficient and so used to return results here. This means that
# data frames that have restricted size or sort params will not return valid results (_count doesn't support size).
# data frames that have restricted size or sort params will not return valid results
# (_count doesn't support size).
# Longer term we may fall back to pandas, but this may result in loading all index into memory.
if self._size(query_params, post_processing) is not None:
raise NotImplementedError("Requesting count with additional query and processing parameters "
@ -497,10 +499,14 @@ class Operations:
def to_pandas(self, query_compiler):
class PandasDataFrameCollector:
def __init__(self):
self.df = None
def collect(self, df):
self.df = df
def batch_size(self):
@staticmethod
def batch_size():
return None
collector = PandasDataFrameCollector()
@ -528,7 +534,8 @@ class Operations:
self.kwargs['mode'] = 'a'
df.to_csv(**self.kwargs)
def batch_size(self):
@staticmethod
def batch_size():
# By default read 10000 docs to csv
batch_size = 10000
return batch_size
@ -568,8 +575,8 @@ class Operations:
sort=sort_params,
body=body,
_source=field_names)
except:
# Catch ES error and print debug (currently to stdout)
except Exception:
# Catch all ES errors and print debug (currently to stdout)
error = {
'index': query_compiler._index_pattern,
'size': size,
@ -594,7 +601,7 @@ class Operations:
partial_result, df = query_compiler._es_results_to_pandas(es_results, collector.batch_size())
df = self._apply_df_post_processing(df, post_processing)
collector.collect(df)
if partial_result == False:
if not partial_result:
break
else:
partial_result, df = query_compiler._es_results_to_pandas(es_results)
@ -761,7 +768,8 @@ class Operations:
return query_params, post_processing
def _resolve_head(self, item, query_params, post_processing):
@staticmethod
def _resolve_head(item, query_params, post_processing):
# head - sort asc, size n
# |12345-------------|
query_sort_field = item[1][0]
@ -792,7 +800,8 @@ class Operations:
return query_params, post_processing
def _resolve_tail(self, item, query_params, post_processing):
@staticmethod
def _resolve_tail(item, query_params, post_processing):
# tail - sort desc, size n, post-process sort asc
# |-------------12345|
query_sort_field = item[1][0]
@ -802,7 +811,7 @@ class Operations:
# If this is a tail of a tail adjust settings and return
if query_params['query_size'] is not None and \
query_params['query_sort_order'] == query_sort_order and \
post_processing == [('sort_index')]:
post_processing == ['sort_index']:
if query_size < query_params['query_size']:
query_params['query_size'] = query_size
return query_params, post_processing
@ -830,11 +839,12 @@ class Operations:
# reverse sort order
query_params['query_sort_order'] = Operations.SortOrder.reverse(query_sort_order)
post_processing.append(('sort_index'))
post_processing.append('sort_index')
return query_params, post_processing
def _resolve_iloc(self, item, query_params, post_processing):
@staticmethod
def _resolve_iloc(item, query_params, post_processing):
# tail - sort desc, size n, post-process sort asc
# |---4--7-9---------|
@ -854,7 +864,8 @@ class Operations:
return query_params, post_processing
def _resolve_query_ids(self, item, query_params, post_processing):
@staticmethod
def _resolve_query_ids(item, query_params, post_processing):
# task = ('query_ids', ('must_not', items))
must_clause = item[1][0]
ids = item[1][1]
@ -866,7 +877,8 @@ class Operations:
return query_params, post_processing
def _resolve_query_terms(self, item, query_params, post_processing):
@staticmethod
def _resolve_query_terms(item, query_params, post_processing):
# task = ('query_terms', ('must_not', (field, terms)))
must_clause = item[1][0]
field = item[1][1][0]
@ -879,7 +891,8 @@ class Operations:
return query_params, post_processing
def _resolve_boolean_filter(self, item, query_params, post_processing):
@staticmethod
def _resolve_boolean_filter(item, query_params, post_processing):
# task = ('boolean_filter', object)
boolean_filter = item[1]
@ -1000,15 +1013,14 @@ class Operations:
return query_params, post_processing
def _resolve_post_processing_task(self, item, query_params, post_processing):
@staticmethod
def _resolve_post_processing_task(item, query_params, post_processing):
# Just do this in post-processing
if item[0] != 'field_names':
post_processing.append(item)
return query_params, post_processing
def _size(self, query_params, post_processing):
# Shrink wrap code around checking if size parameter is set
size = query_params['query_size'] # can be None
@ -1023,7 +1035,6 @@ class Operations:
# This can return None
return size
def info_es(self, buf):
buf.write("Operations:\n")
buf.write(" tasks: {0}\n".format(self._tasks))
@ -1044,7 +1055,6 @@ class Operations:
buf.write(" body: {0}\n".format(body))
buf.write(" post_processing: {0}\n".format(post_processing))
def update_query(self, boolean_filter):
task = ('boolean_filter', boolean_filter)
self._tasks.append(task)

View File

@ -35,11 +35,8 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
Examples
--------
.. plot::
:context: close-figs
>>> df = ed.DataFrame('localhost', 'flights')
>>> hist = df.select_dtypes(include=[np.number]).hist(figsize=[10,10])
>>> hist = df.select_dtypes(include=[np.number]).hist(figsize=[10,10]) # doctest: +SKIP
"""
# Start with empty pandas data frame derived from
ed_df_bins, ed_df_weights = ed_df._hist(num_bins=bins)

View File

@ -169,4 +169,3 @@ class Query:
def __repr__(self):
return repr(self.to_search_body())

View File

@ -1,5 +1,5 @@
import pandas as pd
import numpy as np
import pandas as pd
from eland import Client
from eland import Index
@ -188,8 +188,10 @@ class ElandQueryCompiler:
}
}
```
TODO - explain how lists are handled (https://www.elastic.co/guide/en/elasticsearch/reference/current/array.html)
TODO - an option here is to use Elasticsearch's multi-field matching instead of pandas treatment of lists (which isn't great)
TODO - explain how lists are handled
(https://www.elastic.co/guide/en/elasticsearch/reference/current/array.html)
TODO - an option here is to use Elasticsearch's multi-field matching instead of pandas treatment of lists
(which isn't great)
NOTE - using lists like this is generally not a good way to use this API
"""
partial_result = False
@ -274,7 +276,8 @@ class ElandQueryCompiler:
elif not is_source_field and type(x) is list:
for a in x:
flatten(a, name)
elif is_source_field == True: # only print source fields from mappings (TODO - not so efficient for large number of fields and filtered mapping)
elif is_source_field: # only print source fields from mappings
# (TODO - not so efficient for large number of fields and filtered mapping)
field_name = name[:-1]
# Coerce types - for now just datetime
@ -292,8 +295,8 @@ class ElandQueryCompiler:
# create lists for this pivot (see notes above)
if field_name in out:
if type(out[field_name]) is not list:
l = [out[field_name]]
out[field_name] = l
field_as_list = [out[field_name]]
out[field_name] = field_as_list
out[field_name].append(x)
else:
out[field_name] = x
@ -524,6 +527,7 @@ class ElandQueryCompiler:
"""
Internal class to deal with column renaming and script_fields
"""
class DisplayNameToFieldNameMapper:
def __init__(self,
field_to_display_names=None,

View File

@ -20,7 +20,6 @@ import warnings
from io import StringIO
import numpy as np
import pandas as pd
from pandas.io.common import _expand_user, _stringify_path
@ -43,7 +42,7 @@ class Series(NDFrame):
A reference to an Elasticsearch python client
index_pattern : str
An Elasticsearch index pattern. This can contain wildcards (e.g. filebeat-\*\).
An Elasticsearch index pattern. This can contain wildcards.
index_field : str
The field to base the series on
@ -201,7 +200,8 @@ class Series(NDFrame):
"""
Return the value counts for the specified field.
**Note we can only do this for aggregatable Elasticsearch fields - (in general) numeric and keyword rather than text fields**
**Note we can only do this for aggregatable Elasticsearch fields - (in general) numeric and keyword
rather than text fields**
TODO - implement remainder of pandas arguments
@ -506,7 +506,6 @@ class Series(NDFrame):
"""
return self._numeric_op(right, _get_method_name())
def __truediv__(self, right):
"""
Return floating division of series and right, element-wise (binary operator truediv).
@ -704,7 +703,7 @@ class Series(NDFrame):
def __pow__(self, right):
"""
Return exponential power of series and right, element-wise (binary operator pow \**\).
Return exponential power of series and right, element-wise (binary operator pow).
Parameters
----------
@ -772,6 +771,7 @@ class Series(NDFrame):
Name: taxful_total_price, dtype: float64
"""
return self._numeric_rop(left, _get_method_name())
def __rtruediv__(self, left):
"""
Return division of series and left, element-wise (binary operator div).
@ -803,6 +803,7 @@ class Series(NDFrame):
Name: taxful_total_price, dtype: float64
"""
return self._numeric_rop(left, _get_method_name())
def __rfloordiv__(self, left):
"""
Return integer division of series and left, element-wise (binary operator floordiv //).
@ -834,6 +835,7 @@ class Series(NDFrame):
Name: taxful_total_price, dtype: float64
"""
return self._numeric_rop(left, _get_method_name())
def __rmod__(self, left):
"""
Return modulo of series and left, element-wise (binary operator mod %).
@ -865,6 +867,7 @@ class Series(NDFrame):
Name: taxful_total_price, dtype: float64
"""
return self._numeric_rop(left, _get_method_name())
def __rmul__(self, left):
"""
Return multiplication of series and left, element-wise (binary operator mul).
@ -896,9 +899,10 @@ class Series(NDFrame):
Name: taxful_total_price, dtype: float64
"""
return self._numeric_rop(left, _get_method_name())
def __rpow__(self, left):
"""
Return exponential power of series and left, element-wise (binary operator pow \**\).
Return exponential power of series and left, element-wise (binary operator pow).
Parameters
----------
@ -927,6 +931,7 @@ class Series(NDFrame):
Name: total_quantity, dtype: float64
"""
return self._numeric_rop(left, _get_method_name())
def __rsub__(self, left):
"""
Return subtraction of series and left, element-wise (binary operator sub).

View File

@ -1,23 +0,0 @@
https://docs.google.com/presentation/d/1A3S5aIJC8SuEbi80PhEzyxTUNMjWJ7-_Om92yU9p3yo/edit#slide=id.g5f8a4bcb09_0_3
https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
https://nbviewer.jupyter.org/github/parente/nbestimate/blob/master/estimate.ipynb
https://stackoverflow.blog/2017/09/14/python-growing-quickly/
https://github.com/elastic/eland
http://localhost:8889/notebooks/eland/tests/demo_day_20190815.ipynb
http://localhost:5601/app/kibana#/dev_tools/console?_g=()
devtool console:
```
GET _cat/indices
# Clean demo
DELETE ed_jetbeats_routes
# Demo day schema
GET flights
GET flights/_search
GET ed_jetbeats_routes
GET ed_jetbeats_routes/_search
```

View File

@ -4,8 +4,6 @@ from elasticsearch import Elasticsearch
import eland as ed
from eland.tests.common import TestData
import pytest
class TestClientEq(TestData):

View File

@ -4,7 +4,6 @@ from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
import pandas as pd
class TestDataFrameCount(TestData):

View File

@ -1,7 +1,6 @@
# File called _pytest for PyCharm compatibility
import numpy as np
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
@ -17,7 +16,7 @@ class TestDataFrameDtypes(TestData):
assert_series_equal(pd_flights.dtypes, ed_flights.dtypes)
for i in range(0, len(pd_flights.dtypes) - 1):
assert type(pd_flights.dtypes[i]) == type(ed_flights.dtypes[i])
assert isinstance(pd_flights.dtypes[i], type(ed_flights.dtypes[i]))
def test_flights_select_dtypes(self):
ed_flights = self.ed_flights_small()

View File

@ -1,12 +1,12 @@
# File called _pytest for PyCharm compatibility
import eland as ed
import pytest
import eland as ed
from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME
class TestDataFrameInit:
def test_init(self):
@ -28,4 +28,3 @@ class TestDataFrameInit:
qc = ed.ElandQueryCompiler(client=ELASTICSEARCH_HOST, index_pattern=FLIGHTS_INDEX_NAME)
df2 = ed.DataFrame(query_compiler=qc)

View File

@ -1,9 +1,9 @@
# File called _pytest for PyCharm compatibility
from eland.tests.common import TestData
from pandas.testing import assert_index_equal
from eland.tests.common import TestData
class TestDataFrameKeys(TestData):

View File

@ -4,11 +4,8 @@ from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
import eland as ed
class TestDataFrameMetrics(TestData):
funcs = ['max', 'min', 'mean', 'sum']
def test_flights_metrics(self):
@ -29,7 +26,8 @@ class TestDataFrameMetrics(TestData):
ed_ecommerce = self.ed_ecommerce()[columns]
for func in self.funcs:
assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True),
assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True),
getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True)
def test_ecommerce_selected_mixed_numeric_source_fields(self):
@ -41,10 +39,10 @@ class TestDataFrameMetrics(TestData):
ed_ecommerce = self.ed_ecommerce()[columns]
for func in self.funcs:
assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True),
assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True),
getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True)
def test_ecommerce_selected_all_numeric_source_fields(self):
# All of these are numeric
columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price']
@ -53,5 +51,6 @@ class TestDataFrameMetrics(TestData):
ed_ecommerce = self.ed_ecommerce()[columns]
for func in self.funcs:
assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True),
assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True),
getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True)

View File

@ -1,5 +1,4 @@
# File called _pytest for PyCharm compatibility
import pandas as pd
from pandas.util.testing import assert_series_equal

View File

@ -47,8 +47,7 @@ class TestDataFrameQuery(TestData):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
assert pd_flights.query('FlightDelayMin > 60').shape == \
ed_flights.query('FlightDelayMin > 60').shape
assert pd_flights.query('FlightDelayMin > 60').shape == ed_flights.query('FlightDelayMin > 60').shape
def test_isin_query(self):
ed_flights = self.ed_flights()

View File

@ -1,12 +1,10 @@
# File called _pytest for PyCharm compatibility
import pandas as pd
import pytest
import pandas as pd
from eland.tests.common import TestData
from eland.dataframe import DEFAULT_NUM_ROWS_DISPLAYED
from eland.tests.common import TestData
class TestDataFrameRepr(TestData):
@ -19,6 +17,7 @@ class TestDataFrameRepr(TestData):
"""
to_string
"""
def test_num_rows_to_string(self):
# check setup works
assert pd.get_option('display.max_rows') == 60
@ -64,6 +63,7 @@ class TestDataFrameRepr(TestData):
"""
repr
"""
def test_num_rows_repr(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
@ -103,6 +103,7 @@ class TestDataFrameRepr(TestData):
"""
to_html
"""
def test_num_rows_to_html(self):
# check setup works
assert pd.get_option('display.max_rows') == 60
@ -145,10 +146,10 @@ class TestDataFrameRepr(TestData):
assert ed_ecom_h == pd_ecom_h
"""
_repr_html_
"""
def test_num_rows_repr_html(self):
# check setup works
assert pd.get_option('display.max_rows') == 60
@ -186,7 +187,6 @@ class TestDataFrameRepr(TestData):
assert pd_head_str == ed_head_str
def test_empty_dataframe_repr_html(self):
# TODO - there is a bug in 'show_dimensions' as it gets added after the last </div>
# For now test without this
show_dimensions = pd.get_option('display.show_dimensions')

View File

@ -3,20 +3,15 @@
import ast
import time
import eland as ed
from elasticsearch import Elasticsearch
import pandas as pd
from elasticsearch import Elasticsearch
from pandas.util.testing import assert_frame_equal
from eland.tests.common import ROOT_DIR
from eland.tests.common import TestData
import eland as ed
from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME
from eland.tests.common import assert_pandas_eland_frame_equal
from eland.tests.common import ROOT_DIR
from eland.tests.common import TestData
class TestDataFrameToCSV(TestData):

File diff suppressed because one or more lines are too long

View File

@ -2,8 +2,6 @@
import numpy as np
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
@ -32,7 +30,8 @@ class TestMappingsNumericSourceFields(TestData):
ed_ecommerce = self.ed_ecommerce()[field_names]
pd_ecommerce = self.pd_ecommerce()[field_names]
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names, include_bool=False)
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names,
include_bool=False)
pd_numeric = pd_ecommerce.select_dtypes(include=np.number)
assert pd_numeric.columns.to_list() == ed_numeric
@ -53,7 +52,8 @@ class TestMappingsNumericSourceFields(TestData):
ed_ecommerce = self.ed_ecommerce()[field_names]
pd_ecommerce = self.pd_ecommerce()[field_names]
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names, include_bool=False)
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names,
include_bool=False)
pd_numeric = pd_ecommerce.select_dtypes(include=np.number)
assert pd_numeric.columns.to_list() == ed_numeric
@ -71,7 +71,8 @@ class TestMappingsNumericSourceFields(TestData):
ed_ecommerce = self.ed_ecommerce()[field_names]
pd_ecommerce = self.pd_ecommerce()[field_names]
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names, include_bool=False)
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(field_names=field_names,
include_bool=False)
pd_numeric = pd_ecommerce.select_dtypes(include=np.number)
assert pd_numeric.columns.to_list() == ed_numeric

View File

@ -2,7 +2,7 @@
from eland.filter import *
class TestOperators():
class TestOperators:
def test_leaf_boolean_filter(self):
assert GreaterEqual('a', 2).build() == {"range": {"a": {"gte": 2}}}
assert LessEqual('a', 2).build() == {"range": {"a": {"lte": 2}}}

View File

@ -1,7 +1,6 @@
# File called _pytest for PyCharm compatibility
import pytest
from matplotlib.testing.decorators import check_figures_equal
from eland.tests.common import TestData
@ -14,12 +13,14 @@ def test_plot_hist(fig_test, fig_ref):
pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
# This throws a userwarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
# This throws a userwarning
# (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
with pytest.warns(UserWarning):
pd_ax = fig_ref.subplots()
pd_flights.hist(ax=pd_ax)
# This throws a userwarning (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
# This throws a userwarning
# (https://github.com/pandas-dev/pandas/blob/171c71611886aab8549a8620c5b0071a129ad685/pandas/plotting/_matplotlib/tools.py#L222)
with pytest.warns(UserWarning):
ed_ax = fig_test.subplots()
ed_flights.hist(ax=ed_ax)

View File

@ -1,7 +1,4 @@
# File called _pytest for PyCharm compatibility
import pandas as pd
from pandas.util.testing import assert_series_equal
from eland import ElandQueryCompiler
from eland.tests.common import TestData

View File

@ -1,7 +1,6 @@
# File called _pytest for PyCharm compatibility
import pytest
import numpy as np
import pytest
from eland.tests.common import TestData, assert_pandas_eland_series_equal

View File

@ -1,11 +1,7 @@
# File called _pytest for PyCharm compatibility
from pandas.util.testing import assert_almost_equal
from eland.tests.common import TestData
import eland as ed
class TestSeriesInfoEs(TestData):
@ -14,4 +10,3 @@ class TestSeriesInfoEs(TestData):
# No assertion, just test it can be called
info_es = ed_flights.info_es()

View File

@ -4,11 +4,8 @@ from pandas.util.testing import assert_almost_equal
from eland.tests.common import TestData
import eland as ed
class TestSeriesMetrics(TestData):
funcs = ['max', 'min', 'mean', 'sum']
def test_flights_metrics(self):
@ -30,7 +27,6 @@ class TestSeriesMetrics(TestData):
ed_metric = getattr(ed_ecommerce, func)()
assert ed_metric.empty
def test_ecommerce_selected_all_numeric_source_fields(self):
# All of these are numeric
columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price']

View File

@ -27,6 +27,3 @@ class TestSeriesName(TestData):
assert_pandas_eland_series_equal(pd_series, ed_series)
assert ed_series.name == pd_series.name

View File

@ -18,6 +18,3 @@ class TestSeriesRename(TestData):
ed_renamed = ed_carrier.rename("renamed")
assert_pandas_eland_series_equal(pd_renamed, ed_renamed)

View File

@ -1,8 +1,7 @@
# File called _pytest for PyCharm compatibility
import eland as ed
import pandas as pd
from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME, ECOMMERCE_INDEX_NAME
from eland.tests import FLIGHTS_INDEX_NAME
from eland.tests.common import TestData

View File

@ -1,8 +1,8 @@
# File called _pytest for PyCharm compatibility
import eland as ed
from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal
import pytest
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
class TestSeriesValueCounts(TestData):

View File

@ -1,9 +1,8 @@
import pandas as pd
import csv
import pandas as pd
from pandas.io.parsers import _c_parser_defaults
from eland import Client
from eland import DataFrame
from eland import Mappings
@ -339,4 +338,3 @@ def read_csv(filepath_or_buffer,
ed_df = DataFrame(client, es_dest_index)
return ed_df

View File

@ -4,6 +4,7 @@ import csv
from elasticsearch import Elasticsearch, helpers
from elasticsearch.exceptions import TransportError
def create_index(es, index):
mapping = {
"mappings": {
@ -30,6 +31,7 @@ def create_index(es, index):
else:
raise
def parse_date(date):
"""
we need to convert dates to conform to the mapping in the following way:
@ -55,6 +57,7 @@ def parse_date(date):
return date
def parse_line(line):
"""
creates the document to be indexed
@ -72,6 +75,7 @@ def parse_line(line):
return obj
def load_data(es):
"""
generate one document per line of online-retail.csv