Major cleanup - removed modin as dependency

Removed modin as a dependency. The iloc feature is also
removed for now - TODO: add it back in.
This commit is contained in:
Stephen Dodson 2019-11-04 13:13:42 +00:00
parent 9dad8613d3
commit c1ee409a33
46 changed files with 8593 additions and 882 deletions

View File

@ -1,19 +1,14 @@
from __future__ import absolute_import from __future__ import absolute_import
import os
# Set modin to pandas to avoid starting ray or other
os.environ["MODIN_ENGINE"] = 'python'
os.environ["MODIN_BACKEND"] = 'pandas'
from eland.client import * from eland.client import *
from eland.dataframe import *
from eland.filter import *
from eland.index import * from eland.index import *
from eland.mappings import * from eland.mappings import *
from eland.filter import *
from eland.query import *
from eland.operations import *
from eland.query_compiler import *
from eland.plotting import *
from eland.ndframe import * from eland.ndframe import *
from eland.operations import *
from eland.plotting import *
from eland.query import *
from eland.query_compiler import *
from eland.series import * from eland.series import *
from eland.dataframe import *
from eland.utils import * from eland.utils import *

View File

@ -1,10 +1,12 @@
from elasticsearch import Elasticsearch from elasticsearch import Elasticsearch
from elasticsearch import helpers from elasticsearch import helpers
class Client: class Client:
""" """
eland client - implemented as facade to control access to Elasticsearch methods eland client - implemented as facade to control access to Elasticsearch methods
""" """
def __init__(self, es=None): def __init__(self, es=None):
if isinstance(es, Elasticsearch): if isinstance(es, Elasticsearch):
self._es = es self._es = es
@ -40,4 +42,3 @@ class Client:
def count(self, **kwargs): def count(self, **kwargs):
count_json = self._es.count(**kwargs) count_json = self._es.count(**kwargs)
return count_json['count'] return count_json['count']

View File

@ -1,16 +1,15 @@
import sys import sys
import warnings import warnings
from distutils.version import LooseVersion from distutils.version import LooseVersion
from io import StringIO
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import pandas.compat as compat
import six import six
from io import StringIO
from pandas.core.common import apply_if_callable, is_bool_indexer from pandas.core.common import apply_if_callable, is_bool_indexer
from pandas.core.dtypes.common import ( from pandas.core.dtypes.common import is_list_like
is_list_like from pandas.core.indexing import check_bool_indexer
)
from pandas.io.common import _expand_user, _stringify_path from pandas.io.common import _expand_user, _stringify_path
from pandas.io.formats import console from pandas.io.formats import console
from pandas.io.formats import format as fmt from pandas.io.formats import format as fmt
@ -58,10 +57,10 @@ class DataFrame(NDFrame):
return len(self.columns) == 0 or len(self.index) == 0 return len(self.columns) == 0 or len(self.index) == 0
def head(self, n=5): def head(self, n=5):
return super().head(n) return DataFrame(query_compiler=self._query_compiler.head(n))
def tail(self, n=5): def tail(self, n=5):
return super().tail(n) return DataFrame(query_compiler=self._query_compiler.tail(n))
def __repr__(self): def __repr__(self):
""" """
@ -104,7 +103,7 @@ class DataFrame(NDFrame):
return None return None
if self._info_repr(): if self._info_repr():
buf = StringIO(u("")) buf = StringIO()
self.info(buf=buf) self.info(buf=buf)
# need to escape the <class>, should be the first line. # need to escape the <class>, should be the first line.
val = buf.getvalue().replace('<', r'&lt;', 1) val = buf.getvalue().replace('<', r'&lt;', 1)
@ -509,7 +508,7 @@ class DataFrame(NDFrame):
return self.columns return self.columns
def groupby(self, by=None, axis=0, *args, **kwargs): def groupby(self, by=None, axis=0, *args, **kwargs):
axis = self._get_axis_number(axis) axis = pd.DataFrame._get_axis_number(axis)
if axis == 1: if axis == 1:
raise NotImplementedError("Aggregating via index not currently implemented - needs index transform") raise NotImplementedError("Aggregating via index not currently implemented - needs index transform")
@ -544,7 +543,7 @@ class DataFrame(NDFrame):
if Series.agg is called with single function, returns a scalar if Series.agg is called with single function, returns a scalar
if Series.agg is called with several functions, returns a Series if Series.agg is called with several functions, returns a Series
""" """
axis = self._get_axis_number(axis) axis = pd.DataFrame._get_axis_number(axis)
if axis == 1: if axis == 1:
raise NotImplementedError("Aggregating via index not currently implemented - needs index transform") raise NotImplementedError("Aggregating via index not currently implemented - needs index transform")
@ -579,3 +578,20 @@ class DataFrame(NDFrame):
) )
else: else:
raise NotImplementedError(expr, type(expr)) raise NotImplementedError(expr, type(expr))
def get(self, key, default=None):
"""Get item from object for given key (DataFrame column, Panel
slice, etc.). Returns default value if not found.
Args:
key (DataFrame column, Panel slice) : the key for which value
to get
Returns:
value (type of items contained in object) : A value that is
stored at the key
"""
if key in self.keys():
return self._getitem(key)
else:
return default

View File

@ -1,7 +1,7 @@
# Derived from pandasticsearch filters # Derived from pandasticsearch filters
# Es filter builder for BooleanCond # Es filter builder for BooleanCond
class BooleanFilter(object): class BooleanFilter:
def __init__(self, *args): def __init__(self, *args):
self._filter = None self._filter = None

View File

@ -14,6 +14,8 @@ In case sorting or aggregating on the _id field is required, it is advised to du
the content of the _id field in another field that has doc_values enabled.) the content of the _id field in another field that has doc_values enabled.)
""" """
class Index: class Index:
ID_INDEX_FIELD = '_id' ID_INDEX_FIELD = '_id'
ID_SORT_FIELD = '_doc' # if index field is _id, sort by _doc ID_SORT_FIELD = '_doc' # if index field is _id, sort by _doc

View File

@ -75,6 +75,7 @@ class Mappings:
pd_dtype = self._mappings_capabilities.loc[field_name]['pd_dtype'] pd_dtype = self._mappings_capabilities.loc[field_name]['pd_dtype']
self._source_field_pd_dtypes[field_name] = pd_dtype self._source_field_pd_dtypes[field_name] = pd_dtype
@staticmethod
def _extract_fields_from_mapping(mappings, source_only=False): def _extract_fields_from_mapping(mappings, source_only=False):
""" """
Extract all field names and types from a mapping. Extract all field names and types from a mapping.
@ -151,6 +152,7 @@ class Mappings:
return fields return fields
@staticmethod
def _create_capability_matrix(all_fields, source_fields, all_fields_caps): def _create_capability_matrix(all_fields, source_fields, all_fields_caps):
""" """
{ {
@ -414,15 +416,27 @@ class Mappings:
List of source fields where pd_dtype == (int64 or float64 or bool) List of source fields where pd_dtype == (int64 or float64 or bool)
""" """
if columns is not None: if columns is not None:
if include_bool == True:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) & return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') | ((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') | (self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))].loc[columns].index.tolist() (self._mappings_capabilities.pd_dtype == 'bool'))].loc[
columns].index.tolist()
else: else:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64'))].loc[
columns].index.tolist()
else:
if include_bool == True:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) & return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') | ((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') | (self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist() (self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist()
else:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
def source_fields(self): def source_fields(self):
""" """

View File

@ -26,15 +26,13 @@ only Elasticsearch aggregatable fields can be aggregated or grouped.
import sys import sys
import pandas as pd import pandas as pd
from modin.pandas.base import BasePandasDataset
from modin.pandas.indexing import _iLocIndexer
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.common import is_list_like
from pandas.util._validators import validate_bool_kwarg
from eland import ElandQueryCompiler from eland import ElandQueryCompiler
class NDFrame(BasePandasDataset): class NDFrame:
def __init__(self, def __init__(self,
client=None, client=None,
@ -85,6 +83,9 @@ class NDFrame(BasePandasDataset):
return head.append(tail) return head.append(tail)
def __getitem__(self, key):
return self._getitem(key)
def __getattr__(self, key): def __getattr__(self, key):
"""After regular attribute access, looks up the name in the columns """After regular attribute access, looks up the name in the columns
@ -105,6 +106,14 @@ class NDFrame(BasePandasDataset):
# Don't default to pandas, just return approximation TODO - make this more accurate # Don't default to pandas, just return approximation TODO - make this more accurate
return sys.getsizeof(self._query_compiler) return sys.getsizeof(self._query_compiler)
def __len__(self):
"""Gets the length of the DataFrame.
Returns:
Returns an integer length of the DataFrame object.
"""
return len(self.index)
@property @property
def iloc(self): def iloc(self):
"""Purely integer-location based indexing for selection by position. """Purely integer-location based indexing for selection by position.
@ -235,21 +244,3 @@ class NDFrame(BasePandasDataset):
def describe(self): def describe(self):
return self._query_compiler.describe() return self._query_compiler.describe()
def get(self, key, default=None):
"""Get item from object for given key (DataFrame column, Panel
slice, etc.). Returns default value if not found.
Args:
key (DataFrame column, Panel slice) : the key for which value
to get
Returns:
value (type of items contained in object) : A value that is
stored at the key
"""
if key in self.keys():
return self.__getitem__(key)
else:
return default

View File

@ -1,9 +1,7 @@
import copy import copy
from enum import Enum from enum import Enum
from io import StringIO
import pandas as pd import pandas as pd
import numpy as np
from eland import Index from eland import Index
from eland import Query from eland import Query
@ -410,7 +408,7 @@ class Operations:
columns = self.get_columns() columns = self.get_columns()
numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns) numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns, include_bool=False)
# for each field we compute: # for each field we compute:
# count, mean, std, min, 25%, 50%, 75%, max # count, mean, std, min, 25%, 50%, 75%, max
@ -450,6 +448,7 @@ class Operations:
class PandasDataFrameCollector: class PandasDataFrameCollector:
def collect(self, df): def collect(self, df):
self.df = df self.df = df
def batch_size(self): def batch_size(self):
return None return None
@ -465,6 +464,7 @@ class Operations:
self.kwargs = kwargs self.kwargs = kwargs
self.ret = None self.ret = None
self.first_time = True self.first_time = True
def collect(self, df): def collect(self, df):
# If this is the first time we collect results, then write header, otherwise don't write header # If this is the first time we collect results, then write header, otherwise don't write header
# and append results # and append results

View File

@ -3,6 +3,7 @@ from copy import deepcopy
from eland.filter import BooleanFilter, NotNull, IsNull, IsIn from eland.filter import BooleanFilter, NotNull, IsNull, IsIn
class Query: class Query:
""" """
Simple class to manage building Elasticsearch queries. Simple class to manage building Elasticsearch queries.

View File

@ -1,20 +1,15 @@
import pandas as pd import pandas as pd
from modin.backends.base.query_compiler import BaseQueryCompiler from pandas.core.dtypes.common import (
is_list_like
)
from eland import Client from eland import Client
from eland import Index from eland import Index
from eland import Mappings from eland import Mappings
from eland import Operations from eland import Operations
from pandas.core.dtypes.common import (
is_list_like
)
from pandas.core.indexes.numeric import Int64Index class ElandQueryCompiler:
from pandas.core.indexes.range import RangeIndex
class ElandQueryCompiler(BaseQueryCompiler):
""" """
Some notes on what can and can not be mapped: Some notes on what can and can not be mapped:
@ -318,7 +313,7 @@ class ElandQueryCompiler(BaseQueryCompiler):
return df return df
def copy(self): def copy(self):
return self.__constructor__( return ElandQueryCompiler(
client=self._client, client=self._client,
index_pattern=self._index_pattern, index_pattern=self._index_pattern,
columns=None, # columns are embedded in operations columns=None, # columns are embedded in operations
@ -412,14 +407,19 @@ class ElandQueryCompiler(BaseQueryCompiler):
def count(self): def count(self):
return self._operations.count(self) return self._operations.count(self)
def mean(self): def mean(self):
return self._operations.mean(self) return self._operations.mean(self)
def sum(self): def sum(self):
return self._operations.sum(self) return self._operations.sum(self)
def min(self): def min(self):
return self._operations.min(self) return self._operations.min(self)
def max(self): def max(self):
return self._operations.max(self) return self._operations.max(self)
def nunique(self): def nunique(self):
return self._operations.nunique(self) return self._operations.nunique(self)
@ -472,5 +472,3 @@ class ElandQueryCompiler(BaseQueryCompiler):
return result return result
# def isna(self): # def isna(self):

View File

@ -101,10 +101,10 @@ class Series(NDFrame):
name = property(_get_name) name = property(_get_name)
def head(self, n=5): def head(self, n=5):
return super().head(n) return Series(query_compiler=self._query_compiler.head(n))
def tail(self, n=5): def tail(self, n=5):
return super().tail(n) return Series(query_compiler=self._query_compiler.tail(n))
# ---------------------------------------------------------------------- # ----------------------------------------------------------------------
# Rendering Methods # Rendering Methods
@ -194,7 +194,6 @@ class Series(NDFrame):
else: else:
raise NotImplementedError(other, type(other)) raise NotImplementedError(other, type(other))
def __eq__(self, other): def __eq__(self, other):
if isinstance(other, Series): if isinstance(other, Series):
# Need to use scripted query to compare to values # Need to use scripted query to compare to values

File diff suppressed because one or more lines are too long

View File

@ -1,12 +1,9 @@
import os import os
import pandas as pd import pandas as pd
ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
# Set modin to pandas to avoid starting ray or other
os.environ["MODIN_ENGINE"] = 'python'
os.environ["MODIN_BACKEND"] = 'pandas'
# Define test files and indices # Define test files and indices
ELASTICSEARCH_HOST = 'localhost' # TODO externalise this ELASTICSEARCH_HOST = 'localhost' # TODO externalise this
@ -491,4 +488,3 @@ TEST_NESTED_USER_GROUP_DOCS = [
'_source': {'group': 'new york', 'user': [ '_source': {'group': 'new york', 'user': [
{'first': 'Bill', 'last': 'Jones'}]}} {'first': 'Bill', 'last': 'Jones'}]}}
] ]

View File

@ -1,11 +1,9 @@
import pytest import os
import eland as ed
import pandas as pd import pandas as pd
from pandas.util.testing import (assert_frame_equal, assert_series_equal) from pandas.util.testing import (assert_frame_equal, assert_series_equal)
import os import eland as ed
ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
@ -34,6 +32,7 @@ _pd_ecommerce.index = _pd_ecommerce.index.map(str) # make index 'object' not int
_pd_ecommerce['customer_birth_date'].astype('datetime64') _pd_ecommerce['customer_birth_date'].astype('datetime64')
_ed_ecommerce = ed.read_es(ELASTICSEARCH_HOST, ECOMMERCE_INDEX_NAME) _ed_ecommerce = ed.read_es(ELASTICSEARCH_HOST, ECOMMERCE_INDEX_NAME)
class TestData: class TestData:
def pd_flights(self): def pd_flights(self):
@ -48,13 +47,13 @@ class TestData:
def ed_flights_small(self): def ed_flights_small(self):
return _ed_flights_small return _ed_flights_small
def pd_ecommerce(self): def pd_ecommerce(self):
return _pd_ecommerce return _pd_ecommerce
def ed_ecommerce(self): def ed_ecommerce(self):
return _ed_ecommerce return _ed_ecommerce
def assert_pandas_eland_frame_equal(left, right): def assert_pandas_eland_frame_equal(left, right):
if not isinstance(left, pd.DataFrame): if not isinstance(left, pd.DataFrame):
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format( raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
@ -67,6 +66,7 @@ def assert_pandas_eland_frame_equal(left, right):
# Use pandas tests to check similarity # Use pandas tests to check similarity
assert_frame_equal(left, right._to_pandas()) assert_frame_equal(left, right._to_pandas())
def assert_eland_frame_equal(left, right): def assert_eland_frame_equal(left, right):
if not isinstance(left, ed.DataFrame): if not isinstance(left, ed.DataFrame):
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format( raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
@ -91,4 +91,3 @@ def assert_pandas_eland_series_equal(left, right):
# Use pandas tests to check similarity # Use pandas tests to check similarity
assert_series_equal(left, right._to_pandas()) assert_series_equal(left, right._to_pandas())

View File

@ -1,15 +1,14 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import numpy as np import numpy as np
import pandas as pd from pandas.util.testing import assert_almost_equal
from pandas.util.testing import (assert_almost_equal)
from eland.tests.common import TestData from eland.tests.common import TestData
class TestDataFrameAggs(TestData): class TestDataFrameAggs(TestData):
def test_to_aggs1(self): def test_basic_aggs(self):
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
ed_flights = self.ed_flights() ed_flights = self.ed_flights()

View File

@ -1,19 +1,17 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData from eland.tests.common import TestData
class TestDataFrameCount(TestData): class TestDataFrameCount(TestData):
def test_to_count1(self): def test_ecommerce_count(self):
pd_ecommerce = self.pd_ecommerce() pd_ecommerce = self.pd_ecommerce()
ed_ecommerce = self.ed_ecommerce() ed_ecommerce = self.ed_ecommerce()
pd_count = pd_ecommerce.count() pd_count = pd_ecommerce.count()
ed_count = ed_ecommerce.count() ed_count = ed_ecommerce.count()
print(pd_count) assert_series_equal(pd_count, ed_count)
print(ed_count)

View File

@ -6,6 +6,7 @@ import pandas as pd
import eland as ed import eland as ed
from eland.tests.common import ELASTICSEARCH_HOST from eland.tests.common import ELASTICSEARCH_HOST
from eland.tests.common import TestData from eland.tests.common import TestData
from eland.tests.common import assert_pandas_eland_frame_equal
class TestDataFrameDateTime(TestData): class TestDataFrameDateTime(TestData):
@ -41,4 +42,4 @@ class TestDataFrameDateTime(TestData):
ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name) ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name)
ed_df_head = ed_df.head() ed_df_head = ed_df.head()
# assert_frame_equal(df, ed_df_head) assert_pandas_eland_frame_equal(df, ed_df_head)

View File

@ -1,35 +1,34 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
from io import StringIO
from pandas.util.testing import assert_almost_equal
from eland.tests.common import TestData from eland.tests.common import TestData
class TestDataFrameDescribe(TestData): class TestDataFrameDescribe(TestData):
def test_to_describe1(self): def test_flights_describe(self):
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_describe = pd_flights.describe() pd_describe = pd_flights.describe()
ed_describe = ed_flights.describe() ed_describe = ed_flights.describe()
print(pd_describe) assert_almost_equal(pd_describe[['AvgTicketPrice']],
print(ed_describe) ed_describe[['AvgTicketPrice']],
check_less_precise=True)
# TODO - this fails now as ES aggregations are approximate # TODO - this fails for all fields now as ES aggregations are approximate
# if ES percentile agg uses # if ES percentile agg uses
# "hdr": { # "hdr": {
# "number_of_significant_value_digits": 3 # "number_of_significant_value_digits": 3
# } # }
# this works # this works
# assert_almost_equal(pd_flights_describe, ed_flights_describe)
pd_ecommerce_describe = self.pd_ecommerce().describe()
ed_ecommerce_describe = self.ed_ecommerce().describe()
# pd_ecommerce_describe = self.pd_ecommerce().describe()
# ed_ecommerce_describe = self.ed_ecommerce().describe()
# We don't compare ecommerce here as the default dtypes in pandas from read_json # We don't compare ecommerce here as the default dtypes in pandas from read_json
# don't match the mapping types. This is mainly because the products field is # don't match the mapping types. This is mainly because the products field is
# nested and so can be treated as a multi-field in ES, but not in pandas # nested and so can be treated as a multi-field in ES, but not in pandas
# We can not also run 'describe' on a truncate ed dataframe # We can not also run 'describe' on a truncate ed dataframe

View File

@ -1,19 +1,14 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import pandas as pd
import eland as ed
from eland.tests.common import TestData from eland.tests.common import TestData
from eland.tests.common import ( from eland.tests.common import (
assert_eland_frame_equal, assert_pandas_eland_frame_equal
assert_pandas_eland_frame_equal,
assert_pandas_eland_series_equal
) )
import numpy as np
class TestDataFrameDrop(TestData): class TestDataFrameDrop(TestData):
def test_drop1(self): def test_flights_small_drop(self):
ed_flights_small = self.ed_flights_small() ed_flights_small = self.ed_flights_small()
pd_flights_small = self.pd_flights_small() pd_flights_small = self.pd_flights_small()

View File

@ -0,0 +1,14 @@
# File called _pytest for PyCharm compatability
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
class TestDataFrameDtypes(TestData):
def test_flights_dtypes(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
assert_series_equal(pd_flights.dtypes, ed_flights.dtypes)

View File

@ -1,18 +1,11 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import pandas as pd
import eland as ed
from eland.tests.common import TestData from eland.tests.common import TestData
from eland.tests.common import (
assert_pandas_eland_frame_equal,
assert_pandas_eland_series_equal
)
import numpy as np
class TestDataFrameGet(TestData): class TestDataFrameGet(TestData):
def test_get1(self): def test_get_one_attribute(self):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights() pd_flights = self.pd_flights()

View File

@ -1,5 +1,4 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import pandas as pd
from eland.tests.common import TestData from eland.tests.common import TestData
from eland.tests.common import ( from eland.tests.common import (
@ -8,10 +7,9 @@ from eland.tests.common import (
) )
class TestDataFrameGetItem(TestData): class TestDataFrameGetItem(TestData):
def test_getitem1(self): def test_getitem_one_attribute(self):
ed_flights = self.ed_flights().head(103) ed_flights = self.ed_flights().head(103)
pd_flights = self.pd_flights().head(103) pd_flights = self.pd_flights().head(103)
@ -20,7 +18,7 @@ class TestDataFrameGetItem(TestData):
assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID) assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID)
def test_getitem2(self): def test_getitem_attribute_list(self):
ed_flights = self.ed_flights().head(42) ed_flights = self.ed_flights().head(42)
pd_flights = self.pd_flights().head(42) pd_flights = self.pd_flights().head(42)
@ -29,7 +27,7 @@ class TestDataFrameGetItem(TestData):
assert_pandas_eland_frame_equal(pd_flights_slice, ed_flights_slice) assert_pandas_eland_frame_equal(pd_flights_slice, ed_flights_slice)
def test_getitem3(self): def test_getitem_one_argument(self):
ed_flights = self.ed_flights().head(89) ed_flights = self.ed_flights().head(89)
pd_flights = self.pd_flights().head(89) pd_flights = self.pd_flights().head(89)
@ -38,7 +36,7 @@ class TestDataFrameGetItem(TestData):
assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID) assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID)
def test_getitem4(self): def test_getitem_multiple_calls(self):
ed_flights = self.ed_flights().head(89) ed_flights = self.ed_flights().head(89)
pd_flights = self.pd_flights().head(89) pd_flights = self.pd_flights().head(89)
@ -52,4 +50,3 @@ class TestDataFrameGetItem(TestData):
ed_col1 = ed_col0['DestCountry'] ed_col1 = ed_col0['DestCountry']
assert_pandas_eland_series_equal(pd_col1, ed_col1) assert_pandas_eland_series_equal(pd_col1, ed_col1)

View File

@ -1,11 +1,9 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import pandas as pd
from eland.tests.common import TestData from eland.tests.common import TestData
from eland.tests.common import assert_pandas_eland_frame_equal from eland.tests.common import assert_pandas_eland_frame_equal
class TestDataFrameHeadTail(TestData): class TestDataFrameHeadTail(TestData):
def test_head(self): def test_head(self):

View File

@ -1,6 +1,5 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
import matplotlib.pyplot as plt
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from pandas.util.testing import assert_almost_equal from pandas.util.testing import assert_almost_equal
@ -10,7 +9,7 @@ from eland.tests.common import TestData
class TestDataFrameHist(TestData): class TestDataFrameHist(TestData):
def test_hist1(self): def test_flights_hist(self):
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
@ -30,15 +29,3 @@ class TestDataFrameHist(TestData):
# Numbers are slightly different # Numbers are slightly different
assert_almost_equal(pd_bins, ed_bins) assert_almost_equal(pd_bins, ed_bins)
assert_almost_equal(pd_weights, ed_weights) assert_almost_equal(pd_weights, ed_weights)
def test_hist2(self):
pd_df = self.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
ed_df = self.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
num_bins = 10
ed_bins, ed_weights = ed_df._hist(num_bins=num_bins)
print(ed_bins)

View File

@ -1,45 +0,0 @@
# File called _pytest for PyCharm compatability
import pandas as pd
import eland as ed
from eland.tests.common import TestData
from eland.tests.common import (
assert_pandas_eland_frame_equal,
assert_pandas_eland_series_equal
)
import numpy as np
class TestDataFrameiLoc(TestData):
def test_iloc1(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html#pandas.DataFrame.iloc
#pd_flights.info()
pd_iloc0 = pd_flights.iloc[0]
pd_iloc1= pd_flights.iloc[[0]]
pd_iloc2= pd_flights.iloc[[0, 1]]
pd_iloc3 = pd_flights.iloc[:3]
pd_iloc5 = pd_flights.iloc[0, 1]
pd_iloc6 = pd_flights.iloc[[0, 2], [1, 3]]
pd_iloc7 = pd_flights.iloc[1:3, 0:3]
ed_iloc0 = ed_flights.iloc[0]
ed_iloc1 = ed_flights.iloc[[0]]
ed_iloc2 = ed_flights.iloc[[0, 1]]
ed_iloc3 = ed_flights.iloc[:3]
ed_iloc5 = ed_flights.iloc[0, 1]
ed_iloc6 = ed_flights.iloc[[0, 2], [1, 3]]
ed_iloc7 = ed_flights.iloc[1:3, 0:3]
#assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series
assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1)
assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2)
assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3)
#assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool
assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6)
assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7)

View File

@ -1,15 +0,0 @@
# File called _pytest for PyCharm compatability
from eland.tests.common import TestData
class TestDataFrameInfoEs(TestData):
def test_to_info1(self):
ed_flights = self.ed_flights()
head = ed_flights.head(103)
slice = head[['timestamp', 'OriginRegion', 'Carrier']]
iloc = slice.iloc[10:92, [0,2]]
print(iloc.info_es())
print(iloc)

View File

@ -6,7 +6,7 @@ from eland.tests.common import TestData
class TestDataFrameInfo(TestData): class TestDataFrameInfo(TestData):
def test_to_info1(self): def test_flights_info(self):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights() pd_flights = self.pd_flights()

View File

@ -1,10 +1,9 @@
# File called _pytest for PyCharm compatability # File called _pytest for PyCharm compatability
from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
class TestDataFrameMetrics(TestData): class TestDataFrameMetrics(TestData):
@ -43,4 +42,3 @@ class TestDataFrameMetrics(TestData):
ed_max = ed_flights.max(numeric_only=True) ed_max = ed_flights.max(numeric_only=True)
assert_series_equal(pd_max, ed_max) assert_series_equal(pd_max, ed_max)

View File

@ -1,22 +0,0 @@
# File called _pytest for PyCharm compatability
import pandas as pd
import eland as ed
from eland.tests.common import TestData
from eland.tests.common import (
assert_pandas_eland_frame_equal,
assert_pandas_eland_series_equal
)
import numpy as np
class TestDataFrameNUnique(TestData):
def test_nunique1(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
print(pd_flights.dtypes)
print(ed_flights.dtypes)
print(ed_flights.nunique())

View File

@ -10,7 +10,7 @@ from eland.tests.common import assert_pandas_eland_frame_equal
class TestDataFrameQuery(TestData): class TestDataFrameQuery(TestData):
def test_query1(self): def test_query(self):
# Examples from: # Examples from:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)}, pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)},
@ -43,4 +43,3 @@ class TestDataFrameQuery(TestData):
ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)] ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)]
assert_pandas_eland_frame_equal(pd_q4, ed_q4) assert_pandas_eland_frame_equal(pd_q4, ed_q4)

View File

@ -3,9 +3,9 @@
from eland.tests.common import TestData from eland.tests.common import TestData
class TestDataFrameHeadTail(TestData): class TestDataFrameRepr(TestData):
def test_to_string1(self): def test_head_101_to_string(self):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
@ -18,7 +18,7 @@ class TestDataFrameHeadTail(TestData):
assert pd_head_101_str == ed_head_101_str assert pd_head_101_str == ed_head_101_str
def test_to_string2(self): def test_head_11_to_string2(self):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
@ -30,7 +30,7 @@ class TestDataFrameHeadTail(TestData):
assert pd_head_11_str == ed_head_11_str assert pd_head_11_str == ed_head_11_str
def test_to_repr(self): def test_repr(self):
ed_ecommerce = self.ed_ecommerce() ed_ecommerce = self.ed_ecommerce()
pd_ecommerce = self.pd_ecommerce() pd_ecommerce = self.pd_ecommerce()

View File

@ -1,5 +1,4 @@
# File called _pytest for PyCharm compatibility # File called _pytest for PyCharm compatibility
import pandas as pd
import numpy as np import numpy as np
from eland.tests.common import TestData from eland.tests.common import TestData
@ -8,10 +7,9 @@ from eland.tests.common import (
) )
class TestDataFrameSelectDTypes(TestData): class TestDataFrameSelectDTypes(TestData):
def test_select_dtypes1(self): def test_select_dtypes_include_number(self):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
@ -20,7 +18,7 @@ class TestDataFrameSelectDTypes(TestData):
assert_pandas_eland_frame_equal(pd_flights_numeric.head(103), ed_flights_numeric.head(103)) assert_pandas_eland_frame_equal(pd_flights_numeric.head(103), ed_flights_numeric.head(103))
def test_select_dtypes2(self): def test_select_dtypes_exclude_number(self):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
@ -28,4 +26,3 @@ class TestDataFrameSelectDTypes(TestData):
pd_flights_non_numeric = pd_flights.select_dtypes(exclude=[np.number]) pd_flights_non_numeric = pd_flights.select_dtypes(exclude=[np.number])
assert_pandas_eland_frame_equal(pd_flights_non_numeric.head(103), ed_flights_non_numeric.head(103)) assert_pandas_eland_frame_equal(pd_flights_non_numeric.head(103), ed_flights_non_numeric.head(103))

View File

@ -22,5 +22,3 @@ class TestDataFrameShape(TestData):
ed_shape = ed_flights.shape ed_shape = ed_flights.shape
assert pd_shape == ed_shape assert pd_shape == ed_shape

View File

@ -1,14 +1,13 @@
# File called _pytest for PyCharm compatibility # File called _pytest for PyCharm compatibility
import pandas as pd
from eland.tests.common import TestData
from eland.tests.common import ROOT_DIR
from pandas.util.testing import (assert_equal, assert_frame_equal)
import ast import ast
import pandas as pd
from pandas.util.testing import (assert_frame_equal)
from eland.tests.common import ROOT_DIR
from eland.tests.common import TestData
class TestDataFrameToCSV(TestData): class TestDataFrameToCSV(TestData):
@ -43,6 +42,3 @@ class TestDataFrameToCSV(TestData):
pd_from_csv.timestamp = pd.to_datetime(pd_from_csv.timestamp) pd_from_csv.timestamp = pd.to_datetime(pd_from_csv.timestamp)
assert_frame_equal(pd_flights, pd_from_csv) assert_frame_equal(pd_flights, pd_from_csv)

View File

@ -1,12 +1,13 @@
# File called _pytest for PyCharm compatibility # File called _pytest for PyCharm compatibility
from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
class TestMappingsDtypes(TestData): class TestMappingsDtypes(TestData):
def test_dtypes1(self): def test_flights_dtypes_all(self):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
@ -15,7 +16,7 @@ class TestMappingsDtypes(TestData):
assert_series_equal(pd_dtypes, ed_dtypes) assert_series_equal(pd_dtypes, ed_dtypes)
def test_dtypes2(self): def test_flights_dtypes_columns(self):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']] pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]
@ -24,7 +25,7 @@ class TestMappingsDtypes(TestData):
assert_series_equal(pd_dtypes, ed_dtypes) assert_series_equal(pd_dtypes, ed_dtypes)
def test_get_dtype_counts1(self): def test_flights_get_dtype_counts_all(self):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
@ -33,7 +34,7 @@ class TestMappingsDtypes(TestData):
assert_series_equal(pd_dtypes, ed_dtypes) assert_series_equal(pd_dtypes, ed_dtypes)
def test_get_dtype_counts2(self): def test_flights_get_dtype_counts_columns(self):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']] pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]

View File

@ -1,8 +1,9 @@
# File called _pytest for PyCharm compatibility # File called _pytest for PyCharm compatibility
from matplotlib.testing.decorators import check_figures_equal
from eland.tests.common import TestData from eland.tests.common import TestData
from matplotlib.testing.decorators import check_figures_equal
@check_figures_equal(extensions=['png']) @check_figures_equal(extensions=['png'])
def test_plot_hist(fig_test, fig_ref): def test_plot_hist(fig_test, fig_ref):

View File

@ -1,8 +1,7 @@
# File called _pytest for PyCharm compatibility # File called _pytest for PyCharm compatibility
from eland.tests.common import TestData
from eland import Query from eland import Query
from eland.tests.common import TestData
class TestQueryCopy(TestData): class TestQueryCopy(TestData):
@ -22,6 +21,3 @@ class TestQueryCopy(TestData):
print(q.to_search_body()) print(q.to_search_body())
print(q1.to_search_body()) print(q1.to_search_body())

View File

@ -1,15 +1,9 @@
# File called _pytest for PyCharm compatibility # File called _pytest for PyCharm compatibility
import pandas as pd
import eland as ed import eland as ed
from eland.tests.common import TestData
from eland.tests.common import assert_pandas_eland_series_equal
from eland.tests import ELASTICSEARCH_HOST from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME from eland.tests import FLIGHTS_INDEX_NAME
from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal from eland.tests.common import assert_pandas_eland_series_equal
class TestSeriesHeadTail(TestData): class TestSeriesHeadTail(TestData):

View File

@ -1,15 +1,8 @@
# File called _pytest for PyCharm compatibility # File called _pytest for PyCharm compatibility
import pandas as pd
import eland as ed import eland as ed
from eland.tests.common import TestData
from eland.tests.common import assert_pandas_eland_frame_equal
from eland.tests import ELASTICSEARCH_HOST from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME from eland.tests import FLIGHTS_INDEX_NAME
from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal
class TestSeriesRepr(TestData): class TestSeriesRepr(TestData):

View File

@ -1,4 +1,3 @@
import pandas as pd
from elasticsearch import Elasticsearch from elasticsearch import Elasticsearch
from elasticsearch import helpers from elasticsearch import helpers
@ -10,6 +9,7 @@ DATA_LIST = [
(ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING) (ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING)
] ]
def _setup_data(es): def _setup_data(es):
# Read json file and index records into Elasticsearch # Read json file and index records into Elasticsearch
for data in DATA_LIST: for data in DATA_LIST:
@ -50,17 +50,20 @@ def _setup_data(es):
print("Done", index_name) print("Done", index_name)
def _setup_test_mappings(es): def _setup_test_mappings(es):
# Create a complex mapping containing many Elasticsearch features # Create a complex mapping containing many Elasticsearch features
es.indices.delete(index=TEST_MAPPING1_INDEX_NAME, ignore=[400, 404]) es.indices.delete(index=TEST_MAPPING1_INDEX_NAME, ignore=[400, 404])
es.indices.create(index=TEST_MAPPING1_INDEX_NAME, body=TEST_MAPPING1) es.indices.create(index=TEST_MAPPING1_INDEX_NAME, body=TEST_MAPPING1)
def _setup_test_nested(es): def _setup_test_nested(es):
es.indices.delete(index=TEST_NESTED_USER_GROUP_INDEX_NAME, ignore=[400, 404]) es.indices.delete(index=TEST_NESTED_USER_GROUP_INDEX_NAME, ignore=[400, 404])
es.indices.create(index=TEST_NESTED_USER_GROUP_INDEX_NAME, body=TEST_NESTED_USER_GROUP_MAPPING) es.indices.create(index=TEST_NESTED_USER_GROUP_INDEX_NAME, body=TEST_NESTED_USER_GROUP_MAPPING)
helpers.bulk(es, TEST_NESTED_USER_GROUP_DOCS) helpers.bulk(es, TEST_NESTED_USER_GROUP_DOCS)
if __name__ == '__main__': if __name__ == '__main__':
# Create connection to Elasticsearch - use defaults # Create connection to Elasticsearch - use defaults
es = Elasticsearch(ELASTICSEARCH_HOST) es = Elasticsearch(ELASTICSEARCH_HOST)

View File

@ -7,7 +7,8 @@ def read_es(es_params, index_pattern):
return DataFrame(client=es_params, index_pattern=index_pattern) return DataFrame(client=es_params, index_pattern=index_pattern)
def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False, geo_points=None): def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
geo_points=None):
""" """
Append a pandas DataFrame to an Elasticsearch index. Append a pandas DataFrame to an Elasticsearch index.
Mainly used in testing. Mainly used in testing.

3
requirements-dev.txt Normal file
View File

@ -0,0 +1,3 @@
elasticsearch>=7.0.5
pandas==0.25.1
pytest>=5.2.1

View File

@ -1,2 +1,3 @@
elasticsearch>=7.0.5 elasticsearch>=7.0.5
pandas==0.25.1 pandas==0.25.1
matplotlib

View File

@ -1,9 +1,11 @@
from setuptools import setup from setuptools import setup
def readme(): def readme():
with open('README.rst') as f: with open('README.rst') as f:
return f.read() return f.read()
setup(name='eland', setup(name='eland',
version='0.1', version='0.1',
description='Python elasticsearch client to analyse, explore and manipulate data that resides in elasticsearch', description='Python elasticsearch client to analyse, explore and manipulate data that resides in elasticsearch',