mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Major cleanup - removed modin as dependency
modin removed as a dependency and iloc feature removed for now - TODO add back in.
This commit is contained in:
parent
9dad8613d3
commit
c1ee409a33
@ -1,19 +1,14 @@
|
||||
from __future__ import absolute_import
|
||||
import os
|
||||
|
||||
# Set modin to pandas to avoid starting ray or other
|
||||
os.environ["MODIN_ENGINE"] = 'python'
|
||||
os.environ["MODIN_BACKEND"] = 'pandas'
|
||||
|
||||
from eland.client import *
|
||||
from eland.dataframe import *
|
||||
from eland.filter import *
|
||||
from eland.index import *
|
||||
from eland.mappings import *
|
||||
from eland.filter import *
|
||||
from eland.query import *
|
||||
from eland.operations import *
|
||||
from eland.query_compiler import *
|
||||
from eland.plotting import *
|
||||
from eland.ndframe import *
|
||||
from eland.operations import *
|
||||
from eland.plotting import *
|
||||
from eland.query import *
|
||||
from eland.query_compiler import *
|
||||
from eland.series import *
|
||||
from eland.dataframe import *
|
||||
from eland.utils import *
|
||||
|
@ -1,10 +1,12 @@
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch import helpers
|
||||
|
||||
|
||||
class Client:
|
||||
"""
|
||||
eland client - implemented as facade to control access to Elasticsearch methods
|
||||
"""
|
||||
|
||||
def __init__(self, es=None):
|
||||
if isinstance(es, Elasticsearch):
|
||||
self._es = es
|
||||
@ -40,4 +42,3 @@ class Client:
|
||||
def count(self, **kwargs):
|
||||
count_json = self._es.count(**kwargs)
|
||||
return count_json['count']
|
||||
|
||||
|
@ -1,16 +1,15 @@
|
||||
import sys
|
||||
import warnings
|
||||
from distutils.version import LooseVersion
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pandas.compat as compat
|
||||
import six
|
||||
from io import StringIO
|
||||
from pandas.core.common import apply_if_callable, is_bool_indexer
|
||||
from pandas.core.dtypes.common import (
|
||||
is_list_like
|
||||
)
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
from pandas.core.indexing import check_bool_indexer
|
||||
|
||||
from pandas.io.common import _expand_user, _stringify_path
|
||||
from pandas.io.formats import console
|
||||
from pandas.io.formats import format as fmt
|
||||
@ -58,10 +57,10 @@ class DataFrame(NDFrame):
|
||||
return len(self.columns) == 0 or len(self.index) == 0
|
||||
|
||||
def head(self, n=5):
|
||||
return super().head(n)
|
||||
return DataFrame(query_compiler=self._query_compiler.head(n))
|
||||
|
||||
def tail(self, n=5):
|
||||
return super().tail(n)
|
||||
return DataFrame(query_compiler=self._query_compiler.tail(n))
|
||||
|
||||
def __repr__(self):
|
||||
"""
|
||||
@ -104,7 +103,7 @@ class DataFrame(NDFrame):
|
||||
return None
|
||||
|
||||
if self._info_repr():
|
||||
buf = StringIO(u(""))
|
||||
buf = StringIO()
|
||||
self.info(buf=buf)
|
||||
# need to escape the <class>, should be the first line.
|
||||
val = buf.getvalue().replace('<', r'&lt;', 1)
|
||||
@ -509,7 +508,7 @@ class DataFrame(NDFrame):
|
||||
return self.columns
|
||||
|
||||
def groupby(self, by=None, axis=0, *args, **kwargs):
|
||||
axis = self._get_axis_number(axis)
|
||||
axis = pd.DataFrame._get_axis_number(axis)
|
||||
|
||||
if axis == 1:
|
||||
raise NotImplementedError("Aggregating via index not currently implemented - needs index transform")
|
||||
@ -544,7 +543,7 @@ class DataFrame(NDFrame):
|
||||
if Series.agg is called with single function, returns a scalar
|
||||
if Series.agg is called with several functions, returns a Series
|
||||
"""
|
||||
axis = self._get_axis_number(axis)
|
||||
axis = pd.DataFrame._get_axis_number(axis)
|
||||
|
||||
if axis == 1:
|
||||
raise NotImplementedError("Aggregating via index not currently implemented - needs index transform")
|
||||
@ -579,3 +578,20 @@ class DataFrame(NDFrame):
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError(expr, type(expr))
|
||||
|
||||
def get(self, key, default=None):
|
||||
"""Get item from object for given key (DataFrame column, Panel
|
||||
slice, etc.). Returns default value if not found.
|
||||
|
||||
Args:
|
||||
key (DataFrame column, Panel slice) : the key for which value
|
||||
to get
|
||||
|
||||
Returns:
|
||||
value (type of items contained in object) : A value that is
|
||||
stored at the key
|
||||
"""
|
||||
if key in self.keys():
|
||||
return self._getitem(key)
|
||||
else:
|
||||
return default
|
||||
|
@ -1,7 +1,7 @@
|
||||
# Derived from pandasticsearch filters
|
||||
|
||||
# Es filter builder for BooleanCond
|
||||
class BooleanFilter(object):
|
||||
class BooleanFilter:
|
||||
def __init__(self, *args):
|
||||
self._filter = None
|
||||
|
||||
|
@ -14,6 +14,8 @@ In case sorting or aggregating on the _id field is required, it is advised to du
|
||||
the content of the _id field in another field that has doc_values enabled.)
|
||||
|
||||
"""
|
||||
|
||||
|
||||
class Index:
|
||||
ID_INDEX_FIELD = '_id'
|
||||
ID_SORT_FIELD = '_doc' # if index field is _id, sort by _doc
|
||||
|
@ -75,6 +75,7 @@ class Mappings:
|
||||
pd_dtype = self._mappings_capabilities.loc[field_name]['pd_dtype']
|
||||
self._source_field_pd_dtypes[field_name] = pd_dtype
|
||||
|
||||
@staticmethod
|
||||
def _extract_fields_from_mapping(mappings, source_only=False):
|
||||
"""
|
||||
Extract all field names and types from a mapping.
|
||||
@ -151,6 +152,7 @@ class Mappings:
|
||||
|
||||
return fields
|
||||
|
||||
@staticmethod
|
||||
def _create_capability_matrix(all_fields, source_fields, all_fields_caps):
|
||||
"""
|
||||
{
|
||||
@ -414,15 +416,27 @@ class Mappings:
|
||||
List of source fields where pd_dtype == (int64 or float64 or bool)
|
||||
"""
|
||||
if columns is not None:
|
||||
if include_bool == True:
|
||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'bool'))].loc[columns].index.tolist()
|
||||
(self._mappings_capabilities.pd_dtype == 'bool'))].loc[
|
||||
columns].index.tolist()
|
||||
else:
|
||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64'))].loc[
|
||||
columns].index.tolist()
|
||||
else:
|
||||
if include_bool == True:
|
||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist()
|
||||
else:
|
||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
|
||||
|
||||
def source_fields(self):
|
||||
"""
|
||||
|
@ -26,15 +26,13 @@ only Elasticsearch aggregatable fields can be aggregated or grouped.
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
from modin.pandas.base import BasePandasDataset
|
||||
from modin.pandas.indexing import _iLocIndexer
|
||||
from pandas.util._validators import validate_bool_kwarg
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
from pandas.util._validators import validate_bool_kwarg
|
||||
|
||||
from eland import ElandQueryCompiler
|
||||
|
||||
|
||||
class NDFrame(BasePandasDataset):
|
||||
class NDFrame:
|
||||
|
||||
def __init__(self,
|
||||
client=None,
|
||||
@ -85,6 +83,9 @@ class NDFrame(BasePandasDataset):
|
||||
|
||||
return head.append(tail)
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self._getitem(key)
|
||||
|
||||
def __getattr__(self, key):
|
||||
"""After regular attribute access, looks up the name in the columns
|
||||
|
||||
@ -105,6 +106,14 @@ class NDFrame(BasePandasDataset):
|
||||
# Don't default to pandas, just return approximation TODO - make this more accurate
|
||||
return sys.getsizeof(self._query_compiler)
|
||||
|
||||
def __len__(self):
|
||||
"""Gets the length of the DataFrame.
|
||||
|
||||
Returns:
|
||||
Returns an integer length of the DataFrame object.
|
||||
"""
|
||||
return len(self.index)
|
||||
|
||||
@property
|
||||
def iloc(self):
|
||||
"""Purely integer-location based indexing for selection by position.
|
||||
@ -235,21 +244,3 @@ class NDFrame(BasePandasDataset):
|
||||
|
||||
def describe(self):
|
||||
return self._query_compiler.describe()
|
||||
|
||||
def get(self, key, default=None):
|
||||
"""Get item from object for given key (DataFrame column, Panel
|
||||
slice, etc.). Returns default value if not found.
|
||||
|
||||
Args:
|
||||
key (DataFrame column, Panel slice) : the key for which value
|
||||
to get
|
||||
|
||||
Returns:
|
||||
value (type of items contained in object) : A value that is
|
||||
stored at the key
|
||||
"""
|
||||
if key in self.keys():
|
||||
return self.__getitem__(key)
|
||||
else:
|
||||
return default
|
||||
|
||||
|
@ -1,9 +1,7 @@
|
||||
import copy
|
||||
from enum import Enum
|
||||
from io import StringIO
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from eland import Index
|
||||
from eland import Query
|
||||
@ -410,7 +408,7 @@ class Operations:
|
||||
|
||||
columns = self.get_columns()
|
||||
|
||||
numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns)
|
||||
numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns, include_bool=False)
|
||||
|
||||
# for each field we compute:
|
||||
# count, mean, std, min, 25%, 50%, 75%, max
|
||||
@ -450,6 +448,7 @@ class Operations:
|
||||
class PandasDataFrameCollector:
|
||||
def collect(self, df):
|
||||
self.df = df
|
||||
|
||||
def batch_size(self):
|
||||
return None
|
||||
|
||||
@ -465,6 +464,7 @@ class Operations:
|
||||
self.kwargs = kwargs
|
||||
self.ret = None
|
||||
self.first_time = True
|
||||
|
||||
def collect(self, df):
|
||||
# If this is the first time we collect results, then write header, otherwise don't write header
|
||||
# and append results
|
||||
|
@ -3,6 +3,7 @@ from copy import deepcopy
|
||||
|
||||
from eland.filter import BooleanFilter, NotNull, IsNull, IsIn
|
||||
|
||||
|
||||
class Query:
|
||||
"""
|
||||
Simple class to manage building Elasticsearch queries.
|
||||
|
@ -1,20 +1,15 @@
|
||||
import pandas as pd
|
||||
from modin.backends.base.query_compiler import BaseQueryCompiler
|
||||
from pandas.core.dtypes.common import (
|
||||
is_list_like
|
||||
)
|
||||
|
||||
from eland import Client
|
||||
from eland import Index
|
||||
from eland import Mappings
|
||||
from eland import Operations
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_list_like
|
||||
)
|
||||
|
||||
from pandas.core.indexes.numeric import Int64Index
|
||||
from pandas.core.indexes.range import RangeIndex
|
||||
|
||||
|
||||
class ElandQueryCompiler(BaseQueryCompiler):
|
||||
class ElandQueryCompiler:
|
||||
"""
|
||||
Some notes on what can and can not be mapped:
|
||||
|
||||
@ -318,7 +313,7 @@ class ElandQueryCompiler(BaseQueryCompiler):
|
||||
return df
|
||||
|
||||
def copy(self):
|
||||
return self.__constructor__(
|
||||
return ElandQueryCompiler(
|
||||
client=self._client,
|
||||
index_pattern=self._index_pattern,
|
||||
columns=None, # columns are embedded in operations
|
||||
@ -412,14 +407,19 @@ class ElandQueryCompiler(BaseQueryCompiler):
|
||||
|
||||
def count(self):
|
||||
return self._operations.count(self)
|
||||
|
||||
def mean(self):
|
||||
return self._operations.mean(self)
|
||||
|
||||
def sum(self):
|
||||
return self._operations.sum(self)
|
||||
|
||||
def min(self):
|
||||
return self._operations.min(self)
|
||||
|
||||
def max(self):
|
||||
return self._operations.max(self)
|
||||
|
||||
def nunique(self):
|
||||
return self._operations.nunique(self)
|
||||
|
||||
@ -472,5 +472,3 @@ class ElandQueryCompiler(BaseQueryCompiler):
|
||||
return result
|
||||
|
||||
# def isna(self):
|
||||
|
||||
|
||||
|
@ -101,10 +101,10 @@ class Series(NDFrame):
|
||||
name = property(_get_name)
|
||||
|
||||
def head(self, n=5):
|
||||
return super().head(n)
|
||||
return Series(query_compiler=self._query_compiler.head(n))
|
||||
|
||||
def tail(self, n=5):
|
||||
return super().tail(n)
|
||||
return Series(query_compiler=self._query_compiler.tail(n))
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Rendering Methods
|
||||
@ -194,7 +194,6 @@ class Series(NDFrame):
|
||||
else:
|
||||
raise NotImplementedError(other, type(other))
|
||||
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, Series):
|
||||
# Need to use scripted query to compare to values
|
||||
|
File diff suppressed because one or more lines are too long
@ -1,12 +1,9 @@
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
|
||||
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
# Set modin to pandas to avoid starting ray or other
|
||||
os.environ["MODIN_ENGINE"] = 'python'
|
||||
os.environ["MODIN_BACKEND"] = 'pandas'
|
||||
|
||||
# Define test files and indices
|
||||
ELASTICSEARCH_HOST = 'localhost' # TODO externalise this
|
||||
|
||||
@ -491,4 +488,3 @@ TEST_NESTED_USER_GROUP_DOCS = [
|
||||
'_source': {'group': 'new york', 'user': [
|
||||
{'first': 'Bill', 'last': 'Jones'}]}}
|
||||
]
|
||||
|
||||
|
@ -1,11 +1,9 @@
|
||||
import pytest
|
||||
import os
|
||||
|
||||
import eland as ed
|
||||
import pandas as pd
|
||||
|
||||
from pandas.util.testing import (assert_frame_equal, assert_series_equal)
|
||||
|
||||
import os
|
||||
import eland as ed
|
||||
|
||||
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
@ -34,6 +32,7 @@ _pd_ecommerce.index = _pd_ecommerce.index.map(str) # make index 'object' not int
|
||||
_pd_ecommerce['customer_birth_date'].astype('datetime64')
|
||||
_ed_ecommerce = ed.read_es(ELASTICSEARCH_HOST, ECOMMERCE_INDEX_NAME)
|
||||
|
||||
|
||||
class TestData:
|
||||
|
||||
def pd_flights(self):
|
||||
@ -48,13 +47,13 @@ class TestData:
|
||||
def ed_flights_small(self):
|
||||
return _ed_flights_small
|
||||
|
||||
|
||||
def pd_ecommerce(self):
|
||||
return _pd_ecommerce
|
||||
|
||||
def ed_ecommerce(self):
|
||||
return _ed_ecommerce
|
||||
|
||||
|
||||
def assert_pandas_eland_frame_equal(left, right):
|
||||
if not isinstance(left, pd.DataFrame):
|
||||
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
|
||||
@ -67,6 +66,7 @@ def assert_pandas_eland_frame_equal(left, right):
|
||||
# Use pandas tests to check similarity
|
||||
assert_frame_equal(left, right._to_pandas())
|
||||
|
||||
|
||||
def assert_eland_frame_equal(left, right):
|
||||
if not isinstance(left, ed.DataFrame):
|
||||
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
|
||||
@ -91,4 +91,3 @@ def assert_pandas_eland_series_equal(left, right):
|
||||
|
||||
# Use pandas tests to check similarity
|
||||
assert_series_equal(left, right._to_pandas())
|
||||
|
||||
|
@ -1,15 +1,14 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.util.testing import (assert_almost_equal)
|
||||
from pandas.util.testing import assert_almost_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameAggs(TestData):
|
||||
|
||||
def test_to_aggs1(self):
|
||||
def test_basic_aggs(self):
|
||||
pd_flights = self.pd_flights()
|
||||
ed_flights = self.ed_flights()
|
||||
|
||||
|
@ -1,19 +1,17 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameCount(TestData):
|
||||
|
||||
def test_to_count1(self):
|
||||
def test_ecommerce_count(self):
|
||||
pd_ecommerce = self.pd_ecommerce()
|
||||
ed_ecommerce = self.ed_ecommerce()
|
||||
|
||||
pd_count = pd_ecommerce.count()
|
||||
ed_count = ed_ecommerce.count()
|
||||
|
||||
print(pd_count)
|
||||
print(ed_count)
|
||||
|
||||
|
||||
|
||||
assert_series_equal(pd_count, ed_count)
|
||||
|
@ -6,6 +6,7 @@ import pandas as pd
|
||||
import eland as ed
|
||||
from eland.tests.common import ELASTICSEARCH_HOST
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import assert_pandas_eland_frame_equal
|
||||
|
||||
|
||||
class TestDataFrameDateTime(TestData):
|
||||
@ -41,4 +42,4 @@ class TestDataFrameDateTime(TestData):
|
||||
ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name)
|
||||
ed_df_head = ed_df.head()
|
||||
|
||||
# assert_frame_equal(df, ed_df_head)
|
||||
assert_pandas_eland_frame_equal(df, ed_df_head)
|
||||
|
@ -1,35 +1,34 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
from io import StringIO
|
||||
|
||||
from pandas.util.testing import assert_almost_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameDescribe(TestData):
|
||||
|
||||
def test_to_describe1(self):
|
||||
def test_flights_describe(self):
|
||||
pd_flights = self.pd_flights()
|
||||
ed_flights = self.ed_flights()
|
||||
|
||||
pd_describe = pd_flights.describe()
|
||||
ed_describe = ed_flights.describe()
|
||||
|
||||
print(pd_describe)
|
||||
print(ed_describe)
|
||||
assert_almost_equal(pd_describe[['AvgTicketPrice']],
|
||||
ed_describe[['AvgTicketPrice']],
|
||||
check_less_precise=True)
|
||||
|
||||
# TODO - this fails now as ES aggregations are approximate
|
||||
# TODO - this fails for all fields now as ES aggregations are approximate
|
||||
# if ES percentile agg uses
|
||||
# "hdr": {
|
||||
# "number_of_significant_value_digits": 3
|
||||
# }
|
||||
# this works
|
||||
# assert_almost_equal(pd_flights_describe, ed_flights_describe)
|
||||
|
||||
pd_ecommerce_describe = self.pd_ecommerce().describe()
|
||||
ed_ecommerce_describe = self.ed_ecommerce().describe()
|
||||
|
||||
# pd_ecommerce_describe = self.pd_ecommerce().describe()
|
||||
# ed_ecommerce_describe = self.ed_ecommerce().describe()
|
||||
# We don't compare ecommerce here as the default dtypes in pandas from read_json
|
||||
# don't match the mapping types. This is mainly because the products field is
|
||||
# nested and so can be treated as a multi-field in ES, but not in pandas
|
||||
|
||||
# We also cannot run 'describe' on a truncated eland DataFrame
|
||||
|
||||
|
@ -1,19 +1,14 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
import pandas as pd
|
||||
import eland as ed
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import (
|
||||
assert_eland_frame_equal,
|
||||
assert_pandas_eland_frame_equal,
|
||||
assert_pandas_eland_series_equal
|
||||
assert_pandas_eland_frame_equal
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
class TestDataFrameDrop(TestData):
|
||||
|
||||
def test_drop1(self):
|
||||
def test_flights_small_drop(self):
|
||||
ed_flights_small = self.ed_flights_small()
|
||||
pd_flights_small = self.pd_flights_small()
|
||||
|
||||
|
14
eland/tests/dataframe/test_dtypes_pytest.py
Normal file
14
eland/tests/dataframe/test_dtypes_pytest.py
Normal file
@ -0,0 +1,14 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameDtypes(TestData):
|
||||
|
||||
def test_flights_dtypes(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
assert_series_equal(pd_flights.dtypes, ed_flights.dtypes)
|
@ -1,18 +1,11 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
import pandas as pd
|
||||
import eland as ed
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import (
|
||||
assert_pandas_eland_frame_equal,
|
||||
assert_pandas_eland_series_equal
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
class TestDataFrameGet(TestData):
|
||||
|
||||
def test_get1(self):
|
||||
def test_get_one_attribute(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
import pandas as pd
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import (
|
||||
@ -8,10 +7,9 @@ from eland.tests.common import (
|
||||
)
|
||||
|
||||
|
||||
|
||||
class TestDataFrameGetItem(TestData):
|
||||
|
||||
def test_getitem1(self):
|
||||
def test_getitem_one_attribute(self):
|
||||
ed_flights = self.ed_flights().head(103)
|
||||
pd_flights = self.pd_flights().head(103)
|
||||
|
||||
@ -20,7 +18,7 @@ class TestDataFrameGetItem(TestData):
|
||||
|
||||
assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID)
|
||||
|
||||
def test_getitem2(self):
|
||||
def test_getitem_attribute_list(self):
|
||||
ed_flights = self.ed_flights().head(42)
|
||||
pd_flights = self.pd_flights().head(42)
|
||||
|
||||
@ -29,7 +27,7 @@ class TestDataFrameGetItem(TestData):
|
||||
|
||||
assert_pandas_eland_frame_equal(pd_flights_slice, ed_flights_slice)
|
||||
|
||||
def test_getitem3(self):
|
||||
def test_getitem_one_argument(self):
|
||||
ed_flights = self.ed_flights().head(89)
|
||||
pd_flights = self.pd_flights().head(89)
|
||||
|
||||
@ -38,7 +36,7 @@ class TestDataFrameGetItem(TestData):
|
||||
|
||||
assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID)
|
||||
|
||||
def test_getitem4(self):
|
||||
def test_getitem_multiple_calls(self):
|
||||
ed_flights = self.ed_flights().head(89)
|
||||
pd_flights = self.pd_flights().head(89)
|
||||
|
||||
@ -52,4 +50,3 @@ class TestDataFrameGetItem(TestData):
|
||||
ed_col1 = ed_col0['DestCountry']
|
||||
|
||||
assert_pandas_eland_series_equal(pd_col1, ed_col1)
|
||||
|
||||
|
@ -1,11 +1,9 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
import pandas as pd
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import assert_pandas_eland_frame_equal
|
||||
|
||||
|
||||
|
||||
class TestDataFrameHeadTail(TestData):
|
||||
|
||||
def test_head(self):
|
||||
|
@ -1,6 +1,5 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.util.testing import assert_almost_equal
|
||||
@ -10,7 +9,7 @@ from eland.tests.common import TestData
|
||||
|
||||
class TestDataFrameHist(TestData):
|
||||
|
||||
def test_hist1(self):
|
||||
def test_flights_hist(self):
|
||||
pd_flights = self.pd_flights()
|
||||
ed_flights = self.ed_flights()
|
||||
|
||||
@ -30,15 +29,3 @@ class TestDataFrameHist(TestData):
|
||||
# Numbers are slightly different
|
||||
assert_almost_equal(pd_bins, ed_bins)
|
||||
assert_almost_equal(pd_weights, ed_weights)
|
||||
|
||||
def test_hist2(self):
|
||||
pd_df = self.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
|
||||
ed_df = self.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
|
||||
|
||||
num_bins = 10
|
||||
|
||||
ed_bins, ed_weights = ed_df._hist(num_bins=num_bins)
|
||||
|
||||
print(ed_bins)
|
||||
|
||||
|
||||
|
@ -1,45 +0,0 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
import pandas as pd
|
||||
import eland as ed
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import (
|
||||
assert_pandas_eland_frame_equal,
|
||||
assert_pandas_eland_series_equal
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
class TestDataFrameiLoc(TestData):
|
||||
|
||||
def test_iloc1(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html#pandas.DataFrame.iloc
|
||||
|
||||
#pd_flights.info()
|
||||
|
||||
pd_iloc0 = pd_flights.iloc[0]
|
||||
pd_iloc1= pd_flights.iloc[[0]]
|
||||
pd_iloc2= pd_flights.iloc[[0, 1]]
|
||||
pd_iloc3 = pd_flights.iloc[:3]
|
||||
pd_iloc5 = pd_flights.iloc[0, 1]
|
||||
pd_iloc6 = pd_flights.iloc[[0, 2], [1, 3]]
|
||||
pd_iloc7 = pd_flights.iloc[1:3, 0:3]
|
||||
|
||||
ed_iloc0 = ed_flights.iloc[0]
|
||||
ed_iloc1 = ed_flights.iloc[[0]]
|
||||
ed_iloc2 = ed_flights.iloc[[0, 1]]
|
||||
ed_iloc3 = ed_flights.iloc[:3]
|
||||
ed_iloc5 = ed_flights.iloc[0, 1]
|
||||
ed_iloc6 = ed_flights.iloc[[0, 2], [1, 3]]
|
||||
ed_iloc7 = ed_flights.iloc[1:3, 0:3]
|
||||
|
||||
#assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series
|
||||
assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1)
|
||||
assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2)
|
||||
assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3)
|
||||
#assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool
|
||||
assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6)
|
||||
assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7)
|
@ -1,15 +0,0 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameInfoEs(TestData):
|
||||
|
||||
def test_to_info1(self):
|
||||
ed_flights = self.ed_flights()
|
||||
|
||||
head = ed_flights.head(103)
|
||||
slice = head[['timestamp', 'OriginRegion', 'Carrier']]
|
||||
iloc = slice.iloc[10:92, [0,2]]
|
||||
print(iloc.info_es())
|
||||
print(iloc)
|
@ -6,7 +6,7 @@ from eland.tests.common import TestData
|
||||
|
||||
class TestDataFrameInfo(TestData):
|
||||
|
||||
def test_to_info1(self):
|
||||
def test_flights_info(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
|
@ -1,10 +1,9 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameMetrics(TestData):
|
||||
|
||||
@ -43,4 +42,3 @@ class TestDataFrameMetrics(TestData):
|
||||
ed_max = ed_flights.max(numeric_only=True)
|
||||
|
||||
assert_series_equal(pd_max, ed_max)
|
||||
|
||||
|
@ -1,22 +0,0 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
import pandas as pd
|
||||
import eland as ed
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import (
|
||||
assert_pandas_eland_frame_equal,
|
||||
assert_pandas_eland_series_equal
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
class TestDataFrameNUnique(TestData):
|
||||
|
||||
def test_nunique1(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
print(pd_flights.dtypes)
|
||||
print(ed_flights.dtypes)
|
||||
print(ed_flights.nunique())
|
||||
|
@ -10,7 +10,7 @@ from eland.tests.common import assert_pandas_eland_frame_equal
|
||||
|
||||
class TestDataFrameQuery(TestData):
|
||||
|
||||
def test_query1(self):
|
||||
def test_query(self):
|
||||
# Examples from:
|
||||
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
|
||||
pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)},
|
||||
@ -43,4 +43,3 @@ class TestDataFrameQuery(TestData):
|
||||
ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)]
|
||||
|
||||
assert_pandas_eland_frame_equal(pd_q4, ed_q4)
|
||||
|
||||
|
@ -3,9 +3,9 @@
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameHeadTail(TestData):
|
||||
class TestDataFrameRepr(TestData):
|
||||
|
||||
def test_to_string1(self):
|
||||
def test_head_101_to_string(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
@ -18,7 +18,7 @@ class TestDataFrameHeadTail(TestData):
|
||||
|
||||
assert pd_head_101_str == ed_head_101_str
|
||||
|
||||
def test_to_string2(self):
|
||||
def test_head_11_to_string2(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
@ -30,7 +30,7 @@ class TestDataFrameHeadTail(TestData):
|
||||
|
||||
assert pd_head_11_str == ed_head_11_str
|
||||
|
||||
def test_to_repr(self):
|
||||
def test_repr(self):
|
||||
ed_ecommerce = self.ed_ecommerce()
|
||||
pd_ecommerce = self.pd_ecommerce()
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from eland.tests.common import TestData
|
||||
@ -8,10 +7,9 @@ from eland.tests.common import (
|
||||
)
|
||||
|
||||
|
||||
|
||||
class TestDataFrameSelectDTypes(TestData):
|
||||
|
||||
def test_select_dtypes1(self):
|
||||
def test_select_dtypes_include_number(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
@ -20,7 +18,7 @@ class TestDataFrameSelectDTypes(TestData):
|
||||
|
||||
assert_pandas_eland_frame_equal(pd_flights_numeric.head(103), ed_flights_numeric.head(103))
|
||||
|
||||
def test_select_dtypes2(self):
|
||||
def test_select_dtypes_exclude_number(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
@ -28,4 +26,3 @@ class TestDataFrameSelectDTypes(TestData):
|
||||
pd_flights_non_numeric = pd_flights.select_dtypes(exclude=[np.number])
|
||||
|
||||
assert_pandas_eland_frame_equal(pd_flights_non_numeric.head(103), ed_flights_non_numeric.head(103))
|
||||
|
||||
|
@ -22,5 +22,3 @@ class TestDataFrameShape(TestData):
|
||||
ed_shape = ed_flights.shape
|
||||
|
||||
assert pd_shape == ed_shape
|
||||
|
||||
|
||||
|
@ -1,14 +1,13 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import ROOT_DIR
|
||||
|
||||
from pandas.util.testing import (assert_equal, assert_frame_equal)
|
||||
|
||||
import ast
|
||||
|
||||
import pandas as pd
|
||||
from pandas.util.testing import (assert_frame_equal)
|
||||
|
||||
from eland.tests.common import ROOT_DIR
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameToCSV(TestData):
|
||||
|
||||
@ -43,6 +42,3 @@ class TestDataFrameToCSV(TestData):
|
||||
pd_from_csv.timestamp = pd.to_datetime(pd_from_csv.timestamp)
|
||||
|
||||
assert_frame_equal(pd_flights, pd_from_csv)
|
||||
|
||||
|
||||
|
||||
|
@ -1,12 +1,13 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestMappingsDtypes(TestData):
|
||||
|
||||
def test_dtypes1(self):
|
||||
def test_flights_dtypes_all(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
@ -15,7 +16,7 @@ class TestMappingsDtypes(TestData):
|
||||
|
||||
assert_series_equal(pd_dtypes, ed_dtypes)
|
||||
|
||||
def test_dtypes2(self):
|
||||
def test_flights_dtypes_columns(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]
|
||||
|
||||
@ -24,7 +25,7 @@ class TestMappingsDtypes(TestData):
|
||||
|
||||
assert_series_equal(pd_dtypes, ed_dtypes)
|
||||
|
||||
def test_get_dtype_counts1(self):
|
||||
def test_flights_get_dtype_counts_all(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
@ -33,7 +34,7 @@ class TestMappingsDtypes(TestData):
|
||||
|
||||
assert_series_equal(pd_dtypes, ed_dtypes)
|
||||
|
||||
def test_get_dtype_counts2(self):
|
||||
def test_flights_get_dtype_counts_columns(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]
|
||||
|
||||
|
@ -1,8 +1,9 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
from matplotlib.testing.decorators import check_figures_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
from matplotlib.testing.decorators import check_figures_equal
|
||||
|
||||
@check_figures_equal(extensions=['png'])
|
||||
def test_plot_hist(fig_test, fig_ref):
|
||||
|
@ -1,8 +1,7 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
from eland import Query
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestQueryCopy(TestData):
|
||||
@ -22,6 +21,3 @@ class TestQueryCopy(TestData):
|
||||
|
||||
print(q.to_search_body())
|
||||
print(q1.to_search_body())
|
||||
|
||||
|
||||
|
||||
|
@ -1,15 +1,9 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
import pandas as pd
|
||||
import eland as ed
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import assert_pandas_eland_series_equal
|
||||
|
||||
from eland.tests import ELASTICSEARCH_HOST
|
||||
from eland.tests import FLIGHTS_INDEX_NAME
|
||||
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import assert_pandas_eland_series_equal
|
||||
|
||||
|
||||
class TestSeriesHeadTail(TestData):
|
||||
|
@ -1,15 +1,8 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
import pandas as pd
|
||||
import eland as ed
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import assert_pandas_eland_frame_equal
|
||||
|
||||
from eland.tests import ELASTICSEARCH_HOST
|
||||
from eland.tests import FLIGHTS_INDEX_NAME
|
||||
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestSeriesRepr(TestData):
|
||||
|
@ -1,4 +1,3 @@
|
||||
import pandas as pd
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch import helpers
|
||||
|
||||
@ -10,6 +9,7 @@ DATA_LIST = [
|
||||
(ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING)
|
||||
]
|
||||
|
||||
|
||||
def _setup_data(es):
|
||||
# Read json file and index records into Elasticsearch
|
||||
for data in DATA_LIST:
|
||||
@ -50,17 +50,20 @@ def _setup_data(es):
|
||||
|
||||
print("Done", index_name)
|
||||
|
||||
|
||||
def _setup_test_mappings(es):
|
||||
# Create a complex mapping containing many Elasticsearch features
|
||||
es.indices.delete(index=TEST_MAPPING1_INDEX_NAME, ignore=[400, 404])
|
||||
es.indices.create(index=TEST_MAPPING1_INDEX_NAME, body=TEST_MAPPING1)
|
||||
|
||||
|
||||
def _setup_test_nested(es):
|
||||
es.indices.delete(index=TEST_NESTED_USER_GROUP_INDEX_NAME, ignore=[400, 404])
|
||||
es.indices.create(index=TEST_NESTED_USER_GROUP_INDEX_NAME, body=TEST_NESTED_USER_GROUP_MAPPING)
|
||||
|
||||
helpers.bulk(es, TEST_NESTED_USER_GROUP_DOCS)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Create connection to Elasticsearch - use defaults
|
||||
es = Elasticsearch(ELASTICSEARCH_HOST)
|
||||
|
@ -7,7 +7,8 @@ def read_es(es_params, index_pattern):
|
||||
return DataFrame(client=es_params, index_pattern=index_pattern)
|
||||
|
||||
|
||||
def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False, geo_points=None):
|
||||
def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
|
||||
geo_points=None):
|
||||
"""
|
||||
Append a pandas DataFrame to an Elasticsearch index.
|
||||
Mainly used in testing.
|
||||
|
3
requirements-dev.txt
Normal file
3
requirements-dev.txt
Normal file
@ -0,0 +1,3 @@
|
||||
elasticsearch>=7.0.5
|
||||
pandas==0.25.1
|
||||
pytest>=5.2.1
|
@ -1,2 +1,3 @@
|
||||
elasticsearch>=7.0.5
|
||||
pandas==0.25.1
|
||||
matplotlib
|
||||
|
Loading…
x
Reference in New Issue
Block a user