Mirror of https://github.com/elastic/eland.git
Synced 2025-07-11 00:02:14 +08:00

Major cleanup - removed modin as dependency

modin removed as a dependency, and the iloc feature removed for now - TODO: add it back in.

This commit is contained in:
parent 9dad8613d3
commit c1ee409a33
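The diff below drops modin's BasePandasDataset/BaseQueryCompiler plumbing and has eland delegate directly to its own query compiler; head() and tail(), for example, now wrap the query compiler in a new eland DataFrame instead of calling into modin. A minimal sketch of that usage pattern, assuming a local Elasticsearch cluster with a 'flights' index (hosts/index names are illustrative, taken from the test fixtures in this commit):

import eland as ed

# Build an eland DataFrame backed by the 'flights' index.
df = ed.read_es('localhost', 'flights')

# head()/tail() return new eland DataFrames wrapping a query compiler that
# limits the Elasticsearch query, rather than going through modin.
top = df.head(5)
bottom = df.tail(5)

# _to_pandas() (used by the test helpers in this commit) materialises results as pandas.
print(top._to_pandas())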
@@ -1,19 +1,14 @@
 from __future__ import absolute_import
-import os
 
-# Set modin to pandas to avoid starting ray or other
-os.environ["MODIN_ENGINE"] = 'python'
-os.environ["MODIN_BACKEND"] = 'pandas'
-
 from eland.client import *
+from eland.dataframe import *
+from eland.filter import *
 from eland.index import *
 from eland.mappings import *
-from eland.filter import *
-from eland.query import *
-from eland.operations import *
-from eland.query_compiler import *
-from eland.plotting import *
 from eland.ndframe import *
+from eland.operations import *
+from eland.plotting import *
+from eland.query import *
+from eland.query_compiler import *
 from eland.series import *
-from eland.dataframe import *
 from eland.utils import *
@@ -1,10 +1,12 @@
 from elasticsearch import Elasticsearch
 from elasticsearch import helpers
 
+
 class Client:
     """
     eland client - implemented as facade to control access to Elasticsearch methods
     """
+
     def __init__(self, es=None):
         if isinstance(es, Elasticsearch):
             self._es = es
@@ -40,4 +42,3 @@ class Client:
     def count(self, **kwargs):
         count_json = self._es.count(**kwargs)
         return count_json['count']
-
@@ -1,16 +1,15 @@
 import sys
 import warnings
 from distutils.version import LooseVersion
+from io import StringIO
 
 import numpy as np
 import pandas as pd
-import pandas.compat as compat
 import six
-from io import StringIO
 from pandas.core.common import apply_if_callable, is_bool_indexer
-from pandas.core.dtypes.common import (
-    is_list_like
-)
+from pandas.core.dtypes.common import is_list_like
+from pandas.core.indexing import check_bool_indexer
 from pandas.io.common import _expand_user, _stringify_path
 from pandas.io.formats import console
 from pandas.io.formats import format as fmt
@@ -58,10 +57,10 @@ class DataFrame(NDFrame):
         return len(self.columns) == 0 or len(self.index) == 0
 
     def head(self, n=5):
-        return super().head(n)
+        return DataFrame(query_compiler=self._query_compiler.head(n))
 
     def tail(self, n=5):
-        return super().tail(n)
+        return DataFrame(query_compiler=self._query_compiler.tail(n))
 
     def __repr__(self):
         """
@@ -104,7 +103,7 @@ class DataFrame(NDFrame):
             return None
 
         if self._info_repr():
-            buf = StringIO(u(""))
+            buf = StringIO()
             self.info(buf=buf)
             # need to escape the <class>, should be the first line.
             val = buf.getvalue().replace('<', r'&lt;', 1)
@@ -509,7 +508,7 @@ class DataFrame(NDFrame):
         return self.columns
 
     def groupby(self, by=None, axis=0, *args, **kwargs):
-        axis = self._get_axis_number(axis)
+        axis = pd.DataFrame._get_axis_number(axis)
 
         if axis == 1:
             raise NotImplementedError("Aggregating via index not currently implemented - needs index transform")
@@ -544,7 +543,7 @@ class DataFrame(NDFrame):
        if Series.agg is called with single function, returns a scalar
        if Series.agg is called with several functions, returns a Series
        """
-        axis = self._get_axis_number(axis)
+        axis = pd.DataFrame._get_axis_number(axis)
 
        if axis == 1:
            raise NotImplementedError("Aggregating via index not currently implemented - needs index transform")
@@ -579,3 +578,20 @@ class DataFrame(NDFrame):
             )
         else:
             raise NotImplementedError(expr, type(expr))
+
+    def get(self, key, default=None):
+        """Get item from object for given key (DataFrame column, Panel
+        slice, etc.). Returns default value if not found.
+
+        Args:
+            key (DataFrame column, Panel slice) : the key for which value
+                to get
+
+        Returns:
+            value (type of items contained in object) : A value that is
+                stored at the key
+        """
+        if key in self.keys():
+            return self._getitem(key)
+        else:
+            return default
@@ -1,7 +1,7 @@
 # Derived from pandasticsearch filters
 
 # Es filter builder for BooleanCond
-class BooleanFilter(object):
+class BooleanFilter:
     def __init__(self, *args):
         self._filter = None
 
@@ -14,6 +14,8 @@ In case sorting or aggregating on the _id field is required, it is advised to du
 the content of the _id field in another field that has doc_values enabled.)
 
 """
+
+
 class Index:
     ID_INDEX_FIELD = '_id'
     ID_SORT_FIELD = '_doc'  # if index field is _id, sort by _doc
@@ -75,6 +75,7 @@ class Mappings:
             pd_dtype = self._mappings_capabilities.loc[field_name]['pd_dtype']
             self._source_field_pd_dtypes[field_name] = pd_dtype
 
+    @staticmethod
     def _extract_fields_from_mapping(mappings, source_only=False):
         """
         Extract all field names and types from a mapping.
@@ -151,6 +152,7 @@ class Mappings:
 
         return fields
 
+    @staticmethod
     def _create_capability_matrix(all_fields, source_fields, all_fields_caps):
         """
         {
@@ -414,15 +416,27 @@ class Mappings:
         List of source fields where pd_dtype == (int64 or float64 or bool)
         """
         if columns is not None:
-            return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
-                                               ((self._mappings_capabilities.pd_dtype == 'int64') |
-                                                (self._mappings_capabilities.pd_dtype == 'float64') |
-                                                (self._mappings_capabilities.pd_dtype == 'bool'))].loc[columns].index.tolist()
+            if include_bool == True:
+                return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
+                                                   ((self._mappings_capabilities.pd_dtype == 'int64') |
+                                                    (self._mappings_capabilities.pd_dtype == 'float64') |
+                                                    (self._mappings_capabilities.pd_dtype == 'bool'))].loc[
+                    columns].index.tolist()
+            else:
+                return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
+                                                   ((self._mappings_capabilities.pd_dtype == 'int64') |
+                                                    (self._mappings_capabilities.pd_dtype == 'float64'))].loc[
+                    columns].index.tolist()
         else:
-            return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
-                                               ((self._mappings_capabilities.pd_dtype == 'int64') |
-                                                (self._mappings_capabilities.pd_dtype == 'float64') |
-                                                (self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist()
+            if include_bool == True:
+                return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
+                                                   ((self._mappings_capabilities.pd_dtype == 'int64') |
+                                                    (self._mappings_capabilities.pd_dtype == 'float64') |
+                                                    (self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist()
+            else:
+                return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
+                                                   ((self._mappings_capabilities.pd_dtype == 'int64') |
+                                                    (self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
 
     def source_fields(self):
         """
@@ -26,15 +26,13 @@ only Elasticsearch aggregatable fields can be aggregated or grouped.
 import sys
 
 import pandas as pd
-from modin.pandas.base import BasePandasDataset
-from modin.pandas.indexing import _iLocIndexer
-from pandas.util._validators import validate_bool_kwarg
 from pandas.core.dtypes.common import is_list_like
+from pandas.util._validators import validate_bool_kwarg
 
 from eland import ElandQueryCompiler
 
 
-class NDFrame(BasePandasDataset):
+class NDFrame:
 
     def __init__(self,
                  client=None,
@@ -85,6 +83,9 @@ class NDFrame(BasePandasDataset):
 
         return head.append(tail)
 
+    def __getitem__(self, key):
+        return self._getitem(key)
+
     def __getattr__(self, key):
         """After regular attribute access, looks up the name in the columns
 
@@ -105,6 +106,14 @@ class NDFrame(BasePandasDataset):
         # Don't default to pandas, just return approximation TODO - make this more accurate
         return sys.getsizeof(self._query_compiler)
 
+    def __len__(self):
+        """Gets the length of the DataFrame.
+
+        Returns:
+            Returns an integer length of the DataFrame object.
+        """
+        return len(self.index)
+
     @property
     def iloc(self):
         """Purely integer-location based indexing for selection by position.
@@ -235,21 +244,3 @@ class NDFrame(BasePandasDataset):
 
     def describe(self):
         return self._query_compiler.describe()
-
-    def get(self, key, default=None):
-        """Get item from object for given key (DataFrame column, Panel
-        slice, etc.). Returns default value if not found.
-
-        Args:
-            key (DataFrame column, Panel slice) : the key for which value
-                to get
-
-        Returns:
-            value (type of items contained in object) : A value that is
-                stored at the key
-        """
-        if key in self.keys():
-            return self.__getitem__(key)
-        else:
-            return default
-
@@ -1,9 +1,7 @@
 import copy
 from enum import Enum
-from io import StringIO
 
 import pandas as pd
-import numpy as np
 
 from eland import Index
 from eland import Query
@@ -170,7 +168,7 @@ class Operations:
             results[field] = response['aggregations'][field]['value']
 
         # Return single value if this is a series
-        #if len(numeric_source_fields) == 1:
+        # if len(numeric_source_fields) == 1:
         #    return np.float64(results[numeric_source_fields[0]])
 
         s = pd.Series(data=results, index=numeric_source_fields)
@@ -410,7 +408,7 @@ class Operations:
 
         columns = self.get_columns()
 
-        numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns)
+        numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns, include_bool=False)
 
         # for each field we compute:
         # count, mean, std, min, 25%, 50%, 75%, max
@@ -450,6 +448,7 @@ class Operations:
     class PandasDataFrameCollector:
         def collect(self, df):
             self.df = df
+
         def batch_size(self):
             return None
 
@@ -465,6 +464,7 @@ class Operations:
             self.kwargs = kwargs
             self.ret = None
             self.first_time = True
+
         def collect(self, df):
             # If this is the first time we collect results, then write header, otherwise don't write header
             # and append results
@@ -3,6 +3,7 @@ from copy import deepcopy
 
 from eland.filter import BooleanFilter, NotNull, IsNull, IsIn
 
+
 class Query:
     """
     Simple class to manage building Elasticsearch queries.
@@ -1,20 +1,15 @@
 import pandas as pd
-from modin.backends.base.query_compiler import BaseQueryCompiler
+from pandas.core.dtypes.common import (
+    is_list_like
+)
 
 from eland import Client
 from eland import Index
 from eland import Mappings
 from eland import Operations
 
-from pandas.core.dtypes.common import (
-    is_list_like
-)
-
-from pandas.core.indexes.numeric import Int64Index
-from pandas.core.indexes.range import RangeIndex
-
 
-class ElandQueryCompiler(BaseQueryCompiler):
+class ElandQueryCompiler:
     """
     Some notes on what can and can not be mapped:
@@ -318,7 +313,7 @@ class ElandQueryCompiler(BaseQueryCompiler):
         return df
 
     def copy(self):
-        return self.__constructor__(
+        return ElandQueryCompiler(
             client=self._client,
             index_pattern=self._index_pattern,
             columns=None,  # columns are embedded in operations
@@ -412,14 +407,19 @@ class ElandQueryCompiler(BaseQueryCompiler):
 
     def count(self):
         return self._operations.count(self)
+
     def mean(self):
         return self._operations.mean(self)
+
     def sum(self):
         return self._operations.sum(self)
+
     def min(self):
         return self._operations.min(self)
+
     def max(self):
         return self._operations.max(self)
+
     def nunique(self):
         return self._operations.nunique(self)
 
@@ -471,6 +471,4 @@ class ElandQueryCompiler(BaseQueryCompiler):
 
         return result
 
-    #def isna(self):
-
-
+    # def isna(self):
@@ -101,10 +101,10 @@ class Series(NDFrame):
     name = property(_get_name)
 
     def head(self, n=5):
-        return super().head(n)
+        return Series(query_compiler=self._query_compiler.head(n))
 
     def tail(self, n=5):
-        return super().tail(n)
+        return Series(query_compiler=self._query_compiler.tail(n))
 
     # ----------------------------------------------------------------------
     # Rendering Methods
@@ -194,7 +194,6 @@ class Series(NDFrame):
         else:
             raise NotImplementedError(other, type(other))
 
-
     def __eq__(self, other):
         if isinstance(other, Series):
             # Need to use scripted query to compare to values
File diff suppressed because one or more lines are too long
@@ -1,101 +1,98 @@
 import os
 
 import pandas as pd
 
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 
-# Set modin to pandas to avoid starting ray or other
-os.environ["MODIN_ENGINE"] = 'python'
-os.environ["MODIN_BACKEND"] = 'pandas'
 
 # Define test files and indices
 ELASTICSEARCH_HOST = 'localhost'  # TODO externalise this
 
 FLIGHTS_INDEX_NAME = 'flights'
-FLIGHTS_MAPPING = { "mappings" : {
+FLIGHTS_MAPPING = {"mappings": {
     "properties": {
         "AvgTicketPrice": {
             "type": "float"
         },
         "Cancelled": {
             "type": "boolean"
         },
         "Carrier": {
            "type": "keyword"
         },
         "Dest": {
             "type": "keyword"
         },
         "DestAirportID": {
             "type": "keyword"
         },
         "DestCityName": {
             "type": "keyword"
         },
         "DestCountry": {
             "type": "keyword"
         },
         "DestLocation": {
             "type": "geo_point"
         },
         "DestRegion": {
             "type": "keyword"
         },
         "DestWeather": {
             "type": "keyword"
         },
         "DistanceKilometers": {
             "type": "float"
         },
         "DistanceMiles": {
             "type": "float"
         },
         "FlightDelay": {
             "type": "boolean"
         },
         "FlightDelayMin": {
             "type": "integer"
         },
         "FlightDelayType": {
             "type": "keyword"
         },
         "FlightNum": {
             "type": "keyword"
         },
         "FlightTimeHour": {
             "type": "float"
         },
         "FlightTimeMin": {
             "type": "float"
         },
         "Origin": {
             "type": "keyword"
         },
         "OriginAirportID": {
             "type": "keyword"
         },
         "OriginCityName": {
             "type": "keyword"
         },
         "OriginCountry": {
             "type": "keyword"
         },
         "OriginLocation": {
             "type": "geo_point"
         },
         "OriginRegion": {
             "type": "keyword"
         },
         "OriginWeather": {
             "type": "keyword"
         },
         "dayOfWeek": {
             "type": "integer"
         },
         "timestamp": {
             "type": "date"
         }
     }
-} }
+}}
 FLIGHTS_FILE_NAME = ROOT_DIR + '/flights.json.gz'
 FLIGHTS_DF_FILE_NAME = ROOT_DIR + '/flights_df.json.gz'
 
@@ -104,203 +101,203 @@ FLIGHTS_SMALL_MAPPING = FLIGHTS_MAPPING
 FLIGHTS_SMALL_FILE_NAME = ROOT_DIR + '/flights_small.json.gz'
 
 ECOMMERCE_INDEX_NAME = 'ecommerce'
-ECOMMERCE_MAPPING = { "mappings" : {
+ECOMMERCE_MAPPING = {"mappings": {
     "properties": {
         "category": {
             "type": "text",
             "fields": {
                 "keyword": {
                     "type": "keyword"
                 }
             }
         },
         "currency": {
             "type": "keyword"
         },
         "customer_birth_date": {
             "type": "date"
         },
         "customer_first_name": {
             "type": "text",
             "fields": {
                 "keyword": {
                     "type": "keyword",
                     "ignore_above": 256
                 }
             }
         },
         "customer_full_name": {
             "type": "text",
             "fields": {
                 "keyword": {
                     "type": "keyword",
                     "ignore_above": 256
                 }
             }
         },
         "customer_gender": {
             "type": "keyword"
         },
         "customer_id": {
             "type": "keyword"
         },
         "customer_last_name": {
             "type": "text",
             "fields": {
                 "keyword": {
                     "type": "keyword",
                     "ignore_above": 256
                 }
             }
         },
         "customer_phone": {
             "type": "keyword"
         },
         "day_of_week": {
             "type": "keyword"
         },
         "day_of_week_i": {
             "type": "integer"
         },
         "email": {
             "type": "keyword"
         },
         "geoip": {
             "properties": {
                 "city_name": {
                     "type": "keyword"
                 },
                 "continent_name": {
                     "type": "keyword"
                 },
                 "country_iso_code": {
                     "type": "keyword"
                 },
                 "location": {
                     "type": "geo_point"
                 },
                 "region_name": {
                     "type": "keyword"
                 }
             }
         },
         "manufacturer": {
             "type": "text",
             "fields": {
                 "keyword": {
                     "type": "keyword"
                 }
             }
         },
         "order_date": {
             "type": "date"
         },
         "order_id": {
             "type": "keyword"
         },
         "products": {
             "properties": {
                 "_id": {
                     "type": "text",
                     "fields": {
                         "keyword": {
                             "type": "keyword",
                             "ignore_above": 256
                         }
                     }
                 },
                 "base_price": {
                     "type": "half_float"
                 },
                 "base_unit_price": {
                     "type": "half_float"
                 },
                 "category": {
                     "type": "text",
                     "fields": {
                         "keyword": {
                             "type": "keyword"
                         }
                     }
                 },
                 "created_on": {
                     "type": "date"
                 },
                 "discount_amount": {
                     "type": "half_float"
                 },
                 "discount_percentage": {
                     "type": "half_float"
                 },
                 "manufacturer": {
                     "type": "text",
                     "fields": {
                         "keyword": {
                             "type": "keyword"
                         }
                     }
                 },
                 "min_price": {
                     "type": "half_float"
                 },
                 "price": {
                     "type": "half_float"
                 },
                 "product_id": {
                     "type": "long"
                 },
                 "product_name": {
                     "type": "text",
                     "fields": {
                         "keyword": {
                             "type": "keyword"
                         }
                     },
                     "analyzer": "english"
                 },
                 "quantity": {
                     "type": "integer"
                 },
                 "sku": {
                     "type": "keyword"
                 },
                 "tax_amount": {
                     "type": "half_float"
                 },
                 "taxful_price": {
                     "type": "half_float"
                 },
                 "taxless_price": {
                     "type": "half_float"
                 },
                 "unit_discount_amount": {
                     "type": "half_float"
                 }
             }
         },
         "sku": {
             "type": "keyword"
         },
         "taxful_total_price": {
             "type": "half_float"
         },
         "taxless_total_price": {
             "type": "half_float"
         },
         "total_quantity": {
             "type": "integer"
         },
         "total_unique_products": {
             "type": "integer"
         },
         "type": {
             "type": "keyword"
         },
         "user": {
             "type": "keyword"
         }
     }
-} }
+}}
 ECOMMERCE_FILE_NAME = ROOT_DIR + '/ecommerce.json.gz'
 ECOMMERCE_DF_FILE_NAME = ROOT_DIR + '/ecommerce_df.json.gz'
 
@@ -416,7 +413,7 @@ TEST_MAPPING1 = {
             }
         }
     }
 }
 
 TEST_MAPPING1_INDEX_NAME = 'mapping1'
 
@@ -467,28 +464,27 @@ TEST_NESTED_USER_GROUP_MAPPING = {
                     'last': {
                         'type': 'keyword'
                     },
                     'address': {
                         'type': 'keyword'
                     }
                 }
             }
         }
     }
 }
 
 TEST_NESTED_USER_GROUP_DOCS = [
     {'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
      '_source':
          {'group': 'amsterdam', 'user': [
              {'first': 'Manke', 'last': 'Nelis', 'address': ['Elandsgracht', 'Amsterdam']},
              {'first': 'Johnny', 'last': 'Jordaan', 'address': ['Elandsstraat', 'Amsterdam']}]}},
     {'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
      '_source':
          {'group': 'london', 'user': [
              {'first': 'Alice', 'last': 'Monkton'},
              {'first': 'Jimmy', 'last': 'White', 'address': ['London']}]}},
     {'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
      '_source': {'group': 'new york', 'user': [
          {'first': 'Bill', 'last': 'Jones'}]}}
 ]
 
@@ -1,18 +1,16 @@
-import pytest
+import os
 
-import eland as ed
 import pandas as pd
 
 from pandas.util.testing import (assert_frame_equal, assert_series_equal)
 
-import os
+import eland as ed
 
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 
 # Create pandas and eland data frames
 from eland.tests import ELASTICSEARCH_HOST
-from eland.tests import FLIGHTS_DF_FILE_NAME, FLIGHTS_INDEX_NAME,\
-    FLIGHTS_SMALL_INDEX_NAME,\
+from eland.tests import FLIGHTS_DF_FILE_NAME, FLIGHTS_INDEX_NAME, \
+    FLIGHTS_SMALL_INDEX_NAME, \
     ECOMMERCE_DF_FILE_NAME, ECOMMERCE_INDEX_NAME
 
 _pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index()
@@ -34,6 +32,7 @@ _pd_ecommerce.index = _pd_ecommerce.index.map(str)  # make index 'object' not int
 _pd_ecommerce['customer_birth_date'].astype('datetime64')
 _ed_ecommerce = ed.read_es(ELASTICSEARCH_HOST, ECOMMERCE_INDEX_NAME)
 
+
 class TestData:
 
     def pd_flights(self):
@@ -48,13 +47,13 @@ class TestData:
     def ed_flights_small(self):
         return _ed_flights_small
 
-
     def pd_ecommerce(self):
         return _pd_ecommerce
 
     def ed_ecommerce(self):
         return _ed_ecommerce
 
+
 def assert_pandas_eland_frame_equal(left, right):
     if not isinstance(left, pd.DataFrame):
         raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
@@ -67,6 +66,7 @@ def assert_pandas_eland_frame_equal(left, right):
     # Use pandas tests to check similarity
     assert_frame_equal(left, right._to_pandas())
 
+
 def assert_eland_frame_equal(left, right):
     if not isinstance(left, ed.DataFrame):
         raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
@@ -91,4 +91,3 @@ def assert_pandas_eland_series_equal(left, right):
 
     # Use pandas tests to check similarity
     assert_series_equal(left, right._to_pandas())
-
@@ -1,15 +1,14 @@
 # File called _pytest for PyCharm compatability
 
 import numpy as np
-import pandas as pd
-from pandas.util.testing import (assert_almost_equal)
+from pandas.util.testing import assert_almost_equal
 
 from eland.tests.common import TestData
 
 
 class TestDataFrameAggs(TestData):
 
-    def test_to_aggs1(self):
+    def test_basic_aggs(self):
         pd_flights = self.pd_flights()
         ed_flights = self.ed_flights()
 
@@ -1,19 +1,17 @@
 # File called _pytest for PyCharm compatability
 
+from pandas.util.testing import assert_series_equal
 
 from eland.tests.common import TestData
 
 
 class TestDataFrameCount(TestData):
 
-    def test_to_count1(self):
+    def test_ecommerce_count(self):
         pd_ecommerce = self.pd_ecommerce()
         ed_ecommerce = self.ed_ecommerce()
 
         pd_count = pd_ecommerce.count()
         ed_count = ed_ecommerce.count()
 
-        print(pd_count)
-        print(ed_count)
-
-
+        assert_series_equal(pd_count, ed_count)
@@ -6,6 +6,7 @@ import pandas as pd
 import eland as ed
 from eland.tests.common import ELASTICSEARCH_HOST
 from eland.tests.common import TestData
+from eland.tests.common import assert_pandas_eland_frame_equal
 
 
 class TestDataFrameDateTime(TestData):
@@ -41,4 +42,4 @@ class TestDataFrameDateTime(TestData):
         ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name)
         ed_df_head = ed_df.head()
 
-        # assert_frame_equal(df, ed_df_head)
+        assert_pandas_eland_frame_equal(df, ed_df_head)
@@ -1,35 +1,34 @@
 # File called _pytest for PyCharm compatability
-from io import StringIO
+from pandas.util.testing import assert_almost_equal
 
 from eland.tests.common import TestData
 
 
 class TestDataFrameDescribe(TestData):
 
-    def test_to_describe1(self):
+    def test_flights_describe(self):
         pd_flights = self.pd_flights()
         ed_flights = self.ed_flights()
 
         pd_describe = pd_flights.describe()
         ed_describe = ed_flights.describe()
 
-        print(pd_describe)
-        print(ed_describe)
+        assert_almost_equal(pd_describe[['AvgTicketPrice']],
+                            ed_describe[['AvgTicketPrice']],
+                            check_less_precise=True)
 
-        # TODO - this fails now as ES aggregations are approximate
+        # TODO - this fails for all fields now as ES aggregations are approximate
         # if ES percentile agg uses
         # "hdr": {
         #   "number_of_significant_value_digits": 3
         # }
         # this works
-        # assert_almost_equal(pd_flights_describe, ed_flights_describe)
 
-        pd_ecommerce_describe = self.pd_ecommerce().describe()
-        ed_ecommerce_describe = self.ed_ecommerce().describe()
-
+        # pd_ecommerce_describe = self.pd_ecommerce().describe()
+        # ed_ecommerce_describe = self.ed_ecommerce().describe()
         # We don't compare ecommerce here as the default dtypes in pandas from read_json
         # don't match the mapping types. This is mainly because the products field is
         # nested and so can be treated as a multi-field in ES, but not in pandas
 
         # We can not also run 'describe' on a truncate ed dataframe
 
@@ -1,19 +1,14 @@
 # File called _pytest for PyCharm compatability
-import pandas as pd
-import eland as ed
 
 from eland.tests.common import TestData
 from eland.tests.common import (
-    assert_eland_frame_equal,
-    assert_pandas_eland_frame_equal,
-    assert_pandas_eland_series_equal
+    assert_pandas_eland_frame_equal
 )
 
-import numpy as np
 
 class TestDataFrameDrop(TestData):
 
-    def test_drop1(self):
+    def test_flights_small_drop(self):
         ed_flights_small = self.ed_flights_small()
         pd_flights_small = self.pd_flights_small()
 
eland/tests/dataframe/test_dtypes_pytest.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+# File called _pytest for PyCharm compatability
+
+from pandas.util.testing import assert_series_equal
+
+from eland.tests.common import TestData
+
+
+class TestDataFrameDtypes(TestData):
+
+    def test_flights_dtypes(self):
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()
+
+        assert_series_equal(pd_flights.dtypes, ed_flights.dtypes)
@@ -1,18 +1,11 @@
 # File called _pytest for PyCharm compatability
-import pandas as pd
-import eland as ed
 
 from eland.tests.common import TestData
-from eland.tests.common import (
-    assert_pandas_eland_frame_equal,
-    assert_pandas_eland_series_equal
-)
 
-import numpy as np
 
 class TestDataFrameGet(TestData):
 
-    def test_get1(self):
+    def test_get_one_attribute(self):
         ed_flights = self.ed_flights()
         pd_flights = self.pd_flights()
 
@@ -1,5 +1,4 @@
 # File called _pytest for PyCharm compatability
-import pandas as pd
 
 from eland.tests.common import TestData
 from eland.tests.common import (
@@ -8,10 +7,9 @@ from eland.tests.common import (
 )
 
 
-
 class TestDataFrameGetItem(TestData):
 
-    def test_getitem1(self):
+    def test_getitem_one_attribute(self):
         ed_flights = self.ed_flights().head(103)
         pd_flights = self.pd_flights().head(103)
 
@@ -20,7 +18,7 @@ class TestDataFrameGetItem(TestData):
 
         assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID)
 
-    def test_getitem2(self):
+    def test_getitem_attribute_list(self):
         ed_flights = self.ed_flights().head(42)
         pd_flights = self.pd_flights().head(42)
 
@@ -29,7 +27,7 @@ class TestDataFrameGetItem(TestData):
 
         assert_pandas_eland_frame_equal(pd_flights_slice, ed_flights_slice)
 
-    def test_getitem3(self):
+    def test_getitem_one_argument(self):
         ed_flights = self.ed_flights().head(89)
         pd_flights = self.pd_flights().head(89)
 
@@ -38,7 +36,7 @@ class TestDataFrameGetItem(TestData):
 
         assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID)
 
-    def test_getitem4(self):
+    def test_getitem_multiple_calls(self):
         ed_flights = self.ed_flights().head(89)
         pd_flights = self.pd_flights().head(89)
 
@@ -52,4 +50,3 @@ class TestDataFrameGetItem(TestData):
         ed_col1 = ed_col0['DestCountry']
 
         assert_pandas_eland_series_equal(pd_col1, ed_col1)
-
@@ -1,11 +1,9 @@
 # File called _pytest for PyCharm compatability
-import pandas as pd
 
 from eland.tests.common import TestData
 from eland.tests.common import assert_pandas_eland_frame_equal
 
 
-
 class TestDataFrameHeadTail(TestData):
 
     def test_head(self):
@@ -1,6 +1,5 @@
 # File called _pytest for PyCharm compatability
 
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 from pandas.util.testing import assert_almost_equal
@@ -10,7 +9,7 @@ from eland.tests.common import TestData
 
 class TestDataFrameHist(TestData):
 
-    def test_hist1(self):
+    def test_flights_hist(self):
         pd_flights = self.pd_flights()
         ed_flights = self.ed_flights()
 
@@ -30,15 +29,3 @@ class TestDataFrameHist(TestData):
         # Numbers are slightly different
         assert_almost_equal(pd_bins, ed_bins)
         assert_almost_equal(pd_weights, ed_weights)
-
-    def test_hist2(self):
-        pd_df = self.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
-        ed_df = self.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
-
-        num_bins = 10
-
-        ed_bins, ed_weights = ed_df._hist(num_bins=num_bins)
-
-        print(ed_bins)
-
-
@@ -1,45 +0,0 @@
-# File called _pytest for PyCharm compatability
-import pandas as pd
-import eland as ed
-
-from eland.tests.common import TestData
-from eland.tests.common import (
-    assert_pandas_eland_frame_equal,
-    assert_pandas_eland_series_equal
-)
-
-import numpy as np
-
-class TestDataFrameiLoc(TestData):
-
-    def test_iloc1(self):
-        ed_flights = self.ed_flights()
-        pd_flights = self.pd_flights()
-
-        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html#pandas.DataFrame.iloc
-
-        #pd_flights.info()
-
-        pd_iloc0 = pd_flights.iloc[0]
-        pd_iloc1= pd_flights.iloc[[0]]
-        pd_iloc2= pd_flights.iloc[[0, 1]]
-        pd_iloc3 = pd_flights.iloc[:3]
-        pd_iloc5 = pd_flights.iloc[0, 1]
-        pd_iloc6 = pd_flights.iloc[[0, 2], [1, 3]]
-        pd_iloc7 = pd_flights.iloc[1:3, 0:3]
-
-        ed_iloc0 = ed_flights.iloc[0]
-        ed_iloc1 = ed_flights.iloc[[0]]
-        ed_iloc2 = ed_flights.iloc[[0, 1]]
-        ed_iloc3 = ed_flights.iloc[:3]
-        ed_iloc5 = ed_flights.iloc[0, 1]
-        ed_iloc6 = ed_flights.iloc[[0, 2], [1, 3]]
-        ed_iloc7 = ed_flights.iloc[1:3, 0:3]
-
-        #assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series
-        assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1)
-        assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2)
-        assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3)
-        #assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool
-        assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6)
-        assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7)
@@ -1,15 +0,0 @@
-# File called _pytest for PyCharm compatability
-
-from eland.tests.common import TestData
-
-
-class TestDataFrameInfoEs(TestData):
-
-    def test_to_info1(self):
-        ed_flights = self.ed_flights()
-
-        head = ed_flights.head(103)
-        slice = head[['timestamp', 'OriginRegion', 'Carrier']]
-        iloc = slice.iloc[10:92, [0,2]]
-        print(iloc.info_es())
-        print(iloc)
@@ -6,7 +6,7 @@ from eland.tests.common import TestData
 
 class TestDataFrameInfo(TestData):
 
-    def test_to_info1(self):
+    def test_flights_info(self):
         ed_flights = self.ed_flights()
         pd_flights = self.pd_flights()
 
@@ -1,10 +1,9 @@
 # File called _pytest for PyCharm compatability
 
-from eland.tests.common import TestData
-
-
 from pandas.util.testing import assert_series_equal
 
+from eland.tests.common import TestData
+
 
 class TestDataFrameMetrics(TestData):
 
@@ -43,4 +42,3 @@ class TestDataFrameMetrics(TestData):
         ed_max = ed_flights.max(numeric_only=True)
 
         assert_series_equal(pd_max, ed_max)
-
@@ -1,22 +0,0 @@
-# File called _pytest for PyCharm compatability
-import pandas as pd
-import eland as ed
-
-from eland.tests.common import TestData
-from eland.tests.common import (
-    assert_pandas_eland_frame_equal,
-    assert_pandas_eland_series_equal
-)
-
-import numpy as np
-
-
-class TestDataFrameNUnique(TestData):
-
-    def test_nunique1(self):
-        ed_flights = self.ed_flights()
-        pd_flights = self.pd_flights()
-
-        print(pd_flights.dtypes)
-        print(ed_flights.dtypes)
-        print(ed_flights.nunique())
@@ -10,7 +10,7 @@ from eland.tests.common import assert_pandas_eland_frame_equal
 
 class TestDataFrameQuery(TestData):
 
-    def test_query1(self):
+    def test_query(self):
         # Examples from:
         # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
         pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)},
@@ -43,4 +43,3 @@ class TestDataFrameQuery(TestData):
         ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)]
 
         assert_pandas_eland_frame_equal(pd_q4, ed_q4)
-
@@ -3,9 +3,9 @@
 from eland.tests.common import TestData
 
 
-class TestDataFrameHeadTail(TestData):
+class TestDataFrameRepr(TestData):
 
-    def test_to_string1(self):
+    def test_head_101_to_string(self):
         ed_flights = self.ed_flights()
         pd_flights = self.pd_flights()
 
@@ -18,7 +18,7 @@ class TestDataFrameHeadTail(TestData):
 
         assert pd_head_101_str == ed_head_101_str
 
-    def test_to_string2(self):
+    def test_head_11_to_string2(self):
         ed_flights = self.ed_flights()
         pd_flights = self.pd_flights()
 
@@ -30,7 +30,7 @@ class TestDataFrameHeadTail(TestData):
 
         assert pd_head_11_str == ed_head_11_str
 
-    def test_to_repr(self):
+    def test_repr(self):
         ed_ecommerce = self.ed_ecommerce()
         pd_ecommerce = self.pd_ecommerce()
 
@@ -1,5 +1,4 @@
 # File called _pytest for PyCharm compatability
-import pandas as pd
 import numpy as np

 from eland.tests.common import TestData
@@ -8,10 +7,9 @@ from eland.tests.common import (
 )

-

 class TestDataFrameSelectDTypes(TestData):

-    def test_select_dtypes1(self):
+    def test_select_dtypes_include_number(self):
         ed_flights = self.ed_flights()
         pd_flights = self.pd_flights()
@@ -20,7 +18,7 @@ class TestDataFrameSelectDTypes(TestData):

         assert_pandas_eland_frame_equal(pd_flights_numeric.head(103), ed_flights_numeric.head(103))

-    def test_select_dtypes2(self):
+    def test_select_dtypes_exclude_number(self):
         ed_flights = self.ed_flights()
         pd_flights = self.pd_flights()
@@ -28,4 +26,3 @@ class TestDataFrameSelectDTypes(TestData):
         pd_flights_non_numeric = pd_flights.select_dtypes(exclude=[np.number])

         assert_pandas_eland_frame_equal(pd_flights_non_numeric.head(103), ed_flights_non_numeric.head(103))
-
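The renamed tests cover both directions of select_dtypes. In plain pandas the include/exclude split looks like this on a small made-up frame (bool columns do not count as np.number):

import numpy as np
import pandas as pd

df = pd.DataFrame({'AvgTicketPrice': [640.0, 98.5],
                   'Carrier': ['ES-Air', 'JetBeats'],
                   'Cancelled': [False, True]})

# Numeric columns only: ['AvgTicketPrice']
numeric = df.select_dtypes(include=[np.number])

# Everything else: ['Carrier', 'Cancelled']
non_numeric = df.select_dtypes(exclude=[np.number])

print(list(numeric.columns), list(non_numeric.columns))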
@@ -22,5 +22,3 @@ class TestDataFrameShape(TestData):
         ed_shape = ed_flights.shape

         assert pd_shape == ed_shape
-
-
@@ -1,14 +1,13 @@
 # File called _pytest for PyCharm compatability
-import pandas as pd
-
-from eland.tests.common import TestData
-from eland.tests.common import ROOT_DIR
-
-from pandas.util.testing import (assert_equal, assert_frame_equal)
-
 import ast
+
+import pandas as pd
+from pandas.util.testing import (assert_frame_equal)
+
+from eland.tests.common import ROOT_DIR
+from eland.tests.common import TestData


 class TestDataFrameToCSV(TestData):
@@ -43,6 +42,3 @@ class TestDataFrameToCSV(TestData):
         pd_from_csv.timestamp = pd.to_datetime(pd_from_csv.timestamp)

         assert_frame_equal(pd_flights, pd_from_csv)
-
-
-
@@ -1,12 +1,13 @@
 # File called _pytest for PyCharm compatability
-from eland.tests.common import TestData
 from pandas.util.testing import assert_series_equal

+from eland.tests.common import TestData
+

 class TestMappingsDtypes(TestData):

-    def test_dtypes1(self):
+    def test_flights_dtypes_all(self):
         ed_flights = self.ed_flights()
         pd_flights = self.pd_flights()
@@ -15,7 +16,7 @@ class TestMappingsDtypes(TestData):

         assert_series_equal(pd_dtypes, ed_dtypes)

-    def test_dtypes2(self):
+    def test_flights_dtypes_columns(self):
         ed_flights = self.ed_flights()
         pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]
@@ -24,7 +25,7 @@ class TestMappingsDtypes(TestData):

         assert_series_equal(pd_dtypes, ed_dtypes)

-    def test_get_dtype_counts1(self):
+    def test_flights_get_dtype_counts_all(self):
         ed_flights = self.ed_flights()
         pd_flights = self.pd_flights()
@@ -33,12 +34,12 @@ class TestMappingsDtypes(TestData):

         assert_series_equal(pd_dtypes, ed_dtypes)

-    def test_get_dtype_counts2(self):
+    def test_flights_get_dtype_counts_columns(self):
         ed_flights = self.ed_flights()
         pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]

         pd_dtypes = pd_flights.get_dtype_counts().sort_index()
-        ed_dtypes = ed_flights._query_compiler._mappings.\
+        ed_dtypes = ed_flights._query_compiler._mappings. \
             get_dtype_counts(columns=['Carrier', 'AvgTicketPrice', 'Cancelled']).sort_index()

         assert_series_equal(pd_dtypes, ed_dtypes)
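These tests compare per-column dtypes and dtype counts between the eland mappings layer and pandas. With pandas 0.25.1 (the version pinned in the new requirements-dev.txt below) the two pandas-side calls look like this on a small, made-up frame:

import pandas as pd

df = pd.DataFrame({'Carrier': ['ES-Air', 'JetBeats'],
                   'AvgTicketPrice': [640.0, 98.5],
                   'Cancelled': [False, True]})

# Per-column dtypes: object, float64, bool
print(df.dtypes)

# Number of columns per dtype; available in pandas 0.25 but removed in later releases
print(df.get_dtype_counts().sort_index())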
@@ -1,8 +1,9 @@
 # File called _pytest for PyCharm compatability
+from matplotlib.testing.decorators import check_figures_equal

 from eland.tests.common import TestData

-from matplotlib.testing.decorators import check_figures_equal
+

 @check_figures_equal(extensions=['png'])
 def test_plot_hist(fig_test, fig_ref):
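check_figures_equal drives the test with two Figure objects, renders both to the listed extensions, and fails if the images differ. A minimal sketch of the pattern with a pandas histogram (illustrative only, not the body of the eland test):

import numpy as np
import pandas as pd
from matplotlib.testing.decorators import check_figures_equal


@check_figures_equal(extensions=['png'])
def test_hist_matches_reference(fig_test, fig_ref):
    values = pd.Series(np.arange(100.0))

    # Histogram drawn by the code under test ...
    values.hist(ax=fig_test.subplots(), bins=10)

    # ... must render identically to the reference drawing
    values.hist(ax=fig_ref.subplots(), bins=10)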
@@ -1,8 +1,7 @@
 # File called _pytest for PyCharm compatability
-from eland.tests.common import TestData
-
 from eland import Query
+from eland.tests.common import TestData


 class TestQueryCopy(TestData):
@@ -22,6 +21,3 @@ class TestQueryCopy(TestData):

         print(q.to_search_body())
         print(q1.to_search_body())
-
-
-
@@ -1,15 +1,9 @@
 # File called _pytest for PyCharm compatability
-import pandas as pd
 import eland as ed

-from eland.tests.common import TestData
-from eland.tests.common import assert_pandas_eland_series_equal
-
 from eland.tests import ELASTICSEARCH_HOST
 from eland.tests import FLIGHTS_INDEX_NAME
-from pandas.util.testing import assert_series_equal
+from eland.tests.common import TestData
+from eland.tests.common import assert_pandas_eland_series_equal


 class TestSeriesHeadTail(TestData):
@@ -1,15 +1,8 @@
 # File called _pytest for PyCharm compatability
-import pandas as pd
 import eland as ed

-from eland.tests.common import TestData
-from eland.tests.common import assert_pandas_eland_frame_equal
-
 from eland.tests import ELASTICSEARCH_HOST
 from eland.tests import FLIGHTS_INDEX_NAME
-from pandas.util.testing import assert_series_equal
+from eland.tests.common import TestData


 class TestSeriesRepr(TestData):
@@ -1,4 +1,3 @@
-import pandas as pd
 from elasticsearch import Elasticsearch
 from elasticsearch import helpers
@@ -10,6 +9,7 @@ DATA_LIST = [
     (ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING)
 ]

+
 def _setup_data(es):
     # Read json file and index records into Elasticsearch
     for data in DATA_LIST:
@@ -32,7 +32,7 @@ def _setup_data(es):
         for index, row in df.iterrows():
             values = row.to_dict()
             # make timestamp datetime 2018-01-01T12:09:35
-            #values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S')
+            # values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S')

             # Use integer as id field for repeatable results
             action = {'_index': index_name, '_source': values, '_id': str(n)}
@@ -50,17 +50,20 @@ def _setup_data(es):

     print("Done", index_name)

+
 def _setup_test_mappings(es):
     # Create a complex mapping containing many Elasticsearch features
     es.indices.delete(index=TEST_MAPPING1_INDEX_NAME, ignore=[400, 404])
     es.indices.create(index=TEST_MAPPING1_INDEX_NAME, body=TEST_MAPPING1)

+
 def _setup_test_nested(es):
     es.indices.delete(index=TEST_NESTED_USER_GROUP_INDEX_NAME, ignore=[400, 404])
     es.indices.create(index=TEST_NESTED_USER_GROUP_INDEX_NAME, body=TEST_NESTED_USER_GROUP_MAPPING)

     helpers.bulk(es, TEST_NESTED_USER_GROUP_DOCS)

+
 if __name__ == '__main__':
     # Create connection to Elasticsearch - use defaults
     es = Elasticsearch(ELASTICSEARCH_HOST)
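The setup script builds one bulk action per DataFrame row, with the row number as an explicit _id so repeated runs index the same documents. A condensed sketch of that pattern with the elasticsearch-py bulk helper (the index name and data are made up, not the test fixtures):

import pandas as pd
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch('localhost')
df = pd.DataFrame({'Carrier': ['ES-Air', 'JetBeats'], 'Cancelled': [False, True]})

actions = []
for n, (_, row) in enumerate(df.iterrows()):
    # Integer id keeps results repeatable across runs
    actions.append({'_index': 'example-index', '_source': row.to_dict(), '_id': str(n)})

helpers.bulk(es, actions)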
@@ -7,7 +7,8 @@ def read_es(es_params, index_pattern):
     return DataFrame(client=es_params, index_pattern=index_pattern)


-def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False, geo_points=None):
+def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
+                 geo_points=None):
     """
     Append a pandas DataFrame to an Elasticsearch index.
     Mainly used in testing.
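Only the line wrapping of the pandas_to_es signature changes here. For orientation, a usage sketch based on the two signatures visible in this hunk; the cluster address and index names are illustrative, both helpers are assumed to be re-exported from the eland package, and if_exists is assumed to follow the usual fail/replace/append semantics:

import pandas as pd
import eland as ed

# Read an existing Elasticsearch index into an eland DataFrame
ed_flights = ed.read_es('localhost', 'flights')

# Push a small pandas DataFrame into a (hypothetical) test index
pd_df = pd.DataFrame({'Carrier': ['ES-Air'], 'Cancelled': [False]})
ed.pandas_to_es(pd_df, 'localhost', 'flights_test', if_exists='replace', refresh=True)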
requirements-dev.txt (new file)
@@ -0,0 +1,3 @@
+elasticsearch>=7.0.5
+pandas==0.25.1
+pytest>=5.2.1
@@ -1,2 +1,3 @@
 elasticsearch>=7.0.5
 pandas==0.25.1
+matplotlib
setup.py
@@ -1,9 +1,11 @@
 from setuptools import setup

+
 def readme():
     with open('README.rst') as f:
         return f.read()

+
 setup(name='eland',
       version='0.1',
       description='Python elasticsearch client to analyse, explore and manipulate data that resides in elasticsearch',