Merge pull request #30 from stevedodson/master

Major cleanup - removed modin
This commit is contained in:
stevedodson 2019-11-04 14:21:44 +01:00 committed by GitHub
commit 90ea637707
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
49 changed files with 8629 additions and 2333 deletions

View File

@ -1,19 +1,14 @@
from __future__ import absolute_import
import os
# Set modin to pandas to avoid starting ray or other
os.environ["MODIN_ENGINE"] = 'python'
os.environ["MODIN_BACKEND"] = 'pandas'
from eland.client import *
from eland.dataframe import *
from eland.filter import *
from eland.index import *
from eland.mappings import *
from eland.filter import *
from eland.query import *
from eland.operations import *
from eland.query_compiler import *
from eland.plotting import *
from eland.ndframe import *
from eland.operations import *
from eland.plotting import *
from eland.query import *
from eland.query_compiler import *
from eland.series import *
from eland.dataframe import *
from eland.utils import *

View File

@ -1,10 +1,12 @@
from elasticsearch import Elasticsearch
from elasticsearch import helpers
class Client:
"""
eland client - implemented as facade to control access to Elasticsearch methods
"""
def __init__(self, es=None):
if isinstance(es, Elasticsearch):
self._es = es
@ -40,4 +42,3 @@ class Client:
def count(self, **kwargs):
count_json = self._es.count(**kwargs)
return count_json['count']

View File

@ -1,16 +1,15 @@
import sys
import warnings
from distutils.version import LooseVersion
from io import StringIO
import numpy as np
import pandas as pd
import pandas.compat as compat
import six
from io import StringIO
from pandas.core.common import apply_if_callable, is_bool_indexer
from pandas.core.dtypes.common import (
is_list_like
)
from pandas.core.dtypes.common import is_list_like
from pandas.core.indexing import check_bool_indexer
from pandas.io.common import _expand_user, _stringify_path
from pandas.io.formats import console
from pandas.io.formats import format as fmt
@ -58,10 +57,10 @@ class DataFrame(NDFrame):
return len(self.columns) == 0 or len(self.index) == 0
def head(self, n=5):
return super().head(n)
return DataFrame(query_compiler=self._query_compiler.head(n))
def tail(self, n=5):
return super().tail(n)
return DataFrame(query_compiler=self._query_compiler.tail(n))
def __repr__(self):
"""
@ -104,7 +103,7 @@ class DataFrame(NDFrame):
return None
if self._info_repr():
buf = StringIO(u(""))
buf = StringIO()
self.info(buf=buf)
# need to escape the <class>, should be the first line.
val = buf.getvalue().replace('<', r'&lt;', 1)
@ -463,7 +462,6 @@ class DataFrame(NDFrame):
"quotechar": quotechar,
"line_terminator": line_terminator,
"chunksize": chunksize,
"tupleize_cols": tupleize_cols,
"date_format": date_format,
"doublequote": doublequote,
"escapechar": escapechar,
@ -510,7 +508,7 @@ class DataFrame(NDFrame):
return self.columns
def groupby(self, by=None, axis=0, *args, **kwargs):
axis = self._get_axis_number(axis)
axis = pd.DataFrame._get_axis_number(axis)
if axis == 1:
raise NotImplementedError("Aggregating via index not currently implemented - needs index transform")
@ -545,14 +543,14 @@ class DataFrame(NDFrame):
if Series.agg is called with single function, returns a scalar
if Series.agg is called with several functions, returns a Series
"""
axis = self._get_axis_number(axis)
axis = pd.DataFrame._get_axis_number(axis)
if axis == 1:
raise NotImplementedError("Aggregating via index not currently implemented - needs index transform")
# currently we only support a subset of functions that aggregate columns.
# ['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique']
if isinstance(func, compat.string_types):
if isinstance(func, str):
# wrap in list
func = [func]
return self._query_compiler.aggs(func)
@ -580,3 +578,20 @@ class DataFrame(NDFrame):
)
else:
raise NotImplementedError(expr, type(expr))
def get(self, key, default=None):
"""Get item from object for given key (DataFrame column, Panel
slice, etc.). Returns default value if not found.
Args:
key (DataFrame column, Panel slice) : the key for which value
to get
Returns:
value (type of items contained in object) : A value that is
stored at the key
"""
if key in self.keys():
return self._getitem(key)
else:
return default

View File

@ -1,7 +1,7 @@
# Derived from pandasticsearch filters
# Es filter builder for BooleanCond
class BooleanFilter(object):
class BooleanFilter:
def __init__(self, *args):
self._filter = None

View File

@ -14,9 +14,11 @@ In case sorting or aggregating on the _id field is required, it is advised to du
the content of the _id field in another field that has doc_values enabled.)
"""
class Index:
ID_INDEX_FIELD = '_id'
ID_SORT_FIELD = '_doc' # if index field is _id, sort by _doc
ID_SORT_FIELD = '_doc' # if index field is _id, sort by _doc
def __init__(self, query_compiler, index_field=None):
# Calls setter

View File

@ -75,6 +75,7 @@ class Mappings:
pd_dtype = self._mappings_capabilities.loc[field_name]['pd_dtype']
self._source_field_pd_dtypes[field_name] = pd_dtype
@staticmethod
def _extract_fields_from_mapping(mappings, source_only=False):
"""
Extract all field names and types from a mapping.
@ -151,6 +152,7 @@ class Mappings:
return fields
@staticmethod
def _create_capability_matrix(all_fields, source_fields, all_fields_caps):
"""
{
@ -290,7 +292,7 @@ class Mappings:
return es_dtype
@staticmethod
def _generate_es_mappings(dataframe):
def _generate_es_mappings(dataframe, geo_points=None):
"""Given a pandas dataframe, generate the associated Elasticsearch mapping
Parameters
@ -325,7 +327,10 @@ class Mappings:
mappings = {}
mappings['properties'] = {}
for column_name, dtype in dataframe.dtypes.iteritems():
es_dtype = Mappings._pd_dtype_to_es_dtype(dtype)
if geo_points is not None and column_name in geo_points:
es_dtype = 'geo_point'
else:
es_dtype = Mappings._pd_dtype_to_es_dtype(dtype)
mappings['properties'][column_name] = {}
mappings['properties'][column_name]['type'] = es_dtype
@ -411,15 +416,27 @@ class Mappings:
List of source fields where pd_dtype == (int64 or float64 or bool)
"""
if columns is not None:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))].loc[columns].index.tolist()
if include_bool == True:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))].loc[
columns].index.tolist()
else:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64'))].loc[
columns].index.tolist()
else:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist()
if include_bool == True:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist()
else:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
def source_fields(self):
"""

View File

@ -26,15 +26,13 @@ only Elasticsearch aggregatable fields can be aggregated or grouped.
import sys
import pandas as pd
from modin.pandas.base import BasePandasDataset
from modin.pandas.indexing import _iLocIndexer
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.common import is_list_like
from pandas.util._validators import validate_bool_kwarg
from eland import ElandQueryCompiler
class NDFrame(BasePandasDataset):
class NDFrame:
def __init__(self,
client=None,
@ -85,6 +83,9 @@ class NDFrame(BasePandasDataset):
return head.append(tail)
def __getitem__(self, key):
return self._getitem(key)
def __getattr__(self, key):
"""After regular attribute access, looks up the name in the columns
@ -105,6 +106,14 @@ class NDFrame(BasePandasDataset):
# Don't default to pandas, just return approximation TODO - make this more accurate
return sys.getsizeof(self._query_compiler)
def __len__(self):
"""Gets the length of the DataFrame.
Returns:
Returns an integer length of the DataFrame object.
"""
return len(self.index)
@property
def iloc(self):
"""Purely integer-location based indexing for selection by position.
@ -235,21 +244,3 @@ class NDFrame(BasePandasDataset):
def describe(self):
return self._query_compiler.describe()
def get(self, key, default=None):
"""Get item from object for given key (DataFrame column, Panel
slice, etc.). Returns default value if not found.
Args:
key (DataFrame column, Panel slice) : the key for which value
to get
Returns:
value (type of items contained in object) : A value that is
stored at the key
"""
if key in self.keys():
return self.__getitem__(key)
else:
return default

View File

@ -1,9 +1,7 @@
import copy
from enum import Enum
from io import StringIO
import pandas as pd
import numpy as np
from eland import Index
from eland import Query
@ -170,7 +168,7 @@ class Operations:
results[field] = response['aggregations'][field]['value']
# Return single value if this is a series
#if len(numeric_source_fields) == 1:
# if len(numeric_source_fields) == 1:
# return np.float64(results[numeric_source_fields[0]])
s = pd.Series(data=results, index=numeric_source_fields)
@ -391,7 +389,7 @@ class Operations:
values = list()
for es_agg in es_aggs:
if isinstance(es_agg, tuple):
values.append(response['aggregations'][es_agg[0] + '_' + field][es_agg[1]])
values.append(response['aggregations'][es_agg[0] + '_' + field][es_agg[1]])
else:
values.append(response['aggregations'][es_agg + '_' + field]['value'])
@ -410,7 +408,7 @@ class Operations:
columns = self.get_columns()
numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns)
numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns, include_bool=False)
# for each field we compute:
# count, mean, std, min, 25%, 50%, 75%, max
@ -450,6 +448,7 @@ class Operations:
class PandasDataFrameCollector:
def collect(self, df):
self.df = df
def batch_size(self):
return None
@ -465,6 +464,7 @@ class Operations:
self.kwargs = kwargs
self.ret = None
self.first_time = True
def collect(self, df):
# If this is the first time we collect results, then write header, otherwise don't write header
# and append results

View File

@ -3,13 +3,14 @@ import numpy as np
import pandas.core.common as com
from pandas.core.dtypes.generic import (
ABCIndexClass)
from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots
def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
sharey=False, figsize=None, layout=None, bins=10, **kwds):
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
sharey=False, figsize=None, layout=None, bins=10, **kwds):
"""
Derived from pandas.plotting._core.hist_frame 0.24.2
Derived from pandas.plotting._core.hist_frame 0.24.2 - TODO update to 0.25.1
Ideally, we'd call hist_frame directly with histogram data,
but weights are applied to ALL series. For example, we can
@ -29,8 +30,6 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
# Start with empty pandas data frame derived from
ed_df_bins, ed_df_weights = ed_df._hist(num_bins=bins)
_raise_if_no_mpl()
_converter._WARN = False
if by is not None:
raise NotImplementedError("TODO")
"""

View File

@ -3,6 +3,7 @@ from copy import deepcopy
from eland.filter import BooleanFilter, NotNull, IsNull, IsIn
class Query:
"""
Simple class to manage building Elasticsearch queries.

View File

@ -1,20 +1,15 @@
import pandas as pd
from modin.backends.base.query_compiler import BaseQueryCompiler
from pandas.core.dtypes.common import (
is_list_like
)
from eland import Client
from eland import Index
from eland import Mappings
from eland import Operations
from pandas.core.dtypes.common import (
is_list_like
)
from pandas.core.indexes.numeric import Int64Index
from pandas.core.indexes.range import RangeIndex
class ElandQueryCompiler(BaseQueryCompiler):
class ElandQueryCompiler:
"""
Some notes on what can and can not be mapped:
@ -318,10 +313,10 @@ class ElandQueryCompiler(BaseQueryCompiler):
return df
def copy(self):
return self.__constructor__(
return ElandQueryCompiler(
client=self._client,
index_pattern=self._index_pattern,
columns=None, # columns are embedded in operations
columns=None, # columns are embedded in operations
index_field=self._index.index_field,
operations=self._operations.copy()
)
@ -412,14 +407,19 @@ class ElandQueryCompiler(BaseQueryCompiler):
def count(self):
return self._operations.count(self)
def mean(self):
return self._operations.mean(self)
def sum(self):
return self._operations.sum(self)
def min(self):
return self._operations.min(self)
def max(self):
return self._operations.max(self)
def nunique(self):
return self._operations.nunique(self)
@ -471,6 +471,4 @@ class ElandQueryCompiler(BaseQueryCompiler):
return result
#def isna(self):
# def isna(self):

View File

@ -101,10 +101,10 @@ class Series(NDFrame):
name = property(_get_name)
def head(self, n=5):
return super().head(n)
return Series(query_compiler=self._query_compiler.head(n))
def tail(self, n=5):
return super().tail(n)
return Series(query_compiler=self._query_compiler.tail(n))
# ----------------------------------------------------------------------
# Rendering Methods
@ -194,7 +194,6 @@ class Series(NDFrame):
else:
raise NotImplementedError(other, type(other))
def __eq__(self, other):
if isinstance(other, Series):
# Need to use scripted query to compare to values

File diff suppressed because one or more lines are too long

View File

@ -1,101 +1,98 @@
import os
import pandas as pd
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
# Set modin to pandas to avoid starting ray or other
os.environ["MODIN_ENGINE"] = 'python'
os.environ["MODIN_BACKEND"] = 'pandas'
# Define test files and indices
ELASTICSEARCH_HOST = 'localhost' # TODO externalise this
ELASTICSEARCH_HOST = 'localhost' # TODO externalise this
FLIGHTS_INDEX_NAME = 'flights'
FLIGHTS_MAPPING = { "mappings" : {
"properties" : {
"AvgTicketPrice" : {
"type" : "float"
FLIGHTS_MAPPING = {"mappings": {
"properties": {
"AvgTicketPrice": {
"type": "float"
},
"Cancelled" : {
"type" : "boolean"
"Cancelled": {
"type": "boolean"
},
"Carrier" : {
"type" : "keyword"
"Carrier": {
"type": "keyword"
},
"Dest" : {
"type" : "keyword"
"Dest": {
"type": "keyword"
},
"DestAirportID" : {
"type" : "keyword"
"DestAirportID": {
"type": "keyword"
},
"DestCityName" : {
"type" : "keyword"
"DestCityName": {
"type": "keyword"
},
"DestCountry" : {
"type" : "keyword"
"DestCountry": {
"type": "keyword"
},
"DestLocation" : {
"type" : "geo_point"
"DestLocation": {
"type": "geo_point"
},
"DestRegion" : {
"type" : "keyword"
"DestRegion": {
"type": "keyword"
},
"DestWeather" : {
"type" : "keyword"
"DestWeather": {
"type": "keyword"
},
"DistanceKilometers" : {
"type" : "float"
"DistanceKilometers": {
"type": "float"
},
"DistanceMiles" : {
"type" : "float"
"DistanceMiles": {
"type": "float"
},
"FlightDelay" : {
"type" : "boolean"
"FlightDelay": {
"type": "boolean"
},
"FlightDelayMin" : {
"type" : "integer"
"FlightDelayMin": {
"type": "integer"
},
"FlightDelayType" : {
"type" : "keyword"
"FlightDelayType": {
"type": "keyword"
},
"FlightNum" : {
"type" : "keyword"
"FlightNum": {
"type": "keyword"
},
"FlightTimeHour" : {
"type" : "float"
"FlightTimeHour": {
"type": "float"
},
"FlightTimeMin" : {
"type" : "float"
"FlightTimeMin": {
"type": "float"
},
"Origin" : {
"type" : "keyword"
"Origin": {
"type": "keyword"
},
"OriginAirportID" : {
"type" : "keyword"
"OriginAirportID": {
"type": "keyword"
},
"OriginCityName" : {
"type" : "keyword"
"OriginCityName": {
"type": "keyword"
},
"OriginCountry" : {
"type" : "keyword"
"OriginCountry": {
"type": "keyword"
},
"OriginLocation" : {
"type" : "geo_point"
"OriginLocation": {
"type": "geo_point"
},
"OriginRegion" : {
"type" : "keyword"
"OriginRegion": {
"type": "keyword"
},
"OriginWeather" : {
"type" : "keyword"
"OriginWeather": {
"type": "keyword"
},
"dayOfWeek" : {
"type" : "integer"
"dayOfWeek": {
"type": "integer"
},
"timestamp" : {
"type" : "date"
"timestamp": {
"type": "date"
}
}
} }
}
}}
FLIGHTS_FILE_NAME = ROOT_DIR + '/flights.json.gz'
FLIGHTS_DF_FILE_NAME = ROOT_DIR + '/flights_df.json.gz'
@ -104,319 +101,319 @@ FLIGHTS_SMALL_MAPPING = FLIGHTS_MAPPING
FLIGHTS_SMALL_FILE_NAME = ROOT_DIR + '/flights_small.json.gz'
ECOMMERCE_INDEX_NAME = 'ecommerce'
ECOMMERCE_MAPPING = { "mappings" : {
"properties" : {
"category" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword"
}
}
},
"currency" : {
"type" : "keyword"
},
"customer_birth_date" : {
"type" : "date"
},
"customer_first_name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"customer_full_name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"customer_gender" : {
"type" : "keyword"
},
"customer_id" : {
"type" : "keyword"
},
"customer_last_name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"customer_phone" : {
"type" : "keyword"
},
"day_of_week" : {
"type" : "keyword"
},
"day_of_week_i" : {
"type" : "integer"
},
"email" : {
"type" : "keyword"
},
"geoip" : {
"properties" : {
"city_name" : {
"type" : "keyword"
},
"continent_name" : {
"type" : "keyword"
},
"country_iso_code" : {
"type" : "keyword"
},
"location" : {
"type" : "geo_point"
},
"region_name" : {
"type" : "keyword"
}
}
},
"manufacturer" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword"
}
}
},
"order_date" : {
"type" : "date"
},
"order_id" : {
"type" : "keyword"
},
"products" : {
"properties" : {
"_id" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
ECOMMERCE_MAPPING = {"mappings": {
"properties": {
"category": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"base_price" : {
"type" : "half_float"
},
"base_unit_price" : {
"type" : "half_float"
},
"category" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword"
}
}
},
"created_on" : {
"type" : "date"
},
"discount_amount" : {
"type" : "half_float"
},
"discount_percentage" : {
"type" : "half_float"
},
"manufacturer" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword"
}
}
},
"min_price" : {
"type" : "half_float"
},
"price" : {
"type" : "half_float"
},
"product_id" : {
"type" : "long"
},
"product_name" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword"
}
},
"analyzer" : "english"
},
"quantity" : {
"type" : "integer"
},
"sku" : {
"type" : "keyword"
},
"tax_amount" : {
"type" : "half_float"
},
"taxful_price" : {
"type" : "half_float"
},
"taxless_price" : {
"type" : "half_float"
},
"unit_discount_amount" : {
"type" : "half_float"
}
}
},
"sku" : {
"type" : "keyword"
"currency": {
"type": "keyword"
},
"taxful_total_price" : {
"type" : "half_float"
"customer_birth_date": {
"type": "date"
},
"taxless_total_price" : {
"type" : "half_float"
"customer_first_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"total_quantity" : {
"type" : "integer"
"customer_full_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"total_unique_products" : {
"type" : "integer"
"customer_gender": {
"type": "keyword"
},
"type" : {
"type" : "keyword"
"customer_id": {
"type": "keyword"
},
"user" : {
"type" : "keyword"
"customer_last_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"customer_phone": {
"type": "keyword"
},
"day_of_week": {
"type": "keyword"
},
"day_of_week_i": {
"type": "integer"
},
"email": {
"type": "keyword"
},
"geoip": {
"properties": {
"city_name": {
"type": "keyword"
},
"continent_name": {
"type": "keyword"
},
"country_iso_code": {
"type": "keyword"
},
"location": {
"type": "geo_point"
},
"region_name": {
"type": "keyword"
}
}
},
"manufacturer": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"order_date": {
"type": "date"
},
"order_id": {
"type": "keyword"
},
"products": {
"properties": {
"_id": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"base_price": {
"type": "half_float"
},
"base_unit_price": {
"type": "half_float"
},
"category": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"created_on": {
"type": "date"
},
"discount_amount": {
"type": "half_float"
},
"discount_percentage": {
"type": "half_float"
},
"manufacturer": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"min_price": {
"type": "half_float"
},
"price": {
"type": "half_float"
},
"product_id": {
"type": "long"
},
"product_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
},
"analyzer": "english"
},
"quantity": {
"type": "integer"
},
"sku": {
"type": "keyword"
},
"tax_amount": {
"type": "half_float"
},
"taxful_price": {
"type": "half_float"
},
"taxless_price": {
"type": "half_float"
},
"unit_discount_amount": {
"type": "half_float"
}
}
},
"sku": {
"type": "keyword"
},
"taxful_total_price": {
"type": "half_float"
},
"taxless_total_price": {
"type": "half_float"
},
"total_quantity": {
"type": "integer"
},
"total_unique_products": {
"type": "integer"
},
"type": {
"type": "keyword"
},
"user": {
"type": "keyword"
}
}
} }
}
}}
ECOMMERCE_FILE_NAME = ROOT_DIR + '/ecommerce.json.gz'
ECOMMERCE_DF_FILE_NAME = ROOT_DIR + '/ecommerce_df.json.gz'
TEST_MAPPING1 = {
'mappings': {
'mappings': {
'properties': {
'city': {
'type': 'text',
'fields': {
'raw': {
'type': 'keyword'
}
}
},
'text': {
'type': 'text',
'fields': {
'english': {
'type': 'text',
'analyzer': 'english'
}
}
},
'origin_location': {
'properties': {
'lat': {
'type': 'text',
'index_prefixes': {},
'fields': {
'keyword': {
'type': 'keyword',
'ignore_above': 256
}
}
},
'lon': {
'city': {
'type': 'text',
'fields': {
'keyword': {
'type': 'keyword',
'ignore_above': 256
}
'raw': {
'type': 'keyword'
}
}
}
}
},
'maps-telemetry': {
'properties': {
'attributesPerMap': {
},
'text': {
'type': 'text',
'fields': {
'english': {
'type': 'text',
'analyzer': 'english'
}
}
},
'origin_location': {
'properties': {
'dataSourcesCount': {
'properties': {
'avg': {
'type': 'long'
},
'max': {
'type': 'long'
},
'min': {
'type': 'long'
}
}
},
'emsVectorLayersCount': {
'dynamic': 'true',
'properties': {
'france_departments': {
'properties': {
'avg': {
'type': 'float'
},
'max': {
'type': 'long'
},
'min': {
'type': 'long'
}
'lat': {
'type': 'text',
'index_prefixes': {},
'fields': {
'keyword': {
'type': 'keyword',
'ignore_above': 256
}
}
},
'lon': {
'type': 'text',
'fields': {
'keyword': {
'type': 'keyword',
'ignore_above': 256
}
}
}
}
}
}
}
},
'maps-telemetry': {
'properties': {
'attributesPerMap': {
'properties': {
'dataSourcesCount': {
'properties': {
'avg': {
'type': 'long'
},
'max': {
'type': 'long'
},
'min': {
'type': 'long'
}
}
},
'emsVectorLayersCount': {
'dynamic': 'true',
'properties': {
'france_departments': {
'properties': {
'avg': {
'type': 'float'
},
'max': {
'type': 'long'
},
'min': {
'type': 'long'
}
}
}
}
}
}
}
}
},
'type': {
'type': 'keyword'
},
'name': {
'type': 'text'
},
'user_name': {
'type': 'keyword'
},
'email': {
'type': 'keyword'
},
'content': {
'type': 'text'
},
'tweeted_at': {
'type': 'date'
},
'dest_location': {
'type': 'geo_point'
},
'my_join_field': {
'type': 'join',
'relations': {
'question': ['answer', 'comment'],
'answer': 'vote'
}
}
},
'type': {
'type': 'keyword'
},
'name': {
'type': 'text'
},
'user_name': {
'type': 'keyword'
},
'email': {
'type': 'keyword'
},
'content': {
'type': 'text'
},
'tweeted_at': {
'type': 'date'
},
'dest_location': {
'type': 'geo_point'
},
'my_join_field': {
'type': 'join',
'relations': {
'question': ['answer', 'comment'],
'answer': 'vote'
}
}
}
}
}
}
TEST_MAPPING1_INDEX_NAME = 'mapping1'
@ -447,48 +444,47 @@ TEST_MAPPING1_EXPECTED = {
TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['es_dtype'])
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_DF = TEST_MAPPING1_EXPECTED_DF.drop(index=['city.raw',
'origin_location.lat.keyword',
'origin_location.lon.keyword',
'text.english'])
'origin_location.lat.keyword',
'origin_location.lon.keyword',
'text.english'])
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT = len(TEST_MAPPING1_EXPECTED_SOURCE_FIELD_DF.index)
TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group'
TEST_NESTED_USER_GROUP_MAPPING = {
'mappings': {
'properties': {
'group': {
'type': 'keyword'
},
'user': {
'mappings': {
'properties': {
'first': {
'type': 'keyword'
'group': {
'type': 'keyword'
},
'last': {
'type': 'keyword'
},
'address' : {
'type' : 'keyword'
'user': {
'properties': {
'first': {
'type': 'keyword'
},
'last': {
'type': 'keyword'
},
'address': {
'type': 'keyword'
}
}
}
}
}
}
}
}
TEST_NESTED_USER_GROUP_DOCS = [
{'_index':TEST_NESTED_USER_GROUP_INDEX_NAME,
'_source':
{'group':'amsterdam','user':[
{'first':'Manke','last':'Nelis','address':['Elandsgracht', 'Amsterdam']},
{'first':'Johnny','last':'Jordaan','address':['Elandsstraat', 'Amsterdam']}]}},
{'_index':TEST_NESTED_USER_GROUP_INDEX_NAME,
'_source':
{'group':'london','user':[
{'first':'Alice','last':'Monkton'},
{'first':'Jimmy','last':'White','address':['London']}]}},
{'_index':TEST_NESTED_USER_GROUP_INDEX_NAME,
'_source':{'group':'new york','user':[
{'first':'Bill','last':'Jones'}]}}
{'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
'_source':
{'group': 'amsterdam', 'user': [
{'first': 'Manke', 'last': 'Nelis', 'address': ['Elandsgracht', 'Amsterdam']},
{'first': 'Johnny', 'last': 'Jordaan', 'address': ['Elandsstraat', 'Amsterdam']}]}},
{'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
'_source':
{'group': 'london', 'user': [
{'first': 'Alice', 'last': 'Monkton'},
{'first': 'Jimmy', 'last': 'White', 'address': ['London']}]}},
{'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
'_source': {'group': 'new york', 'user': [
{'first': 'Bill', 'last': 'Jones'}]}}
]

View File

@ -1,24 +1,22 @@
import pytest
import os
import eland as ed
import pandas as pd
from pandas.util.testing import (assert_frame_equal, assert_series_equal)
import os
import eland as ed
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
# Create pandas and eland data frames
from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_DF_FILE_NAME, FLIGHTS_INDEX_NAME,\
FLIGHTS_SMALL_INDEX_NAME,\
from eland.tests import FLIGHTS_DF_FILE_NAME, FLIGHTS_INDEX_NAME, \
FLIGHTS_SMALL_INDEX_NAME, \
ECOMMERCE_DF_FILE_NAME, ECOMMERCE_INDEX_NAME
_pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index()
_pd_flights['timestamp'] = \
pd.to_datetime(_pd_flights['timestamp'])
_pd_flights.index = _pd_flights.index.map(str) # make index 'object' not int
_pd_flights.index = _pd_flights.index.map(str) # make index 'object' not int
_ed_flights = ed.read_es(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME)
_pd_flights_small = _pd_flights.head(48)
@ -30,10 +28,11 @@ _pd_ecommerce['order_date'] = \
_pd_ecommerce['products.created_on'] = \
_pd_ecommerce['products.created_on'].apply(lambda x: pd.to_datetime(x))
_pd_ecommerce.insert(2, 'customer_birth_date', None)
_pd_ecommerce.index = _pd_ecommerce.index.map(str) # make index 'object' not int
_pd_ecommerce.index = _pd_ecommerce.index.map(str) # make index 'object' not int
_pd_ecommerce['customer_birth_date'].astype('datetime64')
_ed_ecommerce = ed.read_es(ELASTICSEARCH_HOST, ECOMMERCE_INDEX_NAME)
class TestData:
def pd_flights(self):
@ -48,25 +47,26 @@ class TestData:
def ed_flights_small(self):
return _ed_flights_small
def pd_ecommerce(self):
return _pd_ecommerce
def ed_ecommerce(self):
return _ed_ecommerce
def assert_pandas_eland_frame_equal(left, right):
if not isinstance(left, pd.DataFrame):
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
exp_type='pd.DataFrame', act_type=type(left)))
exp_type='pd.DataFrame', act_type=type(left)))
if not isinstance(right, ed.DataFrame):
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
exp_type='ed.DataFrame', act_type=type(right)))
exp_type='ed.DataFrame', act_type=type(right)))
# Use pandas tests to check similarity
assert_frame_equal(left, right._to_pandas())
def assert_eland_frame_equal(left, right):
if not isinstance(left, ed.DataFrame):
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
@ -83,12 +83,11 @@ def assert_eland_frame_equal(left, right):
def assert_pandas_eland_series_equal(left, right):
if not isinstance(left, pd.Series):
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
exp_type='pd.Series', act_type=type(left)))
exp_type='pd.Series', act_type=type(left)))
if not isinstance(right, ed.Series):
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
exp_type='ed.Series', act_type=type(right)))
exp_type='ed.Series', act_type=type(right)))
# Use pandas tests to check similarity
assert_series_equal(left, right._to_pandas())

View File

@ -1,15 +1,14 @@
# File called _pytest for PyCharm compatability
import numpy as np
import pandas as pd
from pandas.util.testing import (assert_almost_equal)
from pandas.util.testing import assert_almost_equal
from eland.tests.common import TestData
class TestDataFrameAggs(TestData):
def test_to_aggs1(self):
def test_basic_aggs(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()

View File

@ -1,19 +1,17 @@
# File called _pytest for PyCharm compatability
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
class TestDataFrameCount(TestData):
def test_to_count1(self):
def test_ecommerce_count(self):
pd_ecommerce = self.pd_ecommerce()
ed_ecommerce = self.ed_ecommerce()
pd_count = pd_ecommerce.count()
ed_count = ed_ecommerce.count()
print(pd_count)
print(ed_count)
assert_series_equal(pd_count, ed_count)

View File

@ -6,6 +6,7 @@ import pandas as pd
import eland as ed
from eland.tests.common import ELASTICSEARCH_HOST
from eland.tests.common import TestData
from eland.tests.common import assert_pandas_eland_frame_equal
class TestDataFrameDateTime(TestData):
@ -41,4 +42,4 @@ class TestDataFrameDateTime(TestData):
ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name)
ed_df_head = ed_df.head()
# assert_frame_equal(df, ed_df_head)
assert_pandas_eland_frame_equal(df, ed_df_head)

View File

@ -1,35 +1,34 @@
# File called _pytest for PyCharm compatability
from io import StringIO
from pandas.util.testing import assert_almost_equal
from eland.tests.common import TestData
class TestDataFrameDescribe(TestData):
def test_to_describe1(self):
def test_flights_describe(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
pd_describe = pd_flights.describe()
ed_describe = ed_flights.describe()
print(pd_describe)
print(ed_describe)
assert_almost_equal(pd_describe[['AvgTicketPrice']],
ed_describe[['AvgTicketPrice']],
check_less_precise=True)
# TODO - this fails now as ES aggregations are approximate
# TODO - this fails for all fields now as ES aggregations are approximate
# if ES percentile agg uses
# "hdr": {
# "number_of_significant_value_digits": 3
# }
# this works
# assert_almost_equal(pd_flights_describe, ed_flights_describe)
pd_ecommerce_describe = self.pd_ecommerce().describe()
ed_ecommerce_describe = self.ed_ecommerce().describe()
# pd_ecommerce_describe = self.pd_ecommerce().describe()
# ed_ecommerce_describe = self.ed_ecommerce().describe()
# We don't compare ecommerce here as the default dtypes in pandas from read_json
# don't match the mapping types. This is mainly because the products field is
# nested and so can be treated as a multi-field in ES, but not in pandas
# We can not also run 'describe' on a truncate ed dataframe

View File

@ -1,19 +1,14 @@
# File called _pytest for PyCharm compatability
import pandas as pd
import eland as ed
from eland.tests.common import TestData
from eland.tests.common import (
assert_eland_frame_equal,
assert_pandas_eland_frame_equal,
assert_pandas_eland_series_equal
assert_pandas_eland_frame_equal
)
import numpy as np
class TestDataFrameDrop(TestData):
def test_drop1(self):
def test_flights_small_drop(self):
ed_flights_small = self.ed_flights_small()
pd_flights_small = self.pd_flights_small()

View File

@ -0,0 +1,14 @@
# File called _pytest for PyCharm compatability
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
class TestDataFrameDtypes(TestData):
def test_flights_dtypes(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
assert_series_equal(pd_flights.dtypes, ed_flights.dtypes)

View File

@ -1,18 +1,11 @@
# File called _pytest for PyCharm compatability
import pandas as pd
import eland as ed
from eland.tests.common import TestData
from eland.tests.common import (
assert_pandas_eland_frame_equal,
assert_pandas_eland_series_equal
)
import numpy as np
class TestDataFrameGet(TestData):
def test_get1(self):
def test_get_one_attribute(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()

View File

@ -1,5 +1,4 @@
# File called _pytest for PyCharm compatability
import pandas as pd
from eland.tests.common import TestData
from eland.tests.common import (
@ -8,10 +7,9 @@ from eland.tests.common import (
)
class TestDataFrameGetItem(TestData):
def test_getitem1(self):
def test_getitem_one_attribute(self):
ed_flights = self.ed_flights().head(103)
pd_flights = self.pd_flights().head(103)
@ -20,7 +18,7 @@ class TestDataFrameGetItem(TestData):
assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID)
def test_getitem2(self):
def test_getitem_attribute_list(self):
ed_flights = self.ed_flights().head(42)
pd_flights = self.pd_flights().head(42)
@ -29,7 +27,7 @@ class TestDataFrameGetItem(TestData):
assert_pandas_eland_frame_equal(pd_flights_slice, ed_flights_slice)
def test_getitem3(self):
def test_getitem_one_argument(self):
ed_flights = self.ed_flights().head(89)
pd_flights = self.pd_flights().head(89)
@ -38,7 +36,7 @@ class TestDataFrameGetItem(TestData):
assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID)
def test_getitem4(self):
def test_getitem_multiple_calls(self):
ed_flights = self.ed_flights().head(89)
pd_flights = self.pd_flights().head(89)
@ -52,4 +50,3 @@ class TestDataFrameGetItem(TestData):
ed_col1 = ed_col0['DestCountry']
assert_pandas_eland_series_equal(pd_col1, ed_col1)

View File

@ -1,11 +1,9 @@
# File called _pytest for PyCharm compatability
import pandas as pd
from eland.tests.common import TestData
from eland.tests.common import assert_pandas_eland_frame_equal
class TestDataFrameHeadTail(TestData):
def test_head(self):

View File

@ -1,6 +1,5 @@
# File called _pytest for PyCharm compatability
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.util.testing import assert_almost_equal
@ -10,7 +9,7 @@ from eland.tests.common import TestData
class TestDataFrameHist(TestData):
def test_hist1(self):
def test_flights_hist(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
@ -30,15 +29,3 @@ class TestDataFrameHist(TestData):
# Numbers are slightly different
assert_almost_equal(pd_bins, ed_bins)
assert_almost_equal(pd_weights, ed_weights)
def test_hist2(self):
pd_df = self.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
ed_df = self.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
num_bins = 10
ed_bins, ed_weights = ed_df._hist(num_bins=num_bins)
print(ed_bins)

View File

@ -1,54 +0,0 @@
# File called _pytest for PyCharm compatability
import pandas as pd
import eland as ed
from eland.tests.common import TestData
from eland.tests.common import (
assert_pandas_eland_frame_equal,
assert_pandas_eland_series_equal
)
import numpy as np
class TestDataFrameiLoc(TestData):
def test_iloc1(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html#pandas.DataFrame.iloc
#pd_flights.info()
pd_iloc0 = pd_flights.iloc[0]
pd_iloc1= pd_flights.iloc[[0]]
pd_iloc2= pd_flights.iloc[[0, 1]]
pd_iloc3 = pd_flights.iloc[:3]
pd_iloc4 = pd_flights.iloc[[True, False, True]]
pd_iloc5 = pd_flights.iloc[0, 1]
pd_iloc6 = pd_flights.iloc[[0, 2], [1, 3]]
pd_iloc7 = pd_flights.iloc[1:3, 0:3]
pd_iloc8 = pd_flights.iloc[:, [True, False, True, False]]
pd_iloc9 = pd_flights.iloc[[True, False, True, False]]
ed_iloc0 = ed_flights.iloc[0]
ed_iloc1 = ed_flights.iloc[[0]]
ed_iloc2 = ed_flights.iloc[[0, 1]]
ed_iloc3 = ed_flights.iloc[:3]
ed_iloc4 = ed_flights.iloc[[True, False, True]]
ed_iloc5 = ed_flights.iloc[0, 1]
ed_iloc6 = ed_flights.iloc[[0, 2], [1, 3]]
ed_iloc7 = ed_flights.iloc[1:3, 0:3]
ed_iloc8 = ed_flights.iloc[:, [True, False, True, False]]
ed_iloc9 = ed_flights.iloc[[True, False, True, False]]
#assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series
assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1)
assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2)
assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3)
assert_pandas_eland_frame_equal(pd_iloc4, ed_iloc4)
#assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool
assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6)
assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7)
assert_pandas_eland_frame_equal(pd_iloc8, ed_iloc8)
assert_pandas_eland_frame_equal(pd_iloc9, ed_iloc9)

View File

@ -1,15 +0,0 @@
# File called _pytest for PyCharm compatability
from eland.tests.common import TestData
class TestDataFrameInfoEs(TestData):
def test_to_info1(self):
ed_flights = self.ed_flights()
head = ed_flights.head(103)
slice = head[['timestamp', 'OriginRegion', 'Carrier']]
iloc = slice.iloc[10:92, [0,2]]
print(iloc.info_es())
print(iloc)

View File

@ -6,7 +6,7 @@ from eland.tests.common import TestData
class TestDataFrameInfo(TestData):
def test_to_info1(self):
def test_flights_info(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()

View File

@ -1,10 +1,9 @@
# File called _pytest for PyCharm compatability
from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
class TestDataFrameMetrics(TestData):
@ -43,4 +42,3 @@ class TestDataFrameMetrics(TestData):
ed_max = ed_flights.max(numeric_only=True)
assert_series_equal(pd_max, ed_max)

View File

@ -1,22 +0,0 @@
# File called _pytest for PyCharm compatability
import pandas as pd
import eland as ed
from eland.tests.common import TestData
from eland.tests.common import (
assert_pandas_eland_frame_equal,
assert_pandas_eland_series_equal
)
import numpy as np
class TestDataFrameNUnique(TestData):
def test_nunique1(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
print(pd_flights.dtypes)
print(ed_flights.dtypes)
print(ed_flights.nunique())

View File

@ -10,7 +10,7 @@ from eland.tests.common import assert_pandas_eland_frame_equal
class TestDataFrameQuery(TestData):
def test_query1(self):
def test_query(self):
# Examples from:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)},
@ -43,4 +43,3 @@ class TestDataFrameQuery(TestData):
ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)]
assert_pandas_eland_frame_equal(pd_q4, ed_q4)

View File

@ -3,9 +3,9 @@
from eland.tests.common import TestData
class TestDataFrameHeadTail(TestData):
class TestDataFrameRepr(TestData):
def test_to_string1(self):
def test_head_101_to_string(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
@ -18,7 +18,7 @@ class TestDataFrameHeadTail(TestData):
assert pd_head_101_str == ed_head_101_str
def test_to_string2(self):
def test_head_11_to_string2(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
@ -30,7 +30,7 @@ class TestDataFrameHeadTail(TestData):
assert pd_head_11_str == ed_head_11_str
def test_to_repr(self):
def test_repr(self):
ed_ecommerce = self.ed_ecommerce()
pd_ecommerce = self.pd_ecommerce()

View File

@ -1,31 +0,0 @@
# File called _pytest for PyCharm compatability
import gzip
import pandas as pd
import eland as ed
from eland.tests.common import TestData
class TestDataFrameReviews(TestData):
def test_explore(self):
ed_reviews = ed.DataFrame('localhost', 'anonreviews')
print(ed_reviews.head())
print(ed_reviews.describe())
print(ed_reviews.info())
print(ed_reviews.hist(column="rating", bins=5))
# print(ed_reviews.head().info_es())
def test_review(self):
csv_handle = gzip.open('../anonreviews.csv.gz')
reviews = pd.read_csv(csv_handle)
reviews['date'] = pd.to_datetime(reviews['date'])
g = reviews.groupby('reviewerId')
print(g.describe())

View File

@ -1,5 +1,4 @@
# File called _pytest for PyCharm compatability
import pandas as pd
import numpy as np
from eland.tests.common import TestData
@ -8,10 +7,9 @@ from eland.tests.common import (
)
class TestDataFrameSelectDTypes(TestData):
def test_select_dtypes1(self):
def test_select_dtypes_include_number(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
@ -20,7 +18,7 @@ class TestDataFrameSelectDTypes(TestData):
assert_pandas_eland_frame_equal(pd_flights_numeric.head(103), ed_flights_numeric.head(103))
def test_select_dtypes2(self):
def test_select_dtypes_exclude_number(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
@ -28,4 +26,3 @@ class TestDataFrameSelectDTypes(TestData):
pd_flights_non_numeric = pd_flights.select_dtypes(exclude=[np.number])
assert_pandas_eland_frame_equal(pd_flights_non_numeric.head(103), ed_flights_non_numeric.head(103))

View File

@ -22,5 +22,3 @@ class TestDataFrameShape(TestData):
ed_shape = ed_flights.shape
assert pd_shape == ed_shape

View File

@ -1,22 +1,24 @@
# File called _pytest for PyCharm compatability
import pandas as pd
import ast
import pandas as pd
from pandas.util.testing import (assert_frame_equal)
from eland.tests.common import ROOT_DIR
from eland.tests.common import TestData
from pandas.util.testing import (assert_equal, assert_frame_equal)
import ast
class TestDataFrameToCSV(TestData):
def test_to_csv_head(self):
results_file = ROOT_DIR + '/dataframe/results/test_to_csv_head.csv'
ed_flights = self.ed_flights().head()
pd_flights = self.pd_flights().head()
ed_flights.to_csv('results/test_to_csv_head.csv')
ed_flights.to_csv(results_file)
# Converting back from csv is messy as pd_flights is created from a json file
pd_from_csv = pd.read_csv('results/test_to_csv_head.csv', index_col=0, converters={
pd_from_csv = pd.read_csv(results_file, index_col=0, converters={
'DestLocation': lambda x: ast.literal_eval(x),
'OriginLocation': lambda x: ast.literal_eval(x)})
pd_from_csv.index = pd_from_csv.index.map(str)
@ -25,19 +27,18 @@ class TestDataFrameToCSV(TestData):
assert_frame_equal(pd_flights, pd_from_csv)
def test_to_csv_full(self):
results_file = ROOT_DIR + '/dataframe/results/test_to_csv_full.csv'
# Test is slow as it's for the full dataset, but it is useful as it goes over 10000 docs
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
ed_flights.to_csv('results/test_to_csv_full.csv')
ed_flights.to_csv(results_file)
# Converting back from csv is messy as pd_flights is created from a json file
pd_from_csv = pd.read_csv('results/test_to_csv_full.csv', index_col=0, converters={
pd_from_csv = pd.read_csv(results_file, index_col=0, converters={
'DestLocation': lambda x: ast.literal_eval(x),
'OriginLocation': lambda x: ast.literal_eval(x)})
pd_from_csv.index = pd_from_csv.index.map(str)
pd_from_csv.timestamp = pd.to_datetime(pd_from_csv.timestamp)
assert_frame_equal(pd_flights, pd_from_csv)

View File

@ -7144,7 +7144,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
"version": "3.6.8"
}
},
"nbformat": 4,

View File

@ -1,12 +1,13 @@
# File called _pytest for PyCharm compatability
from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
class TestMappingsDtypes(TestData):
def test_dtypes1(self):
def test_flights_dtypes_all(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
@ -15,7 +16,7 @@ class TestMappingsDtypes(TestData):
assert_series_equal(pd_dtypes, ed_dtypes)
def test_dtypes2(self):
def test_flights_dtypes_columns(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]
@ -24,7 +25,7 @@ class TestMappingsDtypes(TestData):
assert_series_equal(pd_dtypes, ed_dtypes)
def test_get_dtype_counts1(self):
def test_flights_get_dtype_counts_all(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
@ -33,12 +34,12 @@ class TestMappingsDtypes(TestData):
assert_series_equal(pd_dtypes, ed_dtypes)
def test_get_dtype_counts2(self):
def test_flights_get_dtype_counts_columns(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]
pd_dtypes = pd_flights.get_dtype_counts().sort_index()
ed_dtypes = ed_flights._query_compiler._mappings.\
ed_dtypes = ed_flights._query_compiler._mappings. \
get_dtype_counts(columns=['Carrier', 'AvgTicketPrice', 'Cancelled']).sort_index()
assert_series_equal(pd_dtypes, ed_dtypes)

View File

@ -1,5 +1,5 @@
# -*- coding: UTF-8 -*-
from eland.operators import *
from eland.filter import *
class TestOperators():
@ -21,11 +21,6 @@ class TestOperators():
'script': {'script': {'inline': 'doc["num1"].value > params.param1', 'params': {'param1': 5}}}}
assert IsIn('ids', [1, 2, 3]).build() == {'ids': {'values': [1, 2, 3]}}
def test_and_none(self):
exp = None
exp = exp & Less('b', 3)
print(exp.build())
def test_and_filter1(self):
exp = GreaterEqual('a', 2) & Less('b', 3)
assert exp.build() == {'bool': {'must': [{'range': {'a': {'gte': 2}}}, {'range': {'b': {'lt': 3}}}]}}
@ -33,145 +28,145 @@ class TestOperators():
def test_and_filter2(self):
exp = GreaterEqual('a', 2) & Less('b', 3) & Equal('c', 4)
assert exp.build() == \
{
'bool': {
'must': [
{'range': {'a': {'gte': 2}}},
{'range': {'b': {'lt': 3}}},
{'term': {'c': 4}}
]
}
}
{
'bool': {
'must': [
{'range': {'a': {'gte': 2}}},
{'range': {'b': {'lt': 3}}},
{'term': {'c': 4}}
]
}
}
def test_and_filter3(self):
exp = GreaterEqual('a', 2) & (Less('b', 3) & Equal('c', 4))
assert exp.build() == \
{
'bool': {
'must': [
{'range': {'b': {'lt': 3}}},
{'term': {'c': 4}},
{'range': {'a': {'gte': 2}}}
]
}
}
{
'bool': {
'must': [
{'range': {'b': {'lt': 3}}},
{'term': {'c': 4}},
{'range': {'a': {'gte': 2}}}
]
}
}
def test_or_filter1(self):
exp = GreaterEqual('a', 2) | Less('b', 3)
assert exp.build() == \
{
'bool': {
'should': [
{'range': {'a': {'gte': 2}}},
{'range': {'b': {'lt': 3}}}
]
}
}
{
'bool': {
'should': [
{'range': {'a': {'gte': 2}}},
{'range': {'b': {'lt': 3}}}
]
}
}
def test_or_filter2(self):
exp = GreaterEqual('a', 2) | Less('b', 3) | Equal('c', 4)
assert exp.build() == \
{
'bool': {
'should': [
{'range': {'a': {'gte': 2}}},
{'range': {'b': {'lt': 3}}},
{'term': {'c': 4}}
]
}
}
{
'bool': {
'should': [
{'range': {'a': {'gte': 2}}},
{'range': {'b': {'lt': 3}}},
{'term': {'c': 4}}
]
}
}
def test_or_filter3(self):
exp = GreaterEqual('a', 2) | (Less('b', 3) | Equal('c', 4))
assert exp.build() == \
{
'bool': {
'should': [
{'range': {'b': {'lt': 3}}},
{'term': {'c': 4}},
{'range': {'a': {'gte': 2}}}
]
}
}
{
'bool': {
'should': [
{'range': {'b': {'lt': 3}}},
{'term': {'c': 4}},
{'range': {'a': {'gte': 2}}}
]
}
}
def test_not_filter(self):
exp = ~GreaterEqual('a', 2)
assert exp.build() == \
{
'bool': {
'must_not': {'range': {'a': {'gte': 2}}}
}
}
{
'bool': {
'must_not': {'range': {'a': {'gte': 2}}}
}
}
def test_not_not_filter(self):
exp = ~~GreaterEqual('a', 2)
assert exp.build() == \
{
'bool': {
'must_not': {
'bool': {
'must_not': {'range': {'a': {'gte': 2}}}
}
}
}
}
{
'bool': {
'must_not': {
'bool': {
'must_not': {'range': {'a': {'gte': 2}}}
}
}
}
}
def test_not_and_filter(self):
exp = ~(GreaterEqual('a', 2) & Less('b', 3))
assert exp.build() == \
{
'bool': {
'must_not': {
'bool': {
'must': [
{'range': {'a': {'gte': 2}}},
{'range': {'b': {'lt': 3}}}
]
}
}
}
}
{
'bool': {
'must_not': {
'bool': {
'must': [
{'range': {'a': {'gte': 2}}},
{'range': {'b': {'lt': 3}}}
]
}
}
}
}
def test_and_or_filter(self):
exp = GreaterEqual('a', 2) & (Less('b', 3) | Equal('c', 4))
assert exp.build() == \
{
'bool': {
'must': [
{'range': {'a': {'gte': 2}}},
{
'bool': {
'should': [
{'range': {'b': {'lt': 3}}},
{'term': {'c': 4}}
]
}
}
]
}
}
{
'bool': {
'must': [
{'range': {'a': {'gte': 2}}},
{
'bool': {
'should': [
{'range': {'b': {'lt': 3}}},
{'term': {'c': 4}}
]
}
}
]
}
}
def test_and_not_or_filter(self):
exp = GreaterEqual('a', 2) & ~(Less('b', 3) | Equal('c', 4))
assert exp.build() == \
{
'bool': {
'must': [
{'range': {'a': {'gte': 2}}},
{
'bool': {
'must_not': {
'bool': {
'should': [
{'range': {'b': {'lt': 3}}},
{'term': {'c': 4}}
]
}
{
'bool': {
'must': [
{'range': {'a': {'gte': 2}}},
{
'bool': {
'must_not': {
'bool': {
'should': [
{'range': {'b': {'lt': 3}}},
{'term': {'c': 4}}
]
}
}
}
}
]
}
}
}
}
}
]
}
}

File diff suppressed because one or more lines are too long

View File

@ -1,8 +1,9 @@
# File called _pytest for PyCharm compatability
from matplotlib.testing.decorators import check_figures_equal
from eland.tests.common import TestData
from matplotlib.testing.decorators import check_figures_equal
@check_figures_equal(extensions=['png'])
def test_plot_hist(fig_test, fig_ref):

View File

@ -1,8 +1,7 @@
# File called _pytest for PyCharm compatability
from eland.tests.common import TestData
from eland import Query
from eland.tests.common import TestData
class TestQueryCopy(TestData):
@ -22,6 +21,3 @@ class TestQueryCopy(TestData):
print(q.to_search_body())
print(q1.to_search_body())

View File

@ -1,15 +1,9 @@
# File called _pytest for PyCharm compatability
import pandas as pd
import eland as ed
from eland.tests.common import TestData
from eland.tests.common import assert_pandas_eland_series_equal
from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
from eland.tests.common import assert_pandas_eland_series_equal
class TestSeriesHeadTail(TestData):

View File

@ -1,15 +1,8 @@
# File called _pytest for PyCharm compatability
import pandas as pd
import eland as ed
from eland.tests.common import TestData
from eland.tests.common import assert_pandas_eland_frame_equal
from eland.tests import ELASTICSEARCH_HOST
from eland.tests import FLIGHTS_INDEX_NAME
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
class TestSeriesRepr(TestData):

View File

@ -1,4 +1,3 @@
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch import helpers
@ -10,6 +9,7 @@ DATA_LIST = [
(ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING)
]
def _setup_data(es):
# Read json file and index records into Elasticsearch
for data in DATA_LIST:
@ -32,7 +32,7 @@ def _setup_data(es):
for index, row in df.iterrows():
values = row.to_dict()
# make timestamp datetime 2018-01-01T12:09:35
#values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S')
# values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S')
# Use integer as id field for repeatable results
action = {'_index': index_name, '_source': values, '_id': str(n)}
@ -50,17 +50,20 @@ def _setup_data(es):
print("Done", index_name)
def _setup_test_mappings(es):
# Create a complex mapping containing many Elasticsearch features
es.indices.delete(index=TEST_MAPPING1_INDEX_NAME, ignore=[400, 404])
es.indices.create(index=TEST_MAPPING1_INDEX_NAME, body=TEST_MAPPING1)
def _setup_test_nested(es):
es.indices.delete(index=TEST_NESTED_USER_GROUP_INDEX_NAME, ignore=[400, 404])
es.indices.create(index=TEST_NESTED_USER_GROUP_INDEX_NAME, body=TEST_NESTED_USER_GROUP_MAPPING)
helpers.bulk(es, TEST_NESTED_USER_GROUP_DOCS)
if __name__ == '__main__':
# Create connection to Elasticsearch - use defaults
es = Elasticsearch(ELASTICSEARCH_HOST)

View File

@ -7,7 +7,8 @@ def read_es(es_params, index_pattern):
return DataFrame(client=es_params, index_pattern=index_pattern)
def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False):
def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
geo_points=None):
"""
Append a pandas DataFrame to an Elasticsearch index.
Mainly used in testing.
@ -30,10 +31,19 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
If table exists, drop it, recreate it, and insert data.
``'append'``
If table exists, insert data. Create if does not exist.
dropna : bool
``'True'``
Remove missing values (see pandas.Series.dropna)
``'False;``
Include missing values - may cause bulk to fail
geo_points : list or None
List of columns to map to geo_point data type
"""
client = Client(es_params)
mapping = Mappings._generate_es_mappings(df)
mapping = Mappings._generate_es_mappings(df, geo_points)
# If table exists, check if_exists parameter
if client.index_exists(index=destination_index):
@ -58,7 +68,11 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
for row in df.iterrows():
# Use index as _id
id = row[0]
values = row[1].to_dict()
if dropna:
values = row[1].dropna().to_dict()
else:
values = row[1].to_dict()
# Use integer as id field for repeatable results
action = {'_index': destination_index, '_source': values, '_id': str(id)}

3
requirements-dev.txt Normal file
View File

@ -0,0 +1,3 @@
elasticsearch>=7.0.5
pandas==0.25.1
pytest>=5.2.1

View File

@ -1,8 +1,3 @@
elasticsearch==7.0.2
elasticsearch-dsl==7.0.0
numpy==1.16.4
pandas==0.24.2
python-dateutil==2.8.0
pytz==2019.1
six==1.12.0
urllib3==1.25.3
elasticsearch>=7.0.5
pandas==0.25.1
matplotlib

View File

@ -1,9 +1,11 @@
from setuptools import setup
def readme():
with open('README.rst') as f:
return f.read()
setup(name='eland',
version='0.1',
description='Python elasticsearch client to analyse, explore and manipulate data that resides in elasticsearch',
@ -13,8 +15,7 @@ setup(name='eland',
license='ELASTIC LICENSE',
packages=['eland'],
install_requires=[
'elasticsearch',
'elasticsearch_dsl',
'pandas'
'elasticsearch>=7.0.5',
'pandas==0.25.1'
],
zip_safe=False)