mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Merge pull request #30 from stevedodson/master
Major cleanup - removed modin
This commit is contained in:
commit
90ea637707
@ -1,19 +1,14 @@
|
||||
from __future__ import absolute_import
|
||||
import os
|
||||
|
||||
# Set modin to pandas to avoid starting ray or other
|
||||
os.environ["MODIN_ENGINE"] = 'python'
|
||||
os.environ["MODIN_BACKEND"] = 'pandas'
|
||||
|
||||
from eland.client import *
|
||||
from eland.dataframe import *
|
||||
from eland.filter import *
|
||||
from eland.index import *
|
||||
from eland.mappings import *
|
||||
from eland.filter import *
|
||||
from eland.query import *
|
||||
from eland.operations import *
|
||||
from eland.query_compiler import *
|
||||
from eland.plotting import *
|
||||
from eland.ndframe import *
|
||||
from eland.operations import *
|
||||
from eland.plotting import *
|
||||
from eland.query import *
|
||||
from eland.query_compiler import *
|
||||
from eland.series import *
|
||||
from eland.dataframe import *
|
||||
from eland.utils import *
|
||||
|
@ -1,10 +1,12 @@
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch import helpers
|
||||
|
||||
|
||||
class Client:
|
||||
"""
|
||||
eland client - implemented as facade to control access to Elasticsearch methods
|
||||
"""
|
||||
|
||||
def __init__(self, es=None):
|
||||
if isinstance(es, Elasticsearch):
|
||||
self._es = es
|
||||
@ -40,4 +42,3 @@ class Client:
|
||||
def count(self, **kwargs):
|
||||
count_json = self._es.count(**kwargs)
|
||||
return count_json['count']
|
||||
|
||||
|
@ -1,16 +1,15 @@
|
||||
import sys
|
||||
import warnings
|
||||
from distutils.version import LooseVersion
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pandas.compat as compat
|
||||
import six
|
||||
from io import StringIO
|
||||
from pandas.core.common import apply_if_callable, is_bool_indexer
|
||||
from pandas.core.dtypes.common import (
|
||||
is_list_like
|
||||
)
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
from pandas.core.indexing import check_bool_indexer
|
||||
|
||||
from pandas.io.common import _expand_user, _stringify_path
|
||||
from pandas.io.formats import console
|
||||
from pandas.io.formats import format as fmt
|
||||
@ -58,10 +57,10 @@ class DataFrame(NDFrame):
|
||||
return len(self.columns) == 0 or len(self.index) == 0
|
||||
|
||||
def head(self, n=5):
|
||||
return super().head(n)
|
||||
return DataFrame(query_compiler=self._query_compiler.head(n))
|
||||
|
||||
def tail(self, n=5):
|
||||
return super().tail(n)
|
||||
return DataFrame(query_compiler=self._query_compiler.tail(n))
|
||||
|
||||
def __repr__(self):
|
||||
"""
|
||||
@ -104,7 +103,7 @@ class DataFrame(NDFrame):
|
||||
return None
|
||||
|
||||
if self._info_repr():
|
||||
buf = StringIO(u(""))
|
||||
buf = StringIO()
|
||||
self.info(buf=buf)
|
||||
# need to escape the <class>, should be the first line.
|
||||
val = buf.getvalue().replace('<', r'<', 1)
|
||||
@ -463,7 +462,6 @@ class DataFrame(NDFrame):
|
||||
"quotechar": quotechar,
|
||||
"line_terminator": line_terminator,
|
||||
"chunksize": chunksize,
|
||||
"tupleize_cols": tupleize_cols,
|
||||
"date_format": date_format,
|
||||
"doublequote": doublequote,
|
||||
"escapechar": escapechar,
|
||||
@ -510,7 +508,7 @@ class DataFrame(NDFrame):
|
||||
return self.columns
|
||||
|
||||
def groupby(self, by=None, axis=0, *args, **kwargs):
|
||||
axis = self._get_axis_number(axis)
|
||||
axis = pd.DataFrame._get_axis_number(axis)
|
||||
|
||||
if axis == 1:
|
||||
raise NotImplementedError("Aggregating via index not currently implemented - needs index transform")
|
||||
@ -545,14 +543,14 @@ class DataFrame(NDFrame):
|
||||
if Series.agg is called with single function, returns a scalar
|
||||
if Series.agg is called with several functions, returns a Series
|
||||
"""
|
||||
axis = self._get_axis_number(axis)
|
||||
axis = pd.DataFrame._get_axis_number(axis)
|
||||
|
||||
if axis == 1:
|
||||
raise NotImplementedError("Aggregating via index not currently implemented - needs index transform")
|
||||
|
||||
# currently we only support a subset of functions that aggregate columns.
|
||||
# ['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique']
|
||||
if isinstance(func, compat.string_types):
|
||||
if isinstance(func, str):
|
||||
# wrap in list
|
||||
func = [func]
|
||||
return self._query_compiler.aggs(func)
|
||||
@ -580,3 +578,20 @@ class DataFrame(NDFrame):
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError(expr, type(expr))
|
||||
|
||||
def get(self, key, default=None):
|
||||
"""Get item from object for given key (DataFrame column, Panel
|
||||
slice, etc.). Returns default value if not found.
|
||||
|
||||
Args:
|
||||
key (DataFrame column, Panel slice) : the key for which value
|
||||
to get
|
||||
|
||||
Returns:
|
||||
value (type of items contained in object) : A value that is
|
||||
stored at the key
|
||||
"""
|
||||
if key in self.keys():
|
||||
return self._getitem(key)
|
||||
else:
|
||||
return default
|
||||
|
@ -1,7 +1,7 @@
|
||||
# Derived from pandasticsearch filters
|
||||
|
||||
# Es filter builder for BooleanCond
|
||||
class BooleanFilter(object):
|
||||
class BooleanFilter:
|
||||
def __init__(self, *args):
|
||||
self._filter = None
|
||||
|
||||
|
@ -14,9 +14,11 @@ In case sorting or aggregating on the _id field is required, it is advised to du
|
||||
the content of the _id field in another field that has doc_values enabled.)
|
||||
|
||||
"""
|
||||
|
||||
|
||||
class Index:
|
||||
ID_INDEX_FIELD = '_id'
|
||||
ID_SORT_FIELD = '_doc' # if index field is _id, sort by _doc
|
||||
ID_SORT_FIELD = '_doc' # if index field is _id, sort by _doc
|
||||
|
||||
def __init__(self, query_compiler, index_field=None):
|
||||
# Calls setter
|
||||
|
@ -75,6 +75,7 @@ class Mappings:
|
||||
pd_dtype = self._mappings_capabilities.loc[field_name]['pd_dtype']
|
||||
self._source_field_pd_dtypes[field_name] = pd_dtype
|
||||
|
||||
@staticmethod
|
||||
def _extract_fields_from_mapping(mappings, source_only=False):
|
||||
"""
|
||||
Extract all field names and types from a mapping.
|
||||
@ -151,6 +152,7 @@ class Mappings:
|
||||
|
||||
return fields
|
||||
|
||||
@staticmethod
|
||||
def _create_capability_matrix(all_fields, source_fields, all_fields_caps):
|
||||
"""
|
||||
{
|
||||
@ -290,7 +292,7 @@ class Mappings:
|
||||
return es_dtype
|
||||
|
||||
@staticmethod
|
||||
def _generate_es_mappings(dataframe):
|
||||
def _generate_es_mappings(dataframe, geo_points=None):
|
||||
"""Given a pandas dataframe, generate the associated Elasticsearch mapping
|
||||
|
||||
Parameters
|
||||
@ -325,7 +327,10 @@ class Mappings:
|
||||
mappings = {}
|
||||
mappings['properties'] = {}
|
||||
for column_name, dtype in dataframe.dtypes.iteritems():
|
||||
es_dtype = Mappings._pd_dtype_to_es_dtype(dtype)
|
||||
if geo_points is not None and column_name in geo_points:
|
||||
es_dtype = 'geo_point'
|
||||
else:
|
||||
es_dtype = Mappings._pd_dtype_to_es_dtype(dtype)
|
||||
|
||||
mappings['properties'][column_name] = {}
|
||||
mappings['properties'][column_name]['type'] = es_dtype
|
||||
@ -411,15 +416,27 @@ class Mappings:
|
||||
List of source fields where pd_dtype == (int64 or float64 or bool)
|
||||
"""
|
||||
if columns is not None:
|
||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'bool'))].loc[columns].index.tolist()
|
||||
if include_bool == True:
|
||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'bool'))].loc[
|
||||
columns].index.tolist()
|
||||
else:
|
||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64'))].loc[
|
||||
columns].index.tolist()
|
||||
else:
|
||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist()
|
||||
if include_bool == True:
|
||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist()
|
||||
else:
|
||||
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
|
||||
((self._mappings_capabilities.pd_dtype == 'int64') |
|
||||
(self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
|
||||
|
||||
def source_fields(self):
|
||||
"""
|
||||
|
@ -26,15 +26,13 @@ only Elasticsearch aggregatable fields can be aggregated or grouped.
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
from modin.pandas.base import BasePandasDataset
|
||||
from modin.pandas.indexing import _iLocIndexer
|
||||
from pandas.util._validators import validate_bool_kwarg
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
from pandas.util._validators import validate_bool_kwarg
|
||||
|
||||
from eland import ElandQueryCompiler
|
||||
|
||||
|
||||
class NDFrame(BasePandasDataset):
|
||||
class NDFrame:
|
||||
|
||||
def __init__(self,
|
||||
client=None,
|
||||
@ -85,6 +83,9 @@ class NDFrame(BasePandasDataset):
|
||||
|
||||
return head.append(tail)
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self._getitem(key)
|
||||
|
||||
def __getattr__(self, key):
|
||||
"""After regular attribute access, looks up the name in the columns
|
||||
|
||||
@ -105,6 +106,14 @@ class NDFrame(BasePandasDataset):
|
||||
# Don't default to pandas, just return approximation TODO - make this more accurate
|
||||
return sys.getsizeof(self._query_compiler)
|
||||
|
||||
def __len__(self):
|
||||
"""Gets the length of the DataFrame.
|
||||
|
||||
Returns:
|
||||
Returns an integer length of the DataFrame object.
|
||||
"""
|
||||
return len(self.index)
|
||||
|
||||
@property
|
||||
def iloc(self):
|
||||
"""Purely integer-location based indexing for selection by position.
|
||||
@ -235,21 +244,3 @@ class NDFrame(BasePandasDataset):
|
||||
|
||||
def describe(self):
|
||||
return self._query_compiler.describe()
|
||||
|
||||
def get(self, key, default=None):
|
||||
"""Get item from object for given key (DataFrame column, Panel
|
||||
slice, etc.). Returns default value if not found.
|
||||
|
||||
Args:
|
||||
key (DataFrame column, Panel slice) : the key for which value
|
||||
to get
|
||||
|
||||
Returns:
|
||||
value (type of items contained in object) : A value that is
|
||||
stored at the key
|
||||
"""
|
||||
if key in self.keys():
|
||||
return self.__getitem__(key)
|
||||
else:
|
||||
return default
|
||||
|
||||
|
@ -1,9 +1,7 @@
|
||||
import copy
|
||||
from enum import Enum
|
||||
from io import StringIO
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from eland import Index
|
||||
from eland import Query
|
||||
@ -170,7 +168,7 @@ class Operations:
|
||||
results[field] = response['aggregations'][field]['value']
|
||||
|
||||
# Return single value if this is a series
|
||||
#if len(numeric_source_fields) == 1:
|
||||
# if len(numeric_source_fields) == 1:
|
||||
# return np.float64(results[numeric_source_fields[0]])
|
||||
|
||||
s = pd.Series(data=results, index=numeric_source_fields)
|
||||
@ -391,7 +389,7 @@ class Operations:
|
||||
values = list()
|
||||
for es_agg in es_aggs:
|
||||
if isinstance(es_agg, tuple):
|
||||
values.append(response['aggregations'][es_agg[0] + '_' + field][es_agg[1]])
|
||||
values.append(response['aggregations'][es_agg[0] + '_' + field][es_agg[1]])
|
||||
else:
|
||||
values.append(response['aggregations'][es_agg + '_' + field]['value'])
|
||||
|
||||
@ -410,7 +408,7 @@ class Operations:
|
||||
|
||||
columns = self.get_columns()
|
||||
|
||||
numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns)
|
||||
numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns, include_bool=False)
|
||||
|
||||
# for each field we compute:
|
||||
# count, mean, std, min, 25%, 50%, 75%, max
|
||||
@ -450,6 +448,7 @@ class Operations:
|
||||
class PandasDataFrameCollector:
|
||||
def collect(self, df):
|
||||
self.df = df
|
||||
|
||||
def batch_size(self):
|
||||
return None
|
||||
|
||||
@ -465,6 +464,7 @@ class Operations:
|
||||
self.kwargs = kwargs
|
||||
self.ret = None
|
||||
self.first_time = True
|
||||
|
||||
def collect(self, df):
|
||||
# If this is the first time we collect results, then write header, otherwise don't write header
|
||||
# and append results
|
||||
|
@ -3,13 +3,14 @@ import numpy as np
|
||||
import pandas.core.common as com
|
||||
from pandas.core.dtypes.generic import (
|
||||
ABCIndexClass)
|
||||
from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots
|
||||
|
||||
|
||||
def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
|
||||
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
|
||||
sharey=False, figsize=None, layout=None, bins=10, **kwds):
|
||||
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
|
||||
sharey=False, figsize=None, layout=None, bins=10, **kwds):
|
||||
"""
|
||||
Derived from pandas.plotting._core.hist_frame 0.24.2
|
||||
Derived from pandas.plotting._core.hist_frame 0.24.2 - TODO update to 0.25.1
|
||||
|
||||
Ideally, we'd call hist_frame directly with histogram data,
|
||||
but weights are applied to ALL series. For example, we can
|
||||
@ -29,8 +30,6 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
|
||||
# Start with empty pandas data frame derived from
|
||||
ed_df_bins, ed_df_weights = ed_df._hist(num_bins=bins)
|
||||
|
||||
_raise_if_no_mpl()
|
||||
_converter._WARN = False
|
||||
if by is not None:
|
||||
raise NotImplementedError("TODO")
|
||||
"""
|
||||
|
@ -3,6 +3,7 @@ from copy import deepcopy
|
||||
|
||||
from eland.filter import BooleanFilter, NotNull, IsNull, IsIn
|
||||
|
||||
|
||||
class Query:
|
||||
"""
|
||||
Simple class to manage building Elasticsearch queries.
|
||||
|
@ -1,20 +1,15 @@
|
||||
import pandas as pd
|
||||
from modin.backends.base.query_compiler import BaseQueryCompiler
|
||||
from pandas.core.dtypes.common import (
|
||||
is_list_like
|
||||
)
|
||||
|
||||
from eland import Client
|
||||
from eland import Index
|
||||
from eland import Mappings
|
||||
from eland import Operations
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_list_like
|
||||
)
|
||||
|
||||
from pandas.core.indexes.numeric import Int64Index
|
||||
from pandas.core.indexes.range import RangeIndex
|
||||
|
||||
|
||||
class ElandQueryCompiler(BaseQueryCompiler):
|
||||
class ElandQueryCompiler:
|
||||
"""
|
||||
Some notes on what can and can not be mapped:
|
||||
|
||||
@ -318,10 +313,10 @@ class ElandQueryCompiler(BaseQueryCompiler):
|
||||
return df
|
||||
|
||||
def copy(self):
|
||||
return self.__constructor__(
|
||||
return ElandQueryCompiler(
|
||||
client=self._client,
|
||||
index_pattern=self._index_pattern,
|
||||
columns=None, # columns are embedded in operations
|
||||
columns=None, # columns are embedded in operations
|
||||
index_field=self._index.index_field,
|
||||
operations=self._operations.copy()
|
||||
)
|
||||
@ -412,14 +407,19 @@ class ElandQueryCompiler(BaseQueryCompiler):
|
||||
|
||||
def count(self):
|
||||
return self._operations.count(self)
|
||||
|
||||
def mean(self):
|
||||
return self._operations.mean(self)
|
||||
|
||||
def sum(self):
|
||||
return self._operations.sum(self)
|
||||
|
||||
def min(self):
|
||||
return self._operations.min(self)
|
||||
|
||||
def max(self):
|
||||
return self._operations.max(self)
|
||||
|
||||
def nunique(self):
|
||||
return self._operations.nunique(self)
|
||||
|
||||
@ -471,6 +471,4 @@ class ElandQueryCompiler(BaseQueryCompiler):
|
||||
|
||||
return result
|
||||
|
||||
#def isna(self):
|
||||
|
||||
|
||||
# def isna(self):
|
||||
|
@ -101,10 +101,10 @@ class Series(NDFrame):
|
||||
name = property(_get_name)
|
||||
|
||||
def head(self, n=5):
|
||||
return super().head(n)
|
||||
return Series(query_compiler=self._query_compiler.head(n))
|
||||
|
||||
def tail(self, n=5):
|
||||
return super().tail(n)
|
||||
return Series(query_compiler=self._query_compiler.tail(n))
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Rendering Methods
|
||||
@ -194,7 +194,6 @@ class Series(NDFrame):
|
||||
else:
|
||||
raise NotImplementedError(other, type(other))
|
||||
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, Series):
|
||||
# Need to use scripted query to compare to values
|
||||
|
File diff suppressed because one or more lines are too long
@ -1,101 +1,98 @@
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
|
||||
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
# Set modin to pandas to avoid starting ray or other
|
||||
os.environ["MODIN_ENGINE"] = 'python'
|
||||
os.environ["MODIN_BACKEND"] = 'pandas'
|
||||
|
||||
# Define test files and indices
|
||||
ELASTICSEARCH_HOST = 'localhost' # TODO externalise this
|
||||
ELASTICSEARCH_HOST = 'localhost' # TODO externalise this
|
||||
|
||||
FLIGHTS_INDEX_NAME = 'flights'
|
||||
FLIGHTS_MAPPING = { "mappings" : {
|
||||
"properties" : {
|
||||
"AvgTicketPrice" : {
|
||||
"type" : "float"
|
||||
FLIGHTS_MAPPING = {"mappings": {
|
||||
"properties": {
|
||||
"AvgTicketPrice": {
|
||||
"type": "float"
|
||||
},
|
||||
"Cancelled" : {
|
||||
"type" : "boolean"
|
||||
"Cancelled": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"Carrier" : {
|
||||
"type" : "keyword"
|
||||
"Carrier": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"Dest" : {
|
||||
"type" : "keyword"
|
||||
"Dest": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"DestAirportID" : {
|
||||
"type" : "keyword"
|
||||
"DestAirportID": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"DestCityName" : {
|
||||
"type" : "keyword"
|
||||
"DestCityName": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"DestCountry" : {
|
||||
"type" : "keyword"
|
||||
"DestCountry": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"DestLocation" : {
|
||||
"type" : "geo_point"
|
||||
"DestLocation": {
|
||||
"type": "geo_point"
|
||||
},
|
||||
"DestRegion" : {
|
||||
"type" : "keyword"
|
||||
"DestRegion": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"DestWeather" : {
|
||||
"type" : "keyword"
|
||||
"DestWeather": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"DistanceKilometers" : {
|
||||
"type" : "float"
|
||||
"DistanceKilometers": {
|
||||
"type": "float"
|
||||
},
|
||||
"DistanceMiles" : {
|
||||
"type" : "float"
|
||||
"DistanceMiles": {
|
||||
"type": "float"
|
||||
},
|
||||
"FlightDelay" : {
|
||||
"type" : "boolean"
|
||||
"FlightDelay": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"FlightDelayMin" : {
|
||||
"type" : "integer"
|
||||
"FlightDelayMin": {
|
||||
"type": "integer"
|
||||
},
|
||||
"FlightDelayType" : {
|
||||
"type" : "keyword"
|
||||
"FlightDelayType": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"FlightNum" : {
|
||||
"type" : "keyword"
|
||||
"FlightNum": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"FlightTimeHour" : {
|
||||
"type" : "float"
|
||||
"FlightTimeHour": {
|
||||
"type": "float"
|
||||
},
|
||||
"FlightTimeMin" : {
|
||||
"type" : "float"
|
||||
"FlightTimeMin": {
|
||||
"type": "float"
|
||||
},
|
||||
"Origin" : {
|
||||
"type" : "keyword"
|
||||
"Origin": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"OriginAirportID" : {
|
||||
"type" : "keyword"
|
||||
"OriginAirportID": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"OriginCityName" : {
|
||||
"type" : "keyword"
|
||||
"OriginCityName": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"OriginCountry" : {
|
||||
"type" : "keyword"
|
||||
"OriginCountry": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"OriginLocation" : {
|
||||
"type" : "geo_point"
|
||||
"OriginLocation": {
|
||||
"type": "geo_point"
|
||||
},
|
||||
"OriginRegion" : {
|
||||
"type" : "keyword"
|
||||
"OriginRegion": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"OriginWeather" : {
|
||||
"type" : "keyword"
|
||||
"OriginWeather": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"dayOfWeek" : {
|
||||
"type" : "integer"
|
||||
"dayOfWeek": {
|
||||
"type": "integer"
|
||||
},
|
||||
"timestamp" : {
|
||||
"type" : "date"
|
||||
"timestamp": {
|
||||
"type": "date"
|
||||
}
|
||||
}
|
||||
} }
|
||||
}
|
||||
}}
|
||||
FLIGHTS_FILE_NAME = ROOT_DIR + '/flights.json.gz'
|
||||
FLIGHTS_DF_FILE_NAME = ROOT_DIR + '/flights_df.json.gz'
|
||||
|
||||
@ -104,319 +101,319 @@ FLIGHTS_SMALL_MAPPING = FLIGHTS_MAPPING
|
||||
FLIGHTS_SMALL_FILE_NAME = ROOT_DIR + '/flights_small.json.gz'
|
||||
|
||||
ECOMMERCE_INDEX_NAME = 'ecommerce'
|
||||
ECOMMERCE_MAPPING = { "mappings" : {
|
||||
"properties" : {
|
||||
"category" : {
|
||||
"type" : "text",
|
||||
"fields" : {
|
||||
"keyword" : {
|
||||
"type" : "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"currency" : {
|
||||
"type" : "keyword"
|
||||
},
|
||||
"customer_birth_date" : {
|
||||
"type" : "date"
|
||||
},
|
||||
"customer_first_name" : {
|
||||
"type" : "text",
|
||||
"fields" : {
|
||||
"keyword" : {
|
||||
"type" : "keyword",
|
||||
"ignore_above" : 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"customer_full_name" : {
|
||||
"type" : "text",
|
||||
"fields" : {
|
||||
"keyword" : {
|
||||
"type" : "keyword",
|
||||
"ignore_above" : 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"customer_gender" : {
|
||||
"type" : "keyword"
|
||||
},
|
||||
"customer_id" : {
|
||||
"type" : "keyword"
|
||||
},
|
||||
"customer_last_name" : {
|
||||
"type" : "text",
|
||||
"fields" : {
|
||||
"keyword" : {
|
||||
"type" : "keyword",
|
||||
"ignore_above" : 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"customer_phone" : {
|
||||
"type" : "keyword"
|
||||
},
|
||||
"day_of_week" : {
|
||||
"type" : "keyword"
|
||||
},
|
||||
"day_of_week_i" : {
|
||||
"type" : "integer"
|
||||
},
|
||||
"email" : {
|
||||
"type" : "keyword"
|
||||
},
|
||||
"geoip" : {
|
||||
"properties" : {
|
||||
"city_name" : {
|
||||
"type" : "keyword"
|
||||
},
|
||||
"continent_name" : {
|
||||
"type" : "keyword"
|
||||
},
|
||||
"country_iso_code" : {
|
||||
"type" : "keyword"
|
||||
},
|
||||
"location" : {
|
||||
"type" : "geo_point"
|
||||
},
|
||||
"region_name" : {
|
||||
"type" : "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"manufacturer" : {
|
||||
"type" : "text",
|
||||
"fields" : {
|
||||
"keyword" : {
|
||||
"type" : "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"order_date" : {
|
||||
"type" : "date"
|
||||
},
|
||||
"order_id" : {
|
||||
"type" : "keyword"
|
||||
},
|
||||
"products" : {
|
||||
"properties" : {
|
||||
"_id" : {
|
||||
"type" : "text",
|
||||
"fields" : {
|
||||
"keyword" : {
|
||||
"type" : "keyword",
|
||||
"ignore_above" : 256
|
||||
ECOMMERCE_MAPPING = {"mappings": {
|
||||
"properties": {
|
||||
"category": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"base_price" : {
|
||||
"type" : "half_float"
|
||||
},
|
||||
"base_unit_price" : {
|
||||
"type" : "half_float"
|
||||
},
|
||||
"category" : {
|
||||
"type" : "text",
|
||||
"fields" : {
|
||||
"keyword" : {
|
||||
"type" : "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"created_on" : {
|
||||
"type" : "date"
|
||||
},
|
||||
"discount_amount" : {
|
||||
"type" : "half_float"
|
||||
},
|
||||
"discount_percentage" : {
|
||||
"type" : "half_float"
|
||||
},
|
||||
"manufacturer" : {
|
||||
"type" : "text",
|
||||
"fields" : {
|
||||
"keyword" : {
|
||||
"type" : "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"min_price" : {
|
||||
"type" : "half_float"
|
||||
},
|
||||
"price" : {
|
||||
"type" : "half_float"
|
||||
},
|
||||
"product_id" : {
|
||||
"type" : "long"
|
||||
},
|
||||
"product_name" : {
|
||||
"type" : "text",
|
||||
"fields" : {
|
||||
"keyword" : {
|
||||
"type" : "keyword"
|
||||
}
|
||||
},
|
||||
"analyzer" : "english"
|
||||
},
|
||||
"quantity" : {
|
||||
"type" : "integer"
|
||||
},
|
||||
"sku" : {
|
||||
"type" : "keyword"
|
||||
},
|
||||
"tax_amount" : {
|
||||
"type" : "half_float"
|
||||
},
|
||||
"taxful_price" : {
|
||||
"type" : "half_float"
|
||||
},
|
||||
"taxless_price" : {
|
||||
"type" : "half_float"
|
||||
},
|
||||
"unit_discount_amount" : {
|
||||
"type" : "half_float"
|
||||
}
|
||||
}
|
||||
},
|
||||
"sku" : {
|
||||
"type" : "keyword"
|
||||
"currency": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"taxful_total_price" : {
|
||||
"type" : "half_float"
|
||||
"customer_birth_date": {
|
||||
"type": "date"
|
||||
},
|
||||
"taxless_total_price" : {
|
||||
"type" : "half_float"
|
||||
"customer_first_name": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"total_quantity" : {
|
||||
"type" : "integer"
|
||||
"customer_full_name": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"total_unique_products" : {
|
||||
"type" : "integer"
|
||||
"customer_gender": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"type" : {
|
||||
"type" : "keyword"
|
||||
"customer_id": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"user" : {
|
||||
"type" : "keyword"
|
||||
"customer_last_name": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"customer_phone": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"day_of_week": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"day_of_week_i": {
|
||||
"type": "integer"
|
||||
},
|
||||
"email": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"geoip": {
|
||||
"properties": {
|
||||
"city_name": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"continent_name": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"country_iso_code": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"location": {
|
||||
"type": "geo_point"
|
||||
},
|
||||
"region_name": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"manufacturer": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"order_date": {
|
||||
"type": "date"
|
||||
},
|
||||
"order_id": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"products": {
|
||||
"properties": {
|
||||
"_id": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"base_price": {
|
||||
"type": "half_float"
|
||||
},
|
||||
"base_unit_price": {
|
||||
"type": "half_float"
|
||||
},
|
||||
"category": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"created_on": {
|
||||
"type": "date"
|
||||
},
|
||||
"discount_amount": {
|
||||
"type": "half_float"
|
||||
},
|
||||
"discount_percentage": {
|
||||
"type": "half_float"
|
||||
},
|
||||
"manufacturer": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
},
|
||||
"min_price": {
|
||||
"type": "half_float"
|
||||
},
|
||||
"price": {
|
||||
"type": "half_float"
|
||||
},
|
||||
"product_id": {
|
||||
"type": "long"
|
||||
},
|
||||
"product_name": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword"
|
||||
}
|
||||
},
|
||||
"analyzer": "english"
|
||||
},
|
||||
"quantity": {
|
||||
"type": "integer"
|
||||
},
|
||||
"sku": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"tax_amount": {
|
||||
"type": "half_float"
|
||||
},
|
||||
"taxful_price": {
|
||||
"type": "half_float"
|
||||
},
|
||||
"taxless_price": {
|
||||
"type": "half_float"
|
||||
},
|
||||
"unit_discount_amount": {
|
||||
"type": "half_float"
|
||||
}
|
||||
}
|
||||
},
|
||||
"sku": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"taxful_total_price": {
|
||||
"type": "half_float"
|
||||
},
|
||||
"taxless_total_price": {
|
||||
"type": "half_float"
|
||||
},
|
||||
"total_quantity": {
|
||||
"type": "integer"
|
||||
},
|
||||
"total_unique_products": {
|
||||
"type": "integer"
|
||||
},
|
||||
"type": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"user": {
|
||||
"type": "keyword"
|
||||
}
|
||||
}
|
||||
} }
|
||||
}
|
||||
}}
|
||||
ECOMMERCE_FILE_NAME = ROOT_DIR + '/ecommerce.json.gz'
|
||||
ECOMMERCE_DF_FILE_NAME = ROOT_DIR + '/ecommerce_df.json.gz'
|
||||
|
||||
TEST_MAPPING1 = {
|
||||
'mappings': {
|
||||
'mappings': {
|
||||
'properties': {
|
||||
'city': {
|
||||
'type': 'text',
|
||||
'fields': {
|
||||
'raw': {
|
||||
'type': 'keyword'
|
||||
}
|
||||
}
|
||||
},
|
||||
'text': {
|
||||
'type': 'text',
|
||||
'fields': {
|
||||
'english': {
|
||||
'type': 'text',
|
||||
'analyzer': 'english'
|
||||
}
|
||||
}
|
||||
},
|
||||
'origin_location': {
|
||||
'properties': {
|
||||
'lat': {
|
||||
'type': 'text',
|
||||
'index_prefixes': {},
|
||||
'fields': {
|
||||
'keyword': {
|
||||
'type': 'keyword',
|
||||
'ignore_above': 256
|
||||
}
|
||||
}
|
||||
},
|
||||
'lon': {
|
||||
'city': {
|
||||
'type': 'text',
|
||||
'fields': {
|
||||
'keyword': {
|
||||
'type': 'keyword',
|
||||
'ignore_above': 256
|
||||
}
|
||||
'raw': {
|
||||
'type': 'keyword'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
'maps-telemetry': {
|
||||
'properties': {
|
||||
'attributesPerMap': {
|
||||
},
|
||||
'text': {
|
||||
'type': 'text',
|
||||
'fields': {
|
||||
'english': {
|
||||
'type': 'text',
|
||||
'analyzer': 'english'
|
||||
}
|
||||
}
|
||||
},
|
||||
'origin_location': {
|
||||
'properties': {
|
||||
'dataSourcesCount': {
|
||||
'properties': {
|
||||
'avg': {
|
||||
'type': 'long'
|
||||
},
|
||||
'max': {
|
||||
'type': 'long'
|
||||
},
|
||||
'min': {
|
||||
'type': 'long'
|
||||
}
|
||||
}
|
||||
},
|
||||
'emsVectorLayersCount': {
|
||||
'dynamic': 'true',
|
||||
'properties': {
|
||||
'france_departments': {
|
||||
'properties': {
|
||||
'avg': {
|
||||
'type': 'float'
|
||||
},
|
||||
'max': {
|
||||
'type': 'long'
|
||||
},
|
||||
'min': {
|
||||
'type': 'long'
|
||||
}
|
||||
'lat': {
|
||||
'type': 'text',
|
||||
'index_prefixes': {},
|
||||
'fields': {
|
||||
'keyword': {
|
||||
'type': 'keyword',
|
||||
'ignore_above': 256
|
||||
}
|
||||
}
|
||||
},
|
||||
'lon': {
|
||||
'type': 'text',
|
||||
'fields': {
|
||||
'keyword': {
|
||||
'type': 'keyword',
|
||||
'ignore_above': 256
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
'maps-telemetry': {
|
||||
'properties': {
|
||||
'attributesPerMap': {
|
||||
'properties': {
|
||||
'dataSourcesCount': {
|
||||
'properties': {
|
||||
'avg': {
|
||||
'type': 'long'
|
||||
},
|
||||
'max': {
|
||||
'type': 'long'
|
||||
},
|
||||
'min': {
|
||||
'type': 'long'
|
||||
}
|
||||
}
|
||||
},
|
||||
'emsVectorLayersCount': {
|
||||
'dynamic': 'true',
|
||||
'properties': {
|
||||
'france_departments': {
|
||||
'properties': {
|
||||
'avg': {
|
||||
'type': 'float'
|
||||
},
|
||||
'max': {
|
||||
'type': 'long'
|
||||
},
|
||||
'min': {
|
||||
'type': 'long'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
'type': {
|
||||
'type': 'keyword'
|
||||
},
|
||||
'name': {
|
||||
'type': 'text'
|
||||
},
|
||||
'user_name': {
|
||||
'type': 'keyword'
|
||||
},
|
||||
'email': {
|
||||
'type': 'keyword'
|
||||
},
|
||||
'content': {
|
||||
'type': 'text'
|
||||
},
|
||||
'tweeted_at': {
|
||||
'type': 'date'
|
||||
},
|
||||
'dest_location': {
|
||||
'type': 'geo_point'
|
||||
},
|
||||
'my_join_field': {
|
||||
'type': 'join',
|
||||
'relations': {
|
||||
'question': ['answer', 'comment'],
|
||||
'answer': 'vote'
|
||||
}
|
||||
}
|
||||
},
|
||||
'type': {
|
||||
'type': 'keyword'
|
||||
},
|
||||
'name': {
|
||||
'type': 'text'
|
||||
},
|
||||
'user_name': {
|
||||
'type': 'keyword'
|
||||
},
|
||||
'email': {
|
||||
'type': 'keyword'
|
||||
},
|
||||
'content': {
|
||||
'type': 'text'
|
||||
},
|
||||
'tweeted_at': {
|
||||
'type': 'date'
|
||||
},
|
||||
'dest_location': {
|
||||
'type': 'geo_point'
|
||||
},
|
||||
'my_join_field': {
|
||||
'type': 'join',
|
||||
'relations': {
|
||||
'question': ['answer', 'comment'],
|
||||
'answer': 'vote'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST_MAPPING1_INDEX_NAME = 'mapping1'
|
||||
|
||||
@ -447,48 +444,47 @@ TEST_MAPPING1_EXPECTED = {
|
||||
|
||||
TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['es_dtype'])
|
||||
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_DF = TEST_MAPPING1_EXPECTED_DF.drop(index=['city.raw',
|
||||
'origin_location.lat.keyword',
|
||||
'origin_location.lon.keyword',
|
||||
'text.english'])
|
||||
'origin_location.lat.keyword',
|
||||
'origin_location.lon.keyword',
|
||||
'text.english'])
|
||||
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT = len(TEST_MAPPING1_EXPECTED_SOURCE_FIELD_DF.index)
|
||||
|
||||
TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group'
|
||||
TEST_NESTED_USER_GROUP_MAPPING = {
|
||||
'mappings': {
|
||||
'properties': {
|
||||
'group': {
|
||||
'type': 'keyword'
|
||||
},
|
||||
'user': {
|
||||
'mappings': {
|
||||
'properties': {
|
||||
'first': {
|
||||
'type': 'keyword'
|
||||
'group': {
|
||||
'type': 'keyword'
|
||||
},
|
||||
'last': {
|
||||
'type': 'keyword'
|
||||
},
|
||||
'address' : {
|
||||
'type' : 'keyword'
|
||||
'user': {
|
||||
'properties': {
|
||||
'first': {
|
||||
'type': 'keyword'
|
||||
},
|
||||
'last': {
|
||||
'type': 'keyword'
|
||||
},
|
||||
'address': {
|
||||
'type': 'keyword'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST_NESTED_USER_GROUP_DOCS = [
|
||||
{'_index':TEST_NESTED_USER_GROUP_INDEX_NAME,
|
||||
'_source':
|
||||
{'group':'amsterdam','user':[
|
||||
{'first':'Manke','last':'Nelis','address':['Elandsgracht', 'Amsterdam']},
|
||||
{'first':'Johnny','last':'Jordaan','address':['Elandsstraat', 'Amsterdam']}]}},
|
||||
{'_index':TEST_NESTED_USER_GROUP_INDEX_NAME,
|
||||
'_source':
|
||||
{'group':'london','user':[
|
||||
{'first':'Alice','last':'Monkton'},
|
||||
{'first':'Jimmy','last':'White','address':['London']}]}},
|
||||
{'_index':TEST_NESTED_USER_GROUP_INDEX_NAME,
|
||||
'_source':{'group':'new york','user':[
|
||||
{'first':'Bill','last':'Jones'}]}}
|
||||
{'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
|
||||
'_source':
|
||||
{'group': 'amsterdam', 'user': [
|
||||
{'first': 'Manke', 'last': 'Nelis', 'address': ['Elandsgracht', 'Amsterdam']},
|
||||
{'first': 'Johnny', 'last': 'Jordaan', 'address': ['Elandsstraat', 'Amsterdam']}]}},
|
||||
{'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
|
||||
'_source':
|
||||
{'group': 'london', 'user': [
|
||||
{'first': 'Alice', 'last': 'Monkton'},
|
||||
{'first': 'Jimmy', 'last': 'White', 'address': ['London']}]}},
|
||||
{'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
|
||||
'_source': {'group': 'new york', 'user': [
|
||||
{'first': 'Bill', 'last': 'Jones'}]}}
|
||||
]
|
||||
|
||||
|
@ -1,24 +1,22 @@
|
||||
import pytest
|
||||
import os
|
||||
|
||||
import eland as ed
|
||||
import pandas as pd
|
||||
|
||||
from pandas.util.testing import (assert_frame_equal, assert_series_equal)
|
||||
|
||||
import os
|
||||
import eland as ed
|
||||
|
||||
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
# Create pandas and eland data frames
|
||||
from eland.tests import ELASTICSEARCH_HOST
|
||||
from eland.tests import FLIGHTS_DF_FILE_NAME, FLIGHTS_INDEX_NAME,\
|
||||
FLIGHTS_SMALL_INDEX_NAME,\
|
||||
from eland.tests import FLIGHTS_DF_FILE_NAME, FLIGHTS_INDEX_NAME, \
|
||||
FLIGHTS_SMALL_INDEX_NAME, \
|
||||
ECOMMERCE_DF_FILE_NAME, ECOMMERCE_INDEX_NAME
|
||||
|
||||
_pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index()
|
||||
_pd_flights['timestamp'] = \
|
||||
pd.to_datetime(_pd_flights['timestamp'])
|
||||
_pd_flights.index = _pd_flights.index.map(str) # make index 'object' not int
|
||||
_pd_flights.index = _pd_flights.index.map(str) # make index 'object' not int
|
||||
_ed_flights = ed.read_es(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME)
|
||||
|
||||
_pd_flights_small = _pd_flights.head(48)
|
||||
@ -30,10 +28,11 @@ _pd_ecommerce['order_date'] = \
|
||||
_pd_ecommerce['products.created_on'] = \
|
||||
_pd_ecommerce['products.created_on'].apply(lambda x: pd.to_datetime(x))
|
||||
_pd_ecommerce.insert(2, 'customer_birth_date', None)
|
||||
_pd_ecommerce.index = _pd_ecommerce.index.map(str) # make index 'object' not int
|
||||
_pd_ecommerce.index = _pd_ecommerce.index.map(str) # make index 'object' not int
|
||||
_pd_ecommerce['customer_birth_date'].astype('datetime64')
|
||||
_ed_ecommerce = ed.read_es(ELASTICSEARCH_HOST, ECOMMERCE_INDEX_NAME)
|
||||
|
||||
|
||||
class TestData:
|
||||
|
||||
def pd_flights(self):
|
||||
@ -48,25 +47,26 @@ class TestData:
|
||||
def ed_flights_small(self):
|
||||
return _ed_flights_small
|
||||
|
||||
|
||||
def pd_ecommerce(self):
|
||||
return _pd_ecommerce
|
||||
|
||||
def ed_ecommerce(self):
|
||||
return _ed_ecommerce
|
||||
|
||||
|
||||
def assert_pandas_eland_frame_equal(left, right):
|
||||
if not isinstance(left, pd.DataFrame):
|
||||
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
|
||||
exp_type='pd.DataFrame', act_type=type(left)))
|
||||
exp_type='pd.DataFrame', act_type=type(left)))
|
||||
|
||||
if not isinstance(right, ed.DataFrame):
|
||||
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
|
||||
exp_type='ed.DataFrame', act_type=type(right)))
|
||||
exp_type='ed.DataFrame', act_type=type(right)))
|
||||
|
||||
# Use pandas tests to check similarity
|
||||
assert_frame_equal(left, right._to_pandas())
|
||||
|
||||
|
||||
def assert_eland_frame_equal(left, right):
|
||||
if not isinstance(left, ed.DataFrame):
|
||||
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
|
||||
@ -83,12 +83,11 @@ def assert_eland_frame_equal(left, right):
|
||||
def assert_pandas_eland_series_equal(left, right):
|
||||
if not isinstance(left, pd.Series):
|
||||
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
|
||||
exp_type='pd.Series', act_type=type(left)))
|
||||
exp_type='pd.Series', act_type=type(left)))
|
||||
|
||||
if not isinstance(right, ed.Series):
|
||||
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
|
||||
exp_type='ed.Series', act_type=type(right)))
|
||||
exp_type='ed.Series', act_type=type(right)))
|
||||
|
||||
# Use pandas tests to check similarity
|
||||
assert_series_equal(left, right._to_pandas())
|
||||
|
||||
|
@ -1,15 +1,14 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.util.testing import (assert_almost_equal)
|
||||
from pandas.util.testing import assert_almost_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameAggs(TestData):
|
||||
|
||||
def test_to_aggs1(self):
|
||||
def test_basic_aggs(self):
|
||||
pd_flights = self.pd_flights()
|
||||
ed_flights = self.ed_flights()
|
||||
|
||||
|
@ -1,19 +1,17 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameCount(TestData):
|
||||
|
||||
def test_to_count1(self):
|
||||
def test_ecommerce_count(self):
|
||||
pd_ecommerce = self.pd_ecommerce()
|
||||
ed_ecommerce = self.ed_ecommerce()
|
||||
|
||||
pd_count = pd_ecommerce.count()
|
||||
ed_count = ed_ecommerce.count()
|
||||
|
||||
print(pd_count)
|
||||
print(ed_count)
|
||||
|
||||
|
||||
|
||||
assert_series_equal(pd_count, ed_count)
|
||||
|
@ -6,6 +6,7 @@ import pandas as pd
|
||||
import eland as ed
|
||||
from eland.tests.common import ELASTICSEARCH_HOST
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import assert_pandas_eland_frame_equal
|
||||
|
||||
|
||||
class TestDataFrameDateTime(TestData):
|
||||
@ -41,4 +42,4 @@ class TestDataFrameDateTime(TestData):
|
||||
ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name)
|
||||
ed_df_head = ed_df.head()
|
||||
|
||||
# assert_frame_equal(df, ed_df_head)
|
||||
assert_pandas_eland_frame_equal(df, ed_df_head)
|
||||
|
@ -1,35 +1,34 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
from io import StringIO
|
||||
|
||||
from pandas.util.testing import assert_almost_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameDescribe(TestData):
|
||||
|
||||
def test_to_describe1(self):
|
||||
def test_flights_describe(self):
|
||||
pd_flights = self.pd_flights()
|
||||
ed_flights = self.ed_flights()
|
||||
|
||||
pd_describe = pd_flights.describe()
|
||||
ed_describe = ed_flights.describe()
|
||||
|
||||
print(pd_describe)
|
||||
print(ed_describe)
|
||||
assert_almost_equal(pd_describe[['AvgTicketPrice']],
|
||||
ed_describe[['AvgTicketPrice']],
|
||||
check_less_precise=True)
|
||||
|
||||
# TODO - this fails now as ES aggregations are approximate
|
||||
# TODO - this fails for all fields now as ES aggregations are approximate
|
||||
# if ES percentile agg uses
|
||||
# "hdr": {
|
||||
# "number_of_significant_value_digits": 3
|
||||
# }
|
||||
# this works
|
||||
# assert_almost_equal(pd_flights_describe, ed_flights_describe)
|
||||
|
||||
pd_ecommerce_describe = self.pd_ecommerce().describe()
|
||||
ed_ecommerce_describe = self.ed_ecommerce().describe()
|
||||
|
||||
# pd_ecommerce_describe = self.pd_ecommerce().describe()
|
||||
# ed_ecommerce_describe = self.ed_ecommerce().describe()
|
||||
# We don't compare ecommerce here as the default dtypes in pandas from read_json
|
||||
# don't match the mapping types. This is mainly because the products field is
|
||||
# nested and so can be treated as a multi-field in ES, but not in pandas
|
||||
|
||||
# We can not also run 'describe' on a truncate ed dataframe
|
||||
|
||||
|
@ -1,19 +1,14 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
import pandas as pd
|
||||
import eland as ed
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import (
|
||||
assert_eland_frame_equal,
|
||||
assert_pandas_eland_frame_equal,
|
||||
assert_pandas_eland_series_equal
|
||||
assert_pandas_eland_frame_equal
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
class TestDataFrameDrop(TestData):
|
||||
|
||||
def test_drop1(self):
|
||||
def test_flights_small_drop(self):
|
||||
ed_flights_small = self.ed_flights_small()
|
||||
pd_flights_small = self.pd_flights_small()
|
||||
|
||||
|
14
eland/tests/dataframe/test_dtypes_pytest.py
Normal file
14
eland/tests/dataframe/test_dtypes_pytest.py
Normal file
@ -0,0 +1,14 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameDtypes(TestData):
|
||||
|
||||
def test_flights_dtypes(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
assert_series_equal(pd_flights.dtypes, ed_flights.dtypes)
|
@ -1,18 +1,11 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
import pandas as pd
|
||||
import eland as ed
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import (
|
||||
assert_pandas_eland_frame_equal,
|
||||
assert_pandas_eland_series_equal
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
class TestDataFrameGet(TestData):
|
||||
|
||||
def test_get1(self):
|
||||
def test_get_one_attribute(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
import pandas as pd
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import (
|
||||
@ -8,10 +7,9 @@ from eland.tests.common import (
|
||||
)
|
||||
|
||||
|
||||
|
||||
class TestDataFrameGetItem(TestData):
|
||||
|
||||
def test_getitem1(self):
|
||||
def test_getitem_one_attribute(self):
|
||||
ed_flights = self.ed_flights().head(103)
|
||||
pd_flights = self.pd_flights().head(103)
|
||||
|
||||
@ -20,7 +18,7 @@ class TestDataFrameGetItem(TestData):
|
||||
|
||||
assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID)
|
||||
|
||||
def test_getitem2(self):
|
||||
def test_getitem_attribute_list(self):
|
||||
ed_flights = self.ed_flights().head(42)
|
||||
pd_flights = self.pd_flights().head(42)
|
||||
|
||||
@ -29,7 +27,7 @@ class TestDataFrameGetItem(TestData):
|
||||
|
||||
assert_pandas_eland_frame_equal(pd_flights_slice, ed_flights_slice)
|
||||
|
||||
def test_getitem3(self):
|
||||
def test_getitem_one_argument(self):
|
||||
ed_flights = self.ed_flights().head(89)
|
||||
pd_flights = self.pd_flights().head(89)
|
||||
|
||||
@ -38,7 +36,7 @@ class TestDataFrameGetItem(TestData):
|
||||
|
||||
assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID)
|
||||
|
||||
def test_getitem4(self):
|
||||
def test_getitem_multiple_calls(self):
|
||||
ed_flights = self.ed_flights().head(89)
|
||||
pd_flights = self.pd_flights().head(89)
|
||||
|
||||
@ -52,4 +50,3 @@ class TestDataFrameGetItem(TestData):
|
||||
ed_col1 = ed_col0['DestCountry']
|
||||
|
||||
assert_pandas_eland_series_equal(pd_col1, ed_col1)
|
||||
|
||||
|
@ -1,11 +1,9 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
import pandas as pd
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import assert_pandas_eland_frame_equal
|
||||
|
||||
|
||||
|
||||
class TestDataFrameHeadTail(TestData):
|
||||
|
||||
def test_head(self):
|
||||
|
@ -1,6 +1,5 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.util.testing import assert_almost_equal
|
||||
@ -10,7 +9,7 @@ from eland.tests.common import TestData
|
||||
|
||||
class TestDataFrameHist(TestData):
|
||||
|
||||
def test_hist1(self):
|
||||
def test_flights_hist(self):
|
||||
pd_flights = self.pd_flights()
|
||||
ed_flights = self.ed_flights()
|
||||
|
||||
@ -30,15 +29,3 @@ class TestDataFrameHist(TestData):
|
||||
# Numbers are slightly different
|
||||
assert_almost_equal(pd_bins, ed_bins)
|
||||
assert_almost_equal(pd_weights, ed_weights)
|
||||
|
||||
def test_hist2(self):
|
||||
pd_df = self.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
|
||||
ed_df = self.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
|
||||
|
||||
num_bins = 10
|
||||
|
||||
ed_bins, ed_weights = ed_df._hist(num_bins=num_bins)
|
||||
|
||||
print(ed_bins)
|
||||
|
||||
|
||||
|
@ -1,54 +0,0 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
import pandas as pd
|
||||
import eland as ed
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import (
|
||||
assert_pandas_eland_frame_equal,
|
||||
assert_pandas_eland_series_equal
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
class TestDataFrameiLoc(TestData):
|
||||
|
||||
def test_iloc1(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html#pandas.DataFrame.iloc
|
||||
|
||||
#pd_flights.info()
|
||||
|
||||
pd_iloc0 = pd_flights.iloc[0]
|
||||
pd_iloc1= pd_flights.iloc[[0]]
|
||||
pd_iloc2= pd_flights.iloc[[0, 1]]
|
||||
pd_iloc3 = pd_flights.iloc[:3]
|
||||
pd_iloc4 = pd_flights.iloc[[True, False, True]]
|
||||
pd_iloc5 = pd_flights.iloc[0, 1]
|
||||
pd_iloc6 = pd_flights.iloc[[0, 2], [1, 3]]
|
||||
pd_iloc7 = pd_flights.iloc[1:3, 0:3]
|
||||
pd_iloc8 = pd_flights.iloc[:, [True, False, True, False]]
|
||||
pd_iloc9 = pd_flights.iloc[[True, False, True, False]]
|
||||
|
||||
ed_iloc0 = ed_flights.iloc[0]
|
||||
ed_iloc1 = ed_flights.iloc[[0]]
|
||||
ed_iloc2 = ed_flights.iloc[[0, 1]]
|
||||
ed_iloc3 = ed_flights.iloc[:3]
|
||||
ed_iloc4 = ed_flights.iloc[[True, False, True]]
|
||||
ed_iloc5 = ed_flights.iloc[0, 1]
|
||||
ed_iloc6 = ed_flights.iloc[[0, 2], [1, 3]]
|
||||
ed_iloc7 = ed_flights.iloc[1:3, 0:3]
|
||||
ed_iloc8 = ed_flights.iloc[:, [True, False, True, False]]
|
||||
ed_iloc9 = ed_flights.iloc[[True, False, True, False]]
|
||||
|
||||
#assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series
|
||||
assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1)
|
||||
assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2)
|
||||
assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3)
|
||||
assert_pandas_eland_frame_equal(pd_iloc4, ed_iloc4)
|
||||
#assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool
|
||||
assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6)
|
||||
assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7)
|
||||
assert_pandas_eland_frame_equal(pd_iloc8, ed_iloc8)
|
||||
assert_pandas_eland_frame_equal(pd_iloc9, ed_iloc9)
|
@ -1,15 +0,0 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameInfoEs(TestData):
|
||||
|
||||
def test_to_info1(self):
|
||||
ed_flights = self.ed_flights()
|
||||
|
||||
head = ed_flights.head(103)
|
||||
slice = head[['timestamp', 'OriginRegion', 'Carrier']]
|
||||
iloc = slice.iloc[10:92, [0,2]]
|
||||
print(iloc.info_es())
|
||||
print(iloc)
|
@ -6,7 +6,7 @@ from eland.tests.common import TestData
|
||||
|
||||
class TestDataFrameInfo(TestData):
|
||||
|
||||
def test_to_info1(self):
|
||||
def test_flights_info(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
|
@ -1,10 +1,9 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameMetrics(TestData):
|
||||
|
||||
@ -43,4 +42,3 @@ class TestDataFrameMetrics(TestData):
|
||||
ed_max = ed_flights.max(numeric_only=True)
|
||||
|
||||
assert_series_equal(pd_max, ed_max)
|
||||
|
||||
|
@ -1,22 +0,0 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
import pandas as pd
|
||||
import eland as ed
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import (
|
||||
assert_pandas_eland_frame_equal,
|
||||
assert_pandas_eland_series_equal
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
class TestDataFrameNUnique(TestData):
|
||||
|
||||
def test_nunique1(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
print(pd_flights.dtypes)
|
||||
print(ed_flights.dtypes)
|
||||
print(ed_flights.nunique())
|
||||
|
@ -10,7 +10,7 @@ from eland.tests.common import assert_pandas_eland_frame_equal
|
||||
|
||||
class TestDataFrameQuery(TestData):
|
||||
|
||||
def test_query1(self):
|
||||
def test_query(self):
|
||||
# Examples from:
|
||||
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
|
||||
pd_df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2), 'C': range(10, 5, -1)},
|
||||
@ -43,4 +43,3 @@ class TestDataFrameQuery(TestData):
|
||||
ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)]
|
||||
|
||||
assert_pandas_eland_frame_equal(pd_q4, ed_q4)
|
||||
|
||||
|
@ -3,9 +3,9 @@
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameHeadTail(TestData):
|
||||
class TestDataFrameRepr(TestData):
|
||||
|
||||
def test_to_string1(self):
|
||||
def test_head_101_to_string(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
@ -18,7 +18,7 @@ class TestDataFrameHeadTail(TestData):
|
||||
|
||||
assert pd_head_101_str == ed_head_101_str
|
||||
|
||||
def test_to_string2(self):
|
||||
def test_head_11_to_string2(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
@ -30,7 +30,7 @@ class TestDataFrameHeadTail(TestData):
|
||||
|
||||
assert pd_head_11_str == ed_head_11_str
|
||||
|
||||
def test_to_repr(self):
|
||||
def test_repr(self):
|
||||
ed_ecommerce = self.ed_ecommerce()
|
||||
pd_ecommerce = self.pd_ecommerce()
|
||||
|
||||
|
@ -1,31 +0,0 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
|
||||
import gzip
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import eland as ed
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameReviews(TestData):
|
||||
|
||||
def test_explore(self):
|
||||
ed_reviews = ed.DataFrame('localhost', 'anonreviews')
|
||||
|
||||
print(ed_reviews.head())
|
||||
print(ed_reviews.describe())
|
||||
print(ed_reviews.info())
|
||||
print(ed_reviews.hist(column="rating", bins=5))
|
||||
# print(ed_reviews.head().info_es())
|
||||
|
||||
def test_review(self):
|
||||
csv_handle = gzip.open('../anonreviews.csv.gz')
|
||||
|
||||
reviews = pd.read_csv(csv_handle)
|
||||
|
||||
reviews['date'] = pd.to_datetime(reviews['date'])
|
||||
|
||||
g = reviews.groupby('reviewerId')
|
||||
|
||||
print(g.describe())
|
@ -1,5 +1,4 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from eland.tests.common import TestData
|
||||
@ -8,10 +7,9 @@ from eland.tests.common import (
|
||||
)
|
||||
|
||||
|
||||
|
||||
class TestDataFrameSelectDTypes(TestData):
|
||||
|
||||
def test_select_dtypes1(self):
|
||||
def test_select_dtypes_include_number(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
@ -20,7 +18,7 @@ class TestDataFrameSelectDTypes(TestData):
|
||||
|
||||
assert_pandas_eland_frame_equal(pd_flights_numeric.head(103), ed_flights_numeric.head(103))
|
||||
|
||||
def test_select_dtypes2(self):
|
||||
def test_select_dtypes_exclude_number(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
@ -28,4 +26,3 @@ class TestDataFrameSelectDTypes(TestData):
|
||||
pd_flights_non_numeric = pd_flights.select_dtypes(exclude=[np.number])
|
||||
|
||||
assert_pandas_eland_frame_equal(pd_flights_non_numeric.head(103), ed_flights_non_numeric.head(103))
|
||||
|
||||
|
@ -22,5 +22,3 @@ class TestDataFrameShape(TestData):
|
||||
ed_shape = ed_flights.shape
|
||||
|
||||
assert pd_shape == ed_shape
|
||||
|
||||
|
||||
|
@ -1,22 +1,24 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
|
||||
import pandas as pd
|
||||
import ast
|
||||
|
||||
import pandas as pd
|
||||
from pandas.util.testing import (assert_frame_equal)
|
||||
|
||||
from eland.tests.common import ROOT_DIR
|
||||
from eland.tests.common import TestData
|
||||
|
||||
from pandas.util.testing import (assert_equal, assert_frame_equal)
|
||||
|
||||
import ast
|
||||
|
||||
class TestDataFrameToCSV(TestData):
|
||||
|
||||
def test_to_csv_head(self):
|
||||
results_file = ROOT_DIR + '/dataframe/results/test_to_csv_head.csv'
|
||||
|
||||
ed_flights = self.ed_flights().head()
|
||||
pd_flights = self.pd_flights().head()
|
||||
|
||||
ed_flights.to_csv('results/test_to_csv_head.csv')
|
||||
ed_flights.to_csv(results_file)
|
||||
# Converting back from csv is messy as pd_flights is created from a json file
|
||||
pd_from_csv = pd.read_csv('results/test_to_csv_head.csv', index_col=0, converters={
|
||||
pd_from_csv = pd.read_csv(results_file, index_col=0, converters={
|
||||
'DestLocation': lambda x: ast.literal_eval(x),
|
||||
'OriginLocation': lambda x: ast.literal_eval(x)})
|
||||
pd_from_csv.index = pd_from_csv.index.map(str)
|
||||
@ -25,19 +27,18 @@ class TestDataFrameToCSV(TestData):
|
||||
assert_frame_equal(pd_flights, pd_from_csv)
|
||||
|
||||
def test_to_csv_full(self):
|
||||
results_file = ROOT_DIR + '/dataframe/results/test_to_csv_full.csv'
|
||||
|
||||
# Test is slow as it's for the full dataset, but it is useful as it goes over 10000 docs
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
ed_flights.to_csv('results/test_to_csv_full.csv')
|
||||
ed_flights.to_csv(results_file)
|
||||
# Converting back from csv is messy as pd_flights is created from a json file
|
||||
pd_from_csv = pd.read_csv('results/test_to_csv_full.csv', index_col=0, converters={
|
||||
pd_from_csv = pd.read_csv(results_file, index_col=0, converters={
|
||||
'DestLocation': lambda x: ast.literal_eval(x),
|
||||
'OriginLocation': lambda x: ast.literal_eval(x)})
|
||||
pd_from_csv.index = pd_from_csv.index.map(str)
|
||||
pd_from_csv.timestamp = pd.to_datetime(pd_from_csv.timestamp)
|
||||
|
||||
assert_frame_equal(pd_flights, pd_from_csv)
|
||||
|
||||
|
||||
|
||||
|
@ -7144,7 +7144,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9"
|
||||
"version": "3.6.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -1,12 +1,13 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestMappingsDtypes(TestData):
|
||||
|
||||
def test_dtypes1(self):
|
||||
def test_flights_dtypes_all(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
@ -15,7 +16,7 @@ class TestMappingsDtypes(TestData):
|
||||
|
||||
assert_series_equal(pd_dtypes, ed_dtypes)
|
||||
|
||||
def test_dtypes2(self):
|
||||
def test_flights_dtypes_columns(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]
|
||||
|
||||
@ -24,7 +25,7 @@ class TestMappingsDtypes(TestData):
|
||||
|
||||
assert_series_equal(pd_dtypes, ed_dtypes)
|
||||
|
||||
def test_get_dtype_counts1(self):
|
||||
def test_flights_get_dtype_counts_all(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
@ -33,12 +34,12 @@ class TestMappingsDtypes(TestData):
|
||||
|
||||
assert_series_equal(pd_dtypes, ed_dtypes)
|
||||
|
||||
def test_get_dtype_counts2(self):
|
||||
def test_flights_get_dtype_counts_columns(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]
|
||||
|
||||
pd_dtypes = pd_flights.get_dtype_counts().sort_index()
|
||||
ed_dtypes = ed_flights._query_compiler._mappings.\
|
||||
ed_dtypes = ed_flights._query_compiler._mappings. \
|
||||
get_dtype_counts(columns=['Carrier', 'AvgTicketPrice', 'Cancelled']).sort_index()
|
||||
|
||||
assert_series_equal(pd_dtypes, ed_dtypes)
|
||||
|
@ -1,5 +1,5 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
from eland.operators import *
|
||||
from eland.filter import *
|
||||
|
||||
|
||||
class TestOperators():
|
||||
@ -21,11 +21,6 @@ class TestOperators():
|
||||
'script': {'script': {'inline': 'doc["num1"].value > params.param1', 'params': {'param1': 5}}}}
|
||||
assert IsIn('ids', [1, 2, 3]).build() == {'ids': {'values': [1, 2, 3]}}
|
||||
|
||||
def test_and_none(self):
|
||||
exp = None
|
||||
exp = exp & Less('b', 3)
|
||||
print(exp.build())
|
||||
|
||||
def test_and_filter1(self):
|
||||
exp = GreaterEqual('a', 2) & Less('b', 3)
|
||||
assert exp.build() == {'bool': {'must': [{'range': {'a': {'gte': 2}}}, {'range': {'b': {'lt': 3}}}]}}
|
||||
@ -33,145 +28,145 @@ class TestOperators():
|
||||
def test_and_filter2(self):
|
||||
exp = GreaterEqual('a', 2) & Less('b', 3) & Equal('c', 4)
|
||||
assert exp.build() == \
|
||||
{
|
||||
'bool': {
|
||||
'must': [
|
||||
{'range': {'a': {'gte': 2}}},
|
||||
{'range': {'b': {'lt': 3}}},
|
||||
{'term': {'c': 4}}
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
'bool': {
|
||||
'must': [
|
||||
{'range': {'a': {'gte': 2}}},
|
||||
{'range': {'b': {'lt': 3}}},
|
||||
{'term': {'c': 4}}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
def test_and_filter3(self):
|
||||
exp = GreaterEqual('a', 2) & (Less('b', 3) & Equal('c', 4))
|
||||
assert exp.build() == \
|
||||
{
|
||||
'bool': {
|
||||
'must': [
|
||||
{'range': {'b': {'lt': 3}}},
|
||||
{'term': {'c': 4}},
|
||||
{'range': {'a': {'gte': 2}}}
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
'bool': {
|
||||
'must': [
|
||||
{'range': {'b': {'lt': 3}}},
|
||||
{'term': {'c': 4}},
|
||||
{'range': {'a': {'gte': 2}}}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
def test_or_filter1(self):
|
||||
exp = GreaterEqual('a', 2) | Less('b', 3)
|
||||
assert exp.build() == \
|
||||
{
|
||||
'bool': {
|
||||
'should': [
|
||||
{'range': {'a': {'gte': 2}}},
|
||||
{'range': {'b': {'lt': 3}}}
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
'bool': {
|
||||
'should': [
|
||||
{'range': {'a': {'gte': 2}}},
|
||||
{'range': {'b': {'lt': 3}}}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
def test_or_filter2(self):
|
||||
exp = GreaterEqual('a', 2) | Less('b', 3) | Equal('c', 4)
|
||||
assert exp.build() == \
|
||||
{
|
||||
'bool': {
|
||||
'should': [
|
||||
{'range': {'a': {'gte': 2}}},
|
||||
{'range': {'b': {'lt': 3}}},
|
||||
{'term': {'c': 4}}
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
'bool': {
|
||||
'should': [
|
||||
{'range': {'a': {'gte': 2}}},
|
||||
{'range': {'b': {'lt': 3}}},
|
||||
{'term': {'c': 4}}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
def test_or_filter3(self):
|
||||
exp = GreaterEqual('a', 2) | (Less('b', 3) | Equal('c', 4))
|
||||
assert exp.build() == \
|
||||
{
|
||||
'bool': {
|
||||
'should': [
|
||||
{'range': {'b': {'lt': 3}}},
|
||||
{'term': {'c': 4}},
|
||||
{'range': {'a': {'gte': 2}}}
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
'bool': {
|
||||
'should': [
|
||||
{'range': {'b': {'lt': 3}}},
|
||||
{'term': {'c': 4}},
|
||||
{'range': {'a': {'gte': 2}}}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
def test_not_filter(self):
|
||||
exp = ~GreaterEqual('a', 2)
|
||||
assert exp.build() == \
|
||||
{
|
||||
'bool': {
|
||||
'must_not': {'range': {'a': {'gte': 2}}}
|
||||
}
|
||||
}
|
||||
{
|
||||
'bool': {
|
||||
'must_not': {'range': {'a': {'gte': 2}}}
|
||||
}
|
||||
}
|
||||
|
||||
def test_not_not_filter(self):
|
||||
exp = ~~GreaterEqual('a', 2)
|
||||
|
||||
assert exp.build() == \
|
||||
{
|
||||
'bool': {
|
||||
'must_not': {
|
||||
'bool': {
|
||||
'must_not': {'range': {'a': {'gte': 2}}}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
'bool': {
|
||||
'must_not': {
|
||||
'bool': {
|
||||
'must_not': {'range': {'a': {'gte': 2}}}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def test_not_and_filter(self):
|
||||
exp = ~(GreaterEqual('a', 2) & Less('b', 3))
|
||||
assert exp.build() == \
|
||||
{
|
||||
'bool': {
|
||||
'must_not': {
|
||||
'bool': {
|
||||
'must': [
|
||||
{'range': {'a': {'gte': 2}}},
|
||||
{'range': {'b': {'lt': 3}}}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
'bool': {
|
||||
'must_not': {
|
||||
'bool': {
|
||||
'must': [
|
||||
{'range': {'a': {'gte': 2}}},
|
||||
{'range': {'b': {'lt': 3}}}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def test_and_or_filter(self):
|
||||
exp = GreaterEqual('a', 2) & (Less('b', 3) | Equal('c', 4))
|
||||
assert exp.build() == \
|
||||
{
|
||||
'bool': {
|
||||
'must': [
|
||||
{'range': {'a': {'gte': 2}}},
|
||||
{
|
||||
'bool': {
|
||||
'should': [
|
||||
{'range': {'b': {'lt': 3}}},
|
||||
{'term': {'c': 4}}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
'bool': {
|
||||
'must': [
|
||||
{'range': {'a': {'gte': 2}}},
|
||||
{
|
||||
'bool': {
|
||||
'should': [
|
||||
{'range': {'b': {'lt': 3}}},
|
||||
{'term': {'c': 4}}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
def test_and_not_or_filter(self):
|
||||
exp = GreaterEqual('a', 2) & ~(Less('b', 3) | Equal('c', 4))
|
||||
assert exp.build() == \
|
||||
{
|
||||
'bool': {
|
||||
'must': [
|
||||
{'range': {'a': {'gte': 2}}},
|
||||
{
|
||||
'bool': {
|
||||
'must_not': {
|
||||
'bool': {
|
||||
'should': [
|
||||
{'range': {'b': {'lt': 3}}},
|
||||
{'term': {'c': 4}}
|
||||
]
|
||||
}
|
||||
{
|
||||
'bool': {
|
||||
'must': [
|
||||
{'range': {'a': {'gte': 2}}},
|
||||
{
|
||||
'bool': {
|
||||
'must_not': {
|
||||
'bool': {
|
||||
'should': [
|
||||
{'range': {'b': {'lt': 3}}},
|
||||
{'term': {'c': 4}}
|
||||
]
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
File diff suppressed because one or more lines are too long
@ -1,8 +1,9 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
|
||||
from matplotlib.testing.decorators import check_figures_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
from matplotlib.testing.decorators import check_figures_equal
|
||||
|
||||
@check_figures_equal(extensions=['png'])
|
||||
def test_plot_hist(fig_test, fig_ref):
|
||||
|
@ -1,8 +1,7 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
from eland import Query
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestQueryCopy(TestData):
|
||||
@ -22,6 +21,3 @@ class TestQueryCopy(TestData):
|
||||
|
||||
print(q.to_search_body())
|
||||
print(q1.to_search_body())
|
||||
|
||||
|
||||
|
||||
|
@ -1,15 +1,9 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
import pandas as pd
|
||||
import eland as ed
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import assert_pandas_eland_series_equal
|
||||
|
||||
from eland.tests import ELASTICSEARCH_HOST
|
||||
from eland.tests import FLIGHTS_INDEX_NAME
|
||||
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import assert_pandas_eland_series_equal
|
||||
|
||||
|
||||
class TestSeriesHeadTail(TestData):
|
||||
|
@ -1,15 +1,8 @@
|
||||
# File called _pytest for PyCharm compatability
|
||||
import pandas as pd
|
||||
import eland as ed
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import assert_pandas_eland_frame_equal
|
||||
|
||||
from eland.tests import ELASTICSEARCH_HOST
|
||||
from eland.tests import FLIGHTS_INDEX_NAME
|
||||
|
||||
from pandas.util.testing import assert_series_equal
|
||||
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestSeriesRepr(TestData):
|
||||
|
@ -1,4 +1,3 @@
|
||||
import pandas as pd
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch import helpers
|
||||
|
||||
@ -10,6 +9,7 @@ DATA_LIST = [
|
||||
(ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING)
|
||||
]
|
||||
|
||||
|
||||
def _setup_data(es):
|
||||
# Read json file and index records into Elasticsearch
|
||||
for data in DATA_LIST:
|
||||
@ -32,7 +32,7 @@ def _setup_data(es):
|
||||
for index, row in df.iterrows():
|
||||
values = row.to_dict()
|
||||
# make timestamp datetime 2018-01-01T12:09:35
|
||||
#values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S')
|
||||
# values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S')
|
||||
|
||||
# Use integer as id field for repeatable results
|
||||
action = {'_index': index_name, '_source': values, '_id': str(n)}
|
||||
@ -50,17 +50,20 @@ def _setup_data(es):
|
||||
|
||||
print("Done", index_name)
|
||||
|
||||
|
||||
def _setup_test_mappings(es):
|
||||
# Create a complex mapping containing many Elasticsearch features
|
||||
es.indices.delete(index=TEST_MAPPING1_INDEX_NAME, ignore=[400, 404])
|
||||
es.indices.create(index=TEST_MAPPING1_INDEX_NAME, body=TEST_MAPPING1)
|
||||
|
||||
|
||||
def _setup_test_nested(es):
|
||||
es.indices.delete(index=TEST_NESTED_USER_GROUP_INDEX_NAME, ignore=[400, 404])
|
||||
es.indices.create(index=TEST_NESTED_USER_GROUP_INDEX_NAME, body=TEST_NESTED_USER_GROUP_MAPPING)
|
||||
|
||||
helpers.bulk(es, TEST_NESTED_USER_GROUP_DOCS)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Create connection to Elasticsearch - use defaults
|
||||
es = Elasticsearch(ELASTICSEARCH_HOST)
|
||||
|
@ -7,7 +7,8 @@ def read_es(es_params, index_pattern):
|
||||
return DataFrame(client=es_params, index_pattern=index_pattern)
|
||||
|
||||
|
||||
def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False):
|
||||
def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
|
||||
geo_points=None):
|
||||
"""
|
||||
Append a pandas DataFrame to an Elasticsearch index.
|
||||
Mainly used in testing.
|
||||
@ -30,10 +31,19 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
|
||||
If table exists, drop it, recreate it, and insert data.
|
||||
``'append'``
|
||||
If table exists, insert data. Create if does not exist.
|
||||
|
||||
dropna : bool
|
||||
``'True'``
|
||||
Remove missing values (see pandas.Series.dropna)
|
||||
``'False;``
|
||||
Include missing values - may cause bulk to fail
|
||||
|
||||
geo_points : list or None
|
||||
List of columns to map to geo_point data type
|
||||
"""
|
||||
client = Client(es_params)
|
||||
|
||||
mapping = Mappings._generate_es_mappings(df)
|
||||
mapping = Mappings._generate_es_mappings(df, geo_points)
|
||||
|
||||
# If table exists, check if_exists parameter
|
||||
if client.index_exists(index=destination_index):
|
||||
@ -58,7 +68,11 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
|
||||
for row in df.iterrows():
|
||||
# Use index as _id
|
||||
id = row[0]
|
||||
values = row[1].to_dict()
|
||||
|
||||
if dropna:
|
||||
values = row[1].dropna().to_dict()
|
||||
else:
|
||||
values = row[1].to_dict()
|
||||
|
||||
# Use integer as id field for repeatable results
|
||||
action = {'_index': destination_index, '_source': values, '_id': str(id)}
|
||||
|
3
requirements-dev.txt
Normal file
3
requirements-dev.txt
Normal file
@ -0,0 +1,3 @@
|
||||
elasticsearch>=7.0.5
|
||||
pandas==0.25.1
|
||||
pytest>=5.2.1
|
@ -1,8 +1,3 @@
|
||||
elasticsearch==7.0.2
|
||||
elasticsearch-dsl==7.0.0
|
||||
numpy==1.16.4
|
||||
pandas==0.24.2
|
||||
python-dateutil==2.8.0
|
||||
pytz==2019.1
|
||||
six==1.12.0
|
||||
urllib3==1.25.3
|
||||
elasticsearch>=7.0.5
|
||||
pandas==0.25.1
|
||||
matplotlib
|
||||
|
7
setup.py
7
setup.py
@ -1,9 +1,11 @@
|
||||
from setuptools import setup
|
||||
|
||||
|
||||
def readme():
|
||||
with open('README.rst') as f:
|
||||
return f.read()
|
||||
|
||||
|
||||
setup(name='eland',
|
||||
version='0.1',
|
||||
description='Python elasticsearch client to analyse, explore and manipulate data that resides in elasticsearch',
|
||||
@ -13,8 +15,7 @@ setup(name='eland',
|
||||
license='ELASTIC LICENSE',
|
||||
packages=['eland'],
|
||||
install_requires=[
|
||||
'elasticsearch',
|
||||
'elasticsearch_dsl',
|
||||
'pandas'
|
||||
'elasticsearch>=7.0.5',
|
||||
'pandas==0.25.1'
|
||||
],
|
||||
zip_safe=False)
|
||||
|
Loading…
x
Reference in New Issue
Block a user