Fixing tests, and upgrading to pandas 0.25.1

This commit is contained in:
Stephen Dodson 2019-10-18 08:06:07 +00:00
parent 315d4c3287
commit 9dad8613d3
12 changed files with 40 additions and 1455 deletions

View File

@ -463,7 +463,6 @@ class DataFrame(NDFrame):
"quotechar": quotechar,
"line_terminator": line_terminator,
"chunksize": chunksize,
"tupleize_cols": tupleize_cols,
"date_format": date_format,
"doublequote": doublequote,
"escapechar": escapechar,
@ -552,7 +551,7 @@ class DataFrame(NDFrame):
# currently we only support a subset of functions that aggregate columns.
# ['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique']
if isinstance(func, compat.string_types):
if isinstance(func, str):
# wrap in list
func = [func]
return self._query_compiler.aggs(func)

View File

@ -290,7 +290,7 @@ class Mappings:
return es_dtype
@staticmethod
def _generate_es_mappings(dataframe):
def _generate_es_mappings(dataframe, geo_points=None):
"""Given a pandas dataframe, generate the associated Elasticsearch mapping
Parameters
@ -325,6 +325,9 @@ class Mappings:
mappings = {}
mappings['properties'] = {}
for column_name, dtype in dataframe.dtypes.iteritems():
if geo_points is not None and column_name in geo_points:
es_dtype = 'geo_point'
else:
es_dtype = Mappings._pd_dtype_to_es_dtype(dtype)
mappings['properties'][column_name] = {}

View File

@ -3,13 +3,14 @@ import numpy as np
import pandas.core.common as com
from pandas.core.dtypes.generic import (
ABCIndexClass)
from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots
def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
sharey=False, figsize=None, layout=None, bins=10, **kwds):
"""
Derived from pandas.plotting._core.hist_frame 0.24.2
Derived from pandas.plotting._core.hist_frame 0.24.2 - TODO update to 0.25.1
Ideally, we'd call hist_frame directly with histogram data,
but weights are applied to ALL series. For example, we can
@ -29,8 +30,6 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
# Start with empty pandas data frame derived from
ed_df_bins, ed_df_weights = ed_df._hist(num_bins=bins)
_raise_if_no_mpl()
_converter._WARN = False
if by is not None:
raise NotImplementedError("TODO")
"""

View File

@ -24,31 +24,22 @@ class TestDataFrameiLoc(TestData):
pd_iloc1= pd_flights.iloc[[0]]
pd_iloc2= pd_flights.iloc[[0, 1]]
pd_iloc3 = pd_flights.iloc[:3]
pd_iloc4 = pd_flights.iloc[[True, False, True]]
pd_iloc5 = pd_flights.iloc[0, 1]
pd_iloc6 = pd_flights.iloc[[0, 2], [1, 3]]
pd_iloc7 = pd_flights.iloc[1:3, 0:3]
pd_iloc8 = pd_flights.iloc[:, [True, False, True, False]]
pd_iloc9 = pd_flights.iloc[[True, False, True, False]]
ed_iloc0 = ed_flights.iloc[0]
ed_iloc1 = ed_flights.iloc[[0]]
ed_iloc2 = ed_flights.iloc[[0, 1]]
ed_iloc3 = ed_flights.iloc[:3]
ed_iloc4 = ed_flights.iloc[[True, False, True]]
ed_iloc5 = ed_flights.iloc[0, 1]
ed_iloc6 = ed_flights.iloc[[0, 2], [1, 3]]
ed_iloc7 = ed_flights.iloc[1:3, 0:3]
ed_iloc8 = ed_flights.iloc[:, [True, False, True, False]]
ed_iloc9 = ed_flights.iloc[[True, False, True, False]]
#assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series
assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1)
assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2)
assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3)
assert_pandas_eland_frame_equal(pd_iloc4, ed_iloc4)
#assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool
assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6)
assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7)
assert_pandas_eland_frame_equal(pd_iloc8, ed_iloc8)
assert_pandas_eland_frame_equal(pd_iloc9, ed_iloc9)

View File

@ -1,31 +0,0 @@
# File called _pytest for PyCharm compatibility
import gzip
import pandas as pd
import eland as ed
from eland.tests.common import TestData
class TestDataFrameReviews(TestData):
def test_explore(self):
ed_reviews = ed.DataFrame('localhost', 'anonreviews')
print(ed_reviews.head())
print(ed_reviews.describe())
print(ed_reviews.info())
print(ed_reviews.hist(column="rating", bins=5))
# print(ed_reviews.head().info_es())
def test_review(self):
csv_handle = gzip.open('../anonreviews.csv.gz')
reviews = pd.read_csv(csv_handle)
reviews['date'] = pd.to_datetime(reviews['date'])
g = reviews.groupby('reviewerId')
print(g.describe())

View File

@ -3,20 +3,23 @@
import pandas as pd
from eland.tests.common import TestData
from eland.tests.common import ROOT_DIR
from pandas.util.testing import (assert_equal, assert_frame_equal)
import ast
class TestDataFrameToCSV(TestData):
def test_to_csv_head(self):
results_file = ROOT_DIR + '/dataframe/results/test_to_csv_head.csv'
ed_flights = self.ed_flights().head()
pd_flights = self.pd_flights().head()
ed_flights.to_csv('results/test_to_csv_head.csv')
ed_flights.to_csv(results_file)
# Converting back from csv is messy as pd_flights is created from a json file
pd_from_csv = pd.read_csv('results/test_to_csv_head.csv', index_col=0, converters={
pd_from_csv = pd.read_csv(results_file, index_col=0, converters={
'DestLocation': lambda x: ast.literal_eval(x),
'OriginLocation': lambda x: ast.literal_eval(x)})
pd_from_csv.index = pd_from_csv.index.map(str)
@ -25,13 +28,15 @@ class TestDataFrameToCSV(TestData):
assert_frame_equal(pd_flights, pd_from_csv)
def test_to_csv_full(self):
results_file = ROOT_DIR + '/dataframe/results/test_to_csv_full.csv'
# Test is slow as it's for the full dataset, but it is useful as it goes over 10000 docs
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
ed_flights.to_csv('results/test_to_csv_full.csv')
ed_flights.to_csv(results_file)
# Converting back from csv is messy as pd_flights is created from a json file
pd_from_csv = pd.read_csv('results/test_to_csv_full.csv', index_col=0, converters={
pd_from_csv = pd.read_csv(results_file, index_col=0, converters={
'DestLocation': lambda x: ast.literal_eval(x),
'OriginLocation': lambda x: ast.literal_eval(x)})
pd_from_csv.index = pd_from_csv.index.map(str)

View File

@ -7144,7 +7144,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
"version": "3.6.8"
}
},
"nbformat": 4,

View File

@ -1,5 +1,5 @@
# -*- coding: UTF-8 -*-
from eland.operators import *
from eland.filter import *
class TestOperators():
@ -21,11 +21,6 @@ class TestOperators():
'script': {'script': {'inline': 'doc["num1"].value > params.param1', 'params': {'param1': 5}}}}
assert IsIn('ids', [1, 2, 3]).build() == {'ids': {'values': [1, 2, 3]}}
def test_and_none(self):
exp = None
exp = exp & Less('b', 3)
print(exp.build())
def test_and_filter1(self):
exp = GreaterEqual('a', 2) & Less('b', 3)
assert exp.build() == {'bool': {'must': [{'range': {'a': {'gte': 2}}}, {'range': {'b': {'lt': 3}}}]}}

File diff suppressed because one or more lines are too long

View File

@ -7,7 +7,7 @@ def read_es(es_params, index_pattern):
return DataFrame(client=es_params, index_pattern=index_pattern)
def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False):
def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False, geo_points=None):
"""
Append a pandas DataFrame to an Elasticsearch index.
Mainly used in testing.
@ -30,10 +30,19 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
If table exists, drop it, recreate it, and insert data.
``'append'``
If table exists, insert data. Create if does not exist.
dropna : bool
``'True'``
Remove missing values (see pandas.Series.dropna)
``'False'``
Include missing values - may cause bulk to fail
geo_points : list or None
List of columns to map to geo_point data type
"""
client = Client(es_params)
mapping = Mappings._generate_es_mappings(df)
mapping = Mappings._generate_es_mappings(df, geo_points)
# If table exists, check if_exists parameter
if client.index_exists(index=destination_index):
@ -58,6 +67,10 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
for row in df.iterrows():
# Use index as _id
id = row[0]
if dropna:
values = row[1].dropna().to_dict()
else:
values = row[1].to_dict()
# Use integer as id field for repeatable results

View File

@ -1,8 +1,2 @@
elasticsearch==7.0.2
elasticsearch-dsl==7.0.0
numpy==1.16.4
pandas==0.24.2
python-dateutil==2.8.0
pytz==2019.1
six==1.12.0
urllib3==1.25.3
elasticsearch>=7.0.5
pandas==0.25.1

View File

@ -13,8 +13,7 @@ setup(name='eland',
license='ELASTIC LICENSE',
packages=['eland'],
install_requires=[
'elasticsearch',
'elasticsearch_dsl',
'pandas'
'elasticsearch>=7.0.5',
'pandas==0.25.1'
],
zip_safe=False)