mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Fixing tests, and upgrading to pandas 0.25.1
This commit is contained in:
parent
315d4c3287
commit
9dad8613d3
@ -463,7 +463,6 @@ class DataFrame(NDFrame):
|
||||
"quotechar": quotechar,
|
||||
"line_terminator": line_terminator,
|
||||
"chunksize": chunksize,
|
||||
"tupleize_cols": tupleize_cols,
|
||||
"date_format": date_format,
|
||||
"doublequote": doublequote,
|
||||
"escapechar": escapechar,
|
||||
@ -552,7 +551,7 @@ class DataFrame(NDFrame):
|
||||
|
||||
# currently we only support a subset of functions that aggregate columns.
|
||||
# ['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique']
|
||||
if isinstance(func, compat.string_types):
|
||||
if isinstance(func, str):
|
||||
# wrap in list
|
||||
func = [func]
|
||||
return self._query_compiler.aggs(func)
|
||||
|
@ -290,7 +290,7 @@ class Mappings:
|
||||
return es_dtype
|
||||
|
||||
@staticmethod
|
||||
def _generate_es_mappings(dataframe):
|
||||
def _generate_es_mappings(dataframe, geo_points=None):
|
||||
"""Given a pandas dataframe, generate the associated Elasticsearch mapping
|
||||
|
||||
Parameters
|
||||
@ -325,7 +325,10 @@ class Mappings:
|
||||
mappings = {}
|
||||
mappings['properties'] = {}
|
||||
for column_name, dtype in dataframe.dtypes.iteritems():
|
||||
es_dtype = Mappings._pd_dtype_to_es_dtype(dtype)
|
||||
if geo_points is not None and column_name in geo_points:
|
||||
es_dtype = 'geo_point'
|
||||
else:
|
||||
es_dtype = Mappings._pd_dtype_to_es_dtype(dtype)
|
||||
|
||||
mappings['properties'][column_name] = {}
|
||||
mappings['properties'][column_name]['type'] = es_dtype
|
||||
|
@ -3,13 +3,14 @@ import numpy as np
|
||||
import pandas.core.common as com
|
||||
from pandas.core.dtypes.generic import (
|
||||
ABCIndexClass)
|
||||
from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots
|
||||
|
||||
|
||||
def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
|
||||
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
|
||||
sharey=False, figsize=None, layout=None, bins=10, **kwds):
|
||||
"""
|
||||
Derived from pandas.plotting._core.hist_frame 0.24.2
|
||||
Derived from pandas.plotting._core.hist_frame 0.24.2 - TODO update to 0.25.1
|
||||
|
||||
Ideally, we'd call hist_frame directly with histogram data,
|
||||
but weights are applied to ALL series. For example, we can
|
||||
@ -29,8 +30,6 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
|
||||
# Start with empty pandas data frame derived from
|
||||
ed_df_bins, ed_df_weights = ed_df._hist(num_bins=bins)
|
||||
|
||||
_raise_if_no_mpl()
|
||||
_converter._WARN = False
|
||||
if by is not None:
|
||||
raise NotImplementedError("TODO")
|
||||
"""
|
||||
|
@ -24,31 +24,22 @@ class TestDataFrameiLoc(TestData):
|
||||
pd_iloc1= pd_flights.iloc[[0]]
|
||||
pd_iloc2= pd_flights.iloc[[0, 1]]
|
||||
pd_iloc3 = pd_flights.iloc[:3]
|
||||
pd_iloc4 = pd_flights.iloc[[True, False, True]]
|
||||
pd_iloc5 = pd_flights.iloc[0, 1]
|
||||
pd_iloc6 = pd_flights.iloc[[0, 2], [1, 3]]
|
||||
pd_iloc7 = pd_flights.iloc[1:3, 0:3]
|
||||
pd_iloc8 = pd_flights.iloc[:, [True, False, True, False]]
|
||||
pd_iloc9 = pd_flights.iloc[[True, False, True, False]]
|
||||
|
||||
ed_iloc0 = ed_flights.iloc[0]
|
||||
ed_iloc1 = ed_flights.iloc[[0]]
|
||||
ed_iloc2 = ed_flights.iloc[[0, 1]]
|
||||
ed_iloc3 = ed_flights.iloc[:3]
|
||||
ed_iloc4 = ed_flights.iloc[[True, False, True]]
|
||||
ed_iloc5 = ed_flights.iloc[0, 1]
|
||||
ed_iloc6 = ed_flights.iloc[[0, 2], [1, 3]]
|
||||
ed_iloc7 = ed_flights.iloc[1:3, 0:3]
|
||||
ed_iloc8 = ed_flights.iloc[:, [True, False, True, False]]
|
||||
ed_iloc9 = ed_flights.iloc[[True, False, True, False]]
|
||||
|
||||
#assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series
|
||||
assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1)
|
||||
assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2)
|
||||
assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3)
|
||||
assert_pandas_eland_frame_equal(pd_iloc4, ed_iloc4)
|
||||
#assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool
|
||||
assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6)
|
||||
assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7)
|
||||
assert_pandas_eland_frame_equal(pd_iloc8, ed_iloc8)
|
||||
assert_pandas_eland_frame_equal(pd_iloc9, ed_iloc9)
|
||||
|
@ -1,31 +0,0 @@
|
||||
# File called _pytest for PyCharm compatibility
|
||||
|
||||
import gzip
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import eland as ed
|
||||
from eland.tests.common import TestData
|
||||
|
||||
|
||||
class TestDataFrameReviews(TestData):
|
||||
|
||||
def test_explore(self):
|
||||
ed_reviews = ed.DataFrame('localhost', 'anonreviews')
|
||||
|
||||
print(ed_reviews.head())
|
||||
print(ed_reviews.describe())
|
||||
print(ed_reviews.info())
|
||||
print(ed_reviews.hist(column="rating", bins=5))
|
||||
# print(ed_reviews.head().info_es())
|
||||
|
||||
def test_review(self):
|
||||
csv_handle = gzip.open('../anonreviews.csv.gz')
|
||||
|
||||
reviews = pd.read_csv(csv_handle)
|
||||
|
||||
reviews['date'] = pd.to_datetime(reviews['date'])
|
||||
|
||||
g = reviews.groupby('reviewerId')
|
||||
|
||||
print(g.describe())
|
@ -3,20 +3,23 @@
|
||||
import pandas as pd
|
||||
|
||||
from eland.tests.common import TestData
|
||||
from eland.tests.common import ROOT_DIR
|
||||
|
||||
from pandas.util.testing import (assert_equal, assert_frame_equal)
|
||||
|
||||
import ast
|
||||
|
||||
|
||||
class TestDataFrameToCSV(TestData):
|
||||
|
||||
def test_to_csv_head(self):
|
||||
results_file = ROOT_DIR + '/dataframe/results/test_to_csv_head.csv'
|
||||
|
||||
ed_flights = self.ed_flights().head()
|
||||
pd_flights = self.pd_flights().head()
|
||||
|
||||
ed_flights.to_csv('results/test_to_csv_head.csv')
|
||||
ed_flights.to_csv(results_file)
|
||||
# Converting back from csv is messy as pd_flights is created from a json file
|
||||
pd_from_csv = pd.read_csv('results/test_to_csv_head.csv', index_col=0, converters={
|
||||
pd_from_csv = pd.read_csv(results_file, index_col=0, converters={
|
||||
'DestLocation': lambda x: ast.literal_eval(x),
|
||||
'OriginLocation': lambda x: ast.literal_eval(x)})
|
||||
pd_from_csv.index = pd_from_csv.index.map(str)
|
||||
@ -25,13 +28,15 @@ class TestDataFrameToCSV(TestData):
|
||||
assert_frame_equal(pd_flights, pd_from_csv)
|
||||
|
||||
def test_to_csv_full(self):
|
||||
results_file = ROOT_DIR + '/dataframe/results/test_to_csv_full.csv'
|
||||
|
||||
# Test is slow as it's for the full dataset, but it is useful as it goes over 10000 docs
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
ed_flights.to_csv('results/test_to_csv_full.csv')
|
||||
ed_flights.to_csv(results_file)
|
||||
# Converting back from csv is messy as pd_flights is created from a json file
|
||||
pd_from_csv = pd.read_csv('results/test_to_csv_full.csv', index_col=0, converters={
|
||||
pd_from_csv = pd.read_csv(results_file, index_col=0, converters={
|
||||
'DestLocation': lambda x: ast.literal_eval(x),
|
||||
'OriginLocation': lambda x: ast.literal_eval(x)})
|
||||
pd_from_csv.index = pd_from_csv.index.map(str)
|
||||
|
@ -7144,7 +7144,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9"
|
||||
"version": "3.6.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -1,5 +1,5 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
from eland.operators import *
|
||||
from eland.filter import *
|
||||
|
||||
|
||||
class TestOperators():
|
||||
@ -21,11 +21,6 @@ class TestOperators():
|
||||
'script': {'script': {'inline': 'doc["num1"].value > params.param1', 'params': {'param1': 5}}}}
|
||||
assert IsIn('ids', [1, 2, 3]).build() == {'ids': {'values': [1, 2, 3]}}
|
||||
|
||||
def test_and_none(self):
|
||||
exp = None
|
||||
exp = exp & Less('b', 3)
|
||||
print(exp.build())
|
||||
|
||||
def test_and_filter1(self):
|
||||
exp = GreaterEqual('a', 2) & Less('b', 3)
|
||||
assert exp.build() == {'bool': {'must': [{'range': {'a': {'gte': 2}}}, {'range': {'b': {'lt': 3}}}]}}
|
||||
|
File diff suppressed because one or more lines are too long
@ -7,7 +7,7 @@ def read_es(es_params, index_pattern):
|
||||
return DataFrame(client=es_params, index_pattern=index_pattern)
|
||||
|
||||
|
||||
def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False):
|
||||
def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False, geo_points=None):
|
||||
"""
|
||||
Append a pandas DataFrame to an Elasticsearch index.
|
||||
Mainly used in testing.
|
||||
@ -30,10 +30,19 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
|
||||
If table exists, drop it, recreate it, and insert data.
|
||||
``'append'``
|
||||
If table exists, insert data. Create if does not exist.
|
||||
|
||||
dropna : bool
|
||||
``'True'``
|
||||
Remove missing values (see pandas.Series.dropna)
|
||||
``'False'``
|
||||
Include missing values - may cause bulk to fail
|
||||
|
||||
geo_points : list or None
|
||||
List of columns to map to geo_point data type
|
||||
"""
|
||||
client = Client(es_params)
|
||||
|
||||
mapping = Mappings._generate_es_mappings(df)
|
||||
mapping = Mappings._generate_es_mappings(df, geo_points)
|
||||
|
||||
# If table exists, check if_exists parameter
|
||||
if client.index_exists(index=destination_index):
|
||||
@ -58,7 +67,11 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
|
||||
for row in df.iterrows():
|
||||
# Use index as _id
|
||||
id = row[0]
|
||||
values = row[1].to_dict()
|
||||
|
||||
if dropna:
|
||||
values = row[1].dropna().to_dict()
|
||||
else:
|
||||
values = row[1].to_dict()
|
||||
|
||||
# Use integer as id field for repeatable results
|
||||
action = {'_index': destination_index, '_source': values, '_id': str(id)}
|
||||
|
@ -1,8 +1,2 @@
|
||||
elasticsearch==7.0.2
|
||||
elasticsearch-dsl==7.0.0
|
||||
numpy==1.16.4
|
||||
pandas==0.24.2
|
||||
python-dateutil==2.8.0
|
||||
pytz==2019.1
|
||||
six==1.12.0
|
||||
urllib3==1.25.3
|
||||
elasticsearch>=7.0.5
|
||||
pandas==0.25.1
|
||||
|
Loading…
x
Reference in New Issue
Block a user