Checkpoint code before attempting major investigation into using modin

This commit is contained in:
Stephen Dodson 2019-07-03 09:49:58 +00:00
parent 30df901fce
commit 5e10b2e818
10 changed files with 281 additions and 44 deletions

View File

@ -1,7 +1,7 @@
from .utils import *
from .client import * from .client import *
from .ndframe import * from .ndframe import *
from .index import * from .index import *
from .mappings import * from .mappings import *
from .dataframe import *
from .series import * from .series import *
from .dataframe import *
from .utils import *

View File

@ -1,4 +1,5 @@
from elasticsearch import Elasticsearch from elasticsearch import Elasticsearch
from elasticsearch import helpers
class Client(): class Client():
""" """
@ -17,7 +18,13 @@ class Client():
def indices(self): def indices(self):
return self.es.indices return self.es.indices
def bulk(self, actions, refresh=False):
    """Forward ``actions`` to ``elasticsearch.helpers.bulk`` using this client's connection.

    Parameters
    ----------
    actions : iterable
        Bulk actions, passed through unchanged.
    refresh : bool, default False
        Passed through as the ``refresh`` keyword of ``helpers.bulk``.

    Returns
    -------
    Whatever ``helpers.bulk`` returns.
    """
    return helpers.bulk(self.es, actions, refresh=refresh)
def scan(self, **kwargs):
    """Forward ``kwargs`` to ``elasticsearch.helpers.scan`` using this client's connection.

    Returns
    -------
    Whatever ``helpers.scan`` returns.
    """
    return helpers.scan(self.es, **kwargs)
def search(self, **kwargs): def search(self, **kwargs):
return self.es.search(**kwargs) return self.es.search(**kwargs)

View File

@ -32,9 +32,14 @@ from pandas.io.formats.printing import pprint_thing
from pandas.compat import StringIO from pandas.compat import StringIO
from pandas.io.common import _expand_user, _stringify_path from pandas.io.common import _expand_user, _stringify_path
from pandas.io.formats import console from pandas.io.formats import console
from pandas.core import common as com
from eland import NDFrame from eland import NDFrame
from eland import Index from eland import Index
from eland import Series
class DataFrame(NDFrame): class DataFrame(NDFrame):
@ -217,10 +222,6 @@ class DataFrame(NDFrame):
return num_rows, num_columns return num_rows, num_columns
@property
def columns(self):
return super()._columns
def set_index(self, index_field): def set_index(self, index_field):
copy = self.copy() copy = self.copy()
copy._index = Index(index_field) copy._index = Index(index_field)
@ -265,7 +266,6 @@ class DataFrame(NDFrame):
def __getitem__(self, key): def __getitem__(self, key):
# NOTE: there is a difference between pandas here. # NOTE: there is a difference between pandas here.
# e.g. df['a'] returns pd.Series, df[['a','b']] return pd.DataFrame # e.g. df['a'] returns pd.Series, df[['a','b']] return pd.DataFrame
# we always return DataFrame - TODO maybe create eland.Series at some point...
# Implementation mainly copied from pandas v0.24.2 # Implementation mainly copied from pandas v0.24.2
# (https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html) # (https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html)
@ -291,10 +291,12 @@ class DataFrame(NDFrame):
# We are left with two options: a single key, and a collection of keys, # We are left with two options: a single key, and a collection of keys,
columns = [] columns = []
is_single_key = False
if isinstance(key, str): if isinstance(key, str):
if not self._mappings.is_source_field(key): if not self._mappings.is_source_field(key):
raise TypeError('Column does not exist: [{0}]'.format(key)) raise TypeError('Column does not exist: [{0}]'.format(key))
columns.append(key) columns.append(key)
is_single_key = True
elif isinstance(key, list): elif isinstance(key, list):
columns.extend(key) columns.extend(key)
else: else:
@ -303,7 +305,18 @@ class DataFrame(NDFrame):
mappings = self._filter_mappings(columns) mappings = self._filter_mappings(columns)
# Return new eland.DataFrame with modified mappings # Return new eland.DataFrame with modified mappings
return DataFrame(self._client, self._index_pattern, mappings=mappings) if is_single_key:
return Series(self._client, self._index_pattern, mappings=mappings)
else:
return DataFrame(self._client, self._index_pattern, mappings=mappings)
def __getattr__(self, name):
# Note: obj.x will always call obj.__getattribute__('x') prior to
# calling obj.__getattr__('x').
mappings = self._filter_mappings([name])
return Series(self._client, self._index_pattern, mappings=mappings)
def copy(self): def copy(self):
# TODO - test and validate...may need deep copying # TODO - test and validate...may need deep copying
@ -373,7 +386,8 @@ class DataFrame(NDFrame):
result = _buf.getvalue() result = _buf.getvalue()
return result return result
def to_pandas(self):
    """Return the data behind this frame as a pandas.DataFrame.

    Delegates to the parent class's ``_to_pandas``.

    Returns
    -------
    Whatever the parent's ``_to_pandas`` returns.
    """
    return super()._to_pandas()
# From pandas.DataFrame # From pandas.DataFrame
def _put_str(s, space): def _put_str(s, space):

View File

@ -2,6 +2,7 @@ import warnings
import pandas as pd import pandas as pd
from pandas.core.dtypes.common import (is_float_dtype, is_bool_dtype, is_integer_dtype, is_datetime_or_timedelta_dtype, is_string_dtype)
class Mappings(): class Mappings():
""" """
@ -217,6 +218,7 @@ class Mappings():
return capability_matrix_df.sort_index() return capability_matrix_df.sort_index()
@staticmethod
def _es_dtype_to_pd_dtype(es_dtype): def _es_dtype_to_pd_dtype(es_dtype):
""" """
Mapping Elasticsearch types to pandas dtypes Mapping Elasticsearch types to pandas dtypes
@ -259,6 +261,84 @@ class Mappings():
# Return 'object' for all unsupported TODO - investigate how different types could be supported # Return 'object' for all unsupported TODO - investigate how different types could be supported
return 'object' return 'object'
@staticmethod
def _pd_dtype_to_es_dtype(pd_dtype):
"""
Mapping pandas dtypes to Elasticsearch dtype
--------------------------------------------
```
Pandas dtype Python type NumPy type Usage
object str string_, unicode_ Text
int64 int int_, int8, int16, int32, int64, uint8, uint16, uint32, uint64 Integer numbers
float64 float float_, float16, float32, float64 Floating point numbers
bool bool bool_ True/False values
datetime64 NA datetime64[ns] Date and time values
timedelta[ns] NA NA Differences between two datetimes
category NA NA Finite list of text values
```
"""
es_dtype = None
# Map all to 64-bit - TODO map to specifics: int32 -> int etc.
if is_float_dtype(pd_dtype):
es_dtype = 'double'
elif is_integer_dtype(pd_dtype):
es_dtype = 'long'
elif is_bool_dtype(pd_dtype):
es_dtype = 'boolean'
elif is_string_dtype(pd_dtype):
es_dtype = 'keyword'
elif is_datetime_or_timedelta_dtype(pd_dtype):
es_dtype = 'date'
else:
warnings.warn('No mapping for pd_dtype: [{0}], using default mapping'.format(pd_dtype))
return es_dtype
@staticmethod
def _generate_es_mappings(dataframe):
    """Given a pandas dataframe, generate the associated Elasticsearch mapping

    Parameters
    ----------
    dataframe : pandas.DataFrame
        pandas.DataFrame to create schema from

    Returns
    -------
    mapping : dict
        One ``properties`` entry per column, in column order, e.g.::

            {"mappings": {
                "properties": {
                    "AvgTicketPrice": {"type": "float"},
                    "Cancelled": {"type": "boolean"},
                    "Carrier": {"type": "keyword"},
                    "Dest": {"type": "keyword"}
                }
            }}

        Each ``type`` is whatever ``Mappings._pd_dtype_to_es_dtype`` returns
        for the column's dtype.
    """
    # Series.items() is available on every pandas version; the older
    # iteritems() alias was removed in pandas 2.0.
    properties = {
        column_name: {'type': Mappings._pd_dtype_to_es_dtype(dtype)}
        for column_name, dtype in dataframe.dtypes.items()
    }

    return {"mappings": {"properties": properties}}
def all_fields(self): def all_fields(self):
""" """
Returns Returns
@ -379,3 +459,14 @@ class Mappings():
""" """
return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True].groupby('pd_dtype')[ return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True].groupby('pd_dtype')[
'_source'].count().to_dict()) '_source'].count().to_dict())
def to_pandas(self):
    """
    Returns
    -------
    df : pd.DataFrame
        pandas DataFrame representing this index
    """
    # NOTE(review): only the docstring is visible for this method, so as shown
    # it returns None - confirm whether the implementation is still pending.

View File

@ -23,10 +23,14 @@ only Elasticsearch aggregatable fields can be aggregated or grouped.
""" """
import pandas as pd import pandas as pd
import functools
from elasticsearch_dsl import Search from elasticsearch_dsl import Search
import eland as ed import eland as ed
from pandas.core.generic import NDFrame as pd_NDFrame
from pandas._libs import Timestamp, iNaT, properties
class NDFrame(): class NDFrame():
""" """
@ -44,7 +48,6 @@ class NDFrame():
-------- --------
""" """
def __init__(self, def __init__(self,
client, client,
index_pattern, index_pattern,
@ -191,7 +194,12 @@ class NDFrame():
rows = [] rows = []
index = [] index = []
for hit in results['hits']['hits']: if isinstance(results, dict):
iterator = results['hits']['hits']
else:
iterator = results
for hit in iterator:
row = hit['_source'] row = hit['_source']
# get index value - can be _id or can be field value in source # get index value - can be _id or can be field value in source
@ -255,6 +263,23 @@ class NDFrame():
# reverse order (index ascending) # reverse order (index ascending)
return df.sort_index() return df.sort_index()
def _to_pandas(self):
"""
Protected method that returns all data as pandas.DataFrame.
Returns
-------
df
pandas.DataFrame of all values
"""
sort_params = self._index.sort_field + ":asc"
results = self._client.scan(index=self._index_pattern)
# We sort here rather than in scan - once everything is in core this
# should be faster
return self._es_results_to_pandas(results)
def _describe(self): def _describe(self):
numeric_source_fields = self._mappings.numeric_source_fields() numeric_source_fields = self._mappings.numeric_source_fields()
@ -294,6 +319,10 @@ class NDFrame():
return mappings return mappings
@property
def columns(self):
    """Return this object's ``_columns`` attribute."""
    return self._columns
@property @property
def index(self): def index(self):
return self._index return self._index
@ -309,7 +338,6 @@ class NDFrame():
def get_dtype_counts(self): def get_dtype_counts(self):
return self._mappings.get_dtype_counts() return self._mappings.get_dtype_counts()
def _index_count(self): def _index_count(self):
""" """
Returns Returns

View File

@ -72,14 +72,17 @@ class Series(NDFrame):
def __init__(self, def __init__(self,
client, client,
index_pattern, index_pattern,
field_name, field_name=None,
mappings=None, mappings=None,
index_field=None): index_field=None):
# python 3 syntax # python 3 syntax
super().__init__(client, index_pattern, mappings=mappings, index_field=index_field) super().__init__(client, index_pattern, mappings=mappings, index_field=index_field)
# now select column (field_name) # now select column (field_name)
self._mappings = self._filter_mappings([field_name]) if field_name is not None:
self._mappings = self._filter_mappings([field_name])
elif len(self._mappings.source_fields()) != 1:
raise TypeError('Series must have 1 field: [{0}]'.format(len(self._mappings.source_fields())))
def head(self, n=5): def head(self, n=5):
return self._df_to_series(super()._head(n)) return self._df_to_series(super()._head(n))
@ -199,6 +202,10 @@ class Series(NDFrame):
fmt.buffer_put_lines(buf, lines) fmt.buffer_put_lines(buf, lines)
@property
def name(self):
    """Name of the Series: the first entry of the mappings' source fields.

    Raises IndexError if the mappings hold no source fields.
    """
    return list(self._mappings.source_fields())[0]
@property @property
def shape(self): def shape(self):
""" """
@ -257,7 +264,7 @@ class Series(NDFrame):
return super()._describe() return super()._describe()
def _df_to_series(self, df): def _df_to_series(self, df):
return df.iloc[:, 0] return df[self.name]
# ---------------------------------------------------------------------- # ----------------------------------------------------------------------
# Rendering Methods # Rendering Methods
@ -269,8 +276,8 @@ class Series(NDFrame):
max_rows = pd.get_option("display.max_rows") max_rows = pd.get_option("display.max_rows")
self.to_string(buf=buf, na_rep='NaN', float_format=None, header=True, index=True, length=False, self.to_string(buf=buf, na_rep='NaN', float_format=None, header=True, index=True, length=True,
dtype=False, name=False, max_rows=max_rows) dtype=True, name=True, max_rows=max_rows)
return buf.getvalue() return buf.getvalue()
@ -279,7 +286,7 @@ class Series(NDFrame):
index=True, length=True, dtype=True, index=True, length=True, dtype=True,
name=True, max_rows=None): name=True, max_rows=None):
""" """
From pandas From pandas 0.24.2
Render a string representation of the Series. Render a string representation of the Series.
@ -343,7 +350,6 @@ class Series(NDFrame):
""" """
A hacked overridden version of pandas.io.formats.SeriesFormatter that writes correct length A hacked overridden version of pandas.io.formats.SeriesFormatter that writes correct length
""" """
def __init__(self, series, series_length, buf=None, length=True, header=True, index=True, def __init__(self, series, series_length, buf=None, length=True, header=True, index=True,
na_rep='NaN', name=False, float_format=None, dtype=True, na_rep='NaN', name=False, float_format=None, dtype=True,
max_rows=None): max_rows=None):

View File

@ -1,5 +1,6 @@
# File called _pytest for PyCharm compatibility # File called _pytest for PyCharm compatibility
import numpy as np
from pandas.util.testing import ( from pandas.util.testing import (
assert_series_equal, assert_frame_equal) assert_series_equal, assert_frame_equal)
@ -88,3 +89,36 @@ class TestMapping(TestData):
assert 'object' == field_capabilities['pd_dtype'] assert 'object' == field_capabilities['pd_dtype']
assert True == field_capabilities['searchable'] assert True == field_capabilities['searchable']
assert True == field_capabilities['aggregatable'] assert True == field_capabilities['aggregatable']
def test_generate_es_mappings(self):
    """Check the mapping generated for a mixed-dtype frame, then round-trip it through Elasticsearch."""
    # One column per dtype family handled by the mapping generator:
    # float, int, string (object), datetime and bool.
    df = pd.DataFrame(data={'A': np.random.rand(3),
                            'B': 1,
                            'C': 'foo',
                            'D': pd.Timestamp('20190102'),
                            'E': [1.0, 2.0, 3.0],
                            'F': False,
                            'G': [1, 2, 3]},
                      index=['0','1','2'])

    expected_mappings = {'mappings': {
        'properties': {'A': {'type': 'double'},
                       'B': {'type': 'long'},
                       'C': {'type': 'keyword'},
                       'D': {'type': 'date'},
                       'E': {'type': 'double'},
                       'F': {'type': 'boolean'},
                       'G': {'type': 'long'}}}}

    mappings = ed.Mappings._generate_es_mappings(df)

    assert expected_mappings == mappings

    # Now create index
    index_name = 'eland_test_generate_es_mappings'

    # refresh=True is forwarded to the bulk call so the head() below can see the rows.
    ed.pandas_to_es(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)

    ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name)
    ed_df_head = ed_df.head()

    assert_frame_equal(df, ed_df_head)

View File

@ -153,3 +153,7 @@ class TestDataFrameBasics(TestData):
ed_flights_timestamp.info() ed_flights_timestamp.info()
ed_flights.info() ed_flights.info()
def test_to_pandas(self):
    """to_pandas() on the eland ecommerce frame must equal the reference pandas frame."""
    ed_ecommerce_pd_df = self.ed_ecommerce().to_pandas()

    assert_frame_equal(self.pd_ecommerce(), ed_ecommerce_pd_df)

View File

@ -15,7 +15,7 @@ class TestDataFrameGetItem(TestData):
ed_carrier = self.ed_flights()['Carrier'] ed_carrier = self.ed_flights()['Carrier']
# pandas returns a Series here # pandas returns a Series here
assert_frame_equal(pd.DataFrame(pd_carrier.head(100)), ed_carrier.head(100)) assert_series_equal(pd_carrier.head(100), ed_carrier.head(100))
pd_3_items = self.pd_flights()[['Dest','Carrier','FlightDelay']] pd_3_items = self.pd_flights()[['Dest','Carrier','FlightDelay']]
ed_3_items = self.ed_flights()[['Dest','Carrier','FlightDelay']] ed_3_items = self.ed_flights()[['Dest','Carrier','FlightDelay']]
@ -36,28 +36,12 @@ class TestDataFrameGetItem(TestData):
def test_getattr_basic(self): def test_getattr_basic(self):
# Test 1 attribute # Test 1 attribute
pd_carrier = self.pd_flights().Carrier pd_carrier = self.pd_flights().Carrier
#ed_carrier = self.ed_flights().Carrier ed_carrier = self.ed_flights().Carrier
print(type(pd_carrier)) assert_series_equal(pd_carrier.head(100), ed_carrier.head(100))
print(pd_carrier)
def test_boolean(self): pd_avgticketprice = self.pd_flights().AvgTicketPrice
# Test 1 attribute ed_avgticketprice = self.ed_flights().AvgTicketPrice
pd_carrier = self.pd_flights()['Carrier == "Kibana Airlines"']
#ed_carrier = self.ed_flights().Carrier
print(type(pd_carrier)) assert_series_equal(pd_avgticketprice.head(100), ed_avgticketprice.head(100))
print(pd_carrier)
def test_loc(self):
pd = self.pd_flights().loc[10:15, ['Dest', 'Carrier']]
print(type(pd))
print(pd)
pd = self.pd_flights().loc[10]
print(type(pd))
print(pd)

View File

@ -1,4 +1,73 @@
import eland as ed from eland import Client
from eland import DataFrame
from eland import Mappings
def read_es(es_params, index_pattern): def read_es(es_params, index_pattern):
return ed.DataFrame(client=es_params, index_pattern=index_pattern) return DataFrame(client=es_params, index_pattern=index_pattern)
def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False):
    """
    Append a pandas DataFrame to an Elasticsearch index.
    Mainly used in testing.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to index. Each row becomes one document whose ``_id`` is the
        row's index label converted with ``str``.
    es_params : Elasticsearch client argument
        elasticsearch-py parameters or
        elasticsearch-py instance or
        eland.Client instance
    destination_index : str
        Name of Elasticsearch index to be written
    if_exists : str, default 'fail'
        Behavior when the destination index exists. Value can be one of:
        ``'fail'``
            If index exists, raise ValueError.
        ``'replace'``
            If index exists, drop it, recreate it, and insert data.
        ``'append'``
            If index exists, insert data (the existing mapping is not
            validated). Create if does not exist.
        Any other value currently behaves like ``'append'``.
    chunk_size : int, default 10000
        Number of documents sent per bulk request.
    refresh : bool, default False
        Passed to every bulk request.

    Raises
    ------
    ValueError
        If the index already exists and ``if_exists`` is ``'fail'``.
    """
    client = Client(es_params)

    mapping = Mappings._generate_es_mappings(df)

    # If index exists, check if_exists parameter
    if client.indices().exists(destination_index):
        if if_exists == "fail":
            raise ValueError(
                "Could not create the index [{0}] because it "
                "already exists. "
                "Change the if_exists parameter to "
                "'append' or 'replace' data.".format(destination_index)
            )
        elif if_exists == "replace":
            client.indices().delete(destination_index)
            client.indices().create(destination_index, mapping)
        # elif if_exists == "append":
        # TODO validate mapping is compatible
    else:
        client.indices().create(destination_index, mapping)

    # Now add data, flushing a bulk request every chunk_size documents
    actions = []
    for doc_id, row in df.iterrows():
        # Use the DataFrame index label as _id for repeatable results
        actions.append({'_index': destination_index, '_source': row.to_dict(), '_id': str(doc_id)})

        if len(actions) >= chunk_size:
            client.bulk(actions, refresh=refresh)
            actions = []

    # Send the final partial chunk; skip the call when nothing is left over
    if actions:
        client.bulk(actions, refresh=refresh)