mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Checkpoint code before attempting major investigation into using modin
This commit is contained in:
parent
30df901fce
commit
5e10b2e818
@ -1,7 +1,7 @@
|
|||||||
from .utils import *
|
|
||||||
from .client import *
|
from .client import *
|
||||||
from .ndframe import *
|
from .ndframe import *
|
||||||
from .index import *
|
from .index import *
|
||||||
from .mappings import *
|
from .mappings import *
|
||||||
from .dataframe import *
|
|
||||||
from .series import *
|
from .series import *
|
||||||
|
from .dataframe import *
|
||||||
|
from .utils import *
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
from elasticsearch import Elasticsearch
|
from elasticsearch import Elasticsearch
|
||||||
|
from elasticsearch import helpers
|
||||||
|
|
||||||
class Client():
|
class Client():
|
||||||
"""
|
"""
|
||||||
@ -17,7 +18,13 @@ class Client():
|
|||||||
|
|
||||||
def indices(self):
|
def indices(self):
|
||||||
return self.es.indices
|
return self.es.indices
|
||||||
|
|
||||||
|
def bulk(self, actions, refresh=False):
    """Send a batch of actions through ``elasticsearch.helpers.bulk``.

    Parameters
    ----------
    actions : iterable
        Bulk actions, forwarded unchanged to the helper.
    refresh : bool, default False
        Forwarded to the helper as the ``refresh`` keyword argument.

    Returns
    -------
    Whatever ``helpers.bulk`` returns.
    """
    outcome = helpers.bulk(self.es, actions, refresh=refresh)
    return outcome
|
||||||
|
|
||||||
|
def scan(self, **kwargs):
    """Run ``elasticsearch.helpers.scan`` against the wrapped client.

    Parameters
    ----------
    **kwargs
        Forwarded unchanged to ``helpers.scan``.

    Returns
    -------
    Whatever ``helpers.scan`` returns.
    """
    hits = helpers.scan(self.es, **kwargs)
    return hits
|
||||||
|
|
||||||
def search(self, **kwargs):
|
def search(self, **kwargs):
|
||||||
return self.es.search(**kwargs)
|
return self.es.search(**kwargs)
|
||||||
|
|
||||||
|
@ -32,9 +32,14 @@ from pandas.io.formats.printing import pprint_thing
|
|||||||
from pandas.compat import StringIO
|
from pandas.compat import StringIO
|
||||||
from pandas.io.common import _expand_user, _stringify_path
|
from pandas.io.common import _expand_user, _stringify_path
|
||||||
from pandas.io.formats import console
|
from pandas.io.formats import console
|
||||||
|
from pandas.core import common as com
|
||||||
|
|
||||||
from eland import NDFrame
|
from eland import NDFrame
|
||||||
from eland import Index
|
from eland import Index
|
||||||
|
from eland import Series
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class DataFrame(NDFrame):
|
class DataFrame(NDFrame):
|
||||||
@ -217,10 +222,6 @@ class DataFrame(NDFrame):
|
|||||||
|
|
||||||
return num_rows, num_columns
|
return num_rows, num_columns
|
||||||
|
|
||||||
@property
|
|
||||||
def columns(self):
|
|
||||||
return super()._columns
|
|
||||||
|
|
||||||
def set_index(self, index_field):
|
def set_index(self, index_field):
|
||||||
copy = self.copy()
|
copy = self.copy()
|
||||||
copy._index = Index(index_field)
|
copy._index = Index(index_field)
|
||||||
@ -265,7 +266,6 @@ class DataFrame(NDFrame):
|
|||||||
def __getitem__(self, key):
|
def __getitem__(self, key):
|
||||||
# NOTE: there is a difference between pandas here.
|
# NOTE: there is a difference between pandas here.
|
||||||
# e.g. df['a'] returns pd.Series, df[['a','b']] returns pd.DataFrame
|
# e.g. df['a'] returns pd.Series, df[['a','b']] returns pd.DataFrame
|
||||||
# we always return DataFrame - TODO maybe create eland.Series at some point...
|
|
||||||
|
|
||||||
# Implementation mainly copied from pandas v0.24.2
|
# Implementation mainly copied from pandas v0.24.2
|
||||||
# (https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html)
|
# (https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html)
|
||||||
@ -291,10 +291,12 @@ class DataFrame(NDFrame):
|
|||||||
|
|
||||||
# We are left with two options: a single key, and a collection of keys,
|
# We are left with two options: a single key, and a collection of keys,
|
||||||
columns = []
|
columns = []
|
||||||
|
is_single_key = False
|
||||||
if isinstance(key, str):
|
if isinstance(key, str):
|
||||||
if not self._mappings.is_source_field(key):
|
if not self._mappings.is_source_field(key):
|
||||||
raise TypeError('Column does not exist: [{0}]'.format(key))
|
raise TypeError('Column does not exist: [{0}]'.format(key))
|
||||||
columns.append(key)
|
columns.append(key)
|
||||||
|
is_single_key = True
|
||||||
elif isinstance(key, list):
|
elif isinstance(key, list):
|
||||||
columns.extend(key)
|
columns.extend(key)
|
||||||
else:
|
else:
|
||||||
@ -303,7 +305,18 @@ class DataFrame(NDFrame):
|
|||||||
mappings = self._filter_mappings(columns)
|
mappings = self._filter_mappings(columns)
|
||||||
|
|
||||||
# Return new eland.DataFrame with modified mappings
|
# Return new eland.DataFrame with modified mappings
|
||||||
return DataFrame(self._client, self._index_pattern, mappings=mappings)
|
if is_single_key:
|
||||||
|
return Series(self._client, self._index_pattern, mappings=mappings)
|
||||||
|
else:
|
||||||
|
return DataFrame(self._client, self._index_pattern, mappings=mappings)
|
||||||
|
|
||||||
|
|
||||||
|
def __getattr__(self, name):
    """Expose a source field as an attribute, e.g. ``df.Carrier``.

    Note: obj.x will always call obj.__getattribute__('x') prior to
    calling obj.__getattr__('x'), so real attributes and methods always
    take precedence over field names.

    Parameters
    ----------
    name : str
        Attribute being looked up.

    Returns
    -------
    Series
        Series built from the mappings filtered down to ``name``.

    Raises
    ------
    AttributeError
        If ``name`` starts with an underscore or is not a source field.
        Such fields remain reachable through ``df[name]``.
    """
    # __getattr__ must signal a miss with AttributeError so that hasattr(),
    # copy/pickle probes and similar introspection keep working. Rejecting
    # underscore names first also avoids infinite recursion when
    # ``_mappings`` itself has not been set on the instance yet.
    if name.startswith('_') or not self._mappings.is_source_field(name):
        raise AttributeError(
            "'{0}' object has no attribute '{1}'".format(type(self).__name__, name))

    mappings = self._filter_mappings([name])

    return Series(self._client, self._index_pattern, mappings=mappings)
|
||||||
|
|
||||||
def copy(self):
|
def copy(self):
|
||||||
# TODO - test and validate...may need deep copying
|
# TODO - test and validate...may need deep copying
|
||||||
@ -373,7 +386,8 @@ class DataFrame(NDFrame):
|
|||||||
result = _buf.getvalue()
|
result = _buf.getvalue()
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def to_pandas(self):
    """Public wrapper around the protected ``_to_pandas`` method.

    Returns
    -------
    Whatever ``self._to_pandas()`` returns.
    """
    # The receiver was previously misspelled ``selfs``. Dispatching through
    # ``self`` (instead of ``super()``) is equivalent while this class does
    # not define its own ``_to_pandas``.
    return self._to_pandas()
|
||||||
|
|
||||||
# From pandas.DataFrame
|
# From pandas.DataFrame
|
||||||
def _put_str(s, space):
|
def _put_str(s, space):
|
||||||
|
@ -2,6 +2,7 @@ import warnings
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from pandas.core.dtypes.common import (is_float_dtype, is_bool_dtype, is_integer_dtype, is_datetime_or_timedelta_dtype, is_string_dtype)
|
||||||
|
|
||||||
class Mappings():
|
class Mappings():
|
||||||
"""
|
"""
|
||||||
@ -217,6 +218,7 @@ class Mappings():
|
|||||||
|
|
||||||
return capability_matrix_df.sort_index()
|
return capability_matrix_df.sort_index()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def _es_dtype_to_pd_dtype(es_dtype):
|
def _es_dtype_to_pd_dtype(es_dtype):
|
||||||
"""
|
"""
|
||||||
Mapping Elasticsearch types to pandas dtypes
|
Mapping Elasticsearch types to pandas dtypes
|
||||||
@ -259,6 +261,84 @@ class Mappings():
|
|||||||
# Return 'object' for all unsupported TODO - investigate how different types could be supported
|
# Return 'object' for all unsupported TODO - investigate how different types could be supported
|
||||||
return 'object'
|
return 'object'
|
||||||
|
|
||||||
|
@staticmethod
def _pd_dtype_to_es_dtype(pd_dtype):
    """
    Mapping pandas dtypes to Elasticsearch dtype
    --------------------------------------------

    ```
    Pandas dtype    Python type  NumPy type                                                      Usage
    object          str          string_, unicode_                                               Text
    int64           int          int_, int8, int16, int32, int64, uint8, uint16, uint32, uint64  Integer numbers
    float64         float        float_, float16, float32, float64                               Floating point numbers
    bool            bool         bool_                                                           True/False values
    datetime64      NA           datetime64[ns]                                                  Date and time values
    timedelta[ns]   NA           NA                                                              Differences between two datetimes
    category        NA           NA                                                              Finite list of text values
    ```

    Parameters
    ----------
    pd_dtype : dtype
        dtype of a pandas column

    Returns
    -------
    es_dtype : str or None
        One of 'double', 'long', 'boolean', 'keyword' or 'date'.
        None (after emitting a warning) when no rule matches.
    """
    # Public pandas.api.types predicates replace the
    # is_datetime_or_timedelta_dtype helper from the internal
    # pandas.core.dtypes.common module. The "any" variant also accepts
    # timezone-aware datetimes, which are mapped to 'date' as well.
    from pandas.api.types import is_datetime64_any_dtype, is_timedelta64_dtype

    # Map all to 64-bit - TODO map to specifics: int32 -> int etc.
    # The first matching predicate wins; the order is unchanged.
    rules = (
        (is_float_dtype, 'double'),
        (is_integer_dtype, 'long'),
        (is_bool_dtype, 'boolean'),
        (is_string_dtype, 'keyword'),
        (is_datetime64_any_dtype, 'date'),
        (is_timedelta64_dtype, 'date'),
    )
    for matches, es_dtype in rules:
        if matches(pd_dtype):
            return es_dtype

    warnings.warn('No mapping for pd_dtype: [{0}], using default mapping'.format(pd_dtype))

    return None
|
||||||
|
|
||||||
|
@staticmethod
def _generate_es_mappings(dataframe):
    """Given a pandas dataframe, generate the associated Elasticsearch mapping

    Parameters
    ----------
    dataframe : pandas.DataFrame
        pandas.DataFrame to create schema from

    Returns
    -------
    mapping : dict
        Dictionary with one ``"type"`` entry per column, e.g.

        "mappings" : {
          "properties" : {
            "AvgTicketPrice" : {
              "type" : "float"
            },
            "Cancelled" : {
              "type" : "boolean"
            },
            "Carrier" : {
              "type" : "keyword"
            },
            "Dest" : {
              "type" : "keyword"
            }
          }
        }
    """
    # Series.items() yields the same (column, dtype) pairs as iteritems(),
    # which was removed in pandas 2.0.
    properties = {
        column_name: {'type': Mappings._pd_dtype_to_es_dtype(dtype)}
        for column_name, dtype in dataframe.dtypes.items()
    }

    return {"mappings": {"properties": properties}}
|
||||||
|
|
||||||
def all_fields(self):
|
def all_fields(self):
|
||||||
"""
|
"""
|
||||||
Returns
|
Returns
|
||||||
@ -379,3 +459,14 @@ class Mappings():
|
|||||||
"""
|
"""
|
||||||
return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True].groupby('pd_dtype')[
|
return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True].groupby('pd_dtype')[
|
||||||
'_source'].count().to_dict())
|
'_source'].count().to_dict())
|
||||||
|
|
||||||
|
def to_pandas(self):
|
||||||
|
"""
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
df : pd.DataFrame
|
||||||
|
pandas DataFrame representing this index
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@ -23,10 +23,14 @@ only Elasticsearch aggregatable fields can be aggregated or grouped.
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import functools
|
||||||
from elasticsearch_dsl import Search
|
from elasticsearch_dsl import Search
|
||||||
|
|
||||||
import eland as ed
|
import eland as ed
|
||||||
|
|
||||||
|
from pandas.core.generic import NDFrame as pd_NDFrame
|
||||||
|
from pandas._libs import Timestamp, iNaT, properties
|
||||||
|
|
||||||
|
|
||||||
class NDFrame():
|
class NDFrame():
|
||||||
"""
|
"""
|
||||||
@ -44,7 +48,6 @@ class NDFrame():
|
|||||||
--------
|
--------
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
client,
|
client,
|
||||||
index_pattern,
|
index_pattern,
|
||||||
@ -191,7 +194,12 @@ class NDFrame():
|
|||||||
|
|
||||||
rows = []
|
rows = []
|
||||||
index = []
|
index = []
|
||||||
for hit in results['hits']['hits']:
|
if isinstance(results, dict):
|
||||||
|
iterator = results['hits']['hits']
|
||||||
|
else:
|
||||||
|
iterator = results
|
||||||
|
|
||||||
|
for hit in iterator:
|
||||||
row = hit['_source']
|
row = hit['_source']
|
||||||
|
|
||||||
# get index value - can be _id or can be field value in source
|
# get index value - can be _id or can be field value in source
|
||||||
@ -255,6 +263,23 @@ class NDFrame():
|
|||||||
# reverse order (index ascending)
|
# reverse order (index ascending)
|
||||||
return df.sort_index()
|
return df.sort_index()
|
||||||
|
|
||||||
|
def _to_pandas(self):
    """
    Protected method that returns all data as pandas.DataFrame.

    Returns
    -------
    df
        pandas.DataFrame of all values
    """
    # The previously computed ``sort_params`` string was never passed to
    # scan or used afterwards, so it has been removed.
    results = self._client.scan(index=self._index_pattern)

    # We sort here rather than in scan - once everything is in core this
    # should be faster
    return self._es_results_to_pandas(results)
|
||||||
|
|
||||||
def _describe(self):
|
def _describe(self):
|
||||||
numeric_source_fields = self._mappings.numeric_source_fields()
|
numeric_source_fields = self._mappings.numeric_source_fields()
|
||||||
|
|
||||||
@ -294,6 +319,10 @@ class NDFrame():
|
|||||||
|
|
||||||
return mappings
|
return mappings
|
||||||
|
|
||||||
|
@property
def columns(self):
    """Return the value held in the private ``_columns`` attribute."""
    cols = self._columns
    return cols
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def index(self):
|
def index(self):
|
||||||
return self._index
|
return self._index
|
||||||
@ -309,7 +338,6 @@ class NDFrame():
|
|||||||
def get_dtype_counts(self):
|
def get_dtype_counts(self):
|
||||||
return self._mappings.get_dtype_counts()
|
return self._mappings.get_dtype_counts()
|
||||||
|
|
||||||
|
|
||||||
def _index_count(self):
|
def _index_count(self):
|
||||||
"""
|
"""
|
||||||
Returns
|
Returns
|
||||||
|
@ -72,14 +72,17 @@ class Series(NDFrame):
|
|||||||
def __init__(self,
|
def __init__(self,
|
||||||
client,
|
client,
|
||||||
index_pattern,
|
index_pattern,
|
||||||
field_name,
|
field_name=None,
|
||||||
mappings=None,
|
mappings=None,
|
||||||
index_field=None):
|
index_field=None):
|
||||||
# python 3 syntax
|
# python 3 syntax
|
||||||
super().__init__(client, index_pattern, mappings=mappings, index_field=index_field)
|
super().__init__(client, index_pattern, mappings=mappings, index_field=index_field)
|
||||||
|
|
||||||
# now select column (field_name)
|
# now select column (field_name)
|
||||||
self._mappings = self._filter_mappings([field_name])
|
if field_name is not None:
|
||||||
|
self._mappings = self._filter_mappings([field_name])
|
||||||
|
elif len(self._mappings.source_fields()) != 1:
|
||||||
|
raise TypeError('Series must have 1 field: [{0}]'.format(len(self._mappings.source_fields())))
|
||||||
|
|
||||||
def head(self, n=5):
|
def head(self, n=5):
|
||||||
return self._df_to_series(super()._head(n))
|
return self._df_to_series(super()._head(n))
|
||||||
@ -199,6 +202,10 @@ class Series(NDFrame):
|
|||||||
|
|
||||||
fmt.buffer_put_lines(buf, lines)
|
fmt.buffer_put_lines(buf, lines)
|
||||||
|
|
||||||
|
@property
def name(self):
    """Return the first entry of the mappings' source fields."""
    source_fields = list(self._mappings.source_fields())
    return source_fields[0]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def shape(self):
|
def shape(self):
|
||||||
"""
|
"""
|
||||||
@ -257,7 +264,7 @@ class Series(NDFrame):
|
|||||||
return super()._describe()
|
return super()._describe()
|
||||||
|
|
||||||
def _df_to_series(self, df):
|
def _df_to_series(self, df):
|
||||||
return df.iloc[:, 0]
|
return df[self.name]
|
||||||
|
|
||||||
# ----------------------------------------------------------------------
|
# ----------------------------------------------------------------------
|
||||||
# Rendering Methods
|
# Rendering Methods
|
||||||
@ -269,8 +276,8 @@ class Series(NDFrame):
|
|||||||
|
|
||||||
max_rows = pd.get_option("display.max_rows")
|
max_rows = pd.get_option("display.max_rows")
|
||||||
|
|
||||||
self.to_string(buf=buf, na_rep='NaN', float_format=None, header=True, index=True, length=False,
|
self.to_string(buf=buf, na_rep='NaN', float_format=None, header=True, index=True, length=True,
|
||||||
dtype=False, name=False, max_rows=max_rows)
|
dtype=True, name=True, max_rows=max_rows)
|
||||||
|
|
||||||
return buf.getvalue()
|
return buf.getvalue()
|
||||||
|
|
||||||
@ -279,7 +286,7 @@ class Series(NDFrame):
|
|||||||
index=True, length=True, dtype=True,
|
index=True, length=True, dtype=True,
|
||||||
name=True, max_rows=None):
|
name=True, max_rows=None):
|
||||||
"""
|
"""
|
||||||
From pandas
|
From pandas 0.24.2
|
||||||
|
|
||||||
Render a string representation of the Series.
|
Render a string representation of the Series.
|
||||||
|
|
||||||
@ -343,7 +350,6 @@ class Series(NDFrame):
|
|||||||
"""
|
"""
|
||||||
A hacked overridden version of pandas.io.formats.SeriesFormatter that writes correct length
|
A hacked overridden version of pandas.io.formats.SeriesFormatter that writes correct length
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, series, series_length, buf=None, length=True, header=True, index=True,
|
def __init__(self, series, series_length, buf=None, length=True, header=True, index=True,
|
||||||
na_rep='NaN', name=False, float_format=None, dtype=True,
|
na_rep='NaN', name=False, float_format=None, dtype=True,
|
||||||
max_rows=None):
|
max_rows=None):
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
# File called _pytest for PyCharm compatibility
|
# File called _pytest for PyCharm compatibility
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from pandas.util.testing import (
|
from pandas.util.testing import (
|
||||||
assert_series_equal, assert_frame_equal)
|
assert_series_equal, assert_frame_equal)
|
||||||
|
|
||||||
@ -88,3 +89,36 @@ class TestMapping(TestData):
|
|||||||
assert 'object' == field_capabilities['pd_dtype']
|
assert 'object' == field_capabilities['pd_dtype']
|
||||||
assert True == field_capabilities['searchable']
|
assert True == field_capabilities['searchable']
|
||||||
assert True == field_capabilities['aggregatable']
|
assert True == field_capabilities['aggregatable']
|
||||||
|
|
||||||
|
def test_generate_es_mappings(self):
    """Check the generated mapping, then round-trip the frame through Elasticsearch."""
    columns = {'A': np.random.rand(3),
               'B': 1,
               'C': 'foo',
               'D': pd.Timestamp('20190102'),
               'E': [1.0, 2.0, 3.0],
               'F': False,
               'G': [1, 2, 3]}
    df = pd.DataFrame(data=columns, index=['0','1','2'])

    expected_properties = {'A': {'type': 'double'},
                           'B': {'type': 'long'},
                           'C': {'type': 'keyword'},
                           'D': {'type': 'date'},
                           'E': {'type': 'double'},
                           'F': {'type': 'boolean'},
                           'G': {'type': 'long'}}
    expected_mappings = {'mappings': {'properties': expected_properties}}

    mappings = ed.Mappings._generate_es_mappings(df)

    assert expected_mappings == mappings

    # Now create index
    index_name = 'eland_test_generate_es_mappings'

    ed.pandas_to_es(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)

    # Read the index back and compare its head with the original frame
    ed_df_head = ed.DataFrame(ELASTICSEARCH_HOST, index_name).head()

    assert_frame_equal(df, ed_df_head)
|
||||||
|
@ -153,3 +153,7 @@ class TestDataFrameBasics(TestData):
|
|||||||
ed_flights_timestamp.info()
|
ed_flights_timestamp.info()
|
||||||
ed_flights.info()
|
ed_flights.info()
|
||||||
|
|
||||||
|
def test_to_pandas(self):
    """The eland ecommerce frame converted with to_pandas() must equal the pandas frame."""
    actual = self.ed_ecommerce().to_pandas()
    expected = self.pd_ecommerce()

    assert_frame_equal(expected, actual)
|
||||||
|
@ -15,7 +15,7 @@ class TestDataFrameGetItem(TestData):
|
|||||||
ed_carrier = self.ed_flights()['Carrier']
|
ed_carrier = self.ed_flights()['Carrier']
|
||||||
|
|
||||||
# pandas returns a Series here
|
# pandas returns a Series here
|
||||||
assert_frame_equal(pd.DataFrame(pd_carrier.head(100)), ed_carrier.head(100))
|
assert_series_equal(pd_carrier.head(100), ed_carrier.head(100))
|
||||||
|
|
||||||
pd_3_items = self.pd_flights()[['Dest','Carrier','FlightDelay']]
|
pd_3_items = self.pd_flights()[['Dest','Carrier','FlightDelay']]
|
||||||
ed_3_items = self.ed_flights()[['Dest','Carrier','FlightDelay']]
|
ed_3_items = self.ed_flights()[['Dest','Carrier','FlightDelay']]
|
||||||
@ -36,28 +36,12 @@ class TestDataFrameGetItem(TestData):
|
|||||||
def test_getattr_basic(self):
|
def test_getattr_basic(self):
|
||||||
# Test 1 attribute
|
# Test 1 attribute
|
||||||
pd_carrier = self.pd_flights().Carrier
|
pd_carrier = self.pd_flights().Carrier
|
||||||
#ed_carrier = self.ed_flights().Carrier
|
ed_carrier = self.ed_flights().Carrier
|
||||||
|
|
||||||
print(type(pd_carrier))
|
assert_series_equal(pd_carrier.head(100), ed_carrier.head(100))
|
||||||
print(pd_carrier)
|
|
||||||
|
|
||||||
def test_boolean(self):
|
pd_avgticketprice = self.pd_flights().AvgTicketPrice
|
||||||
# Test 1 attribute
|
ed_avgticketprice = self.ed_flights().AvgTicketPrice
|
||||||
pd_carrier = self.pd_flights()['Carrier == "Kibana Airlines"']
|
|
||||||
#ed_carrier = self.ed_flights().Carrier
|
|
||||||
|
|
||||||
print(type(pd_carrier))
|
assert_series_equal(pd_avgticketprice.head(100), ed_avgticketprice.head(100))
|
||||||
print(pd_carrier)
|
|
||||||
|
|
||||||
|
|
||||||
def test_loc(self):
|
|
||||||
pd = self.pd_flights().loc[10:15, ['Dest', 'Carrier']]
|
|
||||||
|
|
||||||
print(type(pd))
|
|
||||||
print(pd)
|
|
||||||
|
|
||||||
pd = self.pd_flights().loc[10]
|
|
||||||
|
|
||||||
print(type(pd))
|
|
||||||
print(pd)
|
|
||||||
|
|
||||||
|
@ -1,4 +1,73 @@
|
|||||||
import eland as ed
|
from eland import Client
|
||||||
|
from eland import DataFrame
|
||||||
|
from eland import Mappings
|
||||||
|
|
||||||
def read_es(es_params, index_pattern):
|
def read_es(es_params, index_pattern):
|
||||||
return ed.DataFrame(client=es_params, index_pattern=index_pattern)
|
return DataFrame(client=es_params, index_pattern=index_pattern)
|
||||||
|
|
||||||
|
def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False):
    """
    Append a pandas DataFrame to an Elasticsearch index.
    Mainly used in testing.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to write. Each row becomes one document whose ``_id`` is
        ``str()`` of the row's index label.

    es_params : Elasticsearch client argument
        elasticsearch-py parameters or
        elasticsearch-py instance or
        eland.Client instance

    destination_index : str
        Name of Elasticsearch index to be written

    if_exists : str, default 'fail'
        Behavior when the destination index exists. Value can be one of:
        ``'fail'``
            If index exists, raise a ValueError.
        ``'replace'``
            If index exists, drop it, recreate it, and insert data.
        ``'append'``
            If index exists, insert data. Create if does not exist.
        Any other value currently behaves like ``'append'``.

    chunk_size : int, default 10000
        Number of rows collected before each bulk call.

    refresh : bool, default False
        Passed to every bulk call.

    Raises
    ------
    ValueError
        If the destination index exists and ``if_exists`` is ``'fail'``.
    """
    client = Client(es_params)

    mapping = Mappings._generate_es_mappings(df)

    # If index exists, check if_exists parameter
    if client.indices().exists(destination_index):
        if if_exists == "fail":
            raise ValueError(
                "Could not create the index [{0}] because it "
                "already exists. "
                "Change the if_exists parameter to "
                "'append' or 'replace' data.".format(destination_index)
            )
        elif if_exists == "replace":
            client.indices().delete(destination_index)
            client.indices().create(destination_index, mapping)
        # elif if_exists == "append":
        # TODO validate mapping is compatible
    else:
        client.indices().create(destination_index, mapping)

    # Now add data
    actions = []
    for n, (label, row) in enumerate(df.iterrows(), start=1):
        # Use the stringified index label as _id for repeatable results
        actions.append({'_index': destination_index, '_source': row.to_dict(), '_id': str(label)})

        # Flush a full chunk
        if n % chunk_size == 0:
            client.bulk(actions, refresh=refresh)
            actions = []

    # Send the remainder; skipped when the last chunk was already flushed
    if actions:
        client.bulk(actions, refresh=refresh)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user