mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Adding eland.read_csv
TODO - resolve issue with ordering of eland.DataFrame compared to csv
This commit is contained in:
parent
21f9c0f54a
commit
2f4d601932
@ -26,5 +26,7 @@ In general, the data resides in elasticsearch and not in memory, which allows el
|
|||||||
|
|
||||||
* :doc:`reference/index`
|
* :doc:`reference/index`
|
||||||
|
|
||||||
|
* :doc:`reference/io`
|
||||||
* :doc:`reference/general_utility_functions`
|
* :doc:`reference/general_utility_functions`
|
||||||
* :doc:`reference/dataframe`
|
* :doc:`reference/dataframe`
|
||||||
|
* :doc:`reference/index`
|
||||||
|
6
docs/source/reference/api/eland.read_csv.rst
Normal file
6
docs/source/reference/api/eland.read_csv.rst
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
eland.read_csv
|
||||||
|
==============
|
||||||
|
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
.. autofunction:: read_csv
|
@ -10,6 +10,7 @@ methods. All classes and functions exposed in ``eland.*`` namespace are public.
|
|||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
|
|
||||||
|
io
|
||||||
general_utility_functions
|
general_utility_functions
|
||||||
dataframe
|
dataframe
|
||||||
indexing
|
indexing
|
||||||
|
@ -9,6 +9,8 @@ Index
|
|||||||
that contain an index (Series/DataFrame) and those should most likely be
|
that contain an index (Series/DataFrame) and those should most likely be
|
||||||
used before calling these methods directly.**
|
used before calling these methods directly.**
|
||||||
|
|
||||||
|
Constructor
|
||||||
|
~~~~~~~~~~~
|
||||||
.. autosummary::
|
.. autosummary::
|
||||||
:toctree: api/
|
:toctree: api/
|
||||||
|
|
||||||
|
13
docs/source/reference/io.rst
Normal file
13
docs/source/reference/io.rst
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
.. _api.io:
|
||||||
|
|
||||||
|
============
|
||||||
|
Input/Output
|
||||||
|
============
|
||||||
|
.. currentmodule:: eland
|
||||||
|
|
||||||
|
Flat File
|
||||||
|
~~~~~~~~~
|
||||||
|
.. autosummary::
|
||||||
|
:toctree: api/
|
||||||
|
|
||||||
|
read_csv
|
@ -431,4 +431,3 @@ class ElandQueryCompiler:
|
|||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# def isna(self):
|
|
||||||
|
@ -1,13 +1,23 @@
|
|||||||
# File called _pytest for PyCharm compatability
|
# File called _pytest for PyCharm compatability
|
||||||
|
|
||||||
import ast
|
import ast
|
||||||
|
import time
|
||||||
|
|
||||||
|
import eland as ed
|
||||||
|
|
||||||
|
from elasticsearch import Elasticsearch
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas.util.testing import (assert_frame_equal)
|
from pandas.util.testing import assert_frame_equal
|
||||||
|
|
||||||
from eland.tests.common import ROOT_DIR
|
from eland.tests.common import ROOT_DIR
|
||||||
from eland.tests.common import TestData
|
from eland.tests.common import TestData
|
||||||
|
|
||||||
|
from eland.tests import ELASTICSEARCH_HOST
|
||||||
|
from eland.tests import FLIGHTS_INDEX_NAME
|
||||||
|
|
||||||
|
from eland.tests.common import assert_pandas_eland_frame_equal
|
||||||
|
|
||||||
|
|
||||||
class TestDataFrameToCSV(TestData):
|
class TestDataFrameToCSV(TestData):
|
||||||
|
|
||||||
@ -42,3 +52,21 @@ class TestDataFrameToCSV(TestData):
|
|||||||
pd_from_csv.timestamp = pd.to_datetime(pd_from_csv.timestamp)
|
pd_from_csv.timestamp = pd.to_datetime(pd_from_csv.timestamp)
|
||||||
|
|
||||||
assert_frame_equal(pd_flights, pd_from_csv)
|
assert_frame_equal(pd_flights, pd_from_csv)
|
||||||
|
|
||||||
|
# Now read the csv to an index
|
||||||
|
now_millis = int(round(time.time() * 1000))
|
||||||
|
|
||||||
|
test_index = FLIGHTS_INDEX_NAME + '.' + str(now_millis)
|
||||||
|
es = Elasticsearch(ELASTICSEARCH_HOST)
|
||||||
|
|
||||||
|
ed_flights_from_csv = ed.read_csv(results_file, es, test_index, index_col=0, es_refresh=True,
|
||||||
|
es_geo_points=['OriginLocation', 'DestLocation'],
|
||||||
|
converters={
|
||||||
|
'DestLocation': lambda x: ast.literal_eval(x),
|
||||||
|
'OriginLocation': lambda x: ast.literal_eval(x)}
|
||||||
|
)
|
||||||
|
pd_flights_from_csv = ed.eland_to_pandas(ed_flights_from_csv)
|
||||||
|
|
||||||
|
# TODO - there is a 'bug' where the Elasticsearch index returns data in a different order to the CSV
|
||||||
|
print(ed_flights_from_csv.head())
|
||||||
|
print(pd_flights_from_csv.head())
|
||||||
|
229
eland/utils.py
229
eland/utils.py
@ -1,8 +1,14 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import csv
|
||||||
|
|
||||||
|
from pandas.io.parsers import _c_parser_defaults
|
||||||
|
|
||||||
|
|
||||||
from eland import Client
|
from eland import Client
|
||||||
from eland import DataFrame
|
from eland import DataFrame
|
||||||
from eland import Mappings
|
from eland import Mappings
|
||||||
|
|
||||||
import pandas as pd
|
_default_chunk_size = 10000
|
||||||
|
|
||||||
|
|
||||||
def read_es(es_params, index_pattern):
|
def read_es(es_params, index_pattern):
|
||||||
@ -31,7 +37,10 @@ def read_es(es_params, index_pattern):
|
|||||||
"""
|
"""
|
||||||
return DataFrame(client=es_params, index_pattern=index_pattern)
|
return DataFrame(client=es_params, index_pattern=index_pattern)
|
||||||
|
|
||||||
def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
|
|
||||||
|
def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunksize=None,
|
||||||
|
refresh=False,
|
||||||
|
dropna=False,
|
||||||
geo_points=None):
|
geo_points=None):
|
||||||
"""
|
"""
|
||||||
Append a pandas DataFrame to an Elasticsearch index.
|
Append a pandas DataFrame to an Elasticsearch index.
|
||||||
@ -52,6 +61,8 @@ def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk
|
|||||||
- fail: Raise a ValueError.
|
- fail: Raise a ValueError.
|
||||||
- replace: Delete the index before inserting new values.
|
- replace: Delete the index before inserting new values.
|
||||||
- append: Insert new values to the existing index. Create if does not exist.
|
- append: Insert new values to the existing index. Create if does not exist.
|
||||||
|
refresh: bool, default 'False'
|
||||||
|
Refresh destination_index after bulk index
|
||||||
dropna: bool, default 'False'
|
dropna: bool, default 'False'
|
||||||
* True: Remove missing values (see pandas.Series.dropna)
|
* True: Remove missing values (see pandas.Series.dropna)
|
||||||
* False: Include missing values - may cause bulk to fail
|
* False: Include missing values - may cause bulk to fail
|
||||||
@ -68,6 +79,9 @@ def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk
|
|||||||
eland.read_es: Create an eland.Dataframe from an Elasticsearch index
|
eland.read_es: Create an eland.Dataframe from an Elasticsearch index
|
||||||
eland.eland_to_pandas: Create a pandas.Dataframe from eland.DataFrame
|
eland.eland_to_pandas: Create a pandas.Dataframe from eland.DataFrame
|
||||||
"""
|
"""
|
||||||
|
if chunksize is None:
|
||||||
|
chunksize = _default_chunk_size
|
||||||
|
|
||||||
client = Client(es_params)
|
client = Client(es_params)
|
||||||
|
|
||||||
mapping = Mappings._generate_es_mappings(pd_df, geo_points)
|
mapping = Mappings._generate_es_mappings(pd_df, geo_points)
|
||||||
@ -108,7 +122,7 @@ def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk
|
|||||||
|
|
||||||
n = n + 1
|
n = n + 1
|
||||||
|
|
||||||
if n % chunk_size == 0:
|
if n % chunksize == 0:
|
||||||
client.bulk(actions, refresh=refresh)
|
client.bulk(actions, refresh=refresh)
|
||||||
actions = []
|
actions = []
|
||||||
|
|
||||||
@ -118,6 +132,7 @@ def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk
|
|||||||
|
|
||||||
return ed_df
|
return ed_df
|
||||||
|
|
||||||
|
|
||||||
def eland_to_pandas(ed_df):
|
def eland_to_pandas(ed_df):
|
||||||
"""
|
"""
|
||||||
Convert an eland.Dataframe to a pandas.DataFrame
|
Convert an eland.Dataframe to a pandas.DataFrame
|
||||||
@ -142,36 +157,186 @@ def eland_to_pandas(ed_df):
|
|||||||
"""
|
"""
|
||||||
return ed_df._to_pandas()
|
return ed_df._to_pandas()
|
||||||
|
|
||||||
def _inherit_docstrings(parent, excluded=[]):
|
|
||||||
"""Creates a decorator which overwrites a decorated class' __doc__
|
|
||||||
attribute with parent's __doc__ attribute. Also overwrites __doc__ of
|
|
||||||
methods and properties defined in the class with the __doc__ of matching
|
|
||||||
methods and properties in parent.
|
|
||||||
|
|
||||||
Args:
|
def read_csv(filepath_or_buffer,
|
||||||
parent (object): Class from which the decorated class inherits __doc__.
|
es_client,
|
||||||
excluded (list): List of parent objects from which the class does not
|
es_dest_index,
|
||||||
inherit docstrings.
|
es_if_exists='fail',
|
||||||
|
es_refresh=False,
|
||||||
Returns:
|
es_dropna=False,
|
||||||
function: decorator which replaces the decorated class' documentation
|
es_geo_points=None,
|
||||||
parent's documentation.
|
sep=",",
|
||||||
|
delimiter=None,
|
||||||
|
# Column and Index Locations and Names
|
||||||
|
header="infer",
|
||||||
|
names=None,
|
||||||
|
index_col=None,
|
||||||
|
usecols=None,
|
||||||
|
squeeze=False,
|
||||||
|
prefix=None,
|
||||||
|
mangle_dupe_cols=True,
|
||||||
|
# General Parsing Configuration
|
||||||
|
dtype=None,
|
||||||
|
engine=None,
|
||||||
|
converters=None,
|
||||||
|
true_values=None,
|
||||||
|
false_values=None,
|
||||||
|
skipinitialspace=False,
|
||||||
|
skiprows=None,
|
||||||
|
skipfooter=0,
|
||||||
|
nrows=None,
|
||||||
|
# Iteration
|
||||||
|
# iterator=False,
|
||||||
|
chunksize=None,
|
||||||
|
# NA and Missing Data Handling
|
||||||
|
na_values=None,
|
||||||
|
keep_default_na=True,
|
||||||
|
na_filter=True,
|
||||||
|
verbose=False,
|
||||||
|
skip_blank_lines=True,
|
||||||
|
# Datetime Handling
|
||||||
|
parse_dates=False,
|
||||||
|
infer_datetime_format=False,
|
||||||
|
keep_date_col=False,
|
||||||
|
date_parser=None,
|
||||||
|
dayfirst=False,
|
||||||
|
cache_dates=True,
|
||||||
|
# Quoting, Compression, and File Format
|
||||||
|
compression="infer",
|
||||||
|
thousands=None,
|
||||||
|
decimal=b".",
|
||||||
|
lineterminator=None,
|
||||||
|
quotechar='"',
|
||||||
|
quoting=csv.QUOTE_MINIMAL,
|
||||||
|
doublequote=True,
|
||||||
|
escapechar=None,
|
||||||
|
comment=None,
|
||||||
|
encoding=None,
|
||||||
|
dialect=None,
|
||||||
|
# Error Handling
|
||||||
|
error_bad_lines=True,
|
||||||
|
warn_bad_lines=True,
|
||||||
|
# Internal
|
||||||
|
delim_whitespace=False,
|
||||||
|
low_memory=_c_parser_defaults["low_memory"],
|
||||||
|
memory_map=False,
|
||||||
|
float_precision=None):
|
||||||
"""
|
"""
|
||||||
|
Read a comma-separated values (csv) file into eland.DataFrame (i.e. an Elasticsearch index).
|
||||||
|
|
||||||
def decorator(cls):
|
**Modifies an Elasticsearch index**
|
||||||
if parent not in excluded:
|
|
||||||
cls.__doc__ = parent.__doc__
|
**Note iteration not supported**
|
||||||
for attr, obj in cls.__dict__.items():
|
|
||||||
parent_obj = getattr(parent, attr, None)
|
Parameters
|
||||||
if parent_obj in excluded or (
|
----------
|
||||||
not callable(parent_obj) and not isinstance(parent_obj, property)
|
es_params: Elasticsearch client argument(s)
|
||||||
):
|
- elasticsearch-py parameters or
|
||||||
continue
|
- elasticsearch-py instance or
|
||||||
if callable(obj):
|
- eland.Client instance
|
||||||
obj.__doc__ = parent_obj.__doc__
|
es_dest_index: str
|
||||||
elif isinstance(obj, property) and obj.fget is not None:
|
Name of Elasticsearch index to be appended to
|
||||||
p = property(obj.fget, obj.fset, obj.fdel, parent_obj.__doc__)
|
es_if_exists : {'fail', 'replace', 'append'}, default 'fail'
|
||||||
setattr(cls, attr, p)
|
How to behave if the index already exists.
|
||||||
return cls
|
|
||||||
|
- fail: Raise a ValueError.
|
||||||
|
- replace: Delete the index before inserting new values.
|
||||||
|
- append: Insert new values to the existing index. Create if does not exist.
|
||||||
|
es_dropna: bool, default 'False'
|
||||||
|
* True: Remove missing values (see pandas.Series.dropna)
|
||||||
|
* False: Include missing values - may cause bulk to fail
|
||||||
|
es_geo_points: list, default None
|
||||||
|
List of columns to map to geo_point data type
|
||||||
|
iterator
|
||||||
|
ignored
|
||||||
|
chunksize
|
||||||
|
number of csv rows to read before bulk index into Elasticsearch
|
||||||
|
|
||||||
|
Other Parameters
|
||||||
|
----------------
|
||||||
|
Parameters derived from :pandas_api_docs:`read_csv`.
|
||||||
|
|
||||||
|
See Also
|
||||||
|
--------
|
||||||
|
:pandas_api_docs:`read_csv` - for all parameters
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
TODO - currently the eland.DataFrame may not retain the order of the data in the csv.
|
||||||
|
"""
|
||||||
|
kwds = dict()
|
||||||
|
|
||||||
|
kwds.update(
|
||||||
|
delimiter=delimiter,
|
||||||
|
engine=engine,
|
||||||
|
dialect=dialect,
|
||||||
|
compression=compression,
|
||||||
|
# engine_specified=engine_specified,
|
||||||
|
doublequote=doublequote,
|
||||||
|
escapechar=escapechar,
|
||||||
|
quotechar=quotechar,
|
||||||
|
quoting=quoting,
|
||||||
|
skipinitialspace=skipinitialspace,
|
||||||
|
lineterminator=lineterminator,
|
||||||
|
header=header,
|
||||||
|
index_col=index_col,
|
||||||
|
names=names,
|
||||||
|
prefix=prefix,
|
||||||
|
skiprows=skiprows,
|
||||||
|
skipfooter=skipfooter,
|
||||||
|
na_values=na_values,
|
||||||
|
true_values=true_values,
|
||||||
|
false_values=false_values,
|
||||||
|
keep_default_na=keep_default_na,
|
||||||
|
thousands=thousands,
|
||||||
|
comment=comment,
|
||||||
|
decimal=decimal,
|
||||||
|
parse_dates=parse_dates,
|
||||||
|
keep_date_col=keep_date_col,
|
||||||
|
dayfirst=dayfirst,
|
||||||
|
date_parser=date_parser,
|
||||||
|
cache_dates=cache_dates,
|
||||||
|
nrows=nrows,
|
||||||
|
# iterator=iterator,
|
||||||
|
chunksize=chunksize,
|
||||||
|
converters=converters,
|
||||||
|
dtype=dtype,
|
||||||
|
usecols=usecols,
|
||||||
|
verbose=verbose,
|
||||||
|
encoding=encoding,
|
||||||
|
squeeze=squeeze,
|
||||||
|
memory_map=memory_map,
|
||||||
|
float_precision=float_precision,
|
||||||
|
na_filter=na_filter,
|
||||||
|
delim_whitespace=delim_whitespace,
|
||||||
|
warn_bad_lines=warn_bad_lines,
|
||||||
|
error_bad_lines=error_bad_lines,
|
||||||
|
low_memory=low_memory,
|
||||||
|
mangle_dupe_cols=mangle_dupe_cols,
|
||||||
|
infer_datetime_format=infer_datetime_format,
|
||||||
|
skip_blank_lines=skip_blank_lines,
|
||||||
|
)
|
||||||
|
|
||||||
|
if chunksize is None:
|
||||||
|
kwds.update(chunksize=_default_chunk_size)
|
||||||
|
|
||||||
|
client = Client(es_client)
|
||||||
|
|
||||||
|
# read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch)
|
||||||
|
reader = pd.read_csv(filepath_or_buffer, **kwds)
|
||||||
|
|
||||||
|
first_write = True
|
||||||
|
for chunk in reader:
|
||||||
|
if first_write:
|
||||||
|
pandas_to_eland(chunk, client, es_dest_index, if_exists=es_if_exists, chunksize=chunksize,
|
||||||
|
refresh=es_refresh, dropna=es_dropna, geo_points=es_geo_points)
|
||||||
|
first_write = False
|
||||||
|
else:
|
||||||
|
pandas_to_eland(chunk, client, es_dest_index, if_exists='append', chunksize=chunksize,
|
||||||
|
refresh=es_refresh, dropna=es_dropna, geo_points=es_geo_points)
|
||||||
|
|
||||||
|
# Now create an eland.DataFrame that references the new index
|
||||||
|
ed_df = DataFrame(client, es_dest_index)
|
||||||
|
|
||||||
|
return ed_df
|
||||||
|
|
||||||
return decorator
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user