from eland import Client
from eland import DataFrame
from eland import Mappings


def read_es(es_params, index_pattern):
    """
    Create an eland ``DataFrame`` backed by the Elasticsearch index or
    index pattern ``index_pattern``.
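
    Examples
    --------
    Illustrative only; the host string and index pattern below are assumptions:

    >>> df = read_es('localhost', 'flights*')  # doctest: +SKIP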
    """
    return DataFrame(client=es_params, index_pattern=index_pattern)


def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False,
                 geo_points=None):
    """
    Append a pandas DataFrame to an Elasticsearch index.
    Mainly used in testing.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to append.

    es_params : Elasticsearch client argument
        elasticsearch-py parameters or
        elasticsearch-py instance or
        eland.Client instance

    destination_index : str
        Name of the Elasticsearch index to write to

    if_exists : str, default 'fail'
        Behavior when the destination index exists. Value can be one of:

        ``'fail'``
            If the index exists, do nothing.
        ``'replace'``
            If the index exists, drop it, recreate it, and insert data.
        ``'append'``
            If the index exists, insert data. Create it if it does not exist.

    chunk_size : int, default 10000
        Number of rows written per bulk request.

    refresh : bool, default False
        Whether to refresh the destination index after each bulk request.

    dropna : bool, default False
        ``True``
            Remove missing values (see pandas.Series.dropna)
        ``False``
            Include missing values - may cause the bulk request to fail

    geo_points : list or None
        List of columns to map to the geo_point data type
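
    Examples
    --------
    A minimal, illustrative call; the host string, index name and DataFrame
    below are assumptions for the example, not values used by this module:

    >>> import pandas as pd
    >>> df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
    >>> pandas_to_es(df, 'localhost', 'test_index',
    ...              if_exists='replace', refresh=True)  # doctest: +SKIP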
    """
    client = Client(es_params)

    mapping = Mappings._generate_es_mappings(df, geo_points)
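
    # Illustrative only: the exact body comes from Mappings._generate_es_mappings,
    # but an index-creation mapping body generally has the shape
    #   {"mappings": {"properties": {"<column>": {"type": "<es_type>"}, ...}}}
    # with any column listed in geo_points given the "geo_point" type.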

    # If the index exists, check the if_exists parameter
    if client.index_exists(index=destination_index):
        if if_exists == "fail":
            raise ValueError(
                "Could not create the index [{0}] because it "
                "already exists. "
                "Change the if_exists parameter to "
                "'append' or 'replace'.".format(destination_index)
            )
        elif if_exists == "replace":
            client.index_delete(index=destination_index)
            client.index_create(index=destination_index, body=mapping)
        # elif if_exists == "append":
        #     TODO: validate that the existing mapping is compatible
    else:
        client.index_create(index=destination_index, body=mapping)

    # Now add the data
    actions = []
    n = 0
    for row in df.iterrows():
        # Use the DataFrame index value as the document _id
        id = row[0]

        if dropna:
            values = row[1].dropna().to_dict()
        else:
            values = row[1].to_dict()

        # str(id) gives a repeatable document _id
        action = {'_index': destination_index, '_source': values, '_id': str(id)}

        actions.append(action)

        n = n + 1

        # Send a bulk request every chunk_size rows
        if n % chunk_size == 0:
            client.bulk(actions, refresh=refresh)
            actions = []

    # Flush any remaining actions
    client.bulk(actions, refresh=refresh)