mirror of https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00

Improve efficiency of 'pandas_to_eland()' using 'parallel_bulk()'

This commit is contained in:
parent 225a23a59a
commit 0dd247b693

eland/etl.py: 41 changed lines
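In outline: the commit replaces the manual accumulate-and-bulk() loop in pandas_to_eland() with elasticsearch.helpers.parallel_bulk(), which spreads bulk requests across a thread pool, and drains the resulting lazy generator with collections.deque(maxlen=0). A minimal self-contained sketch of that pattern (the host, index name, and document shape are placeholders, not taken from the commit):

from collections import deque

from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

es = Elasticsearch("http://localhost:9200")  # placeholder host

def actions():
    # One action dict per document, as the bulk helpers expect.
    for i in range(10_000):
        yield {"_index": "demo-index", "_id": str(i), "_source": {"value": i}}

# parallel_bulk() lazily yields an (ok, info) tuple per action; deque
# with maxlen=0 consumes the generator without retaining the results.
deque(
    parallel_bulk(client=es, actions=actions(), thread_count=4, chunk_size=500),
    maxlen=0,
)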
@@ -16,8 +16,8 @@
 # under the License.
 
 import csv
-from typing import Union, List, Tuple, Optional, Mapping, Dict, Any
-
+from typing import Generator, Union, List, Tuple, Optional, Mapping, Dict, Any
+from collections import deque
 import pandas as pd  # type: ignore
 from pandas.io.parsers import _c_parser_defaults  # type: ignore
 
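Aside on the new collections.deque import: deque(iterable, maxlen=0) is the standard idiom for exhausting an iterator at C speed while storing nothing, which is exactly what the commit needs to force the lazy parallel_bulk() generator to run. A tiny illustration:

from collections import deque

def numbers():
    for i in range(3):
        print("producing", i)  # stands in for a bulk request
        yield i

# maxlen=0 means the deque keeps nothing: the iterator is run
# to completion purely for its side effects.
deque(numbers(), maxlen=0)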
@@ -26,7 +26,7 @@ from eland.field_mappings import FieldMappings, verify_mapping_compatibility
 from eland.common import ensure_es_client, DEFAULT_CHUNK_SIZE
 from eland.utils import deprecated_api
 from elasticsearch import Elasticsearch  # type: ignore
-from elasticsearch.helpers import bulk  # type: ignore
+from elasticsearch.helpers import parallel_bulk  # type: ignore
 
 
 @deprecated_api("eland.DataFrame()")
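The swapped helper changes the calling convention: bulk() performs requests serially and returns a summary, while parallel_bulk() streams actions through a thread pool and lazily yields one (ok, info) tuple per action. If per-document error handling were wanted instead of discarding results, iteration would look roughly like this (host and documents are placeholders):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

es = Elasticsearch("http://localhost:9200")  # placeholder host
docs = ({"_index": "demo-index", "_source": {"n": n}} for n in range(1000))

# Each yielded item is an (ok, info) tuple; ok is False on failure.
for ok, info in parallel_bulk(client=es, actions=docs, thread_count=4):
    if not ok:
        print("failed action:", info)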
@@ -67,6 +67,7 @@ def pandas_to_eland(
     es_refresh: bool = False,
     es_dropna: bool = False,
     es_type_overrides: Optional[Mapping[str, str]] = None,
+    thread_count: int = 4,
     chunksize: Optional[int] = None,
     use_pandas_index_for_es_ids: bool = True,
 ) -> DataFrame:
@@ -95,6 +96,8 @@ def pandas_to_eland(
         * False: Include missing values - may cause bulk to fail
     es_type_overrides: dict, default None
         Dict of field_name: es_data_type that overrides default es data types
+    thread_count: int
+        number of the threads to use for the bulk requests
     chunksize: int, default None
         Number of pandas.DataFrame rows to read before bulk index into Elasticsearch
     use_pandas_index_for_es_ids: bool, default 'True'
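For callers, the only visible change is the new knob. A usage sketch of the updated API (cluster address and index name are placeholders):

import eland as ed
import pandas as pd
from elasticsearch import Elasticsearch

df = pd.DataFrame({"a": range(100_000), "b": ["x"] * 100_000})
es = Elasticsearch("http://localhost:9200")  # placeholder host

# thread_count feeds parallel_bulk(); each bulk request carries
# roughly chunksize / thread_count rows.
ed.pandas_to_eland(
    df,
    es_client=es,
    es_dest_index="demo-index",  # placeholder index
    es_if_exists="replace",
    es_refresh=True,
    thread_count=4,
)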
@@ -205,9 +208,12 @@
     else:
         es_client.indices.create(index=es_dest_index, body=mapping)
 
-    # Now add data
-    actions = []
-    n = 0
+    def action_generator(
+        pd_df: pd.DataFrame,
+        es_dropna: bool,
+        use_pandas_index_for_es_ids: bool,
+        es_dest_index: str,
+    ) -> Generator[Dict[str, Any], None, None]:
         for row in pd_df.iterrows():
             if es_dropna:
                 values = row[1].dropna().to_dict()
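action_generator() yields one bulk action per DataFrame row, optionally dropping NA fields and reusing the pandas index as the Elasticsearch _id. The shape of the yielded dicts can be previewed without a cluster:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [None, "y"]}, index=[7, 8])

for idx, row in df.iterrows():
    values = row.dropna().to_dict()  # what es_dropna=True produces
    print({"_index": "demo-index", "_id": str(idx), "_source": values})
# {'_index': 'demo-index', '_id': '7', '_source': {'a': 1}}
# {'_index': 'demo-index', '_id': '8', '_source': {'a': 2, 'b': 'y'}}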
@@ -218,20 +224,29 @@
             # Use index as _id
             id = row[0]
 
-            # Use integer as id field for repeatable results
             action = {"_index": es_dest_index, "_source": values, "_id": str(id)}
         else:
             action = {"_index": es_dest_index, "_source": values}
 
-        actions.append(action)
-
-        n = n + 1
-
-        if n % chunksize == 0:
-            bulk(client=es_client, actions=actions, refresh=es_refresh)
-            actions = []
-
-    bulk(client=es_client, actions=actions, refresh=es_refresh)
+            yield action
+
+    # parallel_bulk is lazy generator so use deque to consume them immediately
+    # maxlen = 0 because don't need results of parallel_bulk
+    deque(
+        parallel_bulk(
+            client=es_client,
+            actions=action_generator(
+                pd_df, es_dropna, use_pandas_index_for_es_ids, es_dest_index
+            ),
+            thread_count=thread_count,
+            chunk_size=chunksize / thread_count,
+        ),
+        maxlen=0,
+    )
+
+    if es_refresh:
+        es_client.indices.refresh(index=es_dest_index)
 
     return DataFrame(es_client, es_dest_index)
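Two implementation details stand out. First, refresh is no longer passed with every bulk() call; es_refresh now triggers a single indices.refresh() after all writes complete. Second, chunksize / thread_count is true division in Python 3, so chunk_size receives a float whenever chunksize is not an exact multiple of thread_count; a defensive variant (my adjustment, not part of this commit) would be:

# Hypothetical hardening, not in this commit: chunk_size is documented
# as an int, and '/' produces a float under Python 3.
chunk_size = max(1, chunksize // thread_count)  # floor division, minimum 1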