mirror of https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00

Improve efficiency of 'pandas_to_eland()' using 'parallel_bulk()'

This commit is contained in:
parent 225a23a59a
commit 0dd247b693

eland/etl.py: 41 changed lines
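In outline: the commit replaces the manual accumulate-and-bulk() loop in pandas_to_eland() with elasticsearch.helpers.parallel_bulk(), which spreads bulk requests across a thread pool, and drains the resulting lazy generator with collections.deque(maxlen=0). A minimal self-contained sketch of that pattern (the host, index name, and document shape are placeholders, not taken from the commit):

from collections import deque

from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

es = Elasticsearch("http://localhost:9200")  # placeholder host

def actions():
    # One action dict per document, as the bulk helpers expect.
    for i in range(10_000):
        yield {"_index": "demo-index", "_id": str(i), "_source": {"value": i}}

# parallel_bulk() lazily yields an (ok, info) tuple per action; deque
# with maxlen=0 consumes the generator without retaining the results.
deque(
    parallel_bulk(client=es, actions=actions(), thread_count=4, chunk_size=500),
    maxlen=0,
)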
@@ -16,8 +16,8 @@
 # under the License.
 
 import csv
-from typing import Union, List, Tuple, Optional, Mapping, Dict, Any
-
+from typing import Generator, Union, List, Tuple, Optional, Mapping, Dict, Any
+from collections import deque
 import pandas as pd  # type: ignore
 from pandas.io.parsers import _c_parser_defaults  # type: ignore
 
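Aside on the new collections.deque import: deque(iterable, maxlen=0) is the standard idiom for exhausting an iterator at C speed while storing nothing, which is exactly what the commit needs to force the lazy parallel_bulk() generator to run. A tiny illustration:

from collections import deque

def numbers():
    for i in range(3):
        print("producing", i)  # stands in for a bulk request
        yield i

# maxlen=0 means the deque keeps nothing: the iterator is run
# to completion purely for its side effects.
deque(numbers(), maxlen=0)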
@@ -26,7 +26,7 @@ from eland.field_mappings import FieldMappings, verify_mapping_compatibility
 from eland.common import ensure_es_client, DEFAULT_CHUNK_SIZE
 from eland.utils import deprecated_api
 from elasticsearch import Elasticsearch  # type: ignore
-from elasticsearch.helpers import bulk  # type: ignore
+from elasticsearch.helpers import parallel_bulk  # type: ignore
 
 
 @deprecated_api("eland.DataFrame()")
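The swapped helper changes the calling convention: bulk() performs requests serially and returns a summary, while parallel_bulk() streams actions through a thread pool and lazily yields one (ok, info) tuple per action. If per-document error handling were wanted instead of discarding results, iteration would look roughly like this (host and documents are placeholders):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

es = Elasticsearch("http://localhost:9200")  # placeholder host
docs = ({"_index": "demo-index", "_source": {"n": n}} for n in range(1000))

# Each yielded item is an (ok, info) tuple; ok is False on failure.
for ok, info in parallel_bulk(client=es, actions=docs, thread_count=4):
    if not ok:
        print("failed action:", info)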
@@ -67,6 +67,7 @@ def pandas_to_eland(
     es_refresh: bool = False,
     es_dropna: bool = False,
     es_type_overrides: Optional[Mapping[str, str]] = None,
+    thread_count: int = 4,
     chunksize: Optional[int] = None,
     use_pandas_index_for_es_ids: bool = True,
 ) -> DataFrame:
@@ -95,6 +96,8 @@ def pandas_to_eland(
         * False: Include missing values - may cause bulk to fail
     es_type_overrides: dict, default None
         Dict of field_name: es_data_type that overrides default es data types
+    thread_count: int
+        number of the threads to use for the bulk requests
     chunksize: int, default None
         Number of pandas.DataFrame rows to read before bulk index into Elasticsearch
     use_pandas_index_for_es_ids: bool, default 'True'
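For callers, the only visible change is the new knob. A usage sketch of the updated API (cluster address and index name are placeholders):

import eland as ed
import pandas as pd
from elasticsearch import Elasticsearch

df = pd.DataFrame({"a": range(100_000), "b": ["x"] * 100_000})
es = Elasticsearch("http://localhost:9200")  # placeholder host

# thread_count feeds parallel_bulk(); each bulk request carries
# roughly chunksize / thread_count rows.
ed.pandas_to_eland(
    df,
    es_client=es,
    es_dest_index="demo-index",  # placeholder index
    es_if_exists="replace",
    es_refresh=True,
    thread_count=4,
)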
@@ -205,9 +208,12 @@
     else:
         es_client.indices.create(index=es_dest_index, body=mapping)
 
-    # Now add data
-    actions = []
-    n = 0
+    def action_generator(
+        pd_df: pd.DataFrame,
+        es_dropna: bool,
+        use_pandas_index_for_es_ids: bool,
+        es_dest_index: str,
+    ) -> Generator[Dict[str, Any], None, None]:
         for row in pd_df.iterrows():
             if es_dropna:
                 values = row[1].dropna().to_dict()
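action_generator() yields one bulk action per DataFrame row, optionally dropping NA fields and reusing the pandas index as the Elasticsearch _id. The shape of the yielded dicts can be previewed without a cluster:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [None, "y"]}, index=[7, 8])

for idx, row in df.iterrows():
    values = row.dropna().to_dict()  # what es_dropna=True produces
    print({"_index": "demo-index", "_id": str(idx), "_source": values})
# {'_index': 'demo-index', '_id': '7', '_source': {'a': 1}}
# {'_index': 'demo-index', '_id': '8', '_source': {'a': 2, 'b': 'y'}}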
@@ -218,20 +224,29 @@
             # Use index as _id
             id = row[0]
 
-            # Use integer as id field for repeatable results
             action = {"_index": es_dest_index, "_source": values, "_id": str(id)}
         else:
             action = {"_index": es_dest_index, "_source": values}
 
-        actions.append(action)
-
-        n = n + 1
-
-        if n % chunksize == 0:
-            bulk(client=es_client, actions=actions, refresh=es_refresh)
-            actions = []
-
-    bulk(client=es_client, actions=actions, refresh=es_refresh)
+            yield action
+
+    # parallel_bulk is lazy generator so use deque to consume them immediately
+    # maxlen = 0 because don't need results of parallel_bulk
+    deque(
+        parallel_bulk(
+            client=es_client,
+            actions=action_generator(
+                pd_df, es_dropna, use_pandas_index_for_es_ids, es_dest_index
+            ),
+            thread_count=thread_count,
+            chunk_size=chunksize / thread_count,
+        ),
+        maxlen=0,
+    )
+
+    if es_refresh:
+        es_client.indices.refresh(index=es_dest_index)
 
     return DataFrame(es_client, es_dest_index)
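Two implementation details stand out. First, refresh is no longer passed with every bulk() call; es_refresh now triggers a single indices.refresh() after all writes complete. Second, chunksize / thread_count is true division in Python 3, so chunk_size receives a float whenever chunksize is not an exact multiple of thread_count; a defensive variant (my adjustment, not part of this commit) would be:

# Hypothetical hardening, not in this commit: chunk_size is documented
# as an int, and '/' produces a float under Python 3.
chunk_size = max(1, chunksize // thread_count)  # floor division, minimum 1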