Feature/show progress (#120)

* Adding show_progress debug option to eland_to_pandas

* Adding show_progress debug option to eland_to_pandas
This commit is contained in:
stevedodson 2020-01-29 12:59:48 +00:00 committed by GitHub
parent 409cb043c8
commit 2ca538c49d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 79 additions and 40 deletions

View File

@@ -17,6 +17,11 @@ from enum import Enum
DEFAULT_NUM_ROWS_DISPLAYED = 60 DEFAULT_NUM_ROWS_DISPLAYED = 60
DEFAULT_CHUNK_SIZE = 10000
DEFAULT_CSV_BATCH_OUTPUT_SIZE = 10000
DEFAULT_PROGRESS_REPORTING_NUM_ROWS = 10000
DEFAULT_ES_MAX_RESULT_WINDOW = 10000 # index.max_result_window
def docstring_parameter(*sub): def docstring_parameter(*sub):
def dec(obj): def dec(obj):

View File

@@ -973,7 +973,7 @@ class DataFrame(NDFrame):
} }
return self._query_compiler.to_csv(**kwargs) return self._query_compiler.to_csv(**kwargs)
def _to_pandas(self): def _to_pandas(self, show_progress=False):
""" """
Utility method to convert eland.Dataframe to pandas.Dataframe Utility method to convert eland.Dataframe to pandas.Dataframe
@@ -981,7 +981,7 @@ class DataFrame(NDFrame):
------- -------
pandas.DataFrame pandas.DataFrame
""" """
return self._query_compiler.to_pandas() return self._query_compiler.to_pandas(show_progress=show_progress)
def _empty_pd_df(self): def _empty_pd_df(self):
return self._query_compiler._empty_pd_ef() return self._query_compiler._empty_pd_ef()

View File

@@ -371,7 +371,7 @@ class NDFrame(ABC):
return self._query_compiler.describe() return self._query_compiler.describe()
@abstractmethod @abstractmethod
def _to_pandas(self): def _to_pandas(self, show_progress=False):
pass pass
@abstractmethod @abstractmethod

View File

@@ -17,7 +17,7 @@ from collections import OrderedDict
import pandas as pd import pandas as pd
from eland import Index, SortOrder from eland import Index, SortOrder, DEFAULT_CSV_BATCH_OUTPUT_SIZE, DEFAULT_ES_MAX_RESULT_WINDOW
from eland import Query from eland import Query
from eland.actions import SortFieldAction from eland.actions import SortFieldAction
from eland.tasks import HeadTask, TailTask, BooleanFilterTask, ArithmeticOpFieldsTask, QueryTermsTask, \ from eland.tasks import HeadTask, TailTask, BooleanFilterTask, ArithmeticOpFieldsTask, QueryTermsTask, \
@@ -491,54 +491,70 @@ class Operations:
return df return df
def to_pandas(self, query_compiler): def to_pandas(self, query_compiler, show_progress=False):
class PandasDataFrameCollector: class PandasDataFrameCollector:
def __init__(self): def __init__(self, show_progress):
self.df = None self._df = None
self._show_progress = show_progress
def collect(self, df): def collect(self, df):
self.df = df # This collector does not batch data on output. Therefore, batch_size is fixed to None and this method
# is only called once.
if self._df is not None:
raise RuntimeError("Logic error in execution, this method must only be called once for this"
"collector - batch_size == None")
self._df = df
@staticmethod @staticmethod
def batch_size(): def batch_size():
# Do not change (see notes on collect)
return None return None
collector = PandasDataFrameCollector() @property
def show_progress(self):
return self._show_progress
collector = PandasDataFrameCollector(show_progress)
self._es_results(query_compiler, collector) self._es_results(query_compiler, collector)
return collector.df return collector._df
def to_csv(self, query_compiler, **kwargs): def to_csv(self, query_compiler, show_progress=False, **kwargs):
class PandasToCSVCollector: class PandasToCSVCollector:
def __init__(self, **args): def __init__(self, show_progress, **args):
self.args = args self._args = args
self.ret = None self._show_progress = show_progress
self.first_time = True self._ret = None
self._first_time = True
def collect(self, df): def collect(self, df):
# If this is the first time we collect results, then write header, otherwise don't write header # If this is the first time we collect results, then write header, otherwise don't write header
# and append results # and append results
if self.first_time: if self._first_time:
self.first_time = False self._first_time = False
df.to_csv(**self.args) df.to_csv(**self._args)
else: else:
# Don't write header, and change mode to append # Don't write header, and change mode to append
self.args['header'] = False self._args['header'] = False
self.args['mode'] = 'a' self._args['mode'] = 'a'
df.to_csv(**self.args) df.to_csv(**self._args)
@staticmethod @staticmethod
def batch_size(): def batch_size():
# By default read 10000 docs to csv # By default read n docs and then dump to csv
batch_size = 10000 batch_size = DEFAULT_CSV_BATCH_OUTPUT_SIZE
return batch_size return batch_size
collector = PandasToCSVCollector(**kwargs) @property
def show_progress(self):
return self._show_progress
collector = PandasToCSVCollector(show_progress, **kwargs)
self._es_results(query_compiler, collector) self._es_results(query_compiler, collector)
return collector.ret return collector._ret
def _es_results(self, query_compiler, collector): def _es_results(self, query_compiler, collector):
query_params, post_processing = self._resolve_tasks(query_compiler) query_params, post_processing = self._resolve_tasks(query_compiler)
@@ -562,7 +578,7 @@ class Operations:
# If size=None use scan not search - then post sort results when in df # If size=None use scan not search - then post sort results when in df
# If size>10000 use scan # If size>10000 use scan
is_scan = False is_scan = False
if size is not None and size <= 10000: if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
if size > 0: if size > 0:
try: try:
es_results = query_compiler._client.search( es_results = query_compiler._client.search(
@@ -594,7 +610,8 @@ class Operations:
if is_scan: if is_scan:
while True: while True:
partial_result, df = query_compiler._es_results_to_pandas(es_results, collector.batch_size()) partial_result, df = query_compiler._es_results_to_pandas(es_results, collector.batch_size(),
collector.show_progress)
df = self._apply_df_post_processing(df, post_processing) df = self._apply_df_post_processing(df, post_processing)
collector.collect(df) collector.collect(df)
if not partial_result: if not partial_result:

View File

@@ -14,12 +14,13 @@
import copy import copy
import warnings import warnings
from collections import OrderedDict from collections import OrderedDict
from datetime import datetime
from typing import Union from typing import Union
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from eland import Client from eland import Client, DEFAULT_PROGRESS_REPORTING_NUM_ROWS
from eland import FieldMappings from eland import FieldMappings
from eland import Index from eland import Index
from eland import Operations from eland import Operations
@@ -109,7 +110,7 @@ class QueryCompiler:
# END Index, columns, and dtypes objects # END Index, columns, and dtypes objects
def _es_results_to_pandas(self, results, batch_size=None): def _es_results_to_pandas(self, results, batch_size=None, show_progress=False):
""" """
Parameters Parameters
---------- ----------
@@ -248,6 +249,10 @@ class QueryCompiler:
partial_result = True partial_result = True
break break
if show_progress:
if i % DEFAULT_PROGRESS_REPORTING_NUM_ROWS == 0:
print("{}: read {} rows".format(datetime.now(), i))
# Create pandas DataFrame # Create pandas DataFrame
df = pd.DataFrame(data=rows, index=index) df = pd.DataFrame(data=rows, index=index)
@@ -267,6 +272,9 @@ class QueryCompiler:
if len(self.columns) > 1: if len(self.columns) > 1:
df = df[self.columns] df = df[self.columns]
if show_progress:
print("{}: read {} rows".format(datetime.now(), i))
return partial_result, df return partial_result, df
def _flatten_dict(self, y, field_mapping_cache): def _flatten_dict(self, y, field_mapping_cache):
@@ -381,13 +389,13 @@ class QueryCompiler:
return result return result
# To/From Pandas # To/From Pandas
def to_pandas(self): def to_pandas(self, show_progress=False):
"""Converts Eland DataFrame to Pandas DataFrame. """Converts Eland DataFrame to Pandas DataFrame.
Returns: Returns:
Pandas DataFrame Pandas DataFrame
""" """
return self._operations.to_pandas(self) return self._operations.to_pandas(self, show_progress)
# To CSV # To CSV
def to_csv(self, **kwargs): def to_csv(self, **kwargs):

View File

@@ -387,8 +387,8 @@ class Series(NDFrame):
result = _buf.getvalue() result = _buf.getvalue()
return result return result
def _to_pandas(self): def _to_pandas(self, show_progress=False):
return self._query_compiler.to_pandas()[self.name] return self._query_compiler.to_pandas(show_progress=show_progress)[self.name]
@property @property
def _dtype(self): def _dtype(self):

View File

@@ -57,4 +57,7 @@ class TestDataFrameUtils(TestData):
def test_eland_to_pandas_performance(self): def test_eland_to_pandas_performance(self):
# TODO quantify this # TODO quantify this
pd_df = ed.eland_to_pandas(self.ed_flights()) pd_df = ed.eland_to_pandas(self.ed_flights(), show_progress=True)
# This test calls the same method so is redundant
#assert_pandas_eland_frame_equal(pd_df, self.ed_flights())

View File

@@ -17,12 +17,10 @@ import csv
import pandas as pd import pandas as pd
from pandas.io.parsers import _c_parser_defaults from pandas.io.parsers import _c_parser_defaults
from eland import Client from eland import Client, DEFAULT_CHUNK_SIZE
from eland import DataFrame from eland import DataFrame
from eland import FieldMappings from eland import FieldMappings
DEFAULT_CHUNK_SIZE = 10000
def read_es(es_client, es_index_pattern): def read_es(es_client, es_index_pattern):
""" """
@@ -211,7 +209,7 @@ def pandas_to_eland(pd_df,
return ed_df return ed_df
def eland_to_pandas(ed_df): def eland_to_pandas(ed_df, show_progress=False):
""" """
Convert an eland.Dataframe to a pandas.DataFrame Convert an eland.Dataframe to a pandas.DataFrame
@@ -222,6 +220,8 @@ def eland_to_pandas(ed_df):
---------- ----------
ed_df: eland.DataFrame ed_df: eland.DataFrame
The source eland.Dataframe referencing the Elasticsearch index The source eland.Dataframe referencing the Elasticsearch index
show_progress: bool
Output progress of option to stdout? By default False.
Returns Returns
------- -------
@@ -258,12 +258,18 @@ def eland_to_pandas(ed_df):
<BLANKLINE> <BLANKLINE>
[5 rows x 27 columns] [5 rows x 27 columns]
Convert `eland.DataFrame` to `pandas.DataFrame` and show progress every 10000 rows
>>> pd_df = ed.eland_to_pandas(ed.DataFrame('localhost', 'flights'), show_progress=True) # doctest: +SKIP
2020-01-29 12:43:36.572395: read 10000 rows
2020-01-29 12:43:37.309031: read 13059 rows
See Also See Also
-------- --------
eland.read_es: Create an eland.Dataframe from an Elasticsearch index eland.read_es: Create an eland.Dataframe from an Elasticsearch index
eland.pandas_to_eland: Create an eland.Dataframe from pandas.DataFrame eland.pandas_to_eland: Create an eland.Dataframe from pandas.DataFrame
""" """
return ed_df._to_pandas() return ed_df._to_pandas(show_progress=show_progress)
def read_csv(filepath_or_buffer, def read_csv(filepath_or_buffer,