mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Feature/show progress (#120)
* Adding show_progress debug option to eland_to_pandas * Adding show_progress debug option to eland_to_pandas
This commit is contained in:
parent
409cb043c8
commit
2ca538c49d
@ -17,6 +17,11 @@ from enum import Enum
|
|||||||
|
|
||||||
DEFAULT_NUM_ROWS_DISPLAYED = 60
|
DEFAULT_NUM_ROWS_DISPLAYED = 60
|
||||||
|
|
||||||
|
DEFAULT_CHUNK_SIZE = 10000
|
||||||
|
DEFAULT_CSV_BATCH_OUTPUT_SIZE = 10000
|
||||||
|
DEFAULT_PROGRESS_REPORTING_NUM_ROWS = 10000
|
||||||
|
DEFAULT_ES_MAX_RESULT_WINDOW = 10000 # index.max_result_window
|
||||||
|
|
||||||
|
|
||||||
def docstring_parameter(*sub):
|
def docstring_parameter(*sub):
|
||||||
def dec(obj):
|
def dec(obj):
|
||||||
|
@ -973,7 +973,7 @@ class DataFrame(NDFrame):
|
|||||||
}
|
}
|
||||||
return self._query_compiler.to_csv(**kwargs)
|
return self._query_compiler.to_csv(**kwargs)
|
||||||
|
|
||||||
def _to_pandas(self):
|
def _to_pandas(self, show_progress=False):
|
||||||
"""
|
"""
|
||||||
Utility method to convert eland.Dataframe to pandas.Dataframe
|
Utility method to convert eland.Dataframe to pandas.Dataframe
|
||||||
|
|
||||||
@ -981,7 +981,7 @@ class DataFrame(NDFrame):
|
|||||||
-------
|
-------
|
||||||
pandas.DataFrame
|
pandas.DataFrame
|
||||||
"""
|
"""
|
||||||
return self._query_compiler.to_pandas()
|
return self._query_compiler.to_pandas(show_progress=show_progress)
|
||||||
|
|
||||||
def _empty_pd_df(self):
|
def _empty_pd_df(self):
|
||||||
return self._query_compiler._empty_pd_ef()
|
return self._query_compiler._empty_pd_ef()
|
||||||
|
@ -371,7 +371,7 @@ class NDFrame(ABC):
|
|||||||
return self._query_compiler.describe()
|
return self._query_compiler.describe()
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def _to_pandas(self):
|
def _to_pandas(self, show_progress=False):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
@ -17,7 +17,7 @@ from collections import OrderedDict
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from eland import Index, SortOrder
|
from eland import Index, SortOrder, DEFAULT_CSV_BATCH_OUTPUT_SIZE, DEFAULT_ES_MAX_RESULT_WINDOW
|
||||||
from eland import Query
|
from eland import Query
|
||||||
from eland.actions import SortFieldAction
|
from eland.actions import SortFieldAction
|
||||||
from eland.tasks import HeadTask, TailTask, BooleanFilterTask, ArithmeticOpFieldsTask, QueryTermsTask, \
|
from eland.tasks import HeadTask, TailTask, BooleanFilterTask, ArithmeticOpFieldsTask, QueryTermsTask, \
|
||||||
@ -491,54 +491,70 @@ class Operations:
|
|||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
def to_pandas(self, query_compiler):
|
def to_pandas(self, query_compiler, show_progress=False):
|
||||||
class PandasDataFrameCollector:
|
class PandasDataFrameCollector:
|
||||||
def __init__(self):
|
def __init__(self, show_progress):
|
||||||
self.df = None
|
self._df = None
|
||||||
|
self._show_progress = show_progress
|
||||||
|
|
||||||
def collect(self, df):
|
def collect(self, df):
|
||||||
self.df = df
|
# This collector does not batch data on output. Therefore, batch_size is fixed to None and this method
|
||||||
|
# is only called once.
|
||||||
|
if self._df is not None:
|
||||||
|
raise RuntimeError("Logic error in execution, this method must only be called once for this"
|
||||||
|
"collector - batch_size == None")
|
||||||
|
self._df = df
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def batch_size():
|
def batch_size():
|
||||||
|
# Do not change (see notes on collect)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
collector = PandasDataFrameCollector()
|
@property
|
||||||
|
def show_progress(self):
|
||||||
|
return self._show_progress
|
||||||
|
|
||||||
|
collector = PandasDataFrameCollector(show_progress)
|
||||||
|
|
||||||
self._es_results(query_compiler, collector)
|
self._es_results(query_compiler, collector)
|
||||||
|
|
||||||
return collector.df
|
return collector._df
|
||||||
|
|
||||||
def to_csv(self, query_compiler, **kwargs):
|
def to_csv(self, query_compiler, show_progress=False, **kwargs):
|
||||||
class PandasToCSVCollector:
|
class PandasToCSVCollector:
|
||||||
def __init__(self, **args):
|
def __init__(self, show_progress, **args):
|
||||||
self.args = args
|
self._args = args
|
||||||
self.ret = None
|
self._show_progress = show_progress
|
||||||
self.first_time = True
|
self._ret = None
|
||||||
|
self._first_time = True
|
||||||
|
|
||||||
def collect(self, df):
|
def collect(self, df):
|
||||||
# If this is the first time we collect results, then write header, otherwise don't write header
|
# If this is the first time we collect results, then write header, otherwise don't write header
|
||||||
# and append results
|
# and append results
|
||||||
if self.first_time:
|
if self._first_time:
|
||||||
self.first_time = False
|
self._first_time = False
|
||||||
df.to_csv(**self.args)
|
df.to_csv(**self._args)
|
||||||
else:
|
else:
|
||||||
# Don't write header, and change mode to append
|
# Don't write header, and change mode to append
|
||||||
self.args['header'] = False
|
self._args['header'] = False
|
||||||
self.args['mode'] = 'a'
|
self._args['mode'] = 'a'
|
||||||
df.to_csv(**self.args)
|
df.to_csv(**self._args)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def batch_size():
|
def batch_size():
|
||||||
# By default read 10000 docs to csv
|
# By default read n docs and then dump to csv
|
||||||
batch_size = 10000
|
batch_size = DEFAULT_CSV_BATCH_OUTPUT_SIZE
|
||||||
return batch_size
|
return batch_size
|
||||||
|
|
||||||
collector = PandasToCSVCollector(**kwargs)
|
@property
|
||||||
|
def show_progress(self):
|
||||||
|
return self._show_progress
|
||||||
|
|
||||||
|
collector = PandasToCSVCollector(show_progress, **kwargs)
|
||||||
|
|
||||||
self._es_results(query_compiler, collector)
|
self._es_results(query_compiler, collector)
|
||||||
|
|
||||||
return collector.ret
|
return collector._ret
|
||||||
|
|
||||||
def _es_results(self, query_compiler, collector):
|
def _es_results(self, query_compiler, collector):
|
||||||
query_params, post_processing = self._resolve_tasks(query_compiler)
|
query_params, post_processing = self._resolve_tasks(query_compiler)
|
||||||
@ -562,7 +578,7 @@ class Operations:
|
|||||||
# If size=None use scan not search - then post sort results when in df
|
# If size=None use scan not search - then post sort results when in df
|
||||||
# If size>10000 use scan
|
# If size>10000 use scan
|
||||||
is_scan = False
|
is_scan = False
|
||||||
if size is not None and size <= 10000:
|
if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
|
||||||
if size > 0:
|
if size > 0:
|
||||||
try:
|
try:
|
||||||
es_results = query_compiler._client.search(
|
es_results = query_compiler._client.search(
|
||||||
@ -594,7 +610,8 @@ class Operations:
|
|||||||
|
|
||||||
if is_scan:
|
if is_scan:
|
||||||
while True:
|
while True:
|
||||||
partial_result, df = query_compiler._es_results_to_pandas(es_results, collector.batch_size())
|
partial_result, df = query_compiler._es_results_to_pandas(es_results, collector.batch_size(),
|
||||||
|
collector.show_progress)
|
||||||
df = self._apply_df_post_processing(df, post_processing)
|
df = self._apply_df_post_processing(df, post_processing)
|
||||||
collector.collect(df)
|
collector.collect(df)
|
||||||
if not partial_result:
|
if not partial_result:
|
||||||
|
@ -14,12 +14,13 @@
|
|||||||
import copy
|
import copy
|
||||||
import warnings
|
import warnings
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
from datetime import datetime
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from eland import Client
|
from eland import Client, DEFAULT_PROGRESS_REPORTING_NUM_ROWS
|
||||||
from eland import FieldMappings
|
from eland import FieldMappings
|
||||||
from eland import Index
|
from eland import Index
|
||||||
from eland import Operations
|
from eland import Operations
|
||||||
@ -109,7 +110,7 @@ class QueryCompiler:
|
|||||||
|
|
||||||
# END Index, columns, and dtypes objects
|
# END Index, columns, and dtypes objects
|
||||||
|
|
||||||
def _es_results_to_pandas(self, results, batch_size=None):
|
def _es_results_to_pandas(self, results, batch_size=None, show_progress=False):
|
||||||
"""
|
"""
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@ -248,6 +249,10 @@ class QueryCompiler:
|
|||||||
partial_result = True
|
partial_result = True
|
||||||
break
|
break
|
||||||
|
|
||||||
|
if show_progress:
|
||||||
|
if i % DEFAULT_PROGRESS_REPORTING_NUM_ROWS == 0:
|
||||||
|
print("{}: read {} rows".format(datetime.now(), i))
|
||||||
|
|
||||||
# Create pandas DataFrame
|
# Create pandas DataFrame
|
||||||
df = pd.DataFrame(data=rows, index=index)
|
df = pd.DataFrame(data=rows, index=index)
|
||||||
|
|
||||||
@ -267,6 +272,9 @@ class QueryCompiler:
|
|||||||
if len(self.columns) > 1:
|
if len(self.columns) > 1:
|
||||||
df = df[self.columns]
|
df = df[self.columns]
|
||||||
|
|
||||||
|
if show_progress:
|
||||||
|
print("{}: read {} rows".format(datetime.now(), i))
|
||||||
|
|
||||||
return partial_result, df
|
return partial_result, df
|
||||||
|
|
||||||
def _flatten_dict(self, y, field_mapping_cache):
|
def _flatten_dict(self, y, field_mapping_cache):
|
||||||
@ -381,13 +389,13 @@ class QueryCompiler:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
# To/From Pandas
|
# To/From Pandas
|
||||||
def to_pandas(self):
|
def to_pandas(self, show_progress=False):
|
||||||
"""Converts Eland DataFrame to Pandas DataFrame.
|
"""Converts Eland DataFrame to Pandas DataFrame.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Pandas DataFrame
|
Pandas DataFrame
|
||||||
"""
|
"""
|
||||||
return self._operations.to_pandas(self)
|
return self._operations.to_pandas(self, show_progress)
|
||||||
|
|
||||||
# To CSV
|
# To CSV
|
||||||
def to_csv(self, **kwargs):
|
def to_csv(self, **kwargs):
|
||||||
|
@ -387,8 +387,8 @@ class Series(NDFrame):
|
|||||||
result = _buf.getvalue()
|
result = _buf.getvalue()
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _to_pandas(self):
|
def _to_pandas(self, show_progress=False):
|
||||||
return self._query_compiler.to_pandas()[self.name]
|
return self._query_compiler.to_pandas(show_progress=show_progress)[self.name]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def _dtype(self):
|
def _dtype(self):
|
||||||
|
@ -57,4 +57,7 @@ class TestDataFrameUtils(TestData):
|
|||||||
|
|
||||||
def test_eland_to_pandas_performance(self):
|
def test_eland_to_pandas_performance(self):
|
||||||
# TODO quantify this
|
# TODO quantify this
|
||||||
pd_df = ed.eland_to_pandas(self.ed_flights())
|
pd_df = ed.eland_to_pandas(self.ed_flights(), show_progress=True)
|
||||||
|
|
||||||
|
# This test calls the same method so is redundant
|
||||||
|
#assert_pandas_eland_frame_equal(pd_df, self.ed_flights())
|
||||||
|
@ -17,12 +17,10 @@ import csv
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas.io.parsers import _c_parser_defaults
|
from pandas.io.parsers import _c_parser_defaults
|
||||||
|
|
||||||
from eland import Client
|
from eland import Client, DEFAULT_CHUNK_SIZE
|
||||||
from eland import DataFrame
|
from eland import DataFrame
|
||||||
from eland import FieldMappings
|
from eland import FieldMappings
|
||||||
|
|
||||||
DEFAULT_CHUNK_SIZE = 10000
|
|
||||||
|
|
||||||
|
|
||||||
def read_es(es_client, es_index_pattern):
|
def read_es(es_client, es_index_pattern):
|
||||||
"""
|
"""
|
||||||
@ -211,7 +209,7 @@ def pandas_to_eland(pd_df,
|
|||||||
return ed_df
|
return ed_df
|
||||||
|
|
||||||
|
|
||||||
def eland_to_pandas(ed_df):
|
def eland_to_pandas(ed_df, show_progress=False):
|
||||||
"""
|
"""
|
||||||
Convert an eland.Dataframe to a pandas.DataFrame
|
Convert an eland.Dataframe to a pandas.DataFrame
|
||||||
|
|
||||||
@ -222,6 +220,8 @@ def eland_to_pandas(ed_df):
|
|||||||
----------
|
----------
|
||||||
ed_df: eland.DataFrame
|
ed_df: eland.DataFrame
|
||||||
The source eland.Dataframe referencing the Elasticsearch index
|
The source eland.Dataframe referencing the Elasticsearch index
|
||||||
|
show_progress: bool
|
||||||
|
Output progress of option to stdout? By default False.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
@ -258,12 +258,18 @@ def eland_to_pandas(ed_df):
|
|||||||
<BLANKLINE>
|
<BLANKLINE>
|
||||||
[5 rows x 27 columns]
|
[5 rows x 27 columns]
|
||||||
|
|
||||||
|
Convert `eland.DataFrame` to `pandas.DataFrame` and show progress every 10000 rows
|
||||||
|
|
||||||
|
>>> pd_df = ed.eland_to_pandas(ed.DataFrame('localhost', 'flights'), show_progress=True) # doctest: +SKIP
|
||||||
|
2020-01-29 12:43:36.572395: read 10000 rows
|
||||||
|
2020-01-29 12:43:37.309031: read 13059 rows
|
||||||
|
|
||||||
See Also
|
See Also
|
||||||
--------
|
--------
|
||||||
eland.read_es: Create an eland.Dataframe from an Elasticsearch index
|
eland.read_es: Create an eland.Dataframe from an Elasticsearch index
|
||||||
eland.pandas_to_eland: Create an eland.Dataframe from pandas.DataFrame
|
eland.pandas_to_eland: Create an eland.Dataframe from pandas.DataFrame
|
||||||
"""
|
"""
|
||||||
return ed_df._to_pandas()
|
return ed_df._to_pandas(show_progress=show_progress)
|
||||||
|
|
||||||
|
|
||||||
def read_csv(filepath_or_buffer,
|
def read_csv(filepath_or_buffer,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user