mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Stream writes in to_csv()
Co-authored-by: P. Sai Vinay <pvinay1998@gmail.com>
This commit is contained in:
parent
adf0535608
commit
28e6d92430
@ -41,7 +41,6 @@ if TYPE_CHECKING:
|
|||||||
# Default number of rows displayed (different to pandas where ALL could be displayed)
|
# Default number of rows displayed (different to pandas where ALL could be displayed)
|
||||||
DEFAULT_NUM_ROWS_DISPLAYED = 60
|
DEFAULT_NUM_ROWS_DISPLAYED = 60
|
||||||
DEFAULT_CHUNK_SIZE = 10000
|
DEFAULT_CHUNK_SIZE = 10000
|
||||||
DEFAULT_CSV_BATCH_OUTPUT_SIZE = 10000
|
|
||||||
DEFAULT_PROGRESS_REPORTING_NUM_ROWS = 10000
|
DEFAULT_PROGRESS_REPORTING_NUM_ROWS = 10000
|
||||||
DEFAULT_SEARCH_SIZE = 5000
|
DEFAULT_SEARCH_SIZE = 5000
|
||||||
DEFAULT_PIT_KEEP_ALIVE = "3m"
|
DEFAULT_PIT_KEEP_ALIVE = "3m"
|
||||||
|
@ -1218,6 +1218,36 @@ class Operations:
|
|||||||
["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
|
["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def to_csv(  # type: ignore
    self,
    query_compiler: "QueryCompiler",
    path_or_buf=None,
    header: bool = True,
    mode: str = "w",
    show_progress: bool = False,
    **kwargs,
) -> Optional[str]:
    """Write the query results as CSV, one search batch at a time.

    Each DataFrame yielded by ``self.search_yield_pandas_dataframes`` is
    serialised with ``DataFrame.to_csv`` as soon as it arrives, so the full
    result set is never concatenated into a single DataFrame.

    Parameters
    ----------
    query_compiler:
        Passed through to ``search_yield_pandas_dataframes``.
    path_or_buf:
        Forwarded to ``DataFrame.to_csv`` for every batch. If ``None``, the
        CSV text of all batches is collected and returned as one string.
    header:
        Forwarded to ``DataFrame.to_csv`` for the first batch only; later
        batches are always written without a header.
    mode:
        Forwarded to ``DataFrame.to_csv`` for the first batch only; later
        batches are always written with mode ``"a"`` (append).
    show_progress:
        If true, print a progress line each time the cumulative row count
        passes another multiple of ``DEFAULT_PROGRESS_REPORTING_NUM_ROWS``.
    **kwargs:
        Any further keyword arguments, forwarded unchanged to
        ``DataFrame.to_csv`` for every batch.

    Returns
    -------
    Optional[str]
        The concatenated CSV text if ``path_or_buf`` is ``None`` (an empty
        string when the search yields no batches), otherwise ``None``.
    """
    return_as_string = path_or_buf is None
    csv_chunks = []
    processed = 0
    intervals_reported = 0

    for i, df in enumerate(
        self.search_yield_pandas_dataframes(query_compiler=query_compiler)
    ):
        processed += df.shape[0]

        if show_progress:
            # Report when a new interval boundary has been crossed rather than
            # only when the running total is an exact multiple of the interval;
            # batch sizes are not guaranteed to divide it evenly.
            intervals_done = processed // DEFAULT_PROGRESS_REPORTING_NUM_ROWS
            if intervals_done > intervals_reported:
                intervals_reported = intervals_done
                print(f"{datetime.now()}: read {processed} rows")

        is_first_batch = i == 0
        csv_text = df.to_csv(
            path_or_buf=path_or_buf,
            # start appending after the first batch
            mode=mode if is_first_batch else "a",
            # only write the header for the first batch, if wanted at all
            header=header if is_first_batch else False,
            **kwargs,
        )

        # DataFrame.to_csv returns text only when no target was given; when
        # writing to a path or buffer it returns None, which is not kept.
        if return_as_string:
            csv_chunks.append(csv_text)

    if return_as_string:
        return "".join(csv_chunks)
    return None
|
||||||
|
|
||||||
def to_pandas(
|
def to_pandas(
|
||||||
self, query_compiler: "QueryCompiler", show_progress: bool = False
|
self, query_compiler: "QueryCompiler", show_progress: bool = False
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
@ -1239,16 +1269,6 @@ class Operations:
|
|||||||
return query_compiler._empty_pd_ef()
|
return query_compiler._empty_pd_ef()
|
||||||
return pd.concat(df_list)
|
return pd.concat(df_list)
|
||||||
|
|
||||||
def to_csv(
|
|
||||||
self,
|
|
||||||
query_compiler: "QueryCompiler",
|
|
||||||
show_progress: bool = False,
|
|
||||||
**kwargs: Union[bool, str],
|
|
||||||
) -> Optional[str]:
|
|
||||||
return self.to_pandas( # type: ignore[no-any-return]
|
|
||||||
query_compiler=query_compiler, show_progress=show_progress
|
|
||||||
).to_csv(**kwargs)
|
|
||||||
|
|
||||||
def search_yield_pandas_dataframes(
|
def search_yield_pandas_dataframes(
|
||||||
self, query_compiler: "QueryCompiler"
|
self, query_compiler: "QueryCompiler"
|
||||||
) -> Generator["pd.DataFrame", None, None]:
|
) -> Generator["pd.DataFrame", None, None]:
|
||||||
|
@ -497,7 +497,7 @@ class QueryCompiler:
|
|||||||
return self._update_query(QueryFilter(query))
|
return self._update_query(QueryFilter(query))
|
||||||
|
|
||||||
# To/From Pandas
|
# To/From Pandas
|
||||||
def to_pandas(self, show_progress: bool = False):
|
def to_pandas(self, show_progress: bool = False) -> pd.DataFrame:
|
||||||
"""Converts Eland DataFrame to Pandas DataFrame.
|
"""Converts Eland DataFrame to Pandas DataFrame.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
@ -512,7 +512,7 @@ class QueryCompiler:
|
|||||||
Returns:
|
Returns:
|
||||||
If path_or_buf is None, returns the resulting csv format as a string. Otherwise returns None.
|
If path_or_buf is None, returns the resulting csv format as a string. Otherwise returns None.
|
||||||
"""
|
"""
|
||||||
return self._operations.to_csv(self, **kwargs)
|
return self._operations.to_csv(query_compiler=self, **kwargs)
|
||||||
|
|
||||||
def search_yield_pandas_dataframes(self) -> Generator["pd.DataFrame", None, None]:
|
def search_yield_pandas_dataframes(self) -> Generator["pd.DataFrame", None, None]:
|
||||||
return self._operations.search_yield_pandas_dataframes(self)
|
return self._operations.search_yield_pandas_dataframes(self)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user