mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Stream writes in to_csv()
Co-authored-by: P. Sai Vinay <pvinay1998@gmail.com>
This commit is contained in:
parent
adf0535608
commit
28e6d92430
@ -41,7 +41,6 @@ if TYPE_CHECKING:
|
||||
# Default number of rows displayed (different to pandas where ALL could be displayed)
|
||||
DEFAULT_NUM_ROWS_DISPLAYED = 60
|
||||
DEFAULT_CHUNK_SIZE = 10000
|
||||
DEFAULT_CSV_BATCH_OUTPUT_SIZE = 10000
|
||||
DEFAULT_PROGRESS_REPORTING_NUM_ROWS = 10000
|
||||
DEFAULT_SEARCH_SIZE = 5000
|
||||
DEFAULT_PIT_KEEP_ALIVE = "3m"
|
||||
|
@ -1218,6 +1218,36 @@ class Operations:
|
||||
["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
|
||||
)
|
||||
|
||||
def to_csv(  # type: ignore
    self,
    query_compiler: "QueryCompiler",
    path_or_buf=None,
    header: bool = True,
    mode: str = "w",
    show_progress: bool = False,
    **kwargs,
) -> Optional[str]:
    """Write the query results as CSV, one search batch at a time.

    Each DataFrame yielded by ``search_yield_pandas_dataframes`` is
    serialised with ``pandas.DataFrame.to_csv`` as soon as it arrives, so
    the full result set is never concatenated into a single DataFrame.

    Parameters:
        query_compiler: compiler passed through to
            ``search_yield_pandas_dataframes``.
        path_or_buf: forwarded to ``DataFrame.to_csv``; ``None`` means the
            CSV text is returned instead of written.
        header: forwarded to ``DataFrame.to_csv`` for the first batch only;
            later batches never write a header.
        mode: forwarded to ``DataFrame.to_csv`` for the first batch only;
            later batches always use ``"a"``.
        show_progress: if True, print the running row count each time it
            passes another multiple of DEFAULT_PROGRESS_REPORTING_NUM_ROWS.
        **kwargs: any further ``DataFrame.to_csv`` options, applied to
            every batch.

    Returns:
        If path_or_buf is None, returns the resulting csv format as a string. Otherwise returns None.
    """
    chunks = []
    processed = 0
    reported_multiples = 0

    for i, df in enumerate(
        self.search_yield_pandas_dataframes(query_compiler=query_compiler)
    ):
        processed += df.shape[0]
        if show_progress:
            # Report whenever another multiple of the reporting interval has
            # been passed. An exact ``% == 0`` check stays silent when batch
            # sizes do not divide the interval evenly.
            multiples = processed // DEFAULT_PROGRESS_REPORTING_NUM_ROWS
            if multiples > reported_multiples:
                reported_multiples = multiples
                print(f"{datetime.now()}: read {processed} rows")

        is_first = i == 0
        chunks.append(
            df.to_csv(
                path_or_buf=path_or_buf,
                # start appending after the first batch
                mode=mode if is_first else "a",
                # only write the header for the first batch, if wanted at all
                header=header if is_first else False,
                **kwargs,
            )
        )

    if not chunks:
        # No batches at all: still emit the (header-only) CSV of the empty
        # frame, the same frame to_pandas() returns for an empty result.
        # Otherwise a target file would be neither created nor truncated.
        chunks.append(
            query_compiler._empty_pd_ef().to_csv(
                path_or_buf=path_or_buf,
                mode=mode,
                header=header,
                **kwargs,
            )
        )

    if path_or_buf is None:
        return "".join(chunks)
    return None
|
||||
|
||||
def to_pandas(
|
||||
self, query_compiler: "QueryCompiler", show_progress: bool = False
|
||||
) -> pd.DataFrame:
|
||||
@ -1239,16 +1269,6 @@ class Operations:
|
||||
return query_compiler._empty_pd_ef()
|
||||
return pd.concat(df_list)
|
||||
|
||||
def to_csv(
|
||||
self,
|
||||
query_compiler: "QueryCompiler",
|
||||
show_progress: bool = False,
|
||||
**kwargs: Union[bool, str],
|
||||
) -> Optional[str]:
|
||||
return self.to_pandas( # type: ignore[no-any-return]
|
||||
query_compiler=query_compiler, show_progress=show_progress
|
||||
).to_csv(**kwargs)
|
||||
|
||||
def search_yield_pandas_dataframes(
|
||||
self, query_compiler: "QueryCompiler"
|
||||
) -> Generator["pd.DataFrame", None, None]:
|
||||
|
@ -497,7 +497,7 @@ class QueryCompiler:
|
||||
return self._update_query(QueryFilter(query))
|
||||
|
||||
# To/From Pandas
|
||||
def to_pandas(self, show_progress: bool = False):
|
||||
def to_pandas(self, show_progress: bool = False) -> pd.DataFrame:
|
||||
"""Converts Eland DataFrame to Pandas DataFrame.
|
||||
|
||||
Returns:
|
||||
@ -512,7 +512,7 @@ class QueryCompiler:
|
||||
Returns:
|
||||
If path_or_buf is None, returns the resulting csv format as a string. Otherwise returns None.
|
||||
"""
|
||||
return self._operations.to_csv(self, **kwargs)
|
||||
return self._operations.to_csv(query_compiler=self, **kwargs)
|
||||
|
||||
def search_yield_pandas_dataframes(self) -> Generator["pd.DataFrame", None, None]:
|
||||
return self._operations.search_yield_pandas_dataframes(self)
|
||||
|
Loading…
x
Reference in New Issue
Block a user