Mirror of https://github.com/elastic/eland.git
Yield list of hits from _search_yield_hits() instead of individual hits
commit e4f88a34a6
parent 011bf29816
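At the call site in Operations (eland/operations.py, per the repository layout), the per-hit generator becomes a per-batch generator, and the batches are flattened back into one list with `sum(batches, [])`. A minimal sketch of that flattening idiom, using stand-in data rather than real eland search results:

from typing import Any, Dict, Generator, List


def yield_batches() -> Generator[List[Dict[str, Any]], None, None]:
    # Stand-in for _search_yield_hits(): each yield is one page of hits.
    yield [{"_id": "0"}, {"_id": "1"}]
    yield [{"_id": "2"}]


# sum(..., []) concatenates the per-page lists into one flat list,
# mirroring the new call site in the first hunk below.
flat: List[Dict[str, Any]] = sum(yield_batches(), [])
assert [hit["_id"] for hit in flat] == ["0", "1", "2"]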
@@ -1234,13 +1234,14 @@ class Operations:
         if sort_params:
             body["sort"] = [sort_params]
 
-        es_results = list(
-            search_yield_hits(
+        es_results: List[Dict[str, Any]] = sum(
+            _search_yield_hits(
                 query_compiler=query_compiler, body=body, max_number_of_hits=result_size
-            )
+            ),
+            [],
         )
 
-        _, df = query_compiler._es_results_to_pandas(
+        df = query_compiler._es_results_to_pandas(
             results=es_results, show_progress=show_progress
         )
         df = self._apply_df_post_processing(df, post_processing)
@@ -1447,14 +1448,16 @@ def quantile_to_percentile(quantile: Union[int, float]) -> float:
     return float(min(100, max(0, quantile * 100)))
 
 
-def search_yield_hits(
+def _search_yield_hits(
     query_compiler: "QueryCompiler",
     body: Dict[str, Any],
     max_number_of_hits: Optional[int],
-) -> Generator[Dict[str, Any], None, None]:
+) -> Generator[List[Dict[str, Any]], None, None]:
     """
     This is a generator used to initialize point in time API and query the
-    search API and return generator which yields an individual documents
+    search API and return generator which yields batches of hits as they
+    come in. No empty batches will be yielded, if there are no hits then
+    no batches will be yielded instead.
 
     Parameters
     ----------
@@ -1469,8 +1472,8 @@ def search_yield_hits(
     Examples
     --------
     >>> results = list(search_yield_hits(query_compiler, body, 2)) # doctest: +SKIP
-    [{'_index': 'flights', '_type': '_doc', '_id': '0', '_score': None, '_source': {...}, 'sort': [...]},
-     {'_index': 'flights', '_type': '_doc', '_id': '1', '_score': None, '_source': {...}, 'sort': [...]}]
+    [[{'_index': 'flights', '_type': '_doc', '_id': '0', '_score': None, '_source': {...}, 'sort': [...]},
+      {'_index': 'flights', '_type': '_doc', '_id': '1', '_score': None, '_source': {...}, 'sort': [...]}]]
     """
     # Make a copy of 'body' to avoid mutating it outside this function.
     body = body.copy()
@@ -1500,7 +1503,7 @@ def _search_with_scroll(
     query_compiler: "QueryCompiler",
     body: Dict[str, Any],
     max_number_of_hits: Optional[int],
-) -> Generator[Dict[str, Any], None, None]:
+) -> Generator[List[Dict[str, Any]], None, None]:
     # No documents, no reason to send a search.
     if max_number_of_hits == 0:
         return
@@ -1533,7 +1536,10 @@ def _search_with_scroll(
         hits_to_yield = min(len(hits), max_number_of_hits - hits_yielded)
 
         # Yield the hits we need to and then track the total number.
-        yield from hits[:hits_to_yield]
+        # Never yield an empty list as that makes things simpler for
+        # downstream consumers.
+        if hits and hits_to_yield > 0:
+            yield hits[:hits_to_yield]
         hits_yielded += hits_to_yield
 
         # Retrieve the next set of results
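The same guard appears in `_search_with_scroll` above and `_search_with_pit_and_search_after` below: cap each page at the remaining budget and skip empty pages, so consumers never receive an empty batch. A self-contained sketch of that behavior (names and data are illustrative, not eland code):

from typing import Any, Dict, Generator, List


def yield_non_empty(
    pages: List[List[Dict[str, Any]]], max_hits: int
) -> Generator[List[Dict[str, Any]], None, None]:
    # Mirrors the scroll/PIT loops: truncate each page to the remaining
    # budget and never yield an empty list.
    hits_yielded = 0
    for hits in pages:
        hits_to_yield = min(len(hits), max_hits - hits_yielded)
        if hits and hits_to_yield > 0:
            yield hits[:hits_to_yield]
        hits_yielded += hits_to_yield
        if hits_yielded >= max_hits:
            return


# The empty middle page is skipped and the last page is truncated to fit.
assert list(yield_non_empty([[{"a": 1}], [], [{"b": 2}, {"c": 3}]], 2)) == [
    [{"a": 1}],
    [{"b": 2}],
]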
@@ -1555,7 +1561,7 @@ def _search_with_pit_and_search_after(
     query_compiler: "QueryCompiler",
     body: Dict[str, Any],
     max_number_of_hits: Optional[int],
-) -> Generator[Dict[str, Any], None, None]:
+) -> Generator[List[Dict[str, Any]], None, None]:
 
     # No documents, no reason to send a search.
     if max_number_of_hits == 0:
@@ -1602,7 +1608,10 @@ def _search_with_pit_and_search_after(
         hits_to_yield = min(len(hits), max_number_of_hits - hits_yielded)
 
         # Yield the hits we need to and then track the total number.
-        yield from hits[:hits_to_yield]
+        # Never yield an empty list as that makes things simpler for
+        # downstream consumers.
+        if hits and hits_to_yield > 0:
+            yield hits[:hits_to_yield]
         hits_yielded += hits_to_yield
 
         # Set the 'search_after' for the next request
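The remaining hunks update the consumer side, `QueryCompiler._es_results_to_pandas()` (in eland/query_compiler.py, going by the repository layout). With batching now handled inside the generator, the method drops its `batch_size` parameter and the `partial_result` flag, and it returns the DataFrame directly rather than a `(partial_result, df)` tuple.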
@@ -148,7 +148,6 @@ class QueryCompiler:
     def _es_results_to_pandas(
         self,
         results: List[Dict[str, Any]],
-        batch_size: Optional[int] = None,
         show_progress: bool = False,
     ) -> "pd.Dataframe":
         """
@@ -239,10 +238,8 @@ class QueryCompiler:
         (which isn't great)
         NOTE - using this lists is generally not a good way to use this API
         """
-        partial_result = False
-
-        if results is None:
-            return partial_result, self._empty_pd_ef()
+        if not results:
+            return self._empty_pd_ef()
 
         # This is one of the most performance critical areas of eland, and it repeatedly calls
         # self._mappings.field_name_pd_dtype and self._mappings.date_field_format
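The new `if not results:` check is deliberately broader than the old `results is None` test: it also catches an empty list, which is now a normal outcome since `_search_yield_hits()` yields nothing when there are no hits. A small illustration of the falsiness check (the helper function here is hypothetical):

from typing import Any, Dict, List, Optional


def describe(results: Optional[List[Dict[str, Any]]]) -> str:
    # `not results` is True for both None and [], so a single check
    # replaces the old `results is None` test.
    if not results:
        return "empty"
    return f"{len(results)} hit(s)"


assert describe(None) == "empty"
assert describe([]) == "empty"
assert describe([{"_id": "0"}]) == "1 hit(s)"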
@@ -253,8 +250,7 @@ class QueryCompiler:
         index = []
 
         i = 0
-        for hit in results:
-            i = i + 1
+        for i, hit in enumerate(results, 1):
 
             if "_source" in hit:
                 row = hit["_source"]
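`enumerate(results, 1)` starts the counter at 1 and keeps `i` equal to the number of rows processed, so the manual `i = i + 1` bookkeeping goes away; combined with the early return on empty `results` above, the final progress `print` below always sees a bound `i`. A minimal illustration:

hits = [{"_id": "0"}, {"_id": "1"}, {"_id": "2"}]
for i, hit in enumerate(hits, 1):
    pass  # i is the 1-based count of rows processed so far
assert i == len(hits)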
@@ -277,11 +273,6 @@ class QueryCompiler:
             # flatten row to map correctly to 2D DataFrame
             rows.append(self._flatten_dict(row, field_mapping_cache))
 
-            if batch_size is not None:
-                if i >= batch_size:
-                    partial_result = True
-                    break
-
             if show_progress:
                 if i % DEFAULT_PROGRESS_REPORTING_NUM_ROWS == 0:
                     print(f"{datetime.now()}: read {i} rows")
@@ -310,7 +301,7 @@ class QueryCompiler:
         if show_progress:
             print(f"{datetime.now()}: read {i} rows")
 
-        return partial_result, df
+        return df
 
     def _flatten_dict(self, y, field_mapping_cache: "FieldMappingCache"):
         out = {}