Mirror of https://github.com/elastic/eland.git, synced 2025-07-11 00:02:14 +08:00
Fix non _source fields missing from the result hits (#693)
This commit is contained in:
parent 632074c0f0
commit 1014ecdb39
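Context: when a query selects fields that Elasticsearch does not store in _source (for example copy_to destinations), they are returned under the hit's "fields" key, and eland previously dropped them. A minimal sketch of the failure mode — the hit layout and values below are illustrative, modeled on the flights test data touched later in this diff:

    # A raw search hit: "Cities" is a copy_to destination, so it comes back
    # under "fields" but is absent from "_source".
    hit = {
        "_index": "flights",
        "_id": "0",
        "_source": {"OriginCityName": "Frankfurt am Main", "DestCityName": "Sydney"},
        "fields": {"Cities": ["Sydney", "Frankfurt am Main"]},
    }

    # Before this commit eland read only hit["_source"], losing "Cities".
    # After it, values under "fields" that are missing from "_source" are
    # merged in, with lists sorted for a deterministic order:
    #   {"OriginCityName": "Frankfurt am Main",
    #    "DestCityName": "Sydney",
    #    "Cities": ["Frankfurt am Main", "Sydney"]}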
@@ -83,7 +83,7 @@ class DataFrame(NDFrame):
     3   181.694216      True  ...          0 2018-01-01 10:33:28
     4   730.041778     False  ...          0 2018-01-01 05:13:00
     <BLANKLINE>
-    [5 rows x 27 columns]
+    [5 rows x 28 columns]

     Constructing DataFrame from an Elasticsearch client and an Elasticsearch index

@@ -173,13 +173,13 @@ class DataFrame(NDFrame):
         >>> df = ed.DataFrame('http://localhost:9200', 'flights')
         >>> assert isinstance(df.columns, pd.Index)
         >>> df.columns
-        Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
-        ...    'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
-        ...    'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
-        ...    'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
-        ...    'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
-        ...    'timestamp'],
-        ...    dtype='object')
+        Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Cities', 'Dest', 'DestAirportID', 'DestCityName',
+               'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
+               'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
+               'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
+               'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
+               'timestamp'],
+              dtype='object')
         """
         return self._query_compiler.columns

@@ -2014,9 +2014,9 @@ class DataFrame(NDFrame):
         --------
         >>> df = ed.DataFrame('http://localhost:9200', 'flights')
         >>> df.shape
-        (13059, 27)
+        (13059, 28)
         >>> df.query('FlightDelayMin > 60').shape
-        (2730, 27)
+        (2730, 28)
         """
         if isinstance(expr, BooleanFilter):
             return DataFrame(
@@ -262,7 +262,7 @@ def eland_to_pandas(ed_df: DataFrame, show_progress: bool = False) -> pd.DataFrame:
     3   181.694216      True  ...          0 2018-01-01 10:33:28
     4   730.041778     False  ...          0 2018-01-01 05:13:00
     <BLANKLINE>
-    [5 rows x 27 columns]
+    [5 rows x 28 columns]

     Convert `eland.DataFrame` to `pandas.DataFrame` (Note: this loads entire Elasticsearch index into core memory)

@@ -277,7 +277,7 @@ def eland_to_pandas(ed_df: DataFrame, show_progress: bool = False) -> pd.DataFrame:
     3   181.694216      True  ...          0 2018-01-01 10:33:28
     4   730.041778     False  ...          0 2018-01-01 05:13:00
     <BLANKLINE>
-    [5 rows x 27 columns]
+    [5 rows x 28 columns]

     Convert `eland.DataFrame` to `pandas.DataFrame` and show progress every 10000 rows

@@ -1543,6 +1543,16 @@ def quantile_to_percentile(quantile: Union[int, float]) -> float:
     return float(min(100, max(0, quantile * 100)))


+def is_field_already_present(key: str, dictionary: Dict[str, Any]) -> bool:
+    if "." in key:
+        splitted = key.split(".")
+        return is_field_already_present(
+            ".".join(splitted[1:]), dictionary.get(splitted[0], {})
+        )
+    else:
+        return key in dictionary
+
+
 def _search_yield_hits(
     query_compiler: "QueryCompiler",
     body: Dict[str, Any],
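The new helper resolves dotted field names against nested _source objects; a quick sketch of its behavior (inputs invented):

    nested = {"DestLocation": {"lat": "-33.94", "lon": "151.17"}}
    is_field_already_present("DestLocation.lat", nested)  # True
    is_field_already_present("DestLocation.alt", nested)  # False
    is_field_already_present("Cities", nested)            # False

Note the dictionary.get(splitted[0], {}) default: a lookup through a missing parent object returns False instead of raising a KeyError.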
@@ -1600,10 +1610,24 @@ def _search_yield_hits(

     # Modify the search with the new point in time ID and keep-alive time.
     body["pit"] = {"id": pit_id, "keep_alive": DEFAULT_PIT_KEEP_ALIVE}
+    if isinstance(body["_source"], list):
+        body["fields"] = body["_source"]

     while max_number_of_hits is None or hits_yielded < max_number_of_hits:
         resp = client.search(**body)
-        hits: List[Dict[str, Any]] = resp["hits"]["hits"]
+        hits: List[Dict[str, Any]] = []
+        for hit in resp["hits"]["hits"]:
+            # Copy some of the fields to _source if they are missing there.
+            if "fields" in hit and "_source" in hit:
+                fields = hit["fields"]
+                del hit["fields"]
+                for k, v in fields.items():
+                    if not is_field_already_present(k, hit["_source"]):
+                        if isinstance(v, list):
+                            hit["_source"][k] = list(sorted(v))
+                        else:
+                            hit["_source"][k] = v
+            hits.append(hit)

         # The point in time ID can change between searches so we
         # need to keep the next search up-to-date
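Two changes work together here: the request mirrors the _source field list into the "fields" option (only when _source is a list of names), since the fields API can return values that _source cannot, and the per-hit loop then folds those values back into _source wherever is_field_already_present reports them missing. Roughly, the body sent to client.search now looks like this (field names illustrative; pit_id and DEFAULT_PIT_KEEP_ALIVE are the names used in the surrounding code):

    body = {
        "_source": ["OriginCityName", "DestCityName", "Cities"],
        "fields": ["OriginCityName", "DestCityName", "Cities"],  # mirrored from _source
        "pit": {"id": pit_id, "keep_alive": DEFAULT_PIT_KEEP_ALIVE},
    }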
@@ -43,7 +43,7 @@ FLIGHTS_MAPPING = {
             "Carrier": {"type": "keyword"},
             "Dest": {"type": "keyword"},
             "DestAirportID": {"type": "keyword"},
-            "DestCityName": {"type": "keyword"},
+            "DestCityName": {"type": "keyword", "copy_to": "Cities"},
             "DestCountry": {"type": "keyword"},
             "DestLocation": {"type": "geo_point"},
             "DestRegion": {"type": "keyword"},
@@ -58,11 +58,12 @@ FLIGHTS_MAPPING = {
             "FlightTimeMin": {"type": "float"},
             "Origin": {"type": "keyword"},
             "OriginAirportID": {"type": "keyword"},
-            "OriginCityName": {"type": "keyword"},
+            "OriginCityName": {"type": "keyword", "copy_to": "Cities"},
             "OriginCountry": {"type": "keyword"},
             "OriginLocation": {"type": "geo_point"},
             "OriginRegion": {"type": "keyword"},
             "OriginWeather": {"type": "keyword"},
+            "Cities": {"type": "text"},
             "dayOfWeek": {"type": "byte"},
             "timestamp": {"type": "date", "format": "strict_date_hour_minute_second"},
         }
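Background on the mapping change: copy_to indexes OriginCityName and DestCityName into Cities, but copy_to destinations are not written into the stored _source document, so Cities is only retrievable through the fields option — exactly the case the reader change above handles. A hedged sketch with the elasticsearch-py client (index name and connection details assumed):

    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://localhost:9200")
    resp = es.search(
        index="flights",
        source=["OriginCityName", "DestCityName"],
        fields=["Cities"],  # copy_to targets are returned here...
        size=1,
    )
    hit = resp["hits"]["hits"][0]
    assert "Cities" not in hit["_source"]  # ...and never in _source
    assert "Cities" in hit["fields"]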
@@ -46,6 +46,10 @@ _pd_flights = pd.DataFrame.from_records(flight_records).reindex(
     _ed_flights.columns, axis=1
 )
 _pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
+# Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list
+_pd_flights["Cities"] = _pd_flights.apply(
+    lambda x: list(sorted([x["OriginCityName"], x["DestCityName"]])), axis=1
+)
 _pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int

 _pd_flights_small = _pd_flights.head(48)
@@ -43,6 +43,7 @@ class TestDataFrameDtypes:
             "AvgTicketPrice": "float",
             "Cancelled": "boolean",
             "Carrier": "keyword",
+            "Cities": "text",
             "Dest": "keyword",
             "DestAirportID": "keyword",
             "DestCityName": "keyword",
@@ -41,8 +41,9 @@ class TestDataFrameToCSV(TestData):
             results_file,
             index_col=0,
             converters={
-                "DestLocation": lambda x: ast.literal_eval(x),
-                "OriginLocation": lambda x: ast.literal_eval(x),
+                "DestLocation": ast.literal_eval,
+                "OriginLocation": ast.literal_eval,
+                "Cities": ast.literal_eval,
             },
         )
         pd_from_csv.index = pd_from_csv.index.map(str)
@@ -63,8 +64,9 @@ class TestDataFrameToCSV(TestData):
             results_file,
             index_col=0,
             converters={
-                "DestLocation": lambda x: ast.literal_eval(x),
-                "OriginLocation": lambda x: ast.literal_eval(x),
+                "DestLocation": ast.literal_eval,
+                "OriginLocation": ast.literal_eval,
+                "Cities": ast.literal_eval,
             },
         )
         pd_from_csv.index = pd_from_csv.index.map(str)
@@ -112,8 +114,9 @@ class TestDataFrameToCSV(TestData):
             results,
             index_col=0,
             converters={
-                "DestLocation": lambda x: ast.literal_eval(x),
-                "OriginLocation": lambda x: ast.literal_eval(x),
+                "DestLocation": ast.literal_eval,
+                "OriginLocation": ast.literal_eval,
+                "Cities": ast.literal_eval,
             },
         )
         pd_from_csv.index = pd_from_csv.index.map(str)
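Why the converter set grew: list- and dict-valued columns such as Cities, DestLocation, and OriginLocation are written to CSV as their Python repr, so reading them back requires ast.literal_eval; passing the function directly also replaces the redundant lambda wrappers. A tiny example (cell value assumed):

    import ast

    cell = "['Frankfurt am Main', 'Sydney']"  # how a Cities cell appears in the CSV
    assert ast.literal_eval(cell) == ["Frankfurt am Main", "Sydney"]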
File diff suppressed because one or more lines are too long
@@ -19,7 +19,7 @@
    {
     "data": {
      "text/plain": [
-      "False"
+      "HeadApiResponse(False)"
      ]
     },
     "execution_count": 2,
@@ -43,8 +43,8 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "2021-03-30 11:57:39.116425: read 10000 rows\n",
-     "2021-03-30 11:57:39.522722: read 13059 rows\n"
+     "2024-05-21 09:07:17.882569: read 10000 rows\n",
+     "2024-05-21 09:07:18.375305: read 13059 rows\n"
     ]
    }
   ],
@@ -78,6 +78,18 @@
    "execution_count": 5,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/codespace/.python/current/lib/python3.10/site-packages/eland/etl.py:529: FutureWarning: the 'mangle_dupe_cols' keyword is deprecated and will be removed in a future version. Please take steps to stop the use of 'mangle_dupe_cols'\n",
+      "  reader = pd.read_csv(filepath_or_buffer, **kwargs)\n",
+      "/home/codespace/.python/current/lib/python3.10/site-packages/eland/etl.py:529: FutureWarning: The squeeze argument has been deprecated and will be removed in a future version. Append .squeeze(\"columns\") to the call to squeeze.\n",
+      "\n",
+      "\n",
+      "  reader = pd.read_csv(filepath_or_buffer, **kwargs)\n"
+     ]
+    },
    {
     "data": {
      "text/html": [
@@ -218,35 +230,7 @@
    {
     "data": {
      "text/plain": [
-      "{'took': 0,\n",
-      " 'timed_out': False,\n",
-      " '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},\n",
-      " 'hits': {'total': {'value': 2, 'relation': 'eq'},\n",
-      "  'max_score': 1.0,\n",
-      "  'hits': [{'_index': 'churn',\n",
-      "    '_id': '0',\n",
-      "    '_score': 1.0,\n",
-      "    '_source': {'state': 'KS',\n",
-      "     'account length': 128,\n",
-      "     'area code': 415,\n",
-      "     'phone number': '382-4657',\n",
-      "     'international plan': 'no',\n",
-      "     'voice mail plan': 'yes',\n",
-      "     'number vmail messages': 25,\n",
-      "     'total day minutes': 265.1,\n",
-      "     'total day calls': 110,\n",
-      "     'total day charge': 45.07,\n",
-      "     'total eve minutes': 197.4,\n",
-      "     'total eve calls': 99,\n",
-      "     'total eve charge': 16.78,\n",
-      "     'total night minutes': 244.7,\n",
-      "     'total night calls': 91,\n",
-      "     'total night charge': 11.01,\n",
-      "     'total intl minutes': 10.0,\n",
-      "     'total intl calls': 3,\n",
-      "     'total intl charge': 2.7,\n",
-      "     'customer service calls': 1,\n",
-      "     'churn': 0}}]}}"
+      "ObjectApiResponse({'took': 0, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'churn', '_id': '0', '_score': 1.0, '_source': {'state': 'KS', 'account length': 128, 'area code': 415, 'phone number': '382-4657', 'international plan': 'no', 'voice mail plan': 'yes', 'number vmail messages': 25, 'total day minutes': 265.1, 'total day calls': 110, 'total day charge': 45.07, 'total eve minutes': 197.4, 'total eve calls': 99, 'total eve charge': 16.78, 'total night minutes': 244.7, 'total night calls': 91, 'total night charge': 11.01, 'total intl minutes': 10.0, 'total intl calls': 3, 'total intl charge': 2.7, 'customer service calls': 1, 'churn': 0}}]}})"
      ]
     },
     "execution_count": 6,
@@ -267,7 +251,7 @@
    {
     "data": {
      "text/plain": [
-      "{'acknowledged': True}"
+      "ObjectApiResponse({'acknowledged': True})"
      ]
     },
     "execution_count": 7,
@@ -297,7 +281,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.10.13"
   }
  },
  "nbformat": 4,
@@ -33,10 +33,10 @@
    {
     "data": {
      "text/plain": [
-      "AvgTicketPrice 640.387285\n",
+      "AvgTicketPrice 639.433214\n",
       "Cancelled False\n",
-      "dayOfWeek 3\n",
-      "timestamp 2018-01-21 23:43:19.256498944\n",
+      "dayOfWeek 2\n",
+      "timestamp 2018-01-21 20:23:15.159835648\n",
       "dtype: object"
      ]
     },
@@ -58,9 +58,9 @@
    {
     "data": {
      "text/plain": [
-      "AvgTicketPrice 640.387285\n",
+      "AvgTicketPrice 639.433214\n",
       "Cancelled 0.000000\n",
-      "dayOfWeek 3.000000\n",
+      "dayOfWeek 2.935777\n",
       "dtype: float64"
      ]
     },
@@ -82,10 +82,10 @@
    {
     "data": {
      "text/plain": [
-      "AvgTicketPrice 640.387285\n",
+      "AvgTicketPrice 639.433214\n",
       "Cancelled False\n",
-      "dayOfWeek 3\n",
-      "timestamp 2018-01-21 23:43:19.256498944\n",
+      "dayOfWeek 2\n",
+      "timestamp 2018-01-21 20:23:15.159835648\n",
       "DestCountry NaN\n",
       "dtype: object"
      ]
@@ -108,7 +108,7 @@
    {
     "data": {
      "text/plain": [
-      "AvgTicketPrice 213.430365\n",
+      "AvgTicketPrice 213.453156\n",
       "dayOfWeek 2.000000\n",
       "dtype: float64"
      ]
@@ -131,7 +131,7 @@
    {
     "data": {
      "text/plain": [
-      "AvgTicketPrice 213.430365\n",
+      "AvgTicketPrice 213.453156\n",
       "dayOfWeek 2.000000\n",
       "dtype: float64"
      ]
@@ -154,7 +154,7 @@
    {
     "data": {
      "text/plain": [
-      "AvgTicketPrice 213.430365\n",
+      "AvgTicketPrice 213.453156\n",
       "Cancelled NaN\n",
       "dayOfWeek 2.0\n",
       "timestamp NaT\n",
@@ -189,7 +189,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.10.13"
   }
  },
  "nbformat": 4,
File diff suppressed because one or more lines are too long