Fix non _source fields missing from the result hits (#693)

Bart Broere 2024-06-10 09:09:52 +02:00 committed by GitHub
parent 632074c0f0
commit 1014ecdb39
11 changed files with 240 additions and 221 deletions

View File

@@ -83,7 +83,7 @@ class DataFrame(NDFrame):
3 181.694216 True ... 0 2018-01-01 10:33:28
4 730.041778 False ... 0 2018-01-01 05:13:00
<BLANKLINE>
[5 rows x 27 columns]
[5 rows x 28 columns]
Constructing DataFrame from an Elasticsearch client and an Elasticsearch index
@@ -173,13 +173,13 @@ class DataFrame(NDFrame):
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
>>> assert isinstance(df.columns, pd.Index)
>>> df.columns
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID', 'DestCityName',
... 'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
... 'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
... 'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
... 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
... 'timestamp'],
... dtype='object')
Index(['AvgTicketPrice', 'Cancelled', 'Carrier', 'Cities', 'Dest', 'DestAirportID', 'DestCityName',
       'DestCountry', 'DestLocation', 'DestRegion', 'DestWeather', 'DistanceKilometers',
       'DistanceMiles', 'FlightDelay', 'FlightDelayMin', 'FlightDelayType', 'FlightNum',
       'FlightTimeHour', 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
       'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather', 'dayOfWeek',
       'timestamp'],
      dtype='object')
"""
return self._query_compiler.columns
@@ -2014,9 +2014,9 @@ class DataFrame(NDFrame):
--------
>>> df = ed.DataFrame('http://localhost:9200', 'flights')
>>> df.shape
(13059, 27)
(13059, 28)
>>> df.query('FlightDelayMin > 60').shape
(2730, 27)
(2730, 28)
"""
if isinstance(expr, BooleanFilter):
return DataFrame(

View File

@@ -262,7 +262,7 @@ def eland_to_pandas(ed_df: DataFrame, show_progress: bool = False) -> pd.DataFrame:
3 181.694216 True ... 0 2018-01-01 10:33:28
4 730.041778 False ... 0 2018-01-01 05:13:00
<BLANKLINE>
[5 rows x 27 columns]
[5 rows x 28 columns]
Convert `eland.DataFrame` to `pandas.DataFrame` (Note: this loads entire Elasticsearch index into core memory)
@@ -277,7 +277,7 @@ def eland_to_pandas(ed_df: DataFrame, show_progress: bool = False) -> pd.DataFrame:
3 181.694216 True ... 0 2018-01-01 10:33:28
4 730.041778 False ... 0 2018-01-01 05:13:00
<BLANKLINE>
[5 rows x 27 columns]
[5 rows x 28 columns]
Convert `eland.DataFrame` to `pandas.DataFrame` and show progress every 10000 rows

View File

@@ -1543,6 +1543,16 @@ def quantile_to_percentile(quantile: Union[int, float]) -> float:
    return float(min(100, max(0, quantile * 100)))


def is_field_already_present(key: str, dictionary: Dict[str, Any]) -> bool:
    if "." in key:
        splitted = key.split(".")
        return is_field_already_present(
            ".".join(splitted[1:]), dictionary.get(splitted[0], {})
        )
    else:
        return key in dictionary


def _search_yield_hits(
    query_compiler: "QueryCompiler",
    body: Dict[str, Any],
@@ -1600,10 +1610,24 @@ def _search_yield_hits(
    # Modify the search with the new point in time ID and keep-alive time.
    body["pit"] = {"id": pit_id, "keep_alive": DEFAULT_PIT_KEEP_ALIVE}

    if isinstance(body["_source"], list):
        body["fields"] = body["_source"]

    while max_number_of_hits is None or hits_yielded < max_number_of_hits:
        resp = client.search(**body)
        hits: List[Dict[str, Any]] = resp["hits"]["hits"]
        hits: List[Dict[str, Any]] = []
        for hit in resp["hits"]["hits"]:
            # Copy some of the fields to _source if they are missing there.
            if "fields" in hit and "_source" in hit:
                fields = hit["fields"]
                del hit["fields"]
                for k, v in fields.items():
                    if not is_field_already_present(k, hit["_source"]):
                        if isinstance(v, list):
                            hit["_source"][k] = list(sorted(v))
                        else:
                            hit["_source"][k] = v
            hits.append(hit)

        # The point in time ID can change between searches so we
        # need to keep the next search up-to-date
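To make the operations.py change concrete: is_field_already_present walks dotted keys recursively, so a key such as "DestLocation.lat" is checked as "lat" nested under "DestLocation", and the search loop folds values that Elasticsearch returns only under "fields" (copy_to targets are never written to _source) back into "_source" before the hits reach eland's result processing. A small sketch of both behaviours, with illustrative inputs that are not part of the commit:

>>> is_field_already_present("a.b", {"a": {"b": 1}})
True
>>> is_field_already_present("Cities", {"OriginCityName": "Sydney"})
False

hit = {
    "_id": "0",
    "_source": {"OriginCityName": "Sydney", "DestCityName": "Venice"},
    "fields": {"Cities": ["Venice", "Sydney"]},
}
# After the merge loop, "fields" is removed and the multi-valued entry
# lands in _source with a deterministic, sorted element order:
# {"OriginCityName": "Sydney", "DestCityName": "Venice",
#  "Cities": ["Sydney", "Venice"]}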

View File

@@ -43,7 +43,7 @@ FLIGHTS_MAPPING = {
"Carrier": {"type": "keyword"},
"Dest": {"type": "keyword"},
"DestAirportID": {"type": "keyword"},
"DestCityName": {"type": "keyword"},
"DestCityName": {"type": "keyword", "copy_to": "Cities"},
"DestCountry": {"type": "keyword"},
"DestLocation": {"type": "geo_point"},
"DestRegion": {"type": "keyword"},
@@ -58,11 +58,12 @@ FLIGHTS_MAPPING = {
"FlightTimeMin": {"type": "float"},
"Origin": {"type": "keyword"},
"OriginAirportID": {"type": "keyword"},
"OriginCityName": {"type": "keyword"},
"OriginCityName": {"type": "keyword", "copy_to": "Cities"},
"OriginCountry": {"type": "keyword"},
"OriginLocation": {"type": "geo_point"},
"OriginRegion": {"type": "keyword"},
"OriginWeather": {"type": "keyword"},
"Cities": {"type": "text"},
"dayOfWeek": {"type": "byte"},
"timestamp": {"type": "date", "format": "strict_date_hour_minute_second"},
}
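For context on the mapping change: copy_to tells Elasticsearch to index each document's origin and destination city names into the combined Cities field at ingest time, but copied values are never materialised in the document's _source; they can only be retrieved through the search "fields" option, which is exactly why the _source merge above is needed. A minimal sketch of observing this directly, assuming a local cluster with the flights index used throughout this repo:

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
resp = es.search(index="flights", fields=["Cities"], source=True, size=1)
hit = resp["hits"]["hits"][0]
# hit["fields"]["Cities"] is a list such as ["Sydney", "Venice"],
# while hit["_source"] has no "Cities" key at all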

View File

@@ -46,6 +46,10 @@ _pd_flights = pd.DataFrame.from_records(flight_records).reindex(
    _ed_flights.columns, axis=1
)
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
# Mimic what copy_to in an Elasticsearch mapping would do, combining the two fields in a list
_pd_flights["Cities"] = _pd_flights.apply(
    lambda x: list(sorted([x["OriginCityName"], x["DestCityName"]])), axis=1
)
_pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int
_pd_flights_small = _pd_flights.head(48)
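The sorted() call mirrors the list(sorted(v)) used when folding "fields" into "_source" in eland/operations.py, so the pandas fixture and the Elasticsearch-backed frame agree on element order: a row with OriginCityName "Venice" and DestCityName "Sydney" produces ['Sydney', 'Venice'] on both sides.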

View File

@@ -43,6 +43,7 @@ class TestDataFrameDtypes:
"AvgTicketPrice": "float",
"Cancelled": "boolean",
"Carrier": "keyword",
"Cities": "text",
"Dest": "keyword",
"DestAirportID": "keyword",
"DestCityName": "keyword",

View File

@@ -41,8 +41,9 @@ class TestDataFrameToCSV(TestData):
            results_file,
            index_col=0,
            converters={
                "DestLocation": lambda x: ast.literal_eval(x),
                "OriginLocation": lambda x: ast.literal_eval(x),
                "DestLocation": ast.literal_eval,
                "OriginLocation": ast.literal_eval,
                "Cities": ast.literal_eval,
            },
        )
        pd_from_csv.index = pd_from_csv.index.map(str)
@@ -63,8 +64,9 @@
            results_file,
            index_col=0,
            converters={
                "DestLocation": lambda x: ast.literal_eval(x),
                "OriginLocation": lambda x: ast.literal_eval(x),
                "DestLocation": ast.literal_eval,
                "OriginLocation": ast.literal_eval,
                "Cities": ast.literal_eval,
            },
        )
        pd_from_csv.index = pd_from_csv.index.map(str)
@@ -112,8 +114,9 @@
            results,
            index_col=0,
            converters={
                "DestLocation": lambda x: ast.literal_eval(x),
                "OriginLocation": lambda x: ast.literal_eval(x),
                "DestLocation": ast.literal_eval,
                "OriginLocation": ast.literal_eval,
                "Cities": ast.literal_eval,
            },
        )
        pd_from_csv.index = pd_from_csv.index.map(str)
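Two things change in each of these converters blocks: the redundant lambda wrappers are dropped (passing ast.literal_eval directly is equivalent), and the new list-valued Cities column gets the same treatment as the geo_point columns. The round trip needs the converter because to_csv writes list cells as their Python repr, which read_csv would otherwise hand back as plain strings:

>>> import ast
>>> ast.literal_eval("['Sydney', 'Venice']")
['Sydney', 'Venice']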

File diff suppressed because one or more lines are too long

View File

@@ -19,7 +19,7 @@
{
"data": {
"text/plain": [
"False"
"HeadApiResponse(False)"
]
},
"execution_count": 2,
@@ -43,8 +43,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"2021-03-30 11:57:39.116425: read 10000 rows\n",
"2021-03-30 11:57:39.522722: read 13059 rows\n"
"2024-05-21 09:07:17.882569: read 10000 rows\n",
"2024-05-21 09:07:18.375305: read 13059 rows\n"
]
}
],
@@ -78,6 +78,18 @@
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/codespace/.python/current/lib/python3.10/site-packages/eland/etl.py:529: FutureWarning: the 'mangle_dupe_cols' keyword is deprecated and will be removed in a future version. Please take steps to stop the use of 'mangle_dupe_cols'\n",
" reader = pd.read_csv(filepath_or_buffer, **kwargs)\n",
"/home/codespace/.python/current/lib/python3.10/site-packages/eland/etl.py:529: FutureWarning: The squeeze argument has been deprecated and will be removed in a future version. Append .squeeze(\"columns\") to the call to squeeze.\n",
"\n",
"\n",
" reader = pd.read_csv(filepath_or_buffer, **kwargs)\n"
]
},
{
"data": {
"text/html": [
@@ -218,35 +230,7 @@
{
"data": {
"text/plain": [
"{'took': 0,\n",
" 'timed_out': False,\n",
" '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},\n",
" 'hits': {'total': {'value': 2, 'relation': 'eq'},\n",
" 'max_score': 1.0,\n",
" 'hits': [{'_index': 'churn',\n",
" '_id': '0',\n",
" '_score': 1.0,\n",
" '_source': {'state': 'KS',\n",
" 'account length': 128,\n",
" 'area code': 415,\n",
" 'phone number': '382-4657',\n",
" 'international plan': 'no',\n",
" 'voice mail plan': 'yes',\n",
" 'number vmail messages': 25,\n",
" 'total day minutes': 265.1,\n",
" 'total day calls': 110,\n",
" 'total day charge': 45.07,\n",
" 'total eve minutes': 197.4,\n",
" 'total eve calls': 99,\n",
" 'total eve charge': 16.78,\n",
" 'total night minutes': 244.7,\n",
" 'total night calls': 91,\n",
" 'total night charge': 11.01,\n",
" 'total intl minutes': 10.0,\n",
" 'total intl calls': 3,\n",
" 'total intl charge': 2.7,\n",
" 'customer service calls': 1,\n",
" 'churn': 0}}]}}"
"ObjectApiResponse({'took': 0, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'churn', '_id': '0', '_score': 1.0, '_source': {'state': 'KS', 'account length': 128, 'area code': 415, 'phone number': '382-4657', 'international plan': 'no', 'voice mail plan': 'yes', 'number vmail messages': 25, 'total day minutes': 265.1, 'total day calls': 110, 'total day charge': 45.07, 'total eve minutes': 197.4, 'total eve calls': 99, 'total eve charge': 16.78, 'total night minutes': 244.7, 'total night calls': 91, 'total night charge': 11.01, 'total intl minutes': 10.0, 'total intl calls': 3, 'total intl charge': 2.7, 'customer service calls': 1, 'churn': 0}}]}})"
]
},
"execution_count": 6,
@@ -267,7 +251,7 @@
{
"data": {
"text/plain": [
"{'acknowledged': True}"
"ObjectApiResponse({'acknowledged': True})"
]
},
"execution_count": 7,
@@ -297,7 +281,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.10.13"
}
},
"nbformat": 4,

View File

@@ -33,10 +33,10 @@
{
"data": {
"text/plain": [
"AvgTicketPrice 640.387285\n",
"AvgTicketPrice 639.433214\n",
"Cancelled False\n",
"dayOfWeek 3\n",
"timestamp 2018-01-21 23:43:19.256498944\n",
"dayOfWeek 2\n",
"timestamp 2018-01-21 20:23:15.159835648\n",
"dtype: object"
]
},
@@ -58,9 +58,9 @@
{
"data": {
"text/plain": [
"AvgTicketPrice 640.387285\n",
"AvgTicketPrice 639.433214\n",
"Cancelled 0.000000\n",
"dayOfWeek 3.000000\n",
"dayOfWeek 2.935777\n",
"dtype: float64"
]
},
@@ -82,10 +82,10 @@
{
"data": {
"text/plain": [
"AvgTicketPrice 640.387285\n",
"AvgTicketPrice 639.433214\n",
"Cancelled False\n",
"dayOfWeek 3\n",
"timestamp 2018-01-21 23:43:19.256498944\n",
"dayOfWeek 2\n",
"timestamp 2018-01-21 20:23:15.159835648\n",
"DestCountry NaN\n",
"dtype: object"
]
@@ -108,7 +108,7 @@
{
"data": {
"text/plain": [
"AvgTicketPrice 213.430365\n",
"AvgTicketPrice 213.453156\n",
"dayOfWeek 2.000000\n",
"dtype: float64"
]
@@ -131,7 +131,7 @@
{
"data": {
"text/plain": [
"AvgTicketPrice 213.430365\n",
"AvgTicketPrice 213.453156\n",
"dayOfWeek 2.000000\n",
"dtype: float64"
]
@@ -154,7 +154,7 @@
{
"data": {
"text/plain": [
"AvgTicketPrice 213.430365\n",
"AvgTicketPrice 213.453156\n",
"Cancelled NaN\n",
"dayOfWeek 2.0\n",
"timestamp NaT\n",
@@ -189,7 +189,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.10.13"
}
},
"nbformat": 4,

File diff suppressed because one or more lines are too long