Adding a new movie dataset to the tests. (#646)

2025-07-11 00:02:14 +08:00 · 2024-01-04 16:14:56 +01:00 · 2024-01-04 16:14:56 +01:00 · 05c5859b8a
commit 05c5859b8a
parent 0f91224daf
3 changed files with 63 additions and 4 deletions
--- a/tests/init.py
+++ b/tests/init.py
@ -163,6 +163,59 @@ ECOMMERCE_MAPPING = {
 ECOMMERCE_FILE_NAME = ROOT_DIR + "/ecommerce.json.gz"
 ECOMMERCE_DF_FILE_NAME = ROOT_DIR + "/ecommerce_df.json.gz"

+MOVIES_INDEX_NAME = "movies"
+MOVIES_FILE_NAME = ROOT_DIR + "/movies.json.gz"
+MOVIES_MAPPING = {
+    "mappings": {
+        "properties": {
+            "type": {"type": "keyword"},
+            "title": {"type": "text"},
+            "year": {"type": "integer"},
+            "rated": {"type": "keyword"},
+            "released": {"type": "date"},
+            "plot": {"type": "text"},
+            "awards": {"type": "text"},
+            "poster": {"type": "keyword"},
+            "id": {"type": "keyword"},
+            "metascore": {"type": "float"},
+            "imdbRating": {"type": "float"},
+            "imdbVotes": {"type": "integer"},
+            "language": {"type": "keyword"},
+            "runtime": {"type": "integer"},
+            "genres": {
+                "type": "text",
+                "fields": {
+                    "keyword": {"type": "keyword"},
+                },
+            },
+            "directors": {
+                "type": "text",
+                "fields": {
+                    "keyword": {"type": "keyword"},
+                },
+            },
+            "writers": {
+                "type": "text",
+                "fields": {
+                    "keyword": {"type": "keyword"},
+                },
+            },
+            "actors": {
+                "type": "text",
+                "fields": {
+                    "keyword": {"type": "keyword"},
+                },
+            },
+            "country": {
+                "type": "text",
+                "fields": {
+                    "keyword": {"type": "keyword"},
+                },
+            },
+        }
+    }
+}
+
 TEST_MAPPING1 = {
    "mappings": {
        "properties": {
--- a/tests/movies.json.gz
+++ b/tests/movies.json.gz
--- a/tests/setup_tests.py
+++ b/tests/setup_tests.py
@ -30,6 +30,9 @@ from tests import (
    FLIGHTS_MAPPING,
    FLIGHTS_SMALL_FILE_NAME,
    FLIGHTS_SMALL_INDEX_NAME,
+    MOVIES_FILE_NAME,
+    MOVIES_INDEX_NAME,
+    MOVIES_MAPPING,
    TEST_MAPPING1,
    TEST_MAPPING1_INDEX_NAME,
    TEST_NESTED_USER_GROUP_DOCS,
@ -41,6 +44,7 @@ DATA_LIST = [
    (FLIGHTS_FILE_NAME, FLIGHTS_INDEX_NAME, FLIGHTS_MAPPING),
    (FLIGHTS_SMALL_FILE_NAME, FLIGHTS_SMALL_INDEX_NAME, FLIGHTS_MAPPING),
    (ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING),
+    (MOVIES_FILE_NAME, MOVIES_INDEX_NAME, MOVIES_MAPPING),
 ]


@ -58,18 +62,20 @@ def _setup_data(es):
        es.indices.create(index=index_name, **mapping)

        df = pd.read_json(json_file_name, lines=True)
-
        actions = []
        n = 0

        print("Adding", df.shape[0], "items to index:", index_name)
        for index, row in df.iterrows():
-            values = row.to_dict()
+            values = row.dropna().to_dict()
            # make timestamp datetime 2018-01-01T12:09:35
            # values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S')

-            # Use integer as id field for repeatable results
-            action = {"_index": index_name, "_source": values, "_id": str(n)}
+            # Use the row's "id" field as the document id if the field exists.
+            # Otherwise, use an integer as the id for repeatable results.
+            # document_id = values['id'] if 'id' in values else str(n)
+            document_id = values["id"] if "id" in values else str(n)
+            action = {"_index": index_name, "_source": values, "_id": document_id}

            actions.append(action)