diff --git a/tests/__init__.py b/tests/__init__.py index a8d1e77..36a107b 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -163,6 +163,59 @@ ECOMMERCE_MAPPING = { ECOMMERCE_FILE_NAME = ROOT_DIR + "/ecommerce.json.gz" ECOMMERCE_DF_FILE_NAME = ROOT_DIR + "/ecommerce_df.json.gz" +MOVIES_INDEX_NAME = "movies" +MOVIES_FILE_NAME = ROOT_DIR + "/movies.json.gz" +MOVIES_MAPPING = { + "mappings": { + "properties": { + "type": {"type": "keyword"}, + "title": {"type": "text"}, + "year": {"type": "integer"}, + "rated": {"type": "keyword"}, + "released": {"type": "date"}, + "plot": {"type": "text"}, + "awards": {"type": "text"}, + "poster": {"type": "keyword"}, + "id": {"type": "keyword"}, + "metascore": {"type": "float"}, + "imdbRating": {"type": "float"}, + "imdbVotes": {"type": "integer"}, + "language": {"type": "keyword"}, + "runtime": {"type": "integer"}, + "genres": { + "type": "text", + "fields": { + "keyword": {"type": "keyword"}, + }, + }, + "directors": { + "type": "text", + "fields": { + "keyword": {"type": "keyword"}, + }, + }, + "writers": { + "type": "text", + "fields": { + "keyword": {"type": "keyword"}, + }, + }, + "actors": { + "type": "text", + "fields": { + "keyword": {"type": "keyword"}, + }, + }, + "country": { + "type": "text", + "fields": { + "keyword": {"type": "keyword"}, + }, + }, + } + } +} + TEST_MAPPING1 = { "mappings": { "properties": { diff --git a/tests/movies.json.gz b/tests/movies.json.gz new file mode 100644 index 0000000..47e6bbb Binary files /dev/null and b/tests/movies.json.gz differ diff --git a/tests/setup_tests.py b/tests/setup_tests.py index 953aeb9..c0c353a 100644 --- a/tests/setup_tests.py +++ b/tests/setup_tests.py @@ -30,6 +30,9 @@ from tests import ( FLIGHTS_MAPPING, FLIGHTS_SMALL_FILE_NAME, FLIGHTS_SMALL_INDEX_NAME, + MOVIES_FILE_NAME, + MOVIES_INDEX_NAME, + MOVIES_MAPPING, TEST_MAPPING1, TEST_MAPPING1_INDEX_NAME, TEST_NESTED_USER_GROUP_DOCS, @@ -41,6 +44,7 @@ DATA_LIST = [ (FLIGHTS_FILE_NAME, FLIGHTS_INDEX_NAME, 
FLIGHTS_MAPPING), (FLIGHTS_SMALL_FILE_NAME, FLIGHTS_SMALL_INDEX_NAME, FLIGHTS_MAPPING), (ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING), + (MOVIES_FILE_NAME, MOVIES_INDEX_NAME, MOVIES_MAPPING), ] @@ -58,18 +62,20 @@ def _setup_data(es): es.indices.create(index=index_name, **mapping) df = pd.read_json(json_file_name, lines=True) - actions = [] n = 0 print("Adding", df.shape[0], "items to index:", index_name) for index, row in df.iterrows(): - values = row.to_dict() + values = row.dropna().to_dict() # make timestamp datetime 2018-01-01T12:09:35 # values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S') - # Use integer as id field for repeatable results - action = {"_index": index_name, "_source": values, "_id": str(n)} + # Use id field as document id from the row if the field exists. + # Else, use integer as id field for repeatable results + # document_id = values['id'] if 'id' in values else str(n) + document_id = values["id"] if "id" in values else str(n) + action = {"_index": index_name, "_source": values, "_id": document_id} actions.append(action)