Adding a new movie dataset to the tests. (#646)

This commit is contained in:
Aurélien FOUCRET 2024-01-04 16:14:56 +01:00 committed by GitHub
parent 0f91224daf
commit 05c5859b8a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 63 additions and 4 deletions

View File

@ -163,6 +163,59 @@ ECOMMERCE_MAPPING = {
ECOMMERCE_FILE_NAME = ROOT_DIR + "/ecommerce.json.gz"
ECOMMERCE_DF_FILE_NAME = ROOT_DIR + "/ecommerce_df.json.gz"
MOVIES_INDEX_NAME = "movies"
MOVIES_FILE_NAME = ROOT_DIR + "/movies.json.gz"
# Index mapping for the movies test dataset.
#
# The properties are assembled from two groups, in this order:
#   1. fields mapped to a single type, and
#   2. "text" fields that additionally carry a "keyword" sub-field.
# Every field gets its own freshly built dict, so no nested object is shared.
MOVIES_MAPPING = {
    "mappings": {
        "properties": {
            # Fields mapped to exactly one type.
            **{
                field_name: {"type": field_type}
                for field_name, field_type in (
                    ("type", "keyword"),
                    ("title", "text"),
                    ("year", "integer"),
                    ("rated", "keyword"),
                    ("released", "date"),
                    ("plot", "text"),
                    ("awards", "text"),
                    ("poster", "keyword"),
                    ("id", "keyword"),
                    ("metascore", "float"),
                    ("imdbRating", "float"),
                    ("imdbVotes", "integer"),
                    ("language", "keyword"),
                    ("runtime", "integer"),
                )
            },
            # "text" fields that also declare a "keyword" sub-field.
            **{
                field_name: {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword"}},
                }
                for field_name in (
                    "genres",
                    "directors",
                    "writers",
                    "actors",
                    "country",
                )
            },
        }
    }
}
TEST_MAPPING1 = {
"mappings": {
"properties": {

BIN
tests/movies.json.gz Normal file

Binary file not shown.

View File

@ -30,6 +30,9 @@ from tests import (
FLIGHTS_MAPPING,
FLIGHTS_SMALL_FILE_NAME,
FLIGHTS_SMALL_INDEX_NAME,
MOVIES_FILE_NAME,
MOVIES_INDEX_NAME,
MOVIES_MAPPING,
TEST_MAPPING1,
TEST_MAPPING1_INDEX_NAME,
TEST_NESTED_USER_GROUP_DOCS,
@ -41,6 +44,7 @@ DATA_LIST = [
(FLIGHTS_FILE_NAME, FLIGHTS_INDEX_NAME, FLIGHTS_MAPPING),
(FLIGHTS_SMALL_FILE_NAME, FLIGHTS_SMALL_INDEX_NAME, FLIGHTS_MAPPING),
(ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING),
(MOVIES_FILE_NAME, MOVIES_INDEX_NAME, MOVIES_MAPPING),
]
@ -58,18 +62,20 @@ def _setup_data(es):
es.indices.create(index=index_name, **mapping)
df = pd.read_json(json_file_name, lines=True)
actions = []
n = 0
print("Adding", df.shape[0], "items to index:", index_name)
for index, row in df.iterrows():
values = row.to_dict()
values = row.dropna().to_dict()
# make timestamp datetime 2018-01-01T12:09:35
# values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S')
# Use integer as id field for repeatable results
action = {"_index": index_name, "_source": values, "_id": str(n)}
# Use id field as document id from the row if the field exists.
# Else, use integer as id field for repeatable results
# document_id = values['id'] if 'id' in values else str(n)
document_id = values["id"] if "id" in values else str(n)
action = {"_index": index_name, "_source": values, "_id": document_id}
actions.append(action)