eland/eland/tests/__init__.py
2019-12-02 17:18:40 +01:00

491 lines
14 KiB
Python

import os
import pandas as pd
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
# Define test files and indices
ELASTICSEARCH_HOST = os.environ.get('ELASTICSEARCH_HOST') or 'localhost'
FLIGHTS_INDEX_NAME = 'flights'
FLIGHTS_MAPPING = {"mappings": {
"properties": {
"AvgTicketPrice": {
"type": "float"
},
"Cancelled": {
"type": "boolean"
},
"Carrier": {
"type": "keyword"
},
"Dest": {
"type": "keyword"
},
"DestAirportID": {
"type": "keyword"
},
"DestCityName": {
"type": "keyword"
},
"DestCountry": {
"type": "keyword"
},
"DestLocation": {
"type": "geo_point"
},
"DestRegion": {
"type": "keyword"
},
"DestWeather": {
"type": "keyword"
},
"DistanceKilometers": {
"type": "float"
},
"DistanceMiles": {
"type": "float"
},
"FlightDelay": {
"type": "boolean"
},
"FlightDelayMin": {
"type": "integer"
},
"FlightDelayType": {
"type": "keyword"
},
"FlightNum": {
"type": "keyword"
},
"FlightTimeHour": {
"type": "float"
},
"FlightTimeMin": {
"type": "float"
},
"Origin": {
"type": "keyword"
},
"OriginAirportID": {
"type": "keyword"
},
"OriginCityName": {
"type": "keyword"
},
"OriginCountry": {
"type": "keyword"
},
"OriginLocation": {
"type": "geo_point"
},
"OriginRegion": {
"type": "keyword"
},
"OriginWeather": {
"type": "keyword"
},
"dayOfWeek": {
"type": "integer"
},
"timestamp": {
"type": "date"
}
}
}}
FLIGHTS_FILE_NAME = ROOT_DIR + '/flights.json.gz'
FLIGHTS_DF_FILE_NAME = ROOT_DIR + '/flights_df.json.gz'
FLIGHTS_SMALL_INDEX_NAME = 'flights_small'
FLIGHTS_SMALL_MAPPING = FLIGHTS_MAPPING
FLIGHTS_SMALL_FILE_NAME = ROOT_DIR + '/flights_small.json.gz'
ECOMMERCE_INDEX_NAME = 'ecommerce'
ECOMMERCE_MAPPING = {"mappings": {
"properties": {
"category": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"currency": {
"type": "keyword"
},
"customer_birth_date": {
"type": "date"
},
"customer_first_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"customer_full_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"customer_gender": {
"type": "keyword"
},
"customer_id": {
"type": "keyword"
},
"customer_last_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"customer_phone": {
"type": "keyword"
},
"day_of_week": {
"type": "keyword"
},
"day_of_week_i": {
"type": "integer"
},
"email": {
"type": "keyword"
},
"geoip": {
"properties": {
"city_name": {
"type": "keyword"
},
"continent_name": {
"type": "keyword"
},
"country_iso_code": {
"type": "keyword"
},
"location": {
"type": "geo_point"
},
"region_name": {
"type": "keyword"
}
}
},
"manufacturer": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"order_date": {
"type": "date"
},
"order_id": {
"type": "keyword"
},
"products": {
"properties": {
"_id": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"base_price": {
"type": "half_float"
},
"base_unit_price": {
"type": "half_float"
},
"category": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"created_on": {
"type": "date"
},
"discount_amount": {
"type": "half_float"
},
"discount_percentage": {
"type": "half_float"
},
"manufacturer": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"min_price": {
"type": "half_float"
},
"price": {
"type": "half_float"
},
"product_id": {
"type": "long"
},
"product_name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
},
"analyzer": "english"
},
"quantity": {
"type": "integer"
},
"sku": {
"type": "keyword"
},
"tax_amount": {
"type": "half_float"
},
"taxful_price": {
"type": "half_float"
},
"taxless_price": {
"type": "half_float"
},
"unit_discount_amount": {
"type": "half_float"
}
}
},
"sku": {
"type": "keyword"
},
"taxful_total_price": {
"type": "float"
},
"taxless_total_price": {
"type": "float"
},
"total_quantity": {
"type": "integer"
},
"total_unique_products": {
"type": "integer"
},
"type": {
"type": "keyword"
},
"user": {
"type": "keyword"
}
}
}}
ECOMMERCE_FILE_NAME = ROOT_DIR + '/ecommerce.json.gz'
ECOMMERCE_DF_FILE_NAME = ROOT_DIR + '/ecommerce_df.json.gz'
TEST_MAPPING1 = {
'mappings': {
'properties': {
'city': {
'type': 'text',
'fields': {
'raw': {
'type': 'keyword'
}
}
},
'text': {
'type': 'text',
'fields': {
'english': {
'type': 'text',
'analyzer': 'english'
}
}
},
'origin_location': {
'properties': {
'lat': {
'type': 'text',
'index_prefixes': {},
'fields': {
'keyword': {
'type': 'keyword',
'ignore_above': 256
}
}
},
'lon': {
'type': 'text',
'fields': {
'keyword': {
'type': 'keyword',
'ignore_above': 256
}
}
}
}
},
'maps-telemetry': {
'properties': {
'attributesPerMap': {
'properties': {
'dataSourcesCount': {
'properties': {
'avg': {
'type': 'long'
},
'max': {
'type': 'long'
},
'min': {
'type': 'long'
}
}
},
'emsVectorLayersCount': {
'dynamic': 'true',
'properties': {
'france_departments': {
'properties': {
'avg': {
'type': 'float'
},
'max': {
'type': 'long'
},
'min': {
'type': 'long'
}
}
}
}
}
}
}
}
},
'type': {
'type': 'keyword'
},
'name': {
'type': 'text'
},
'user_name': {
'type': 'keyword'
},
'email': {
'type': 'keyword'
},
'content': {
'type': 'text'
},
'tweeted_at': {
'type': 'date'
},
'dest_location': {
'type': 'geo_point'
},
'my_join_field': {
'type': 'join',
'relations': {
'question': ['answer', 'comment'],
'answer': 'vote'
}
}
}
}
}
TEST_MAPPING1_INDEX_NAME = 'mapping1'
TEST_MAPPING1_EXPECTED = {
'city': 'text',
'city.raw': 'keyword',
'content': 'text',
'dest_location': 'geo_point',
'email': 'keyword',
'maps-telemetry.attributesPerMap.dataSourcesCount.avg': 'long',
'maps-telemetry.attributesPerMap.dataSourcesCount.max': 'long',
'maps-telemetry.attributesPerMap.dataSourcesCount.min': 'long',
'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.avg': 'float',
'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.max': 'long',
'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.min': 'long',
'my_join_field': 'join',
'name': 'text',
'origin_location.lat': 'text',
'origin_location.lat.keyword': 'keyword',
'origin_location.lon': 'text',
'origin_location.lon.keyword': 'keyword',
'text': 'text',
'text.english': 'text',
'tweeted_at': 'date',
'type': 'keyword',
'user_name': 'keyword'
}
TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['es_dtype'])
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_DF = TEST_MAPPING1_EXPECTED_DF.drop(index=['city.raw',
'origin_location.lat.keyword',
'origin_location.lon.keyword',
'text.english'])
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT = len(TEST_MAPPING1_EXPECTED_SOURCE_FIELD_DF.index)
TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group'
TEST_NESTED_USER_GROUP_MAPPING = {
'mappings': {
'properties': {
'group': {
'type': 'keyword'
},
'user': {
'properties': {
'first': {
'type': 'keyword'
},
'last': {
'type': 'keyword'
},
'address': {
'type': 'keyword'
}
}
}
}
}
}
TEST_NESTED_USER_GROUP_DOCS = [
{'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
'_source':
{'group': 'amsterdam', 'user': [
{'first': 'Manke', 'last': 'Nelis', 'address': ['Elandsgracht', 'Amsterdam']},
{'first': 'Johnny', 'last': 'Jordaan', 'address': ['Elandsstraat', 'Amsterdam']}]}},
{'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
'_source':
{'group': 'london', 'user': [
{'first': 'Alice', 'last': 'Monkton'},
{'first': 'Jimmy', 'last': 'White', 'address': ['London']}]}},
{'_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
'_source': {'group': 'new york', 'user': [
{'first': 'Bill', 'last': 'Jones'}]}}
]