mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
491 lines
14 KiB
Python
491 lines
14 KiB
Python
import os
|
|
|
|
import pandas as pd
|
|
|
|
# Absolute path of the directory that contains this module; the data file
# paths defined in this module are built relative to it.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

# Define test files and indices
# `or` (rather than a .get() default) means an unset *or empty*
# ELASTICSEARCH_HOST environment variable falls back to 'localhost'.
ELASTICSEARCH_HOST = os.environ.get('ELASTICSEARCH_HOST') or 'localhost'
|
|
|
|
FLIGHTS_INDEX_NAME = 'flights'
|
|
FLIGHTS_MAPPING = {"mappings": {
|
|
"properties": {
|
|
"AvgTicketPrice": {
|
|
"type": "float"
|
|
},
|
|
"Cancelled": {
|
|
"type": "boolean"
|
|
},
|
|
"Carrier": {
|
|
"type": "keyword"
|
|
},
|
|
"Dest": {
|
|
"type": "keyword"
|
|
},
|
|
"DestAirportID": {
|
|
"type": "keyword"
|
|
},
|
|
"DestCityName": {
|
|
"type": "keyword"
|
|
},
|
|
"DestCountry": {
|
|
"type": "keyword"
|
|
},
|
|
"DestLocation": {
|
|
"type": "geo_point"
|
|
},
|
|
"DestRegion": {
|
|
"type": "keyword"
|
|
},
|
|
"DestWeather": {
|
|
"type": "keyword"
|
|
},
|
|
"DistanceKilometers": {
|
|
"type": "float"
|
|
},
|
|
"DistanceMiles": {
|
|
"type": "float"
|
|
},
|
|
"FlightDelay": {
|
|
"type": "boolean"
|
|
},
|
|
"FlightDelayMin": {
|
|
"type": "integer"
|
|
},
|
|
"FlightDelayType": {
|
|
"type": "keyword"
|
|
},
|
|
"FlightNum": {
|
|
"type": "keyword"
|
|
},
|
|
"FlightTimeHour": {
|
|
"type": "float"
|
|
},
|
|
"FlightTimeMin": {
|
|
"type": "float"
|
|
},
|
|
"Origin": {
|
|
"type": "keyword"
|
|
},
|
|
"OriginAirportID": {
|
|
"type": "keyword"
|
|
},
|
|
"OriginCityName": {
|
|
"type": "keyword"
|
|
},
|
|
"OriginCountry": {
|
|
"type": "keyword"
|
|
},
|
|
"OriginLocation": {
|
|
"type": "geo_point"
|
|
},
|
|
"OriginRegion": {
|
|
"type": "keyword"
|
|
},
|
|
"OriginWeather": {
|
|
"type": "keyword"
|
|
},
|
|
"dayOfWeek": {
|
|
"type": "integer"
|
|
},
|
|
"timestamp": {
|
|
"type": "date"
|
|
}
|
|
}
|
|
}}
|
|
# Paths of the flights data files, located in ROOT_DIR.
FLIGHTS_FILE_NAME = os.path.join(ROOT_DIR, 'flights.json.gz')
FLIGHTS_DF_FILE_NAME = os.path.join(ROOT_DIR, 'flights_df.json.gz')

# The "small" flights index reuses the full flights mapping.
FLIGHTS_SMALL_INDEX_NAME = 'flights_small'
# NOTE: this is an alias of FLIGHTS_MAPPING (same dict object, not a copy),
# so mutating one mutates the other.
FLIGHTS_SMALL_MAPPING = FLIGHTS_MAPPING
FLIGHTS_SMALL_FILE_NAME = os.path.join(ROOT_DIR, 'flights_small.json.gz')
|
|
|
|
ECOMMERCE_INDEX_NAME = 'ecommerce'
|
|
ECOMMERCE_MAPPING = {"mappings": {
|
|
"properties": {
|
|
"category": {
|
|
"type": "text",
|
|
"fields": {
|
|
"keyword": {
|
|
"type": "keyword"
|
|
}
|
|
}
|
|
},
|
|
"currency": {
|
|
"type": "keyword"
|
|
},
|
|
"customer_birth_date": {
|
|
"type": "date"
|
|
},
|
|
"customer_first_name": {
|
|
"type": "text",
|
|
"fields": {
|
|
"keyword": {
|
|
"type": "keyword",
|
|
"ignore_above": 256
|
|
}
|
|
}
|
|
},
|
|
"customer_full_name": {
|
|
"type": "text",
|
|
"fields": {
|
|
"keyword": {
|
|
"type": "keyword",
|
|
"ignore_above": 256
|
|
}
|
|
}
|
|
},
|
|
"customer_gender": {
|
|
"type": "keyword"
|
|
},
|
|
"customer_id": {
|
|
"type": "keyword"
|
|
},
|
|
"customer_last_name": {
|
|
"type": "text",
|
|
"fields": {
|
|
"keyword": {
|
|
"type": "keyword",
|
|
"ignore_above": 256
|
|
}
|
|
}
|
|
},
|
|
"customer_phone": {
|
|
"type": "keyword"
|
|
},
|
|
"day_of_week": {
|
|
"type": "keyword"
|
|
},
|
|
"day_of_week_i": {
|
|
"type": "integer"
|
|
},
|
|
"email": {
|
|
"type": "keyword"
|
|
},
|
|
"geoip": {
|
|
"properties": {
|
|
"city_name": {
|
|
"type": "keyword"
|
|
},
|
|
"continent_name": {
|
|
"type": "keyword"
|
|
},
|
|
"country_iso_code": {
|
|
"type": "keyword"
|
|
},
|
|
"location": {
|
|
"type": "geo_point"
|
|
},
|
|
"region_name": {
|
|
"type": "keyword"
|
|
}
|
|
}
|
|
},
|
|
"manufacturer": {
|
|
"type": "text",
|
|
"fields": {
|
|
"keyword": {
|
|
"type": "keyword"
|
|
}
|
|
}
|
|
},
|
|
"order_date": {
|
|
"type": "date"
|
|
},
|
|
"order_id": {
|
|
"type": "keyword"
|
|
},
|
|
"products": {
|
|
"properties": {
|
|
"_id": {
|
|
"type": "text",
|
|
"fields": {
|
|
"keyword": {
|
|
"type": "keyword",
|
|
"ignore_above": 256
|
|
}
|
|
}
|
|
},
|
|
"base_price": {
|
|
"type": "half_float"
|
|
},
|
|
"base_unit_price": {
|
|
"type": "half_float"
|
|
},
|
|
"category": {
|
|
"type": "text",
|
|
"fields": {
|
|
"keyword": {
|
|
"type": "keyword"
|
|
}
|
|
}
|
|
},
|
|
"created_on": {
|
|
"type": "date"
|
|
},
|
|
"discount_amount": {
|
|
"type": "half_float"
|
|
},
|
|
"discount_percentage": {
|
|
"type": "half_float"
|
|
},
|
|
"manufacturer": {
|
|
"type": "text",
|
|
"fields": {
|
|
"keyword": {
|
|
"type": "keyword"
|
|
}
|
|
}
|
|
},
|
|
"min_price": {
|
|
"type": "half_float"
|
|
},
|
|
"price": {
|
|
"type": "half_float"
|
|
},
|
|
"product_id": {
|
|
"type": "long"
|
|
},
|
|
"product_name": {
|
|
"type": "text",
|
|
"fields": {
|
|
"keyword": {
|
|
"type": "keyword"
|
|
}
|
|
},
|
|
"analyzer": "english"
|
|
},
|
|
"quantity": {
|
|
"type": "integer"
|
|
},
|
|
"sku": {
|
|
"type": "keyword"
|
|
},
|
|
"tax_amount": {
|
|
"type": "half_float"
|
|
},
|
|
"taxful_price": {
|
|
"type": "half_float"
|
|
},
|
|
"taxless_price": {
|
|
"type": "half_float"
|
|
},
|
|
"unit_discount_amount": {
|
|
"type": "half_float"
|
|
}
|
|
}
|
|
},
|
|
"sku": {
|
|
"type": "keyword"
|
|
},
|
|
"taxful_total_price": {
|
|
"type": "float"
|
|
},
|
|
"taxless_total_price": {
|
|
"type": "float"
|
|
},
|
|
"total_quantity": {
|
|
"type": "integer"
|
|
},
|
|
"total_unique_products": {
|
|
"type": "integer"
|
|
},
|
|
"type": {
|
|
"type": "keyword"
|
|
},
|
|
"user": {
|
|
"type": "keyword"
|
|
}
|
|
}
|
|
}}
|
|
# Paths of the ecommerce data files, located in ROOT_DIR.
ECOMMERCE_FILE_NAME = os.path.join(ROOT_DIR, 'ecommerce.json.gz')
ECOMMERCE_DF_FILE_NAME = os.path.join(ROOT_DIR, 'ecommerce_df.json.gz')
|
|
|
|
# Hand-written mapping mixing several mapping shapes: text fields with
# multi-fields ('city', 'text'), an object field whose leaves have keyword
# multi-fields ('origin_location'), a deeply nested object ('maps-telemetry'),
# plain leaf fields, and a join field ('my_join_field').
TEST_MAPPING1 = {
    'mappings': {
        'properties': {
            'city': {
                'type': 'text',
                'fields': {
                    'raw': {'type': 'keyword'},
                },
            },
            'text': {
                'type': 'text',
                'fields': {
                    'english': {'type': 'text', 'analyzer': 'english'},
                },
            },
            'origin_location': {
                'properties': {
                    'lat': {
                        'type': 'text',
                        'index_prefixes': {},
                        'fields': {
                            'keyword': {'type': 'keyword', 'ignore_above': 256},
                        },
                    },
                    'lon': {
                        'type': 'text',
                        'fields': {
                            'keyword': {'type': 'keyword', 'ignore_above': 256},
                        },
                    },
                },
            },
            'maps-telemetry': {
                'properties': {
                    'attributesPerMap': {
                        'properties': {
                            'dataSourcesCount': {
                                'properties': {
                                    'avg': {'type': 'long'},
                                    'max': {'type': 'long'},
                                    'min': {'type': 'long'},
                                },
                            },
                            'emsVectorLayersCount': {
                                'dynamic': 'true',
                                'properties': {
                                    'france_departments': {
                                        'properties': {
                                            'avg': {'type': 'float'},
                                            'max': {'type': 'long'},
                                            'min': {'type': 'long'},
                                        },
                                    },
                                },
                            },
                        },
                    },
                },
            },
            'type': {'type': 'keyword'},
            'name': {'type': 'text'},
            'user_name': {'type': 'keyword'},
            'email': {'type': 'keyword'},
            'content': {'type': 'text'},
            'tweeted_at': {'type': 'date'},
            'dest_location': {'type': 'geo_point'},
            'my_join_field': {
                'type': 'join',
                'relations': {
                    'question': ['answer', 'comment'],
                    'answer': 'vote',
                },
            },
        },
    },
}

# Name of the index associated with TEST_MAPPING1.
TEST_MAPPING1_INDEX_NAME = 'mapping1'
|
|
|
|
# Flattened form of TEST_MAPPING1: dotted field path -> Elasticsearch type.
# Object containers (e.g. 'origin_location', 'maps-telemetry') have no entry
# of their own; only their leaves and the multi-field sub-fields do.
TEST_MAPPING1_EXPECTED = {
    'city': 'text',
    'city.raw': 'keyword',
    'content': 'text',
    'dest_location': 'geo_point',
    'email': 'keyword',
    'maps-telemetry.attributesPerMap.dataSourcesCount.avg': 'long',
    'maps-telemetry.attributesPerMap.dataSourcesCount.max': 'long',
    'maps-telemetry.attributesPerMap.dataSourcesCount.min': 'long',
    'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.avg': 'float',
    'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.max': 'long',
    'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.min': 'long',
    'my_join_field': 'join',
    'name': 'text',
    'origin_location.lat': 'text',
    'origin_location.lat.keyword': 'keyword',
    'origin_location.lon': 'text',
    'origin_location.lon.keyword': 'keyword',
    'text': 'text',
    'text.english': 'text',
    'tweeted_at': 'date',
    'type': 'keyword',
    'user_name': 'keyword',
}

# One row per field path (the index), with a single 'es_dtype' column.
TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(
    data=TEST_MAPPING1_EXPECTED,
    orient='index',
    columns=['es_dtype'],
)

# Same frame without the four entries that TEST_MAPPING1 declares under a
# 'fields' key (multi-field sub-fields).
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_DF = TEST_MAPPING1_EXPECTED_DF.drop(
    index=[
        'city.raw',
        'origin_location.lat.keyword',
        'origin_location.lon.keyword',
        'text.english',
    ]
)
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT = len(TEST_MAPPING1_EXPECTED_SOURCE_FIELD_DF.index)
|
|
|
|
# Index whose documents hold a list of user objects per group.
TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group'

# 'user' is mapped as an object with keyword leaves; 'group' is a keyword.
TEST_NESTED_USER_GROUP_MAPPING = {
    'mappings': {
        'properties': {
            'group': {'type': 'keyword'},
            'user': {
                'properties': {
                    'first': {'type': 'keyword'},
                    'last': {'type': 'keyword'},
                    'address': {'type': 'keyword'},
                },
            },
        },
    },
}

# Documents for the index above, each shaped as {'_index': ..., '_source': ...}.
# They deliberately vary: 'address' is a list where present and is missing
# for some users, and the last group has a single user.
TEST_NESTED_USER_GROUP_DOCS = [
    {
        '_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
        '_source': {
            'group': 'amsterdam',
            'user': [
                {'first': 'Manke', 'last': 'Nelis', 'address': ['Elandsgracht', 'Amsterdam']},
                {'first': 'Johnny', 'last': 'Jordaan', 'address': ['Elandsstraat', 'Amsterdam']},
            ],
        },
    },
    {
        '_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
        '_source': {
            'group': 'london',
            'user': [
                {'first': 'Alice', 'last': 'Monkton'},
                {'first': 'Jimmy', 'last': 'White', 'address': ['London']},
            ],
        },
    },
    {
        '_index': TEST_NESTED_USER_GROUP_INDEX_NAME,
        '_source': {
            'group': 'new york',
            'user': [
                {'first': 'Bill', 'last': 'Jones'},
            ],
        },
    },
]
|