Mirror of https://github.com/elastic/eland.git (synced 2025-07-11 00:02:14 +08:00)
This PR adds the ability to estimate the per-deployment and per-allocation memory usage of NLP transformer models. It uses torch.profiler and logs the peak memory usage observed during inference. This information is then used in Elasticsearch to provision models with sufficient memory (elastic/elasticsearch#98874).
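As a rough, hypothetical sketch of that approach (not the exact eland implementation; the model handle, the example inputs, and the aggregation over profiler events are placeholders), peak CPU memory during a single inference pass can be captured with torch.profiler along these lines:

    # Hypothetical sketch: profile CPU memory for one forward pass.
    import torch
    from torch.profiler import ProfilerActivity, profile

    def estimate_inference_memory_bytes(model, example_inputs):
        with profile(activities=[ProfilerActivity.CPU], profile_memory=True) as prof:
            with torch.no_grad():
                model(*example_inputs)
        # Use the largest per-operator CPU allocation as a crude peak estimate;
        # the aggregation eland actually applies may differ.
        return max(event.cpu_memory_usage for event in prof.key_averages())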
#!/usr/bin/env bash
#
# Launch one or more Elasticsearch nodes via the Docker image,
# to form a cluster suitable for running the REST API tests.
#
# Export the ELASTICSEARCH_VERSION variable, e.g. 'elasticsearch:8.0.0-SNAPSHOT'.

# Version 1.0
# - Initial version of the run-elasticsearch.sh script
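#
# Example invocation (hypothetical values; adjust the version tag to the
# release you want to test against):
#
#   export ELASTICSEARCH_VERSION='elasticsearch:8.0.0-SNAPSHOT'
#   DETACH=true ./run-elasticsearch.sh
#
# Run the script again with CLEANUP=true to tear down the node, its data
# volume, and the docker network.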
if [[ -z "$ELASTICSEARCH_VERSION" ]]; then
  echo -e "\033[31;1mERROR:\033[0m Required environment variable [ELASTICSEARCH_VERSION] not set\033[0m"
  exit 1
fi

set -euxo pipefail

# realpath on macOS uses different flags than on Linux
if [[ "$OSTYPE" == "darwin"* ]]; then
  SCRIPT_PATH=$(dirname "$(realpath "$0")")
else
  SCRIPT_PATH=$(dirname "$(realpath -s "$0")")
fi
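# Turn the version string into a name fragment that is safe for container,
# volume, and network names (every non-alphanumeric character becomes '-').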
moniker=$(echo "$ELASTICSEARCH_VERSION" | tr -C "[:alnum:]" '-')
suffix=rest-test
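# All of the settings below can be overridden from the calling environment;
# the ${VAR-default} form only applies the default when VAR is unset.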
NODE_NAME=${NODE_NAME-${moniker}node1}
MASTER_NODE_NAME=${MASTER_NODE_NAME-${NODE_NAME}}
CLUSTER_NAME=${CLUSTER_NAME-${moniker}${suffix}}
HTTP_PORT=${HTTP_PORT-9200}

ELASTIC_PASSWORD=${ELASTIC_PASSWORD-changeme}

DETACH=${DETACH-false}
CLEANUP=${CLEANUP-false}

volume_name=${NODE_NAME}-${suffix}-data
network_default=${moniker}${suffix}
NETWORK_NAME=${NETWORK_NAME-"$network_default"}

set +x

# Set vm.max_map_count kernel setting to 262144 if we're in CI
if [[ "${BUILDKITE-}" == "true" ]]; then
  sudo sysctl -w vm.max_map_count=262144
fi
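# Helpers for tearing down the container, its data volume, and the docker
# network created by this script. Removal failures are ignored so cleanup
# stays best-effort.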
function cleanup_volume {
  if [[ "$(docker volume ls -q -f name=$1)" ]]; then
    echo -e "\033[34;1mINFO:\033[0m Removing volume $1\033[0m"
    (docker volume rm "$1") || true
  fi
}
function container_running {
  if [[ "$(docker ps -q -f name=$1)" ]]; then
    return 0;
  else return 1;
  fi
}
function cleanup_node {
  if container_running "$1"; then
    echo -e "\033[34;1mINFO:\033[0m Removing container $1\033[0m"
    (docker container rm --force --volumes "$1") || true
    cleanup_volume "$1-${suffix}-data"
  fi
}
function cleanup_network {
  if [[ "$(docker network ls -q -f name=$1)" ]]; then
    echo -e "\033[34;1mINFO:\033[0m Removing network $1\033[0m"
    (docker network rm "$1") || true
  fi
}

function cleanup {
  if [[ "$DETACH" != "true" ]] || [[ "$1" == "1" ]]; then
    echo -e "\033[34;1mINFO:\033[0m clean the node and volume on startup (1) OR on exit if not detached\033[0m"
    cleanup_node "$NODE_NAME"
  fi
  if [[ "$DETACH" != "true" ]]; then
    echo -e "\033[34;1mINFO:\033[0m clean the network if not detached (start and exit)\033[0m"
    cleanup_network "$NETWORK_NAME"
  fi
};
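# On exit, remove the node and network again unless we are running detached.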
trap "cleanup 0" EXIT
|
|
|
|
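# CLEANUP=true: remove every container attached to the network, then the
# network itself, and exit without starting anything.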
if [[ "$CLEANUP" == "true" ]]; then
|
|
trap - EXIT
|
|
if [[ -z "$(docker network ls -q -f name=${NETWORK_NAME})" ]]; then
|
|
echo -e "\033[34;1mINFO:\033[0m $NETWORK_NAME is already deleted\033[0m"
|
|
exit 0
|
|
fi
|
|
containers=$(docker network inspect -f '{{ range $key, $value := .Containers }}{{ printf "%s\n" .Name}}{{ end }}' ${NETWORK_NAME})
|
|
while read -r container; do
|
|
cleanup_node "$container"
|
|
done <<< "$containers"
|
|
cleanup_network "$NETWORK_NAME"
|
|
echo -e "\033[32;1mSUCCESS:\033[0m Cleaned up and exiting\033[0m"
|
|
exit 0
|
|
fi
|
|
|
|
echo -e "\033[34;1mINFO:\033[0m Making sure previous run leftover infrastructure is removed \033[0m"
|
|
cleanup 1
|
|
|
|
echo -e "\033[34;1mINFO:\033[0m Creating network $NETWORK_NAME if it does not exist already \033[0m"
|
|
docker network inspect "$NETWORK_NAME" > /dev/null 2>&1 || docker network create "$NETWORK_NAME"
|
|
|
|
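# The heredoc output is word-split into an array of --env flags that is
# passed straight to docker run below.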
environment=($(cat <<-END
  --env node.name=$NODE_NAME
  --env cluster.name=$CLUSTER_NAME
  --env cluster.initial_master_nodes=$MASTER_NODE_NAME
  --env discovery.seed_hosts=$MASTER_NODE_NAME
  --env cluster.routing.allocation.disk.threshold_enabled=false
  --env bootstrap.memory_lock=true
  --env node.attr.testattr=test
  --env path.repo=/tmp
  --env repositories.url.allowed_urls=http://snapshot.test*
  --env ELASTIC_PASSWORD=$ELASTIC_PASSWORD
  --env xpack.license.self_generated.type=trial
  --env xpack.security.enabled=false
  --env xpack.security.http.ssl.enabled=false
  --env xpack.security.transport.ssl.enabled=false
  --env xpack.ml.max_machine_memory_percent=90
END
))

volumes=($(cat <<-END
  --volume $volume_name:/usr/share/elasticsearch/data
END
))

url="http://elastic:$ELASTIC_PASSWORD@$NODE_NAME"

# Pull the container, retry on failures up to 5 times with
# short delays between each attempt. Fixes most transient network errors.
docker_pull_attempts=0
until [ "$docker_pull_attempts" -ge 5 ]
do
  docker pull docker.elastic.co/elasticsearch/$ELASTICSEARCH_VERSION && break
  docker_pull_attempts=$((docker_pull_attempts+1))
  sleep 10
done

echo -e "\033[34;1mINFO:\033[0m Starting container $NODE_NAME \033[0m"
set -x
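# Notes on the flags below: the unlimited memlock ulimit is required because
# bootstrap.memory_lock=true is set above, and Docker's built-in healthcheck
# polls the cluster health API to decide when the node is up.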
docker run \
  --name "$NODE_NAME" \
  --network "$NETWORK_NAME" \
  --env ES_JAVA_OPTS="-Xms2g -Xmx2g" \
  "${environment[@]}" \
  "${volumes[@]}" \
  --publish "$HTTP_PORT":9200 \
  --ulimit nofile=65536:65536 \
  --ulimit memlock=-1:-1 \
  --detach="$DETACH" \
  --health-cmd="curl --insecure --fail $url:9200/_cluster/health || exit 1" \
  --health-interval=2s \
  --health-retries=20 \
  --health-timeout=2s \
  --rm \
  docker.elastic.co/elasticsearch/"$ELASTICSEARCH_VERSION";
set +x
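# In detached mode, wait for Docker's healthcheck to leave the 'starting'
# state, dump the container logs, and then report success or failure based
# on whether the node reached 'healthy'.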
if [[ "$DETACH" == "true" ]]; then
|
|
until ! container_running "$NODE_NAME" || (container_running "$NODE_NAME" && [[ "$(docker inspect -f "{{.State.Health.Status}}" ${NODE_NAME})" != "starting" ]]); do
|
|
echo ""
|
|
docker inspect -f "{{range .State.Health.Log}}{{.Output}}{{end}}" ${NODE_NAME}
|
|
echo -e "\033[34;1mINFO:\033[0m waiting for node $NODE_NAME to be up\033[0m"
|
|
sleep 2;
|
|
done;
|
|
|
|
# Always show logs if the container is running, this is very useful both on CI as well as while developing
|
|
if container_running $NODE_NAME; then
|
|
docker logs $NODE_NAME
|
|
fi
|
|
|
|
if ! container_running $NODE_NAME || [[ "$(docker inspect -f "{{.State.Health.Status}}" ${NODE_NAME})" != "healthy" ]]; then
|
|
cleanup 1
|
|
echo
|
|
echo -e "\033[31;1mERROR:\033[0m Failed to start ${ELASTICSEARCH_VERSION} in detached mode beyond health checks\033[0m"
|
|
echo -e "\033[31;1mERROR:\033[0m dumped the docker log before shutting the node down\033[0m"
|
|
exit 1
|
|
else
|
|
echo
|
|
echo -e "\033[32;1mSUCCESS:\033[0m Detached and healthy: ${NODE_NAME} on docker network: ${NETWORK_NAME}\033[0m"
|
|
echo -e "\033[32;1mSUCCESS:\033[0m Running on: ${url/$NODE_NAME/localhost}:${HTTP_PORT}\033[0m"
|
|
exit 0
|
|
fi
|
|
fi
|