From 5ffe8cb8aec9440449d89bbfa3197512191dad46 Mon Sep 17 00:00:00 2001
From: "Eugene Cheah (picocreator)"
Date: Wed, 7 Feb 2024 23:45:38 +0000
Subject: [PATCH] Added nvidia-smi safety within the container, stops and restart on missing nvidia-smi

---
Review notes (this section is not part of the commit message):
 - nvidia-smi is now tested directly in the `if`, instead of reading `$?`
   on a later line, and its stderr is captured together with stdout so
   the failure reason is printed between the "# ---" markers.
 - The runtime watchdog exits with status 1 when nvidia-smi fails; it
   previously used `break`, which let the script finish with status 0.
 - Indentation of the context lines was reconstructed as 4 spaces; apply
   with `--ignore-whitespace` if the target file differs.

 docker/github-worker-cuda-12-1/entrypoint.sh | 41 +++++++++++++++++++-
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/docker/github-worker-cuda-12-1/entrypoint.sh b/docker/github-worker-cuda-12-1/entrypoint.sh
index b0cf6cc8..bd3db193 100644
--- a/docker/github-worker-cuda-12-1/entrypoint.sh
+++ b/docker/github-worker-cuda-12-1/entrypoint.sh
@@ -6,6 +6,21 @@ cd /actions-runner
 # CUDA version for label
 CUDA_VER="cuda-12-1"
 
+# Check if nvidia-smi is available (stderr is captured as well, so the
+# failure reason ends up in the log block below)
+if ! SMI_CALL=$(nvidia-smi 2>&1); then
+    echo "# [ERROR] nvidia-smi is not available, shutting down (after long delay)"
+    echo "# ---"
+    echo "$SMI_CALL"
+    echo "# ---"
+
+    # Sleep for 1 hour, a failure on start is a sign of a bigger problem
+    echo "# Performing a long sleep, to stagger container flow ..."
+    sleep 3600
+    echo "# Exiting now"
+    exit 1
+fi
+
 # Check the URL, token, and name of the runner from the container ENV vars
 # and if they are not set, provide default values
 if [[ -z "${RUNNER_NAME}" ]]; then
@@ -65,9 +80,31 @@ fi
 # Follow up on any forwarded command args
 if [[ $# -gt 0 ]]; then
     cd /root
-    exec "$@"
+    exec "$@" &
 fi
 
 # Wait for everything to exit
 # wait $RUNNER_PID
-wait
\ No newline at end of file
+while true; do
+    # Check if any background process is still running
+    # Break if there is no background process
+    if [ -z "$(jobs -p)" ]; then
+        echo "# [INFO] All runners have exited, shutting down"
+        break
+    fi
+
+    # Call nvidia-smi (capturing stderr too) and check if any error occurred.
+    # The test is done directly on the assignment, so no command can slip
+    # in between and clobber $?. On failure exit non-zero, so the container
+    # is reported as failed and an "on-failure" restart policy can react.
+    if ! SMI_CALL=$(nvidia-smi 2>&1); then
+        echo "# [ERROR] nvidia-smi failed, shutting down now!"
+        echo "# ---"
+        echo "$SMI_CALL"
+        echo "# ---"
+        exit 1
+    fi
+
+    # Performs a small sleep wait
+    sleep 10
+done
\ No newline at end of file