diff --git a/codalab/worker/docker_utils.py b/codalab/worker/docker_utils.py
index e21dfb5eb..61293e063 100644
--- a/codalab/worker/docker_utils.py
+++ b/codalab/worker/docker_utils.py
@@ -152,7 +152,11 @@ def get_nvidia_devices(self, use_docker=True):
         docker.errors.ImageNotFound if the CUDA image cannot be pulled
         docker.errors.APIError if another server error occurs
         """
-        cuda_image = 'sulfurheron/nvidia-cuda:9.0-cudnn7-devel-ubuntu16.04-2018-06-08'
+
+        # Note: Do NOT update the NVIDIA image to use a CUDA version higher than
+        # that supported by the NLP machines. Otherwise, Slurm Batch Worker
+        # Manager will no longer function.
+        cuda_image = 'nvidia/cuda:11.5.2-base-ubuntu20.04'
         nvidia_command = 'nvidia-smi --query-gpu=index,uuid --format=csv,noheader'
         if use_docker:
             self.client.images.pull(cuda_image)
diff --git a/codalab/worker/main.py b/codalab/worker/main.py
index ef98f0c9e..35b1f76f8 100644
--- a/codalab/worker/main.py
+++ b/codalab/worker/main.py
@@ -453,13 +453,17 @@ def parse_gpuset_args(arg):
     try:
         all_gpus = DockerRuntime().get_nvidia_devices()  # Dict[GPU index: GPU UUID]
-    except DockerException:
+    except DockerException as e:
+        logger.error(e)
+        logger.error("Setting all_gpus to be empty...")
         all_gpus = {}  # Docker socket can't be used
     except requests.exceptions.ConnectionError:
         try:
             all_gpus = DockerRuntime().get_nvidia_devices(use_docker=False)
-        except SingularityError:
+        except SingularityError as e:
+            logger.error(e)
+            logger.error("Setting all_gpus to be empty...")
             all_gpus = {}
     if arg == 'ALL':