From 11cf3a5cf796da19b9f33af63fcbf8992a4d5615 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Tue, 13 Feb 2024 14:58:22 -0800 Subject: [PATCH 1/2] Set runtimeClass for operands unless CRI-O is the runtime and CDI is disabled Signed-off-by: Christopher Desiniotis --- controllers/object_controls.go | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 07114c39f..fb87757a4 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -824,7 +824,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol } // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass) // update env required for MIG support applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy) @@ -1270,7 +1270,7 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe } // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass) // update env required for MIG support applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy) @@ -1472,7 +1472,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe } // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass) // mount configmap for custom metrics if provided by user if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" { @@ -1589,7 +1589,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu } // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass) return nil } @@ -1639,7 +1639,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, } // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass) // set ConfigMap name for "mig-parted-config" Volume for i, vol := range obj.Spec.Template.Spec.Volumes { @@ -1948,7 +1948,7 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, } // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass) var validatorErr error // apply changes for individual component validators(initContainers) @@ -2280,13 +2280,18 @@ func getRuntimeClass(config *gpuv1.ClusterPolicySpec) string { return DefaultRuntimeClass } -func setRuntimeClass(podSpec *corev1.PodSpec, runtime gpuv1.Runtime, runtimeClass string) { - if runtime == gpuv1.Containerd { - if runtimeClass == "" { - runtimeClass = DefaultRuntimeClass - } - podSpec.RuntimeClassName = &runtimeClass +// setRuntimeClass sets the runtimeClass for a pod, unless CRI-O is the container runtime +// being used and CDI is not enabled. In this case, an OCI hook is used and the nvidia +// runtime is not configured. +func setRuntimeClass(podSpec *corev1.PodSpec, n ClusterPolicyController, runtimeClass string) { + if n.runtime == gpuv1.CRIO && !n.singleton.Spec.CDI.IsEnabled() { + return + } + + if runtimeClass == "" { + runtimeClass = DefaultRuntimeClass } + podSpec.RuntimeClassName = &runtimeClass } func setContainerProbe(container *corev1.Container, probe *gpuv1.ContainerProbeSpec, probeType ContainerProbe) { From 3e0f91d9ff986258d5c75a3a29c233ecf7ddabdb Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Tue, 13 Feb 2024 15:21:03 -0800 Subject: [PATCH 2/2] Enable the use of CDI on OpenShift This commit configures the toolkit container and device-plugin appropriately so that CDI can be used to provide GPU access to both management and application containers on OpenShift. On OpenShift, we cannot set 'nvidia' as the default runtime. Because of this, we have decided to take a hybrid approach for enabling CDI. That is, we use the 'nvidia' runtime configured in CDI mode for providing GPU access to management containers, and native CDI support in CRI-O for providing GPU access to application containers. Signed-off-by: Christopher Desiniotis --- controllers/object_controls.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index fb87757a4..1697e6d30 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -168,6 +168,8 @@ const ( MPSRootEnvName = "MPS_ROOT" // DefaultMPSRoot is the default MPS root path on the host DefaultMPSRoot = "/run/nvidia/mps" + // NvidiaRuntimeSetAsDefaultEnvName is the name of the toolkit container env for configuring NVIDIA Container Runtime as the default runtime + NvidiaRuntimeSetAsDefaultEnvName = "NVIDIA_RUNTIME_SET_AS_DEFAULT" ) // ContainerProbe defines container probe types @@ -1125,6 +1127,10 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n if config.CDI.IsDefault() { setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeModeEnvName, "cdi") } + // do not set 'nvidia' as the default runtime on OpenShift + if n.openshift != "" { + setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaRuntimeSetAsDefaultEnvName, "false") + } } // set install directory for the toolkit @@ -1279,7 +1285,12 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe if config.CDI.IsEnabled() { setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true") setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, "envvar,cdi-annotations") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, "nvidia.cdi.k8s.io/") + // for OpenShift, use native-CDI support in CRI-O as the 'nvidia' runtime will not be configured as the default runtime + cdiAnnotationPrefix := "nvidia.cdi.k8s.io/" + if n.openshift != "" { + cdiAnnotationPrefix = "cdi.k8s.io/" + } + setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, cdiAnnotationPrefix) if config.Toolkit.IsEnabled() { setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCTKPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-ctk")) }