diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 07114c39f..1697e6d30 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -168,6 +168,8 @@ const ( MPSRootEnvName = "MPS_ROOT" // DefaultMPSRoot is the default MPS root path on the host DefaultMPSRoot = "/run/nvidia/mps" + // NvidiaRuntimeSetAsDefaultEnvName is the name of the toolkit container env for configuring NVIDIA Container Runtime as the default runtime + NvidiaRuntimeSetAsDefaultEnvName = "NVIDIA_RUNTIME_SET_AS_DEFAULT" ) // ContainerProbe defines container probe types @@ -824,7 +826,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol } // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass) // update env required for MIG support applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy) @@ -1125,6 +1127,10 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n if config.CDI.IsDefault() { setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeModeEnvName, "cdi") } + // do not set 'nvidia' as the default runtime on OpenShift + if n.openshift != "" { + setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaRuntimeSetAsDefaultEnvName, "false") + } } // set install directory for the toolkit @@ -1270,7 +1276,7 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe } // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass) // update env required for MIG support applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy) @@ -1279,7 +1285,12 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe if config.CDI.IsEnabled() { setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true") setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, "envvar,cdi-annotations") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, "nvidia.cdi.k8s.io/") + // for OpenShift, use native-CDI support in CRI-O as the 'nvidia' runtime will not be configured as the default runtime + cdiAnnotationPrefix := "nvidia.cdi.k8s.io/" + if n.openshift != "" { + cdiAnnotationPrefix = "cdi.k8s.io/" + } + setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, cdiAnnotationPrefix) if config.Toolkit.IsEnabled() { setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCTKPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-ctk")) } @@ -1472,7 +1483,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe } // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass) // mount configmap for custom metrics if provided by user if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" { @@ -1589,7 +1600,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu } // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass) return nil } @@ -1639,7 +1650,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, } // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass) // set ConfigMap name for "mig-parted-config" Volume for i, vol := range obj.Spec.Template.Spec.Volumes { @@ -1948,7 +1959,7 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, } // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass) var validatorErr error // apply changes for individual component validators(initContainers) @@ -2280,13 +2291,18 @@ func getRuntimeClass(config *gpuv1.ClusterPolicySpec) string { return DefaultRuntimeClass } -func setRuntimeClass(podSpec *corev1.PodSpec, runtime gpuv1.Runtime, runtimeClass string) { - if runtime == gpuv1.Containerd { - if runtimeClass == "" { - runtimeClass = DefaultRuntimeClass - } - podSpec.RuntimeClassName = &runtimeClass +// setRuntimeClass sets the runtimeClass for a pod, unless CRI-O is the container runtime +// being used and CDI is not enabled. In this case, an OCI hook is used and the nvidia +// runtime is not configured. +func setRuntimeClass(podSpec *corev1.PodSpec, n ClusterPolicyController, runtimeClass string) { + if n.runtime == gpuv1.CRIO && !n.singleton.Spec.CDI.IsEnabled() { + return + } + + if runtimeClass == "" { + runtimeClass = DefaultRuntimeClass } + podSpec.RuntimeClassName = &runtimeClass } func setContainerProbe(container *corev1.Container, probe *gpuv1.ContainerProbeSpec, probeType ContainerProbe) {