From fe6aeb95caa3d7c5972ada8e67a679fb7c6f4d73 Mon Sep 17 00:00:00 2001
From: Michael Thomson
Date: Fri, 6 Jun 2025 15:16:32 -0400
Subject: [PATCH] nvidia update values

---
 .../helmrelease-nvidia-device-plugin.yaml | 154 ++++++++++++++++++
 1 file changed, 154 insertions(+)

diff --git a/nvidia-device-plugin/helmrelease-nvidia-device-plugin.yaml b/nvidia-device-plugin/helmrelease-nvidia-device-plugin.yaml
index cc865ae..addcd82 100644
--- a/nvidia-device-plugin/helmrelease-nvidia-device-plugin.yaml
+++ b/nvidia-device-plugin/helmrelease-nvidia-device-plugin.yaml
@@ -16,7 +16,161 @@ spec:
   timeout: 5m
   releaseName: nvidia-device-plugin
   values:
+    # Plugin configuration
+    # Only one of "name" or "map" should ever be set for a given deployment.
+    # Use "name" to point to an external ConfigMap with a list of configurations.
+    # Use "map" to build an integrated ConfigMap from a set of configurations as
+    # part of this helm chart. An example of setting "map" might be:
+    # config:
+    #   map:
+    #     default: |-
+    #       version: v1
+    #       flags:
+    #         migStrategy: none
+    #     mig-single: |-
+    #       version: v1
+    #       flags:
+    #         migStrategy: single
+    #     mig-mixed: |-
+    #       version: v1
+    #       flags:
+    #         migStrategy: mixed
+    config:
+      # ConfigMap name if pulling from an external ConfigMap
+      name: ""
+      # Set of named configs to build an integrated ConfigMap from
+      map: {}
+      # Default config name within the ConfigMap
+      default: ""
+      # List of fallback strategies to attempt if no config is selected and no default is provided
+      fallbackStrategies: ["named", "single"]
+
+    compatWithCPUManager: null
+    migStrategy: null
+    failOnInitError: null
+    deviceListStrategy: null
+    deviceIDStrategy: null
+    nvidiaDriverRoot: null
+    gdsEnabled: null
+    mofedEnabled: null
+    deviceDiscoveryStrategy: null
+
+    nameOverride: ""
+    fullnameOverride: ""
+    namespaceOverride: ""
+    selectorLabelsOverride: {}
+
+    allowDefaultNamespace: false
+
+    imagePullSecrets: []
+    image:
+      repository: nvcr.io/nvidia/k8s-device-plugin
+      pullPolicy: IfNotPresent
+      # Overrides the image tag whose default is the chart appVersion.
+      tag: ""
+
+    updateStrategy:
+      type: RollingUpdate
+
+    podAnnotations: {}
+    podSecurityContext: {}
+    securityContext: {}
+
+    resources: {}
+    nodeSelector: {}
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                # On discrete-GPU based systems NFD adds the following label where 10de is the NVIDIA PCI vendor ID
+                - key: feature.node.kubernetes.io/pci-10de.present
+                  operator: In
+                  values:
+                    - "true"
+            - matchExpressions:
+                # On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA
+                - key: feature.node.kubernetes.io/cpu-model.vendor_id
+                  operator: In
+                  values:
+                    - "NVIDIA"
+            - matchExpressions:
+                # We allow a GPU deployment to be forced by setting the following label to "true"
+                - key: "nvidia.com/gpu.present"
+                  operator: In
+                  values:
+                    - "true"
+    tolerations:
+      # This toleration is deprecated. Kept here for backward compatibility
+      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+      - key: CriticalAddonsOnly
+        operator: Exists
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+
+    # Mark this pod as a critical add-on; when enabled, the critical add-on
+    # scheduler reserves resources for critical add-on pods so that they can
+    # be rescheduled after a failure.
+    # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+    priorityClassName: "system-node-critical"
+    runtimeClassName: nvidia
+    devicePlugin:
+      enabled: true
+    gfd:
+      enabled: true
+      nameOverride: gpu-feature-discovery
+      namespaceOverride: ""
+      noTimestamp: null
+      sleepInterval: null
+      securityContext:
+        # privileged access is required for the gpu-feature-discovery to access the
+        # vgpu info on a host.
+        # TODO: This should be optional and detected automatically.
+        privileged: true
+
+    # Helm dependency
+    nfd:
+      nameOverride: node-feature-discovery
+      enableNodeFeatureApi: false
+      master:
+        serviceAccount:
+          name: node-feature-discovery
+          create: true
+        config:
+          extraLabelNs: ["nvidia.com"]
+
+      worker:
+        tolerations:
+          - key: "node-role.kubernetes.io/master"
+            operator: "Equal"
+            value: ""
+            effect: "NoSchedule"
+          - key: "nvidia.com/gpu"
+            operator: "Equal"
+            value: "present"
+            effect: "NoSchedule"
+        config:
+          sources:
+            pci:
+              deviceClassWhitelist:
+                - "02"
+                - "03"
+              deviceLabelFields:
+                - vendor
+
+    mps:
+      # root specifies the location where files and folders for managing MPS will
+      # be created. This includes a daemon-specific /dev/shm and pipe and log
+      # directories.
+      # Pipe directories will be created at {{ mps.root }}/{{ .ResourceName }}
+      root: "/run/nvidia/mps"
+
+    cdi:
+      # nvidiaHookPath specifies the path to the nvidia-cdi-hook or nvidia-ctk executables on the host.
+      # This is required to ensure that the generated CDI specification refers to the correct CDI hooks.
+      nvidiaHookPath: null
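
Note: as a quick smoke test of the values above, you can schedule a pod against the
nvidia.com/gpu extended resource that the device plugin advertises. The manifest below
is a minimal sketch and not part of this patch: the pod name and CUDA image tag are
placeholders, and it assumes the node already has the NVIDIA driver installed and that
the "nvidia" RuntimeClass referenced by runtimeClassName above exists in the cluster.

apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test                  # placeholder name
spec:
  restartPolicy: Never
  runtimeClassName: nvidia              # matches runtimeClassName in the values above
  containers:
    - name: cuda
      image: nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04   # example CUDA base image
      command: ["nvidia-smi"]           # prints the GPUs visible to the container
      resources:
        limits:
          nvidia.com/gpu: 1             # extended resource exposed by the device plugin
  tolerations:
    - key: nvidia.com/gpu               # needed if GPU nodes carry the nvidia.com/gpu taint
      operator: Exists
      effect: NoSchedule

If the plugin is healthy, "kubectl describe node <gpu-node>" lists nvidia.com/gpu under
Capacity and Allocatable, and the pod above completes after printing the nvidia-smi output.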