mirror of https://github.com/michaelthomson0797/fleet-infra.git
synced 2026-02-04 04:59:54 +00:00
nvidia-device-plugin/helmrelease-nvidia-device-plugin.yaml (new file, 176 lines)
@@ -0,0 +1,176 @@
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: nvidia-device-plugin
  namespace: nvidia-device-plugin
spec:
  chart:
    spec:
      chart: nvidia-device-plugin
      version: 0.x
      sourceRef:
        kind: HelmRepository
        name: nvidia-device-plugin
        namespace: flux-system
  interval: 15m
  timeout: 5m
  releaseName: nvidia-device-plugin
  values:
    # Plugin configuration
    # Only one of "name" or "map" should ever be set for a given deployment.
    # Use "name" to point to an external ConfigMap with a list of configurations.
    # Use "map" to build an integrated ConfigMap from a set of configurations as
    # part of this helm chart. An example of setting "map" might be:
    # config:
    #   map:
    #     default: |-
    #       version: v1
    #       flags:
    #         migStrategy: none
    #     mig-single: |-
    #       version: v1
    #       flags:
    #         migStrategy: single
    #     mig-mixed: |-
    #       version: v1
    #       flags:
    #         migStrategy: mixed
    config:
      # ConfigMap name if pulling from an external ConfigMap
      name: ""
      # Set of named configs to build an integrated ConfigMap from
      map: {}
      # Default config name within the ConfigMap
      default: ""
      # List of fallback strategies to attempt if no config is selected and no default is provided
      fallbackStrategies: ["named", "single"]

    compatWithCPUManager: null
    migStrategy: null
    failOnInitError: null
    deviceListStrategy: null
    deviceIDStrategy: null
    nvidiaDriverRoot: null
    gdsEnabled: null
    mofedEnabled: null
    deviceDiscoveryStrategy: null

    nameOverride: ""
    fullnameOverride: ""
    namespaceOverride: ""
    selectorLabelsOverride: {}

    allowDefaultNamespace: false

    imagePullSecrets: []
    image:
      repository: nvcr.io/nvidia/k8s-device-plugin
      pullPolicy: IfNotPresent
      # Overrides the image tag whose default is the chart appVersion.
      tag: ""

    updateStrategy:
      type: RollingUpdate

    podAnnotations: {}
    podSecurityContext: {}
    securityContext: {}

    resources: {}
    nodeSelector: {}
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                # On discrete-GPU based systems NFD adds the following label where 10de is the NVIDIA PCI vendor ID
                - key: feature.node.kubernetes.io/pci-10de.present
                  operator: In
                  values:
                    - "true"
            - matchExpressions:
                # On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA
                - key: feature.node.kubernetes.io/cpu-model.vendor_id
                  operator: In
                  values:
                    - "NVIDIA"
            - matchExpressions:
                # We allow a GPU deployment to be forced by setting the following label to "true"
                - key: "nvidia.com/gpu.present"
                  operator: In
                  values:
                    - "true"
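    # For example, a node that NFD does not label can be opted in manually
    # (assumed operator workflow, not part of this chart):
    #   kubectl label node <node-name> nvidia.com/gpu.present=true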
    tolerations:
      # This toleration is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      - key: CriticalAddonsOnly
        operator: Exists
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule

    # Mark this pod as a critical add-on; when enabled, the critical add-on
    # scheduler reserves resources for critical add-on pods so that they can
    # be rescheduled after a failure.
    # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
    priorityClassName: "system-node-critical"

    runtimeClassName: nvidia
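    # Note: this assumes the cluster already defines a RuntimeClass named "nvidia"
    # backed by the NVIDIA container runtime; a minimal sketch of such an object
    # (assumed, not created by this release) would be:
    #   apiVersion: node.k8s.io/v1
    #   kind: RuntimeClass
    #   metadata:
    #     name: nvidia
    #   handler: nvidia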

    devicePlugin:
      enabled: true

    gfd:
      enabled: true
      nameOverride: gpu-feature-discovery
      namespaceOverride: ""
      noTimestamp: null
      sleepInterval: null
      securityContext:
        # privileged access is required for the gpu-feature-discovery to access the
        # vgpu info on a host.
        # TODO: This should be optional and detected automatically.
        privileged: true

    # Helm dependency
    nfd:
      nameOverride: node-feature-discovery
      enableNodeFeatureApi: false
      master:
        serviceAccount:
          name: node-feature-discovery
          create: true
        config:
          extraLabelNs: ["nvidia.com"]

      worker:
        tolerations:
          - key: "node-role.kubernetes.io/master"
            operator: "Equal"
            value: ""
            effect: "NoSchedule"
          - key: "nvidia.com/gpu"
            operator: "Equal"
            value: "present"
            effect: "NoSchedule"
        config:
          sources:
            pci:
              deviceClassWhitelist:
                - "02"
                - "03"
              deviceLabelFields:
                - vendor

    mps:
      # root specifies the location where files and folders for managing MPS will
      # be created. This includes a daemon-specific /dev/shm and pipe and log
      # directories.
      # Pipe directories will be created at {{ mps.root }}/{{ .ResourceName }}
      root: "/run/nvidia/mps"

    cdi:
      # nvidiaHookPath specifies the path to the nvidia-cdi-hook or nvidia-ctk executables on the host.
      # This is required to ensure that the generated CDI specification refers to the correct CDI hooks.
      nvidiaHookPath: null
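The sourceRef above points at a HelmRepository named nvidia-device-plugin in the flux-system namespace, which is not part of this file. A minimal sketch of such a source, assuming the upstream NVIDIA chart repository URL and a Flux version matching the v2beta2 HelmRelease API, might look like:

apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: nvidia-device-plugin
  namespace: flux-system
spec:
  interval: 1h
  url: https://nvidia.github.io/k8s-device-plugin

The HelmRelease itself lives in the nvidia-device-plugin namespace, so that namespace is expected to exist (or be defined elsewhere in the repository) before Flux reconciles this object.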