fleet-infra/infrastructure/controllers/nvidia-device-plugin/release.yaml

apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: nvidia-device-plugin
  namespace: nvidia-device-plugin
spec:
  chart:
    spec:
      chart: nvidia-device-plugin
      version: 0.x
      sourceRef:
        kind: HelmRepository
        name: nvidia-device-plugin
  interval: 15m
  releaseName: nvidia-device-plugin
  values:
    # Plugin configuration
    # Only one of "name" or "map" should ever be set for a given deployment.
    # Use "name" to point to an external ConfigMap with a list of configurations.
    # Use "map" to build an integrated ConfigMap from a set of configurations as
    # part of this helm chart. An example of setting "map" might be:
    # config:
    #   map:
    #     default: |-
    #       version: v1
    #       flags:
    #         migStrategy: none
    #     mig-single: |-
    #       version: v1
    #       flags:
    #         migStrategy: single
    #     mig-mixed: |-
    #       version: v1
    #       flags:
    #         migStrategy: mixed
    config:
      # ConfigMap name if pulling from an external ConfigMap
      name: "nvidia-plugin-configs"
      # List of fallback strategies to attempt if no config is selected and no default is provided
      fallbackStrategies: ["named", "single"]
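      # For reference, the external ConfigMap named above is expected to live in the
      # release namespace and carry one configuration per data key, in the same
      # format as the "map" example in the comments above. A rough sketch only; the
      # data key and its contents here are illustrative assumptions, not part of
      # this release:
      #
      #   apiVersion: v1
      #   kind: ConfigMap
      #   metadata:
      #     name: nvidia-plugin-configs
      #     namespace: nvidia-device-plugin
      #   data:
      #     default: |-
      #       version: v1
      #       flags:
      #         migStrategy: none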
    compatWithCPUManager: null
    migStrategy: null
    failOnInitError: null
    deviceListStrategy: null
    deviceIDStrategy: null
    nvidiaDriverRoot: null
    gdsEnabled: null
    mofedEnabled: null
    deviceDiscoveryStrategy: null
    nameOverride: ""
    fullnameOverride: ""
    namespaceOverride: ""
    selectorLabelsOverride: {}
    allowDefaultNamespace: false
    imagePullSecrets: []
    image:
      repository: nvcr.io/nvidia/k8s-device-plugin
      pullPolicy: IfNotPresent
      # Overrides the image tag whose default is the chart appVersion.
      tag: ""
    updateStrategy:
      type: RollingUpdate
    podAnnotations: {}
    podSecurityContext: {}
    securityContext: {}
    resources: {}
    nodeSelector: {}
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                # On discrete-GPU based systems NFD adds the following label where 10de is the NVIDIA PCI vendor ID
                - key: feature.node.kubernetes.io/pci-10de.present
                  operator: In
                  values:
                    - "true"
            - matchExpressions:
                # On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA
                - key: feature.node.kubernetes.io/cpu-model.vendor_id
                  operator: In
                  values:
                    - "NVIDIA"
            - matchExpressions:
                # We allow a GPU deployment to be forced by setting the following label to "true"
                - key: "nvidia.com/gpu.present"
                  operator: In
                  values:
                    - "true"
    tolerations:
      # This toleration is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      - key: CriticalAddonsOnly
        operator: Exists
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
    # Mark this pod as a critical add-on; when enabled, the critical add-on
    # scheduler reserves resources for critical add-on pods so that they can
    # be rescheduled after a failure.
    # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
    priorityClassName: "system-node-critical"
    runtimeClassName: nvidia
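    # runtimeClassName assumes a RuntimeClass called "nvidia" already exists in the
    # cluster (normally created as part of the NVIDIA container runtime / containerd
    # setup on the GPU nodes). A minimal sketch of such an object, assuming the
    # containerd handler is also named "nvidia":
    #
    #   apiVersion: node.k8s.io/v1
    #   kind: RuntimeClass
    #   metadata:
    #     name: nvidia
    #   handler: nvidia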
    devicePlugin:
      enabled: true
    gfd:
      enabled: true
      nameOverride: gpu-feature-discovery
      namespaceOverride: ""
      noTimestamp: null
      sleepInterval: null
      securityContext:
        # privileged access is required for the gpu-feature-discovery to access the
        # vgpu info on a host.
        # TODO: This should be optional and detected automatically.
        privileged: true
    # Helm dependency
    nfd:
      nameOverride: node-feature-discovery
      enableNodeFeatureApi: false
      master:
        serviceAccount:
          name: node-feature-discovery
          create: true
        config:
          extraLabelNs: ["nvidia.com"]
      worker:
        tolerations:
          - key: "node-role.kubernetes.io/master"
            operator: "Equal"
            value: ""
            effect: "NoSchedule"
          - key: "nvidia.com/gpu"
            operator: "Equal"
            value: "present"
            effect: "NoSchedule"
        config:
          sources:
            pci:
              deviceClassWhitelist:
                - "02"
                - "03"
              deviceLabelFields:
                - vendor
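              # The class codes in deviceClassWhitelist above are PCI base classes:
              # 03 = display controllers (GPUs), 02 = network controllers
              # (presumably included so NVIDIA networking devices are labelled too).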
    mps:
      # root specifies the location where files and folders for managing MPS will
      # be created. This includes a daemon-specific /dev/shm and pipe and log
      # directories.
      # Pipe directories will be created at {{ mps.root }}/{{ .ResourceName }}
      root: "/run/nvidia/mps"
    cdi:
      # nvidiaHookPath specifies the path to the nvidia-cdi-hook or nvidia-ctk executables on the host.
      # This is required to ensure that the generated CDI specification refers to the correct CDI hooks.
      nvidiaHookPath: null
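      # Left at null here. If the CDI hooks live somewhere non-standard on the host,
      # this can point at the toolkit binary explicitly, e.g.
      # (path is an assumption; it depends on how the NVIDIA container toolkit was installed):
      # nvidiaHookPath: /usr/bin/nvidia-ctk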