Mirror of https://github.com/michaelthomson0797/fleet-infra.git, synced 2026-02-04 13:09:53 +00:00
@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: nvidia-device-plugin
  namespace: flux-system
spec:
  interval: 15m
  url: https://nvidia.github.io/k8s-device-plugin
@@ -1,18 +0,0 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: nvidia-device-plugin
  namespace: flux-system
spec:
  interval: 15m
  path: ./nvidia-device-plugin
  prune: true # remove any elements later removed from the above path
  timeout: 2m # if not set, this defaults to the interval duration
  sourceRef:
    kind: GitRepository
    name: flux-system
  healthChecks:
    - apiVersion: helm.toolkit.fluxcd.io/v2beta2
      kind: HelmRelease
      name: nvidia-device-plugin
      namespace: nvidia-device-plugin
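# Illustrative sketch only: the path ./nvidia-device-plugin above is expected to hold
# the manifests shown on this page (Namespace, HelmRepository, HelmRelease). The file
# names below are assumptions; if the directory carries no kustomization.yaml, Flux
# generates one that includes every manifest it finds, roughly equivalent to:
#   apiVersion: kustomize.config.k8s.io/v1beta1
#   kind: Kustomization
#   resources:
#     - namespace.yaml
#     - helm-repository.yaml
#     - helm-release.yaml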
@@ -1,4 +0,0 @@
apiVersion: v1
kind: Namespace
metadata:
  name: nvidia-device-plugin
@@ -1,176 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: nvidia-device-plugin
  namespace: nvidia-device-plugin
spec:
  chart:
    spec:
      chart: nvidia-device-plugin
      version: 0.x
      sourceRef:
        kind: HelmRepository
        name: nvidia-device-plugin
        namespace: flux-system
  interval: 15m
  timeout: 5m
  releaseName: nvidia-device-plugin
  values:
    # Plugin configuration
    # Only one of "name" or "map" should ever be set for a given deployment.
    # Use "name" to point to an external ConfigMap with a list of configurations.
    # Use "map" to build an integrated ConfigMap from a set of configurations as
    # part of this helm chart. An example of setting "map" might be:
    # config:
    #   map:
    #     default: |-
    #       version: v1
    #       flags:
    #         migStrategy: none
    #     mig-single: |-
    #       version: v1
    #       flags:
    #         migStrategy: single
    #     mig-mixed: |-
    #       version: v1
    #       flags:
    #         migStrategy: mixed
    config:
      # ConfigMap name if pulling from an external ConfigMap
      name: ""
      # Set of named configs to build an integrated ConfigMap from
      map: {}
      # Default config name within the ConfigMap
      default: ""
      # List of fallback strategies to attempt if no config is selected and no default is provided
      fallbackStrategies: ["named", "single"]
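      # Illustrative sketch only: if "name" were used instead of "map", it would point
      # at an external ConfigMap in the plugin's namespace whose data keys are config
      # names. The ConfigMap name below is an assumption, not a value used by this
      # release:
      #   apiVersion: v1
      #   kind: ConfigMap
      #   metadata:
      #     name: nvidia-plugin-configs
      #     namespace: nvidia-device-plugin
      #   data:
      #     default: |-
      #       version: v1
      #       flags:
      #         migStrategy: none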
    compatWithCPUManager: null
    migStrategy: null
    failOnInitError: null
    deviceListStrategy: null
    deviceIDStrategy: null
    nvidiaDriverRoot: null
    gdsEnabled: null
    mofedEnabled: null
    deviceDiscoveryStrategy: null

    nameOverride: ""
    fullnameOverride: ""
    namespaceOverride: ""
    selectorLabelsOverride: {}

    allowDefaultNamespace: false

    imagePullSecrets: []
    image:
      repository: nvcr.io/nvidia/k8s-device-plugin
      pullPolicy: IfNotPresent
      # Overrides the image tag whose default is the chart appVersion.
      tag: ""

    updateStrategy:
      type: RollingUpdate

    podAnnotations: {}
    podSecurityContext: {}
    securityContext: {}

    resources: {}
    nodeSelector: {}
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                # On discrete-GPU based systems NFD adds the following label, where 10de is the NVIDIA PCI vendor ID
                - key: feature.node.kubernetes.io/pci-10de.present
                  operator: In
                  values:
                    - "true"
            - matchExpressions:
                # On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA
                - key: feature.node.kubernetes.io/cpu-model.vendor_id
                  operator: In
                  values:
                    - "NVIDIA"
            - matchExpressions:
                # We allow a GPU deployment to be forced by setting the following label to "true"
                - key: "nvidia.com/gpu.present"
                  operator: In
                  values:
                    - "true"
    tolerations:
      # This toleration is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      - key: CriticalAddonsOnly
        operator: Exists
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule

    # Mark this pod as a critical add-on; when enabled, the critical add-on
    # scheduler reserves resources for critical add-on pods so that they can
    # be rescheduled after a failure.
    # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
    priorityClassName: "system-node-critical"

    runtimeClassName: nvidia

    devicePlugin:
      enabled: true

    gfd:
      enabled: true
      nameOverride: gpu-feature-discovery
      namespaceOverride: ""
      noTimestamp: null
      sleepInterval: null
      securityContext:
        # privileged access is required for the gpu-feature-discovery to access the
        # vgpu info on a host.
        # TODO: This should be optional and detected automatically.
        privileged: true

    # Helm dependency
    nfd:
      nameOverride: node-feature-discovery
      enableNodeFeatureApi: false
      master:
        serviceAccount:
          name: node-feature-discovery
          create: true
        config:
          extraLabelNs: ["nvidia.com"]

      worker:
        tolerations:
          - key: "node-role.kubernetes.io/master"
            operator: "Equal"
            value: ""
            effect: "NoSchedule"
          - key: "nvidia.com/gpu"
            operator: "Equal"
            value: "present"
            effect: "NoSchedule"
        config:
          sources:
            pci:
              deviceClassWhitelist:
                - "02"
                - "03"
              deviceLabelFields:
                - vendor

    mps:
      # root specifies the location where files and folders for managing MPS will
      # be created. This includes a daemon-specific /dev/shm and pipe and log
      # directories.
      # Pipe directories will be created at {{ mps.root }}/{{ .ResourceName }}
      root: "/run/nvidia/mps"
    cdi:
      # nvidiaHookPath specifies the path to the nvidia-cdi-hook or nvidia-ctk executables on the host.
      # This is required to ensure that the generated CDI specification refers to the correct CDI hooks.
      nvidiaHookPath: null