Mirror of https://github.com/michaelthomson0797/fleet-infra.git, synced 2026-02-04 13:09:53 +00:00
update: nvidia device plugin -> cdi plugin
infrastructure/controllers/generic-cdi-plugin/daemonset.yaml (new file, +47 lines)
@@ -0,0 +1,47 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: generic-cdi-plugin-daemonset
  namespace: generic-cdi-plugin
spec:
  selector:
    matchLabels:
      name: generic-cdi-plugin
  template:
    metadata:
      labels:
        name: generic-cdi-plugin
        app.kubernetes.io/component: generic-cdi-plugin
        app.kubernetes.io/name: generic-cdi-plugin
    spec:
      containers:
        - image: ghcr.io/olfillasodikno/generic-cdi-plugin:main
          name: generic-cdi-plugin
          command:
            - /generic-cdi-plugin
            - /var/run/cdi/nvidia-container-toolkit.json
          imagePullPolicy: Always
          securityContext:
            privileged: true
          tty: true
          volumeMounts:
            - name: kubelet
              mountPath: /var/lib/kubelet
            - name: nvidia-container-toolkit
              mountPath: /var/run/cdi/nvidia-container-toolkit.json
      volumes:
        - name: kubelet
          hostPath:
            path: /var/lib/kubelet
        - name: nvidia-container-toolkit
          hostPath:
            path: /var/run/cdi/nvidia-container-toolkit.json
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: "nixos-nvidia-cdi"
                    operator: In
                    values:
                      - "enabled"
@@ -1,4 +1,4 @@
 apiVersion: v1
 kind: Namespace
 metadata:
-  name: nvidia-device-plugin
+  name: generic-cdi-plugin
@@ -1,16 +0,0 @@
apiVersion: v1
data:
  config: |
    version: v1
    sharing:
      timeSlicing:
        renameByDefault: false
        failRequestsGreaterThanOne: true
        resources:
          - name: nvidia.com/gpu
            replicas: 10
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: nvidia-plugin-configs
  namespace: nvidia-device-plugin
@@ -1,170 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: nvidia-device-plugin
  namespace: nvidia-device-plugin
spec:
  chart:
    spec:
      chart: nvidia-device-plugin
      version: 0.x
      sourceRef:
        kind: HelmRepository
        name: nvidia-device-plugin
  interval: 15m
  releaseName: nvidia-device-plugin
  values:
    # Plugin configuration
    # Only one of "name" or "map" should ever be set for a given deployment.
    # Use "name" to point to an external ConfigMap with a list of configurations.
    # Use "map" to build an integrated ConfigMap from a set of configurations as
    # part of this helm chart. An example of setting "map" might be:
    # config:
    #   map:
    #     default: |-
    #       version: v1
    #       flags:
    #         migStrategy: none
    #     mig-single: |-
    #       version: v1
    #       flags:
    #         migStrategy: single
    #     mig-mixed: |-
    #       version: v1
    #       flags:
    #         migStrategy: mixed
    config:
      # ConfigMap name if pulling from an external ConfigMap
      name: "nvidia-plugin-configs"
      # List of fallback strategies to attempt if no config is selected and no default is provided
      fallbackStrategies: ["named", "single"]

    compatWithCPUManager: null
    migStrategy: null
    failOnInitError: null
    deviceListStrategy: null
    deviceIDStrategy: null
    nvidiaDriverRoot: null
    gdsEnabled: null
    mofedEnabled: null
    deviceDiscoveryStrategy: null

    nameOverride: ""
    fullnameOverride: ""
    namespaceOverride: ""
    selectorLabelsOverride: {}

    allowDefaultNamespace: false

    imagePullSecrets: []
    image:
      repository: nvcr.io/nvidia/k8s-device-plugin
      pullPolicy: IfNotPresent
      # Overrides the image tag whose default is the chart appVersion.
      tag: ""

    updateStrategy:
      type: RollingUpdate

    podAnnotations: {}
    podSecurityContext: {}
    securityContext: {}

    resources: {}
    nodeSelector: {}
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                # On discrete-GPU based systems NFD adds the following label where 10de is the NVIDIA PCI vendor ID
                - key: feature.node.kubernetes.io/pci-10de.present
                  operator: In
                  values:
                    - "true"
            - matchExpressions:
                # On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA
                - key: feature.node.kubernetes.io/cpu-model.vendor_id
                  operator: In
                  values:
                    - "NVIDIA"
            - matchExpressions:
                # We allow a GPU deployment to be forced by setting the following label to "true"
                - key: "nvidia.com/gpu.present"
                  operator: In
                  values:
                    - "true"
    tolerations:
      # This toleration is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      - key: CriticalAddonsOnly
        operator: Exists
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule

    # Mark this pod as a critical add-on; when enabled, the critical add-on
    # scheduler reserves resources for critical add-on pods so that they can
    # be rescheduled after a failure.
    # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
    priorityClassName: "system-node-critical"

    runtimeClassName: nvidia

    devicePlugin:
      enabled: true

    gfd:
      enabled: true
      nameOverride: gpu-feature-discovery
      namespaceOverride: ""
      noTimestamp: null
      sleepInterval: null
      securityContext:
        # privileged access is required for the gpu-feature-discovery to access the
        # vgpu info on a host.
        # TODO: This should be optional and detected automatically.
        privileged: true

    # Helm dependency
    nfd:
      nameOverride: node-feature-discovery
      enableNodeFeatureApi: false
      master:
        serviceAccount:
          name: node-feature-discovery
          create: true
        config:
          extraLabelNs: ["nvidia.com"]

      worker:
        tolerations:
          - key: "node-role.kubernetes.io/master"
            operator: "Equal"
            value: ""
            effect: "NoSchedule"
          - key: "nvidia.com/gpu"
            operator: "Equal"
            value: "present"
            effect: "NoSchedule"
        config:
          sources:
            pci:
              deviceClassWhitelist:
                - "02"
                - "03"
              deviceLabelFields:
                - vendor

    mps:
      # root specifies the location where files and folders for managing MPS will
      # be created. This includes a daemon-specific /dev/shm and pipe and log
      # directories.
      # Pipe directories will be created at {{ mps.root }}/{{ .ResourceName }}
      root: "/run/nvidia/mps"


    cdi:
      # nvidiaHookPath specifies the path to the nvidia-cdi-hook or nvidia-ctk executables on the host.
      # This is required to ensure that the generated CDI specification refers to the correct CDI hooks.
      nvidiaHookPath: null
@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: nvidia-device-plugin
  namespace: nvidia-device-plugin
spec:
  interval: 15m
  url: https://nvidia.github.io/k8s-device-plugin