Mirror of https://github.com/michaelthomson0797/fleet-infra.git, synced 2026-02-04 13:09:53 +00:00
update: nvidia device plugin -> cdi plugin
infrastructure/controllers/generic-cdi-plugin/daemonset.yaml (new file, +47)
@@ -0,0 +1,47 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: generic-cdi-plugin-daemonset
  namespace: generic-cdi-plugin
spec:
  selector:
    matchLabels:
      name: generic-cdi-plugin
  template:
    metadata:
      labels:
        name: generic-cdi-plugin
        app.kubernetes.io/component: generic-cdi-plugin
        app.kubernetes.io/name: generic-cdi-plugin
    spec:
      containers:
        - image: ghcr.io/olfillasodikno/generic-cdi-plugin:main
          name: generic-cdi-plugin
          command:
            - /generic-cdi-plugin
            - /var/run/cdi/nvidia-container-toolkit.json
          imagePullPolicy: Always
          securityContext:
            privileged: true
          tty: true
          volumeMounts:
            - name: kubelet
              mountPath: /var/lib/kubelet
            - name: nvidia-container-toolkit
              mountPath: /var/run/cdi/nvidia-container-toolkit.json
      volumes:
        - name: kubelet
          hostPath:
            path: /var/lib/kubelet
        - name: nvidia-container-toolkit
          hostPath:
            path: /var/run/cdi/nvidia-container-toolkit.json
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: "nixos-nvidia-cdi"
                    operator: In
                    values:
                      - "enabled"
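Note on the mounted CDI spec (context, not part of the commit): /var/run/cdi/nvidia-container-toolkit.json is a Container Device Interface specification that the host must already provide, and the nodeAffinity above gates scheduling to nodes labeled nixos-nvidia-cdi=enabled (e.g. kubectl label node <name> nixos-nvidia-cdi=enabled). A minimal sketch of what such a spec can contain, written as YAML for readability (the actual host file is JSON, typically produced by nvidia-ctk cdi generate; device names and paths are illustrative):

# Hypothetical CDI spec sketch -- not the generated file referenced by this commit.
cdiVersion: "0.6.0"
kind: nvidia.com/gpu
devices:
  - name: "0"                   # addressable as nvidia.com/gpu=0
    containerEdits:
      deviceNodes:
        - path: /dev/nvidia0
containerEdits:                 # edits applied for every device
  deviceNodes:
    - path: /dev/nvidiactl
    - path: /dev/nvidia-uvm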
@@ -1,4 +1,4 @@
 apiVersion: v1
 kind: Namespace
 metadata:
-  name: nvidia-device-plugin
+  name: generic-cdi-plugin
@@ -1,16 +0,0 @@
apiVersion: v1
data:
  config: |
    version: v1
    sharing:
      timeSlicing:
        renameByDefault: false
        failRequestsGreaterThanOne: true
        resources:
          - name: nvidia.com/gpu
            replicas: 10
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: nvidia-plugin-configs
  namespace: nvidia-device-plugin
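Context for the deletion above: this ConfigMap enabled GPU time slicing, advertising each physical GPU as 10 nvidia.com/gpu replicas and rejecting any request greater than one. Workloads consumed a slice with an ordinary resource request; a hedged sketch (pod name and image tag are illustrative):

# Sketch of a pod that consumed one time-sliced GPU under the removed config;
# up to 10 such pods could share a single physical GPU.
apiVersion: v1
kind: Pod
metadata:
  name: cuda-smoke-test
spec:
  runtimeClassName: nvidia
  containers:
    - name: cuda
      image: nvcr.io/nvidia/cuda:12.3.1-base-ubuntu22.04
      resources:
        limits:
          nvidia.com/gpu: 1     # one slice of a physical GPU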
@@ -1,170 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: nvidia-device-plugin
  namespace: nvidia-device-plugin
spec:
  chart:
    spec:
      chart: nvidia-device-plugin
      version: 0.x
      sourceRef:
        kind: HelmRepository
        name: nvidia-device-plugin
  interval: 15m
  releaseName: nvidia-device-plugin
  values:
    # Plugin configuration
    # Only one of "name" or "map" should ever be set for a given deployment.
    # Use "name" to point to an external ConfigMap with a list of configurations.
    # Use "map" to build an integrated ConfigMap from a set of configurations as
    # part of this helm chart. An example of setting "map" might be:
    # config:
    #   map:
    #     default: |-
    #       version: v1
    #       flags:
    #         migStrategy: none
    #     mig-single: |-
    #       version: v1
    #       flags:
    #         migStrategy: single
    #     mig-mixed: |-
    #       version: v1
    #       flags:
    #         migStrategy: mixed
    config:
      # ConfigMap name if pulling from an external ConfigMap
      name: "nvidia-plugin-configs"
      # List of fallback strategies to attempt if no config is selected and no default is provided
      fallbackStrategies: ["named", "single"]

    compatWithCPUManager: null
    migStrategy: null
    failOnInitError: null
    deviceListStrategy: null
    deviceIDStrategy: null
    nvidiaDriverRoot: null
    gdsEnabled: null
    mofedEnabled: null
    deviceDiscoveryStrategy: null

    nameOverride: ""
    fullnameOverride: ""
    namespaceOverride: ""
    selectorLabelsOverride: {}

    allowDefaultNamespace: false

    imagePullSecrets: []
    image:
      repository: nvcr.io/nvidia/k8s-device-plugin
      pullPolicy: IfNotPresent
      # Overrides the image tag whose default is the chart appVersion.
      tag: ""

    updateStrategy:
      type: RollingUpdate

    podAnnotations: {}
    podSecurityContext: {}
    securityContext: {}

    resources: {}
    nodeSelector: {}
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                # On discrete-GPU based systems NFD adds the following label where 10de is the NVIDIA PCI vendor ID
                - key: feature.node.kubernetes.io/pci-10de.present
                  operator: In
                  values:
                    - "true"
            - matchExpressions:
                # On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA
                - key: feature.node.kubernetes.io/cpu-model.vendor_id
                  operator: In
                  values:
                    - "NVIDIA"
            - matchExpressions:
                # We allow a GPU deployment to be forced by setting the following label to "true"
                - key: "nvidia.com/gpu.present"
                  operator: In
                  values:
                    - "true"
    tolerations:
      # This toleration is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      - key: CriticalAddonsOnly
        operator: Exists
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule

    # Mark this pod as a critical add-on; when enabled, the critical add-on
    # scheduler reserves resources for critical add-on pods so that they can
    # be rescheduled after a failure.
    # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
    priorityClassName: "system-node-critical"

    runtimeClassName: nvidia

    devicePlugin:
      enabled: true

    gfd:
      enabled: true
      nameOverride: gpu-feature-discovery
      namespaceOverride: ""
      noTimestamp: null
      sleepInterval: null
      securityContext:
        # privileged access is required for the gpu-feature-discovery to access the
        # vgpu info on a host.
        # TODO: This should be optional and detected automatically.
        privileged: true

    # Helm dependency
    nfd:
      nameOverride: node-feature-discovery
      enableNodeFeatureApi: false
      master:
        serviceAccount:
          name: node-feature-discovery
          create: true
        config:
          extraLabelNs: ["nvidia.com"]

      worker:
        tolerations:
          - key: "node-role.kubernetes.io/master"
            operator: "Equal"
            value: ""
            effect: "NoSchedule"
          - key: "nvidia.com/gpu"
            operator: "Equal"
            value: "present"
            effect: "NoSchedule"
        config:
          sources:
            pci:
              deviceClassWhitelist:
                - "02"
                - "03"
              deviceLabelFields:
                - vendor

    mps:
      # root specifies the location where files and folders for managing MPS will
      # be created. This includes a daemon-specific /dev/shm and pipe and log
      # directories.
      # Pipe directories will be created at {{ mps.root }}/{{ .ResourceName }}
      root: "/run/nvidia/mps"


    cdi:
      # nvidiaHookPath specifies the path to the nvidia-cdi-hook or nvidia-ctk executables on the host.
      # This is required to ensure that the generated CDI specification refers to the correct CDI hooks.
      nvidiaHookPath: null
@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: nvidia-device-plugin
  namespace: nvidia-device-plugin
spec:
  interval: 15m
  url: https://nvidia.github.io/k8s-device-plugin
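Since this is a Flux-managed repo, the deleted nvidia-device-plugin objects are only garbage-collected from the cluster if the Kustomization reconciling this path has pruning enabled; a hedged sketch (name and path are assumptions, not taken from this repo):

apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: infra-controllers              # assumed name
  namespace: flux-system
spec:
  interval: 10m
  prune: true                          # required so removed manifests are deleted from the cluster
  sourceRef:
    kind: GitRepository
    name: flux-system
  path: ./infrastructure/controllers   # assumed path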