update: nvidia device plugin -> cdi plugin

This commit is contained in:
2025-12-03 11:49:49 -05:00
parent 3d31e8ec54
commit 6489eb02fe
5 changed files with 48 additions and 195 deletions

View File

@@ -0,0 +1,47 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: generic-cdi-plugin-daemonset
namespace: generic-cdi-plugin
spec:
selector:
matchLabels:
name: generic-cdi-plugin
template:
metadata:
labels:
name: generic-cdi-plugin
app.kubernetes.io/component: generic-cdi-plugin
app.kubernetes.io/name: generic-cdi-plugin
spec:
containers:
- image: ghcr.io/olfillasodikno/generic-cdi-plugin:main
name: generic-cdi-plugin
command:
- /generic-cdi-plugin
- /var/run/cdi/nvidia-container-toolkit.json
imagePullPolicy: Always
securityContext:
privileged: true
tty: true
volumeMounts:
- name: kubelet
mountPath: /var/lib/kubelet
- name: nvidia-container-toolkit
mountPath: /var/run/cdi/nvidia-container-toolkit.json
volumes:
- name: kubelet
hostPath:
path: /var/lib/kubelet
- name: nvidia-container-toolkit
hostPath:
path: /var/run/cdi/nvidia-container-toolkit.json
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "nixos-nvidia-cdi"
operator: In
values:
- "enabled"

View File

@@ -1,4 +1,4 @@
apiVersion: v1 apiVersion: v1
kind: Namespace kind: Namespace
metadata: metadata:
name: nvidia-device-plugin name: generic-cdi-plugin

View File

@@ -1,16 +0,0 @@
apiVersion: v1
data:
config: |
version: v1
sharing:
timeSlicing:
renameByDefault: false
failRequestsGreaterThanOne: true
resources:
- name: nvidia.com/gpu
replicas: 10
kind: ConfigMap
metadata:
creationTimestamp: null
name: nvidia-plugin-configs
namespace: nvidia-device-plugin

View File

@@ -1,170 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
name: nvidia-device-plugin
namespace: nvidia-device-plugin
spec:
chart:
spec:
chart: nvidia-device-plugin
version: 0.x
sourceRef:
kind: HelmRepository
name: nvidia-device-plugin
interval: 15m
releaseName: nvidia-device-plugin
values:
# Plugin configuration
# Only one of "name" or "map" should ever be set for a given deployment.
# Use "name" to point to an external ConfigMap with a list of configurations.
# Use "map" to build an integrated ConfigMap from a set of configurations as
# part of this helm chart. An example of setting "map" might be:
# config:
# map:
# default: |-
# version: v1
# flags:
# migStrategy: none
# mig-single: |-
# version: v1
# flags:
# migStrategy: single
# mig-mixed: |-
# version: v1
# flags:
# migStrategy: mixed
config:
# ConfigMap name if pulling from an external ConfigMap
name: "nvidia-plugin-configs"
# List of fallback strategies to attempt if no config is selected and no default is provided
fallbackStrategies: ["named" , "single"]
compatWithCPUManager: null
migStrategy: null
failOnInitError: null
deviceListStrategy: null
deviceIDStrategy: null
nvidiaDriverRoot: null
gdsEnabled: null
mofedEnabled: null
deviceDiscoveryStrategy: null
nameOverride: ""
fullnameOverride: ""
namespaceOverride: ""
selectorLabelsOverride: {}
allowDefaultNamespace: false
imagePullSecrets: []
image:
repository: nvcr.io/nvidia/k8s-device-plugin
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
tag: ""
updateStrategy:
type: RollingUpdate
podAnnotations: {}
podSecurityContext: {}
securityContext: {}
resources: {}
nodeSelector: {}
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
# On discrete-GPU based systems NFD adds the following label where 10de is the NVIDIA PCI vendor ID
- key: feature.node.kubernetes.io/pci-10de.present
operator: In
values:
- "true"
- matchExpressions:
# On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA
- key: feature.node.kubernetes.io/cpu-model.vendor_id
operator: In
values:
- "NVIDIA"
- matchExpressions:
# We allow a GPU deployment to be forced by setting the following label to "true"
- key: "nvidia.com/gpu.present"
operator: In
values:
- "true"
tolerations:
# This toleration is deprecated. Kept here for backward compatibility
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
- key: CriticalAddonsOnly
operator: Exists
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
runtimeClassName: nvidia
devicePlugin:
enabled: true
gfd:
enabled: true
nameOverride: gpu-feature-discovery
namespaceOverride: ""
noTimestamp: null
sleepInterval: null
securityContext:
# privileged access is required for the gpu-feature-discovery to access the
# vgpu info on a host.
# TODO: This should be optional and detected automatically.
privileged: true
# Helm dependency
nfd:
nameOverride: node-feature-discovery
enableNodeFeatureApi: false
master:
serviceAccount:
name: node-feature-discovery
create: true
config:
extraLabelNs: ["nvidia.com"]
worker:
tolerations:
- key: "node-role.kubernetes.io/master"
operator: "Equal"
value: ""
effect: "NoSchedule"
- key: "nvidia.com/gpu"
operator: "Equal"
value: "present"
effect: "NoSchedule"
config:
sources:
pci:
deviceClassWhitelist:
- "02"
- "03"
deviceLabelFields:
- vendor
mps:
# root specifies the location where files and folders for managing MPS will
# be created. This includes a daemon-specific /dev/shm and pipe and log
# directories.
# Pipe directories will be created at {{ mps.root }}/{{ .ResourceName }}
root: "/run/nvidia/mps"
cdi:
# nvidiaHookPath specifies the path to the nvidia-cdi-hook or nvidia-ctk executables on the host.
# This is required to ensure that the generated CDI specification refers to the correct CDI hooks.
nvidiaHookPath: null

View File

@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: nvidia-device-plugin
namespace: nvidia-device-plugin
spec:
interval: 15m
url: https://nvidia.github.io/k8s-device-plugin