From 3eb32b8d66e851817d6ad3307b65fab8383a8b60 Mon Sep 17 00:00:00 2001
From: Michael Thomson
Date: Tue, 12 May 2026 13:26:08 -0400
Subject: [PATCH] nvidia gpu operator

---
# Reviewer notes (this area between '---' and the diffstat is ignored by
# `git am`; nothing here becomes part of the commit).
#
# What the patch does, as shown by the hunks below:
#   * Deletes the generic-cdi-plugin DaemonSet manifest and the
#     generic-cdi-plugin Namespace manifest.
#   * Adds a HelmRepository named `nvidia` in namespace `gpu-operator`
#     pointing at https://helm.ngc.nvidia.com/nvidia (interval 15m).
#   * Adds a HelmRelease `gpu-operator` that installs chart `gpu-operator`
#     from that repository with chart version `v26.3.x`
#     (presumably a wildcard constraint tracking 26.3 patch releases; confirm).
#   * The release values set `driver.enabled: false` and
#     `toolkit.enabled: false`
#     (presumably because the nodes already provide the NVIDIA driver and
#     container toolkit; TODO confirm against the node configuration).
#   * `devicePlugin.config` references ConfigMap `time-slicing-config` with
#     default entry `time-slicing`; both match the ConfigMap name and data
#     key added in time-slicing-config.yaml.
#   * The ConfigMap entry declares `replicas: 5` for resource `nvidia.com/gpu`
#     under `sharing.timeSlicing.resources`.
#   * Adds the `gpu-operator` Namespace with the pod-security.kubernetes.io
#     enforce/audit/warn labels all set to `privileged`.
#
# NOTE(review): this copy of the patch was re-flowed from a
# whitespace-collapsed source. Line structure and hunk line counts were
# checked against the @@ headers, but the exact indentation inside the
# hunks (2-space, indented sequences) is a reconstruction. Verify against
# commit 3eb32b8d before applying, especially the two deletion hunks, which
# must match the existing files byte-for-byte.

 .../generic-cdi-plugin/daemonset.yaml         | 47 -------------------
 .../controllers/gpu-operator/release.yaml     | 24 ++++++++++
 .../controllers/gpu-operator/repository.yaml  |  8 ++++
 .../gpu-operator/time-slicing-config.yaml     | 13 +++++
 .../namespace-generic-cdi-plugin.yaml         |  4 --
 .../namespaces/namespace-gpu-operator.yaml    |  8 ++++
 6 files changed, 53 insertions(+), 51 deletions(-)
 delete mode 100644 infrastructure/controllers/generic-cdi-plugin/daemonset.yaml
 create mode 100644 infrastructure/controllers/gpu-operator/release.yaml
 create mode 100644 infrastructure/controllers/gpu-operator/repository.yaml
 create mode 100644 infrastructure/controllers/gpu-operator/time-slicing-config.yaml
 delete mode 100644 infrastructure/namespaces/namespace-generic-cdi-plugin.yaml
 create mode 100644 infrastructure/namespaces/namespace-gpu-operator.yaml

diff --git a/infrastructure/controllers/generic-cdi-plugin/daemonset.yaml b/infrastructure/controllers/generic-cdi-plugin/daemonset.yaml
deleted file mode 100644
index 5c5994f..0000000
--- a/infrastructure/controllers/generic-cdi-plugin/daemonset.yaml
+++ /dev/null
@@ -1,47 +0,0 @@
-apiVersion: apps/v1
-kind: DaemonSet
-metadata:
-  name: generic-cdi-plugin-daemonset
-  namespace: generic-cdi-plugin
-spec:
-  selector:
-    matchLabels:
-      name: generic-cdi-plugin
-  template:
-    metadata:
-      labels:
-        name: generic-cdi-plugin
-        app.kubernetes.io/component: generic-cdi-plugin
-        app.kubernetes.io/name: generic-cdi-plugin
-    spec:
-      containers:
-        - image: ghcr.io/olfillasodikno/generic-cdi-plugin:main
-          name: generic-cdi-plugin
-          command:
-            - /generic-cdi-plugin
-            - /var/run/cdi/nvidia-container-toolkit.json
-          imagePullPolicy: Always
-          securityContext:
-            privileged: true
-          tty: true
-          volumeMounts:
-            - name: kubelet
-              mountPath: /var/lib/kubelet
-            - name: nvidia-container-toolkit
-              mountPath: /var/run/cdi/nvidia-container-toolkit.json
-      volumes:
-        - name: kubelet
-          hostPath:
-            path: /var/lib/kubelet
-        - name: nvidia-container-toolkit
-          hostPath:
-            path: /var/run/cdi/nvidia-container-toolkit.json
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: "nixos-nvidia-cdi"
-                    operator: In
-                    values:
-                      - "enabled"
diff --git a/infrastructure/controllers/gpu-operator/release.yaml b/infrastructure/controllers/gpu-operator/release.yaml
new file mode 100644
index 0000000..80eee1b
--- /dev/null
+++ b/infrastructure/controllers/gpu-operator/release.yaml
@@ -0,0 +1,24 @@
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: gpu-operator
+  namespace: gpu-operator
+spec:
+  chart:
+    spec:
+      chart: gpu-operator
+      version: v26.3.x
+      sourceRef:
+        kind: HelmRepository
+        name: nvidia
+  interval: 15m
+  releaseName: gpu-operator
+  values:
+    driver:
+      enabled: false
+    toolkit:
+      enabled: false
+    devicePlugin:
+      config:
+        name: time-slicing-config
+        default: time-slicing
diff --git a/infrastructure/controllers/gpu-operator/repository.yaml b/infrastructure/controllers/gpu-operator/repository.yaml
new file mode 100644
index 0000000..bbe46b3
--- /dev/null
+++ b/infrastructure/controllers/gpu-operator/repository.yaml
@@ -0,0 +1,8 @@
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: HelmRepository
+metadata:
+  name: nvidia
+  namespace: gpu-operator
+spec:
+  interval: 15m
+  url: https://helm.ngc.nvidia.com/nvidia
diff --git a/infrastructure/controllers/gpu-operator/time-slicing-config.yaml b/infrastructure/controllers/gpu-operator/time-slicing-config.yaml
new file mode 100644
index 0000000..d9a4a39
--- /dev/null
+++ b/infrastructure/controllers/gpu-operator/time-slicing-config.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: time-slicing-config
+  namespace: gpu-operator
+data:
+  time-slicing: |
+    version: v1
+    sharing:
+      timeSlicing:
+        resources:
+          - name: nvidia.com/gpu
+            replicas: 5
diff --git a/infrastructure/namespaces/namespace-generic-cdi-plugin.yaml b/infrastructure/namespaces/namespace-generic-cdi-plugin.yaml
deleted file mode 100644
index dc4e556..0000000
--- a/infrastructure/namespaces/namespace-generic-cdi-plugin.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: generic-cdi-plugin
diff --git a/infrastructure/namespaces/namespace-gpu-operator.yaml b/infrastructure/namespaces/namespace-gpu-operator.yaml
new file mode 100644
index 0000000..c4104d2
--- /dev/null
+++ b/infrastructure/namespaces/namespace-gpu-operator.yaml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: gpu-operator
+  labels:
+    pod-security.kubernetes.io/enforce: privileged
+    pod-security.kubernetes.io/audit: privileged
+    pod-security.kubernetes.io/warn: privileged