diff --git a/infrastructure/controllers/generic-cdi-plugin/daemonset.yaml b/infrastructure/controllers/generic-cdi-plugin/daemonset.yaml
new file mode 100644
index 0000000..5c5994f
--- /dev/null
+++ b/infrastructure/controllers/generic-cdi-plugin/daemonset.yaml
@@ -0,0 +1,47 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: generic-cdi-plugin-daemonset
+  namespace: generic-cdi-plugin
+spec:
+  selector:
+    matchLabels:
+      name: generic-cdi-plugin
+  template:
+    metadata:
+      labels:
+        name: generic-cdi-plugin
+        app.kubernetes.io/component: generic-cdi-plugin
+        app.kubernetes.io/name: generic-cdi-plugin
+    spec:
+      containers:
+      - image: ghcr.io/olfillasodikno/generic-cdi-plugin:main
+        name: generic-cdi-plugin
+        command:
+        - /generic-cdi-plugin
+        - /var/run/cdi/nvidia-container-toolkit.json
+        imagePullPolicy: Always
+        securityContext:
+          privileged: true
+        tty: true
+        volumeMounts:
+        - name: kubelet
+          mountPath: /var/lib/kubelet
+        - name: nvidia-container-toolkit
+          mountPath: /var/run/cdi/nvidia-container-toolkit.json
+      volumes:
+      - name: kubelet
+        hostPath:
+          path: /var/lib/kubelet
+      - name: nvidia-container-toolkit
+        hostPath:
+          path: /var/run/cdi/nvidia-container-toolkit.json
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: "nixos-nvidia-cdi"
+                operator: In
+                values:
+                - "enabled"
diff --git a/infrastructure/controllers/nvidia-device-plugin/namespace.yaml b/infrastructure/controllers/generic-cdi-plugin/namespace.yaml
similarity index 58%
rename from infrastructure/controllers/nvidia-device-plugin/namespace.yaml
rename to infrastructure/controllers/generic-cdi-plugin/namespace.yaml
index 4a6b3ab..dc4e556 100644
--- a/infrastructure/controllers/nvidia-device-plugin/namespace.yaml
+++ b/infrastructure/controllers/generic-cdi-plugin/namespace.yaml
@@ -1,4 +1,4 @@
 apiVersion: v1
 kind: Namespace
 metadata:
-  name: nvidia-device-plugin
+  name: generic-cdi-plugin
diff --git a/infrastructure/controllers/nvidia-device-plugin/nvidia-plugin-configs.yaml b/infrastructure/controllers/nvidia-device-plugin/nvidia-plugin-configs.yaml
deleted file mode 100644
index dc342b1..0000000
--- a/infrastructure/controllers/nvidia-device-plugin/nvidia-plugin-configs.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-apiVersion: v1
-data:
-  config: |
-    version: v1
-    sharing:
-      timeSlicing:
-        renameByDefault: false
-        failRequestsGreaterThanOne: true
-        resources:
-        - name: nvidia.com/gpu
-          replicas: 10
-kind: ConfigMap
-metadata:
-  creationTimestamp: null
-  name: nvidia-plugin-configs
-  namespace: nvidia-device-plugin
diff --git a/infrastructure/controllers/nvidia-device-plugin/release.yaml b/infrastructure/controllers/nvidia-device-plugin/release.yaml
deleted file mode 100644
index 9a335d3..0000000
--- a/infrastructure/controllers/nvidia-device-plugin/release.yaml
+++ /dev/null
@@ -1,170 +0,0 @@
-apiVersion: helm.toolkit.fluxcd.io/v2beta2
-kind: HelmRelease
-metadata:
-  name: nvidia-device-plugin
-  namespace: nvidia-device-plugin
-spec:
-  chart:
-    spec:
-      chart: nvidia-device-plugin
-      version: 0.x
-      sourceRef:
-        kind: HelmRepository
-        name: nvidia-device-plugin
-  interval: 15m
-  releaseName: nvidia-device-plugin
-  values:
-    # Plugin configuration
-    # Only one of "name" or "map" should ever be set for a given deployment.
-    # Use "name" to point to an external ConfigMap with a list of configurations.
-    # Use "map" to build an integrated ConfigMap from a set of configurations as
-    # part of this helm chart. An example of setting "map" might be:
-    # config:
-    #   map:
-    #     default: |-
-    #       version: v1
-    #       flags:
-    #         migStrategy: none
-    #     mig-single: |-
-    #       version: v1
-    #       flags:
-    #         migStrategy: single
-    #     mig-mixed: |-
-    #       version: v1
-    #       flags:
-    #         migStrategy: mixed
-    config:
-      # ConfigMap name if pulling from an external ConfigMap
-      name: "nvidia-plugin-configs"
-      # List of fallback strategies to attempt if no config is selected and no default is provided
-      fallbackStrategies: ["named" , "single"]
-
-    compatWithCPUManager: null
-    migStrategy: null
-    failOnInitError: null
-    deviceListStrategy: null
-    deviceIDStrategy: null
-    nvidiaDriverRoot: null
-    gdsEnabled: null
-    mofedEnabled: null
-    deviceDiscoveryStrategy: null
-
-    nameOverride: ""
-    fullnameOverride: ""
-    namespaceOverride: ""
-    selectorLabelsOverride: {}
-
-    allowDefaultNamespace: false
-
-    imagePullSecrets: []
-    image:
-      repository: nvcr.io/nvidia/k8s-device-plugin
-      pullPolicy: IfNotPresent
-      # Overrides the image tag whose default is the chart appVersion.
-      tag: ""
-
-    updateStrategy:
-      type: RollingUpdate
-
-    podAnnotations: {}
-    podSecurityContext: {}
-    securityContext: {}
-
-    resources: {}
-    nodeSelector: {}
-    affinity:
-      nodeAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          nodeSelectorTerms:
-          - matchExpressions:
-            # On discrete-GPU based systems NFD adds the following label where 10de is the NVIDIA PCI vendor ID
-            - key: feature.node.kubernetes.io/pci-10de.present
-              operator: In
-              values:
-              - "true"
-          - matchExpressions:
-            # On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA
-            - key: feature.node.kubernetes.io/cpu-model.vendor_id
-              operator: In
-              values:
-              - "NVIDIA"
-          - matchExpressions:
-            # We allow a GPU deployment to be forced by setting the following label to "true"
-            - key: "nvidia.com/gpu.present"
-              operator: In
-              values:
-              - "true"
-    tolerations:
-      # This toleration is deprecated. Kept here for backward compatibility
-      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
-      - key: CriticalAddonsOnly
-        operator: Exists
-      - key: nvidia.com/gpu
-        operator: Exists
-        effect: NoSchedule
-
-    # Mark this pod as a critical add-on; when enabled, the critical add-on
-    # scheduler reserves resources for critical add-on pods so that they can
-    # be rescheduled after a failure.
-    # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
-    priorityClassName: "system-node-critical"
-
-    runtimeClassName: nvidia
-
-    devicePlugin:
-      enabled: true
-
-    gfd:
-      enabled: true
-      nameOverride: gpu-feature-discovery
-      namespaceOverride: ""
-      noTimestamp: null
-      sleepInterval: null
-      securityContext:
-        # privileged access is required for the gpu-feature-discovery to access the
-        # vgpu info on a host.
-        # TODO: This should be optional and detected automatically.
-        privileged: true
-
-    # Helm dependency
-    nfd:
-      nameOverride: node-feature-discovery
-      enableNodeFeatureApi: false
-      master:
-        serviceAccount:
-          name: node-feature-discovery
-          create: true
-        config:
-          extraLabelNs: ["nvidia.com"]
-
-      worker:
-        tolerations:
-        - key: "node-role.kubernetes.io/master"
-          operator: "Equal"
-          value: ""
-          effect: "NoSchedule"
-        - key: "nvidia.com/gpu"
-          operator: "Equal"
-          value: "present"
-          effect: "NoSchedule"
-        config:
-          sources:
-            pci:
-              deviceClassWhitelist:
-              - "02"
-              - "03"
-              deviceLabelFields:
-              - vendor
-
-    mps:
-      # root specifies the location where files and folders for managing MPS will
-      # be created. This includes a daemon-specific /dev/shm and pipe and log
-      # directories.
-      # Pipe directories will be created at {{ mps.root }}/{{ .ResourceName }}
-      root: "/run/nvidia/mps"
-
-
-    cdi:
-      # nvidiaHookPath specifies the path to the nvidia-cdi-hook or nvidia-ctk executables on the host.
-      # This is required to ensure that the generated CDI specification refers to the correct CDI hooks.
-      nvidiaHookPath: null
diff --git a/infrastructure/controllers/nvidia-device-plugin/repository.yaml b/infrastructure/controllers/nvidia-device-plugin/repository.yaml
deleted file mode 100644
index 65b043e..0000000
--- a/infrastructure/controllers/nvidia-device-plugin/repository.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-apiVersion: source.toolkit.fluxcd.io/v1
-kind: HelmRepository
-metadata:
-  name: nvidia-device-plugin
-  namespace: nvidia-device-plugin
-spec:
-  interval: 15m
-  url: https://nvidia.github.io/k8s-device-plugin