Mirror of https://github.com/michaelthomson0797/fleet-infra.git, synced 2026-02-04 13:09:53 +00:00
update: nvidia device plugin -> cdi plugin
infrastructure/controllers/generic-cdi-plugin/daemonset.yaml (new file, +47)
@@ -0,0 +1,47 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: generic-cdi-plugin-daemonset
  namespace: generic-cdi-plugin
spec:
  selector:
    matchLabels:
      name: generic-cdi-plugin
  template:
    metadata:
      labels:
        name: generic-cdi-plugin
        app.kubernetes.io/component: generic-cdi-plugin
        app.kubernetes.io/name: generic-cdi-plugin
    spec:
      containers:
        - image: ghcr.io/olfillasodikno/generic-cdi-plugin:main
          name: generic-cdi-plugin
          command:
            - /generic-cdi-plugin
            - /var/run/cdi/nvidia-container-toolkit.json
          imagePullPolicy: Always
          securityContext:
            privileged: true
          tty: true
          volumeMounts:
            - name: kubelet
              mountPath: /var/lib/kubelet
            - name: nvidia-container-toolkit
              mountPath: /var/run/cdi/nvidia-container-toolkit.json
      volumes:
        - name: kubelet
          hostPath:
            path: /var/lib/kubelet
        - name: nvidia-container-toolkit
          hostPath:
            path: /var/run/cdi/nvidia-container-toolkit.json
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: "nixos-nvidia-cdi"
                    operator: In
                    values:
                      - "enabled"
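Note on the mounted CDI spec (context, not part of the commit): /var/run/cdi/nvidia-container-toolkit.json is a Container Device Interface specification that the host must already provide, and the nodeAffinity above gates scheduling to nodes labeled nixos-nvidia-cdi=enabled (e.g. kubectl label node <name> nixos-nvidia-cdi=enabled). A minimal sketch of what such a spec can contain, written as YAML for readability (the actual host file is JSON, typically produced by nvidia-ctk cdi generate; device names and paths are illustrative):

# Hypothetical CDI spec sketch -- not the generated file referenced by this commit.
cdiVersion: "0.6.0"
kind: nvidia.com/gpu
devices:
  - name: "0"                   # addressable as nvidia.com/gpu=0
    containerEdits:
      deviceNodes:
        - path: /dev/nvidia0
containerEdits:                 # edits applied for every device
  deviceNodes:
    - path: /dev/nvidiactl
    - path: /dev/nvidia-uvm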
@@ -1,4 +1,4 @@
 apiVersion: v1
 kind: Namespace
 metadata:
-  name: nvidia-device-plugin
+  name: generic-cdi-plugin
@@ -1,16 +0,0 @@
apiVersion: v1
data:
  config: |
    version: v1
    sharing:
      timeSlicing:
        renameByDefault: false
        failRequestsGreaterThanOne: true
        resources:
          - name: nvidia.com/gpu
            replicas: 10
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: nvidia-plugin-configs
  namespace: nvidia-device-plugin
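Context for the deletion above: this ConfigMap enabled GPU time slicing, advertising each physical GPU as 10 nvidia.com/gpu replicas and rejecting any request greater than one. Workloads consumed a slice with an ordinary resource request; a hedged sketch (pod name and image tag are illustrative):

# Sketch of a pod that consumed one time-sliced GPU under the removed config;
# up to 10 such pods could share a single physical GPU.
apiVersion: v1
kind: Pod
metadata:
  name: cuda-smoke-test
spec:
  runtimeClassName: nvidia
  containers:
    - name: cuda
      image: nvcr.io/nvidia/cuda:12.3.1-base-ubuntu22.04
      resources:
        limits:
          nvidia.com/gpu: 1     # one slice of a physical GPU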
@@ -1,170 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: nvidia-device-plugin
  namespace: nvidia-device-plugin
spec:
  chart:
    spec:
      chart: nvidia-device-plugin
      version: 0.x
      sourceRef:
        kind: HelmRepository
        name: nvidia-device-plugin
  interval: 15m
  releaseName: nvidia-device-plugin
  values:
    # Plugin configuration
    # Only one of "name" or "map" should ever be set for a given deployment.
    # Use "name" to point to an external ConfigMap with a list of configurations.
    # Use "map" to build an integrated ConfigMap from a set of configurations as
    # part of this helm chart. An example of setting "map" might be:
    # config:
    #   map:
    #     default: |-
    #       version: v1
    #       flags:
    #         migStrategy: none
    #     mig-single: |-
    #       version: v1
    #       flags:
    #         migStrategy: single
    #     mig-mixed: |-
    #       version: v1
    #       flags:
    #         migStrategy: mixed
    config:
      # ConfigMap name if pulling from an external ConfigMap
      name: "nvidia-plugin-configs"
      # List of fallback strategies to attempt if no config is selected and no default is provided
      fallbackStrategies: ["named", "single"]

    compatWithCPUManager: null
    migStrategy: null
    failOnInitError: null
    deviceListStrategy: null
    deviceIDStrategy: null
    nvidiaDriverRoot: null
    gdsEnabled: null
    mofedEnabled: null
    deviceDiscoveryStrategy: null

    nameOverride: ""
    fullnameOverride: ""
    namespaceOverride: ""
    selectorLabelsOverride: {}

    allowDefaultNamespace: false

    imagePullSecrets: []
    image:
      repository: nvcr.io/nvidia/k8s-device-plugin
      pullPolicy: IfNotPresent
      # Overrides the image tag whose default is the chart appVersion.
      tag: ""

    updateStrategy:
      type: RollingUpdate

    podAnnotations: {}
    podSecurityContext: {}
    securityContext: {}

    resources: {}
    nodeSelector: {}
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                # On discrete-GPU based systems NFD adds the following label where 10de is the NVIDIA PCI vendor ID
                - key: feature.node.kubernetes.io/pci-10de.present
                  operator: In
                  values:
                    - "true"
            - matchExpressions:
                # On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA
                - key: feature.node.kubernetes.io/cpu-model.vendor_id
                  operator: In
                  values:
                    - "NVIDIA"
            - matchExpressions:
                # We allow a GPU deployment to be forced by setting the following label to "true"
                - key: "nvidia.com/gpu.present"
                  operator: In
                  values:
                    - "true"
    tolerations:
      # This toleration is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      - key: CriticalAddonsOnly
        operator: Exists
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule

    # Mark this pod as a critical add-on; when enabled, the critical add-on
    # scheduler reserves resources for critical add-on pods so that they can
    # be rescheduled after a failure.
    # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
    priorityClassName: "system-node-critical"

    runtimeClassName: nvidia

    devicePlugin:
      enabled: true

    gfd:
      enabled: true
      nameOverride: gpu-feature-discovery
      namespaceOverride: ""
      noTimestamp: null
      sleepInterval: null
      securityContext:
        # privileged access is required for the gpu-feature-discovery to access the
        # vgpu info on a host.
        # TODO: This should be optional and detected automatically.
        privileged: true

    # Helm dependency
    nfd:
      nameOverride: node-feature-discovery
      enableNodeFeatureApi: false
      master:
        serviceAccount:
          name: node-feature-discovery
          create: true
        config:
          extraLabelNs: ["nvidia.com"]

      worker:
        tolerations:
          - key: "node-role.kubernetes.io/master"
            operator: "Equal"
            value: ""
            effect: "NoSchedule"
          - key: "nvidia.com/gpu"
            operator: "Equal"
            value: "present"
            effect: "NoSchedule"
        config:
          sources:
            pci:
              deviceClassWhitelist:
                - "02"
                - "03"
              deviceLabelFields:
                - vendor

    mps:
      # root specifies the location where files and folders for managing MPS will
      # be created. This includes a daemon-specific /dev/shm and pipe and log
      # directories.
      # Pipe directories will be created at {{ mps.root }}/{{ .ResourceName }}
      root: "/run/nvidia/mps"


    cdi:
      # nvidiaHookPath specifies the path to the nvidia-cdi-hook or nvidia-ctk executables on the host.
      # This is required to ensure that the generated CDI specification refers to the correct CDI hooks.
      nvidiaHookPath: null
@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: nvidia-device-plugin
  namespace: nvidia-device-plugin
spec:
  interval: 15m
  url: https://nvidia.github.io/k8s-device-plugin
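Since this is a Flux-managed repo, the deleted nvidia-device-plugin objects are only garbage-collected from the cluster if the Kustomization reconciling this path has pruning enabled; a hedged sketch (name and path are assumptions, not taken from this repo):

apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: infra-controllers              # assumed name
  namespace: flux-system
spec:
  interval: 10m
  prune: true                          # required so removed manifests are deleted from the cluster
  sourceRef:
    kind: GitRepository
    name: flux-system
  path: ./infrastructure/controllers   # assumed path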