Mirror of https://github.com/michaelthomson0797/fleet-infra.git, synced 2026-02-04 13:09:53 +00:00
update: nvidia device plugin -> cdi plugin
infrastructure/controllers/generic-cdi-plugin/daemonset.yaml (new file, +47 lines)
@@ -0,0 +1,47 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: generic-cdi-plugin-daemonset
  namespace: generic-cdi-plugin
spec:
  selector:
    matchLabels:
      name: generic-cdi-plugin
  template:
    metadata:
      labels:
        name: generic-cdi-plugin
        app.kubernetes.io/component: generic-cdi-plugin
        app.kubernetes.io/name: generic-cdi-plugin
    spec:
      containers:
        - image: ghcr.io/olfillasodikno/generic-cdi-plugin:main
          name: generic-cdi-plugin
          command:
            - /generic-cdi-plugin
            - /var/run/cdi/nvidia-container-toolkit.json
          imagePullPolicy: Always
          securityContext:
            privileged: true
          tty: true
          volumeMounts:
            - name: kubelet
              mountPath: /var/lib/kubelet
            - name: nvidia-container-toolkit
              mountPath: /var/run/cdi/nvidia-container-toolkit.json
      volumes:
        - name: kubelet
          hostPath:
            path: /var/lib/kubelet
        - name: nvidia-container-toolkit
          hostPath:
            path: /var/run/cdi/nvidia-container-toolkit.json
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: "nixos-nvidia-cdi"
                    operator: In
                    values:
                      - "enabled"
@@ -1,4 +1,4 @@
 apiVersion: v1
 kind: Namespace
 metadata:
-  name: nvidia-device-plugin
+  name: generic-cdi-plugin
@@ -1,16 +0,0 @@
apiVersion: v1
data:
  config: |
    version: v1
    sharing:
      timeSlicing:
        renameByDefault: false
        failRequestsGreaterThanOne: true
        resources:
          - name: nvidia.com/gpu
            replicas: 10
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: nvidia-plugin-configs
  namespace: nvidia-device-plugin
@@ -1,170 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: nvidia-device-plugin
  namespace: nvidia-device-plugin
spec:
  chart:
    spec:
      chart: nvidia-device-plugin
      version: 0.x
      sourceRef:
        kind: HelmRepository
        name: nvidia-device-plugin
  interval: 15m
  releaseName: nvidia-device-plugin
  values:
    # Plugin configuration
    # Only one of "name" or "map" should ever be set for a given deployment.
    # Use "name" to point to an external ConfigMap with a list of configurations.
    # Use "map" to build an integrated ConfigMap from a set of configurations as
    # part of this helm chart. An example of setting "map" might be:
    # config:
    #   map:
    #     default: |-
    #       version: v1
    #       flags:
    #         migStrategy: none
    #     mig-single: |-
    #       version: v1
    #       flags:
    #         migStrategy: single
    #     mig-mixed: |-
    #       version: v1
    #       flags:
    #         migStrategy: mixed
    config:
      # ConfigMap name if pulling from an external ConfigMap
      name: "nvidia-plugin-configs"
      # List of fallback strategies to attempt if no config is selected and no default is provided
      fallbackStrategies: ["named", "single"]

    compatWithCPUManager: null
    migStrategy: null
    failOnInitError: null
    deviceListStrategy: null
    deviceIDStrategy: null
    nvidiaDriverRoot: null
    gdsEnabled: null
    mofedEnabled: null
    deviceDiscoveryStrategy: null

    nameOverride: ""
    fullnameOverride: ""
    namespaceOverride: ""
    selectorLabelsOverride: {}

    allowDefaultNamespace: false

    imagePullSecrets: []
    image:
      repository: nvcr.io/nvidia/k8s-device-plugin
      pullPolicy: IfNotPresent
      # Overrides the image tag whose default is the chart appVersion.
      tag: ""

    updateStrategy:
      type: RollingUpdate

    podAnnotations: {}
    podSecurityContext: {}
    securityContext: {}

    resources: {}
    nodeSelector: {}
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                # On discrete-GPU based systems NFD adds the following label where 10de is the NVIDIA PCI vendor ID
                - key: feature.node.kubernetes.io/pci-10de.present
                  operator: In
                  values:
                    - "true"
            - matchExpressions:
                # On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA
                - key: feature.node.kubernetes.io/cpu-model.vendor_id
                  operator: In
                  values:
                    - "NVIDIA"
            - matchExpressions:
                # We allow a GPU deployment to be forced by setting the following label to "true"
                - key: "nvidia.com/gpu.present"
                  operator: In
                  values:
                    - "true"
    tolerations:
      # This toleration is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      - key: CriticalAddonsOnly
        operator: Exists
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule

    # Mark this pod as a critical add-on; when enabled, the critical add-on
    # scheduler reserves resources for critical add-on pods so that they can
    # be rescheduled after a failure.
    # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
    priorityClassName: "system-node-critical"

    runtimeClassName: nvidia

    devicePlugin:
      enabled: true

    gfd:
      enabled: true
      nameOverride: gpu-feature-discovery
      namespaceOverride: ""
      noTimestamp: null
      sleepInterval: null
      securityContext:
        # privileged access is required for the gpu-feature-discovery to access the
        # vgpu info on a host.
        # TODO: This should be optional and detected automatically.
        privileged: true

    # Helm dependency
    nfd:
      nameOverride: node-feature-discovery
      enableNodeFeatureApi: false
      master:
        serviceAccount:
          name: node-feature-discovery
          create: true
        config:
          extraLabelNs: ["nvidia.com"]

      worker:
        tolerations:
          - key: "node-role.kubernetes.io/master"
            operator: "Equal"
            value: ""
            effect: "NoSchedule"
          - key: "nvidia.com/gpu"
            operator: "Equal"
            value: "present"
            effect: "NoSchedule"
        config:
          sources:
            pci:
              deviceClassWhitelist:
                - "02"
                - "03"
              deviceLabelFields:
                - vendor

    mps:
      # root specifies the location where files and folders for managing MPS will
      # be created. This includes a daemon-specific /dev/shm and pipe and log
      # directories.
      # Pipe directories will be created at {{ mps.root }}/{{ .ResourceName }}
      root: "/run/nvidia/mps"


    cdi:
      # nvidiaHookPath specifies the path to the nvidia-cdi-hook or nvidia-ctk executables on the host.
      # This is required to ensure that the generated CDI specification refers to the correct CDI hooks.
      nvidiaHookPath: null
@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: nvidia-device-plugin
  namespace: nvidia-device-plugin
spec:
  interval: 15m
  url: https://nvidia.github.io/k8s-device-plugin