fleet-infra/infrastructure/controllers/nvidia-device-plugin/release.yaml

apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: nvidia-device-plugin
  namespace: nvidia-device-plugin
spec:
  chart:
    spec:
      chart: nvidia-device-plugin
      version: 0.x
      sourceRef:
        kind: HelmRepository
        name: nvidia-device-plugin
  interval: 15m
  releaseName: nvidia-device-plugin
  values:
    # Plugin configuration
    # Only one of "name" or "map" should ever be set for a given deployment.
    # Use "name" to point to an external ConfigMap with a list of configurations.
    # Use "map" to build an integrated ConfigMap from a set of configurations as
    # part of this helm chart. An example of setting "map" might be:
    # config:
    #   map:
    #     default: |-
    #       version: v1
    #       flags:
    #         migStrategy: none
    #     mig-single: |-
    #       version: v1
    #       flags:
    #         migStrategy: single
    #     mig-mixed: |-
    #       version: v1
    #       flags:
    #         migStrategy: mixed
    config:
      # ConfigMap name if pulling from an external ConfigMap
      name: "nvidia-plugin-configs"
      # List of fallback strategies to attempt if no config is selected and no default is provided
      fallbackStrategies: ["named", "single"]
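      # For reference, the external ConfigMap named above is expected to live in the
      # release namespace and carry one configuration per data key, in the same
      # format as the "map" example in the comments above. A rough sketch only; the
      # data key and its contents here are illustrative assumptions, not part of
      # this release:
      #
      #   apiVersion: v1
      #   kind: ConfigMap
      #   metadata:
      #     name: nvidia-plugin-configs
      #     namespace: nvidia-device-plugin
      #   data:
      #     default: |-
      #       version: v1
      #       flags:
      #         migStrategy: none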
    compatWithCPUManager: null
    migStrategy: null
    failOnInitError: null
    deviceListStrategy: null
    deviceIDStrategy: null
    nvidiaDriverRoot: null
    gdsEnabled: null
    mofedEnabled: null
    deviceDiscoveryStrategy: null
    nameOverride: ""
    fullnameOverride: ""
    namespaceOverride: ""
    selectorLabelsOverride: {}
    allowDefaultNamespace: false
    imagePullSecrets: []
    image:
      repository: nvcr.io/nvidia/k8s-device-plugin
      pullPolicy: IfNotPresent
      # Overrides the image tag whose default is the chart appVersion.
      tag: ""
    updateStrategy:
      type: RollingUpdate
    podAnnotations: {}
    podSecurityContext: {}
    securityContext: {}
    resources: {}
    nodeSelector: {}
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                # On discrete-GPU based systems NFD adds the following label where 10de is the NVIDIA PCI vendor ID
                - key: feature.node.kubernetes.io/pci-10de.present
                  operator: In
                  values:
                    - "true"
            - matchExpressions:
                # On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA
                - key: feature.node.kubernetes.io/cpu-model.vendor_id
                  operator: In
                  values:
                    - "NVIDIA"
            - matchExpressions:
                # We allow a GPU deployment to be forced by setting the following label to "true"
                - key: "nvidia.com/gpu.present"
                  operator: In
                  values:
                    - "true"
    tolerations:
      # This toleration is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      - key: CriticalAddonsOnly
        operator: Exists
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
    # Mark this pod as a critical add-on; when enabled, the critical add-on
    # scheduler reserves resources for critical add-on pods so that they can
    # be rescheduled after a failure.
    # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
    priorityClassName: "system-node-critical"
    runtimeClassName: nvidia
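    # runtimeClassName assumes a RuntimeClass called "nvidia" already exists in the
    # cluster (normally created as part of the NVIDIA container runtime / containerd
    # setup on the GPU nodes). A minimal sketch of such an object, assuming the
    # containerd handler is also named "nvidia":
    #
    #   apiVersion: node.k8s.io/v1
    #   kind: RuntimeClass
    #   metadata:
    #     name: nvidia
    #   handler: nvidia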
    devicePlugin:
      enabled: true
    gfd:
      enabled: true
      nameOverride: gpu-feature-discovery
      namespaceOverride: ""
      noTimestamp: null
      sleepInterval: null
      securityContext:
        # privileged access is required for the gpu-feature-discovery to access the
        # vgpu info on a host.
        # TODO: This should be optional and detected automatically.
        privileged: true
    # Helm dependency
    nfd:
      nameOverride: node-feature-discovery
      enableNodeFeatureApi: false
      master:
        serviceAccount:
          name: node-feature-discovery
          create: true
        config:
          extraLabelNs: ["nvidia.com"]
      worker:
        tolerations:
          - key: "node-role.kubernetes.io/master"
            operator: "Equal"
            value: ""
            effect: "NoSchedule"
          - key: "nvidia.com/gpu"
            operator: "Equal"
            value: "present"
            effect: "NoSchedule"
        config:
          sources:
            pci:
              deviceClassWhitelist:
                - "02"
                - "03"
              deviceLabelFields:
                - vendor
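              # The class codes in deviceClassWhitelist above are PCI base classes:
              # 03 = display controllers (GPUs), 02 = network controllers
              # (presumably included so NVIDIA networking devices are labelled too).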
    mps:
      # root specifies the location where files and folders for managing MPS will
      # be created. This includes a daemon-specific /dev/shm and pipe and log
      # directories.
      # Pipe directories will be created at {{ mps.root }}/{{ .ResourceName }}
      root: "/run/nvidia/mps"
    cdi:
      # nvidiaHookPath specifies the path to the nvidia-cdi-hook or nvidia-ctk executables on the host.
      # This is required to ensure that the generated CDI specification refers to the correct CDI hooks.
      nvidiaHookPath: null
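      # Left at null here. If the CDI hooks live somewhere non-standard on the host,
      # this can point at the toolkit binary explicitly, e.g.
      # (path is an assumption; it depends on how the NVIDIA container toolkit was installed):
      # nvidiaHookPath: /usr/bin/nvidia-ctk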