mirror of https://github.com/michaelthomson0797/fleet-infra.git
synced 2026-02-04 04:59:54 +00:00
nvidia-device-plugin/helmrelease-nvidia-device-plugin.yaml (new file, 176 lines)
@@ -0,0 +1,176 @@
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: nvidia-device-plugin
  namespace: nvidia-device-plugin
spec:
  chart:
    spec:
      chart: nvidia-device-plugin
      version: 0.x
      sourceRef:
        kind: HelmRepository
        name: nvidia-device-plugin
        namespace: flux-system
  interval: 15m
  timeout: 5m
  releaseName: nvidia-device-plugin
  values:
    # Plugin configuration
    # Only one of "name" or "map" should ever be set for a given deployment.
    # Use "name" to point to an external ConfigMap with a list of configurations.
    # Use "map" to build an integrated ConfigMap from a set of configurations as
    # part of this helm chart. An example of setting "map" might be:
    # config:
    #   map:
    #     default: |-
    #       version: v1
    #       flags:
    #         migStrategy: none
    #     mig-single: |-
    #       version: v1
    #       flags:
    #         migStrategy: single
    #     mig-mixed: |-
    #       version: v1
    #       flags:
    #         migStrategy: mixed
    config:
      # ConfigMap name if pulling from an external ConfigMap
      name: ""
      # Set of named configs to build an integrated ConfigMap from
      map: {}
      # Default config name within the ConfigMap
      default: ""
      # List of fallback strategies to attempt if no config is selected and no default is provided
      fallbackStrategies: ["named", "single"]

    compatWithCPUManager: null
    migStrategy: null
    failOnInitError: null
    deviceListStrategy: null
    deviceIDStrategy: null
    nvidiaDriverRoot: null
    gdsEnabled: null
    mofedEnabled: null
    deviceDiscoveryStrategy: null

    nameOverride: ""
    fullnameOverride: ""
    namespaceOverride: ""
    selectorLabelsOverride: {}

    allowDefaultNamespace: false

    imagePullSecrets: []
    image:
      repository: nvcr.io/nvidia/k8s-device-plugin
      pullPolicy: IfNotPresent
      # Overrides the image tag whose default is the chart appVersion.
      tag: ""

    updateStrategy:
      type: RollingUpdate

    podAnnotations: {}
    podSecurityContext: {}
    securityContext: {}

    resources: {}
    nodeSelector: {}
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                # On discrete-GPU based systems NFD adds the following label where 10de is the NVIDIA PCI vendor ID
                - key: feature.node.kubernetes.io/pci-10de.present
                  operator: In
                  values:
                    - "true"
            - matchExpressions:
                # On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA
                - key: feature.node.kubernetes.io/cpu-model.vendor_id
                  operator: In
                  values:
                    - "NVIDIA"
            - matchExpressions:
                # We allow a GPU deployment to be forced by setting the following label to "true"
                - key: "nvidia.com/gpu.present"
                  operator: In
                  values:
                    - "true"
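    # For example, a node that NFD does not label can be opted in manually
    # (assumed operator workflow, not part of this chart):
    #   kubectl label node <node-name> nvidia.com/gpu.present=true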
    tolerations:
      # This toleration is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      - key: CriticalAddonsOnly
        operator: Exists
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule

    # Mark this pod as a critical add-on; when enabled, the critical add-on
    # scheduler reserves resources for critical add-on pods so that they can
    # be rescheduled after a failure.
    # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
    priorityClassName: "system-node-critical"

    runtimeClassName: nvidia
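    # Note: this assumes the cluster already defines a RuntimeClass named "nvidia"
    # backed by the NVIDIA container runtime; a minimal sketch of such an object
    # (assumed, not created by this release) would be:
    #   apiVersion: node.k8s.io/v1
    #   kind: RuntimeClass
    #   metadata:
    #     name: nvidia
    #   handler: nvidia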

    devicePlugin:
      enabled: true

    gfd:
      enabled: true
      nameOverride: gpu-feature-discovery
      namespaceOverride: ""
      noTimestamp: null
      sleepInterval: null
      securityContext:
        # privileged access is required for the gpu-feature-discovery to access the
        # vgpu info on a host.
        # TODO: This should be optional and detected automatically.
        privileged: true

    # Helm dependency
    nfd:
      nameOverride: node-feature-discovery
      enableNodeFeatureApi: false
      master:
        serviceAccount:
          name: node-feature-discovery
          create: true
        config:
          extraLabelNs: ["nvidia.com"]

      worker:
        tolerations:
          - key: "node-role.kubernetes.io/master"
            operator: "Equal"
            value: ""
            effect: "NoSchedule"
          - key: "nvidia.com/gpu"
            operator: "Equal"
            value: "present"
            effect: "NoSchedule"
        config:
          sources:
            pci:
              deviceClassWhitelist:
                - "02"
                - "03"
              deviceLabelFields:
                - vendor

    mps:
      # root specifies the location where files and folders for managing MPS will
      # be created. This includes a daemon-specific /dev/shm and pipe and log
      # directories.
      # Pipe directories will be created at {{ mps.root }}/{{ .ResourceName }}
      root: "/run/nvidia/mps"

    cdi:
      # nvidiaHookPath specifies the path to the nvidia-cdi-hook or nvidia-ctk executables on the host.
      # This is required to ensure that the generated CDI specification refers to the correct CDI hooks.
      nvidiaHookPath: null
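The sourceRef above points at a HelmRepository named nvidia-device-plugin in the flux-system namespace, which is not part of this file. A minimal sketch of such a source, assuming the upstream NVIDIA chart repository URL and a Flux version matching the v2beta2 HelmRelease API, might look like:

apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: nvidia-device-plugin
  namespace: flux-system
spec:
  interval: 1h
  url: https://nvidia.github.io/k8s-device-plugin

The HelmRelease itself lives in the nvidia-device-plugin namespace, so that namespace is expected to exist (or be defined elsewhere in the repository) before Flux reconciles this object.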