Mirror of https://github.com/michaelthomson0797/fleet-infra.git, synced 2026-02-04 13:09:53 +00:00
@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: nvidia-device-plugin
  namespace: flux-system
spec:
  interval: 15m
  url: https://nvidia.github.io/k8s-device-plugin
@@ -1,18 +0,0 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: nvidia-device-plugin
  namespace: flux-system
spec:
  interval: 15m
  path: ./nvidia-device-plugin
  prune: true # remove any elements later removed from the above path
  timeout: 2m # if not set, this defaults to the interval duration
  sourceRef:
    kind: GitRepository
    name: flux-system
  healthChecks:
    - apiVersion: helm.toolkit.fluxcd.io/v2beta2
      kind: HelmRelease
      name: nvidia-device-plugin
      namespace: nvidia-device-plugin
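# Illustrative sketch only: the path ./nvidia-device-plugin above is expected to hold
# the manifests shown on this page (Namespace, HelmRepository, HelmRelease). The file
# names below are assumptions; if the directory carries no kustomization.yaml, Flux
# generates one that includes every manifest it finds, roughly equivalent to:
#   apiVersion: kustomize.config.k8s.io/v1beta1
#   kind: Kustomization
#   resources:
#     - namespace.yaml
#     - helm-repository.yaml
#     - helm-release.yaml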
@@ -1,4 +0,0 @@
apiVersion: v1
kind: Namespace
metadata:
  name: nvidia-device-plugin
@@ -1,176 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: nvidia-device-plugin
  namespace: nvidia-device-plugin
spec:
  chart:
    spec:
      chart: nvidia-device-plugin
      version: 0.x
      sourceRef:
        kind: HelmRepository
        name: nvidia-device-plugin
        namespace: flux-system
  interval: 15m
  timeout: 5m
  releaseName: nvidia-device-plugin
  values:
    # Plugin configuration
    # Only one of "name" or "map" should ever be set for a given deployment.
    # Use "name" to point to an external ConfigMap with a list of configurations.
    # Use "map" to build an integrated ConfigMap from a set of configurations as
    # part of this helm chart. An example of setting "map" might be:
    # config:
    #   map:
    #     default: |-
    #       version: v1
    #       flags:
    #         migStrategy: none
    #     mig-single: |-
    #       version: v1
    #       flags:
    #         migStrategy: single
    #     mig-mixed: |-
    #       version: v1
    #       flags:
    #         migStrategy: mixed
    config:
      # ConfigMap name if pulling from an external ConfigMap
      name: ""
      # Set of named configs to build an integrated ConfigMap from
      map: {}
      # Default config name within the ConfigMap
      default: ""
      # List of fallback strategies to attempt if no config is selected and no default is provided
      fallbackStrategies: ["named", "single"]
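      # Illustrative sketch only: if "name" were used instead of "map", it would point
      # at an external ConfigMap in the plugin's namespace whose data keys are config
      # names. The ConfigMap name below is an assumption, not a value used by this
      # release:
      #   apiVersion: v1
      #   kind: ConfigMap
      #   metadata:
      #     name: nvidia-plugin-configs
      #     namespace: nvidia-device-plugin
      #   data:
      #     default: |-
      #       version: v1
      #       flags:
      #         migStrategy: none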
    compatWithCPUManager: null
    migStrategy: null
    failOnInitError: null
    deviceListStrategy: null
    deviceIDStrategy: null
    nvidiaDriverRoot: null
    gdsEnabled: null
    mofedEnabled: null
    deviceDiscoveryStrategy: null

    nameOverride: ""
    fullnameOverride: ""
    namespaceOverride: ""
    selectorLabelsOverride: {}

    allowDefaultNamespace: false

    imagePullSecrets: []
    image:
      repository: nvcr.io/nvidia/k8s-device-plugin
      pullPolicy: IfNotPresent
      # Overrides the image tag whose default is the chart appVersion.
      tag: ""

    updateStrategy:
      type: RollingUpdate

    podAnnotations: {}
    podSecurityContext: {}
    securityContext: {}

    resources: {}
    nodeSelector: {}
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                # On discrete-GPU based systems NFD adds the following label, where 10de is the NVIDIA PCI vendor ID
                - key: feature.node.kubernetes.io/pci-10de.present
                  operator: In
                  values:
                    - "true"
            - matchExpressions:
                # On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA
                - key: feature.node.kubernetes.io/cpu-model.vendor_id
                  operator: In
                  values:
                    - "NVIDIA"
            - matchExpressions:
                # We allow a GPU deployment to be forced by setting the following label to "true"
                - key: "nvidia.com/gpu.present"
                  operator: In
                  values:
                    - "true"
    tolerations:
      # This toleration is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      - key: CriticalAddonsOnly
        operator: Exists
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule

    # Mark this pod as a critical add-on; when enabled, the critical add-on
    # scheduler reserves resources for critical add-on pods so that they can
    # be rescheduled after a failure.
    # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
    priorityClassName: "system-node-critical"

    runtimeClassName: nvidia

    devicePlugin:
      enabled: true

    gfd:
      enabled: true
      nameOverride: gpu-feature-discovery
      namespaceOverride: ""
      noTimestamp: null
      sleepInterval: null
      securityContext:
        # privileged access is required for the gpu-feature-discovery to access the
        # vgpu info on a host.
        # TODO: This should be optional and detected automatically.
        privileged: true

    # Helm dependency
    nfd:
      nameOverride: node-feature-discovery
      enableNodeFeatureApi: false
      master:
        serviceAccount:
          name: node-feature-discovery
          create: true
        config:
          extraLabelNs: ["nvidia.com"]

      worker:
        tolerations:
          - key: "node-role.kubernetes.io/master"
            operator: "Equal"
            value: ""
            effect: "NoSchedule"
          - key: "nvidia.com/gpu"
            operator: "Equal"
            value: "present"
            effect: "NoSchedule"
        config:
          sources:
            pci:
              deviceClassWhitelist:
                - "02"
                - "03"
              deviceLabelFields:
                - vendor

    mps:
      # root specifies the location where files and folders for managing MPS will
      # be created. This includes a daemon-specific /dev/shm and pipe and log
      # directories.
      # Pipe directories will be created at {{ mps.root }}/{{ .ResourceName }}
      root: "/run/nvidia/mps"
    cdi:
      # nvidiaHookPath specifies the path to the nvidia-cdi-hook or nvidia-ctk executables on the host.
      # This is required to ensure that the generated CDI specification refers to the correct CDI hooks.
      nvidiaHookPath: null