Skip to content

Commit

Permalink
CAFV-326: Make EnableNVidiaGPU into a no-op (#524) (#525)
Browse files Browse the repository at this point in the history
- Do not customize the VM even if EnableNvidiaGPU is set. The customization that we performed was (slightly) helping customers who would install GPU drivers in GPU nodes directly. However, this interferes with the NVIDIA GPU operator functionality.
- Those customers who would install GPU drivers in GPU nodes directly will be impacted by this change. However, they can include the customization in their workflow directly, since it is a very small customization.
- Note that the flag has not yet been deprecated. The next API update should begin to show the deprecation.

(cherry picked from commit a5ab47d)
  • Loading branch information
arunmk authored Oct 10, 2023
1 parent d2c8eeb commit 1d7c39b
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 44 deletions.
39 changes: 2 additions & 37 deletions controllers/cluster_scripts/cloud_init.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,7 @@ write_files:
- path: /etc/cloud/cloud.cfg.d/cse.cfg
owner: root
content: |
ssh_deletekeys: false {{- if .NvidiaGPU }}
- path: /etc/containerd/config.toml
owner: root
content: |
version = 2

[plugins]
[plugins."io.containerd.grpc.v1.cri"]
sandbox_image = "projects.registry.vmware.com/tkg/pause:3.4.1"

[plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "nvidia"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = "io.containerd.runc.v2"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/usr/bin/nvidia-container-runtime"

[plugins."io.containerd.grpc.v1.cri".registry]
[plugins."io.containerd.grpc.v1.cri".registry.mirrors]
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
endpoint = ["https://registry-1.docker.io"] {{- end }}
ssh_deletekeys: false
- path: /opt/vmware/cloud-director/metering.sh
owner: root
content: |
Expand Down Expand Up @@ -147,16 +121,7 @@ write_files:
systemctl daemon-reload
systemctl restart containerd
wait_for_containerd_startup
vmtoolsd --cmd "info-set guestinfo.postcustomization.proxy.setting.status successful" {{- end }} {{- if .NvidiaGPU }}

vmtoolsd --cmd "info-set guestinfo.postcustomization.nvidia.runtime.install.status in_progress"
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | sudo tee /etc/apt/sources.list.d/libnvidia-container.list

sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit

vmtoolsd --cmd "info-set guestinfo.postcustomization.nvidia.runtime.install.status successful" {{- end }}
vmtoolsd --cmd "info-set guestinfo.postcustomization.proxy.setting.status successful" {{- end }}

vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} in_progress"
for IMAGE in "coredns" "etcd" "kube-proxy" "kube-apiserver" "kube-controller-manager" "kube-scheduler"
Expand Down
8 changes: 1 addition & 7 deletions controllers/vcdmachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,6 @@ const (
MeteringConfiguration = "guestinfo.metering.status"
KubeadmInit = "guestinfo.postcustomization.kubeinit.status"
KubeadmNodeJoin = "guestinfo.postcustomization.kubeadm.node.join.status"
NvidiaRuntimeInstall = "guestinfo.postcustomization.nvidia.runtime.install.status"
PostCustomizationScriptExecutionStatus = "guestinfo.post_customization_script_execution_status"
PostCustomizationScriptFailureReason = "guestinfo.post_customization_script_execution_failure_reason"
)
Expand All @@ -217,7 +216,6 @@ var postCustPhases = []string{
NetworkConfiguration,
MeteringConfiguration,
ProxyConfiguration,
NvidiaRuntimeInstall,
}

func removeFromSlice(remove string, arr []string) []string {
Expand Down Expand Up @@ -497,7 +495,7 @@ func (r *VCDMachineReconciler) reconcileNormal(ctx context.Context, cluster *clu

// TODO: After tenants has access to siteId, populate siteId to cloudInitInput as opposed to the site
cloudInitInput.VcdHostFormatted = strings.ReplaceAll(vcdCluster.Spec.Site, "/", "\\/")
cloudInitInput.NvidiaGPU = vcdMachine.Spec.EnableNvidiaGPU
cloudInitInput.NvidiaGPU = false
cloudInitInput.TKGVersion = getTKGVersion(cluster) // needed for both worker & control plane machines for metering
cloudInitInput.ClusterID = vcdCluster.Status.InfraId // needed for both worker & control plane machines for metering
cloudInitInput.ResizedControlPlane = isResizedControlPlane
Expand Down Expand Up @@ -838,10 +836,6 @@ func (r *VCDMachineReconciler) reconcileNormal(ctx context.Context, cluster *clu
phases = append(phases, KubeadmNodeJoin)
}

if !vcdMachine.Spec.EnableNvidiaGPU {
phases = removeFromSlice(NvidiaRuntimeInstall, phases)
}

if vcdCluster.Spec.ProxyConfigSpec.HTTPSProxy == "" &&
vcdCluster.Spec.ProxyConfigSpec.HTTPProxy == "" {
phases = removeFromSlice(ProxyConfiguration, phases)
Expand Down

0 comments on commit 1d7c39b

Please sign in to comment.