Skip to content

Commit

Permalink
Add GPU/Accelerator support to VMs
Browse files Browse the repository at this point in the history
  • Loading branch information
jwmay2012 committed Dec 19, 2024
1 parent 5b4ab44 commit 2358039
Show file tree
Hide file tree
Showing 6 changed files with 136 additions and 0 deletions.
19 changes: 19 additions & 0 deletions api/v1beta1/gcpmachine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,25 @@ type GCPMachineSpec struct {
// RootDiskEncryptionKey defines the KMS key to be used to encrypt the root disk.
// +optional
RootDiskEncryptionKey *CustomerEncryptionKey `json:"rootDiskEncryptionKey,omitempty"`

// GuestAccelerators is a list of the type and count of accelerator cards
// attached to the instance.
// +optional
GuestAccelerators []Accelerator `json:"guestAccelerators,omitempty"`
}

// Accelerator describes one kind of guest accelerator card (for example a
// GPU) and how many cards of that kind are attached to the instance.
type Accelerator struct {
	// Count is how many accelerator cards of the given Type are exposed to
	// the instance.
	Count int64 `json:"count,omitempty"`
	// Type identifies the accelerator type resource to attach, given as
	// either a full or a partial URL. For example:
	// projects/my-project/zones/us-central1-c/acceleratorTypes/nvidia-tesla-p100
	// When creating an instance template, give only the accelerator name.
	// The "GPUs on Compute Engine" documentation lists the available types.
	Type string `json:"type,omitempty"`
}

// MetadataItem defines a single piece of metadata associated with an instance.
Expand Down
20 changes: 20 additions & 0 deletions api/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions cloud/scope/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,22 @@ func (m *MachineScope) InstanceAdditionalMetadataSpec() *compute.Metadata {
return metadata
}

// InstanceGuestAcceleratorsSpec converts the guest accelerators declared on
// the GCPMachine spec into compute API accelerator configs, one per entry and
// in the same order. It returns nil when the spec declares no accelerators.
func (m *MachineScope) InstanceGuestAcceleratorsSpec() []*compute.AcceleratorConfig {
	requested := m.GCPMachine.Spec.GuestAccelerators
	if len(requested) == 0 {
		return nil
	}

	// The output length is known up front, so fill the slice by index.
	configs := make([]*compute.AcceleratorConfig, len(requested))
	for i := range requested {
		configs[i] = &compute.AcceleratorConfig{
			AcceleratorCount: requested[i].Count,
			AcceleratorType:  requested[i].Type,
		}
	}
	return configs
}

// InstanceSpec returns instance spec.
func (m *MachineScope) InstanceSpec(log logr.Logger) *compute.Instance {
instance := &compute.Instance{
Expand Down Expand Up @@ -457,6 +473,11 @@ func (m *MachineScope) InstanceSpec(log logr.Logger) *compute.Instance {
instance.Metadata = m.InstanceAdditionalMetadataSpec()
instance.ServiceAccounts = append(instance.ServiceAccounts, m.InstanceServiceAccountsSpec())
instance.NetworkInterfaces = append(instance.NetworkInterfaces, m.InstanceNetworkInterfaceSpec())
instance.GuestAccelerators = m.InstanceGuestAcceleratorsSpec()
if len(instance.GuestAccelerators) > 0 {
instance.Scheduling.OnHostMaintenance = "TERMINATE"
}

return instance
}

Expand Down
25 changes: 25 additions & 0 deletions config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachines.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,31 @@ spec:
- Enabled
- Disabled
type: string
guestAccelerators:
description: |-
GuestAccelerators is a list of the type and count of accelerator cards
attached to the instance.
items:
description: |-
Accelerator is a specification of the type and number of accelerator
cards attached to the instance.
properties:
count:
description: |-
Count is the number of the guest accelerator cards exposed to this
instance.
format: int64
type: integer
type:
description: |-
Type is the full or partial URL of the accelerator type resource to
attach to this instance. For example:
projects/my-project/zones/us-central1-c/acceleratorTypes/nvidia-tesla-p100
If you are creating an instance template, specify only the accelerator name.
See GPUs on Compute Engine for a full list of accelerator types.
type: string
type: object
type: array
image:
description: |-
Image is the full reference to a valid image to be used for this machine.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,31 @@ spec:
- Enabled
- Disabled
type: string
guestAccelerators:
description: |-
GuestAccelerators is a list of the type and count of accelerator cards
attached to the instance.
items:
description: |-
Accelerator is a specification of the type and number of accelerator
cards attached to the instance.
properties:
count:
description: |-
Count is the number of the guest accelerator cards exposed to this
instance.
format: int64
type: integer
type:
description: |-
Type is the full or partial URL of the accelerator type resource to
attach to this instance. For example:
projects/my-project/zones/us-central1-c/acceleratorTypes/nvidia-tesla-p100
If you are creating an instance template, specify only the accelerator name.
See GPUs on Compute Engine for a full list of accelerator types.
type: string
type: object
type: array
image:
description: |-
Image is the full reference to a valid image to be used for this machine.
Expand Down
26 changes: 26 additions & 0 deletions docs/book/src/topics/gpus.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# GPUs

Add GPUs via the `guestAccelerators` field in `GCPMachineTemplate`.

```yaml
---
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: GCPMachineTemplate
metadata:
name: mygcpmachinetemplate
namespace: mynamespace
spec:
template:
spec:
image: projects/myproject/global/images/myimage
instanceType: n1-standard-2
guestAccelerators:
- type: projects/myproject/zones/us-central1-c/acceleratorTypes/nvidia-tesla-t4
count: 1
```

See [GPUs on Compute Engine](https://cloud.google.com/compute/docs/gpus) for the available accelerator types.

NOTE: Instances with accelerators/GPUs do NOT support live migration.
Therefore, whenever `guestAccelerators` is set, the instance's `onHostMaintenance` scheduling policy is always set to `TERMINATE`.
See https://cloud.google.com/compute/docs/instances/setting-vm-host-options

0 comments on commit 2358039

Please sign in to comment.