Skip to content

Commit

Permalink
add a check to ensure init containers request fractional cpu
Browse files Browse the repository at this point in the history
Signed-off-by: Chris Martin <chris@cmartinit.co.uk>
  • Loading branch information
d80tb7 committed Jul 8, 2024
1 parent d45b7c2 commit 08e1e2d
Show file tree
Hide file tree
Showing 4 changed files with 157 additions and 0 deletions.
1 change: 1 addition & 0 deletions config/armada/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ submission:
defaultActiveDeadline: "72h" # 3 days.
defaultActiveDeadlineByResourceRequest:
nvidia.com/gpu: "336h" # 14 days.
assertInitContainersRequestFractionalCpu: true
pulsar:
URL: "pulsar://pulsar:6650"
jobsetEventsTopic: "events"
Expand Down
2 changes: 2 additions & 0 deletions internal/armada/configuration/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ type SubmissionConfig struct {
// Maximum ratio of limits:requests per resource. Jobs who have a higher limits:resource ratio than this will be rejected.
// Any resource type missing from this map will default to 1.0.
MaxOversubscriptionByResourceRequest map[string]float64
// Enforce that an init containers requestion non-integer cpu. This is due to https://github.com/kubernetes/kubernetes/issues/112228
AssertInitContainersRequestFractionalCpu bool
}

// TODO: we can probably just typedef this to map[string]string
Expand Down
28 changes: 28 additions & 0 deletions internal/armada/submit/validation/submit_request.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ var (
validatePodSpecSize,
validateAffinity,
validateResources,
validateInitContainerCpu,
validatePriorityClasses,
validateTerminationGracePeriod,
validateIngresses,
Expand Down Expand Up @@ -337,3 +338,30 @@ func validateTerminationGracePeriod(j *api.JobSubmitRequestItem, config configur
}
return nil
}

// Ensures that any init containers request non-integer cpu. This is because when using static cpu manager init
// containers with integer cpu can cause pods to fail with an error of "Pod Allocate failed due to not enough cpus
// available to satisfy request, which is unexpected".
// See https://github.com/kubernetes/kubernetes/issues/112228
func validateInitContainerCpu(j *api.JobSubmitRequestItem, config configuration.SubmissionConfig) error {
if !config.AssertInitContainersRequestFractionalCpu {
return nil
}

spec := j.GetMainPodSpec()
if spec == nil {
return nil
}
const errMsg = "Init container %s contains invalid cpu value %s. All init containers must set requests/limits to fractional cpu e.g. 900m"
for _, container := range spec.InitContainers {
requestCpu := container.Resources.Requests.Cpu()
limitCpu := container.Resources.Limits.Cpu()
if !requestCpu.IsZero() && requestCpu.MilliValue()%1000 == 0 {
return fmt.Errorf(errMsg, container.Name, requestCpu)
}
if !limitCpu.IsZero() && limitCpu.MilliValue()%1000 == 0 {
return fmt.Errorf(errMsg, container.Name, limitCpu)
}
}
return nil
}
126 changes: 126 additions & 0 deletions internal/armada/submit/validation/submit_request_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -979,6 +979,132 @@ func TestValidateTerminationGracePeriod(t *testing.T) {
}
}

func TestValidateInitContainerCpu(t *testing.T) {
tests := map[string]struct {
initContainers []v1.Container
disableCheck bool
expectSuccess bool
}{
"no init containers": {
initContainers: []v1.Container{},
expectSuccess: true,
},
"init container with no resource requests": {
initContainers: []v1.Container{
{},
},
expectSuccess: true,
},
"init container that doesn't specify cpu": {
initContainers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Limits: map[v1.ResourceName]resource.Quantity{
"memory": resource.MustParse("1Gi"),
},
Requests: map[v1.ResourceName]resource.Quantity{
"memory": resource.MustParse("1Gi"),
},
},
},
},
expectSuccess: true,
},
"init container with fractional cpu": {
initContainers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Limits: map[v1.ResourceName]resource.Quantity{
"cpu": resource.MustParse("900m"),
},
Requests: map[v1.ResourceName]resource.Quantity{
"cpu": resource.MustParse("900m"),
},
},
},
},
expectSuccess: true,
},
"check disabled": {
initContainers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Limits: map[v1.ResourceName]resource.Quantity{
"cpu": resource.MustParse("1"),
},
Requests: map[v1.ResourceName]resource.Quantity{
"cpu": resource.MustParse("900m"),
},
},
},
},
disableCheck: true,
expectSuccess: true,
},
"limits with integer cpu": {
initContainers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Limits: map[v1.ResourceName]resource.Quantity{
"cpu": resource.MustParse("1"),
},
Requests: map[v1.ResourceName]resource.Quantity{
"cpu": resource.MustParse("900m"),
},
},
},
},
expectSuccess: false,
},
"requests with integer cpu": {
initContainers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Limits: map[v1.ResourceName]resource.Quantity{
"cpu": resource.MustParse("900m"),
},
Requests: map[v1.ResourceName]resource.Quantity{
"cpu": resource.MustParse("1"),
},
},
},
},
expectSuccess: false,
},
"cpu specified as millis": {
initContainers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Limits: map[v1.ResourceName]resource.Quantity{
"cpu": resource.MustParse("1000m"),
},
Requests: map[v1.ResourceName]resource.Quantity{
"cpu": resource.MustParse("1000m"),
},
},
},
},
expectSuccess: false,
},
}
for name, tc := range tests {
t.Run(name, func(t *testing.T) {
err := validateInitContainerCpu(&api.JobSubmitRequestItem{
PodSpec: &v1.PodSpec{
InitContainers: tc.initContainers,
},
}, configuration.SubmissionConfig{
AssertInitContainersRequestFractionalCpu: !tc.disableCheck,
})
if tc.expectSuccess {
assert.NoError(t, err)
} else {
assert.Error(t, err)
}
})
}
}

func reqFromContainer(container v1.Container) *api.JobSubmitRequestItem {
return reqFromContainers([]v1.Container{container})
}
Expand Down

0 comments on commit 08e1e2d

Please sign in to comment.