From 0cc933576fdd6fd70178453e84b44596ee972f27 Mon Sep 17 00:00:00 2001 From: JamesMurkin Date: Mon, 25 Nov 2024 15:09:29 +0000 Subject: [PATCH] Prevent issue handler from trying to handle issues on finished pods This was causing issues where we'd send events for long finished pods For example a pod that fails - but the pod is kept around for a few days for debugging purposes Once the finished pod reached its activeDeadlineSeconds, the executor would send a failed event and delete the pod We don't need to look for issues on finished pods - and sending events for finished pods is actively confusing to downstream We now filter pods we look for issues on based on their state --- internal/executor/service/pod_issue_handler.go | 5 +++++ internal/executor/service/pod_issue_handler_test.go | 10 +++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/internal/executor/service/pod_issue_handler.go b/internal/executor/service/pod_issue_handler.go index f61d883d3da..c94a5b175e0 100644 --- a/internal/executor/service/pod_issue_handler.go +++ b/internal/executor/service/pod_issue_handler.go @@ -173,6 +173,11 @@ func (p *IssueHandler) detectPodIssues(allManagedPods []*v1.Pod) { if p.hasIssue(util.ExtractJobRunId(pod)) { continue } + if util.IsInTerminalState(pod) && util.HasCurrentStateBeenReported(pod) { + // No need to detect issues on completed pods + // This prevents us sending updates on pods that are already finished and reported + continue + } if pod.DeletionTimestamp != nil && pod.DeletionTimestamp.Add(p.stuckTerminatingPodExpiry).Before(p.clock.Now()) { // pod is stuck in terminating phase, this sometimes happen on node failure // it is safer to produce failed event than retrying as the job might have run already diff --git a/internal/executor/service/pod_issue_handler_test.go b/internal/executor/service/pod_issue_handler_test.go index 2afb1b18a54..72057762eec 100644 --- a/internal/executor/service/pod_issue_handler_test.go +++ b/internal/executor/service/pod_issue_handler_test.go @@ -91,6 +91,10 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfStuckTerminating(t *testin func TestPodIssueService_DeletesPodAndReportsFailed_IfExceedsActiveDeadline(t *testing.T) { startTime := time.Now().Add(-time.Minute * 10) + completedPodPastDeadline := makePodWithDeadline(startTime, 300, 0) + completedPodPastDeadline.Status.Phase = v1.PodFailed + completedPodPastDeadline.Annotations[string(v1.PodFailed)] = "true" + tests := map[string]struct { expectIssueDetected bool pod *v1.Pod @@ -100,6 +104,10 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfExceedsActiveDeadline(t *t // Created 10 mins ago, 5 min deadline pod: makePodWithDeadline(startTime, 300, 0), }, + "PodPastDeadline - Completed pod": { + expectIssueDetected: false, + pod: completedPodPastDeadline, + }, "PodPastDeadlineWithinTerminationGracePeriod": { expectIssueDetected: false, // Created 10 mins ago, 5 min deadline, 10 minute grace period @@ -107,7 +115,7 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfExceedsActiveDeadline(t *t }, "PodWithNoStartTime": { expectIssueDetected: false, - // Created 10 mins ago, 5 min deadline, no start time + // No start time so cannot determine if past its deadline pod: makePodWithDeadline(time.Time{}, 300, 0), }, }