From 0cc933576fdd6fd70178453e84b44596ee972f27 Mon Sep 17 00:00:00 2001
From: JamesMurkin <jamesmurkin@hotmail.com>
Date: Mon, 25 Nov 2024 15:09:29 +0000
Subject: [PATCH] Prevent issue handler from trying to handle issues on
 finished pods

This was causing issues where we'd send events for long finished pods

For example a pod that fails - but the pod is kept around for a few days
for debugging purposes

Once the finished pod reached its activeDeadlineSeconds, the executor
would send a failed event and delete the pod

We don't need to look for issues on finished pods - and sending events
for finished pods is actively confusing to downstream

We now filter pods we look for issues on based on their state
---
 internal/executor/service/pod_issue_handler.go      |  5 +++++
 internal/executor/service/pod_issue_handler_test.go | 10 +++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/internal/executor/service/pod_issue_handler.go b/internal/executor/service/pod_issue_handler.go
index f61d883d3da..c94a5b175e0 100644
--- a/internal/executor/service/pod_issue_handler.go
+++ b/internal/executor/service/pod_issue_handler.go
@@ -173,6 +173,11 @@ func (p *IssueHandler) detectPodIssues(allManagedPods []*v1.Pod) {
 		if p.hasIssue(util.ExtractJobRunId(pod)) {
 			continue
 		}
+		if util.IsInTerminalState(pod) && util.HasCurrentStateBeenReported(pod) {
+			// No need to detect issues on completed pods
+			// This prevents us sending updates on pods that are already finished and reported
+			continue
+		}
 		if pod.DeletionTimestamp != nil && pod.DeletionTimestamp.Add(p.stuckTerminatingPodExpiry).Before(p.clock.Now()) {
 			// pod is stuck in terminating phase, this sometimes happen on node failure
 			// it is safer to produce failed event than retrying as the job might have run already
diff --git a/internal/executor/service/pod_issue_handler_test.go b/internal/executor/service/pod_issue_handler_test.go
index 2afb1b18a54..72057762eec 100644
--- a/internal/executor/service/pod_issue_handler_test.go
+++ b/internal/executor/service/pod_issue_handler_test.go
@@ -91,6 +91,10 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfStuckTerminating(t *testin
 func TestPodIssueService_DeletesPodAndReportsFailed_IfExceedsActiveDeadline(t *testing.T) {
 	startTime := time.Now().Add(-time.Minute * 10)
 
+	completedPodPastDeadline := makePodWithDeadline(startTime, 300, 0)
+	completedPodPastDeadline.Status.Phase = v1.PodFailed
+	completedPodPastDeadline.Annotations[string(v1.PodFailed)] = "true"
+
 	tests := map[string]struct {
 		expectIssueDetected bool
 		pod                 *v1.Pod
@@ -100,6 +104,10 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfExceedsActiveDeadline(t *t
 			// Created 10 mins ago, 5 min deadline
 			pod: makePodWithDeadline(startTime, 300, 0),
 		},
+		"PodPastDeadline - Completed pod": {
+			expectIssueDetected: false,
+			pod:                 completedPodPastDeadline,
+		},
 		"PodPastDeadlineWithinTerminationGracePeriod": {
 			expectIssueDetected: false,
 			// Created 10 mins ago, 5 min deadline, 10 minute grace period
@@ -107,7 +115,7 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfExceedsActiveDeadline(t *t
 		},
 		"PodWithNoStartTime": {
 			expectIssueDetected: false,
-			// Created 10 mins ago, 5 min deadline, no start time
+			// No start time so cannot determine if past its deadline
 			pod: makePodWithDeadline(time.Time{}, 300, 0),
 		},
 	}