Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve metrics #3847

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
86 changes: 18 additions & 68 deletions cmd/ghalistener/metrics/metrics.go
Copy link

@atsu85 atsu85 Dec 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo in commit description?

They cause a creation of new services with each job execution.

vs

They cause a creation of new series with each job execution.

Also worth mentioning cardinality explosion and OOMs

Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ var (
labelKeyEventName,
}

completedJobsTotalLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName)
jobExecutionDurationLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName)
startedJobsTotalLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName)
jobStartupDurationLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName)
completedJobsTotalLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName)
lastJobExecutionDurationLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName)
startedJobsTotalLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName)
lastJobStartupDurationLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName)
)

var (
Expand Down Expand Up @@ -144,75 +144,25 @@ var (
completedJobsTotalLabels,
)

jobStartupDurationSeconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
jobLastStartupDurationSeconds = prometheus.NewGaugeVec(
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

worth adding a comment (in addition to commit message) why Gague is used, while ideally Histogram seems better data type - might avoid a lot of WTFs and wasting time basically reverting this change ;)

prometheus.GaugeOpts{
Subsystem: githubScaleSetSubsystem,
Name: "job_startup_duration_seconds",
Help: "Time spent waiting for workflow job to get started on the runner owned by the scale set (in seconds).",
Buckets: runtimeBuckets,
Name: "job_last_startup_duration_seconds",
Help: "The last duration spent waiting for workflow job to get started on the runner owned by the scale set (in seconds).",
},
jobStartupDurationLabels,
lastJobStartupDurationLabels,
)

jobExecutionDurationSeconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
jobLastExecutionDurationSeconds = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: githubScaleSetSubsystem,
Name: "job_execution_duration_seconds",
Help: "Time spent executing workflow jobs by the scale set (in seconds).",
Buckets: runtimeBuckets,
Name: "job_last_execution_duration_seconds",
Help: "The last duration spent executing workflow jobs by the scale set (in seconds).",
},
jobExecutionDurationLabels,
lastJobExecutionDurationLabels,
)
)

var runtimeBuckets []float64 = []float64{
0.01,
0.05,
0.1,
0.5,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
12,
15,
18,
20,
25,
30,
40,
50,
60,
70,
80,
90,
100,
110,
120,
150,
180,
210,
240,
300,
360,
420,
480,
540,
600,
900,
1200,
1800,
2400,
3000,
3600,
}

type baseLabels struct {
scaleSetName string
scaleSetNamespace string
Expand Down Expand Up @@ -309,8 +259,8 @@ func NewExporter(config ExporterConfig) ServerPublisher {
idleRunners,
startedJobsTotal,
completedJobsTotal,
jobStartupDurationSeconds,
jobExecutionDurationSeconds,
jobLastStartupDurationSeconds,
jobLastExecutionDurationSeconds,
)

mux := http.NewServeMux()
Expand Down Expand Up @@ -369,7 +319,7 @@ func (e *exporter) PublishJobStarted(msg *actions.JobStarted) {

if !msg.JobMessageBase.RunnerAssignTime.IsZero() && !msg.JobMessageBase.ScaleSetAssignTime.IsZero() {
startupDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.ScaleSetAssignTime.Unix()
jobStartupDurationSeconds.With(l).Observe(float64(startupDuration))
jobLastStartupDurationSeconds.With(l).Set(float64(startupDuration))
}
}

Expand All @@ -379,7 +329,7 @@ func (e *exporter) PublishJobCompleted(msg *actions.JobCompleted) {

if !msg.JobMessageBase.FinishTime.IsZero() && !msg.JobMessageBase.RunnerAssignTime.IsZero() {
executionDuration := msg.JobMessageBase.FinishTime.Unix() - msg.JobMessageBase.RunnerAssignTime.Unix()
jobExecutionDurationSeconds.With(l).Observe(float64(executionDuration))
jobLastExecutionDurationSeconds.With(l).Set(float64(executionDuration))
}
}

Expand Down