From 36f84d189e14d0154d62eb77eb250b510ef10799 Mon Sep 17 00:00:00 2001 From: Saswata Mukherjee Date: Thu, 9 Feb 2023 12:23:17 +0530 Subject: [PATCH 1/2] Add SLO alerts for rhobsp02ue1 Signed-off-by: Saswata Mukherjee --- observability/prometheusrules.jsonnet | 6 + ...hobs-slos-rhobsp02ue1.prometheusrules.yaml | 1550 +++++++++++++++++ 2 files changed, 1556 insertions(+) create mode 100644 resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1.prometheusrules.yaml diff --git a/observability/prometheusrules.jsonnet b/observability/prometheusrules.jsonnet index 37637b62d5..04086aaa70 100644 --- a/observability/prometheusrules.jsonnet +++ b/observability/prometheusrules.jsonnet @@ -95,6 +95,8 @@ local appSREOverwrites(environment) = { std.startsWith(name, 'rhobs-telemeter') && environment == 'stage' then '080e53f245a15445bdf777ae0e66945d' else if std.startsWith(name, 'rhobs-mst') && environment == 'production' then '283e7002d85c08126681241df2fdb22b' + else if + std.startsWith(name, 'rhobs-rhobsp02ue1') && environment == 'production' then '7f4df1c2d5518d5c3f2876ca9bb874a8' else if std.startsWith(name, 'rhobs-mst') && environment == 'stage' then '92520ea4d6976f30d1618164e186ef9b' else if @@ -471,6 +473,10 @@ local renderAlerts(name, environment, mixin) = { 'rhobs-slos-mst-stage.prometheusrules': renderAlerts('rhobs-slos-mst-stage', 'stage', flatten(mstStageSLOs)), 'rhobs-slos-mst-production.prometheusrules': renderAlerts('rhobs-slos-mst-production', 'production', flatten(mstProductionSLOs)), + local rhobsp02ue1ProductionSLOs = apiSLOs('rhobsp02ue1', 'observatorium-mst-production', 'observatorium-mst-production', 'observatorium-observatorium-mst-api').slos, + + 'rhobs-slos-rhobsp02ue1.prometheusrules': renderAlerts('rhobs-slos-rhobsp02ue1', 'production', flatten(rhobsp02ue1ProductionSLOs)), + local telemeterStageSLOs = telemeterServerSLOs + apiSLOs('telemeter', 'observatorium-stage', 'observatorium-metrics-stage', 'observatorium-observatorium-api').slos, local telemeterProductionSLOs = telemeterServerSLOs + apiSLOs('telemeter', 'observatorium-production', 'observatorium-metrics-production', 'observatorium-observatorium-api').slos, diff --git a/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1.prometheusrules.yaml b/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1.prometheusrules.yaml new file mode 100644 index 0000000000..2f31b27a20 --- /dev/null +++ b/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1.prometheusrules.yaml @@ -0,0 +1,1550 @@ +--- +$schema: /openshift/prometheus-rule-1.yml +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + prometheus: app-sre + role: alert-rules + name: rhobs-slos-rhobsp02ue1 +spec: + groups: + - name: rhobs-rhobsp02ue1-api-metrics-write-availability.slo + rules: + - alert: APIMetricsWriteAvailabilityErrorBudgetBurning5mand1h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /receive handler is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricswriteavailabilityerrorbudgetburning5mand1h + expr: | + sum(http_requests_total:burnrate5m{job="observatorium-observatorium-mst-api",handler="receive"}) > (14.40 * (1-0.95000)) + and + sum(http_requests_total:burnrate1h{job="observatorium-observatorium-mst-api",handler="receive"}) > (14.40 * (1-0.95000)) + for: 2m + labels: + handler: receive + job: observatorium-observatorium-mst-api + service: telemeter + severity: critical + - alert: APIMetricsWriteAvailabilityErrorBudgetBurning30mand6h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /receive handler is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricswriteavailabilityerrorbudgetburning30mand6h + expr: | + sum(http_requests_total:burnrate30m{job="observatorium-observatorium-mst-api",handler="receive"}) > (6.00 * (1-0.95000)) + and + sum(http_requests_total:burnrate6h{job="observatorium-observatorium-mst-api",handler="receive"}) > (6.00 * (1-0.95000)) + for: 15m + labels: + handler: receive + job: observatorium-observatorium-mst-api + service: telemeter + severity: critical + - alert: APIMetricsWriteAvailabilityErrorBudgetBurning2hand1d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /receive handler is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricswriteavailabilityerrorbudgetburning2hand1d + expr: | + sum(http_requests_total:burnrate2h{job="observatorium-observatorium-mst-api",handler="receive"}) > (3.00 * (1-0.95000)) + and + sum(http_requests_total:burnrate1d{job="observatorium-observatorium-mst-api",handler="receive"}) > (3.00 * (1-0.95000)) + for: 1h + labels: + handler: receive + job: observatorium-observatorium-mst-api + service: telemeter + severity: medium + - alert: APIMetricsWriteAvailabilityErrorBudgetBurning6hand3d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /receive handler is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricswriteavailabilityerrorbudgetburning6hand3d + expr: | + sum(http_requests_total:burnrate6h{job="observatorium-observatorium-mst-api",handler="receive"}) > (1.00 * (1-0.95000)) + and + sum(http_requests_total:burnrate3d{job="observatorium-observatorium-mst-api",handler="receive"}) > (1.00 * (1-0.95000)) + for: 3h + labels: + handler: receive + job: observatorium-observatorium-mst-api + service: telemeter + severity: medium + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$",code=~"5.+"}[1d])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$"}[1d])) + labels: + handler: receive + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate1d + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$",code=~"5.+"}[1h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$"}[1h])) + labels: + handler: receive + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate1h + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$",code=~"5.+"}[2h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$"}[2h])) + labels: + handler: receive + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate2h + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$",code=~"5.+"}[30m])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$"}[30m])) + labels: + handler: receive + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate30m + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$",code=~"5.+"}[3d])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$"}[3d])) + labels: + handler: receive + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate3d + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$",code=~"5.+"}[5m])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$"}[5m])) + labels: + handler: receive + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate5m + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$",code=~"5.+"}[6h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$"}[6h])) + labels: + handler: receive + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate6h + - name: rhobs-rhobsp02ue1-api-metrics-write-latency.slo + rules: + - alert: APIMetricsWriteLatencyErrorBudgetBurning1h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: 'High requests latency budget burn for job=observatorium-observatorium-mst-api,handler=receive,code!~^4..$,latency=5 (current value: {{ $value }})' + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricswritelatencyerrorbudgetburning1h + expr: | + ( + latencytarget:http_request_duration_seconds:rate1h{job="observatorium-observatorium-mst-api",handler="receive",latency="5"} > (14.4*0.100000) + and + latencytarget:http_request_duration_seconds:rate5m{job="observatorium-observatorium-mst-api",handler="receive",latency="5"} > (14.4*0.100000) + ) + or + ( + latencytarget:http_request_duration_seconds:rate6h{job="observatorium-observatorium-mst-api",handler="receive",latency="5"} > (6*0.100000) + and + latencytarget:http_request_duration_seconds:rate30m{job="observatorium-observatorium-mst-api",handler="receive",latency="5"} > (6*0.100000) + ) + labels: + handler: receive + job: observatorium-observatorium-mst-api + latency: "5" + service: telemeter + severity: critical + - alert: APIMetricsWriteLatencyErrorBudgetBurning3d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: 'High requests latency budget burn for job=observatorium-observatorium-mst-api,handler=receive,code!~^4..$,latency=5 (current value: {{ $value }})' + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricswritelatencyerrorbudgetburning3d + expr: | + ( + latencytarget:http_request_duration_seconds:rate1d{job="observatorium-observatorium-mst-api",handler="receive",latency="5"} > (3*0.100000) + and + latencytarget:http_request_duration_seconds:rate2h{job="observatorium-observatorium-mst-api",handler="receive",latency="5"} > (3*0.100000) + ) + or + ( + latencytarget:http_request_duration_seconds:rate3d{job="observatorium-observatorium-mst-api",handler="receive",latency="5"} > (0.100000) + and + latencytarget:http_request_duration_seconds:rate6h{job="observatorium-observatorium-mst-api",handler="receive",latency="5"} > (0.100000) + ) + labels: + handler: receive + job: observatorium-observatorium-mst-api + latency: "5" + service: telemeter + severity: medium + - expr: | + 1 - ( + sum(rate(http_request_duration_seconds_bucket{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$",le="5",code!~"5.."}[5m])) + / + sum(rate(http_request_duration_seconds_count{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$"}[5m])) + ) + labels: + handler: receive + job: observatorium-observatorium-mst-api + latency: "5" + record: latencytarget:http_request_duration_seconds:rate5m + - expr: | + 1 - ( + sum(rate(http_request_duration_seconds_bucket{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$",le="5",code!~"5.."}[30m])) + / + sum(rate(http_request_duration_seconds_count{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$"}[30m])) + ) + labels: + handler: receive + job: observatorium-observatorium-mst-api + latency: "5" + record: latencytarget:http_request_duration_seconds:rate30m + - expr: | + 1 - ( + sum(rate(http_request_duration_seconds_bucket{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$",le="5",code!~"5.."}[1h])) + / + sum(rate(http_request_duration_seconds_count{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$"}[1h])) + ) + labels: + handler: receive + job: observatorium-observatorium-mst-api + latency: "5" + record: latencytarget:http_request_duration_seconds:rate1h + - expr: | + 1 - ( + sum(rate(http_request_duration_seconds_bucket{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$",le="5",code!~"5.."}[2h])) + / + sum(rate(http_request_duration_seconds_count{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$"}[2h])) + ) + labels: + handler: receive + job: observatorium-observatorium-mst-api + latency: "5" + record: latencytarget:http_request_duration_seconds:rate2h + - expr: | + 1 - ( + sum(rate(http_request_duration_seconds_bucket{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$",le="5",code!~"5.."}[6h])) + / + sum(rate(http_request_duration_seconds_count{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$"}[6h])) + ) + labels: + handler: receive + job: observatorium-observatorium-mst-api + latency: "5" + record: latencytarget:http_request_duration_seconds:rate6h + - expr: | + 1 - ( + sum(rate(http_request_duration_seconds_bucket{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$",le="5",code!~"5.."}[1d])) + / + sum(rate(http_request_duration_seconds_count{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$"}[1d])) + ) + labels: + handler: receive + job: observatorium-observatorium-mst-api + latency: "5" + record: latencytarget:http_request_duration_seconds:rate1d + - expr: | + 1 - ( + sum(rate(http_request_duration_seconds_bucket{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$",le="5",code!~"5.."}[3d])) + / + sum(rate(http_request_duration_seconds_count{job="observatorium-observatorium-mst-api",handler="receive",code!~"^4..$"}[3d])) + ) + labels: + handler: receive + job: observatorium-observatorium-mst-api + latency: "5" + record: latencytarget:http_request_duration_seconds:rate3d + - name: rhobs-rhobsp02ue1-api-metrics-read-availability.slo + rules: + - alert: APIMetricsReadAvailabilityErrorBudgetBurning5mand1h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /query handler is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning5mand1h + expr: | + sum(http_requests_total:burnrate5m{job="observatorium-observatorium-mst-api",handler="query"}) > (14.40 * (1-0.95000)) + and + sum(http_requests_total:burnrate1h{job="observatorium-observatorium-mst-api",handler="query"}) > (14.40 * (1-0.95000)) + for: 2m + labels: + handler: query + job: observatorium-observatorium-mst-api + service: telemeter + severity: critical + - alert: APIMetricsReadAvailabilityErrorBudgetBurning30mand6h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /query handler is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning30mand6h + expr: | + sum(http_requests_total:burnrate30m{job="observatorium-observatorium-mst-api",handler="query"}) > (6.00 * (1-0.95000)) + and + sum(http_requests_total:burnrate6h{job="observatorium-observatorium-mst-api",handler="query"}) > (6.00 * (1-0.95000)) + for: 15m + labels: + handler: query + job: observatorium-observatorium-mst-api + service: telemeter + severity: critical + - alert: APIMetricsReadAvailabilityErrorBudgetBurning2hand1d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /query handler is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning2hand1d + expr: | + sum(http_requests_total:burnrate2h{job="observatorium-observatorium-mst-api",handler="query"}) > (3.00 * (1-0.95000)) + and + sum(http_requests_total:burnrate1d{job="observatorium-observatorium-mst-api",handler="query"}) > (3.00 * (1-0.95000)) + for: 1h + labels: + handler: query + job: observatorium-observatorium-mst-api + service: telemeter + severity: medium + - alert: APIMetricsReadAvailabilityErrorBudgetBurning6hand3d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /query handler is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning6hand3d + expr: | + sum(http_requests_total:burnrate6h{job="observatorium-observatorium-mst-api",handler="query"}) > (1.00 * (1-0.95000)) + and + sum(http_requests_total:burnrate3d{job="observatorium-observatorium-mst-api",handler="query"}) > (1.00 * (1-0.95000)) + for: 3h + labels: + handler: query + job: observatorium-observatorium-mst-api + service: telemeter + severity: medium + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query",code!~"^4..$",code=~"5.+"}[1d])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query",code!~"^4..$"}[1d])) + labels: + handler: query + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate1d + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query",code!~"^4..$",code=~"5.+"}[1h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query",code!~"^4..$"}[1h])) + labels: + handler: query + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate1h + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query",code!~"^4..$",code=~"5.+"}[2h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query",code!~"^4..$"}[2h])) + labels: + handler: query + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate2h + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query",code!~"^4..$",code=~"5.+"}[30m])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query",code!~"^4..$"}[30m])) + labels: + handler: query + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate30m + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query",code!~"^4..$",code=~"5.+"}[3d])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query",code!~"^4..$"}[3d])) + labels: + handler: query + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate3d + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query",code!~"^4..$",code=~"5.+"}[5m])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query",code!~"^4..$"}[5m])) + labels: + handler: query + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate5m + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query",code!~"^4..$",code=~"5.+"}[6h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query",code!~"^4..$"}[6h])) + labels: + handler: query + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate6h + - alert: APIMetricsReadAvailabilityErrorBudgetBurning5mand1h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /query_range handler is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning5mand1h + expr: | + sum(http_requests_total:burnrate5m{job="observatorium-observatorium-mst-api",handler="query_range"}) > (14.40 * (1-0.95000)) + and + sum(http_requests_total:burnrate1h{job="observatorium-observatorium-mst-api",handler="query_range"}) > (14.40 * (1-0.95000)) + for: 2m + labels: + handler: query_range + job: observatorium-observatorium-mst-api + service: telemeter + severity: critical + - alert: APIMetricsReadAvailabilityErrorBudgetBurning30mand6h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /query_range handler is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning30mand6h + expr: | + sum(http_requests_total:burnrate30m{job="observatorium-observatorium-mst-api",handler="query_range"}) > (6.00 * (1-0.95000)) + and + sum(http_requests_total:burnrate6h{job="observatorium-observatorium-mst-api",handler="query_range"}) > (6.00 * (1-0.95000)) + for: 15m + labels: + handler: query_range + job: observatorium-observatorium-mst-api + service: telemeter + severity: critical + - alert: APIMetricsReadAvailabilityErrorBudgetBurning2hand1d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /query_range handler is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning2hand1d + expr: | + sum(http_requests_total:burnrate2h{job="observatorium-observatorium-mst-api",handler="query_range"}) > (3.00 * (1-0.95000)) + and + sum(http_requests_total:burnrate1d{job="observatorium-observatorium-mst-api",handler="query_range"}) > (3.00 * (1-0.95000)) + for: 1h + labels: + handler: query_range + job: observatorium-observatorium-mst-api + service: telemeter + severity: medium + - alert: APIMetricsReadAvailabilityErrorBudgetBurning6hand3d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /query_range handler is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning6hand3d + expr: | + sum(http_requests_total:burnrate6h{job="observatorium-observatorium-mst-api",handler="query_range"}) > (1.00 * (1-0.95000)) + and + sum(http_requests_total:burnrate3d{job="observatorium-observatorium-mst-api",handler="query_range"}) > (1.00 * (1-0.95000)) + for: 3h + labels: + handler: query_range + job: observatorium-observatorium-mst-api + service: telemeter + severity: medium + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query_range",code!~"^4..$",code=~"5.+"}[1d])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query_range",code!~"^4..$"}[1d])) + labels: + handler: query_range + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate1d + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query_range",code!~"^4..$",code=~"5.+"}[1h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query_range",code!~"^4..$"}[1h])) + labels: + handler: query_range + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate1h + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query_range",code!~"^4..$",code=~"5.+"}[2h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query_range",code!~"^4..$"}[2h])) + labels: + handler: query_range + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate2h + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query_range",code!~"^4..$",code=~"5.+"}[30m])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query_range",code!~"^4..$"}[30m])) + labels: + handler: query_range + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate30m + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query_range",code!~"^4..$",code=~"5.+"}[3d])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query_range",code!~"^4..$"}[3d])) + labels: + handler: query_range + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate3d + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query_range",code!~"^4..$",code=~"5.+"}[5m])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query_range",code!~"^4..$"}[5m])) + labels: + handler: query_range + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate5m + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query_range",code!~"^4..$",code=~"5.+"}[6h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",handler="query_range",code!~"^4..$"}[6h])) + labels: + handler: query_range + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate6h + - name: rhobs-rhobsp02ue1-api-metrics-read-latency.slo + rules: + - alert: APIMetricsReadLatencyErrorBudgetBurning1h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: 'High requests latency budget burn for query=query-path-sli-1M-samples,namespace=observatorium-mst-production,latency=10 (current value: {{ $value }})' + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadlatencyerrorbudgetburning1h + expr: | + ( + latencytarget:up_custom_query_duration_seconds:rate1h{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",latency="10"} > (14.4*0.100000) + and + latencytarget:up_custom_query_duration_seconds:rate5m{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",latency="10"} > (14.4*0.100000) + ) + or + ( + latencytarget:up_custom_query_duration_seconds:rate6h{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",latency="10"} > (6*0.100000) + and + latencytarget:up_custom_query_duration_seconds:rate30m{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",latency="10"} > (6*0.100000) + ) + labels: + latency: "10" + namespace: observatorium-mst-production + query: query-path-sli-1M-samples + service: telemeter + severity: critical + - alert: APIMetricsReadLatencyErrorBudgetBurning3d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: 'High requests latency budget burn for query=query-path-sli-1M-samples,namespace=observatorium-mst-production,latency=10 (current value: {{ $value }})' + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadlatencyerrorbudgetburning3d + expr: | + ( + latencytarget:up_custom_query_duration_seconds:rate1d{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",latency="10"} > (3*0.100000) + and + latencytarget:up_custom_query_duration_seconds:rate2h{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",latency="10"} > (3*0.100000) + ) + or + ( + latencytarget:up_custom_query_duration_seconds:rate3d{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",latency="10"} > (0.100000) + and + latencytarget:up_custom_query_duration_seconds:rate6h{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",latency="10"} > (0.100000) + ) + labels: + latency: "10" + namespace: observatorium-mst-production + query: query-path-sli-1M-samples + service: telemeter + severity: medium + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",le="10",code!~"5.."}[5m])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-1M-samples",namespace="observatorium-mst-production"}[5m])) + ) + labels: + latency: "10" + namespace: observatorium-mst-production + query: query-path-sli-1M-samples + record: latencytarget:up_custom_query_duration_seconds:rate5m + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",le="10",code!~"5.."}[30m])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-1M-samples",namespace="observatorium-mst-production"}[30m])) + ) + labels: + latency: "10" + namespace: observatorium-mst-production + query: query-path-sli-1M-samples + record: latencytarget:up_custom_query_duration_seconds:rate30m + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",le="10",code!~"5.."}[1h])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-1M-samples",namespace="observatorium-mst-production"}[1h])) + ) + labels: + latency: "10" + namespace: observatorium-mst-production + query: query-path-sli-1M-samples + record: latencytarget:up_custom_query_duration_seconds:rate1h + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",le="10",code!~"5.."}[2h])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-1M-samples",namespace="observatorium-mst-production"}[2h])) + ) + labels: + latency: "10" + namespace: observatorium-mst-production + query: query-path-sli-1M-samples + record: latencytarget:up_custom_query_duration_seconds:rate2h + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",le="10",code!~"5.."}[6h])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-1M-samples",namespace="observatorium-mst-production"}[6h])) + ) + labels: + latency: "10" + namespace: observatorium-mst-production + query: query-path-sli-1M-samples + record: latencytarget:up_custom_query_duration_seconds:rate6h + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",le="10",code!~"5.."}[1d])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-1M-samples",namespace="observatorium-mst-production"}[1d])) + ) + labels: + latency: "10" + namespace: observatorium-mst-production + query: query-path-sli-1M-samples + record: latencytarget:up_custom_query_duration_seconds:rate1d + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-1M-samples",namespace="observatorium-mst-production",le="10",code!~"5.."}[3d])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-1M-samples",namespace="observatorium-mst-production"}[3d])) + ) + labels: + latency: "10" + namespace: observatorium-mst-production + query: query-path-sli-1M-samples + record: latencytarget:up_custom_query_duration_seconds:rate3d + - alert: APIMetricsReadLatencyErrorBudgetBurning1h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: 'High requests latency budget burn for query=query-path-sli-10M-samples,namespace=observatorium-mst-production,latency=30 (current value: {{ $value }})' + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadlatencyerrorbudgetburning1h + expr: | + ( + latencytarget:up_custom_query_duration_seconds:rate1h{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",latency="30"} > (14.4*0.100000) + and + latencytarget:up_custom_query_duration_seconds:rate5m{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",latency="30"} > (14.4*0.100000) + ) + or + ( + latencytarget:up_custom_query_duration_seconds:rate6h{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",latency="30"} > (6*0.100000) + and + latencytarget:up_custom_query_duration_seconds:rate30m{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",latency="30"} > (6*0.100000) + ) + labels: + latency: "30" + namespace: observatorium-mst-production + query: query-path-sli-10M-samples + service: telemeter + severity: critical + - alert: APIMetricsReadLatencyErrorBudgetBurning3d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: 'High requests latency budget burn for query=query-path-sli-10M-samples,namespace=observatorium-mst-production,latency=30 (current value: {{ $value }})' + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadlatencyerrorbudgetburning3d + expr: | + ( + latencytarget:up_custom_query_duration_seconds:rate1d{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",latency="30"} > (3*0.100000) + and + latencytarget:up_custom_query_duration_seconds:rate2h{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",latency="30"} > (3*0.100000) + ) + or + ( + latencytarget:up_custom_query_duration_seconds:rate3d{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",latency="30"} > (0.100000) + and + latencytarget:up_custom_query_duration_seconds:rate6h{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",latency="30"} > (0.100000) + ) + labels: + latency: "30" + namespace: observatorium-mst-production + query: query-path-sli-10M-samples + service: telemeter + severity: medium + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",le="30",code!~"5.."}[5m])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-10M-samples",namespace="observatorium-mst-production"}[5m])) + ) + labels: + latency: "30" + namespace: observatorium-mst-production + query: query-path-sli-10M-samples + record: latencytarget:up_custom_query_duration_seconds:rate5m + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",le="30",code!~"5.."}[30m])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-10M-samples",namespace="observatorium-mst-production"}[30m])) + ) + labels: + latency: "30" + namespace: observatorium-mst-production + query: query-path-sli-10M-samples + record: latencytarget:up_custom_query_duration_seconds:rate30m + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",le="30",code!~"5.."}[1h])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-10M-samples",namespace="observatorium-mst-production"}[1h])) + ) + labels: + latency: "30" + namespace: observatorium-mst-production + query: query-path-sli-10M-samples + record: latencytarget:up_custom_query_duration_seconds:rate1h + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",le="30",code!~"5.."}[2h])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-10M-samples",namespace="observatorium-mst-production"}[2h])) + ) + labels: + latency: "30" + namespace: observatorium-mst-production + query: query-path-sli-10M-samples + record: latencytarget:up_custom_query_duration_seconds:rate2h + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",le="30",code!~"5.."}[6h])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-10M-samples",namespace="observatorium-mst-production"}[6h])) + ) + labels: + latency: "30" + namespace: observatorium-mst-production + query: query-path-sli-10M-samples + record: latencytarget:up_custom_query_duration_seconds:rate6h + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",le="30",code!~"5.."}[1d])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-10M-samples",namespace="observatorium-mst-production"}[1d])) + ) + labels: + latency: "30" + namespace: observatorium-mst-production + query: query-path-sli-10M-samples + record: latencytarget:up_custom_query_duration_seconds:rate1d + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-10M-samples",namespace="observatorium-mst-production",le="30",code!~"5.."}[3d])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-10M-samples",namespace="observatorium-mst-production"}[3d])) + ) + labels: + latency: "30" + namespace: observatorium-mst-production + query: query-path-sli-10M-samples + record: latencytarget:up_custom_query_duration_seconds:rate3d + - alert: APIMetricsReadLatencyErrorBudgetBurning1h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: 'High requests latency budget burn for query=query-path-sli-100M-samples,namespace=observatorium-mst-production,latency=120 (current value: {{ $value }})' + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadlatencyerrorbudgetburning1h + expr: | + ( + latencytarget:up_custom_query_duration_seconds:rate1h{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",latency="120"} > (14.4*0.100000) + and + latencytarget:up_custom_query_duration_seconds:rate5m{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",latency="120"} > (14.4*0.100000) + ) + or + ( + latencytarget:up_custom_query_duration_seconds:rate6h{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",latency="120"} > (6*0.100000) + and + latencytarget:up_custom_query_duration_seconds:rate30m{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",latency="120"} > (6*0.100000) + ) + labels: + latency: "120" + namespace: observatorium-mst-production + query: query-path-sli-100M-samples + service: telemeter + severity: critical + - alert: APIMetricsReadLatencyErrorBudgetBurning3d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: 'High requests latency budget burn for query=query-path-sli-100M-samples,namespace=observatorium-mst-production,latency=120 (current value: {{ $value }})' + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadlatencyerrorbudgetburning3d + expr: | + ( + latencytarget:up_custom_query_duration_seconds:rate1d{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",latency="120"} > (3*0.100000) + and + latencytarget:up_custom_query_duration_seconds:rate2h{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",latency="120"} > (3*0.100000) + ) + or + ( + latencytarget:up_custom_query_duration_seconds:rate3d{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",latency="120"} > (0.100000) + and + latencytarget:up_custom_query_duration_seconds:rate6h{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",latency="120"} > (0.100000) + ) + labels: + latency: "120" + namespace: observatorium-mst-production + query: query-path-sli-100M-samples + service: telemeter + severity: medium + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",le="120",code!~"5.."}[5m])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-100M-samples",namespace="observatorium-mst-production"}[5m])) + ) + labels: + latency: "120" + namespace: observatorium-mst-production + query: query-path-sli-100M-samples + record: latencytarget:up_custom_query_duration_seconds:rate5m + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",le="120",code!~"5.."}[30m])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-100M-samples",namespace="observatorium-mst-production"}[30m])) + ) + labels: + latency: "120" + namespace: observatorium-mst-production + query: query-path-sli-100M-samples + record: latencytarget:up_custom_query_duration_seconds:rate30m + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",le="120",code!~"5.."}[1h])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-100M-samples",namespace="observatorium-mst-production"}[1h])) + ) + labels: + latency: "120" + namespace: observatorium-mst-production + query: query-path-sli-100M-samples + record: latencytarget:up_custom_query_duration_seconds:rate1h + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",le="120",code!~"5.."}[2h])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-100M-samples",namespace="observatorium-mst-production"}[2h])) + ) + labels: + latency: "120" + namespace: observatorium-mst-production + query: query-path-sli-100M-samples + record: latencytarget:up_custom_query_duration_seconds:rate2h + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",le="120",code!~"5.."}[6h])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-100M-samples",namespace="observatorium-mst-production"}[6h])) + ) + labels: + latency: "120" + namespace: observatorium-mst-production + query: query-path-sli-100M-samples + record: latencytarget:up_custom_query_duration_seconds:rate6h + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",le="120",code!~"5.."}[1d])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-100M-samples",namespace="observatorium-mst-production"}[1d])) + ) + labels: + latency: "120" + namespace: observatorium-mst-production + query: query-path-sli-100M-samples + record: latencytarget:up_custom_query_duration_seconds:rate1d + - expr: | + 1 - ( + sum(rate(up_custom_query_duration_seconds_bucket{query="query-path-sli-100M-samples",namespace="observatorium-mst-production",le="120",code!~"5.."}[3d])) + / + sum(rate(up_custom_query_duration_seconds_count{query="query-path-sli-100M-samples",namespace="observatorium-mst-production"}[3d])) + ) + labels: + latency: "120" + namespace: observatorium-mst-production + query: query-path-sli-100M-samples + record: latencytarget:up_custom_query_duration_seconds:rate3d + - name: rhobs-rhobsp02ue1-api-rules-raw-write-availability.slo + rules: + - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning5mand1h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawwriteavailabilityerrorbudgetburning5mand1h + expr: | + sum(http_requests_total:burnrate5m{job="observatorium-observatorium-mst-api",handler="rules-raw",method="PUT"}) > (14.40 * (1-0.95000)) + and + sum(http_requests_total:burnrate1h{job="observatorium-observatorium-mst-api",handler="rules-raw",method="PUT"}) > (14.40 * (1-0.95000)) + for: 2m + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + method: PUT + service: telemeter + severity: critical + - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning30mand6h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawwriteavailabilityerrorbudgetburning30mand6h + expr: | + sum(http_requests_total:burnrate30m{job="observatorium-observatorium-mst-api",handler="rules-raw",method="PUT"}) > (6.00 * (1-0.95000)) + and + sum(http_requests_total:burnrate6h{job="observatorium-observatorium-mst-api",handler="rules-raw",method="PUT"}) > (6.00 * (1-0.95000)) + for: 15m + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + method: PUT + service: telemeter + severity: critical + - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning2hand1d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawwriteavailabilityerrorbudgetburning2hand1d + expr: | + sum(http_requests_total:burnrate2h{job="observatorium-observatorium-mst-api",handler="rules-raw",method="PUT"}) > (3.00 * (1-0.95000)) + and + sum(http_requests_total:burnrate1d{job="observatorium-observatorium-mst-api",handler="rules-raw",method="PUT"}) > (3.00 * (1-0.95000)) + for: 1h + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + method: PUT + service: telemeter + severity: medium + - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning6hand3d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawwriteavailabilityerrorbudgetburning6hand3d + expr: | + sum(http_requests_total:burnrate6h{job="observatorium-observatorium-mst-api",handler="rules-raw",method="PUT"}) > (1.00 * (1-0.95000)) + and + sum(http_requests_total:burnrate3d{job="observatorium-observatorium-mst-api",handler="rules-raw",method="PUT"}) > (1.00 * (1-0.95000)) + for: 3h + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + method: PUT + service: telemeter + severity: medium + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",method="PUT",code!~"^4..$",code=~"5.+"}[1d])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",method="PUT",code!~"^4..$"}[1d])) + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + method: PUT + record: http_requests_total:burnrate1d + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",method="PUT",code!~"^4..$",code=~"5.+"}[1h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",method="PUT",code!~"^4..$"}[1h])) + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + method: PUT + record: http_requests_total:burnrate1h + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",method="PUT",code!~"^4..$",code=~"5.+"}[2h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",method="PUT",code!~"^4..$"}[2h])) + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + method: PUT + record: http_requests_total:burnrate2h + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",method="PUT",code!~"^4..$",code=~"5.+"}[30m])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",method="PUT",code!~"^4..$"}[30m])) + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + method: PUT + record: http_requests_total:burnrate30m + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",method="PUT",code!~"^4..$",code=~"5.+"}[3d])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",method="PUT",code!~"^4..$"}[3d])) + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + method: PUT + record: http_requests_total:burnrate3d + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",method="PUT",code!~"^4..$",code=~"5.+"}[5m])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",method="PUT",code!~"^4..$"}[5m])) + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + method: PUT + record: http_requests_total:burnrate5m + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",method="PUT",code!~"^4..$",code=~"5.+"}[6h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",method="PUT",code!~"^4..$"}[6h])) + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + method: PUT + record: http_requests_total:burnrate6h + - name: rhobs-rhobsp02ue1-api-rules-sync-availability.slo + rules: + - alert: APIRulesSyncAvailabilityErrorBudgetBurning5mand1h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-sync-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /reload endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulessyncavailabilityerrorbudgetburning5mand1h + expr: | + sum(client_api_requests_total:burnrate5m{namespace="observatorium-mst-production"}) > (14.40 * (1-0.95000)) + and + sum(client_api_requests_total:burnrate1h{namespace="observatorium-mst-production"}) > (14.40 * (1-0.95000)) + for: 2m + labels: + namespace: observatorium-mst-production + service: telemeter + severity: critical + - alert: APIRulesSyncAvailabilityErrorBudgetBurning30mand6h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-sync-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /reload endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulessyncavailabilityerrorbudgetburning30mand6h + expr: | + sum(client_api_requests_total:burnrate30m{namespace="observatorium-mst-production"}) > (6.00 * (1-0.95000)) + and + sum(client_api_requests_total:burnrate6h{namespace="observatorium-mst-production"}) > (6.00 * (1-0.95000)) + for: 15m + labels: + namespace: observatorium-mst-production + service: telemeter + severity: critical + - alert: APIRulesSyncAvailabilityErrorBudgetBurning2hand1d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-sync-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /reload endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulessyncavailabilityerrorbudgetburning2hand1d + expr: | + sum(client_api_requests_total:burnrate2h{namespace="observatorium-mst-production"}) > (3.00 * (1-0.95000)) + and + sum(client_api_requests_total:burnrate1d{namespace="observatorium-mst-production"}) > (3.00 * (1-0.95000)) + for: 1h + labels: + namespace: observatorium-mst-production + service: telemeter + severity: medium + - alert: APIRulesSyncAvailabilityErrorBudgetBurning6hand3d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-sync-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /reload endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulessyncavailabilityerrorbudgetburning6hand3d + expr: | + sum(client_api_requests_total:burnrate6h{namespace="observatorium-mst-production"}) > (1.00 * (1-0.95000)) + and + sum(client_api_requests_total:burnrate3d{namespace="observatorium-mst-production"}) > (1.00 * (1-0.95000)) + for: 3h + labels: + namespace: observatorium-mst-production + service: telemeter + severity: medium + - expr: | + sum(rate(client_api_requests_total{client="reload",container="thanos-rule-syncer",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.+"}[1d])) + / + sum(rate(client_api_requests_total{client="reload",container="thanos-rule-syncer",namespace="observatorium-mst-production",code!~"^4..$"}[1d])) + labels: + namespace: observatorium-mst-production + record: client_api_requests_total:burnrate1d + - expr: | + sum(rate(client_api_requests_total{client="reload",container="thanos-rule-syncer",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.+"}[1h])) + / + sum(rate(client_api_requests_total{client="reload",container="thanos-rule-syncer",namespace="observatorium-mst-production",code!~"^4..$"}[1h])) + labels: + namespace: observatorium-mst-production + record: client_api_requests_total:burnrate1h + - expr: | + sum(rate(client_api_requests_total{client="reload",container="thanos-rule-syncer",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.+"}[2h])) + / + sum(rate(client_api_requests_total{client="reload",container="thanos-rule-syncer",namespace="observatorium-mst-production",code!~"^4..$"}[2h])) + labels: + namespace: observatorium-mst-production + record: client_api_requests_total:burnrate2h + - expr: | + sum(rate(client_api_requests_total{client="reload",container="thanos-rule-syncer",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.+"}[30m])) + / + sum(rate(client_api_requests_total{client="reload",container="thanos-rule-syncer",namespace="observatorium-mst-production",code!~"^4..$"}[30m])) + labels: + namespace: observatorium-mst-production + record: client_api_requests_total:burnrate30m + - expr: | + sum(rate(client_api_requests_total{client="reload",container="thanos-rule-syncer",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.+"}[3d])) + / + sum(rate(client_api_requests_total{client="reload",container="thanos-rule-syncer",namespace="observatorium-mst-production",code!~"^4..$"}[3d])) + labels: + namespace: observatorium-mst-production + record: client_api_requests_total:burnrate3d + - expr: | + sum(rate(client_api_requests_total{client="reload",container="thanos-rule-syncer",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.+"}[5m])) + / + sum(rate(client_api_requests_total{client="reload",container="thanos-rule-syncer",namespace="observatorium-mst-production",code!~"^4..$"}[5m])) + labels: + namespace: observatorium-mst-production + record: client_api_requests_total:burnrate5m + - expr: | + sum(rate(client_api_requests_total{client="reload",container="thanos-rule-syncer",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.+"}[6h])) + / + sum(rate(client_api_requests_total{client="reload",container="thanos-rule-syncer",namespace="observatorium-mst-production",code!~"^4..$"}[6h])) + labels: + namespace: observatorium-mst-production + record: client_api_requests_total:burnrate6h + - name: rhobs-rhobsp02ue1-api-rules-read-availability.slo + rules: + - alert: APIRulesReadAvailabilityErrorBudgetBurning5mand1h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /rules endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesreadavailabilityerrorbudgetburning5mand1h + expr: | + sum(http_requests_total:burnrate5m{job="observatorium-observatorium-mst-api",handler="rules"}) > (14.40 * (1-0.90000)) + and + sum(http_requests_total:burnrate1h{job="observatorium-observatorium-mst-api",handler="rules"}) > (14.40 * (1-0.90000)) + for: 2m + labels: + handler: rules + job: observatorium-observatorium-mst-api + service: telemeter + severity: critical + - alert: APIRulesReadAvailabilityErrorBudgetBurning30mand6h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /rules endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesreadavailabilityerrorbudgetburning30mand6h + expr: | + sum(http_requests_total:burnrate30m{job="observatorium-observatorium-mst-api",handler="rules"}) > (6.00 * (1-0.90000)) + and + sum(http_requests_total:burnrate6h{job="observatorium-observatorium-mst-api",handler="rules"}) > (6.00 * (1-0.90000)) + for: 15m + labels: + handler: rules + job: observatorium-observatorium-mst-api + service: telemeter + severity: critical + - alert: APIRulesReadAvailabilityErrorBudgetBurning2hand1d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /rules endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesreadavailabilityerrorbudgetburning2hand1d + expr: | + sum(http_requests_total:burnrate2h{job="observatorium-observatorium-mst-api",handler="rules"}) > (3.00 * (1-0.90000)) + and + sum(http_requests_total:burnrate1d{job="observatorium-observatorium-mst-api",handler="rules"}) > (3.00 * (1-0.90000)) + for: 1h + labels: + handler: rules + job: observatorium-observatorium-mst-api + service: telemeter + severity: medium + - alert: APIRulesReadAvailabilityErrorBudgetBurning6hand3d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /rules endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesreadavailabilityerrorbudgetburning6hand3d + expr: | + sum(http_requests_total:burnrate6h{job="observatorium-observatorium-mst-api",handler="rules"}) > (1.00 * (1-0.90000)) + and + sum(http_requests_total:burnrate3d{job="observatorium-observatorium-mst-api",handler="rules"}) > (1.00 * (1-0.90000)) + for: 3h + labels: + handler: rules + job: observatorium-observatorium-mst-api + service: telemeter + severity: medium + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules",code!~"^4..$",code=~"5.+"}[1d])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules",code!~"^4..$"}[1d])) + labels: + handler: rules + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate1d + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules",code!~"^4..$",code=~"5.+"}[1h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules",code!~"^4..$"}[1h])) + labels: + handler: rules + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate1h + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules",code!~"^4..$",code=~"5.+"}[2h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules",code!~"^4..$"}[2h])) + labels: + handler: rules + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate2h + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules",code!~"^4..$",code=~"5.+"}[30m])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules",code!~"^4..$"}[30m])) + labels: + handler: rules + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate30m + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules",code!~"^4..$",code=~"5.+"}[3d])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules",code!~"^4..$"}[3d])) + labels: + handler: rules + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate3d + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules",code!~"^4..$",code=~"5.+"}[5m])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules",code!~"^4..$"}[5m])) + labels: + handler: rules + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate5m + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules",code!~"^4..$",code=~"5.+"}[6h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules",code!~"^4..$"}[6h])) + labels: + handler: rules + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate6h + - name: rhobs-rhobsp02ue1-api-rules-raw-read-availability.slo + rules: + - alert: APIRulesRawReadAvailabilityErrorBudgetBurning5mand1h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawreadavailabilityerrorbudgetburning5mand1h + expr: | + sum(http_requests_total:burnrate5m{job="observatorium-observatorium-mst-api",handler="rules-raw"}) > (14.40 * (1-0.90000)) + and + sum(http_requests_total:burnrate1h{job="observatorium-observatorium-mst-api",handler="rules-raw"}) > (14.40 * (1-0.90000)) + for: 2m + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + service: telemeter + severity: critical + - alert: APIRulesRawReadAvailabilityErrorBudgetBurning30mand6h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawreadavailabilityerrorbudgetburning30mand6h + expr: | + sum(http_requests_total:burnrate30m{job="observatorium-observatorium-mst-api",handler="rules-raw"}) > (6.00 * (1-0.90000)) + and + sum(http_requests_total:burnrate6h{job="observatorium-observatorium-mst-api",handler="rules-raw"}) > (6.00 * (1-0.90000)) + for: 15m + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + service: telemeter + severity: critical + - alert: APIRulesRawReadAvailabilityErrorBudgetBurning2hand1d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawreadavailabilityerrorbudgetburning2hand1d + expr: | + sum(http_requests_total:burnrate2h{job="observatorium-observatorium-mst-api",handler="rules-raw"}) > (3.00 * (1-0.90000)) + and + sum(http_requests_total:burnrate1d{job="observatorium-observatorium-mst-api",handler="rules-raw"}) > (3.00 * (1-0.90000)) + for: 1h + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + service: telemeter + severity: medium + - alert: APIRulesRawReadAvailabilityErrorBudgetBurning6hand3d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawreadavailabilityerrorbudgetburning6hand3d + expr: | + sum(http_requests_total:burnrate6h{job="observatorium-observatorium-mst-api",handler="rules-raw"}) > (1.00 * (1-0.90000)) + and + sum(http_requests_total:burnrate3d{job="observatorium-observatorium-mst-api",handler="rules-raw"}) > (1.00 * (1-0.90000)) + for: 3h + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + service: telemeter + severity: medium + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",code!~"^4..$",code=~"5.+"}[1d])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",code!~"^4..$"}[1d])) + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate1d + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",code!~"^4..$",code=~"5.+"}[1h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",code!~"^4..$"}[1h])) + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate1h + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",code!~"^4..$",code=~"5.+"}[2h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",code!~"^4..$"}[2h])) + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate2h + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",code!~"^4..$",code=~"5.+"}[30m])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",code!~"^4..$"}[30m])) + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate30m + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",code!~"^4..$",code=~"5.+"}[3d])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",code!~"^4..$"}[3d])) + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate3d + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",code!~"^4..$",code=~"5.+"}[5m])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",code!~"^4..$"}[5m])) + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate5m + - expr: | + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",code!~"^4..$",code=~"5.+"}[6h])) + / + sum(rate(http_requests_total{job="observatorium-observatorium-mst-api",group="metricsv1",handler="rules-raw",code!~"^4..$"}[6h])) + labels: + handler: rules-raw + job: observatorium-observatorium-mst-api + record: http_requests_total:burnrate6h + - name: rhobs-rhobsp02ue1-api-alerting-availability.slo + rules: + - alert: APIAlertmanagerAvailabilityErrorBudgetBurning5mand1h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API Thanos Rule failing to send alerts to Alertmanager and is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanageravailabilityerrorbudgetburning5mand1h + expr: | + sum(thanos_alert_sender_alerts_dropped_total:burnrate5m{namespace="observatorium-mst-production"}) > (14.40 * (1-0.95000)) + and + sum(thanos_alert_sender_alerts_dropped_total:burnrate1h{namespace="observatorium-mst-production"}) > (14.40 * (1-0.95000)) + for: 2m + labels: + namespace: observatorium-mst-production + service: telemeter + severity: critical + - alert: APIAlertmanagerAvailabilityErrorBudgetBurning30mand6h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API Thanos Rule failing to send alerts to Alertmanager and is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanageravailabilityerrorbudgetburning30mand6h + expr: | + sum(thanos_alert_sender_alerts_dropped_total:burnrate30m{namespace="observatorium-mst-production"}) > (6.00 * (1-0.95000)) + and + sum(thanos_alert_sender_alerts_dropped_total:burnrate6h{namespace="observatorium-mst-production"}) > (6.00 * (1-0.95000)) + for: 15m + labels: + namespace: observatorium-mst-production + service: telemeter + severity: critical + - alert: APIAlertmanagerAvailabilityErrorBudgetBurning2hand1d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API Thanos Rule failing to send alerts to Alertmanager and is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanageravailabilityerrorbudgetburning2hand1d + expr: | + sum(thanos_alert_sender_alerts_dropped_total:burnrate2h{namespace="observatorium-mst-production"}) > (3.00 * (1-0.95000)) + and + sum(thanos_alert_sender_alerts_dropped_total:burnrate1d{namespace="observatorium-mst-production"}) > (3.00 * (1-0.95000)) + for: 1h + labels: + namespace: observatorium-mst-production + service: telemeter + severity: medium + - alert: APIAlertmanagerAvailabilityErrorBudgetBurning6hand3d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API Thanos Rule failing to send alerts to Alertmanager and is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanageravailabilityerrorbudgetburning6hand3d + expr: | + sum(thanos_alert_sender_alerts_dropped_total:burnrate6h{namespace="observatorium-mst-production"}) > (1.00 * (1-0.95000)) + and + sum(thanos_alert_sender_alerts_dropped_total:burnrate3d{namespace="observatorium-mst-production"}) > (1.00 * (1-0.95000)) + for: 3h + labels: + namespace: observatorium-mst-production + service: telemeter + severity: medium + - expr: | + sum(rate(thanos_alert_sender_alerts_dropped_total{container="thanos-rule",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.."}[1d])) + / + sum(rate(thanos_alert_sender_alerts_dropped_total{container="thanos-rule",namespace="observatorium-mst-production",code!~"^4..$"}[1d])) + labels: + namespace: observatorium-mst-production + record: thanos_alert_sender_alerts_dropped_total:burnrate1d + - expr: | + sum(rate(thanos_alert_sender_alerts_dropped_total{container="thanos-rule",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.."}[1h])) + / + sum(rate(thanos_alert_sender_alerts_dropped_total{container="thanos-rule",namespace="observatorium-mst-production",code!~"^4..$"}[1h])) + labels: + namespace: observatorium-mst-production + record: thanos_alert_sender_alerts_dropped_total:burnrate1h + - expr: | + sum(rate(thanos_alert_sender_alerts_dropped_total{container="thanos-rule",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.."}[2h])) + / + sum(rate(thanos_alert_sender_alerts_dropped_total{container="thanos-rule",namespace="observatorium-mst-production",code!~"^4..$"}[2h])) + labels: + namespace: observatorium-mst-production + record: thanos_alert_sender_alerts_dropped_total:burnrate2h + - expr: | + sum(rate(thanos_alert_sender_alerts_dropped_total{container="thanos-rule",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.."}[30m])) + / + sum(rate(thanos_alert_sender_alerts_dropped_total{container="thanos-rule",namespace="observatorium-mst-production",code!~"^4..$"}[30m])) + labels: + namespace: observatorium-mst-production + record: thanos_alert_sender_alerts_dropped_total:burnrate30m + - expr: | + sum(rate(thanos_alert_sender_alerts_dropped_total{container="thanos-rule",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.."}[3d])) + / + sum(rate(thanos_alert_sender_alerts_dropped_total{container="thanos-rule",namespace="observatorium-mst-production",code!~"^4..$"}[3d])) + labels: + namespace: observatorium-mst-production + record: thanos_alert_sender_alerts_dropped_total:burnrate3d + - expr: | + sum(rate(thanos_alert_sender_alerts_dropped_total{container="thanos-rule",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.."}[5m])) + / + sum(rate(thanos_alert_sender_alerts_dropped_total{container="thanos-rule",namespace="observatorium-mst-production",code!~"^4..$"}[5m])) + labels: + namespace: observatorium-mst-production + record: thanos_alert_sender_alerts_dropped_total:burnrate5m + - expr: | + sum(rate(thanos_alert_sender_alerts_dropped_total{container="thanos-rule",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.."}[6h])) + / + sum(rate(thanos_alert_sender_alerts_dropped_total{container="thanos-rule",namespace="observatorium-mst-production",code!~"^4..$"}[6h])) + labels: + namespace: observatorium-mst-production + record: thanos_alert_sender_alerts_dropped_total:burnrate6h + - alert: APIAlertmanagerNotificationsAvailabilityErrorBudgetBurning5mand1h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API Alertmanager failing to deliver alerts to upstream targets and is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanagernotificationsavailabilityerrorbudgetburning5mand1h + expr: | + sum(alertmanager_notifications_failed_total:burnrate5m{service="observatorium-alertmanager",namespace="observatorium-mst-production"}) > (14.40 * (1-0.95000)) + and + sum(alertmanager_notifications_failed_total:burnrate1h{service="observatorium-alertmanager",namespace="observatorium-mst-production"}) > (14.40 * (1-0.95000)) + for: 2m + labels: + namespace: observatorium-mst-production + service: telemeter + severity: critical + - alert: APIAlertmanagerNotificationsAvailabilityErrorBudgetBurning30mand6h + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API Alertmanager failing to deliver alerts to upstream targets and is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanagernotificationsavailabilityerrorbudgetburning30mand6h + expr: | + sum(alertmanager_notifications_failed_total:burnrate30m{service="observatorium-alertmanager",namespace="observatorium-mst-production"}) > (6.00 * (1-0.95000)) + and + sum(alertmanager_notifications_failed_total:burnrate6h{service="observatorium-alertmanager",namespace="observatorium-mst-production"}) > (6.00 * (1-0.95000)) + for: 15m + labels: + namespace: observatorium-mst-production + service: telemeter + severity: critical + - alert: APIAlertmanagerNotificationsAvailabilityErrorBudgetBurning2hand1d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API Alertmanager failing to deliver alerts to upstream targets and is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanagernotificationsavailabilityerrorbudgetburning2hand1d + expr: | + sum(alertmanager_notifications_failed_total:burnrate2h{service="observatorium-alertmanager",namespace="observatorium-mst-production"}) > (3.00 * (1-0.95000)) + and + sum(alertmanager_notifications_failed_total:burnrate1d{service="observatorium-alertmanager",namespace="observatorium-mst-production"}) > (3.00 * (1-0.95000)) + for: 1h + labels: + namespace: observatorium-mst-production + service: telemeter + severity: medium + - alert: APIAlertmanagerNotificationsAvailabilityErrorBudgetBurning6hand3d + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + message: API Alertmanager failing to deliver alerts to upstream targets and is burning too much error budget to guarantee availability SLOs + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanagernotificationsavailabilityerrorbudgetburning6hand3d + expr: | + sum(alertmanager_notifications_failed_total:burnrate6h{service="observatorium-alertmanager",namespace="observatorium-mst-production"}) > (1.00 * (1-0.95000)) + and + sum(alertmanager_notifications_failed_total:burnrate3d{service="observatorium-alertmanager",namespace="observatorium-mst-production"}) > (1.00 * (1-0.95000)) + for: 3h + labels: + namespace: observatorium-mst-production + service: telemeter + severity: medium + - expr: | + sum(rate(alertmanager_notifications_failed_total{service="observatorium-alertmanager",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.."}[1d])) + / + sum(rate(alertmanager_notifications_failed_total{service="observatorium-alertmanager",namespace="observatorium-mst-production",code!~"^4..$"}[1d])) + labels: + namespace: observatorium-mst-production + service: observatorium-alertmanager + record: alertmanager_notifications_failed_total:burnrate1d + - expr: | + sum(rate(alertmanager_notifications_failed_total{service="observatorium-alertmanager",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.."}[1h])) + / + sum(rate(alertmanager_notifications_failed_total{service="observatorium-alertmanager",namespace="observatorium-mst-production",code!~"^4..$"}[1h])) + labels: + namespace: observatorium-mst-production + service: observatorium-alertmanager + record: alertmanager_notifications_failed_total:burnrate1h + - expr: | + sum(rate(alertmanager_notifications_failed_total{service="observatorium-alertmanager",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.."}[2h])) + / + sum(rate(alertmanager_notifications_failed_total{service="observatorium-alertmanager",namespace="observatorium-mst-production",code!~"^4..$"}[2h])) + labels: + namespace: observatorium-mst-production + service: observatorium-alertmanager + record: alertmanager_notifications_failed_total:burnrate2h + - expr: | + sum(rate(alertmanager_notifications_failed_total{service="observatorium-alertmanager",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.."}[30m])) + / + sum(rate(alertmanager_notifications_failed_total{service="observatorium-alertmanager",namespace="observatorium-mst-production",code!~"^4..$"}[30m])) + labels: + namespace: observatorium-mst-production + service: observatorium-alertmanager + record: alertmanager_notifications_failed_total:burnrate30m + - expr: | + sum(rate(alertmanager_notifications_failed_total{service="observatorium-alertmanager",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.."}[3d])) + / + sum(rate(alertmanager_notifications_failed_total{service="observatorium-alertmanager",namespace="observatorium-mst-production",code!~"^4..$"}[3d])) + labels: + namespace: observatorium-mst-production + service: observatorium-alertmanager + record: alertmanager_notifications_failed_total:burnrate3d + - expr: | + sum(rate(alertmanager_notifications_failed_total{service="observatorium-alertmanager",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.."}[5m])) + / + sum(rate(alertmanager_notifications_failed_total{service="observatorium-alertmanager",namespace="observatorium-mst-production",code!~"^4..$"}[5m])) + labels: + namespace: observatorium-mst-production + service: observatorium-alertmanager + record: alertmanager_notifications_failed_total:burnrate5m + - expr: | + sum(rate(alertmanager_notifications_failed_total{service="observatorium-alertmanager",namespace="observatorium-mst-production",code!~"^4..$",code=~"5.."}[6h])) + / + sum(rate(alertmanager_notifications_failed_total{service="observatorium-alertmanager",namespace="observatorium-mst-production",code!~"^4..$"}[6h])) + labels: + namespace: observatorium-mst-production + service: observatorium-alertmanager + record: alertmanager_notifications_failed_total:burnrate6h From ae14cd6dbf6b04e749d47ff1365282517afcf75e Mon Sep 17 00:00:00 2001 From: Saswata Mukherjee Date: Fri, 17 Feb 2023 12:32:32 +0530 Subject: [PATCH 2/2] Correct data source Signed-off-by: Saswata Mukherjee --- observability/prometheusrules.jsonnet | 14 +-- ...hobs-slos-rhobsp02ue1.prometheusrules.yaml | 88 +++++++++---------- 2 files changed, 52 insertions(+), 50 deletions(-) diff --git a/observability/prometheusrules.jsonnet b/observability/prometheusrules.jsonnet index 04086aaa70..15972edc1d 100644 --- a/observability/prometheusrules.jsonnet +++ b/observability/prometheusrules.jsonnet @@ -48,12 +48,14 @@ local absent(name, job) = { // Add dashboards and runbook anntotations. // Overwrite severity to medium and high. local appSREOverwrites(environment) = { - local dashboardDatasource = function(environment) { + local dashboardDatasource = function(name, environment) { datasource: if environment == 'stage' then 'app-sre-stage-01-prometheus' else if - environment == 'production' then 'telemeter-prod-01-prometheus' + !std.startsWith(name, 'rhobs-rhobsp02ue1') && environment == 'production' then 'telemeter-prod-01-prometheus' + else if + std.startsWith(name, 'rhobs-rhobsp02ue1') && environment == 'production' then 'rhobsp02ue1-prometheus' else error 'no datasource for environment %s' % environment, }, @@ -156,7 +158,7 @@ local appSREOverwrites(environment) = { runbook: 'https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#%s' % std.asciiLower(r.alert), dashboard: 'https://grafana.app-sre.devshift.net/d/%s/api-logs?orgId=1&refresh=1m&var-datasource=%s&var-namespace={{$labels.namespace}}' % [ dashboardID('loki', environment).id, - dashboardDatasource(environment).datasource, + dashboardDatasource('loki', environment).datasource, ], } else if std.startsWith(g.name, 'telemeter') then @@ -164,7 +166,7 @@ local appSREOverwrites(environment) = { runbook: 'https://github.com/rhobs/configuration/blob/main/docs/sop/telemeter.md#%s' % std.asciiLower(r.alert), dashboard: 'https://grafana.app-sre.devshift.net/d/%s/telemeter?orgId=1&refresh=1m&var-datasource=%s' % [ dashboardID(g.name, environment).id, - dashboardDatasource(environment).datasource, + dashboardDatasource(g.name, environment).datasource, ], } else if std.startsWith(g.name, 'loki_tenant') then @@ -173,7 +175,7 @@ local appSREOverwrites(environment) = { dashboard: 'https://grafana.app-sre.devshift.net/d/%s/%s?orgId=1&refresh=10s&var-metrics=%s&var-namespace={{$labels.namespace}}' % [ dashboardID(g.name, environment).id, g.name, - dashboardDatasource(environment).datasource, + dashboardDatasource(g.name, environment).datasource, ], } else @@ -182,7 +184,7 @@ local appSREOverwrites(environment) = { dashboard: 'https://grafana.app-sre.devshift.net/d/%s/%s?orgId=1&refresh=10s&var-datasource=%s&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m' % [ dashboardID(g.name, environment).id, g.name, - dashboardDatasource(environment).datasource, + dashboardDatasource(g.name, environment).datasource, ], }, labels: pruneUnsupportedLabels(r.labels { diff --git a/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1.prometheusrules.yaml b/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1.prometheusrules.yaml index 2f31b27a20..1a4a72b52b 100644 --- a/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1.prometheusrules.yaml +++ b/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1.prometheusrules.yaml @@ -13,7 +13,7 @@ spec: rules: - alert: APIMetricsWriteAvailabilityErrorBudgetBurning5mand1h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /receive handler is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricswriteavailabilityerrorbudgetburning5mand1h expr: | @@ -28,7 +28,7 @@ spec: severity: critical - alert: APIMetricsWriteAvailabilityErrorBudgetBurning30mand6h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /receive handler is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricswriteavailabilityerrorbudgetburning30mand6h expr: | @@ -43,7 +43,7 @@ spec: severity: critical - alert: APIMetricsWriteAvailabilityErrorBudgetBurning2hand1d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /receive handler is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricswriteavailabilityerrorbudgetburning2hand1d expr: | @@ -58,7 +58,7 @@ spec: severity: medium - alert: APIMetricsWriteAvailabilityErrorBudgetBurning6hand3d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /receive handler is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricswriteavailabilityerrorbudgetburning6hand3d expr: | @@ -131,7 +131,7 @@ spec: rules: - alert: APIMetricsWriteLatencyErrorBudgetBurning1h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-latency.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: 'High requests latency budget burn for job=observatorium-observatorium-mst-api,handler=receive,code!~^4..$,latency=5 (current value: {{ $value }})' runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricswritelatencyerrorbudgetburning1h expr: | @@ -154,7 +154,7 @@ spec: severity: critical - alert: APIMetricsWriteLatencyErrorBudgetBurning3d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-write-latency.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: 'High requests latency budget burn for job=observatorium-observatorium-mst-api,handler=receive,code!~^4..$,latency=5 (current value: {{ $value }})' runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricswritelatencyerrorbudgetburning3d expr: | @@ -256,7 +256,7 @@ spec: rules: - alert: APIMetricsReadAvailabilityErrorBudgetBurning5mand1h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /query handler is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning5mand1h expr: | @@ -271,7 +271,7 @@ spec: severity: critical - alert: APIMetricsReadAvailabilityErrorBudgetBurning30mand6h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /query handler is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning30mand6h expr: | @@ -286,7 +286,7 @@ spec: severity: critical - alert: APIMetricsReadAvailabilityErrorBudgetBurning2hand1d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /query handler is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning2hand1d expr: | @@ -301,7 +301,7 @@ spec: severity: medium - alert: APIMetricsReadAvailabilityErrorBudgetBurning6hand3d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /query handler is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning6hand3d expr: | @@ -372,7 +372,7 @@ spec: record: http_requests_total:burnrate6h - alert: APIMetricsReadAvailabilityErrorBudgetBurning5mand1h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /query_range handler is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning5mand1h expr: | @@ -387,7 +387,7 @@ spec: severity: critical - alert: APIMetricsReadAvailabilityErrorBudgetBurning30mand6h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /query_range handler is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning30mand6h expr: | @@ -402,7 +402,7 @@ spec: severity: critical - alert: APIMetricsReadAvailabilityErrorBudgetBurning2hand1d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /query_range handler is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning2hand1d expr: | @@ -417,7 +417,7 @@ spec: severity: medium - alert: APIMetricsReadAvailabilityErrorBudgetBurning6hand3d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /query_range handler is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadavailabilityerrorbudgetburning6hand3d expr: | @@ -490,7 +490,7 @@ spec: rules: - alert: APIMetricsReadLatencyErrorBudgetBurning1h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: 'High requests latency budget burn for query=query-path-sli-1M-samples,namespace=observatorium-mst-production,latency=10 (current value: {{ $value }})' runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadlatencyerrorbudgetburning1h expr: | @@ -513,7 +513,7 @@ spec: severity: critical - alert: APIMetricsReadLatencyErrorBudgetBurning3d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: 'High requests latency budget burn for query=query-path-sli-1M-samples,namespace=observatorium-mst-production,latency=10 (current value: {{ $value }})' runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadlatencyerrorbudgetburning3d expr: | @@ -613,7 +613,7 @@ spec: record: latencytarget:up_custom_query_duration_seconds:rate3d - alert: APIMetricsReadLatencyErrorBudgetBurning1h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: 'High requests latency budget burn for query=query-path-sli-10M-samples,namespace=observatorium-mst-production,latency=30 (current value: {{ $value }})' runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadlatencyerrorbudgetburning1h expr: | @@ -636,7 +636,7 @@ spec: severity: critical - alert: APIMetricsReadLatencyErrorBudgetBurning3d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: 'High requests latency budget burn for query=query-path-sli-10M-samples,namespace=observatorium-mst-production,latency=30 (current value: {{ $value }})' runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadlatencyerrorbudgetburning3d expr: | @@ -736,7 +736,7 @@ spec: record: latencytarget:up_custom_query_duration_seconds:rate3d - alert: APIMetricsReadLatencyErrorBudgetBurning1h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: 'High requests latency budget burn for query=query-path-sli-100M-samples,namespace=observatorium-mst-production,latency=120 (current value: {{ $value }})' runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadlatencyerrorbudgetburning1h expr: | @@ -759,7 +759,7 @@ spec: severity: critical - alert: APIMetricsReadLatencyErrorBudgetBurning3d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-metrics-read-latency.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: 'High requests latency budget burn for query=query-path-sli-100M-samples,namespace=observatorium-mst-production,latency=120 (current value: {{ $value }})' runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apimetricsreadlatencyerrorbudgetburning3d expr: | @@ -861,7 +861,7 @@ spec: rules: - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning5mand1h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-write-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawwriteavailabilityerrorbudgetburning5mand1h expr: | @@ -877,7 +877,7 @@ spec: severity: critical - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning30mand6h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-write-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawwriteavailabilityerrorbudgetburning30mand6h expr: | @@ -893,7 +893,7 @@ spec: severity: critical - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning2hand1d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-write-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawwriteavailabilityerrorbudgetburning2hand1d expr: | @@ -909,7 +909,7 @@ spec: severity: medium - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning6hand3d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-write-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-write-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawwriteavailabilityerrorbudgetburning6hand3d expr: | @@ -990,7 +990,7 @@ spec: rules: - alert: APIRulesSyncAvailabilityErrorBudgetBurning5mand1h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-sync-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-sync-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /reload endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulessyncavailabilityerrorbudgetburning5mand1h expr: | @@ -1004,7 +1004,7 @@ spec: severity: critical - alert: APIRulesSyncAvailabilityErrorBudgetBurning30mand6h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-sync-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-sync-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /reload endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulessyncavailabilityerrorbudgetburning30mand6h expr: | @@ -1018,7 +1018,7 @@ spec: severity: critical - alert: APIRulesSyncAvailabilityErrorBudgetBurning2hand1d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-sync-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-sync-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /reload endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulessyncavailabilityerrorbudgetburning2hand1d expr: | @@ -1032,7 +1032,7 @@ spec: severity: medium - alert: APIRulesSyncAvailabilityErrorBudgetBurning6hand3d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-sync-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-sync-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /reload endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulessyncavailabilityerrorbudgetburning6hand3d expr: | @@ -1097,7 +1097,7 @@ spec: rules: - alert: APIRulesReadAvailabilityErrorBudgetBurning5mand1h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /rules endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesreadavailabilityerrorbudgetburning5mand1h expr: | @@ -1112,7 +1112,7 @@ spec: severity: critical - alert: APIRulesReadAvailabilityErrorBudgetBurning30mand6h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /rules endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesreadavailabilityerrorbudgetburning30mand6h expr: | @@ -1127,7 +1127,7 @@ spec: severity: critical - alert: APIRulesReadAvailabilityErrorBudgetBurning2hand1d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /rules endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesreadavailabilityerrorbudgetburning2hand1d expr: | @@ -1142,7 +1142,7 @@ spec: severity: medium - alert: APIRulesReadAvailabilityErrorBudgetBurning6hand3d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /rules endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesreadavailabilityerrorbudgetburning6hand3d expr: | @@ -1215,7 +1215,7 @@ spec: rules: - alert: APIRulesRawReadAvailabilityErrorBudgetBurning5mand1h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawreadavailabilityerrorbudgetburning5mand1h expr: | @@ -1230,7 +1230,7 @@ spec: severity: critical - alert: APIRulesRawReadAvailabilityErrorBudgetBurning30mand6h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawreadavailabilityerrorbudgetburning30mand6h expr: | @@ -1245,7 +1245,7 @@ spec: severity: critical - alert: APIRulesRawReadAvailabilityErrorBudgetBurning2hand1d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawreadavailabilityerrorbudgetburning2hand1d expr: | @@ -1260,7 +1260,7 @@ spec: severity: medium - alert: APIRulesRawReadAvailabilityErrorBudgetBurning6hand3d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-read-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-rules-raw-read-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API /rules/raw endpoint is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apirulesrawreadavailabilityerrorbudgetburning6hand3d expr: | @@ -1333,7 +1333,7 @@ spec: rules: - alert: APIAlertmanagerAvailabilityErrorBudgetBurning5mand1h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API Thanos Rule failing to send alerts to Alertmanager and is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanageravailabilityerrorbudgetburning5mand1h expr: | @@ -1347,7 +1347,7 @@ spec: severity: critical - alert: APIAlertmanagerAvailabilityErrorBudgetBurning30mand6h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API Thanos Rule failing to send alerts to Alertmanager and is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanageravailabilityerrorbudgetburning30mand6h expr: | @@ -1361,7 +1361,7 @@ spec: severity: critical - alert: APIAlertmanagerAvailabilityErrorBudgetBurning2hand1d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API Thanos Rule failing to send alerts to Alertmanager and is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanageravailabilityerrorbudgetburning2hand1d expr: | @@ -1375,7 +1375,7 @@ spec: severity: medium - alert: APIAlertmanagerAvailabilityErrorBudgetBurning6hand3d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API Thanos Rule failing to send alerts to Alertmanager and is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanageravailabilityerrorbudgetburning6hand3d expr: | @@ -1438,7 +1438,7 @@ spec: record: thanos_alert_sender_alerts_dropped_total:burnrate6h - alert: APIAlertmanagerNotificationsAvailabilityErrorBudgetBurning5mand1h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API Alertmanager failing to deliver alerts to upstream targets and is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanagernotificationsavailabilityerrorbudgetburning5mand1h expr: | @@ -1452,7 +1452,7 @@ spec: severity: critical - alert: APIAlertmanagerNotificationsAvailabilityErrorBudgetBurning30mand6h annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API Alertmanager failing to deliver alerts to upstream targets and is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanagernotificationsavailabilityerrorbudgetburning30mand6h expr: | @@ -1466,7 +1466,7 @@ spec: severity: critical - alert: APIAlertmanagerNotificationsAvailabilityErrorBudgetBurning2hand1d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API Alertmanager failing to deliver alerts to upstream targets and is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanagernotificationsavailabilityerrorbudgetburning2hand1d expr: | @@ -1480,7 +1480,7 @@ spec: severity: medium - alert: APIAlertmanagerNotificationsAvailabilityErrorBudgetBurning6hand3d annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobs-rhobsp02ue1-api-alerting-availability.slo?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m message: API Alertmanager failing to deliver alerts to upstream targets and is burning too much error budget to guarantee availability SLOs runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#apialertmanagernotificationsavailabilityerrorbudgetburning6hand3d expr: |