Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Include query fragments in alerts/template #1176

Merged
merged 2 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/pint/tests/0076_ci_group_errors.txt
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ rules.yml:29-30 Bug: `summary` annotation is required. (alerts/annotation)
29 | annotations:
30 | instance: 'sum on {{ $labels.instance }} is {{ $value }}'

rules.yml:30 Bug: Template is using `instance` label but the query removes it. (alerts/template)
rules.yml:30 Bug: Template is using `instance` label but the query results won't have this label. Query is using aggregation with `by(foo)`, only labels included inside `by(...)` will be present on the results. (alerts/template)
30 | instance: 'sum on {{ $labels.instance }} is {{ $value }}'

rules.yml:32-33 Bug: `link` annotation is required. (alerts/annotation)
Expand Down
6 changes: 3 additions & 3 deletions cmd/pint/tests/0087_dedup.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ level=INFO msg="Finding all rules to check" paths=["rules"]
rules/01.yml:5 Warning: Alert query doesn't have any condition, it will always fire if the metric exists. (alerts/comparison)
5 | expr: sum(up{job="bar"}) / sum(foo) / sum(bar)

rules/01.yml:12 Bug: Template is using `cluster` label but the query removes it. (alerts/template)
rules/01.yml:12 Bug: Template is using `cluster` label but the query results won't have this label. Query is using aggregation that removes all labels. (alerts/template)
12 | summary: "Server {{ $labels.instance }} in cluster {{ $labels.cluster }} has gone down"

rules/01.yml:12 Bug: Template is using `instance` label but the query removes it. (alerts/template)
rules/01.yml:12 Bug: Template is using `instance` label but the query results won't have this label. Query is using aggregation that removes all labels. (alerts/template)
12 | summary: "Server {{ $labels.instance }} in cluster {{ $labels.cluster }} has gone down"

rules/01.yml:13 Bug: Template is using `cluster` label but the query removes it. (alerts/template)
rules/01.yml:13 Bug: Template is using `cluster` label but the query results won't have this label. Query is using aggregation that removes all labels. (alerts/template)
13 | dashboard: "https://grafana.example.com/dashboard?var-cluster={{ $labels.cluster }}&var-instance={{ $labels.cluster }}"

level=INFO msg="Problems found" Bug=3 Warning=1
Expand Down
7 changes: 6 additions & 1 deletion docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,15 @@
}
```

### Changed

- [alerts/template](checks/alerts/template.md) check was refactored and will now produce more accurate results.
Messages produced by this check might include details of the PromQL query fragment causing the problem
if the query is complex enough.

### Fixed

- Don't try to create GitLab comments on unmodified lines - [#1147](https://github.com/cloudflare/pint/pull/1147).
- [alerts/template](checks/alerts/template.md) check was refactored and will now produce more accurate results.

## v0.67.0

Expand Down
2 changes: 1 addition & 1 deletion internal/checks/alerts_absent.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ func (c AlertsAbsentCheck) Check(ctx context.Context, _ discovery.Path, rule par
}

var hasAbsent bool
src := utils.LabelsSource(rule.AlertingRule.Expr.Query)
src := utils.LabelsSource(rule.AlertingRule.Expr.Value.Value, rule.AlertingRule.Expr.Query)
for _, s := range append(src.Alternatives, src) {
if s.Operation == "absent" {
hasAbsent = true
Expand Down
37 changes: 20 additions & 17 deletions internal/checks/alerts_template.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ func (c TemplateCheck) Check(ctx context.Context, _ discovery.Path, rule parser.
return nil
}

src := utils.LabelsSource(rule.AlertingRule.Expr.Query)
src := utils.LabelsSource(rule.AlertingRule.Expr.Value.Value, rule.AlertingRule.Expr.Query)
data := promTemplate.AlertTemplateData(map[string]string{}, map[string]string{}, "", promql.Sample{})

if rule.AlertingRule.Labels != nil {
Expand Down Expand Up @@ -144,7 +144,7 @@ func (c TemplateCheck) Check(ctx context.Context, _ discovery.Path, rule parser.
})
}

for _, problem := range checkQueryLabels(label.Key.Value, label.Value.Value, src) {
for _, problem := range checkQueryLabels(rule.AlertingRule.Expr.Value.Value, label.Key.Value, label.Value.Value, src) {
problems = append(problems, Problem{
Lines: parser.LineRange{
First: label.Key.Lines.First,
Expand Down Expand Up @@ -174,7 +174,7 @@ func (c TemplateCheck) Check(ctx context.Context, _ discovery.Path, rule parser.
})
}

for _, problem := range checkQueryLabels(annotation.Key.Value, annotation.Value.Value, src) {
for _, problem := range checkQueryLabels(rule.AlertingRule.Expr.Value.Value, annotation.Key.Value, annotation.Value.Value, src) {
problems = append(problems, Problem{
Lines: parser.LineRange{
First: annotation.Key.Lines.First,
Expand Down Expand Up @@ -436,7 +436,7 @@ func findTemplateVariables(name, text string) (vars [][]string, aliases aliasMap
return vars, aliases, true
}

func checkQueryLabels(labelName, labelValue string, src utils.Source) (problems []exprProblem) {
func checkQueryLabels(query, labelName, labelValue string, src utils.Source) (problems []exprProblem) {
vars, aliases, ok := findTemplateVariables(labelName, labelValue)
if !ok {
return nil
Expand All @@ -452,11 +452,11 @@ func checkQueryLabels(labelName, labelValue string, src utils.Source) (problems
}
for _, s := range append(src.Alternatives, src) {
if s.FixedLabels && !slices.Contains(s.IncludedLabels, v[1]) {
problems = append(problems, textForProblem(v[1], "", s, Bug))
problems = append(problems, textForProblem(query, v[1], "", s, Bug))
goto NEXT
}
if slices.Contains(s.ExcludedLabels, v[1]) {
problems = append(problems, textForProblem(v[1], v[1], s, Bug))
problems = append(problems, textForProblem(query, v[1], v[1], s, Bug))
goto NEXT
}
}
Expand All @@ -469,7 +469,7 @@ func checkQueryLabels(labelName, labelValue string, src utils.Source) (problems
return problems
}

func textForProblem(label, reasonLabel string, src utils.Source, severity Severity) exprProblem {
func textForProblem(query, label, reasonLabel string, src utils.Source, severity Severity) exprProblem {
switch {
case src.Operation == "absent":
return exprProblem{
Expand All @@ -489,23 +489,26 @@ func textForProblem(label, reasonLabel string, src utils.Source, severity Severi
details: TemplateCheckLabelsDetails,
severity: severity,
}
case slices.Contains([]string{
promParser.CardOneToOne.String(),
promParser.CardOneToMany.String(),
promParser.CardManyToMany.String(),
promParser.CardManyToOne.String(),
}, src.Operation):
case src.Operation == promParser.CardOneToOne.String():
return exprProblem{
text: fmt.Sprintf("Template is using `%s` label but the query results won't have this label. %s",
label, src.ExcludeReason[reasonLabel]),
details: TemplateCheckOnDetails,
label, src.ExcludeReason[reasonLabel].Reason),
details: maybeAddQueryFragment(query, src.ExcludeReason[reasonLabel].Fragment, TemplateCheckOnDetails),
severity: severity,
}
default:
return exprProblem{
text: fmt.Sprintf("Template is using `%s` label but the query removes it.", label),
details: TemplateCheckAggregationDetails,
text: fmt.Sprintf("Template is using `%s` label but the query results won't have this label. %s",
label, src.ExcludeReason[reasonLabel].Reason),
details: maybeAddQueryFragment(query, src.ExcludeReason[reasonLabel].Fragment, TemplateCheckAggregationDetails),
severity: severity,
}
}
}

// maybeAddQueryFragment returns msg, optionally extended with a trailing line
// that names the part of the query responsible for the reported problem.
//
// query is the full expression text, fragment is the sub-expression blamed for
// the problem, and msg is the base details text.
//
// msg is returned unchanged when:
//   - fragment is empty: there is nothing to point at, and appending it would
//     render an empty pair of backticks in the output;
//   - fragment is identical to query: repeating the whole expression adds no
//     information for the reader.
//
// Otherwise the result is msg followed by a newline and
// "Query fragment causing this problem: `<fragment>`.".
func maybeAddQueryFragment(query, fragment, msg string) string {
	// Guard the empty fragment explicitly; it never equals a non-empty query,
	// so without this check it would fall through to the formatted suffix.
	if fragment == "" || fragment == query {
		return msg
	}
	return fmt.Sprintf("%s\nQuery fragment causing this problem: `%s`.", msg, fragment)
}
52 changes: 24 additions & 28 deletions internal/checks/alerts_template_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,8 +294,8 @@ func TestTemplateCheck(t *testing.T) {
Last: 4,
},
Reporter: checks.TemplateCheckName,
Text: "Template is using `job` label but the query removes it.",
Details: checks.TemplateCheckAggregationDetails,
Text: "Template is using `job` label but the query results won't have this label. Query is using aggregation that removes all labels.",
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum(foo)`.",
Severity: checks.Bug,
},
}
Expand All @@ -314,8 +314,8 @@ func TestTemplateCheck(t *testing.T) {
Last: 4,
},
Reporter: checks.TemplateCheckName,
Text: "Template is using `job` label but the query removes it.",
Details: checks.TemplateCheckAggregationDetails,
Text: "Template is using `job` label but the query results won't have this label. Query is using aggregation that removes all labels.",
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum(foo)`.",
Severity: checks.Bug,
},
}
Expand All @@ -334,8 +334,8 @@ func TestTemplateCheck(t *testing.T) {
Last: 4,
},
Reporter: checks.TemplateCheckName,
Text: "Template is using `job` label but the query removes it.",
Details: checks.TemplateCheckAggregationDetails,
Text: "Template is using `job` label but the query results won't have this label. Query is using aggregation with `without(job)`, all labels included inside `without(...)` will be removed from the results.",
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum(foo) without(job)`.",
Severity: checks.Bug,
},
}
Expand All @@ -354,8 +354,8 @@ func TestTemplateCheck(t *testing.T) {
Last: 4,
},
Reporter: checks.TemplateCheckName,
Text: "Template is using `job` label but the query removes it.",
Details: checks.TemplateCheckAggregationDetails,
Text: "Template is using `job` label but the query results won't have this label. Query is using aggregation with `without(job)`, all labels included inside `without(...)` will be removed from the results.",
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum(foo) without(job)`.",
Severity: checks.Bug,
},
}
Expand All @@ -374,8 +374,8 @@ func TestTemplateCheck(t *testing.T) {
Last: 4,
},
Reporter: checks.TemplateCheckName,
Text: "Template is using `job` label but the query removes it.",
Details: checks.TemplateCheckAggregationDetails,
Text: "Template is using `job` label but the query results won't have this label. Query is using aggregation with `without(job)`, all labels included inside `without(...)` will be removed from the results.",
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum(foo) without(job)`.",
Severity: checks.Bug,
},
}
Expand All @@ -394,8 +394,8 @@ func TestTemplateCheck(t *testing.T) {
Last: 4,
},
Reporter: checks.TemplateCheckName,
Text: "Template is using `job` label but the query removes it.",
Details: checks.TemplateCheckAggregationDetails,
Text: "Template is using `job` label but the query results won't have this label. Query is using aggregation that removes all labels.",
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum(bar)`.",
Severity: checks.Bug,
},
}
Expand All @@ -414,8 +414,8 @@ func TestTemplateCheck(t *testing.T) {
Last: 4,
},
Reporter: checks.TemplateCheckName,
Text: "Template is using `job` label but the query removes it.",
Details: checks.TemplateCheckAggregationDetails,
Text: "Template is using `job` label but the query results won't have this label. Query is using aggregation with `by(notjob)`, only labels included inside `by(...)` will be present on the results.",
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum(foo) by(notjob)`.",
Severity: checks.Bug,
},
}
Expand All @@ -440,8 +440,8 @@ func TestTemplateCheck(t *testing.T) {
Last: 6,
},
Reporter: checks.TemplateCheckName,
Text: "Template is using `ixtance` label but the query removes it.",
Details: checks.TemplateCheckAggregationDetails,
Text: "Template is using `ixtance` label but the query results won't have this label. Query is using aggregation with `by(instance, version)`, only labels included inside `by(...)` will be present on the results.",
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `count(build_info) by (instance, version)`.",
Severity: checks.Bug,
},
}
Expand Down Expand Up @@ -1312,14 +1312,12 @@ func TestTemplateCheck(t *testing.T) {
{
description: "multiple or",
content: `
- alert: Prefix_Advertised_On_Very_Few_Routers
- alert: Foo
expr: >
avg without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case!~".*offpeak.*|.*multicolo.*|.*aggregate.*|.*test.*|.*tier1.*|.*regional.*|.*brat.*|.*utopia.*|.*byoip.*",prefix!~"141.101.112.0/20|190.93.240.0/20"})
avg without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case!~".*offpeak.*"})
< 0.5 > 0
or avg without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*multicolo.*"})
< 0.4 > 0
or sum without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*aggregate.*"} OR router_anycast_prefix_enabled{prefix=~"141.101.112.0/20|190.93.240.0/20"})
< 20 > 0
or sum without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*offpeak.*"})
< 8 > 0
or sum without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*tier1.*"})
Expand All @@ -1340,14 +1338,12 @@ func TestTemplateCheck(t *testing.T) {
{
description: "multiple or / missing group_left()",
content: `
- alert: Prefix_Advertised_On_Very_Few_Routers
- alert: Foo
expr: >
avg without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case!~".*offpeak.*|.*multicolo.*|.*aggregate.*|.*test.*|.*tier1.*|.*regional.*|.*brat.*|.*utopia.*|.*byoip.*",prefix!~"141.101.112.0/20|190.93.240.0/20"})
avg without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case!~".*offpeak.*"})
< 0.5 > 0
or avg without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*multicolo.*"})
< 0.4 > 0
or sum without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*aggregate.*"} OR router_anycast_prefix_enabled{prefix=~"141.101.112.0/20|190.93.240.0/20"})
< 20 > 0
or sum without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*offpeak.*"})
< 8 > 0
or sum without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~".*tier1.*"})
Expand All @@ -1367,12 +1363,12 @@ func TestTemplateCheck(t *testing.T) {
return []checks.Problem{
{
Lines: parser.LineRange{
First: 21,
Last: 21,
First: 19,
Last: 19,
},
Reporter: checks.TemplateCheckName,
Text: "Template is using `prefix` label but the query removes it.",
Details: checks.TemplateCheckAggregationDetails,
Text: "Template is using `prefix` label but the query results won't have this label. Query is using one-to-one vector matching with `on()`, only labels included inside `on(...)` will be present on the results.",
Details: checks.TemplateCheckAggregationDetails + "\nQuery fragment causing this problem: `sum without(router, colo_id, instance) (router_anycast_prefix_enabled{cidr_use_case=~\".*tier1.*\"}) < on() count(colo_router_tier:disabled_pops:max{tier=\"1\",router=~\"edge.*\"}) * 0.4`.",
Severity: checks.Bug,
},
}
Expand Down
6 changes: 3 additions & 3 deletions internal/checks/promql_fragile.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func (c FragileCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rul
}

if rule.AlertingRule != nil {
for _, problem := range c.checkSampling(expr.Query) {
for _, problem := range c.checkSampling(expr.Value.Value, expr.Query) {
problems = append(problems, Problem{
Lines: expr.Value.Lines,
Reporter: c.Reporter(),
Expand Down Expand Up @@ -126,8 +126,8 @@ NEXT:
return problems
}

func (c FragileCheck) checkSampling(node *parser.PromQLNode) (problems []exprProblem) {
s := utils.LabelsSource(node)
func (c FragileCheck) checkSampling(expr string, node *parser.PromQLNode) (problems []exprProblem) {
s := utils.LabelsSource(expr, node)
for _, src := range append(s.Alternatives, s) {
if src.Type != utils.AggregateSource {
continue
Expand Down
Loading