Skip to content

Commit

Permalink
Add metric for failed plugins
Browse files Browse the repository at this point in the history
  • Loading branch information
Nuckal777 committed Jul 15, 2024
1 parent b0d48a3 commit eda8860
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 26 deletions.
17 changes: 17 additions & 0 deletions controllers/node_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1046,6 +1046,23 @@ var _ = Describe("The api server", func() {
Expect(err).To(Succeed())
})

// Verifies that a profile whose check plugin fails is reflected in the
// maintenance_controller_transition_failure_count metric.
It("should create transition failure metrics", func(ctx SpecContext) {
originalNode := targetNode.DeepCopy()
// Replace the node's labels so that only the profile label remains, set to "broken",
// and send the change as a patch computed against the unmodified copy.
targetNode.Labels = map[string]string{constants.ProfileLabelKey: "broken"}
Expect(k8sClient.Patch(ctx, targetNode, client.MergeFrom(originalNode))).To(Succeed())

// Poll the metrics endpoint until the counter labelled profile="broken" has the expected value.
Eventually(func(g Gomega) []string {
res, err := http.Get("http://localhost:15423/metrics")
g.Expect(err).To(Succeed())
defer res.Body.Close()
data, err := io.ReadAll(res.Body)
g.Expect(err).To(Succeed())
// presumably parseMetrics returns the sample values for the listed series — verify against the helper
return parseMetrics(string(data), []string{
"maintenance_controller_transition_failure_count{profile=\"broken\"}",
})
// NOTE(review): the exact value "5" presumably depends on how many reconciliations
// the suite performs for this node — confirm it is stable and not timing-dependent.
}).Should(Equal([]string{"5"}))
})

It("should return node infos", func() {
// since the cache is global the precise number
// of nodes is unknown for the cache
Expand Down
9 changes: 9 additions & 0 deletions controllers/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ instances:
config:
key: transition
value: "true"
- type: prometheusInstant
name: fail
config:
url: bananabread
trigger:
- type: alterLabel
name: alter
Expand Down Expand Up @@ -117,6 +121,11 @@ profiles:
transitions:
- check: transition
next: in-maintenance
- name: broken
operational:
transitions:
- check: fail
next: maintenance-required
`

var (
Expand Down
3 changes: 2 additions & 1 deletion docs/operations.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ It defaults to `:8080`.
The notable metrics are:
- `maintenance_controller_shuffle_count`: Counts pods in DaemonSets, Deployments and StatefulSets, that were likely deleted as part of a maintenance activity.
- `maintenance_controller_shuffles_per_replica`: Count of pods in DaemonSets, Deployments and StatefulSets, that were likely deleted as part of a maintenance activity, divided by the replica count when the event occurred.
They help determine the impact of maintenance activities on the workloads running on the cluster.
- `maintenance_controller_transition_failure_count`: Count of state transition failures due to plugin errors.
The first two help determine the impact of maintenance activities on the workloads running on the cluster.

## Web UI
The maintenance-controller provides a web UI to visualize the state of maintenance profiles and nodes.
Expand Down
11 changes: 10 additions & 1 deletion metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,15 @@ var (
"that were likely shuffled by a node send into maintenance, " +
"divided by the replica count when the event occurred",
}, []string{"owner", "profile"})

transitionFailures = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "maintenance_controller_transition_failure_count",
Help: "Count of failed state transition evaluations due to plugin errors",
}, []string{"profile"})
)

func RegisterMaintenanceMetrics() {
metrics.Registry.MustRegister(shuffleCount, shufflesPerReplica)
metrics.Registry.MustRegister(shuffleCount, shufflesPerReplica, transitionFailures)
}

type shuffleRecord struct {
Expand Down Expand Up @@ -223,3 +228,7 @@ func makeLabels(owner, profile string) prometheus.Labels {
"profile": profile,
}
}

func RecordTransitionFailure(profile string) {
transitionFailures.With(prometheus.Labels{"profile": profile}).Inc()
}
47 changes: 23 additions & 24 deletions state/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
v1 "k8s.io/api/core/v1"

"github.com/sapcc/maintenance-controller/constants"
"github.com/sapcc/maintenance-controller/metrics"
"github.com/sapcc/maintenance-controller/plugin"
)

Expand Down Expand Up @@ -248,6 +249,24 @@ func FromLabel(label NodeStateLabel, chains PluginChains) (NodeState, error) {
func Apply(state NodeState, node *v1.Node, data *DataV2, params plugin.Parameters) (ApplyResult, error) {
recorder := params.Recorder
result := ApplyResult{Next: state.Label(), Transitions: []TransitionResult{}}

// handleTransitionError is the shared failure path for Apply. It records the
// failure metric for the profile, logs the error, emits a
// ChangeMaintenanceStateFailed event on the node, stores the error text in the
// result and returns the result together with the wrapped error.
// prefix is the human-readable description of the step that failed.
handleTransitionError := func(err error, prefix string) (ApplyResult, error) {
	profile, current := params.Profile, params.State

	metrics.RecordTransitionFailure(profile)
	params.Log.Error(err, prefix, "state", current, "profile", profile, "node", node.Name)
	recorder.Eventf(
		node, "Normal", "ChangeMaintenanceStateFailed",
		"%v for profile %v: Will stay in %v state",
		prefix, profile, current,
	)

	// The returned error uses the lower-cased prefix and wraps err for the caller.
	result.Error = err.Error()
	wrapped := fmt.Errorf("%v for profile %v: %w", strings.ToLower(prefix), profile, err)
	return result, wrapped
}

stateInfo, ok := data.Profiles[params.Profile]
if !ok {
err := fmt.Errorf("could not find profile '%s' in state data", params.Profile)
Expand All @@ -256,45 +275,25 @@ func Apply(state NodeState, node *v1.Node, data *DataV2, params plugin.Parameter
}
if stateInfo.Previous != stateInfo.Current {
if err := state.Enter(params, data); err != nil {
recorder.Eventf(node, "Normal", "ChangeMaintenanceStateFailed",
"Failed to enter state for profile %v: Will stay in %v state",
params.Profile, params.State)
result.Error = err.Error()
return result, fmt.Errorf("failed to enter state %v for profile %v: %w", state.Label(), params.Profile, err)
return handleTransitionError(err, fmt.Sprintf("Failed to enter state %s", state.Label()))
}
}
// invoke notifications and check for transition
err := state.Notify(params, data)
if err != nil {
recorder.Eventf(node, "Normal", "ChangeMaintenanceStateFailed",
"At least one notification plugin failed for profile %v: Will stay in %v state",
params.Profile, params.State)
params.Log.Error(err, "Failed to notify", "state", params.State,
"profile", params.Profile)
result.Error = err.Error()
return result, fmt.Errorf("failed to notify for profile %v: %w", params.Profile, err)
return handleTransitionError(err, "At least one notification plugin failed")
}
transitions, err := state.Transition(params, data)
result.Transitions = transitions.Infos
if err != nil {
recorder.Eventf(node, "Normal", "ChangeMaintenanceStateFailed",
"At least one check plugin failed for profile %v: Will stay in %v state",
params.Profile, params.State)
params.Log.Error(err, "Failed to check for state transition", "state", params.State,
"profile", params.Profile)
result.Error = err.Error()
return result, fmt.Errorf("failed transition for profile %v: %w", params.Profile, err)
return handleTransitionError(err, "At least one check plugin failed")
}

// check if a transition should happen
if transitions.Next != state.Label() {
err = state.Trigger(params, transitions.Next, data)
if err != nil {
params.Log.Error(err, "Failed to execute triggers", "state", params.State, "profile", params.Profile)
recorder.Eventf(node, "Normal", "ChangeMaintenanceStateFailed",
"At least one trigger plugin failed for profile %v: Will stay in %v state", params.Profile, params.State)
result.Error = err.Error()
return result, err
return handleTransitionError(err, "At least one trigger plugin failed")
}
params.Log.Info("Moved node to next state", "state", string(transitions.Next), "profile", params.Profile)
recorder.Eventf(node, "Normal", "ChangedMaintenanceState",
Expand Down

0 comments on commit eda8860

Please sign in to comment.