From b2088c4513063885fb06c82683686e3bb13d443b Mon Sep 17 00:00:00 2001 From: Itay Grudev Date: Fri, 1 Mar 2024 12:02:29 +0200 Subject: [PATCH] Implemented Prometheus Rule for automated alerts (#193) feat(cluster): Prometheus Rule for automated alerts + runbooks for a basic set of alerts * Renamed: `cluster.monitoring.enablePodMonitor` to `cluster.monitoring.podMonitor.enabled` * New configuration option: `cluster.monitoring.prometheusRule.enabled` defaults to `true` Signed-off-by: Itay Grudev Signed-off-by: Gabriele Bartolini Co-authored-by: Gabriele Bartolini --- Makefile | 17 +- charts/cluster/README.md | 12 +- .../docs/runbooks/CNPGClusterHACritical.md | 49 +++ .../docs/runbooks/CNPGClusterHAWarning.md | 51 +++ .../CNPGClusterHighConnectionsCritical.md | 24 ++ .../CNPGClusterHighConnectionsWarning.md | 24 ++ .../runbooks/CNPGClusterHighReplicationLag.md | 31 ++ .../CNPGClusterInstancesOnSameNode.md | 28 ++ .../CNPGClusterLowDiskSpaceCritical.md | 31 ++ .../CNPGClusterLowDiskSpaceWarning.md | 31 ++ .../docs/runbooks/CNPGClusterOffline.md | 43 +++ .../runbooks/CNPGClusterZoneSpreadWarning.md | 37 ++ charts/cluster/examples/custom-queries.yaml | 3 +- charts/cluster/templates/NOTES.txt | 29 +- charts/cluster/templates/_helpers.tpl | 1 + charts/cluster/templates/cluster.yaml | 2 +- charts/cluster/templates/prometheus-rule.yaml | 177 +++++++++ charts/cluster/values.schema.json | 342 ++++++++++++++++++ charts/cluster/values.yaml | 9 +- 19 files changed, 908 insertions(+), 33 deletions(-) create mode 100644 charts/cluster/docs/runbooks/CNPGClusterHACritical.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterHAWarning.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterOffline.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md create mode 100644 charts/cluster/templates/prometheus-rule.yaml create mode 100644 charts/cluster/values.schema.json diff --git a/Makefile b/Makefile index 29b256f2e..ac2030a88 100644 --- a/Makefile +++ b/Makefile @@ -12,15 +12,12 @@ docs: ## Generate charts' docs using helm-docs (echo "Please, install https://github.com/norwoodj/helm-docs first" && exit 1) .PHONY: schema -schema: ## Generate charts' schema usign helm schema-gen plugin - @helm schema-gen charts/cloudnative-pg/values.yaml > charts/cloudnative-pg/values.schema.json || \ - (echo "Please, run: helm plugin install https://github.com/karuppiah7890/helm-schema-gen.git" && exit 1) +schema: cloudnative-pg-schema cluster-schema ## Generate charts' schema using helm-schema-gen -.PHONY: pgbench-deploy -pgbench-deploy: ## Installs pgbench chart - helm dependency update charts/pgbench - helm upgrade --install pgbench --atomic charts/pgbench +cloudnative-pg-schema: + @helm schema-gen charts/cloudnative-pg/values.yaml | cat > charts/cloudnative-pg/values.schema.json || \ + (echo "Please, run: helm plugin install https://github.com/karuppiah7890/helm-schema-gen.git" && exit 1) -.PHONY: pgbench-uninstall -pgbench-uninstall: ## Uninstalls cnpg-pgbench chart if present - @helm 
uninstall pgbench
+cluster-schema:
+	@helm schema-gen charts/cluster/values.yaml | cat > charts/cluster/values.schema.json || \
+		(echo "Please, run: helm plugin install https://github.com/karuppiah7890/helm-schema-gen.git" && exit 1)
diff --git a/charts/cluster/README.md b/charts/cluster/README.md
index d334c4389..812685364 100644
--- a/charts/cluster/README.md
+++ b/charts/cluster/README.md
@@ -88,9 +88,9 @@ Additionally you can specify the following parameters:
 ```yaml
 backups:
   scheduledBackups:
-  - name: daily-backup
-    schedule: "0 0 0 * * *" # Daily at midnight
-    backupOwnerReference: self
+    - name: daily-backup
+      schedule: "0 0 0 * * *" # Daily at midnight
+      backupOwnerReference: self
 ```
 
 Each backup adapter takes its own set of parameters, listed in the [Configuration options](#Configuration-options) section
@@ -149,8 +149,10 @@ refer to the [CloudNativePG Documentation](https://cloudnative-pg.io/documentat
 | cluster.instances | int | `3` | Number of instances |
 | cluster.logLevel | string | `"info"` | The instances' log level, one of the following values: error, warning, info (default), debug, trace |
 | cluster.monitoring.customQueries | list | `[]` |  |
-| cluster.monitoring.enablePodMonitor | bool | `false` |  |
-| cluster.postgresql | string | `nil` | Configuration of the PostgreSQL server See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-PostgresConfiguration |
+| cluster.monitoring.enabled | bool | `false` |  |
+| cluster.monitoring.podMonitor.enabled | bool | `true` |  |
+| cluster.monitoring.prometheusRule.enabled | bool | `true` |  |
+| cluster.postgresql | object | `{}` | Configuration of the PostgreSQL server See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-PostgresConfiguration |
 | cluster.primaryUpdateMethod | string | `"switchover"` | Method to follow to upgrade the primary server during a rolling update procedure, after all replicas have been successfully updated. It can be switchover (default) or in-place (restart). |
 | cluster.primaryUpdateStrategy | string | `"unsupervised"` | Strategy to follow to upgrade the primary server during a rolling update procedure, after all replicas have been successfully updated: it can be automated (unsupervised - default) or manual (supervised) |
 | cluster.priorityClassName | string | `""` |  |
diff --git a/charts/cluster/docs/runbooks/CNPGClusterHACritical.md b/charts/cluster/docs/runbooks/CNPGClusterHACritical.md
new file mode 100644
index 000000000..8be576c32
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterHACritical.md
@@ -0,0 +1,49 @@
+CNPGClusterHACritical
+=====================
+
+Meaning
+-------
+
+The `CNPGClusterHACritical` alert is triggered when the CloudNativePG cluster has no ready standby replicas.
+
+This can happen during either a normal failover or automated minor version upgrades in a cluster with 2 or fewer
+instances. The replaced instance may need some time to catch up with the cluster primary instance.
+
+This alarm will always be triggered if your cluster is configured to run with only 1 instance. In this case you
+may want to silence it.
+
+Impact
+------
+
+Having no available replicas puts your cluster at severe risk if the primary instance fails. The primary instance is
+still online and able to serve queries, although connections to the `-ro` endpoint will fail.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
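+
+If the CloudNativePG `cnpg` kubectl plugin is installed, you can also get a quick overview of the cluster topology
+and its replication status (a sketch; `<cluster-name>` and `<namespace>` are placeholders for your own deployment):
+
+```bash
+kubectl cnpg status <cluster-name> --namespace <namespace>
+```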
+
+Get the status of the CloudNativePG cluster instances:
+
+```bash
+kubectl get pods -A -l "cnpg.io/podRole=instance" -o wide
+```
+
+Check the logs of the affected CloudNativePG instances:
+
+```bash
+kubectl logs --namespace <namespace> pod/<pod-name>
+```
+
+Check the CloudNativePG operator logs:
+
+```bash
+kubectl logs --namespace cnpg-system -l "app.kubernetes.io/name=cloudnative-pg"
+```
+
+Mitigation
+----------
+
+Refer to the [CloudNativePG Failure Modes](https://cloudnative-pg.io/documentation/current/failure_modes/)
+and [CloudNativePG Troubleshooting](https://cloudnative-pg.io/documentation/current/troubleshooting/) documentation for
+more information on how to troubleshoot and mitigate this issue.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md b/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md
new file mode 100644
index 000000000..80acfad96
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md
@@ -0,0 +1,51 @@
+CNPGClusterHAWarning
+====================
+
+Meaning
+-------
+
+The `CNPGClusterHAWarning` alert is triggered when the CloudNativePG cluster has fewer than `2` ready standby replicas.
+
+This alarm will always be triggered if your cluster is configured to run with fewer than `3` instances. In this case you
+may want to silence it.
+
+Impact
+------
+
+Having fewer than two available replicas puts your cluster at risk if another instance fails. The cluster is still able
+to operate normally, although the `-ro` and `-r` endpoints operate at reduced capacity.
+
+This can happen during a normal failover or automated minor version upgrades. The replaced instance may need some time
+to catch up with the cluster primary instance, which will trigger the alert if the operation takes more than 5 minutes.
+
+At `0` available ready replicas, a `CNPGClusterHACritical` alert will be triggered.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+Get the status of the CloudNativePG cluster instances:
+
+```bash
+kubectl get pods -A -l "cnpg.io/podRole=instance" -o wide
+```
+
+Check the logs of the affected CloudNativePG instances:
+
+```bash
+kubectl logs --namespace <namespace> pod/<pod-name>
+```
+
+Check the CloudNativePG operator logs:
+
+```bash
+kubectl logs --namespace cnpg-system -l "app.kubernetes.io/name=cloudnative-pg"
+```
+
+Mitigation
+----------
+
+Refer to the [CloudNativePG Failure Modes](https://cloudnative-pg.io/documentation/current/failure_modes/)
+and [CloudNativePG Troubleshooting](https://cloudnative-pg.io/documentation/current/troubleshooting/) documentation for
+more information on how to troubleshoot and mitigate this issue.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md b/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md
new file mode 100644
index 000000000..2003421b9
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md
@@ -0,0 +1,24 @@
+CNPGClusterHighConnectionsCritical
+==================================
+
+Meaning
+-------
+
+This alert is triggered when the number of connections to the CloudNativePG cluster instance exceeds 95% of its capacity.
+
+Impact
+------
+
+At 100% capacity, the CloudNativePG cluster instance will not be able to accept new connections. This will result in a service
+disruption.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
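+
+You can also compare the number of active backends against `max_connections` directly on the primary (a sketch;
+`<namespace>` and `<cluster-name>` are placeholders for your own deployment):
+
+```bash
+kubectl exec --namespace <namespace> --stdin --tty services/<cluster-name>-rw -- \
+  psql -c "SELECT count(*) AS connections, current_setting('max_connections') AS max_connections FROM pg_stat_activity;"
+```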
+
+Mitigation
+----------
+
+* Increase the maximum number of connections by raising the `max_connections` PostgreSQL parameter.
+* Use connection pooling by enabling PgBouncer to reduce the number of connections to the database.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md b/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md
new file mode 100644
index 000000000..636579f75
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md
@@ -0,0 +1,24 @@
+CNPGClusterHighConnectionsWarning
+=================================
+
+Meaning
+-------
+
+This alert is triggered when the number of connections to the CloudNativePG cluster instance exceeds 85% of its capacity.
+
+Impact
+------
+
+At 100% capacity, the CloudNativePG cluster instance will not be able to accept new connections. This will result in a service
+disruption.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+Mitigation
+----------
+
+* Increase the maximum number of connections by raising the `max_connections` PostgreSQL parameter.
+* Use connection pooling by enabling PgBouncer to reduce the number of connections to the database.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md b/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md
new file mode 100644
index 000000000..78963ce09
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md
@@ -0,0 +1,31 @@
+CNPGClusterHighReplicationLag
+=============================
+
+Meaning
+-------
+
+This alert is triggered when the replication lag of the CloudNativePG cluster exceeds `1s`.
+
+Impact
+------
+
+High replication lag can cause the cluster replicas to fall out of sync. Queries to the `-r` and `-ro` endpoints may return stale data.
+In the event of a failover, there may be data loss for the time period of the lag.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+High replication lag can be caused by a number of factors, including:
+* Network issues
+* High load on the primary or replicas
+* Long-running queries
+* Suboptimal PostgreSQL configuration, in particular a small `max_wal_senders` value.
+
+Check the replication status on the primary:
+
+```bash
+kubectl exec --namespace <namespace> --stdin --tty services/<cluster-name>-rw -- psql -c "SELECT * from pg_stat_replication;"
+```
+
+Mitigation
+----------
+
+Address the cause identified during diagnosis: resolve network issues, reduce the load on the primary or the replicas,
+tune or terminate long-running queries, or adjust the relevant PostgreSQL settings (for example `max_wal_senders`).
diff --git a/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md b/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md
new file mode 100644
index 000000000..df309ffa9
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md
@@ -0,0 +1,28 @@
+CNPGClusterInstancesOnSameNode
+==============================
+
+Meaning
+-------
+
+The `CNPGClusterInstancesOnSameNode` alert is raised when two or more database pods are scheduled on the same node.
+
+Impact
+------
+
+A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+```bash
+kubectl get pods -A -l "cnpg.io/podRole=instance" -o wide
+```
+
+Mitigation
+----------
+
+1. Verify you have more than a single node without taints that would prevent pods from being scheduled there.
+2. Verify your [affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/) configuration.
+3. For more information, please refer to the ["Scheduling"](https://cloudnative-pg.io/documentation/current/scheduling/) section in the documentation.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md b/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md
new file mode 100644
index 000000000..5b7355275
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md
@@ -0,0 +1,31 @@
+CNPGClusterLowDiskSpaceCritical
+===============================
+
+Meaning
+-------
+
+This alert is triggered when disk space usage on the CloudNativePG cluster exceeds 90%. It can be triggered by any of the following:
+
+* the PVC hosting the `PGDATA` (`storage` section)
+* the PVC hosting WAL files (`walStorage` section), where applicable
+* any PVC hosting a tablespace (`tablespaces` section)
+
+Impact
+------
+
+Excessive disk space usage can lead to fragmentation, negatively impacting performance. Reaching 100% disk usage will result
+in downtime and data loss.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+Mitigation
+----------
+
+If you experience issues with the WAL (Write-Ahead Logging) volume and have
+set up continuous archiving, ensure that WAL archiving is functioning
+correctly. This is crucial to avoid a buildup of WAL files in the `pg_wal`
+folder. Monitor the `cnpg_collector_pg_wal_archive_status` metric, specifically
+ensuring that the number of `ready` files does not increase linearly.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md b/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md
new file mode 100644
index 000000000..36e56acf1
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md
@@ -0,0 +1,31 @@
+CNPGClusterLowDiskSpaceWarning
+==============================
+
+Meaning
+-------
+
+This alert is triggered when disk space usage on the CloudNativePG cluster exceeds 70%. It can be triggered by any of the following:
+
+* the PVC hosting the `PGDATA` (`storage` section)
+* the PVC hosting WAL files (`walStorage` section), where applicable
+* any PVC hosting a tablespace (`tablespaces` section)
+
+Impact
+------
+
+Excessive disk space usage can lead to fragmentation, negatively impacting performance. Reaching 100% disk usage will result
+in downtime and data loss.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+Mitigation
+----------
+
+If you experience issues with the WAL (Write-Ahead Logging) volume and have
+set up continuous archiving, ensure that WAL archiving is functioning
+correctly. This is crucial to avoid a buildup of WAL files in the `pg_wal`
+folder. Monitor the `cnpg_collector_pg_wal_archive_status` metric, specifically
+ensuring that the number of `ready` files does not increase linearly.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterOffline.md b/charts/cluster/docs/runbooks/CNPGClusterOffline.md
new file mode 100644
index 000000000..0e69db15b
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterOffline.md
@@ -0,0 +1,43 @@
+CNPGClusterOffline
+==================
+
+Meaning
+-------
+
+The `CNPGClusterOffline` alert is triggered when there are no ready CloudNativePG instances.
+
+Impact
+------
+
+Having an offline cluster means your applications will not be able to access the database, leading to potential service
+disruption.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+Get the status of the CloudNativePG cluster instances:
+
+```bash
+kubectl get pods -A -l "cnpg.io/podRole=instance" -o wide
+```
+
+Check the logs of the affected CloudNativePG instances:
+
+```bash
+kubectl logs --namespace <namespace> pod/<pod-name>
+```
+
+Check the CloudNativePG operator logs:
+
+```bash
+kubectl logs --namespace cnpg-system -l "app.kubernetes.io/name=cloudnative-pg"
+```
+
+Mitigation
+----------
+
+Refer to the [CloudNativePG Failure Modes](https://cloudnative-pg.io/documentation/current/failure_modes/)
+and [CloudNativePG Troubleshooting](https://cloudnative-pg.io/documentation/current/troubleshooting/) documentation for
+more information on how to troubleshoot and mitigate this issue.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md b/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
new file mode 100644
index 000000000..c5bdb05da
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
@@ -0,0 +1,37 @@
+CNPGClusterZoneSpreadWarning
+============================
+
+Meaning
+-------
+
+The `CNPGClusterZoneSpreadWarning` alert is raised when pods are not evenly distributed across availability zones. More
+precisely, it is raised when the instances span fewer than 3 availability zones and fewer zones than there are instances.
+
+Impact
+------
+
+The uneven distribution of pods across availability zones can lead to a single point of failure if a zone goes down.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+Get the status of the CloudNativePG cluster instances:
+
+```bash
+kubectl get pods -A -l "cnpg.io/podRole=instance" -o wide
+```
+
+Get the nodes and their respective zones:
+
+```bash
+kubectl get nodes --label-columns topology.kubernetes.io/zone
+```
+
+Mitigation
+----------
+
+1. Verify that each availability zone you want to use has at least one node without taints that would prevent pods from being scheduled there.
+2. Verify your [affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/) configuration.
+3. Delete the pods and their respective PVCs that are not in the desired availability zone and allow the operator to repair the cluster, as shown in the example below.
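+
+For example, to move one instance out of an undesired zone you can remove that instance's PVC and pod and let the
+operator recreate the instance elsewhere (a sketch; `<cluster-name>-<n>` and `<namespace>` are placeholders — prefer
+doing this on a replica rather than the current primary, and double-check which instance you are removing first):
+
+```bash
+# Deleting the PVC first is safe: the deletion stays pending until the pod that mounts it is gone.
+kubectl delete pvc --namespace <namespace> <cluster-name>-<n> --wait=false
+# If the cluster uses a separate WAL volume, also delete its PVC, e.g. <cluster-name>-<n>-wal.
+kubectl delete pod --namespace <namespace> <cluster-name>-<n>
+```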
diff --git a/charts/cluster/examples/custom-queries.yaml b/charts/cluster/examples/custom-queries.yaml index 1e6ef16f6..7995202d8 100644 --- a/charts/cluster/examples/custom-queries.yaml +++ b/charts/cluster/examples/custom-queries.yaml @@ -4,6 +4,7 @@ mode: standalone cluster: instances: 1 monitoring: + enabled: true customQueries: - name: "pg_cache_hit" query: | @@ -20,4 +21,4 @@ cluster: description: "Cache hit ratio" backups: - enabled: false \ No newline at end of file + enabled: false diff --git a/charts/cluster/templates/NOTES.txt b/charts/cluster/templates/NOTES.txt index 28c0e6172..dd5142ecc 100644 --- a/charts/cluster/templates/NOTES.txt +++ b/charts/cluster/templates/NOTES.txt @@ -42,20 +42,21 @@ Configuration {{ $scheduledBackups = printf "%s, %s" $scheduledBackups .name }} {{- end -}} -╭───────────────────┬────────────────────────────────────────────╮ -│ Configuration │ Value │ -┝━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ -│ Cluster mode │ {{ (printf "%-42s" .Values.mode) }} │ -│ Type │ {{ (printf "%-42s" .Values.type) }} │ -│ Image │ {{ include "cluster.color-info" (printf "%-42s" (include "cluster.imageName" .)) }} │ -│ Instances │ {{ include (printf "%s%s" "cluster.color-" $redundancyColor) (printf "%-42s" (toString .Values.cluster.instances)) }} │ -│ Backups │ {{ include (printf "%s%s" "cluster.color-" (ternary "ok" "error" .Values.backups.enabled)) (printf "%-42s" (ternary "Enabled" "Disabled" .Values.backups.enabled)) }} │ -│ Backup Provider │ {{ (printf "%-42s" (title .Values.backups.provider)) }} │ -│ Scheduled Backups │ {{ (printf "%-42s" $scheduledBackups) }} │ -│ Storage │ {{ (printf "%-42s" .Values.cluster.storage.size) }} │ -│ Storage Class │ {{ (printf "%-42s" (default "Default" .Values.cluster.storage.storageClass)) }} │ -│ PGBouncer │ {{ (printf "%-42s" (ternary "Enabled" "Disabled" .Values.pooler.enabled)) }} │ -╰───────────────────┴────────────────────────────────────────────╯ +╭───────────────────┬────────────────────────────────────────────────────────╮ +│ Configuration │ Value │ +┝━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ +│ Cluster mode │ {{ (printf "%-54s" .Values.mode) }} │ +│ Type │ {{ (printf "%-54s" .Values.type) }} │ +│ Image │ {{ include "cluster.color-info" (printf "%-54s" (include "cluster.imageName" .)) }} │ +│ Instances │ {{ include (printf "%s%s" "cluster.color-" $redundancyColor) (printf "%-54s" (toString .Values.cluster.instances)) }} │ +│ Backups │ {{ include (printf "%s%s" "cluster.color-" (ternary "ok" "error" .Values.backups.enabled)) (printf "%-54s" (ternary "Enabled" "Disabled" .Values.backups.enabled)) }} │ +│ Backup Provider │ {{ (printf "%-54s" (title .Values.backups.provider)) }} │ +│ Scheduled Backups │ {{ (printf "%-54s" $scheduledBackups) }} │ +│ Storage │ {{ (printf "%-54s" .Values.cluster.storage.size) }} │ +│ Storage Class │ {{ (printf "%-54s" (default "Default" .Values.cluster.storage.storageClass)) }} │ +│ PGBouncer │ {{ (printf "%-54s" (ternary "Enabled" "Disabled" .Values.pooler.enabled)) }} │ +│ Monitoring │ {{ include (printf "%s%s" "cluster.color-" (ternary "ok" "error" .Values.cluster.monitoring.enabled)) (printf "%-54s" (ternary "Enabled" "Disabled" .Values.cluster.monitoring.enabled)) }} │ +╰───────────────────┴────────────────────────────────────────────────────────╯ {{ if not .Values.backups.enabled }} {{- include "cluster.color-error" "Warning! Backups not enabled. Recovery will not be possible! 
Do not use this configuration in production.\n" }} diff --git a/charts/cluster/templates/_helpers.tpl b/charts/cluster/templates/_helpers.tpl index b00846d60..db3c253e5 100644 --- a/charts/cluster/templates/_helpers.tpl +++ b/charts/cluster/templates/_helpers.tpl @@ -48,6 +48,7 @@ Selector labels {{- define "cluster.selectorLabels" -}} app.kubernetes.io/name: {{ include "cluster.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} +app.kubernetes.io/part-of: cloudnative-pg {{- end }} {{/* diff --git a/charts/cluster/templates/cluster.yaml b/charts/cluster/templates/cluster.yaml index 4ec251698..1410ded56 100644 --- a/charts/cluster/templates/cluster.yaml +++ b/charts/cluster/templates/cluster.yaml @@ -54,7 +54,7 @@ spec: {{ end }} monitoring: - enablePodMonitor: {{ .Values.cluster.monitoring.enablePodMonitor }} + enablePodMonitor: {{ and .Values.cluster.monitoring.enabled .Values.cluster.monitoring.podMonitor.enabled }} {{- if not (empty .Values.cluster.monitoring.customQueries) }} customQueriesConfigMap: - name: {{ include "cluster.fullname" . }}-monitoring diff --git a/charts/cluster/templates/prometheus-rule.yaml b/charts/cluster/templates/prometheus-rule.yaml new file mode 100644 index 000000000..3da33025b --- /dev/null +++ b/charts/cluster/templates/prometheus-rule.yaml @@ -0,0 +1,177 @@ +{{- if and .Values.cluster.monitoring.enabled .Values.cluster.monitoring.prometheusRule.enabled -}} +{{- $value := "{{ $value }}" -}} +{{- $namespace := .Release.Namespace -}} +{{- $cluster := printf "%s/%s" $namespace (include "cluster.fullname" .)}} +{{- $labels := dict "job" "{{ $labels.job }}" "node" "{{ $labels.node }}" "pod" "{{ $labels.pod }}" -}} +{{- $podSelector := printf "%s-([1-9][0-9]*)$" (include "cluster.fullname" .) -}} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + {{- include "cluster.labels" . | nindent 4 }} + {{- with .Values.cluster.additionalLabels }} + {{ toYaml . | nindent 4 }} + {{- end }} + name: {{ include "cluster.fullname" . }}-alert-rules +spec: + groups: + - name: cloudnative-pg/{{ include "cluster.fullname" . }} + rules: + - alert: CNPGClusterHAWarning + annotations: + summary: CNPG Cluster less than 2 standby replicas. + description: |- + CloudNativePG Cluster "{{ $labels.job }}" has only {{ $value }} standby replicas, putting + your cluster at risk if another instance fails. The cluster is still able to operate normally, although + the `-ro` and `-r` endpoints operate at reduced capacity. + + This can happen during a normal fail-over or automated minor version upgrades. The replaced instance may + need some time to catch-up with the cluster primary instance. + + This alarm will be constantly triggered if your cluster is configured to run with less than 3 instances. + In this case you may want to silence it. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md + expr: | + max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ $namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ $namespace }}"}) < 2 + for: 5m + labels: + severity: warning + - alert: CNPGClusterHACritical + annotations: + summary: CNPG Cluster has no standby replicas! + description: |- + CloudNativePG Cluster "{{ $labels.job }}" has no ready standby replicas. Your cluster at a severe + risk of data loss and downtime if the primary instance fails. + + The primary instance is still online and able to serve queries, although connections to the `-ro` endpoint + will fail. 
The `-r` endpoint is operating at reduced capacity and all traffic is being served by the primary instance.
+
+            This can happen during a normal fail-over or automated minor version upgrades in a cluster with 2 or fewer
+            instances. The replaced instance may need some time to catch up with the cluster primary instance.
+
+            This alarm will always be triggered if your cluster is configured to run with only 1 instance. In this
+            case you may want to silence it.
+          runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHACritical.md
+        expr: |
+          max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ $namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ $namespace }}"}) < 1
+        for: 5m
+        labels:
+          severity: critical
+      - alert: CNPGClusterOffline
+        annotations:
+          summary: CNPG Cluster has no running instances!
+          description: |-
+            CloudNativePG Cluster "{{ $labels.job }}" has no ready instances.
+
+            Having an offline cluster means your applications will not be able to access the database, leading to
+            potential service disruption and/or data loss.
+          runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md
+        expr: |
+          ({{ .Values.cluster.instances }} - count(cnpg_collector_up{namespace=~"{{ $namespace }}",pod=~"{{ $podSelector }}"}) OR vector(0)) > 0
+        for: 5m
+        labels:
+          severity: critical
+      - alert: CNPGClusterZoneSpreadWarning
+        annotations:
+          summary: CNPG Cluster instances in the same zone.
+          description: |-
+            CloudNativePG Cluster "{{ $cluster }}" has instances in the same availability zone.
+
+            A disaster in one availability zone will lead to a potential service disruption and/or data loss.
+          runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
+        expr: |
+          {{ .Values.cluster.instances }} > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3
+        for: 5m
+        labels:
+          severity: warning
+      - alert: CNPGClusterInstancesOnSameNode
+        annotations:
+          summary: CNPG Cluster instances are located on the same node.
+          description: |-
+            CloudNativePG Cluster "{{ $cluster }}" has {{ $value }}
+            instances on the same node {{ $labels.node }}.
+
+            A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss.
+          runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md
+        expr: |
+          count by (node) (kube_pod_info{namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) > 1
+        for: 5m
+        labels:
+          severity: warning
+      - alert: CNPGClusterHighReplicationLag
+        annotations:
+          summary: CNPG Cluster high replication lag
+          description: |-
+            CloudNativePG Cluster "{{ $cluster }}" is experiencing a high replication lag of
+            {{ "{{ $value }}" }}ms.
+
+            High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration.
+          runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md
+        expr: |
+          max(cnpg_pg_replication_lag{namespace=~"{{ $namespace }}",pod=~"{{ $podSelector }}"}) * 1000 > 1000
+        for: 5m
+        labels:
+          severity: warning
+      - alert: CNPGClusterHighConnectionsWarning
+        annotations:
+          summary: CNPG Instance is approaching the maximum number of connections.
+ description: |- + CloudNativePG Cluster "{{ $cluster }}" instance {{ $labels.pod }} is using {{ "{{ $value }}" }}% of + the maximum number of connections. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md + expr: | + sum by (pod) (cnpg_backends_total{namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) * 100 > 80 + for: 5m + labels: + severity: warning + - alert: CNPGClusterHighConnectionsCritical + annotations: + summary: CNPG Instance maximum number of connections critical! + description: |- + CloudNativePG Cluster "{{ $cluster }}" instance {{ $labels.pod }} is using {{ "{{ $value }}" }}% of + the maximum number of connections. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md + expr: | + sum by (pod) (cnpg_backends_total{namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) * 100 > 95 + for: 5m + labels: + severity: critical + - alert: CNPGClusterLowDiskSpaceWarning + annotations: + summary: CNPG Instance is running out of disk space. + description: |- + CloudNativePG Cluster "{{ $cluster }}" is running low on disk space. Check attached PVCs. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md + expr: | + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}"})) > 0.7 OR + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-wal"} / kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-wal"})) > 0.7 OR + max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-tbs.*"}) + / + sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-tbs.*"}) + * + on(namespace, persistentvolumeclaim) group_left(volume) + kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ $podSelector }}"} + ) > 0.7 + for: 5m + labels: + severity: warning + - alert: CNPGClusterLowDiskSpaceCritical + annotations: + summary: CNPG Instance is running out of disk space! + description: |- + CloudNativePG Cluster "{{ $cluster }}" is running extremely low on disk space. Check attached PVCs! 
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md + expr: | + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}"})) > 0.9 OR + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-wal"} / kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-wal"})) > 0.9 OR + max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-tbs.*"}) + / + sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-tbs.*"}) + * + on(namespace, persistentvolumeclaim) group_left(volume) + kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ $podSelector }}"} + ) > 0.9 + for: 5m + labels: + severity: warning +{{ end }} diff --git a/charts/cluster/values.schema.json b/charts/cluster/values.schema.json new file mode 100644 index 000000000..49550780b --- /dev/null +++ b/charts/cluster/values.schema.json @@ -0,0 +1,342 @@ +{ + "$schema": "http://json-schema.org/schema#", + "type": "object", + "properties": { + "backups": { + "type": "object", + "properties": { + "azure": { + "type": "object", + "properties": { + "connectionString": { + "type": "string" + }, + "containerName": { + "type": "string" + }, + "inheritFromAzureAD": { + "type": "boolean" + }, + "path": { + "type": "string" + }, + "serviceName": { + "type": "string" + }, + "storageAccount": { + "type": "string" + }, + "storageKey": { + "type": "string" + }, + "storageSasToken": { + "type": "string" + } + } + }, + "destinationPath": { + "type": "string" + }, + "enabled": { + "type": "boolean" + }, + "endpointURL": { + "type": "string" + }, + "google": { + "type": "object", + "properties": { + "applicationCredentials": { + "type": "string" + }, + "bucket": { + "type": "string" + }, + "gkeEnvironment": { + "type": "boolean" + }, + "path": { + "type": "string" + } + } + }, + "provider": { + "type": "string" + }, + "retentionPolicy": { + "type": "string" + }, + "s3": { + "type": "object", + "properties": { + "accessKey": { + "type": "string" + }, + "bucket": { + "type": "string" + }, + "path": { + "type": "string" + }, + "region": { + "type": "string" + }, + "secretKey": { + "type": "string" + } + } + }, + "scheduledBackups": { + "type": "array", + "items": { + "type": "object", + "properties": { + "backupOwnerReference": { + "type": "string" + }, + "name": { + "type": "string" + }, + "schedule": { + "type": "string" + } + } + } + } + } + }, + "cluster": { + "type": "object", + "properties": { + "additionalLabels": { + "type": "object" + }, + "affinity": { + "type": "object", + "properties": { + "topologyKey": { + "type": "string" + } + } + }, + "annotations": { + "type": "object" + }, + "certificates": { + "type": "null" + }, + "enableSuperuserAccess": { + "type": "boolean" + }, + "imageName": { + "type": "string" + }, + "imagePullPolicy": { + "type": "string" + }, + "imagePullSecrets": { + "type": "array" + }, + "initdb": { + "type": "object" + }, + "instances": { + "type": "integer" + }, + "logLevel": { + "type": "string" + }, + "monitoring": { 
+ "type": "object", + "properties": { + "customQueries": { + "type": "array" + }, + "enabled": { + "type": "boolean" + }, + "podMonitor": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + } + }, + "prometheusRule": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + } + } + } + }, + "postgresql": { + "type": "object" + }, + "primaryUpdateMethod": { + "type": "string" + }, + "primaryUpdateStrategy": { + "type": "string" + }, + "priorityClassName": { + "type": "string" + }, + "resources": { + "type": "null" + }, + "storage": { + "type": "object", + "properties": { + "size": { + "type": "string" + }, + "storageClass": { + "type": "string" + } + } + }, + "superuserSecret": { + "type": "string" + } + } + }, + "fullnameOverride": { + "type": "string" + }, + "mode": { + "type": "string" + }, + "nameOverride": { + "type": "string" + }, + "pooler": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "instances": { + "type": "integer" + }, + "parameters": { + "type": "object", + "properties": { + "default_pool_size": { + "type": "string" + }, + "max_client_conn": { + "type": "string" + } + } + }, + "poolMode": { + "type": "string" + } + } + }, + "recovery": { + "type": "object", + "properties": { + "azure": { + "type": "object", + "properties": { + "connectionString": { + "type": "string" + }, + "containerName": { + "type": "string" + }, + "inheritFromAzureAD": { + "type": "boolean" + }, + "path": { + "type": "string" + }, + "serviceName": { + "type": "string" + }, + "storageAccount": { + "type": "string" + }, + "storageKey": { + "type": "string" + }, + "storageSasToken": { + "type": "string" + } + } + }, + "backupName": { + "type": "string" + }, + "clusterName": { + "type": "string" + }, + "destinationPath": { + "type": "string" + }, + "endpointURL": { + "type": "string" + }, + "google": { + "type": "object", + "properties": { + "applicationCredentials": { + "type": "string" + }, + "bucket": { + "type": "string" + }, + "gkeEnvironment": { + "type": "boolean" + }, + "path": { + "type": "string" + } + } + }, + "method": { + "type": "string" + }, + "pitrTarget": { + "type": "object", + "properties": { + "time": { + "type": "string" + } + } + }, + "provider": { + "type": "string" + }, + "s3": { + "type": "object", + "properties": { + "accessKey": { + "type": "string" + }, + "bucket": { + "type": "string" + }, + "path": { + "type": "string" + }, + "region": { + "type": "string" + }, + "secretKey": { + "type": "string" + } + } + } + } + }, + "type": { + "type": "string" + } + } +} diff --git a/charts/cluster/values.yaml b/charts/cluster/values.yaml index dec1fc9b4..02f967133 100644 --- a/charts/cluster/values.yaml +++ b/charts/cluster/values.yaml @@ -132,7 +132,11 @@ cluster: superuserSecret: "" monitoring: - enablePodMonitor: false + enabled: false + podMonitor: + enabled: true + prometheusRule: + enabled: true customQueries: [] # - name: "pg_cache_hit_ratio" # query: "SELECT current_database() as datname, sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)) as ratio FROM pg_statio_user_tables;" @@ -146,7 +150,8 @@ cluster: # -- Configuration of the PostgreSQL server # See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-PostgresConfiguration - postgresql: + postgresql: {} + # max_connections: 300 # -- BootstrapInitDB is the configuration of the bootstrap process when initdb is used # See: https://cloudnative-pg.io/documentation/current/bootstrap/