From b2088c4513063885fb06c82683686e3bb13d443b Mon Sep 17 00:00:00 2001 From: Itay Grudev Date: Fri, 1 Mar 2024 12:02:29 +0200 Subject: [PATCH] Implemented Prometheus Rule for automated alerts (#193) feat(cluster): Prometheus Rule for automated alerts + runbooks for a basic set of alerts * Renamed: `cluster.monitoring.enablePodMonitor` to `cluster.monitoring.podMonitor.enabled` * New configuration option: `cluster.monitoring.prometheusRule.enabled` defaults to `true` Signed-off-by: Itay Grudev Signed-off-by: Gabriele Bartolini Co-authored-by: Gabriele Bartolini --- Makefile | 17 +- charts/cluster/README.md | 12 +- .../docs/runbooks/CNPGClusterHACritical.md | 49 +++ .../docs/runbooks/CNPGClusterHAWarning.md | 51 +++ .../CNPGClusterHighConnectionsCritical.md | 24 ++ .../CNPGClusterHighConnectionsWarning.md | 24 ++ .../runbooks/CNPGClusterHighReplicationLag.md | 31 ++ .../CNPGClusterInstancesOnSameNode.md | 28 ++ .../CNPGClusterLowDiskSpaceCritical.md | 31 ++ .../CNPGClusterLowDiskSpaceWarning.md | 31 ++ .../docs/runbooks/CNPGClusterOffline.md | 43 +++ .../runbooks/CNPGClusterZoneSpreadWarning.md | 37 ++ charts/cluster/examples/custom-queries.yaml | 3 +- charts/cluster/templates/NOTES.txt | 29 +- charts/cluster/templates/_helpers.tpl | 1 + charts/cluster/templates/cluster.yaml | 2 +- charts/cluster/templates/prometheus-rule.yaml | 177 +++++++++ charts/cluster/values.schema.json | 342 ++++++++++++++++++ charts/cluster/values.yaml | 9 +- 19 files changed, 908 insertions(+), 33 deletions(-) create mode 100644 charts/cluster/docs/runbooks/CNPGClusterHACritical.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterHAWarning.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterOffline.md create mode 100644 charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md create mode 100644 charts/cluster/templates/prometheus-rule.yaml create mode 100644 charts/cluster/values.schema.json diff --git a/Makefile b/Makefile index 29b256f2e..ac2030a88 100644 --- a/Makefile +++ b/Makefile @@ -12,15 +12,12 @@ docs: ## Generate charts' docs using helm-docs (echo "Please, install https://github.com/norwoodj/helm-docs first" && exit 1) .PHONY: schema -schema: ## Generate charts' schema usign helm schema-gen plugin - @helm schema-gen charts/cloudnative-pg/values.yaml > charts/cloudnative-pg/values.schema.json || \ - (echo "Please, run: helm plugin install https://github.com/karuppiah7890/helm-schema-gen.git" && exit 1) +schema: cloudnative-pg-schema cluster-schema ## Generate charts' schema using helm-schema-gen -.PHONY: pgbench-deploy -pgbench-deploy: ## Installs pgbench chart - helm dependency update charts/pgbench - helm upgrade --install pgbench --atomic charts/pgbench +cloudnative-pg-schema: + @helm schema-gen charts/cloudnative-pg/values.yaml | cat > charts/cloudnative-pg/values.schema.json || \ + (echo "Please, run: helm plugin install https://github.com/karuppiah7890/helm-schema-gen.git" && exit 1) -.PHONY: pgbench-uninstall -pgbench-uninstall: ## Uninstalls cnpg-pgbench chart if present - @helm 
uninstall pgbench
+cluster-schema:
+	@helm schema-gen charts/cluster/values.yaml | cat > charts/cluster/values.schema.json || \
+		(echo "Please, run: helm plugin install https://github.com/karuppiah7890/helm-schema-gen.git" && exit 1)
diff --git a/charts/cluster/README.md b/charts/cluster/README.md
index d334c4389..812685364 100644
--- a/charts/cluster/README.md
+++ b/charts/cluster/README.md
@@ -88,9 +88,9 @@ Additionally you can specify the following parameters:
 ```yaml
 backups:
   scheduledBackups:
-  - name: daily-backup
-    schedule: "0 0 0 * * *" # Daily at midnight
-    backupOwnerReference: self
+    - name: daily-backup
+      schedule: "0 0 0 * * *" # Daily at midnight
+      backupOwnerReference: self
 ```
 
 Each backup adapter takes its own set of parameters, listed in the [Configuration options](#Configuration-options) section
@@ -149,8 +149,10 @@ refer to the [CloudNativePG Documentation](https://cloudnative-pg.io/documentat
 | cluster.instances | int | `3` | Number of instances |
 | cluster.logLevel | string | `"info"` | The instances' log level, one of the following values: error, warning, info (default), debug, trace |
 | cluster.monitoring.customQueries | list | `[]` |  |
-| cluster.monitoring.enablePodMonitor | bool | `false` |  |
-| cluster.postgresql | string | `nil` | Configuration of the PostgreSQL server See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-PostgresConfiguration |
+| cluster.monitoring.enabled | bool | `false` |  |
+| cluster.monitoring.podMonitor.enabled | bool | `true` |  |
+| cluster.monitoring.prometheusRule.enabled | bool | `true` |  |
+| cluster.postgresql | object | `{}` | Configuration of the PostgreSQL server See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-PostgresConfiguration |
 | cluster.primaryUpdateMethod | string | `"switchover"` | Method to follow to upgrade the primary server during a rolling update procedure, after all replicas have been successfully updated. It can be switchover (default) or in-place (restart). |
 | cluster.primaryUpdateStrategy | string | `"unsupervised"` | Strategy to follow to upgrade the primary server during a rolling update procedure, after all replicas have been successfully updated: it can be automated (unsupervised - default) or manual (supervised) |
 | cluster.priorityClassName | string | `""` |  |
diff --git a/charts/cluster/docs/runbooks/CNPGClusterHACritical.md b/charts/cluster/docs/runbooks/CNPGClusterHACritical.md
new file mode 100644
index 000000000..8be576c32
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterHACritical.md
@@ -0,0 +1,49 @@
+CNPGClusterHACritical
+=====================
+
+Meaning
+-------
+
+The `CNPGClusterHACritical` alert is triggered when the CloudNativePG cluster has no ready standby replicas.
+
+This can happen during either a normal failover or automated minor version upgrades in a cluster with 2 or fewer
+instances. The replaced instance may need some time to catch up with the cluster primary instance.
+
+This alarm will always be triggered if your cluster is configured to run with only 1 instance. In this case you
+may want to silence it.
+
+Impact
+------
+
+Having no available replicas puts your cluster at severe risk if the primary instance fails. The primary instance is
+still online and able to serve queries, although connections to the `-ro` endpoint will fail.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
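+
+If the CloudNativePG `cnpg` kubectl plugin is installed, you can also get a quick overview of the cluster topology
+and its replication status (a sketch; `<cluster-name>` and `<namespace>` are placeholders for your own deployment):
+
+```bash
+kubectl cnpg status <cluster-name> --namespace <namespace>
+```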
+
+Get the status of the CloudNativePG cluster instances:
+
+```bash
+kubectl get pods -A -l "cnpg.io/podRole=instance" -o wide
+```
+
+Check the logs of the affected CloudNativePG instances:
+
+```bash
+kubectl logs --namespace <namespace> pod/<pod-name>
+```
+
+Check the CloudNativePG operator logs:
+
+```bash
+kubectl logs --namespace cnpg-system -l "app.kubernetes.io/name=cloudnative-pg"
+```
+
+Mitigation
+----------
+
+Refer to the [CloudNativePG Failure Modes](https://cloudnative-pg.io/documentation/current/failure_modes/)
+and [CloudNativePG Troubleshooting](https://cloudnative-pg.io/documentation/current/troubleshooting/) documentation for
+more information on how to troubleshoot and mitigate this issue.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md b/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md
new file mode 100644
index 000000000..80acfad96
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md
@@ -0,0 +1,51 @@
+CNPGClusterHAWarning
+====================
+
+Meaning
+-------
+
+The `CNPGClusterHAWarning` alert is triggered when the CloudNativePG cluster has fewer than `2` ready standby replicas.
+
+This alarm will always be triggered if your cluster is configured to run with fewer than `3` instances. In this case you
+may want to silence it.
+
+Impact
+------
+
+Having fewer than two available replicas puts your cluster at risk if another instance fails. The cluster is still able
+to operate normally, although the `-ro` and `-r` endpoints operate at reduced capacity.
+
+This can happen during a normal failover or automated minor version upgrades. The replaced instance may need some time
+to catch up with the cluster primary instance, which will trigger the alert if the operation takes more than 5 minutes.
+
+At `0` available ready replicas, a `CNPGClusterHACritical` alert will be triggered.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+Get the status of the CloudNativePG cluster instances:
+
+```bash
+kubectl get pods -A -l "cnpg.io/podRole=instance" -o wide
+```
+
+Check the logs of the affected CloudNativePG instances:
+
+```bash
+kubectl logs --namespace <namespace> pod/<pod-name>
+```
+
+Check the CloudNativePG operator logs:
+
+```bash
+kubectl logs --namespace cnpg-system -l "app.kubernetes.io/name=cloudnative-pg"
+```
+
+Mitigation
+----------
+
+Refer to the [CloudNativePG Failure Modes](https://cloudnative-pg.io/documentation/current/failure_modes/)
+and [CloudNativePG Troubleshooting](https://cloudnative-pg.io/documentation/current/troubleshooting/) documentation for
+more information on how to troubleshoot and mitigate this issue.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md b/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md
new file mode 100644
index 000000000..2003421b9
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md
@@ -0,0 +1,24 @@
+CNPGClusterHighConnectionsCritical
+==================================
+
+Meaning
+-------
+
+This alert is triggered when the number of connections to the CloudNativePG cluster instance exceeds 95% of its capacity.
+
+Impact
+------
+
+At 100% capacity, the CloudNativePG cluster instance will not be able to accept new connections. This will result in a service
+disruption.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
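+
+You can also compare the number of active backends against `max_connections` directly on the primary (a sketch;
+`<namespace>` and `<cluster-name>` are placeholders for your own deployment):
+
+```bash
+kubectl exec --namespace <namespace> --stdin --tty services/<cluster-name>-rw -- \
+  psql -c "SELECT count(*) AS connections, current_setting('max_connections') AS max_connections FROM pg_stat_activity;"
+```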
+
+Mitigation
+----------
+
+* Increase the maximum number of connections by raising the `max_connections` PostgreSQL parameter.
+* Use connection pooling by enabling PgBouncer to reduce the number of connections to the database.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md b/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md
new file mode 100644
index 000000000..636579f75
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md
@@ -0,0 +1,24 @@
+CNPGClusterHighConnectionsWarning
+=================================
+
+Meaning
+-------
+
+This alert is triggered when the number of connections to the CloudNativePG cluster instance exceeds 85% of its capacity.
+
+Impact
+------
+
+At 100% capacity, the CloudNativePG cluster instance will not be able to accept new connections. This will result in a service
+disruption.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+Mitigation
+----------
+
+* Increase the maximum number of connections by raising the `max_connections` PostgreSQL parameter.
+* Use connection pooling by enabling PgBouncer to reduce the number of connections to the database.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md b/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md
new file mode 100644
index 000000000..78963ce09
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md
@@ -0,0 +1,31 @@
+CNPGClusterHighReplicationLag
+=============================
+
+Meaning
+-------
+
+This alert is triggered when the replication lag of the CloudNativePG cluster exceeds `1s`.
+
+Impact
+------
+
+High replication lag can cause the cluster replicas to fall out of sync. Queries to the `-r` and `-ro` endpoints may return stale data.
+In the event of a failover, there may be data loss for the time period of the lag.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+High replication lag can be caused by a number of factors, including:
+* Network issues
+* High load on the primary or replicas
+* Long-running queries
+* Suboptimal PostgreSQL configuration, in particular a small `max_wal_senders` value.
+
+Check the replication status on the primary:
+
+```bash
+kubectl exec --namespace <namespace> --stdin --tty services/<cluster-name>-rw -- psql -c "SELECT * from pg_stat_replication;"
+```
+
+Mitigation
+----------
+
+Address the cause identified during diagnosis: resolve network issues, reduce the load on the primary or the replicas,
+tune or terminate long-running queries, or adjust the relevant PostgreSQL settings (for example `max_wal_senders`).
diff --git a/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md b/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md
new file mode 100644
index 000000000..df309ffa9
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md
@@ -0,0 +1,28 @@
+CNPGClusterInstancesOnSameNode
+==============================
+
+Meaning
+-------
+
+The `CNPGClusterInstancesOnSameNode` alert is raised when two or more database pods are scheduled on the same node.
+
+Impact
+------
+
+A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+```bash
+kubectl get pods -A -l "cnpg.io/podRole=instance" -o wide
+```
+
+Mitigation
+----------
+
+1. Verify you have more than a single node without taints that would prevent pods from being scheduled there.
+2. Verify your [affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/) configuration.
+3. For more information, please refer to the ["Scheduling"](https://cloudnative-pg.io/documentation/current/scheduling/) section in the documentation.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md b/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md
new file mode 100644
index 000000000..5b7355275
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md
@@ -0,0 +1,31 @@
+CNPGClusterLowDiskSpaceCritical
+===============================
+
+Meaning
+-------
+
+This alert is triggered when disk space usage on the CloudNativePG cluster exceeds 90%. It can be triggered by any of the following:
+
+* the PVC hosting the `PGDATA` (`storage` section)
+* the PVC hosting WAL files (`walStorage` section), where applicable
+* any PVC hosting a tablespace (`tablespaces` section)
+
+Impact
+------
+
+Excessive disk space usage can lead to fragmentation, negatively impacting performance. Reaching 100% disk usage will result
+in downtime and data loss.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+Mitigation
+----------
+
+If you experience issues with the WAL (Write-Ahead Logging) volume and have
+set up continuous archiving, ensure that WAL archiving is functioning
+correctly. This is crucial to avoid a buildup of WAL files in the `pg_wal`
+folder. Monitor the `cnpg_collector_pg_wal_archive_status` metric, specifically
+ensuring that the number of `ready` files does not increase linearly.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md b/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md
new file mode 100644
index 000000000..36e56acf1
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md
@@ -0,0 +1,31 @@
+CNPGClusterLowDiskSpaceWarning
+==============================
+
+Meaning
+-------
+
+This alert is triggered when disk space usage on the CloudNativePG cluster exceeds 70%. It can be triggered by any of the following:
+
+* the PVC hosting the `PGDATA` (`storage` section)
+* the PVC hosting WAL files (`walStorage` section), where applicable
+* any PVC hosting a tablespace (`tablespaces` section)
+
+Impact
+------
+
+Excessive disk space usage can lead to fragmentation, negatively impacting performance. Reaching 100% disk usage will result
+in downtime and data loss.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+Mitigation
+----------
+
+If you experience issues with the WAL (Write-Ahead Logging) volume and have
+set up continuous archiving, ensure that WAL archiving is functioning
+correctly. This is crucial to avoid a buildup of WAL files in the `pg_wal`
+folder. Monitor the `cnpg_collector_pg_wal_archive_status` metric, specifically
+ensuring that the number of `ready` files does not increase linearly.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterOffline.md b/charts/cluster/docs/runbooks/CNPGClusterOffline.md
new file mode 100644
index 000000000..0e69db15b
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterOffline.md
@@ -0,0 +1,43 @@
+CNPGClusterOffline
+==================
+
+Meaning
+-------
+
+The `CNPGClusterOffline` alert is triggered when there are no ready CloudNativePG instances.
+
+Impact
+------
+
+Having an offline cluster means your applications will not be able to access the database, leading to potential service
+disruption.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+Get the status of the CloudNativePG cluster instances:
+
+```bash
+kubectl get pods -A -l "cnpg.io/podRole=instance" -o wide
+```
+
+Check the logs of the affected CloudNativePG instances:
+
+```bash
+kubectl logs --namespace <namespace> pod/<pod-name>
+```
+
+Check the CloudNativePG operator logs:
+
+```bash
+kubectl logs --namespace cnpg-system -l "app.kubernetes.io/name=cloudnative-pg"
+```
+
+Mitigation
+----------
+
+Refer to the [CloudNativePG Failure Modes](https://cloudnative-pg.io/documentation/current/failure_modes/)
+and [CloudNativePG Troubleshooting](https://cloudnative-pg.io/documentation/current/troubleshooting/) documentation for
+more information on how to troubleshoot and mitigate this issue.
diff --git a/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md b/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
new file mode 100644
index 000000000..c5bdb05da
--- /dev/null
+++ b/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
@@ -0,0 +1,37 @@
+CNPGClusterZoneSpreadWarning
+============================
+
+Meaning
+-------
+
+The `CNPGClusterZoneSpreadWarning` alert is raised when pods are not evenly distributed across availability zones. More
+precisely, it is raised when the instances span fewer than 3 availability zones and fewer zones than there are instances.
+
+Impact
+------
+
+The uneven distribution of pods across availability zones can lead to a single point of failure if a zone goes down.
+
+Diagnosis
+---------
+
+Use the [CloudNativePG Grafana Dashboard](https://grafana.com/grafana/dashboards/20417-cloudnativepg/).
+
+Get the status of the CloudNativePG cluster instances:
+
+```bash
+kubectl get pods -A -l "cnpg.io/podRole=instance" -o wide
+```
+
+Get the nodes and their respective zones:
+
+```bash
+kubectl get nodes --label-columns topology.kubernetes.io/zone
+```
+
+Mitigation
+----------
+
+1. Verify that each availability zone you want to use has at least one node without taints that would prevent pods from being scheduled there.
+2. Verify your [affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/) configuration.
+3. Delete the pods and their respective PVCs that are not in the desired availability zone and allow the operator to repair the cluster, as shown in the example below.
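+
+For example, to move one instance out of an undesired zone you can remove that instance's PVC and pod and let the
+operator recreate the instance elsewhere (a sketch; `<cluster-name>-<n>` and `<namespace>` are placeholders — prefer
+doing this on a replica rather than the current primary, and double-check which instance you are removing first):
+
+```bash
+# Deleting the PVC first is safe: the deletion stays pending until the pod that mounts it is gone.
+kubectl delete pvc --namespace <namespace> <cluster-name>-<n> --wait=false
+# If the cluster uses a separate WAL volume, also delete its PVC, e.g. <cluster-name>-<n>-wal.
+kubectl delete pod --namespace <namespace> <cluster-name>-<n>
+```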
diff --git a/charts/cluster/examples/custom-queries.yaml b/charts/cluster/examples/custom-queries.yaml index 1e6ef16f6..7995202d8 100644 --- a/charts/cluster/examples/custom-queries.yaml +++ b/charts/cluster/examples/custom-queries.yaml @@ -4,6 +4,7 @@ mode: standalone cluster: instances: 1 monitoring: + enabled: true customQueries: - name: "pg_cache_hit" query: | @@ -20,4 +21,4 @@ cluster: description: "Cache hit ratio" backups: - enabled: false \ No newline at end of file + enabled: false diff --git a/charts/cluster/templates/NOTES.txt b/charts/cluster/templates/NOTES.txt index 28c0e6172..dd5142ecc 100644 --- a/charts/cluster/templates/NOTES.txt +++ b/charts/cluster/templates/NOTES.txt @@ -42,20 +42,21 @@ Configuration {{ $scheduledBackups = printf "%s, %s" $scheduledBackups .name }} {{- end -}} -╭───────────────────┬────────────────────────────────────────────╮ -│ Configuration │ Value │ -┝━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ -│ Cluster mode │ {{ (printf "%-42s" .Values.mode) }} │ -│ Type │ {{ (printf "%-42s" .Values.type) }} │ -│ Image │ {{ include "cluster.color-info" (printf "%-42s" (include "cluster.imageName" .)) }} │ -│ Instances │ {{ include (printf "%s%s" "cluster.color-" $redundancyColor) (printf "%-42s" (toString .Values.cluster.instances)) }} │ -│ Backups │ {{ include (printf "%s%s" "cluster.color-" (ternary "ok" "error" .Values.backups.enabled)) (printf "%-42s" (ternary "Enabled" "Disabled" .Values.backups.enabled)) }} │ -│ Backup Provider │ {{ (printf "%-42s" (title .Values.backups.provider)) }} │ -│ Scheduled Backups │ {{ (printf "%-42s" $scheduledBackups) }} │ -│ Storage │ {{ (printf "%-42s" .Values.cluster.storage.size) }} │ -│ Storage Class │ {{ (printf "%-42s" (default "Default" .Values.cluster.storage.storageClass)) }} │ -│ PGBouncer │ {{ (printf "%-42s" (ternary "Enabled" "Disabled" .Values.pooler.enabled)) }} │ -╰───────────────────┴────────────────────────────────────────────╯ +╭───────────────────┬────────────────────────────────────────────────────────╮ +│ Configuration │ Value │ +┝━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ +│ Cluster mode │ {{ (printf "%-54s" .Values.mode) }} │ +│ Type │ {{ (printf "%-54s" .Values.type) }} │ +│ Image │ {{ include "cluster.color-info" (printf "%-54s" (include "cluster.imageName" .)) }} │ +│ Instances │ {{ include (printf "%s%s" "cluster.color-" $redundancyColor) (printf "%-54s" (toString .Values.cluster.instances)) }} │ +│ Backups │ {{ include (printf "%s%s" "cluster.color-" (ternary "ok" "error" .Values.backups.enabled)) (printf "%-54s" (ternary "Enabled" "Disabled" .Values.backups.enabled)) }} │ +│ Backup Provider │ {{ (printf "%-54s" (title .Values.backups.provider)) }} │ +│ Scheduled Backups │ {{ (printf "%-54s" $scheduledBackups) }} │ +│ Storage │ {{ (printf "%-54s" .Values.cluster.storage.size) }} │ +│ Storage Class │ {{ (printf "%-54s" (default "Default" .Values.cluster.storage.storageClass)) }} │ +│ PGBouncer │ {{ (printf "%-54s" (ternary "Enabled" "Disabled" .Values.pooler.enabled)) }} │ +│ Monitoring │ {{ include (printf "%s%s" "cluster.color-" (ternary "ok" "error" .Values.cluster.monitoring.enabled)) (printf "%-54s" (ternary "Enabled" "Disabled" .Values.cluster.monitoring.enabled)) }} │ +╰───────────────────┴────────────────────────────────────────────────────────╯ {{ if not .Values.backups.enabled }} {{- include "cluster.color-error" "Warning! Backups not enabled. Recovery will not be possible! 
Do not use this configuration in production.\n" }} diff --git a/charts/cluster/templates/_helpers.tpl b/charts/cluster/templates/_helpers.tpl index b00846d60..db3c253e5 100644 --- a/charts/cluster/templates/_helpers.tpl +++ b/charts/cluster/templates/_helpers.tpl @@ -48,6 +48,7 @@ Selector labels {{- define "cluster.selectorLabels" -}} app.kubernetes.io/name: {{ include "cluster.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} +app.kubernetes.io/part-of: cloudnative-pg {{- end }} {{/* diff --git a/charts/cluster/templates/cluster.yaml b/charts/cluster/templates/cluster.yaml index 4ec251698..1410ded56 100644 --- a/charts/cluster/templates/cluster.yaml +++ b/charts/cluster/templates/cluster.yaml @@ -54,7 +54,7 @@ spec: {{ end }} monitoring: - enablePodMonitor: {{ .Values.cluster.monitoring.enablePodMonitor }} + enablePodMonitor: {{ and .Values.cluster.monitoring.enabled .Values.cluster.monitoring.podMonitor.enabled }} {{- if not (empty .Values.cluster.monitoring.customQueries) }} customQueriesConfigMap: - name: {{ include "cluster.fullname" . }}-monitoring diff --git a/charts/cluster/templates/prometheus-rule.yaml b/charts/cluster/templates/prometheus-rule.yaml new file mode 100644 index 000000000..3da33025b --- /dev/null +++ b/charts/cluster/templates/prometheus-rule.yaml @@ -0,0 +1,177 @@ +{{- if and .Values.cluster.monitoring.enabled .Values.cluster.monitoring.prometheusRule.enabled -}} +{{- $value := "{{ $value }}" -}} +{{- $namespace := .Release.Namespace -}} +{{- $cluster := printf "%s/%s" $namespace (include "cluster.fullname" .)}} +{{- $labels := dict "job" "{{ $labels.job }}" "node" "{{ $labels.node }}" "pod" "{{ $labels.pod }}" -}} +{{- $podSelector := printf "%s-([1-9][0-9]*)$" (include "cluster.fullname" .) -}} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + {{- include "cluster.labels" . | nindent 4 }} + {{- with .Values.cluster.additionalLabels }} + {{ toYaml . | nindent 4 }} + {{- end }} + name: {{ include "cluster.fullname" . }}-alert-rules +spec: + groups: + - name: cloudnative-pg/{{ include "cluster.fullname" . }} + rules: + - alert: CNPGClusterHAWarning + annotations: + summary: CNPG Cluster less than 2 standby replicas. + description: |- + CloudNativePG Cluster "{{ $labels.job }}" has only {{ $value }} standby replicas, putting + your cluster at risk if another instance fails. The cluster is still able to operate normally, although + the `-ro` and `-r` endpoints operate at reduced capacity. + + This can happen during a normal fail-over or automated minor version upgrades. The replaced instance may + need some time to catch-up with the cluster primary instance. + + This alarm will be constantly triggered if your cluster is configured to run with less than 3 instances. + In this case you may want to silence it. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md + expr: | + max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ $namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ $namespace }}"}) < 2 + for: 5m + labels: + severity: warning + - alert: CNPGClusterHACritical + annotations: + summary: CNPG Cluster has no standby replicas! + description: |- + CloudNativePG Cluster "{{ $labels.job }}" has no ready standby replicas. Your cluster at a severe + risk of data loss and downtime if the primary instance fails. + + The primary instance is still online and able to serve queries, although connections to the `-ro` endpoint + will fail. 
The `-r` endpoint is operating at reduced capacity and all traffic is being served by the primary instance.
+
+            This can happen during a normal fail-over or automated minor version upgrades in a cluster with 2 or fewer
+            instances. The replaced instance may need some time to catch up with the cluster primary instance.
+
+            This alarm will always be triggered if your cluster is configured to run with only 1 instance. In this
+            case you may want to silence it.
+          runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHACritical.md
+        expr: |
+          max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ $namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ $namespace }}"}) < 1
+        for: 5m
+        labels:
+          severity: critical
+      - alert: CNPGClusterOffline
+        annotations:
+          summary: CNPG Cluster has no running instances!
+          description: |-
+            CloudNativePG Cluster "{{ $labels.job }}" has no ready instances.
+
+            Having an offline cluster means your applications will not be able to access the database, leading to
+            potential service disruption and/or data loss.
+          runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md
+        expr: |
+          ({{ .Values.cluster.instances }} - count(cnpg_collector_up{namespace=~"{{ $namespace }}",pod=~"{{ $podSelector }}"}) OR vector(0)) > 0
+        for: 5m
+        labels:
+          severity: critical
+      - alert: CNPGClusterZoneSpreadWarning
+        annotations:
+          summary: CNPG Cluster instances in the same zone.
+          description: |-
+            CloudNativePG Cluster "{{ $cluster }}" has instances in the same availability zone.
+
+            A disaster in one availability zone will lead to a potential service disruption and/or data loss.
+          runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
+        expr: |
+          {{ .Values.cluster.instances }} > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3
+        for: 5m
+        labels:
+          severity: warning
+      - alert: CNPGClusterInstancesOnSameNode
+        annotations:
+          summary: CNPG Cluster instances are located on the same node.
+          description: |-
+            CloudNativePG Cluster "{{ $cluster }}" has {{ $value }}
+            instances on the same node {{ $labels.node }}.
+
+            A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss.
+          runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md
+        expr: |
+          count by (node) (kube_pod_info{namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) > 1
+        for: 5m
+        labels:
+          severity: warning
+      - alert: CNPGClusterHighReplicationLag
+        annotations:
+          summary: CNPG Cluster high replication lag
+          description: |-
+            CloudNativePG Cluster "{{ $cluster }}" is experiencing a high replication lag of
+            {{ "{{ $value }}" }}ms.
+
+            High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration.
+          runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md
+        expr: |
+          max(cnpg_pg_replication_lag{namespace=~"{{ $namespace }}",pod=~"{{ $podSelector }}"}) * 1000 > 1000
+        for: 5m
+        labels:
+          severity: warning
+      - alert: CNPGClusterHighConnectionsWarning
+        annotations:
+          summary: CNPG Instance is approaching the maximum number of connections.
+ description: |- + CloudNativePG Cluster "{{ $cluster }}" instance {{ $labels.pod }} is using {{ "{{ $value }}" }}% of + the maximum number of connections. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md + expr: | + sum by (pod) (cnpg_backends_total{namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) * 100 > 80 + for: 5m + labels: + severity: warning + - alert: CNPGClusterHighConnectionsCritical + annotations: + summary: CNPG Instance maximum number of connections critical! + description: |- + CloudNativePG Cluster "{{ $cluster }}" instance {{ $labels.pod }} is using {{ "{{ $value }}" }}% of + the maximum number of connections. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md + expr: | + sum by (pod) (cnpg_backends_total{namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) * 100 > 95 + for: 5m + labels: + severity: critical + - alert: CNPGClusterLowDiskSpaceWarning + annotations: + summary: CNPG Instance is running out of disk space. + description: |- + CloudNativePG Cluster "{{ $cluster }}" is running low on disk space. Check attached PVCs. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md + expr: | + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}"})) > 0.7 OR + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-wal"} / kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-wal"})) > 0.7 OR + max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-tbs.*"}) + / + sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-tbs.*"}) + * + on(namespace, persistentvolumeclaim) group_left(volume) + kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ $podSelector }}"} + ) > 0.7 + for: 5m + labels: + severity: warning + - alert: CNPGClusterLowDiskSpaceCritical + annotations: + summary: CNPG Instance is running out of disk space! + description: |- + CloudNativePG Cluster "{{ $cluster }}" is running extremely low on disk space. Check attached PVCs! 
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md + expr: | + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}"})) > 0.9 OR + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-wal"} / kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-wal"})) > 0.9 OR + max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-tbs.*"}) + / + sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-tbs.*"}) + * + on(namespace, persistentvolumeclaim) group_left(volume) + kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ $podSelector }}"} + ) > 0.9 + for: 5m + labels: + severity: warning +{{ end }} diff --git a/charts/cluster/values.schema.json b/charts/cluster/values.schema.json new file mode 100644 index 000000000..49550780b --- /dev/null +++ b/charts/cluster/values.schema.json @@ -0,0 +1,342 @@ +{ + "$schema": "http://json-schema.org/schema#", + "type": "object", + "properties": { + "backups": { + "type": "object", + "properties": { + "azure": { + "type": "object", + "properties": { + "connectionString": { + "type": "string" + }, + "containerName": { + "type": "string" + }, + "inheritFromAzureAD": { + "type": "boolean" + }, + "path": { + "type": "string" + }, + "serviceName": { + "type": "string" + }, + "storageAccount": { + "type": "string" + }, + "storageKey": { + "type": "string" + }, + "storageSasToken": { + "type": "string" + } + } + }, + "destinationPath": { + "type": "string" + }, + "enabled": { + "type": "boolean" + }, + "endpointURL": { + "type": "string" + }, + "google": { + "type": "object", + "properties": { + "applicationCredentials": { + "type": "string" + }, + "bucket": { + "type": "string" + }, + "gkeEnvironment": { + "type": "boolean" + }, + "path": { + "type": "string" + } + } + }, + "provider": { + "type": "string" + }, + "retentionPolicy": { + "type": "string" + }, + "s3": { + "type": "object", + "properties": { + "accessKey": { + "type": "string" + }, + "bucket": { + "type": "string" + }, + "path": { + "type": "string" + }, + "region": { + "type": "string" + }, + "secretKey": { + "type": "string" + } + } + }, + "scheduledBackups": { + "type": "array", + "items": { + "type": "object", + "properties": { + "backupOwnerReference": { + "type": "string" + }, + "name": { + "type": "string" + }, + "schedule": { + "type": "string" + } + } + } + } + } + }, + "cluster": { + "type": "object", + "properties": { + "additionalLabels": { + "type": "object" + }, + "affinity": { + "type": "object", + "properties": { + "topologyKey": { + "type": "string" + } + } + }, + "annotations": { + "type": "object" + }, + "certificates": { + "type": "null" + }, + "enableSuperuserAccess": { + "type": "boolean" + }, + "imageName": { + "type": "string" + }, + "imagePullPolicy": { + "type": "string" + }, + "imagePullSecrets": { + "type": "array" + }, + "initdb": { + "type": "object" + }, + "instances": { + "type": "integer" + }, + "logLevel": { + "type": "string" + }, + "monitoring": { 
+ "type": "object", + "properties": { + "customQueries": { + "type": "array" + }, + "enabled": { + "type": "boolean" + }, + "podMonitor": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + } + }, + "prometheusRule": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + } + } + } + }, + "postgresql": { + "type": "object" + }, + "primaryUpdateMethod": { + "type": "string" + }, + "primaryUpdateStrategy": { + "type": "string" + }, + "priorityClassName": { + "type": "string" + }, + "resources": { + "type": "null" + }, + "storage": { + "type": "object", + "properties": { + "size": { + "type": "string" + }, + "storageClass": { + "type": "string" + } + } + }, + "superuserSecret": { + "type": "string" + } + } + }, + "fullnameOverride": { + "type": "string" + }, + "mode": { + "type": "string" + }, + "nameOverride": { + "type": "string" + }, + "pooler": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "instances": { + "type": "integer" + }, + "parameters": { + "type": "object", + "properties": { + "default_pool_size": { + "type": "string" + }, + "max_client_conn": { + "type": "string" + } + } + }, + "poolMode": { + "type": "string" + } + } + }, + "recovery": { + "type": "object", + "properties": { + "azure": { + "type": "object", + "properties": { + "connectionString": { + "type": "string" + }, + "containerName": { + "type": "string" + }, + "inheritFromAzureAD": { + "type": "boolean" + }, + "path": { + "type": "string" + }, + "serviceName": { + "type": "string" + }, + "storageAccount": { + "type": "string" + }, + "storageKey": { + "type": "string" + }, + "storageSasToken": { + "type": "string" + } + } + }, + "backupName": { + "type": "string" + }, + "clusterName": { + "type": "string" + }, + "destinationPath": { + "type": "string" + }, + "endpointURL": { + "type": "string" + }, + "google": { + "type": "object", + "properties": { + "applicationCredentials": { + "type": "string" + }, + "bucket": { + "type": "string" + }, + "gkeEnvironment": { + "type": "boolean" + }, + "path": { + "type": "string" + } + } + }, + "method": { + "type": "string" + }, + "pitrTarget": { + "type": "object", + "properties": { + "time": { + "type": "string" + } + } + }, + "provider": { + "type": "string" + }, + "s3": { + "type": "object", + "properties": { + "accessKey": { + "type": "string" + }, + "bucket": { + "type": "string" + }, + "path": { + "type": "string" + }, + "region": { + "type": "string" + }, + "secretKey": { + "type": "string" + } + } + } + } + }, + "type": { + "type": "string" + } + } +} diff --git a/charts/cluster/values.yaml b/charts/cluster/values.yaml index dec1fc9b4..02f967133 100644 --- a/charts/cluster/values.yaml +++ b/charts/cluster/values.yaml @@ -132,7 +132,11 @@ cluster: superuserSecret: "" monitoring: - enablePodMonitor: false + enabled: false + podMonitor: + enabled: true + prometheusRule: + enabled: true customQueries: [] # - name: "pg_cache_hit_ratio" # query: "SELECT current_database() as datname, sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)) as ratio FROM pg_statio_user_tables;" @@ -146,7 +150,8 @@ cluster: # -- Configuration of the PostgreSQL server # See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-PostgresConfiguration - postgresql: + postgresql: {} + # max_connections: 300 # -- BootstrapInitDB is the configuration of the bootstrap process when initdb is used # See: https://cloudnative-pg.io/documentation/current/bootstrap/