From 7ad9b9786bbe0cdbf6bf81144fa669bf125d1c57 Mon Sep 17 00:00:00 2001
From: Nilesh Gadgi
Date: Fri, 5 Jan 2024 21:56:54 +0530
Subject: [PATCH] Feat: Introducing Grafana (7.0.2) and Prometheus (25.4.0) Addon (#44)

* feat: introducing jaeger and prometheus
* update default override for testing
* ran terraform format command
* update ref branch in ci
* readme: Update add-on's readme
* update default setting for jaeger dependency
* readme: Update add-on's readme
* updated jaeger with manifest file deployment
* readme: Update add-on's readme
* feat: added grafana helm addon
* readme: Update add-on's readme
* fix: fix typo in grafana resource
* fix: fix terraform format
* fix: remove test example
* fix: added grafana in complete example
* feat: update multi-document apply in single yaml for jaeger addon
* ran terraform format command
* fix- update trigger point for readme workflow
* fix: added output in the root structure to get addons details in example
* fix- terraform code format command ran
* fix: removed jaeger manifest deployment
* fix- terraform code format command ran
* fix: added jaeger helm config file for jaeger deploy
* fix- terraform code format command ran
* feat- added vs for grafana service
* feat- added grafana virtual service and update prometheus readme
* feat- added grafana virtual service and update prometheus readme
* fix: remove jaeger addon

---------

Co-authored-by: Anmol Nagpal
Co-authored-by: clouddrove-ci <84795582+clouddrove-ci@users.noreply.github.com>
Co-authored-by: Himanshu Ahirwar
---
 .github/workflows/readme.yml | 10 +-
 .../complete/config/grafana/grafana-vs.yaml | 16 +
 .../config/grafana/override-grafana.yaml | 16 +
 .../complete/config/override-prometheus.yaml | 36 +
 _examples/complete/main.tf | 9 +-
 _examples/complete/providers.tf | 15 +-
 _examples/complete/variables.tf | 26 +
 _examples/complete/versions.tf | 2 +-
 .../config/grafana/grafana-vs.yaml | 16 +
 .../config/grafana/override-grafana.yaml | 16 +
 .../external-eks/config/override-grafana.yaml | 16 +
 .../config/override-prometheus.yaml | 47 +
 _examples/external-eks/main.tf | 9 +-
 _examples/external-eks/providers.tf | 15 +-
 _examples/external-eks/variables.tf | 26 +
 addons/aws-load-balancer-controller/locals.tf | 2 +-
 addons/external-dns/main.tf | 1 -
 addons/grafana/README.md | 57 +
 addons/grafana/config/grafana.yaml | 1269 +++++++++++++++++
 addons/grafana/local.tf | 41 +
 addons/grafana/main.tf | 13 +
 addons/grafana/output.tf | 11 +
 addons/grafana/variable.tf | 38 +
 addons/grafana/version.tf | 14 +
 .../config/monitoring/jaeger.yaml | 117 --
 addons/prometheus/README.md | 66 +
 addons/prometheus/config/prometheus.yaml | 1233 ++++++++++++++++
 addons/prometheus/local.tf | 41 +
 addons/prometheus/main.tf | 7 +
 addons/prometheus/output.tf | 11 +
 addons/prometheus/variable.tf | 32 +
 addons/prometheus/version.tf | 10 +
 main.tf | 20 +
 outputs.tf | 28 +
 override_vales/prometheus.yaml | 13 +
 override_values.tf | 90 +-
 variables.tf | 48 +
 versions.tf | 2 +-
 38 files changed, 3283 insertions(+), 156 deletions(-)
 create mode 100644 _examples/complete/config/grafana/grafana-vs.yaml
 create mode 100644 _examples/complete/config/grafana/override-grafana.yaml
 create mode 100644 _examples/complete/config/override-prometheus.yaml
 create mode 100644 _examples/external-eks/config/grafana/grafana-vs.yaml
 create mode 100644 _examples/external-eks/config/grafana/override-grafana.yaml
 create mode 100644 _examples/external-eks/config/override-grafana.yaml
 create mode 100644 _examples/external-eks/config/override-prometheus.yaml
 create mode 100644 addons/grafana/README.md
 create mode 100644 addons/grafana/config/grafana.yaml
 create mode 100644 addons/grafana/local.tf
 create mode 100644 addons/grafana/main.tf
 create mode 100644 addons/grafana/output.tf
 create mode 100644 addons/grafana/variable.tf
 create mode 100644 addons/grafana/version.tf
 delete mode 100644 addons/kiali-server/config/monitoring/jaeger.yaml
 create mode 100644 addons/prometheus/README.md
 create mode 100644 addons/prometheus/config/prometheus.yaml
 create mode 100644 addons/prometheus/local.tf
 create mode 100644 addons/prometheus/main.tf
 create mode 100644 addons/prometheus/output.tf
 create mode 100644 addons/prometheus/variable.tf
 create mode 100644 addons/prometheus/version.tf
 create mode 100644 override_vales/prometheus.yaml

diff --git a/.github/workflows/readme.yml b/.github/workflows/readme.yml
index 2772691..33e5fd6 100644
--- a/.github/workflows/readme.yml
+++ b/.github/workflows/readme.yml
@@ -5,8 +5,8 @@ on:
   push:
     branches:
       - master
-    paths:
-      - '_examples/**'
+    paths-ignore:
+      - '**/*README.md'
   workflow_dispatch:

 jobs:
@@ -26,7 +26,7 @@ jobs:
         uses: actions/checkout@master
         with:
           fetch-depth: 0
-          ref: master
+          ref: ${{ github.head_ref }}
           token: ${{ env.GH_TOKEN }}

       - name: 'Set up Python 3.7'
@@ -84,13 +84,13 @@ jobs:
         uses: actions/checkout@master
         with:
           fetch-depth: 0
-          ref: master
+          ref: ${{ github.head_ref }}
           token: ${{ env.GH_TOKEN }}

       - name: Generate TF Docs
         uses: terraform-docs/gh-actions@v1.0.0
         with:
-          working-dir: addons/aws-ebs-csi-driver,addons/aws-efs-csi-driver,addons/aws-load-balancer-controller,addons/aws-node-termination-handler,addons/calico-tigera,addons/cluster-autoscaler,addons/external-secrets,addons/fluent-bit,addons/helm,addons/ingress-nginx,addons/istio-ingress,addons/karpenter,addons/kiali-server,addons/kubeclarity,addons/metrics-server,addons/nri-bundle,addons/velero,addons/kube-state-metrics,addons/keda,addons/cert-manager,addons/filebeat,addons/reloader,addons/external-dns,addons/redis,addons/actions-runner-controller
+          working-dir: addons/aws-ebs-csi-driver,addons/aws-efs-csi-driver,addons/aws-load-balancer-controller,addons/aws-node-termination-handler,addons/calico-tigera,addons/cluster-autoscaler,addons/external-secrets,addons/fluent-bit,addons/helm,addons/ingress-nginx,addons/istio-ingress,addons/karpenter,addons/kiali-server,addons/kubeclarity,addons/metrics-server,addons/nri-bundle,addons/velero,addons/kube-state-metrics,addons/keda,addons/cert-manager,addons/filebeat,addons/reloader,addons/external-dns,addons/redis,addons/prometheus,addons/grafana,addons/actions-runner-controller
           git-push: true
           template: |-
diff --git a/_examples/complete/config/grafana/grafana-vs.yaml b/_examples/complete/config/grafana/grafana-vs.yaml
new file mode 100644
index 0000000..03be001
--- /dev/null
+++ b/_examples/complete/config/grafana/grafana-vs.yaml
@@ -0,0 +1,16 @@
+apiVersion: networking.istio.io/v1alpha3
+kind: VirtualService
+metadata:
+  name: grafana
+  namespace: monitoring
+spec:
+  hosts:
+    - dash.test.clouddrove.com
+  gateways:
+    - istio-system/istio-gateway
+  http:
+    - route:
+        - destination:
+            host: grafana
+            port:
+              number: 80
\ No newline at end of file
diff --git a/_examples/complete/config/grafana/override-grafana.yaml b/_examples/complete/config/grafana/override-grafana.yaml
new file mode 100644
index 0000000..de95717
--- /dev/null
+++ b/_examples/complete/config/grafana/override-grafana.yaml
@@ -0,0 +1,16 @@
+affinity:
+  nodeAffinity:
+    requiredDuringSchedulingIgnoredDuringExecution:
+      nodeSelectorTerms:
+        - matchExpressions:
+            - key: "eks.amazonaws.com/nodegroup"
+              operator: In
+              values:
+                - "critical"
+resources:
+  limits:
+    cpu: 300m
+    memory: 250Mi
+  requests:
+    cpu: 50m
+    memory: 150Mi
diff --git a/_examples/complete/config/override-prometheus.yaml b/_examples/complete/config/override-prometheus.yaml
new file mode 100644
index 0000000..e83cbcf
--- /dev/null
+++ b/_examples/complete/config/override-prometheus.yaml
@@ -0,0 +1,36 @@
+server:
+  ## Node affinity pinning the Prometheus server to nodes in the "critical" EKS node group
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+          - matchExpressions:
+              - key: "eks.amazonaws.com/nodegroup"
+                operator: In
+                values:
+                  - "critical"
+
+  persistentVolume: # Persistent volume will not be deployed for the Prometheus server pod - enable if the EBS or EFS CSI driver is installed
+    enabled: false
+
+  resources:
+    limits:
+      cpu: 300m
+      memory: 250Mi
+    requests:
+      cpu: 50m
+      memory: 150Mi
+
+alertmanager: # Dependency for the Prometheus server
+  enabled: true
+  persistence: # Persistent volume will not be deployed for Alertmanager - enable if the EBS or EFS CSI driver is installed
+    enabled: false
+
+kube-state-metrics: # Dependency for the Prometheus server
+  enabled: true
+
+prometheus-node-exporter: # Dependency for the Prometheus server
+  enabled: true
+
+prometheus-pushgateway: # Dependency for the Prometheus server
+  enabled: true
\ No newline at end of file
diff --git a/_examples/complete/main.tf b/_examples/complete/main.tf
index cfd6af6..a5015db 100644
--- a/_examples/complete/main.tf
+++ b/_examples/complete/main.tf
@@ -173,8 +173,13 @@ module "addons" {
   external_dns              = true
   redis                     = true
   actions_runner_controller = true
+  prometheus                = true
-
+  # Grafana Deployment
+  grafana               = true
+  grafana_helm_config   = { values = [file("./config/grafana/override-grafana.yaml")] }
+  grafana_manifests     = var.grafana_manifests
+  grafana_extra_configs = var.grafana_extra_configs

   # -- Addons with mandatory variable
   istio_ingress = true
@@ -208,6 +213,7 @@
   external_dns_helm_config               = { values = [file("./config/override-external-dns.yaml")] }
   redis_helm_config                      = { values = [file("./config/override-redis.yaml")] }
   actions_runner_controller_helm_config  = { values = [file("./config/override-actions-runner-controller.yaml")] }
+  prometheus_helm_config                 = { values = [file("./config/override-prometheus.yaml")] }

   # -- Override Helm Release attributes
   metrics_server_extra_configs = var.metrics_server_extra_configs
@@ -234,6 +240,7 @@
   external_dns_extra_configs              = var.external_dns_extra_configs
   redis_extra_configs                     = var.redis_extra_configs
   actions_runner_controller_extra_configs = var.actions_runner_controller_extra_configs
+  prometheus_extra_configs                = var.prometheus_extra_configs

   # -- Custom IAM Policy Json for Addon's ServiceAccount
   cluster_autoscaler_iampolicy_json_content = file("./custom-iam-policies/cluster-autoscaler.json")
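Pulling the scattered hunks above together: enabling the Prometheus addon touches three inputs — the on/off flag, the chart values file, and the `helm_release` attribute overrides. A minimal sketch of just that wiring, using only the module source and variables that appear in this PR's examples:

```hcl
# Minimal sketch: the Prometheus addon in isolation.
module "addons" {
  source           = "clouddrove/eks-addons/aws"
  eks_cluster_name = module.eks.cluster_name

  prometheus             = true
  prometheus_helm_config = { values = [file("./config/override-prometheus.yaml")] } # chart values, as in the YAML above
  prometheus_extra_configs = {
    atomic    = true           # roll the release back automatically if the install fails
    namespace = "istio-system" # matches the default in the example variables.tf below
  }
}
```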
["eks", "get-token", "--cluster-name", data.aws_eks_cluster.eks_cluster.name] + command = "aws" + } } provider "helm" { kubernetes { host = module.eks.cluster_endpoint cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.eks_cluster.token + exec { + api_version = "client.authentication.k8s.io/v1beta1" + args = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.eks_cluster.name] + command = "aws" + } } } @@ -24,6 +32,7 @@ provider "kubectl" { host = module.eks.cluster_endpoint cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) token = data.aws_eks_cluster_auth.eks_cluster.token + load_config_file = false } # ------------------------------------------------------------------------------ diff --git a/_examples/complete/variables.tf b/_examples/complete/variables.tf index 7d22c9b..c9abc5c 100644 --- a/_examples/complete/variables.tf +++ b/_examples/complete/variables.tf @@ -208,3 +208,29 @@ variable "actions_runner_controller_extra_configs" { type = any default = {} } + +# ------------------ PROMETHEUS -------------------------------------------------- +variable "prometheus_extra_configs" { + type = any + default = { + atomic = true + namespace = "istio-system" + } +} + +# ------------------------------- GRAFANA ------------------------------------------ +variable "grafana_extra_configs" { + type = any + default = { + atomic = true + } +} + +variable "grafana_manifests" { + type = object({ + grafana_virtualservice_file_path = string + }) + default = { + grafana_virtualservice_file_path = "./config/grafana/grafana-vs.yaml" + } +} \ No newline at end of file diff --git a/_examples/complete/versions.tf b/_examples/complete/versions.tf index 4ab6e25..dcde190 100644 --- a/_examples/complete/versions.tf +++ b/_examples/complete/versions.tf @@ -15,7 +15,7 @@ terraform { } kubectl = { source = "gavinbunney/kubectl" - version = ">= 1.7.0" + version = ">= 1.14.0" } } } diff --git a/_examples/external-eks/config/grafana/grafana-vs.yaml b/_examples/external-eks/config/grafana/grafana-vs.yaml new file mode 100644 index 0000000..4003e42 --- /dev/null +++ b/_examples/external-eks/config/grafana/grafana-vs.yaml @@ -0,0 +1,16 @@ +apiVersion: networking.istio.io/v1alpha3 +kind: VirtualService +metadata: + name: grafana + namespace: istio-system +spec: + hosts: + - dash.test.clouddrove.com + gateways: + - istio-system/istio-gateway + http: + - route: + - destination: + host: grafana + port: + number: 80 \ No newline at end of file diff --git a/_examples/external-eks/config/grafana/override-grafana.yaml b/_examples/external-eks/config/grafana/override-grafana.yaml new file mode 100644 index 0000000..de95717 --- /dev/null +++ b/_examples/external-eks/config/grafana/override-grafana.yaml @@ -0,0 +1,16 @@ +affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "eks.amazonaws.com/nodegroup" + operator: In + values: + - "critical" +resources: + limits: + cpu: 300m + memory: 250Mi + requests: + cpu: 50m + memory: 150Mi diff --git a/_examples/external-eks/config/override-grafana.yaml b/_examples/external-eks/config/override-grafana.yaml new file mode 100644 index 0000000..de95717 --- /dev/null +++ b/_examples/external-eks/config/override-grafana.yaml @@ -0,0 +1,16 @@ +affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "eks.amazonaws.com/nodegroup" + operator: In + 
diff --git a/_examples/external-eks/config/grafana/grafana-vs.yaml b/_examples/external-eks/config/grafana/grafana-vs.yaml
new file mode 100644
index 0000000..4003e42
--- /dev/null
+++ b/_examples/external-eks/config/grafana/grafana-vs.yaml
@@ -0,0 +1,16 @@
+apiVersion: networking.istio.io/v1alpha3
+kind: VirtualService
+metadata:
+  name: grafana
+  namespace: istio-system
+spec:
+  hosts:
+    - dash.test.clouddrove.com
+  gateways:
+    - istio-system/istio-gateway
+  http:
+    - route:
+        - destination:
+            host: grafana
+            port:
+              number: 80
\ No newline at end of file
diff --git a/_examples/external-eks/config/grafana/override-grafana.yaml b/_examples/external-eks/config/grafana/override-grafana.yaml
new file mode 100644
index 0000000..de95717
--- /dev/null
+++ b/_examples/external-eks/config/grafana/override-grafana.yaml
@@ -0,0 +1,16 @@
+affinity:
+  nodeAffinity:
+    requiredDuringSchedulingIgnoredDuringExecution:
+      nodeSelectorTerms:
+        - matchExpressions:
+            - key: "eks.amazonaws.com/nodegroup"
+              operator: In
+              values:
+                - "critical"
+resources:
+  limits:
+    cpu: 300m
+    memory: 250Mi
+  requests:
+    cpu: 50m
+    memory: 150Mi
diff --git a/_examples/external-eks/config/override-grafana.yaml b/_examples/external-eks/config/override-grafana.yaml
new file mode 100644
index 0000000..de95717
--- /dev/null
+++ b/_examples/external-eks/config/override-grafana.yaml
@@ -0,0 +1,16 @@
+affinity:
+  nodeAffinity:
+    requiredDuringSchedulingIgnoredDuringExecution:
+      nodeSelectorTerms:
+        - matchExpressions:
+            - key: "eks.amazonaws.com/nodegroup"
+              operator: In
+              values:
+                - "critical"
+resources:
+  limits:
+    cpu: 300m
+    memory: 250Mi
+  requests:
+    cpu: 50m
+    memory: 150Mi
diff --git a/_examples/external-eks/config/override-prometheus.yaml b/_examples/external-eks/config/override-prometheus.yaml
new file mode 100644
index 0000000..46c81ab
--- /dev/null
+++ b/_examples/external-eks/config/override-prometheus.yaml
@@ -0,0 +1,47 @@
+server:
+  service:
+    ## If false, no Service will be created for the Prometheus server
+    ##
+    enabled: true
+    annotations:
+      service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing"
+      service.beta.kubernetes.io/aws-load-balancer-name: "prometheus"
+    labels: {}
+    clusterIP: ""
+
+    ## Node affinity pinning the Prometheus server to nodes in the "critical" EKS node group
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: "eks.amazonaws.com/nodegroup"
+                  operator: In
+                  values:
+                    - "critical"
+
+    ## List of IP addresses at which the Prometheus server service is available
+    ## Ref: https://kubernetes.io/docs/concepts/services-networking/service/#external-ips
+    ##
+    externalIPs: []
+    loadBalancerIP: ""
+    loadBalancerSourceRanges: []
+    servicePort: 80
+    sessionAffinity: None
+    type: LoadBalancer
+
+  persistentVolume:
+    accessModes:
+      - ReadWriteOnce
+    enabled: true
+    mountPath: /data
+    size: 20Gi
+    storageClass: gp2
+
+  resources:
+    limits:
+      cpu: 300m
+      memory: 250Mi
+    requests:
+      cpu: 50m
+      memory: 150Mi
\ No newline at end of file
diff --git a/_examples/external-eks/main.tf b/_examples/external-eks/main.tf
index dbdfc6a..265f0fb 100644
--- a/_examples/external-eks/main.tf
+++ b/_examples/external-eks/main.tf
@@ -32,7 +32,13 @@ module "addons" {
   filebeat = true
   reloader = true
   redis    = true
+  prometheus = true

+  # Grafana Deployment
+  grafana               = true
+  grafana_helm_config   = { values = [file("./config/grafana/override-grafana.yaml")] }
+  grafana_manifests     = var.grafana_manifests
+  grafana_extra_configs = var.grafana_extra_configs

   # -- Addons with mandatory variable
   istio_ingress = true
@@ -64,6 +70,7 @@
   filebeat_helm_config = { values = [file("./config/override-filebeat.yaml")] }
   reloader_helm_config = { values = [file("./config/reloader/override-reloader.yaml")] }
   redis_helm_config    = { values = [file("./config/override-redis.yaml")] }
+  prometheus_helm_config = { values = [file("./config/override-prometheus.yaml")] }

   # -- Override Helm Release attributes
   metrics_server_extra_configs = var.metrics_server_extra_configs
@@ -88,7 +95,7 @@
   filebeat_extra_configs = var.filebeat_extra_configs
   reloader_extra_configs = var.reloader_extra_configs
   redis_extra_configs    = var.redis_extra_configs
-
+  prometheus_extra_configs = var.prometheus_extra_configs

   # -- Custom IAM Policy Json for Addon's ServiceAccount
   external_secrets_iampolicy_json_content = file("./custom-iam-policies/external-secrets.json")
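Note that this override publishes Prometheus through an internet-facing load balancer, which is fine for a throwaway demo but rarely what you want long-term. A sketch of one way to tighten it — assuming, as the examples suggest, that the module passes `values` straight through to the Helm release — is to inline the values with `yamlencode` and restrict the service to a trusted CIDR (the address range below is illustrative):

```hcl
# Sketch: inline chart values instead of a file, scoping the public
# endpoint down to a trusted network (CIDR is a placeholder).
prometheus_helm_config = {
  values = [yamlencode({
    server = {
      service = {
        type                     = "LoadBalancer"
        loadBalancerSourceRanges = ["203.0.113.0/24"]
      }
    }
  })]
}
```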
diff --git a/_examples/external-eks/providers.tf b/_examples/external-eks/providers.tf
index 690a348..175d021 100644
--- a/_examples/external-eks/providers.tf
+++ b/_examples/external-eks/providers.tf
@@ -8,21 +8,30 @@ provider "aws" {
 provider "kubernetes" {
   host                   = data.aws_eks_cluster.eks_cluster.endpoint
   cluster_ca_certificate = base64decode(data.aws_eks_cluster.eks_cluster.certificate_authority[0].data)
-  token                  = join("", data.aws_eks_cluster_auth.eks_cluster[*].token)
+  exec {
+    api_version = "client.authentication.k8s.io/v1beta1"
+    args        = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.eks_cluster.name]
+    command     = "aws"
+  }
 }

 provider "helm" {
   kubernetes {
     host                   = data.aws_eks_cluster.eks_cluster.endpoint
     cluster_ca_certificate = base64decode(data.aws_eks_cluster.eks_cluster.certificate_authority[0].data)
-    token                  = join("", data.aws_eks_cluster_auth.eks_cluster[*].token)
+    exec {
+      api_version = "client.authentication.k8s.io/v1beta1"
+      args        = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.eks_cluster.name]
+      command     = "aws"
+    }
   }
 }

 provider "kubectl" {
   host                   = data.aws_eks_cluster.eks_cluster.endpoint
   cluster_ca_certificate = base64decode(data.aws_eks_cluster.eks_cluster.certificate_authority[0].data)
-  token                  = join("", data.aws_eks_cluster_auth.eks_cluster[*].token)
+  token                  = data.aws_eks_cluster_auth.eks_cluster.token
+  load_config_file       = false
 }

 data "aws_eks_cluster_auth" "eks_cluster" {
diff --git a/_examples/external-eks/variables.tf b/_examples/external-eks/variables.tf
index dcc64dd..74ddd6b 100644
--- a/_examples/external-eks/variables.tf
+++ b/_examples/external-eks/variables.tf
@@ -171,4 +171,30 @@ variable "redis_extra_configs" {
     atomic  = true
     timeout = 300
   }
+}
+
+# ------------------ PROMETHEUS --------------------------------------------------
+variable "prometheus_extra_configs" {
+  type = any
+  default = {
+    atomic    = true
+    namespace = "istio-system"
+  }
+}
+
+# ------------------------------- GRAFANA ------------------------------------------
+variable "grafana_extra_configs" {
+  type = any
+  default = {
+    atomic = true
+  }
+}
+
+variable "grafana_manifests" {
+  type = object({
+    grafana_virtualservice_file_path = string
+  })
+  default = {
+    grafana_virtualservice_file_path = "./config/grafana/grafana-vs.yaml"
+  }
 }
\ No newline at end of file
diff --git a/addons/aws-load-balancer-controller/locals.tf b/addons/aws-load-balancer-controller/locals.tf
index 4e95e0f..3f92a70 100644
--- a/addons/aws-load-balancer-controller/locals.tf
+++ b/addons/aws-load-balancer-controller/locals.tf
@@ -5,7 +5,7 @@ locals {
     name        = try(var.aws_load_balancer_controller_extra_configs.name, local.name)
     chart       = try(var.aws_load_balancer_controller_extra_configs.chart, local.name)
     repository  = try(var.aws_load_balancer_controller_extra_configs.repository, "https://aws.github.io/eks-charts")
-    version     = try(var.aws_load_balancer_controller_extra_configs.version, "1.5.3")
+    version     = try(var.aws_load_balancer_controller_extra_configs.version, "1.6.2")
     namespace   = try(var.aws_load_balancer_controller_extra_configs.namespace, "kube-system")
     description = "AWS Load Balancer Controller helm Chart deployment configuration"
     timeout     = try(var.aws_load_balancer_controller_extra_configs.timeout, "600")
diff --git a/addons/external-dns/main.tf b/addons/external-dns/main.tf
index 1f31580..b18f0ae 100644
--- a/addons/external-dns/main.tf
+++ b/addons/external-dns/main.tf
@@ -26,7 +26,6 @@ module "helm_addon" {
     eks_oidc_provider_arn = replace(data.aws_eks_cluster.eks_cluster.identity[0].oidc[0].issuer, "https://", "")
     account_id            = var.account_id
   }
-
 }

 resource "aws_iam_policy" "policy" {
diff --git a/addons/grafana/README.md b/addons/grafana/README.md
new file mode 100644
index 0000000..cbe4b6f
--- /dev/null
+++ b/addons/grafana/README.md
@@ -0,0 +1,57 @@
+# Grafana Helm Chart
+
+[Grafana](https://grafana.com/) is open-source visualization and analytics software. It allows you to query, visualize, alert on, and explore your metrics no matter where they are stored. In plain English, it provides the tools to turn your time-series database (TSDB) data into insightful graphs and visualizations.
+
+## Installation
+The Terraform snippet below shows how to use the Grafana addon; a complete example is available [here](https://github.com/clouddrove/terraform-helm-eks-addons/blob/master/_examples/complete/main.tf).
+```hcl
+module "addons" {
+  source = "clouddrove/eks-addons/aws"
+
+  depends_on       = [module.eks.cluster_id]
+  eks_cluster_name = module.eks.cluster_name
+
+  grafana             = true # Default chart values can be overridden from ./config/override-grafana.yaml
+  grafana_helm_config = { values = [file("./config/override-grafana.yaml")] }
+}
+```
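Beyond chart values, this PR also lets the addon apply a raw Kubernetes manifest: `grafana_manifests` carries the path to an Istio VirtualService that exposes the dashboard through an existing gateway. A minimal sketch of that wiring, reusing the file paths from the examples in this PR:

```hcl
module "addons" {
  source           = "clouddrove/eks-addons/aws"
  eks_cluster_name = module.eks.cluster_name

  grafana               = true
  grafana_helm_config   = { values = [file("./config/grafana/override-grafana.yaml")] }
  grafana_extra_configs = { atomic = true }

  # Applied alongside the Helm release; the example manifest routes
  # dash.test.clouddrove.com through istio-system/istio-gateway to the grafana Service.
  grafana_manifests = {
    grafana_virtualservice_file_path = "./config/grafana/grafana-vs.yaml"
  }
}
```

The examples also bump `gavinbunney/kubectl` to `>= 1.14.0`, which suggests the manifest is applied with that provider; the exact resource wiring lives inside the addon module.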
+
+## Requirements
+
+| Name | Version |
+|------|---------|
+| [terraform](#requirement\_terraform) | >= 1.0.0 |
+| [kubernetes](#requirement\_kubernetes) | >= 2.10 |
+
+## Providers
+
+No providers.
+
+## Modules
+
+| Name | Source | Version |
+|------|--------|---------|
+| [helm\_addon](#module\_helm\_addon) | ../helm | n/a |
+
+## Resources
+
+No resources.
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|------|---------|:--------:|
+| [addon\_context](#input\_addon\_context) | Input configuration for the addon | <pre>object({<br>    aws_caller_identity_account_id = string<br>    aws_caller_identity_arn        = string<br>    aws_eks_cluster_endpoint       = string<br>    aws_partition_id               = string<br>    aws_region_name                = string<br>    eks_cluster_id                 = string<br>    eks_oidc_issuer_url            = string<br>    eks_oidc_provider_arn          = string<br>    tags                           = map(string)<br>  })</pre> | n/a | yes |
+| [grafana\_extra\_configs](#input\_grafana\_extra\_configs) | Override attributes of helm\_release terraform resource | `any` | `{}` | no |
+| [helm\_config](#input\_helm\_config) | Helm provider config for Grafana | `any` | `{}` | no |
+| [manage\_via\_gitops](#input\_manage\_via\_gitops) | Determines if the add-on should be managed via GitOps | `bool` | `false` | no |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| [chart\_version](#output\_chart\_version) | n/a |
+| [namespace](#output\_namespace) | n/a |
+| [repository](#output\_repository) | n/a |
+
\ No newline at end of file
diff --git a/addons/grafana/config/grafana.yaml b/addons/grafana/config/grafana.yaml new file mode 100644 index 0000000..46fa0d7 --- /dev/null +++ b/addons/grafana/config/grafana.yaml @@ -0,0 +1,1269 @@ +global: + # -- Overrides the Docker registry globally for all images + imageRegistry: null + + # To help compatibility with other charts which use global.imagePullSecrets. + # Allow either an array of {name: pullSecret} maps (k8s-style), or an array of strings (more common helm-style). + # Can be templated. + # global: + # imagePullSecrets: + # - name: pullSecret1 + # - name: pullSecret2 + # or + # global: + # imagePullSecrets: + # - pullSecret1 + # - pullSecret2 + imagePullSecrets: [] + +rbac: + create: true + ## Use an existing ClusterRole/Role (depending on rbac.namespaced false/true) + # useExistingRole: name-of-some-role + # useExistingClusterRole: name-of-some-clusterRole + pspEnabled: false + pspUseAppArmor: false + namespaced: false + extraRoleRules: [] + # - apiGroups: [] + # resources: [] + # verbs: [] + extraClusterRoleRules: [] + # - apiGroups: [] + # resources: [] + # verbs: [] +serviceAccount: + create: true + name: + nameTest: + ## ServiceAccount labels. + labels: {} +## Service account annotations. Can be templated. +# annotations: +# eks.amazonaws.com/role-arn: arn:aws:iam::123456789000:role/iam-role-name-here + autoMount: true + +replicas: 1 + +## Create a headless service for the deployment +headlessService: false + +## Create HorizontalPodAutoscaler object for deployment type +# +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 5 + targetCPU: "60" + targetMemory: "" + behavior: {} + +## See `kubectl explain poddisruptionbudget.spec` for more +## ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/ +podDisruptionBudget: {} +# apiVersion: "" +# minAvailable: 1 +# maxUnavailable: 1 + +## See `kubectl explain deployment.spec.strategy` for more +## ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy +deploymentStrategy: + type: RollingUpdate + +readinessProbe: + httpGet: + path: /api/health + port: 3000 + +livenessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + +## Use an alternate scheduler, e.g. "stork". +## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/ +## +# schedulerName: "default-scheduler" + +image: + # -- The Docker registry + registry: docker.io + # -- Docker image repository + repository: grafana/grafana + # Overrides the Grafana image tag whose default is the chart appVersion + tag: "" + sha: "" + pullPolicy: IfNotPresent + + ## Optionally specify an array of imagePullSecrets. + ## Secrets must be manually created in the namespace. + ## ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ + ## Can be templated.
+ ## + pullSecrets: [] + # - myRegistrKeySecretName + +testFramework: + enabled: true + image: + # -- The Docker registry + registry: docker.io + repository: bats/bats + tag: "v1.4.1" + imagePullPolicy: IfNotPresent + securityContext: {} + +securityContext: + runAsNonRoot: true + runAsUser: 472 + runAsGroup: 472 + fsGroup: 472 + +containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + +# Enable creating the grafana configmap +createConfigmap: true + +# Extra configmaps to mount in grafana pods +# Values are templated. +extraConfigmapMounts: [] + # - name: certs-configmap + # mountPath: /etc/grafana/ssl/ + # subPath: certificates.crt # (optional) + # configMap: certs-configmap + # readOnly: true + + +extraEmptyDirMounts: [] + # - name: provisioning-notifiers + # mountPath: /etc/grafana/provisioning/notifiers + + +# Apply extra labels to common labels. +extraLabels: {} + +## Assign a PriorityClassName to pods if set +# priorityClassName: + +downloadDashboardsImage: + # -- The Docker registry + registry: docker.io + repository: curlimages/curl + tag: 7.85.0 + sha: "" + pullPolicy: IfNotPresent + +downloadDashboards: + env: {} + envFromSecret: "" + resources: {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + envValueFrom: {} + # ENV_NAME: + # configMapKeyRef: + # name: configmap-name + # key: value_key + +## Pod Annotations +# podAnnotations: {} + +## Pod Labels +# podLabels: {} + +podPortName: grafana +gossipPortName: gossip +## Deployment annotations +# annotations: {} + +## Expose the grafana service to be accessed from outside the cluster (LoadBalancer service). +## or access it from within the cluster (ClusterIP service). Set the service type and the port to serve it. +## ref: http://kubernetes.io/docs/user-guide/services/ +## +service: + enabled: true + type: ClusterIP + port: 80 + targetPort: 3000 + # targetPort: 4181 To be used with a proxy extraContainer + ## Service annotations. Can be templated. + annotations: {} + labels: {} + portName: service + # Adds the appProtocol field to the service. This allows to work with istio protocol selection. Ex: "http" or "tcp" + appProtocol: "" + +serviceMonitor: + ## If true, a ServiceMonitor CRD is created for a prometheus operator + ## https://github.com/coreos/prometheus-operator + ## + enabled: false + path: /metrics + # namespace: monitoring (defaults to use the namespace this chart is deployed to) + labels: {} + interval: 30s + scheme: http + tlsConfig: {} + scrapeTimeout: 30s + relabelings: [] + metricRelabelings: [] + targetLabels: [] + +extraExposePorts: [] + # - name: keycloak + # port: 8080 + # targetPort: 8080 + +# overrides pod.spec.hostAliases in the grafana deployment's pods +hostAliases: [] + # - ip: "1.2.3.4" + # hostnames: + # - "my.host.com" + +ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + + # pathType is only for k8s >= 1.1= + pathType: Prefix + + hosts: + - chart-example.local + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: use-annotation + + + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +resources: {} +# limits: +# cpu: 100m +# memory: 128Mi +# requests: +# cpu: 100m +# memory: 128Mi + +## Node labels for pod assignment +## ref: https://kubernetes.io/docs/user-guide/node-selection/ +# +nodeSelector: {} + +## Tolerations for pod assignment +## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/ +## +tolerations: [] + +## Affinity for pod assignment (evaluated as template) +## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity +## +affinity: {} + +## Topology Spread Constraints +## ref: https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/ +## +topologySpreadConstraints: [] + +## Additional init containers (evaluated as template) +## ref: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ +## +extraInitContainers: [] + +## Enable an Specify container in extraContainers. This is meant to allow adding an authentication proxy to a grafana pod +extraContainers: "" +# extraContainers: | +# - name: proxy +# image: quay.io/gambol99/keycloak-proxy:latest +# args: +# - -provider=github +# - -client-id= +# - -client-secret= +# - -github-org= +# - -email-domain=* +# - -cookie-secret= +# - -http-address=http://0.0.0.0:4181 +# - -upstream-url=http://127.0.0.1:3000 +# ports: +# - name: proxy-web +# containerPort: 4181 + +## Volumes that can be used in init containers that will not be mounted to deployment pods +extraContainerVolumes: [] +# - name: volume-from-secret +# secret: +# secretName: secret-to-mount +# - name: empty-dir-volume +# emptyDir: {} + +## Enable persistence using Persistent Volume Claims +## ref: http://kubernetes.io/docs/user-guide/persistent-volumes/ +## +persistence: + type: pvc + enabled: false + # storageClassName: default + accessModes: + - ReadWriteOnce + size: 10Gi + # annotations: {} + finalizers: + - kubernetes.io/pvc-protection + # selectorLabels: {} + ## Sub-directory of the PV to mount. Can be templated. + # subPath: "" + ## Name of an existing PVC. Can be templated. + # existingClaim: + ## Extra labels to apply to a PVC. 
+ extraPvcLabels: {} + + ## If persistence is not enabled, this allows to mount the + ## local storage in-memory to improve performance + ## + inMemory: + enabled: false + ## The maximum usage on memory medium EmptyDir would be + ## the minimum value between the SizeLimit specified + ## here and the sum of memory limits of all containers in a pod + ## + # sizeLimit: 300Mi + +initChownData: + ## If false, data ownership will not be reset at startup + ## This allows the grafana-server to be run with an arbitrary user + ## + enabled: true + + ## initChownData container image + ## + image: + # -- The Docker registry + registry: docker.io + repository: library/busybox + tag: "1.31.1" + sha: "" + pullPolicy: IfNotPresent + + ## initChownData resource requests and limits + ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + ## + resources: {} + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + securityContext: + runAsNonRoot: false + runAsUser: 0 + seccompProfile: + type: RuntimeDefault + capabilities: + add: + - CHOWN + +# Administrator credentials when not using an existing secret (see below) +adminUser: admin +# adminPassword: strongpassword + +# Use an existing secret for the admin user. +admin: + ## Name of the secret. Can be templated. + existingSecret: "" + userKey: admin-user + passwordKey: admin-password + +## Define command to be executed at startup by grafana container +## Needed if using `vault-env` to manage secrets (ref: https://banzaicloud.com/blog/inject-secrets-into-pods-vault/) +## Default is "run.sh" as defined in grafana's Dockerfile +# command: +# - "sh" +# - "/run.sh" + +## Optionally define args if command is used +## Needed if using `hashicorp/envconsul` to manage secrets +## By default no arguments are set +# args: +# - "-secret" +# - "secret/grafana" +# - "./grafana" + +## Extra environment variables that will be pass onto deployment pods +## +## to provide grafana with access to CloudWatch on AWS EKS: +## 1. create an iam role of type "Web identity" with provider oidc.eks.* (note the provider for later) +## 2. edit the "Trust relationships" of the role, add a line inside the StringEquals clause using the +## same oidc eks provider as noted before (same as the existing line) +## also, replace NAMESPACE and prometheus-operator-grafana with the service account namespace and name +## +## "oidc.eks.us-east-1.amazonaws.com/id/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX:sub": "system:serviceaccount:NAMESPACE:prometheus-operator-grafana", +## +## 3. attach a policy to the role, you can use a built in policy called CloudWatchReadOnlyAccess +## 4. use the following env: (replace 123456789000 and iam-role-name-here with your aws account number and role name) +## +## env: +## AWS_ROLE_ARN: arn:aws:iam::123456789000:role/iam-role-name-here +## AWS_WEB_IDENTITY_TOKEN_FILE: /var/run/secrets/eks.amazonaws.com/serviceaccount/token +## AWS_REGION: us-east-1 +## +## 5. uncomment the EKS section in extraSecretMounts: below +## 6. uncomment the annotation section in the serviceAccount: above +## make sure to replace arn:aws:iam::123456789000:role/iam-role-name-here with your role arn + +env: {} + +## "valueFrom" environment variable references that will be added to deployment pods. Name is templated. +## ref: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.19/#envvarsource-v1-core +## Renders in container spec as: +## env: +## ... 
+## - name: +## valueFrom: +## +envValueFrom: {} + # ENV_NAME: + # configMapKeyRef: + # name: configmap-name + # key: value_key + +## The name of a secret in the same kubernetes namespace which contain values to be added to the environment +## This can be useful for auth tokens, etc. Value is templated. +envFromSecret: "" + +## Sensible environment variables that will be rendered as new secret object +## This can be useful for auth tokens, etc. +## If the secret values contains "{{", they'll need to be properly escaped so that they are not interpreted by Helm +## ref: https://helm.sh/docs/howto/charts_tips_and_tricks/#using-the-tpl-function +envRenderSecret: {} + +## The names of secrets in the same kubernetes namespace which contain values to be added to the environment +## Each entry should contain a name key, and can optionally specify whether the secret must be defined with an optional key. +## Name is templated. +envFromSecrets: [] +## - name: secret-name +## optional: true + +## The names of conifgmaps in the same kubernetes namespace which contain values to be added to the environment +## Each entry should contain a name key, and can optionally specify whether the configmap must be defined with an optional key. +## Name is templated. +## ref: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.23/#configmapenvsource-v1-core +envFromConfigMaps: [] +## - name: configmap-name +## optional: true + +# Inject Kubernetes services as environment variables. +# See https://kubernetes.io/docs/concepts/services-networking/connect-applications-service/#environment-variables +enableServiceLinks: true + +## Additional grafana server secret mounts +# Defines additional mounts with secrets. Secrets must be manually created in the namespace. +extraSecretMounts: [] + # - name: secret-files + # mountPath: /etc/secrets + # secretName: grafana-secret-files + # readOnly: true + # subPath: "" + # + # for AWS EKS (cloudwatch) use the following (see also instruction in env: above) + # - name: aws-iam-token + # mountPath: /var/run/secrets/eks.amazonaws.com/serviceaccount + # readOnly: true + # projected: + # defaultMode: 420 + # sources: + # - serviceAccountToken: + # audience: sts.amazonaws.com + # expirationSeconds: 86400 + # path: token + # + # for CSI e.g. Azure Key Vault use the following + # - name: secrets-store-inline + # mountPath: /run/secrets + # readOnly: true + # csi: + # driver: secrets-store.csi.k8s.io + # readOnly: true + # volumeAttributes: + # secretProviderClass: "akv-grafana-spc" + # nodePublishSecretRef: # Only required when using service principal mode + # name: grafana-akv-creds # Only required when using service principal mode + +## Additional grafana server volume mounts +# Defines additional volume mounts. +extraVolumeMounts: [] + # - name: extra-volume-0 + # mountPath: /mnt/volume0 + # readOnly: true + # existingClaim: volume-claim + # - name: extra-volume-1 + # mountPath: /mnt/volume1 + # readOnly: true + # hostPath: /usr/shared/ + # - name: grafana-secrets + # mountPath: /mnt/volume2 + # csi: true + # data: + # driver: secrets-store.csi.k8s.io + # readOnly: true + # volumeAttributes: + # secretProviderClass: "grafana-env-spc" + +## Container Lifecycle Hooks. Execute a specific bash command or make an HTTP request +lifecycleHooks: {} + # postStart: + # exec: + # command: [] + +## Pass the plugins you want installed as a list. 
+## +plugins: [] + # - digrich-bubblechart-panel + # - grafana-clock-panel + ## You can also use other plugin download URL, as long as they are valid zip files, + ## and specify the name of the plugin after the semicolon. Like this: + # - https://grafana.com/api/plugins/marcusolsson-json-datasource/versions/1.3.2/download;marcusolsson-json-datasource + +## Configure grafana datasources +## ref: http://docs.grafana.org/administration/provisioning/#datasources +## +datasources: {} +# datasources.yaml: +# apiVersion: 1 +# datasources: +# - name: Prometheus +# type: prometheus +# url: http://prometheus-prometheus-server +# access: proxy +# isDefault: true +# - name: CloudWatch +# type: cloudwatch +# access: proxy +# uid: cloudwatch +# editable: false +# jsonData: +# authType: default +# defaultRegion: us-east-1 +# deleteDatasources: [] +# - name: Prometheus + +## Configure grafana alerting (can be templated) +## ref: http://docs.grafana.org/administration/provisioning/#alerting +## +alerting: {} + # rules.yaml: + # apiVersion: 1 + # groups: + # - orgId: 1 + # name: '{{ .Chart.Name }}_my_rule_group' + # folder: my_first_folder + # interval: 60s + # rules: + # - uid: my_id_1 + # title: my_first_rule + # condition: A + # data: + # - refId: A + # datasourceUid: '-100' + # model: + # conditions: + # - evaluator: + # params: + # - 3 + # type: gt + # operator: + # type: and + # query: + # params: + # - A + # reducer: + # type: last + # type: query + # datasource: + # type: __expr__ + # uid: '-100' + # expression: 1==0 + # intervalMs: 1000 + # maxDataPoints: 43200 + # refId: A + # type: math + # dashboardUid: my_dashboard + # panelId: 123 + # noDataState: Alerting + # for: 60s + # annotations: + # some_key: some_value + # labels: + # team: sre_team_1 + # contactpoints.yaml: + # secret: + # apiVersion: 1 + # contactPoints: + # - orgId: 1 + # name: cp_1 + # receivers: + # - uid: first_uid + # type: pagerduty + # settings: + # integrationKey: XXX + # severity: critical + # class: ping failure + # component: Grafana + # group: app-stack + # summary: | + # {{ `{{ include "default.message" . }}` }} + +## Configure notifiers +## ref: http://docs.grafana.org/administration/provisioning/#alert-notification-channels +## +notifiers: {} +# notifiers.yaml: +# notifiers: +# - name: email-notifier +# type: email +# uid: email1 +# # either: +# org_id: 1 +# # or +# org_name: Main Org. +# is_default: true +# settings: +# addresses: an_email_address@example.com +# delete_notifiers: + +## Configure grafana dashboard providers +## ref: http://docs.grafana.org/administration/provisioning/#dashboards +## +## `path` must be /var/lib/grafana/dashboards/ +## +dashboardProviders: {} +# dashboardproviders.yaml: +# apiVersion: 1 +# providers: +# - name: 'default' +# orgId: 1 +# folder: '' +# type: file +# disableDeletion: false +# editable: true +# options: +# path: /var/lib/grafana/dashboards/default + +## Configure grafana dashboard to import +## NOTE: To use dashboards you must also enable/configure dashboardProviders +## ref: https://grafana.com/dashboards +## +## dashboards per provider, use provider name as key. 
+## +dashboards: {} + # default: + # some-dashboard: + # json: | + # $RAW_JSON + # custom-dashboard: + # file: dashboards/custom-dashboard.json + # prometheus-stats: + # gnetId: 2 + # revision: 2 + # datasource: Prometheus + # local-dashboard: + # url: https://example.com/repository/test.json + # token: '' + # local-dashboard-base64: + # url: https://example.com/repository/test-b64.json + # token: '' + # b64content: true + # local-dashboard-gitlab: + # url: https://example.com/repository/test-gitlab.json + # gitlabToken: '' + # local-dashboard-bitbucket: + # url: https://example.com/repository/test-bitbucket.json + # bearerToken: '' + # local-dashboard-azure: + # url: https://example.com/repository/test-azure.json + # basic: '' + # acceptHeader: '*/*' + +## Reference to external ConfigMap per provider. Use provider name as key and ConfigMap name as value. +## A provider dashboards must be defined either by external ConfigMaps or in values.yaml, not in both. +## ConfigMap data example: +## +## data: +## example-dashboard.json: | +## RAW_JSON +## +dashboardsConfigMaps: {} +# default: "" + +## Grafana's primary configuration +## NOTE: values in map will be converted to ini format +## ref: http://docs.grafana.org/installation/configuration/ +## +grafana.ini: + paths: + data: /var/lib/grafana/ + logs: /var/log/grafana + plugins: /var/lib/grafana/plugins + provisioning: /etc/grafana/provisioning + analytics: + check_for_updates: true + log: + mode: console + grafana_net: + url: https://grafana.net + server: + domain: "{{ if (and .Values.ingress.enabled .Values.ingress.hosts) }}{{ .Values.ingress.hosts | first }}{{ else }}''{{ end }}" +## grafana Authentication can be enabled with the following values on grafana.ini + # server: + # The full public facing url you use in browser, used for redirects and emails + # root_url: + # https://grafana.com/docs/grafana/latest/auth/github/#enable-github-in-grafana + # auth.github: + # enabled: false + # allow_sign_up: false + # scopes: user:email,read:org + # auth_url: https://github.com/login/oauth/authorize + # token_url: https://github.com/login/oauth/access_token + # api_url: https://api.github.com/user + # team_ids: + # allowed_organizations: + # client_id: + # client_secret: +## LDAP Authentication can be enabled with the following values on grafana.ini +## NOTE: Grafana will fail to start if the value for ldap.toml is invalid + # auth.ldap: + # enabled: true + # allow_sign_up: true + # config_file: /etc/grafana/ldap.toml + +## Grafana's LDAP configuration +## Templated by the template in _helpers.tpl +## NOTE: To enable the grafana.ini must be configured with auth.ldap.enabled +## ref: http://docs.grafana.org/installation/configuration/#auth-ldap +## ref: http://docs.grafana.org/installation/ldap/#configuration +ldap: + enabled: false + # `existingSecret` is a reference to an existing secret containing the ldap configuration + # for Grafana in a key `ldap-toml`. 
+ existingSecret: "" + # `config` is the content of `ldap.toml` that will be stored in the created secret + config: "" + # config: |- + # verbose_logging = true + + # [[servers]] + # host = "my-ldap-server" + # port = 636 + # use_ssl = true + # start_tls = false + # ssl_skip_verify = false + # bind_dn = "uid=%s,ou=users,dc=myorg,dc=com" + +## Grafana's SMTP configuration +## NOTE: To enable, grafana.ini must be configured with smtp.enabled +## ref: http://docs.grafana.org/installation/configuration/#smtp +smtp: + # `existingSecret` is a reference to an existing secret containing the smtp configuration + # for Grafana. + existingSecret: "" + userKey: "user" + passwordKey: "password" + +## Sidecars that collect the configmaps with specified label and stores the included files them into the respective folders +## Requires at least Grafana 5 to work and can't be used together with parameters dashboardProviders, datasources and dashboards +sidecar: + image: + # -- The Docker registry + registry: quay.io + repository: kiwigrid/k8s-sidecar + tag: 1.25.1 + sha: "" + imagePullPolicy: IfNotPresent + resources: {} +# limits: +# cpu: 100m +# memory: 100Mi +# requests: +# cpu: 50m +# memory: 50Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + # skipTlsVerify Set to true to skip tls verification for kube api calls + # skipTlsVerify: true + enableUniqueFilenames: false + readinessProbe: {} + livenessProbe: {} + # Log level default for all sidecars. Can be one of: DEBUG, INFO, WARN, ERROR, CRITICAL. Defaults to INFO + # logLevel: INFO + alerts: + enabled: false + # Additional environment variables for the alerts sidecar + env: {} + # Do not reprocess already processed unchanged resources on k8s API reconnect. + # ignoreAlreadyProcessed: true + # label that the configmaps with alert are marked with + label: grafana_alert + # value of label that the configmaps with alert are set to + labelValue: "" + # Log level. Can be one of: DEBUG, INFO, WARN, ERROR, CRITICAL. + # logLevel: INFO + # If specified, the sidecar will search for alert config-maps inside this namespace. + # Otherwise the namespace in which the sidecar is running will be used. + # It's also possible to specify ALL to search in all namespaces + searchNamespace: null + # Method to use to detect ConfigMap changes. With WATCH the sidecar will do a WATCH requests, with SLEEP it will list all ConfigMaps, then sleep for 60 seconds. + watchMethod: WATCH + # search in configmap, secret or both + resource: both + # watchServerTimeout: request to the server, asking it to cleanly close the connection after that. + # defaults to 60sec; much higher values like 3600 seconds (1h) are feasible for non-Azure K8S + # watchServerTimeout: 3600 + # + # watchClientTimeout: is a client-side timeout, configuring your local socket. + # If you have a network outage dropping all packets with no RST/FIN, + # this is how long your client waits before realizing & dropping the connection. + # defaults to 66sec (sic!) + # watchClientTimeout: 60 + # + # Endpoint to send request to reload alerts + reloadURL: "http://localhost:3000/api/admin/provisioning/alerting/reload" + # Absolute path to shell script to execute after a alert got reloaded + script: null + skipReload: false + # Deploy the alert sidecar as an initContainer in addition to a container. 
+ # Additional alert sidecar volume mounts + extraMounts: [] + # Sets the size limit of the alert sidecar emptyDir volume + sizeLimit: {} + dashboards: + enabled: false + # Additional environment variables for the dashboards sidecar + env: {} + # Do not reprocess already processed unchanged resources on k8s API reconnect. + # ignoreAlreadyProcessed: true + SCProvider: true + # label that the configmaps with dashboards are marked with + label: grafana_dashboard + # value of label that the configmaps with dashboards are set to + labelValue: "" + # Log level. Can be one of: DEBUG, INFO, WARN, ERROR, CRITICAL. + # logLevel: INFO + # folder in the pod that should hold the collected dashboards (unless `defaultFolderName` is set) + folder: /tmp/dashboards + # The default folder name, it will create a subfolder under the `folder` and put dashboards in there instead + defaultFolderName: null + # Namespaces list. If specified, the sidecar will search for config-maps/secrets inside these namespaces. + # Otherwise the namespace in which the sidecar is running will be used. + # It's also possible to specify ALL to search in all namespaces. + searchNamespace: null + # Method to use to detect ConfigMap changes. With WATCH the sidecar will do a WATCH requests, with SLEEP it will list all ConfigMaps, then sleep for 60 seconds. + watchMethod: WATCH + # search in configmap, secret or both + resource: both + # If specified, the sidecar will look for annotation with this name to create folder and put graph here. + # You can use this parameter together with `provider.foldersFromFilesStructure`to annotate configmaps and create folder structure. + folderAnnotation: null + # Endpoint to send request to reload alerts + reloadURL: "http://localhost:3000/api/admin/provisioning/dashboards/reload" + # Absolute path to shell script to execute after a configmap got reloaded + script: null + skipReload: false + # watchServerTimeout: request to the server, asking it to cleanly close the connection after that. + # defaults to 60sec; much higher values like 3600 seconds (1h) are feasible for non-Azure K8S + # watchServerTimeout: 3600 + # + # watchClientTimeout: is a client-side timeout, configuring your local socket. + # If you have a network outage dropping all packets with no RST/FIN, + # this is how long your client waits before realizing & dropping the connection. + # defaults to 66sec (sic!) + # watchClientTimeout: 60 + # + # provider configuration that lets grafana manage the dashboards + provider: + # name of the provider, should be unique + name: sidecarProvider + # orgid as configured in grafana + orgid: 1 + # folder in which the dashboards should be imported in grafana + folder: '' + # type of the provider + type: file + # disableDelete to activate a import-only behaviour + disableDelete: false + # allow updating provisioned dashboards from the UI + allowUiUpdates: false + # allow Grafana to replicate dashboard structure from filesystem + foldersFromFilesStructure: false + # Additional dashboard sidecar volume mounts + extraMounts: [] + # Sets the size limit of the dashboard sidecar emptyDir volume + sizeLimit: {} + datasources: + enabled: false + # Additional environment variables for the datasourcessidecar + env: {} + # Do not reprocess already processed unchanged resources on k8s API reconnect. 
+ # ignoreAlreadyProcessed: true + # label that the configmaps with datasources are marked with + label: grafana_datasource + # value of label that the configmaps with datasources are set to + labelValue: "" + # Log level. Can be one of: DEBUG, INFO, WARN, ERROR, CRITICAL. + # logLevel: INFO + # If specified, the sidecar will search for datasource config-maps inside this namespace. + # Otherwise the namespace in which the sidecar is running will be used. + # It's also possible to specify ALL to search in all namespaces + searchNamespace: null + # Method to use to detect ConfigMap changes. With WATCH the sidecar will do a WATCH requests, with SLEEP it will list all ConfigMaps, then sleep for 60 seconds. + watchMethod: WATCH + # search in configmap, secret or both + resource: both + # watchServerTimeout: request to the server, asking it to cleanly close the connection after that. + # defaults to 60sec; much higher values like 3600 seconds (1h) are feasible for non-Azure K8S + # watchServerTimeout: 3600 + # + # watchClientTimeout: is a client-side timeout, configuring your local socket. + # If you have a network outage dropping all packets with no RST/FIN, + # this is how long your client waits before realizing & dropping the connection. + # defaults to 66sec (sic!) + # watchClientTimeout: 60 + # + # Endpoint to send request to reload datasources + reloadURL: "http://localhost:3000/api/admin/provisioning/datasources/reload" + # Absolute path to shell script to execute after a datasource got reloaded + script: null + skipReload: false + # Deploy the datasource sidecar as an initContainer in addition to a container. + # This is needed if skipReload is true, to load any datasources defined at startup time. + initDatasources: false + # Sets the size limit of the datasource sidecar emptyDir volume + sizeLimit: {} + plugins: + enabled: false + # Additional environment variables for the plugins sidecar + env: {} + # Do not reprocess already processed unchanged resources on k8s API reconnect. + # ignoreAlreadyProcessed: true + # label that the configmaps with plugins are marked with + label: grafana_plugin + # value of label that the configmaps with plugins are set to + labelValue: "" + # Log level. Can be one of: DEBUG, INFO, WARN, ERROR, CRITICAL. + # logLevel: INFO + # If specified, the sidecar will search for plugin config-maps inside this namespace. + # Otherwise the namespace in which the sidecar is running will be used. + # It's also possible to specify ALL to search in all namespaces + searchNamespace: null + # Method to use to detect ConfigMap changes. With WATCH the sidecar will do a WATCH requests, with SLEEP it will list all ConfigMaps, then sleep for 60 seconds. + watchMethod: WATCH + # search in configmap, secret or both + resource: both + # watchServerTimeout: request to the server, asking it to cleanly close the connection after that. + # defaults to 60sec; much higher values like 3600 seconds (1h) are feasible for non-Azure K8S + # watchServerTimeout: 3600 + # + # watchClientTimeout: is a client-side timeout, configuring your local socket. + # If you have a network outage dropping all packets with no RST/FIN, + # this is how long your client waits before realizing & dropping the connection. + # defaults to 66sec (sic!) 
+ # watchClientTimeout: 60 + # + # Endpoint to send request to reload plugins + reloadURL: "http://localhost:3000/api/admin/provisioning/plugins/reload" + # Absolute path to shell script to execute after a plugin got reloaded + script: null + skipReload: false + # Deploy the datasource sidecar as an initContainer in addition to a container. + # This is needed if skipReload is true, to load any plugins defined at startup time. + initPlugins: false + # Sets the size limit of the plugin sidecar emptyDir volume + sizeLimit: {} + notifiers: + enabled: false + # Additional environment variables for the notifierssidecar + env: {} + # Do not reprocess already processed unchanged resources on k8s API reconnect. + # ignoreAlreadyProcessed: true + # label that the configmaps with notifiers are marked with + label: grafana_notifier + # value of label that the configmaps with notifiers are set to + labelValue: "" + # Log level. Can be one of: DEBUG, INFO, WARN, ERROR, CRITICAL. + # logLevel: INFO + # If specified, the sidecar will search for notifier config-maps inside this namespace. + # Otherwise the namespace in which the sidecar is running will be used. + # It's also possible to specify ALL to search in all namespaces + searchNamespace: null + # Method to use to detect ConfigMap changes. With WATCH the sidecar will do a WATCH requests, with SLEEP it will list all ConfigMaps, then sleep for 60 seconds. + watchMethod: WATCH + # search in configmap, secret or both + resource: both + # watchServerTimeout: request to the server, asking it to cleanly close the connection after that. + # defaults to 60sec; much higher values like 3600 seconds (1h) are feasible for non-Azure K8S + # watchServerTimeout: 3600 + # + # watchClientTimeout: is a client-side timeout, configuring your local socket. + # If you have a network outage dropping all packets with no RST/FIN, + # this is how long your client waits before realizing & dropping the connection. + # defaults to 66sec (sic!) + # watchClientTimeout: 60 + # + # Endpoint to send request to reload notifiers + reloadURL: "http://localhost:3000/api/admin/provisioning/notifications/reload" + # Absolute path to shell script to execute after a notifier got reloaded + script: null + skipReload: false + # Deploy the notifier sidecar as an initContainer in addition to a container. + # This is needed if skipReload is true, to load any notifiers defined at startup time. + initNotifiers: false + # Sets the size limit of the notifier sidecar emptyDir volume + sizeLimit: {} + +## Override the deployment namespace +## +namespaceOverride: "" + +## Number of old ReplicaSets to retain +## +revisionHistoryLimit: 10 + +## Add a seperate remote image renderer deployment/service +imageRenderer: + deploymentStrategy: {} + # Enable the image-renderer deployment & service + enabled: false + replicas: 1 + autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 5 + targetCPU: "60" + targetMemory: "" + behavior: {} + image: + # -- The Docker registry + registry: docker.io + # image-renderer Image repository + repository: grafana/grafana-image-renderer + # image-renderer Image tag + tag: latest + # image-renderer Image sha (optional) + sha: "" + # image-renderer ImagePullPolicy + pullPolicy: Always + # extra environment variables + env: + HTTP_HOST: "0.0.0.0" + # RENDERING_ARGS: --no-sandbox,--disable-gpu,--window-size=1280x758 + # RENDERING_MODE: clustered + # IGNORE_HTTPS_ERRORS: true + + ## "valueFrom" environment variable references that will be added to deployment pods. 
Name is templated. + ## ref: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.19/#envvarsource-v1-core + ## Renders in container spec as: + ## env: + ## ... + ## - name: + ## valueFrom: + ## + envValueFrom: {} + # ENV_NAME: + # configMapKeyRef: + # name: configmap-name + # key: value_key + + # image-renderer deployment serviceAccount + serviceAccountName: "" + # image-renderer deployment securityContext + securityContext: {} + # image-renderer deployment container securityContext + containerSecurityContext: + seccompProfile: + type: RuntimeDefault + capabilities: + drop: ['ALL'] + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + ## image-renderer pod annotation + podAnnotations: {} + # image-renderer deployment Host Aliases + hostAliases: [] + # image-renderer deployment priority class + priorityClassName: '' + service: + # Enable the image-renderer service + enabled: true + # image-renderer service port name + portName: 'http' + # image-renderer service port used by both service and deployment + port: 8081 + targetPort: 8081 + # Adds the appProtocol field to the image-renderer service. This allows to work with istio protocol selection. Ex: "http" or "tcp" + appProtocol: "" + serviceMonitor: + ## If true, a ServiceMonitor CRD is created for a prometheus operator + ## https://github.com/coreos/prometheus-operator + ## + enabled: false + path: /metrics + # namespace: monitoring (defaults to use the namespace this chart is deployed to) + labels: {} + interval: 1m + scheme: http + tlsConfig: {} + scrapeTimeout: 30s + relabelings: [] + # See: https://doc.crds.dev/github.com/prometheus-operator/kube-prometheus/monitoring.coreos.com/ServiceMonitor/v1@v0.11.0#spec-targetLabels + targetLabels: [] + # - targetLabel1 + # - targetLabel2 + # If https is enabled in Grafana, this needs to be set as 'https' to correctly configure the callback used in Grafana + grafanaProtocol: http + # In case a sub_path is used this needs to be added to the image renderer callback + grafanaSubPath: "" + # name of the image-renderer port on the pod + podPortName: http + # number of image-renderer replica sets to keep + revisionHistoryLimit: 10 + networkPolicy: + # Enable a NetworkPolicy to limit inbound traffic to only the created grafana pods + limitIngress: true + # Enable a NetworkPolicy to limit outbound traffic to only the created grafana pods + limitEgress: false + # Allow additional services to access image-renderer (eg. Prometheus operator when ServiceMonitor is enabled) + extraIngressSelectors: [] + resources: {} +# limits: +# cpu: 100m +# memory: 100Mi +# requests: +# cpu: 50m +# memory: 50Mi + ## Node labels for pod assignment + ## ref: https://kubernetes.io/docs/user-guide/node-selection/ + # + nodeSelector: {} + + ## Tolerations for pod assignment + ## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/ + ## + tolerations: [] + + ## Affinity for pod assignment (evaluated as template) + ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity + ## + affinity: {} + + ## Use an alternate scheduler, e.g. "stork". + ## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/ + ## + # schedulerName: "default-scheduler" + +networkPolicy: + ## @param networkPolicy.enabled Enable creation of NetworkPolicy resources. Only Ingress traffic is filtered for now. 
+ ## + enabled: false + ## @param networkPolicy.allowExternal Don't require client label for connections + ## The Policy model to apply. When set to false, only pods with the correct + ## client label will have network access to grafana port defined. + ## When true, grafana will accept connections from any source + ## (with the correct destination port). + ## + ingress: true + ## @param networkPolicy.ingress When true enables the creation + ## an ingress network policy + ## + allowExternal: true + ## @param networkPolicy.explicitNamespacesSelector A Kubernetes LabelSelector to explicitly select namespaces from which traffic could be allowed + ## If explicitNamespacesSelector is missing or set to {}, only client Pods that are in the networkPolicy's namespace + ## and that match other criteria, the ones that have the good label, can reach the grafana. + ## But sometimes, we want the grafana to be accessible to clients from other namespaces, in this case, we can use this + ## LabelSelector to select these namespaces, note that the networkPolicy's namespace should also be explicitly added. + ## + ## Example: + ## explicitNamespacesSelector: + ## matchLabels: + ## role: frontend + ## matchExpressions: + ## - {key: role, operator: In, values: [frontend]} + ## + explicitNamespacesSelector: {} + ## + ## + ## + ## + ## + ## + egress: + ## @param networkPolicy.egress.enabled When enabled, an egress network policy will be + ## created allowing grafana to connect to external data sources from kubernetes cluster. + enabled: false + ## + ## @param networkPolicy.egress.ports Add individual ports to be allowed by the egress + ports: [] + ## Add ports to the egress by specifying - port: + ## E.X. + ## ports: + ## - port: 80 + ## - port: 443 + ## + ## + ## + ## + ## + ## + +# Enable backward compatibility of kubernetes where version below 1.13 doesn't have the enableServiceLinks option +enableKubeBackwardCompatibility: false +useStatefulSet: false +# Create a dynamic manifests via values: +extraObjects: [] + # - apiVersion: "kubernetes-client.io/v1" + # kind: ExternalSecret + # metadata: + # name: grafana-secrets + # spec: + # backendType: gcpSecretsManager + # data: + # - key: grafana-admin-password + # name: adminPassword diff --git a/addons/grafana/local.tf b/addons/grafana/local.tf new file mode 100644 index 0000000..0606d90 --- /dev/null +++ b/addons/grafana/local.tf @@ -0,0 +1,41 @@ +locals { + name = "grafana" + + default_helm_config = { + name = try(var.grafana_extra_configs.name, local.name) + chart = try(var.grafana_extra_configs.chart, local.name) + repository = try(var.grafana_extra_configs.repository, "https://grafana.github.io/helm-charts") + version = try(var.grafana_extra_configs.version, "7.0.2") + namespace = try(var.grafana_extra_configs.namespace, "monitoring") + create_namespace = try(var.grafana_extra_configs.create_namespace, true) + description = "Grafana helm Chart deployment configuration" + timeout = try(var.grafana_extra_configs.timeout, "600") + lint = try(var.grafana_extra_configs.lint, "false") + repository_key_file = try(var.grafana_extra_configs.repository_key_file, "") + repository_cert_file = try(var.grafana_extra_configs.repository_cert_file, "") + repository_username = try(var.grafana_extra_configs.repository_username, "") + repository_password = try(var.grafana_extra_configs.repository_password, "") + verify = try(var.grafana_extra_configs.verify, "false") + keyring = try(var.grafana_extra_configs.keyring, "") + disable_webhooks = 
try(var.grafana_extra_configs.disable_webhooks, "false") + reuse_values = try(var.grafana_extra_configs.reuse_values, "false") + reset_values = try(var.grafana_extra_configs.reset_values, "false") + force_update = try(var.grafana_extra_configs.force_update, "false") + recreate_pods = try(var.grafana_extra_configs.recreate_pods, "false") + cleanup_on_fail = try(var.grafana_extra_configs.cleanup_on_fail, "false") + max_history = try(var.grafana_extra_configs.max_history, "0") + atomic = try(var.grafana_extra_configs.atomic, "false") + skip_crds = try(var.grafana_extra_configs.skip_crds, "false") + render_subchart_notes = try(var.grafana_extra_configs.render_subchart_notes, "true") + disable_openapi_validation = try(var.grafana_extra_configs.disable_openapi_validation, "false") + wait = try(var.grafana_extra_configs.wait, "true") + wait_for_jobs = try(var.grafana_extra_configs.wait_for_jobs, "false") + dependency_update = try(var.grafana_extra_configs.dependency_update, "false") + replace = try(var.grafana_extra_configs.replace, "false") + } + + helm_config = merge( + local.default_helm_config, + var.helm_config + ) +} \ No newline at end of file diff --git a/addons/grafana/main.tf b/addons/grafana/main.tf new file mode 100644 index 0000000..558592e --- /dev/null +++ b/addons/grafana/main.tf @@ -0,0 +1,13 @@ +module "helm_addon" { + source = "../helm" + + manage_via_gitops = var.manage_via_gitops + helm_config = local.helm_config + addon_context = var.addon_context +} + +resource "kubectl_manifest" "grafana_virtualservice" { + count = var.grafana_manifests.grafana_virtualservice_file_path != "" ? 1 : 0 + yaml_body = file(var.grafana_manifests.grafana_virtualservice_file_path) + depends_on = [module.helm_addon] +} \ No newline at end of file diff --git a/addons/grafana/output.tf b/addons/grafana/output.tf new file mode 100644 index 0000000..c354dbe --- /dev/null +++ b/addons/grafana/output.tf @@ -0,0 +1,11 @@ +output "namespace" { + value = local.default_helm_config.namespace +} + +output "chart_version" { + value = local.default_helm_config.version +} + +output "repository" { + value = local.default_helm_config.repository +} \ No newline at end of file diff --git a/addons/grafana/variable.tf b/addons/grafana/variable.tf new file mode 100644 index 0000000..5fda954 --- /dev/null +++ b/addons/grafana/variable.tf @@ -0,0 +1,38 @@ +variable "helm_config" { + description = "Helm provider config for AWS Load Balancer Controller" + type = any + default = {} +} + +variable "manage_via_gitops" { + description = "Determines if the add-on should be managed via GitOps" + type = bool + default = false +} + +variable "addon_context" { + description = "Input configuration for the addon" + type = object({ + aws_caller_identity_account_id = string + aws_caller_identity_arn = string + aws_eks_cluster_endpoint = string + aws_partition_id = string + aws_region_name = string + eks_cluster_id = string + eks_oidc_issuer_url = string + eks_oidc_provider_arn = string + tags = map(string) + }) +} + +variable "grafana_extra_configs" { + description = "Override attributes of helm_release terraform resource" + type = any + default = {} +} + +variable "grafana_manifests" { + type = object({ + grafana_virtualservice_file_path = string + }) +} \ No newline at end of file diff --git a/addons/grafana/version.tf b/addons/grafana/version.tf new file mode 100644 index 0000000..4357756 --- /dev/null +++ b/addons/grafana/version.tf @@ -0,0 +1,14 @@ +terraform { + required_version = ">= 1.0.0" + + required_providers { + kubernetes = 
{ + source = "hashicorp/kubernetes" + version = ">= 2.10" + } + kubectl = { + source = "gavinbunney/kubectl" + version = ">= 1.7.0" + } + } +} diff --git a/addons/kiali-server/config/monitoring/jaeger.yaml b/addons/kiali-server/config/monitoring/jaeger.yaml deleted file mode 100644 index b40dfc7..0000000 --- a/addons/kiali-server/config/monitoring/jaeger.yaml +++ /dev/null @@ -1,117 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: jaeger - namespace: istio-system - labels: - app: jaeger -spec: - selector: - matchLabels: - app: jaeger - template: - metadata: - labels: - app: jaeger - annotations: - sidecar.istio.io/inject: "false" - prometheus.io/scrape: "true" - prometheus.io/port: "14269" - spec: - containers: - - name: jaeger - image: "docker.io/jaegertracing/all-in-one:1.35" - env: - - name: BADGER_EPHEMERAL - value: "false" - - name: SPAN_STORAGE_TYPE - value: "badger" - - name: BADGER_DIRECTORY_VALUE - value: "/badger/data" - - name: BADGER_DIRECTORY_KEY - value: "/badger/key" - - name: COLLECTOR_ZIPKIN_HOST_PORT - value: ":9411" - - name: MEMORY_MAX_TRACES - value: "50000" - - name: QUERY_BASE_PATH - value: /jaeger - livenessProbe: - httpGet: - path: / - port: 14269 - readinessProbe: - httpGet: - path: / - port: 14269 - volumeMounts: - - name: data - mountPath: /badger - resources: - requests: - cpu: 10m - volumes: - - name: data - emptyDir: {} ---- -apiVersion: v1 -kind: Service -metadata: - name: tracing - namespace: istio-system - labels: - app: jaeger -spec: - type: ClusterIP - ports: - - name: http-query - port: 80 - protocol: TCP - targetPort: 16686 - # Note: Change port name if you add '--query.grpc.tls.enabled=true' - - name: grpc-query - port: 16685 - protocol: TCP - targetPort: 16685 - selector: - app: jaeger ---- -# Jaeger implements the Zipkin API. To support swapping out the tracing backend, we use a Service named Zipkin. -apiVersion: v1 -kind: Service -metadata: - labels: - name: zipkin - name: zipkin - namespace: istio-system -spec: - ports: - - port: 9411 - targetPort: 9411 - name: http-query - selector: - app: jaeger ---- -apiVersion: v1 -kind: Service -metadata: - name: jaeger-collector - namespace: istio-system - labels: - app: jaeger -spec: - type: ClusterIP - ports: - - name: jaeger-collector-http - port: 14268 - targetPort: 14268 - protocol: TCP - - name: jaeger-collector-grpc - port: 14250 - targetPort: 14250 - protocol: TCP - - port: 9411 - targetPort: 9411 - name: http-zipkin - selector: - app: jaeger diff --git a/addons/prometheus/README.md b/addons/prometheus/README.md new file mode 100644 index 0000000..299a673 --- /dev/null +++ b/addons/prometheus/README.md @@ -0,0 +1,66 @@ +# Prometheus Agent Helm Chart + +[Prometheus](https://prometheus.io/docs/introduction/overview/) offers an open-source monitoring and alerting toolkit designed especially for microservices and containers. Prometheus monitoring lets you run flexible queries and configure real-time notifications. + +## Prerequisites +Persistent volume for Prometheus server and Alertmanager pods is disabled by default. Enable EBS or EFS CSI Driver from Addons or create EBS CSI driver manually from AWS EKS portal to enable Persistent volume for Prometheus server or Alertmanager. 
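+
+Once a CSI driver is available, persistence can be turned on through the override values file passed to `prometheus_helm_config`. A minimal sketch is shown below; the `gp2` StorageClass name and the sizes are assumptions for illustration, not defaults enforced by this add-on (the same keys appear in the chart's default values and in `override_vales/prometheus.yaml`):
+
+```yaml
+# Hypothetical snippet for ./config/override-prometheus.yaml
+server:
+  persistentVolume:
+    enabled: true      # create/use a PVC for the Prometheus server
+    size: 8Gi          # illustrative size
+    storageClass: gp2  # assumes an EBS-backed StorageClass named "gp2" exists
+alertmanager:
+  persistence:
+    size: 2Gi          # illustrative size for the alertmanager sub-chart
+```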
+
+## Dependencies
+- [alertmanager](https://github.com/prometheus-community/helm-charts/tree/main/charts/alertmanager)
+- [kube-state-metrics](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics)
+- [prometheus-node-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter)
+- [prometheus-pushgateway](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-pushgateway)
+
+## Installation
+The Terraform script below shows how to use the Prometheus Terraform add-on. A complete example is available [here](https://github.com/clouddrove/terraform-helm-eks-addons/blob/master/_examples/complete/main.tf).
+```hcl
+module "addons" {
+  source = "clouddrove/eks-addons/aws"
+
+  depends_on       = [module.eks.cluster_id]
+  eks_cluster_name = module.eks.cluster_name
+
+  prometheus             = true # Update the override values in ./config/override-prometheus.yaml before using this add-on
+  prometheus_helm_config = { values = ["${file("./config/override-prometheus.yaml")}"] }
+}
+```
+
+
+## Requirements
+
+| Name | Version |
+|------|---------|
+| [terraform](#requirement\_terraform) | >= 1.0.0 |
+| [kubernetes](#requirement\_kubernetes) | >= 2.10 |
+
+## Providers
+
+No providers.
+
+## Modules
+
+| Name | Source | Version |
+|------|--------|---------|
+| [helm\_addon](#module\_helm\_addon) | ../helm | n/a |
+
+## Resources
+
+No resources.
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|------|---------|:--------:|
+| [addon\_context](#input\_addon\_context) | Input configuration for the addon | <pre>object({<br>    aws_caller_identity_account_id = string<br>    aws_caller_identity_arn        = string<br>    aws_eks_cluster_endpoint       = string<br>    aws_partition_id               = string<br>    aws_region_name                = string<br>    eks_cluster_id                 = string<br>    eks_oidc_issuer_url            = string<br>    eks_oidc_provider_arn          = string<br>    tags                           = map(string)<br>  })</pre>
| n/a | yes | +| [helm\_config](#input\_helm\_config) | Helm provider config for AWS Load Balancer Controller | `any` | `{}` | no | +| [manage\_via\_gitops](#input\_manage\_via\_gitops) | Determines if the add-on should be managed via GitOps | `bool` | `false` | no | +| [prometheus\_extra\_configs](#input\_prometheus\_extra\_configs) | Override attributes of helm\_release terraform resource | `any` | `{}` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [chart\_version](#output\_chart\_version) | n/a | +| [namespace](#output\_namespace) | n/a | +| [repository](#output\_repository) | n/a | + \ No newline at end of file diff --git a/addons/prometheus/config/prometheus.yaml b/addons/prometheus/config/prometheus.yaml new file mode 100644 index 0000000..bacd158 --- /dev/null +++ b/addons/prometheus/config/prometheus.yaml @@ -0,0 +1,1233 @@ +rbac: + create: true + +podSecurityPolicy: + enabled: false + +imagePullSecrets: [] +# - name: "image-pull-secret" + +## Define serviceAccount names for components. Defaults to component's fully qualified name. +## +serviceAccounts: + server: + create: true + name: "" + annotations: {} + # automountServiceAccountToken: + +## Monitors ConfigMap changes and POSTs to a URL +## Ref: https://github.com/prometheus-operator/prometheus-operator/tree/main/cmd/prometheus-config-reloader +## +configmapReload: + ## URL for configmap-reload to use for reloads + ## + reloadUrl: "" + + ## env sets environment variables to pass to the container. Can be set as name/value pairs, + ## read from secrets or configmaps. + env: [] + # - name: SOMEVAR + # value: somevalue + # - name: PASSWORD + # valueFrom: + # secretKeyRef: + # name: mysecret + # key: password + # optional: false + + prometheus: + ## If false, the configmap-reload container will not be deployed + ## + enabled: true + + ## configmap-reload container name + ## + name: configmap-reload + + ## configmap-reload container image + ## + image: + repository: quay.io/prometheus-operator/prometheus-config-reloader + tag: v0.67.0 + # When digest is set to a non-empty value, images will be pulled by digest (regardless of tag value). + digest: "" + pullPolicy: IfNotPresent + + # containerPort: 9533 + + ## Additional configmap-reload container arguments + ## + extraArgs: {} + + ## Additional configmap-reload volume directories + ## + extraVolumeDirs: [] + + ## Additional configmap-reload volume mounts + ## + extraVolumeMounts: [] + + ## Additional configmap-reload mounts + ## + extraConfigmapMounts: [] + # - name: prometheus-alerts + # mountPath: /etc/alerts.d + # subPath: "" + # configMap: prometheus-alerts + # readOnly: true + + ## Security context to be added to configmap-reload container + containerSecurityContext: {} + + ## configmap-reload resource requests and limits + ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + ## + resources: {} + +server: + ## Prometheus server container name + ## + name: server + + ## Use a ClusterRole (and ClusterRoleBinding) + ## - If set to false - we define a RoleBinding in the defined namespaces ONLY + ## + ## NB: because we need a Role with nonResourceURL's ("/metrics") - you must get someone with Cluster-admin privileges to define this role for you, before running with this setting enabled. + ## This makes prometheus work - for users who do not have ClusterAdmin privs, but wants prometheus to operate on their own namespaces, instead of clusterwide. 
+ ## + ## You MUST also set namespaces to the ones you have access to and want monitored by Prometheus. + ## + # useExistingClusterRoleName: nameofclusterrole + + ## If set it will override prometheus.server.fullname value for ClusterRole and ClusterRoleBinding + ## + clusterRoleNameOverride: "" + + # Enable only the release namespace for monitoring. By default all namespaces are monitored. + # If releaseNamespace and namespaces are both set a merged list will be monitored. + releaseNamespace: false + + ## namespaces to monitor (instead of monitoring all - clusterwide). Needed if you want to run without Cluster-admin privileges. + # namespaces: + # - yournamespace + + # sidecarContainers - add more containers to prometheus server + # Key/Value where Key is the sidecar `- name: ` + # Example: + # sidecarContainers: + # webserver: + # image: nginx + # OR for adding OAuth authentication to Prometheus + # sidecarContainers: + # oauth-proxy: + # image: quay.io/oauth2-proxy/oauth2-proxy:v7.1.2 + # args: + # - --upstream=http://127.0.0.1:9090 + # - --http-address=0.0.0.0:8081 + # - ... + # ports: + # - containerPort: 8081 + # name: oauth-proxy + # protocol: TCP + # resources: {} + sidecarContainers: {} + + # sidecarTemplateValues - context to be used in template for sidecarContainers + # Example: + # sidecarTemplateValues: *your-custom-globals + # sidecarContainers: + # webserver: |- + # {{ include "webserver-container-template" . }} + # Template for `webserver-container-template` might looks like this: + # image: "{{ .Values.server.sidecarTemplateValues.repository }}:{{ .Values.server.sidecarTemplateValues.tag }}" + # ... + # + sidecarTemplateValues: {} + + ## Prometheus server container image + ## + image: + repository: quay.io/prometheus/prometheus + # if not set appVersion field from Chart.yaml is used + tag: "" + # When digest is set to a non-empty value, images will be pulled by digest (regardless of tag value). + digest: "" + pullPolicy: IfNotPresent + + ## Prometheus server command + ## + command: [] + + ## prometheus server priorityClassName + ## + priorityClassName: "" + + ## EnableServiceLinks indicates whether information about services should be injected + ## into pod's environment variables, matching the syntax of Docker links. + ## WARNING: the field is unsupported and will be skipped in K8s prior to v1.13.0. + ## + enableServiceLinks: true + + ## The URL prefix at which the container can be accessed. Useful in the case the '-web.external-url' includes a slug + ## so that the various internal URLs are still able to access as they are in the default case. + ## (Optional) + prefixURL: "" + + ## External URL which can access prometheus + ## Maybe same with Ingress host name + baseURL: "" + + ## Additional server container environment variables + ## + ## You specify this manually like you would a raw deployment manifest. + ## This means you can bind in environment variables from secrets. + ## + ## e.g. static environment variable: + ## - name: DEMO_GREETING + ## value: "Hello from the environment" + ## + ## e.g. secret environment variable: + ## - name: USERNAME + ## valueFrom: + ## secretKeyRef: + ## name: mysecret + ## key: username + env: [] + + # List of flags to override default parameters, e.g: + # - --enable-feature=agent + # - --storage.agent.retention.max-time=30m + defaultFlagsOverride: [] + + extraFlags: + - web.enable-lifecycle + ## web.enable-admin-api flag controls access to the administrative HTTP API which includes functionality such as + ## deleting time series. 
This is disabled by default. + # - web.enable-admin-api + ## + ## storage.tsdb.no-lockfile flag controls BD locking + # - storage.tsdb.no-lockfile + ## + ## storage.tsdb.wal-compression flag enables compression of the write-ahead log (WAL) + # - storage.tsdb.wal-compression + + ## Path to a configuration file on prometheus server container FS + configPath: /etc/config/prometheus.yml + + ### The data directory used by prometheus to set --storage.tsdb.path + ### When empty server.persistentVolume.mountPath is used instead + storagePath: "" + + global: + ## How frequently to scrape targets by default + ## + scrape_interval: 1m + ## How long until a scrape request times out + ## + scrape_timeout: 10s + ## How frequently to evaluate rules + ## + evaluation_interval: 1m + ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write + ## + remoteWrite: [] + ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_read + ## + remoteRead: [] + + ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#tsdb + ## + tsdb: {} + # out_of_order_time_window: 0s + + ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#exemplars + ## Must be enabled via --enable-feature=exemplar-storage + ## + exemplars: {} + # max_exemplars: 100000 + + ## Custom HTTP headers for Liveness/Readiness/Startup Probe + ## + ## Useful for providing HTTP Basic Auth to healthchecks + probeHeaders: [] + # - name: "Authorization" + # value: "Bearer ABCDEabcde12345" + + ## Additional Prometheus server container arguments + ## + extraArgs: {} + + ## Additional InitContainers to initialize the pod + ## + extraInitContainers: [] + + ## Additional Prometheus server Volume mounts + ## + extraVolumeMounts: [] + + ## Additional Prometheus server Volumes + ## + extraVolumes: [] + + ## Additional Prometheus server hostPath mounts + ## + extraHostPathMounts: [] + # - name: certs-dir + # mountPath: /etc/kubernetes/certs + # subPath: "" + # hostPath: /etc/kubernetes/certs + # readOnly: true + + extraConfigmapMounts: [] + # - name: certs-configmap + # mountPath: /prometheus + # subPath: "" + # configMap: certs-configmap + # readOnly: true + + ## Additional Prometheus server Secret mounts + # Defines additional mounts with secrets. Secrets must be manually created in the namespace. 
+ extraSecretMounts: [] + # - name: secret-files + # mountPath: /etc/secrets + # subPath: "" + # secretName: prom-secret-files + # readOnly: true + + ## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.server.configMapOverrideName}} + ## Defining configMapOverrideName will cause templates/server-configmap.yaml + ## to NOT generate a ConfigMap resource + ## + configMapOverrideName: "" + + ## Extra labels for Prometheus server ConfigMap (ConfigMap that holds serverFiles) + extraConfigmapLabels: {} + + ingress: + ## If true, Prometheus server Ingress will be created + ## + enabled: false + + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + + ## Prometheus server Ingress annotations + ## + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: 'true' + + ## Prometheus server Ingress additional labels + ## + extraLabels: {} + + ## Redirect ingress to an additional defined port on the service + # servicePort: 8081 + + ## Prometheus server Ingress hostnames with optional path + ## Must be provided if Ingress is enabled + ## + hosts: [] + # - prometheus.domain.com + # - domain.com/prometheus + + path: / + + # pathType is only for k8s >= 1.18 + pathType: Prefix + + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + + ## Prometheus server Ingress TLS configuration + ## Secrets must be manually created in the namespace + ## + tls: [] + # - secretName: prometheus-server-tls + # hosts: + # - prometheus.domain.com + + ## Server Deployment Strategy type + strategy: + type: Recreate + + ## hostAliases allows adding entries to /etc/hosts inside the containers + hostAliases: [] + # - ip: "127.0.0.1" + # hostnames: + # - "example.com" + + ## Node tolerations for server scheduling to nodes with taints + ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + ## + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" + + ## Node labels for Prometheus server pod assignment + ## Ref: https://kubernetes.io/docs/user-guide/node-selection/ + ## + nodeSelector: {} + + ## Pod affinity + ## + affinity: {} + + ## Pod topology spread constraints + ## ref. https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/ + topologySpreadConstraints: [] + + ## PodDisruptionBudget settings + ## ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/ + ## + podDisruptionBudget: + enabled: false + maxUnavailable: 1 + # minAvailable: 1 + ## unhealthyPodEvictionPolicy is available since 1.27.0 (beta) + ## https://kubernetes.io/docs/tasks/run-application/configure-pdb/#unhealthy-pod-eviction-policy + # unhealthyPodEvictionPolicy: IfHealthyBudget + + ## Use an alternate scheduler, e.g. "stork". + ## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/ + ## + # schedulerName: + + persistentVolume: + ## If true, Prometheus server will create/use a Persistent Volume Claim + ## If false, use emptyDir + ## + enabled: true + + ## If set it will override the name of the created persistent volume claim + ## generated by the stateful set. 
+ ## + statefulSetNameOverride: "" + + ## Prometheus server data Persistent Volume access modes + ## Must match those of existing PV or dynamic provisioner + ## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/ + ## + accessModes: + - ReadWriteOnce + + ## Prometheus server data Persistent Volume labels + ## + labels: {} + + ## Prometheus server data Persistent Volume annotations + ## + annotations: {} + + ## Prometheus server data Persistent Volume existing claim name + ## Requires server.persistentVolume.enabled: true + ## If defined, PVC must be created manually before volume will be bound + existingClaim: "" + + ## Prometheus server data Persistent Volume mount root path + ## + mountPath: /data + + ## Prometheus server data Persistent Volume size + ## + size: 8Gi + + ## Prometheus server data Persistent Volume Storage Class + ## If defined, storageClassName: + ## If set to "-", storageClassName: "", which disables dynamic provisioning + ## If undefined (the default) or set to null, no storageClassName spec is + ## set, choosing the default provisioner. (gp2 on AWS, standard on + ## GKE, AWS & OpenStack) + ## + # storageClass: "-" + + ## Prometheus server data Persistent Volume Binding Mode + ## If defined, volumeBindingMode: + ## If undefined (the default) or set to null, no volumeBindingMode spec is + ## set, choosing the default mode. + ## + # volumeBindingMode: "" + + ## Subdirectory of Prometheus server data Persistent Volume to mount + ## Useful if the volume's root directory is not empty + ## + subPath: "" + + ## Persistent Volume Claim Selector + ## Useful if Persistent Volumes have been provisioned in advance + ## Ref: https://kubernetes.io/docs/concepts/storage/persistent-volumes/#selector + ## + # selector: + # matchLabels: + # release: "stable" + # matchExpressions: + # - { key: environment, operator: In, values: [ dev ] } + + ## Persistent Volume Name + ## Useful if Persistent Volumes have been provisioned in advance and you want to use a specific one + ## + # volumeName: "" + + emptyDir: + ## Prometheus server emptyDir volume size limit + ## + sizeLimit: "" + + ## Annotations to be added to Prometheus server pods + ## + podAnnotations: {} + # iam.amazonaws.com/role: prometheus + + ## Labels to be added to Prometheus server pods + ## + podLabels: {} + + ## Prometheus AlertManager configuration + ## + alertmanagers: [] + + ## Specify if a Pod Security Policy for node-exporter must be created + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/ + ## + podSecurityPolicy: + annotations: {} + ## Specify pod annotations + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl + ## + # seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' + # seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default' + # apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default' + + ## Use a StatefulSet if replicaCount needs to be greater than 1 (see below) + ## + replicaCount: 1 + + ## Number of old history to retain to allow rollback + ## Default Kubernetes value is set to 10 + ## + revisionHistoryLimit: 10 + + ## Annotations to be added to deployment + ## + deploymentAnnotations: {} + + statefulSet: + ## If true, use a statefulset instead of a deployment for pod management. 
+ ## This allows to scale replicas to more than 1 pod + ## + enabled: false + + annotations: {} + labels: {} + podManagementPolicy: OrderedReady + + ## Alertmanager headless service to use for the statefulset + ## + headless: + annotations: {} + labels: {} + servicePort: 80 + ## Enable gRPC port on service to allow auto discovery with thanos-querier + gRPC: + enabled: false + servicePort: 10901 + # nodePort: 10901 + + ## Statefulset's persistent volume claim retention policy + ## pvcDeleteOnStsDelete and pvcDeleteOnStsScale determine whether + ## statefulset's PVCs are deleted (true) or retained (false) on scaling down + ## and deleting statefulset, respectively. Requires 1.27.0+. + ## Ref: https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#persistentvolumeclaim-retention + ## + pvcDeleteOnStsDelete: false + pvcDeleteOnStsScale: false + + ## Prometheus server readiness and liveness probe initial delay and timeout + ## Ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ + ## + tcpSocketProbeEnabled: false + probeScheme: HTTP + readinessProbeInitialDelay: 30 + readinessProbePeriodSeconds: 5 + readinessProbeTimeout: 4 + readinessProbeFailureThreshold: 3 + readinessProbeSuccessThreshold: 1 + livenessProbeInitialDelay: 30 + livenessProbePeriodSeconds: 15 + livenessProbeTimeout: 10 + livenessProbeFailureThreshold: 3 + livenessProbeSuccessThreshold: 1 + startupProbe: + enabled: false + periodSeconds: 5 + failureThreshold: 30 + timeoutSeconds: 10 + + ## Prometheus server resource requests and limits + ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + ## + resources: {} + # limits: + # cpu: 500m + # memory: 512Mi + # requests: + # cpu: 500m + # memory: 512Mi + + # Required for use in managed kubernetes clusters (such as AWS EKS) with custom CNI (such as calico), + # because control-plane managed by AWS cannot communicate with pods' IP CIDR and admission webhooks are not working + ## + hostNetwork: false + + # When hostNetwork is enabled, this will set to ClusterFirstWithHostNet automatically + dnsPolicy: ClusterFirst + + # Use hostPort + # hostPort: 9090 + + # Use portName + portName: "" + + ## Vertical Pod Autoscaler config + ## Ref: https://github.com/kubernetes/autoscaler/tree/master/vertical-pod-autoscaler + verticalAutoscaler: + ## If true a VPA object will be created for the controller (either StatefulSet or Deployemnt, based on above configs) + enabled: false + # updateMode: "Auto" + # containerPolicies: + # - containerName: 'prometheus-server' + + # Custom DNS configuration to be added to prometheus server pods + dnsConfig: {} + # nameservers: + # - 1.2.3.4 + # searches: + # - ns1.svc.cluster-domain.example + # - my.dns.search.suffix + # options: + # - name: ndots + # value: "2" + # - name: edns0 + + ## Security context to be added to server pods + ## + securityContext: + runAsUser: 65534 + runAsNonRoot: true + runAsGroup: 65534 + fsGroup: 65534 + + ## Security context to be added to server container + ## + containerSecurityContext: {} + + service: + ## If false, no Service will be created for the Prometheus server + ## + enabled: true + + annotations: {} + labels: {} + clusterIP: "" + + ## List of IP addresses at which the Prometheus server service is available + ## Ref: https://kubernetes.io/docs/concepts/services-networking/service/#external-ips + ## + externalIPs: [] + + loadBalancerIP: "" + loadBalancerSourceRanges: [] + servicePort: 80 + sessionAffinity: None + type: ClusterIP + + ## Enable gRPC 
port on service to allow auto discovery with thanos-querier + gRPC: + enabled: false + servicePort: 10901 + # nodePort: 10901 + + ## If using a statefulSet (statefulSet.enabled=true), configure the + ## service to connect to a specific replica to have a consistent view + ## of the data. + statefulsetReplica: + enabled: false + replica: 0 + + ## Additional port to define in the Service + additionalPorts: [] + # additionalPorts: + # - name: authenticated + # port: 8081 + # targetPort: 8081 + + ## Prometheus server pod termination grace period + ## + terminationGracePeriodSeconds: 300 + + ## Prometheus data retention period (default if not specified is 15 days) + ## + retention: "15d" + +## Prometheus server ConfigMap entries for rule files (allow prometheus labels interpolation) +ruleFiles: {} + +## Prometheus server ConfigMap entries for scrape_config_files +## (allows scrape configs defined in additional files) +## +scrapeConfigFiles: [] + +## Prometheus server ConfigMap entries +## +serverFiles: + ## Alerts configuration + ## Ref: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ + alerting_rules.yml: {} + # groups: + # - name: Instances + # rules: + # - alert: InstanceDown + # expr: up == 0 + # for: 5m + # labels: + # severity: page + # annotations: + # description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.' + # summary: 'Instance {{ $labels.instance }} down' + ## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use alerting_rules.yml + alerts: {} + + ## Records configuration + ## Ref: https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/ + recording_rules.yml: {} + ## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use recording_rules.yml + rules: {} + + prometheus.yml: + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + ## Below two files are DEPRECATED will be removed from this default values file + - /etc/config/rules + - /etc/config/alerts + + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + + # A scrape configuration for running Prometheus on a Kubernetes cluster. + # This uses separate scrape configs for cluster components (i.e. API server, node) + # and services to allow each to use different authentication configs. + # + # Kubernetes labels will be added as Prometheus labels on metrics via the + # `labelmap` relabeling action. + + # Scrape config for API servers. + # + # Kubernetes exposes API servers as endpoints to the default/kubernetes + # service so this uses `endpoints` role and uses relabelling to only keep + # the endpoints associated with the default/kubernetes service using the + # default named port `https`. This works for single API server deployments as + # well as HA API server deployments. + - job_name: 'kubernetes-apiservers' + + kubernetes_sd_configs: + - role: endpoints + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . 
+ tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. + # + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + + - job_name: 'kubernetes-nodes' + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. + # + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/$1/proxy/metrics + + + - job_name: 'kubernetes-nodes-cadvisor' + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. 
+ # + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + # This configuration will work only on kubelet 1.7.3+ + # As the scrape endpoints for cAdvisor have changed + # if you are using older version you need to change the replacement to + # replacement: /api/v1/nodes/$1:4194/proxy/metrics + # more info here https://github.com/coreos/prometheus-operator/issues/633 + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + + # Metric relabel configs to apply to samples before ingestion. + # [Metric Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs) + # metric_relabel_configs: + # - action: labeldrop + # regex: (kubernetes_io_hostname|failure_domain_beta_kubernetes_io_region|beta_kubernetes_io_os|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|failure_domain_beta_kubernetes_io_zone) + + # Scrape config for service endpoints. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape`: Only scrape services that have a value of + # `true`, except if `prometheus.io/scrape-slow` is set to `true` as well. + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately. + # * `prometheus.io/param_`: If the metrics endpoint uses parameters + # then you can set any parameter + - job_name: 'kubernetes-service-endpoints' + honor_labels: true + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape_slow] + action: drop + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) 
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: service + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: node + + # Scrape config for slow service endpoints; same as above, but with a larger + # timeout and a larger interval + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately. + # * `prometheus.io/param_`: If the metrics endpoint uses parameters + # then you can set any parameter + - job_name: 'kubernetes-service-endpoints-slow' + honor_labels: true + + scrape_interval: 5m + scrape_timeout: 30s + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape_slow] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: service + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: node + + - job_name: 'prometheus-pushgateway' + honor_labels: true + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] + action: keep + regex: pushgateway + + # Example scrape config for probing services via the Blackbox Exporter. 
+ # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/probe`: Only probe services that have a value of `true` + - job_name: 'kubernetes-services' + honor_labels: true + + metrics_path: /probe + params: + module: [http_2xx] + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] + action: keep + regex: true + - source_labels: [__address__] + target_label: __param_target + - target_label: __address__ + replacement: blackbox + - source_labels: [__param_target] + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + - source_labels: [__meta_kubernetes_service_name] + target_label: service + + # Example scrape config for pods + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`, + # except if `prometheus.io/scrape-slow` is set to `true` as well. + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. + - job_name: 'kubernetes-pods' + honor_labels: true + + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow] + action: drop + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme] + action: replace + regex: (https?) + target_label: __scheme__ + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip] + action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + target_label: __address__ + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip] + action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + - source_labels: [__meta_kubernetes_pod_phase] + regex: Pending|Succeeded|Failed|Completed + action: drop + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: node + + # Example Scrape config for pods which should be scraped slower. 
An useful example + # would be stackriver-exporter which queries an API on every scrape of the pod + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. + - job_name: 'kubernetes-pods-slow' + honor_labels: true + + scrape_interval: 5m + scrape_timeout: 30s + + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme] + action: replace + regex: (https?) + target_label: __scheme__ + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip] + action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + target_label: __address__ + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip] + action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + - source_labels: [__meta_kubernetes_pod_phase] + regex: Pending|Succeeded|Failed|Completed + action: drop + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: node + +# adds additional scrape configs to prometheus.yml +# must be a string so you have to add a | after extraScrapeConfigs: +# example adds prometheus-blackbox-exporter scrape config +extraScrapeConfigs: "" + # - job_name: 'prometheus-blackbox-exporter' + # metrics_path: /probe + # params: + # module: [http_2xx] + # static_configs: + # - targets: + # - https://example.com + # relabel_configs: + # - source_labels: [__address__] + # target_label: __param_target + # - source_labels: [__param_target] + # target_label: instance + # - target_label: __address__ + # replacement: prometheus-blackbox-exporter:9115 + +# Adds option to add alert_relabel_configs to avoid duplicate alerts in alertmanager +# useful in H/A prometheus with different external labels but the same alerts +alertRelabelConfigs: {} + # alert_relabel_configs: + # - source_labels: [dc] + # regex: (.+)\d+ + # target_label: dc + +networkPolicy: + ## Enable creation of NetworkPolicy resources. 
+  ##
+  enabled: false
+
+# Force namespace of namespaced resources
+forceNamespace: ""
+
+# Extra manifests to deploy as an array
+extraManifests: []
+  # - |
+  #   apiVersion: v1
+  #   kind: ConfigMap
+  #   metadata:
+  #   labels:
+  #     name: prometheus-extra
+  #   data:
+  #     extra-data: "value"
+
+# Configuration of subcharts defined in Chart.yaml
+
+## alertmanager sub-chart configurable values
+## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/alertmanager
+##
+alertmanager:
+  ## If false, alertmanager will not be installed
+  ##
+  enabled: true
+
+  persistence:
+    size: 2Gi
+
+  podSecurityContext:
+    runAsUser: 65534
+    runAsNonRoot: true
+    runAsGroup: 65534
+    fsGroup: 65534
+
+## kube-state-metrics sub-chart configurable values
+## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics
+##
+kube-state-metrics:
+  ## If false, kube-state-metrics sub-chart will not be installed
+  ##
+  enabled: true
+
+## prometheus-node-exporter sub-chart configurable values
+## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter
+##
+prometheus-node-exporter:
+  ## If false, node-exporter will not be installed
+  ##
+  enabled: true
+
+  rbac:
+    pspEnabled: false
+
+  containerSecurityContext:
+    allowPrivilegeEscalation: false
+
+## prometheus-pushgateway sub-chart configurable values
+## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-pushgateway
+##
+prometheus-pushgateway:
+  ## If false, pushgateway will not be installed
+  ##
+  enabled: true
+
+  # Optional service annotations
+  serviceAnnotations:
+    prometheus.io/probe: pushgateway
diff --git a/addons/prometheus/local.tf b/addons/prometheus/local.tf
new file mode 100644
index 0000000..b49289b
--- /dev/null
+++ b/addons/prometheus/local.tf
@@ -0,0 +1,41 @@
+locals {
+  name = "prometheus"
+
+  default_helm_config = {
+    name                       = try(var.prometheus_extra_configs.name, local.name)
+    chart                      = try(var.prometheus_extra_configs.chart, local.name)
+    repository                 = try(var.prometheus_extra_configs.repository, "https://prometheus-community.github.io/helm-charts")
+    version                    = try(var.prometheus_extra_configs.version, "25.4.0")
+    namespace                  = try(var.prometheus_extra_configs.namespace, "monitoring")
+    create_namespace           = try(var.prometheus_extra_configs.create_namespace, true)
+    description                = "Prometheus helm Chart deployment configuration"
+    timeout                    = try(var.prometheus_extra_configs.timeout, "600")
+    lint                       = try(var.prometheus_extra_configs.lint, "false")
+    repository_key_file        = try(var.prometheus_extra_configs.repository_key_file, "")
+    repository_cert_file       = try(var.prometheus_extra_configs.repository_cert_file, "")
+    repository_username        = try(var.prometheus_extra_configs.repository_username, "")
+    repository_password        = try(var.prometheus_extra_configs.repository_password, "")
+    verify                     = try(var.prometheus_extra_configs.verify, "false")
+    keyring                    = try(var.prometheus_extra_configs.keyring, "")
+    disable_webhooks           = try(var.prometheus_extra_configs.disable_webhooks, "false")
+    reuse_values               = try(var.prometheus_extra_configs.reuse_values, "false")
+    reset_values               = try(var.prometheus_extra_configs.reset_values, "false")
+    force_update               = try(var.prometheus_extra_configs.force_update, "false")
+    recreate_pods              = try(var.prometheus_extra_configs.recreate_pods, "false")
+    cleanup_on_fail            = try(var.prometheus_extra_configs.cleanup_on_fail, "false")
+    max_history                = try(var.prometheus_extra_configs.max_history, "0")
+    atomic                     =
try(var.prometheus_extra_configs.atomic, "false") + skip_crds = try(var.prometheus_extra_configs.skip_crds, "false") + render_subchart_notes = try(var.prometheus_extra_configs.render_subchart_notes, "true") + disable_openapi_validation = try(var.prometheus_extra_configs.disable_openapi_validation, "false") + wait = try(var.prometheus_extra_configs.wait, "true") + wait_for_jobs = try(var.prometheus_extra_configs.wait_for_jobs, "false") + dependency_update = try(var.prometheus_extra_configs.dependency_update, "false") + replace = try(var.prometheus_extra_configs.replace, "false") + } + + helm_config = merge( + local.default_helm_config, + var.helm_config + ) +} \ No newline at end of file diff --git a/addons/prometheus/main.tf b/addons/prometheus/main.tf new file mode 100644 index 0000000..1c7f1a8 --- /dev/null +++ b/addons/prometheus/main.tf @@ -0,0 +1,7 @@ +module "helm_addon" { + source = "../helm" + + manage_via_gitops = var.manage_via_gitops + helm_config = local.helm_config + addon_context = var.addon_context +} \ No newline at end of file diff --git a/addons/prometheus/output.tf b/addons/prometheus/output.tf new file mode 100644 index 0000000..c354dbe --- /dev/null +++ b/addons/prometheus/output.tf @@ -0,0 +1,11 @@ +output "namespace" { + value = local.default_helm_config.namespace +} + +output "chart_version" { + value = local.default_helm_config.version +} + +output "repository" { + value = local.default_helm_config.repository +} \ No newline at end of file diff --git a/addons/prometheus/variable.tf b/addons/prometheus/variable.tf new file mode 100644 index 0000000..de3e57d --- /dev/null +++ b/addons/prometheus/variable.tf @@ -0,0 +1,32 @@ +variable "helm_config" { + description = "Helm provider config for AWS Load Balancer Controller" + type = any + default = {} +} + +variable "manage_via_gitops" { + description = "Determines if the add-on should be managed via GitOps" + type = bool + default = false +} + +variable "addon_context" { + description = "Input configuration for the addon" + type = object({ + aws_caller_identity_account_id = string + aws_caller_identity_arn = string + aws_eks_cluster_endpoint = string + aws_partition_id = string + aws_region_name = string + eks_cluster_id = string + eks_oidc_issuer_url = string + eks_oidc_provider_arn = string + tags = map(string) + }) +} + +variable "prometheus_extra_configs" { + description = "Override attributes of helm_release terraform resource" + type = any + default = {} +} \ No newline at end of file diff --git a/addons/prometheus/version.tf b/addons/prometheus/version.tf new file mode 100644 index 0000000..55fba73 --- /dev/null +++ b/addons/prometheus/version.tf @@ -0,0 +1,10 @@ +terraform { + required_version = ">= 1.0.0" + + required_providers { + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.10" + } + } +} diff --git a/main.tf b/main.tf index a80272b..4635d91 100644 --- a/main.tf +++ b/main.tf @@ -246,4 +246,24 @@ module "actions_runner_controller" { manage_via_gitops = var.manage_via_gitops addon_context = local.addon_context actions_runner_controller_extra_configs = var.actions_runner_controller_extra_configs +} + +module "prometheus" { + count = var.prometheus ? 1 : 0 + source = "./addons/prometheus" + helm_config = var.prometheus_helm_config != null ? 
var.prometheus_helm_config : { values = [local_file.prometheus_helm_config[0].content] } + manage_via_gitops = var.manage_via_gitops + addon_context = local.addon_context + prometheus_extra_configs = var.prometheus_extra_configs +} + +module "grafana" { + count = var.grafana ? 1 : 0 + depends_on = [module.aws_load_balancer_controller] + source = "./addons/grafana" + helm_config = var.grafana_helm_config != null ? var.grafana_helm_config : { values = [local_file.grafana_helm_config[0].content] } + manage_via_gitops = var.manage_via_gitops + addon_context = local.addon_context + grafana_manifests = var.grafana_manifests + grafana_extra_configs = var.grafana_extra_configs } \ No newline at end of file diff --git a/outputs.tf b/outputs.tf index d739401..add080b 100644 --- a/outputs.tf +++ b/outputs.tf @@ -382,4 +382,32 @@ output "actions_runner_controller_chart_version" { output "actions_runner_controller_repository" { value = module.actions_runner_controller[*].repository description = "Helm chart repository of the actions_runner_controller." +} + +#----------- PROMETHEUS ------------------------ +output "prometheus_namespace" { + value = module.prometheus[*].namespace + description = "The namespace where prometheus is deployed." +} +output "prometheus_chart_version" { + value = module.prometheus[*].chart_version + description = "Chart version of the prometheus Helm Chart." +} +output "prometheus_repository" { + value = module.prometheus[*].repository + description = "Helm chart repository of the prometheus." +} + +#----------- GRAFANA ------------------------ +output "grafana_namespace" { + value = module.grafana[*].namespace + description = "The namespace where grafana is deployed." +} +output "grafana_chart_version" { + value = module.grafana[*].chart_version + description = "Chart version of the grafana Helm Chart." +} +output "grafana_repository" { + value = module.grafana[*].repository + description = "Helm chart repository of the grafana." } \ No newline at end of file diff --git a/override_vales/prometheus.yaml b/override_vales/prometheus.yaml new file mode 100644 index 0000000..a51d25d --- /dev/null +++ b/override_vales/prometheus.yaml @@ -0,0 +1,13 @@ +server: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "eks.amazonaws.com/nodegroup" + operator: In + values: + - "critical" + + persistentVolume: + storageClass: gp2 diff --git a/override_values.tf b/override_values.tf index a501a31..e1411a8 100644 --- a/override_values.tf +++ b/override_values.tf @@ -1,4 +1,4 @@ -#-----------METRIC SERVER-------------------- +#------------------------------ METRIC SERVER ------------------------------------- resource "local_file" "metrics_server_helm_config" { count = var.metrics_server && (var.metrics_server_helm_config == null) ? 1 : 0 content = <
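For orientation, a minimal sketch of how a caller could consume the toggles wired up in main.tf above; the module name, source reference, and override file path here are illustrative assumptions, not part of this patch:

module "eks_addons" {
  source = "../.."     # illustrative: path or registry reference of this module

  # Enable the add-ons introduced in this change.
  prometheus = true
  grafana    = true

  # Optional: supply a complete helm_config; when left null, main.tf falls back
  # to the generated local_file override values.
  prometheus_helm_config = { values = [file("./config/override-prometheus.yaml")] } # illustrative path

  # Optional: override individual helm_release attributes, which
  # addons/prometheus/local.tf reads via try() with the defaults shown above.
  prometheus_extra_configs = { version = "25.4.0" }
}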